# ysb_snapshot_crawl.py
  1. import random
  2. import signal
  3. import socket
  4. import sys
  5. import time
  6. import base64
  7. import math
  8. import requests
  9. from commons.conn_mysql import MySQLPoolOnline
  10. from DrissionPage import ChromiumPage, ChromiumOptions
  11. from commons.Logger import logger
  12. from oss_upload.oss_upload import AliyunOSSUploader
  13. from commons.config import YSB_ACCOUNT, YSB_PASSWORD
# Token sent as the "token" field of the JSON body posted to CAPTCHA_API_URL.
# NOTE(review): this credential is hard-coded in source, unlike YSB_ACCOUNT /
# YSB_PASSWORD which come from commons.config — consider moving it there.
CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
# Captcha-recognition endpoint called by YaoShiBangSnapshot._call_captcha_api.
CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
# Constant added to the recognised slider distance before dragging
# (presumably pixels — TODO confirm against the captcha API's units).
SLIDER_OFFSET_FIX = 10
# Chrome executable handed to ChromiumOptions.set_browser_path (Windows path).
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
  18. class YaoShiBangSnapshot:
  19. def __init__(self, product=None):
  20. self.product = product
  21. self.driver = None
  22. self.account_name = "ysbang_1"
  23. self.platform = 5
  24. self.db_online = MySQLPoolOnline()
  25. self.ossuploader = AliyunOSSUploader()
  26. self._register_signal_handler()
  27. def _register_signal_handler(self):
  28. def handler(signum, frame):
  29. logger.info("收到退出信号,正在关闭浏览器...")
  30. self._quit_browser()
  31. sys.exit(0)
  32. signal.signal(signal.SIGINT, handler)
  33. if hasattr(signal, "SIGTERM"):
  34. signal.signal(signal.SIGTERM, handler)
  35. def _quit_browser(self):
  36. if self.driver:
  37. try:
  38. self.driver.quit()
  39. except Exception:
  40. pass
  41. self.driver = None
  42. @staticmethod
  43. def _get_free_port():
  44. """获取一个当前可用的本地端口,供 Chrome 调试使用。"""
  45. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  46. s.bind(("127.0.0.1", 0))
  47. return s.getsockname()[1]
  48. def init_browser(self):
  49. co = ChromiumOptions().set_browser_path(chrome_path)
  50. debug_port = self._get_free_port()
  51. co.set_user_data_path(f"./spiders/yaoshibang/{self.account_name}")
  52. co.set_local_port(debug_port)
  53. co.set_argument(f"--remote-debugging-port={debug_port}")
  54. co.set_argument("--remote-debugging-address=127.0.0.1")
  55. # co.set_argument("--disable-blink-features=AutomationControlled")
  56. co.set_argument("--disable-dev-shm-usage")
  57. co.set_argument("--no-first-run") # 避免首次运行弹窗
  58. co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
  59. self.driver = ChromiumPage(co)
  60. def _solve_slider_captcha(self):
  61. """检测并处理易盾滑块验证码,成功返回 True。"""
  62. self.driver.wait.doc_loaded()
  63. time.sleep(2)
  64. yidun = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  65. if not yidun:
  66. return True
  67. logger.info("检测到滑块验证码,开始处理")
  68. jpg_bytes = yidun.get_screenshot(as_bytes="jpg")
  69. distance = self._call_captcha_api(jpg_bytes)
  70. if distance is None:
  71. logger.error("验证码识别失败")
  72. return False
  73. logger.info("滑块距离: %s", distance)
  74. slider = self.driver.ele(
  75. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  76. )
  77. if not slider:
  78. logger.error("未找到滑块元素")
  79. return False
  80. try:
  81. drag_distance = float(distance) + SLIDER_OFFSET_FIX
  82. except (TypeError, ValueError):
  83. logger.error("滑块距离非数字: %r", distance)
  84. return False
  85. if not math.isfinite(drag_distance) or drag_distance <= 0:
  86. logger.error("滑块距离无效: %s", drag_distance)
  87. return False
  88. self._simulate_slider_drag(slider, drag_distance)
  89. time.sleep(3)
  90. return True
  91. def _call_captcha_api(self, image_bytes):
  92. """调用云码平台识别滑块距离,失败返回 None。"""
  93. try:
  94. b64 = base64.b64encode(image_bytes).decode()
  95. resp = requests.post(
  96. CAPTCHA_API_URL,
  97. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  98. headers={"Content-Type": "application/json"},
  99. timeout=15,
  100. ).json()
  101. logger.info("验证码 API 返回: %s", resp)
  102. if not isinstance(resp, dict):
  103. return None
  104. data = resp.get("data")
  105. if isinstance(data, dict):
  106. dist = data.get("data")
  107. else:
  108. dist = data
  109. if dist is None:
  110. logger.error("验证码 API 未返回距离字段: %s", resp)
  111. return None
  112. try:
  113. d = float(dist)
  114. except (TypeError, ValueError):
  115. logger.error("验证码距离无法解析为数字: %r", dist)
  116. return None
  117. if not math.isfinite(d):
  118. logger.error("验证码距离非有限数值: %r", dist)
  119. return None
  120. return d
  121. except Exception as e:
  122. logger.exception("验证码 API 调用失败: %s", e)
  123. return None
  124. @staticmethod
  125. def _generate_human_track(distance):
  126. try:
  127. distance = float(distance)
  128. except (TypeError, ValueError):
  129. return []
  130. if distance <= 0 or not math.isfinite(distance):
  131. return []
  132. tracks = []
  133. current = 0
  134. mid = distance * 0.7
  135. t = 0.2
  136. v = 0
  137. move_points = []
  138. while current < mid:
  139. a = random.uniform(2, 4)
  140. v0 = v
  141. v = v0 + a * t
  142. move = v0 * t + 0.5 * a * t * t
  143. current += move
  144. move_points.append(move)
  145. while current < distance:
  146. a = -random.uniform(0.5, 1.5)
  147. v0 = v
  148. v = v0 + a * t
  149. if v < 0.5:
  150. v = 0.5
  151. move = v0 * t + 0.5 * a * t * t
  152. current += move
  153. move_points.append(move)
  154. total_points = len(move_points)
  155. for i, move in enumerate(move_points):
  156. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  157. if i < total_points * 0.3:
  158. duration = random.uniform(0.01, 0.03)
  159. elif i > total_points * 0.7:
  160. duration = random.uniform(0.03, 0.08)
  161. else:
  162. duration = random.uniform(0.02, 0.05)
  163. if random.random() < 0.05:
  164. duration += random.uniform(0.05, 0.1)
  165. tracks.append((move, y_offset, duration))
  166. if random.random() < 0.7:
  167. tracks.append((-random.randint(1, 3), 0, 0.05))
  168. return tracks
  169. def _simulate_slider_drag(self, slider_element, target_distance):
  170. if target_distance <= 0:
  171. logger.warning("滑块目标距离无效: %s", target_distance)
  172. return
  173. self.driver.actions.move_to(slider_element).hold()
  174. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  175. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  176. self.driver.actions.release()
  177. def _is_logged_in(self):
  178. # 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
  179. title = self.driver.ele(
  180. "xpath=//*[contains(text(),'广西好药师大药房连锁有限公司天峨远大药店')]",
  181. timeout=5,
  182. )
  183. return bool(title)
  184. def login(self):
  185. logger.info("开始登录药师帮")
  186. self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
  187. self.driver.wait.doc_loaded(timeout=10)
  188. time.sleep(2)
  189. input_name = self.driver.ele("xpath://input[@name='userAccount']", timeout=5)
  190. if not input_name:
  191. logger.error("未找到账号输入框")
  192. return False
  193. input_name.input(YSB_ACCOUNT)
  194. time.sleep(random.uniform(1.5, 2.5))
  195. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  196. if not input_pass:
  197. logger.error("未找到密码输入框")
  198. return False
  199. input_pass.input(YSB_PASSWORD)
  200. time.sleep(random.uniform(1.5, 2.5))
  201. login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
  202. if not login_btn:
  203. logger.error("未找到登录按钮")
  204. return False
  205. login_btn.click()
  206. time.sleep(3)
  207. for i in range(3):
  208. self._solve_slider_captcha()
  209. time.sleep(3)
  210. if self._is_logged_in():
  211. logger.info("登录成功")
  212. return True
  213. logger.error("登录后未检测到目标店铺名,登录可能失败")
  214. return False
  215. def get_snapshot(self, detail_url, row_id):
  216. self.driver.get(detail_url, timeout=15)
  217. self.driver.wait.doc_loaded(timeout=10)
  218. time.sleep(2)
  219. self._dismiss_popup_before_screenshot()
  220. ele = self.driver.ele("xpath=//div[@class='drug-shopping-wrap']", timeout=8)
  221. if not ele:
  222. ele = self.driver.ele("xpath=//div[@class='drug-info']", timeout=5)
  223. if not ele:
  224. logger.warning("未找到详情区域元素,跳过截图 row_id=%s", row_id)
  225. return ""
  226. try:
  227. jpg_bytes = ele.get_screenshot(as_bytes="jpg")
  228. if not jpg_bytes:
  229. logger.warning("截图为空 row_id=%s", row_id)
  230. return ""
  231. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(row_id))
  232. except Exception:
  233. logger.exception("截图或 OSS 上传失败 row_id=%s url=%s", row_id, detail_url)
  234. return ""
  235. if not img_url:
  236. logger.warning("OSS 未返回有效地址 row_id=%s", row_id)
  237. return ""
  238. logger.info("截图上传完成 row_id=%s url=%s", row_id, img_url)
  239. time.sleep(random.uniform(0.5, 1.5))
  240. return img_url
  241. def _dismiss_popup_before_screenshot(self):
  242. """截图前关闭或隐藏营销弹窗,避免遮挡。"""
  243. close_locs = [
  244. "xpath=//div[contains(@class,'dialog')]//i[contains(@class,'close')]",
  245. "xpath=//div[contains(@class,'popup')]//i[contains(@class,'close')]",
  246. "xpath=//div[contains(@class,'modal')]//i[contains(@class,'close')]",
  247. "xpath=//button[contains(@class,'close')]",
  248. "xpath=//span[text()='×']",
  249. "xpath=//*[contains(text(),'智能采购')]/ancestor::div[1]//*[contains(@class,'close')]",
  250. ]
  251. for loc in close_locs:
  252. try:
  253. btn = self.driver.ele(loc, timeout=0.5)
  254. if btn:
  255. btn.click()
  256. time.sleep(0.2)
  257. except Exception:
  258. pass
  259. try:
  260. # 兜底:隐藏常见高层弹窗和遮罩
  261. self.driver.run_js(
  262. """
  263. const sels = [
  264. '[class*="modal"]',
  265. '[class*="popup"]',
  266. '[class*="dialog"]',
  267. '[class*="mask"]',
  268. '[class*="overlay"]'
  269. ];
  270. for (const s of sels) {
  271. document.querySelectorAll(s).forEach(el => {
  272. const style = getComputedStyle(el);
  273. const z = parseInt(style.zIndex || '0', 10);
  274. if (z >= 999 && style.display !== 'none') {
  275. el.style.display = 'none';
  276. }
  277. });
  278. }
  279. document.body.style.overflow = 'auto';
  280. """
  281. )
  282. time.sleep(0.2)
  283. except Exception:
  284. pass
  285. def _save_snapshot_url(self, row_id, img_url):
  286. """上传成功后回写库,避免下次任务重复拉取同一批。"""
  287. if row_id is None or not img_url:
  288. return
  289. sql = (
  290. "UPDATE `retrieve_process_lowprice_product` "
  291. "SET `snapshot_url` = %s WHERE `id` = %s AND `platform` = %s"
  292. )
  293. n = self.db_online.execute(sql, (img_url, row_id, self.platform))
  294. if n <= 0:
  295. logger.warning("snapshot_url 回写未影响行数 id=%s platform=%s", row_id, self.platform)
  296. def search(self, data_list):
  297. self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
  298. self.driver.wait.doc_loaded(timeout=10)
  299. time.sleep(2)
  300. if not self._is_logged_in():
  301. if not self.login():
  302. return False
  303. ok, fail = 0, 0
  304. for data in data_list:
  305. row_id = data.get("id")
  306. link_url = data.get("link_url")
  307. if not link_url:
  308. logger.warning("缺少 link_url,跳过 id=%s", row_id)
  309. fail += 1
  310. continue
  311. img_url = self.get_snapshot(link_url, row_id)
  312. if img_url:
  313. self._save_snapshot_url(row_id, img_url)
  314. ok += 1
  315. else:
  316. fail += 1
  317. logger.info("快照任务结束 成功=%s 失败=%s 总计=%s", ok, fail, len(data_list))
  318. return ok > 0
  319. def run(self):
  320. date_str = time.strftime("%Y-%m-%d")
  321. sql = """
  322. SELECT `id`,`link_url` FROM `retrieve_process_lowprice_product`
  323. WHERE `platform`=%s AND `snapshot_url` IS NULL AND `scrape_date`=%s
  324. LIMIT 100 """
  325. data_list = self.db_online.select_data(sql, (self.platform, date_str))
  326. if not data_list:
  327. logger.info("当前不需要更新快照")
  328. return
  329. try:
  330. self.init_browser()
  331. self.search(data_list)
  332. except Exception as e:
  333. logger.exception("运行异常: %s", e)
  334. finally:
  335. self._quit_browser()
# Script entry point: build the crawler and process today's pending rows once.
if __name__ == "__main__":
    YaoShiBangSnapshot().run()