# ysb_snapshot_crawl_bak.py (13 KB)
  1. import random
  2. import signal
  3. import socket
  4. import sys
  5. import time
  6. import base64
  7. import math
  8. import requests
  9. from commons.conn_mysql import MySQLPoolOnline
  10. from DrissionPage import ChromiumPage, ChromiumOptions
  11. from commons.Logger import logger
  12. from oss_upload.oss_upload import AliyunOSSUploader
  13. from commons.config import YSB_ACCOUNT
  14. CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
  15. CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
  16. SLIDER_OFFSET_FIX = 10
  17. chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
  18. class YaoShiBangSnapshot:
  19. def __init__(self, product=None):
  20. self.product = product
  21. self.driver = None
  22. self.account_name = "ysbang_1"
  23. self.platform = 5
  24. self.db_online = MySQLPoolOnline()
  25. self.ossuploader = AliyunOSSUploader()
  26. self._register_signal_handler()
  27. def _register_signal_handler(self):
  28. def handler(signum, frame):
  29. logger.info("收到退出信号,正在关闭浏览器...")
  30. self._quit_browser()
  31. sys.exit(0)
  32. signal.signal(signal.SIGINT, handler)
  33. if hasattr(signal, "SIGTERM"):
  34. signal.signal(signal.SIGTERM, handler)
  35. def _quit_browser(self):
  36. if self.driver:
  37. try:
  38. self.driver.quit()
  39. except Exception:
  40. pass
  41. self.driver = None
  42. @staticmethod
  43. def _get_free_port():
  44. """获取一个当前可用的本地端口,供 Chrome 调试使用。"""
  45. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  46. s.bind(("127.0.0.1", 0))
  47. return s.getsockname()[1]
  48. def init_browser(self):
  49. co = ChromiumOptions().set_browser_path(chrome_path)
  50. debug_port = self._get_free_port()
  51. co.set_user_data_path(f"./spiders/yaoshibang/{self.account_name}")
  52. co.set_local_port(debug_port)
  53. co.set_argument(f"--remote-debugging-port={debug_port}")
  54. co.set_argument("--remote-debugging-address=127.0.0.1")
  55. # co.set_argument("--disable-blink-features=AutomationControlled")
  56. co.set_argument("--disable-dev-shm-usage")
  57. co.set_argument("--start-maximized")
  58. co.set_argument("--no-first-run") # 避免首次运行弹窗
  59. co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
  60. self.driver = ChromiumPage(co)
  61. def _solve_slider_captcha(self):
  62. """检测并处理易盾滑块验证码,成功返回 True。"""
  63. self.driver.wait.doc_loaded()
  64. time.sleep(2)
  65. yidun = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  66. if not yidun:
  67. return True
  68. logger.info("检测到滑块验证码,开始处理")
  69. jpg_bytes = yidun.get_screenshot(as_bytes="jpg")
  70. distance = self._call_captcha_api(jpg_bytes)
  71. if distance is None:
  72. logger.error("验证码识别失败")
  73. return False
  74. logger.info("滑块距离: %s", distance)
  75. slider = self.driver.ele(
  76. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  77. )
  78. if not slider:
  79. logger.error("未找到滑块元素")
  80. return False
  81. try:
  82. drag_distance = float(distance) + SLIDER_OFFSET_FIX
  83. except (TypeError, ValueError):
  84. logger.error("滑块距离非数字: %r", distance)
  85. return False
  86. if not math.isfinite(drag_distance) or drag_distance <= 0:
  87. logger.error("滑块距离无效: %s", drag_distance)
  88. return False
  89. self._simulate_slider_drag(slider, drag_distance)
  90. time.sleep(3)
  91. return True
  92. def _call_captcha_api(self, image_bytes):
  93. """调用云码平台识别滑块距离,失败返回 None。"""
  94. try:
  95. b64 = base64.b64encode(image_bytes).decode()
  96. resp = requests.post(
  97. CAPTCHA_API_URL,
  98. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  99. headers={"Content-Type": "application/json"},
  100. timeout=15,
  101. ).json()
  102. logger.info("验证码 API 返回: %s", resp)
  103. if not isinstance(resp, dict):
  104. return None
  105. data = resp.get("data")
  106. if isinstance(data, dict):
  107. dist = data.get("data")
  108. else:
  109. dist = data
  110. if dist is None:
  111. logger.error("验证码 API 未返回距离字段: %s", resp)
  112. return None
  113. try:
  114. d = float(dist)
  115. except (TypeError, ValueError):
  116. logger.error("验证码距离无法解析为数字: %r", dist)
  117. return None
  118. if not math.isfinite(d):
  119. logger.error("验证码距离非有限数值: %r", dist)
  120. return None
  121. return d
  122. except Exception as e:
  123. logger.exception("验证码 API 调用失败: %s", e)
  124. return None
  125. @staticmethod
  126. def _generate_human_track(distance):
  127. try:
  128. distance = float(distance)
  129. except (TypeError, ValueError):
  130. return []
  131. if distance <= 0 or not math.isfinite(distance):
  132. return []
  133. tracks = []
  134. current = 0
  135. mid = distance * 0.7
  136. t = 0.2
  137. v = 0
  138. move_points = []
  139. while current < mid:
  140. a = random.uniform(2, 4)
  141. v0 = v
  142. v = v0 + a * t
  143. move = v0 * t + 0.5 * a * t * t
  144. current += move
  145. move_points.append(move)
  146. while current < distance:
  147. a = -random.uniform(0.5, 1.5)
  148. v0 = v
  149. v = v0 + a * t
  150. if v < 0.5:
  151. v = 0.5
  152. move = v0 * t + 0.5 * a * t * t
  153. current += move
  154. move_points.append(move)
  155. total_points = len(move_points)
  156. for i, move in enumerate(move_points):
  157. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  158. if i < total_points * 0.3:
  159. duration = random.uniform(0.01, 0.03)
  160. elif i > total_points * 0.7:
  161. duration = random.uniform(0.03, 0.08)
  162. else:
  163. duration = random.uniform(0.02, 0.05)
  164. if random.random() < 0.05:
  165. duration += random.uniform(0.05, 0.1)
  166. tracks.append((move, y_offset, duration))
  167. if random.random() < 0.7:
  168. tracks.append((-random.randint(1, 3), 0, 0.05))
  169. return tracks
  170. def _simulate_slider_drag(self, slider_element, target_distance):
  171. if target_distance <= 0:
  172. logger.warning("滑块目标距离无效: %s", target_distance)
  173. return
  174. self.driver.actions.move_to(slider_element).hold()
  175. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  176. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  177. self.driver.actions.release()
  178. def _is_logged_in(self):
  179. # 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
  180. title = self.driver.ele(
  181. "xpath=//span[@class='logout']",
  182. timeout=5,
  183. )
  184. return bool(title)
  185. def login(self):
  186. logger.info("开始登录药师帮")
  187. self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
  188. self.driver.wait.doc_loaded(timeout=10)
  189. time.sleep(2)
  190. input_name = self.driver.ele("xpath://input[@name='userAccount']", timeout=5)
  191. if not input_name:
  192. logger.error("未找到账号输入框")
  193. return False
  194. input_name.input(YSB_ACCOUNT["account"])
  195. time.sleep(random.uniform(1.5, 2.5))
  196. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  197. if not input_pass:
  198. logger.error("未找到密码输入框")
  199. return False
  200. input_pass.input(YSB_ACCOUNT["password"])
  201. time.sleep(random.uniform(1.5, 2.5))
  202. login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
  203. if not login_btn:
  204. logger.error("未找到登录按钮")
  205. return False
  206. login_btn.click()
  207. time.sleep(3)
  208. for i in range(3):
  209. self._solve_slider_captcha()
  210. time.sleep(3)
  211. if self._is_logged_in():
  212. logger.info("登录成功")
  213. return True
  214. logger.error("登录后未检测到目标店铺名,登录可能失败")
  215. return False
  216. def get_snapshot(self, detail_url, row_id):
  217. self.driver.get(detail_url, timeout=15)
  218. self.driver.wait.doc_loaded(timeout=10)
  219. time.sleep(2)
  220. self._dismiss_popup_before_screenshot()
  221. try:
  222. # jpg_bytes = ele.get_screenshot(as_bytes="jpg")
  223. jpg_bytes = self.driver.get_screenshot(as_bytes="jpg")
  224. if not jpg_bytes:
  225. logger.warning("截图为空 row_id=%s", row_id)
  226. return ""
  227. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(row_id))
  228. except Exception:
  229. logger.exception("截图或 OSS 上传失败 row_id=%s url=%s", row_id, detail_url)
  230. return ""
  231. if not img_url:
  232. logger.warning("OSS 未返回有效地址 row_id=%s", row_id)
  233. return ""
  234. logger.info("截图上传完成 row_id=%s url=%s", row_id, img_url)
  235. time.sleep(random.uniform(2, 3))
  236. return img_url
  237. def _dismiss_popup_before_screenshot(self):
  238. """截图前关闭或隐藏营销弹窗,避免遮挡。"""
  239. close_locs = [
  240. "xpath=//div[contains(@class,'dialog')]//i[contains(@class,'close')]",
  241. "xpath=//div[contains(@class,'popup')]//i[contains(@class,'close')]",
  242. "xpath=//div[contains(@class,'modal')]//i[contains(@class,'close')]",
  243. "xpath=//button[contains(@class,'close')]",
  244. "xpath=//span[text()='×']",
  245. "xpath=//*[contains(text(),'智能采购')]/ancestor::div[1]//*[contains(@class,'close')]",
  246. ]
  247. for loc in close_locs:
  248. try:
  249. btn = self.driver.ele(loc, timeout=0.5)
  250. if btn:
  251. btn.click()
  252. time.sleep(0.2)
  253. except Exception:
  254. pass
  255. try:
  256. # 兜底:隐藏常见高层弹窗和遮罩
  257. self.driver.run_js(
  258. """
  259. const sels = [
  260. '[class*="modal"]',
  261. '[class*="popup"]',
  262. '[class*="dialog"]',
  263. '[class*="mask"]',
  264. '[class*="overlay"]'
  265. ];
  266. for (const s of sels) {
  267. document.querySelectorAll(s).forEach(el => {
  268. const style = getComputedStyle(el);
  269. const z = parseInt(style.zIndex || '0', 10);
  270. if (z >= 999 && style.display !== 'none') {
  271. el.style.display = 'none';
  272. }
  273. });
  274. }
  275. document.body.style.overflow = 'auto';
  276. """
  277. )
  278. time.sleep(0.2)
  279. except Exception:
  280. pass
  281. def _save_snapshot_url(self, row_id, img_url):
  282. """上传成功后回写库,避免下次任务重复拉取同一批。"""
  283. if row_id is None or not img_url:
  284. return
  285. sql = (
  286. "UPDATE `retrieve_process_lowprice_product` "
  287. "SET `snapshot_url` = %s WHERE `id` = %s AND `platform` = %s"
  288. )
  289. n = self.db_online.execute(sql, (img_url, row_id, self.platform))
  290. if n <= 0:
  291. logger.warning("snapshot_url 回写未影响行数 id=%s platform=%s", row_id, self.platform)
  292. def search(self, data_list):
  293. self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
  294. self.driver.wait.doc_loaded(timeout=10)
  295. time.sleep(2)
  296. if not self._is_logged_in():
  297. if not self.login():
  298. return False
  299. ok, fail = 0, 0
  300. for data in data_list:
  301. row_id = data.get("id")
  302. link_url = data.get("link_url")
  303. if not link_url:
  304. logger.warning("缺少 link_url,跳过 id=%s", row_id)
  305. fail += 1
  306. continue
  307. img_url = self.get_snapshot(link_url, row_id)
  308. if img_url:
  309. # self._save_snapshot_url(row_id, img_url)
  310. ok += 1
  311. else:
  312. fail += 1
  313. time.sleep(2)
  314. logger.info("快照任务结束 成功=%s 失败=%s 总计=%s", ok, fail, len(data_list))
  315. return ok > 0
  316. def run(self):
  317. date_str = time.strftime("%Y-%m-%d")
  318. sql = """
  319. SELECT `id`, `link_url`
  320. FROM `retrieve_process_lowprice_product`
  321. WHERE `platform` = %s
  322. AND `snapshot_url` = ""
  323. AND `scrape_date` = %s LIMIT 100 """
  324. data_list = self.db_online.select_data(sql, (self.platform, date_str))
  325. if not data_list:
  326. logger.info("当前不需要更新快照")
  327. return
  328. try:
  329. self.init_browser()
  330. self.search(data_list)
  331. except Exception as e:
  332. logger.exception("运行异常: %s", e)
  333. finally:
  334. self._quit_browser()
  335. if __name__ == "__main__":
  336. YaoShiBangSnapshot().run()