# jd_captcha.py
"""
JD (Jingdong) slider captcha: solver-service recognition + track generation + dragging.
Reused in several places: from spiders.jd.jd_captcha import handle_jd_slider_captcha, JdCaptchaHandler
"""
import base64
import json
import math
import random
import time
from contextlib import contextmanager

import requests
from PIL import Image
  12. DEFAULT_CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
  13. DEFAULT_SCREENSHOT_PATH = "./element_screenshot.png"
  14. JFBYM_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
  15. CAPTCHA_MODAL_XPATH = "xpath=//div[@id='captcha_modal']"
  16. CAPTCHA_IMG_XPATH = 'xpath://img[@id="main_img"]'
  17. SLIDER_IMG_XPATH = "xpath://img[@class='move-img']"
  18. @contextmanager
  19. def pause_page_listen(page, clear=True):
  20. """处理验证码时暂停网络监听,避免与滑块拖动抢 CDP 资源(auto_crawl 场景)。"""
  21. listen = getattr(page, "listen", None)
  22. was_listening = bool(listen and getattr(listen, "listening", False))
  23. if was_listening:
  24. listen.pause(clear=clear)
  25. try:
  26. yield
  27. finally:
  28. if was_listening:
  29. listen.resume()
  30. def simulate(target_x, seed=None):
  31. while 1:
  32. x_seq = simulate_x(target_x, seed)
  33. if len(x_seq) < 50 and target_x > 150:
  34. continue
  35. t_seq = _generate_t(x_seq, seed)
  36. y_seq = _generate_y(x_seq, t_seq, seed)
  37. result = []
  38. for x, y, t in zip(x_seq, y_seq, t_seq):
  39. result.append([x, y, t])
  40. return result
  41. def _generate_t(x_seq, seed=None):
  42. if seed is not None:
  43. random.seed(seed + 9999)
  44. n = len(x_seq)
  45. t_seq = [0] * n
  46. for i in range(1, n):
  47. dx = x_seq[i] - x_seq[i - 1]
  48. is_pause = dx == 0
  49. if i == 1:
  50. t_seq[i] = random.randint(50, 95)
  51. elif is_pause:
  52. if random.random() < 0.20:
  53. t_seq[i] = random.choice([16, 24, 33, 40, 58, 71, 74, 90, 96, 150, 200, 264])
  54. else:
  55. t_seq[i] = random.choices([6, 7, 8, 9, 10], weights=[3, 25, 45, 22, 5])[0]
  56. else:
  57. r = random.random()
  58. if r < 0.90:
  59. t_seq[i] = random.choices([6, 7, 8, 9, 10], weights=[3, 25, 45, 22, 5])[0]
  60. elif r < 0.95:
  61. t_seq[i] = random.choice([6, 10])
  62. else:
  63. t_seq[i] = random.choice([16, 24, 25, 33, 40, 58, 71, 74, 90, 96])
  64. return t_seq
  65. def _generate_y(x_seq, t_seq, seed=None):
  66. if seed is not None:
  67. random.seed(seed + 19999)
  68. n = len(x_seq)
  69. y_seq = [0] * n
  70. current_y = 0
  71. direction = 0
  72. dir_remaining = 0
  73. cooldown = 0
  74. for i in range(1, n):
  75. is_abnormal_t = t_seq[i] > 10
  76. if dir_remaining > 0:
  77. dir_remaining -= 1
  78. if dir_remaining == 0:
  79. direction = 0
  80. cooldown = random.randint(4, 8)
  81. elif cooldown > 0:
  82. cooldown -= 1
  83. else:
  84. triggered = False
  85. if is_abnormal_t and random.random() < 0.40:
  86. triggered = True
  87. elif random.random() < 0.025:
  88. triggered = True
  89. if triggered:
  90. if current_y >= 4:
  91. direction = random.choices([-1, 1], weights=[85, 15])[0]
  92. elif current_y <= -4:
  93. direction = random.choices([-1, 1], weights=[15, 85])[0]
  94. else:
  95. direction = random.choice([-1, 1])
  96. dir_remaining = random.choices([1, 2, 3, 4], weights=[35, 35, 20, 10])[0]
  97. current_y += direction
  98. y_seq[i] = current_y
  99. return y_seq
  100. def simulate_x(target_x, seed=None):
  101. if seed is not None:
  102. random.seed(seed)
  103. seq = [0]
  104. step = 1
  105. x = 0
  106. phase = "accelerating"
  107. accel_threshold = target_x * 0.2
  108. cruise_threshold = target_x * 0.5
  109. while x < target_x:
  110. remaining = target_x - x
  111. if phase == "accelerating":
  112. if step >= 9 or x >= accel_threshold:
  113. phase = "cruising"
  114. continue
  115. elif phase == "cruising":
  116. if x >= cruise_threshold:
  117. phase = "decelerating"
  118. continue
  119. elif phase == "decelerating":
  120. if remaining <= 6:
  121. phase = "fine_tuning"
  122. continue
  123. if phase == "accelerating":
  124. delta = random.choices([-1, 0, 1, 2, 3], weights=[5, 10, 30, 35, 20])[0]
  125. step = _clamp(step + delta, 1, 8)
  126. elif phase == "cruising":
  127. if step <= 1:
  128. delta = random.choices([0, 1, 2], weights=[12, 55, 33])[0]
  129. elif step >= 8:
  130. delta = random.choices([-2, -1, 0], weights=[20, 50, 30])[0]
  131. else:
  132. delta = random.choices([-2, -1, 0, 1, 2], weights=[5, 22, 50, 18, 5])[0]
  133. step = _clamp(step + delta, 0, 7)
  134. elif phase == "decelerating":
  135. remaining_ratio = remaining / target_x
  136. max_step = max(3, int(2.5 + 5.5 * remaining_ratio / 0.35))
  137. if remaining_ratio > 0.18:
  138. if step <= 1:
  139. delta = random.choices([0, 1, 2], weights=[12, 50, 38])[0]
  140. elif step >= max_step:
  141. delta = random.choices([-2, -1, 0], weights=[25, 45, 30])[0]
  142. else:
  143. delta = random.choices([-2, -1, 0, 1, 2], weights=[8, 22, 46, 19, 5])[0]
  144. else:
  145. if step <= 0:
  146. delta = random.choices([1, 2], weights=[65, 35])[0]
  147. elif step == 1:
  148. delta = random.choices([-1, 0, 1], weights=[18, 52, 30])[0]
  149. elif step >= max_step:
  150. delta = random.choices([-2, -1, 0], weights=[25, 45, 30])[0]
  151. else:
  152. delta = random.choices([-2, -1, 0, 1], weights=[10, 30, 45, 15])[0]
  153. step = _clamp(step + delta, 0, max_step)
  154. if step == 0 and len(seq) >= 2 and seq[-1] == seq[-2]:
  155. step = 1
  156. elif phase == "fine_tuning":
  157. if remaining <= 0:
  158. break
  159. step = random.choices([0, 1, 2], weights=[10, 70, 20])[0]
  160. step = min(step, remaining)
  161. if step == 0 and len(seq) >= 2 and seq[-1] == seq[-2]:
  162. step = 1 if remaining >= 1 else 0
  163. x += step
  164. if x > target_x:
  165. x = target_x
  166. seq.append(x)
  167. return seq
  168. def _clamp(v, lo, hi):
  169. return max(lo, min(hi, v))
  170. class JdCaptchaHandler:
  171. """京东滑块验证码处理器,绑定 DrissionPage 的 ChromiumPage / Tab。"""
  172. def __init__(self, page, token=None, screenshot_path=None):
  173. self.page = page
  174. self.token = token or DEFAULT_CAPTCHA_TOKEN
  175. self.screenshot_path = screenshot_path or DEFAULT_SCREENSHOT_PATH
  176. @staticmethod
  177. def _safe_float(value, default=0.0):
  178. try:
  179. return float(value)
  180. except (TypeError, ValueError):
  181. return default
  182. def _run_js_safe(self, target, script, default=None):
  183. try:
  184. if hasattr(target, "run_js"):
  185. return target.run_js(script)
  186. if hasattr(target, "run_script"):
  187. return target.run_script(script)
  188. except Exception:
  189. return default
  190. return default
  191. def _get_device_pixel_ratio(self):
  192. ratio = self._run_js_safe(self.page, "return window.devicePixelRatio || 1;", default=1)
  193. ratio = self._safe_float(ratio, 1.0)
  194. return ratio if ratio > 0 else 1.0
  195. def _get_image_width(self, image_path):
  196. try:
  197. with Image.open(image_path) as img:
  198. return float(img.width)
  199. except Exception:
  200. return 0.0
  201. def _get_ele_css_width(self, ele):
  202. width = self._run_js_safe(ele, "return this.getBoundingClientRect().width || 0;", default=0)
  203. width = self._safe_float(width, 0.0)
  204. if width > 0:
  205. return width
  206. try:
  207. size = ele.rect.size
  208. if isinstance(size, (tuple, list)) and len(size) >= 1:
  209. return self._safe_float(size[0], 0.0)
  210. except Exception:
  211. pass
  212. return 0.0
  213. def _normalize_slider_distance(self, raw_distance, capt_ele, slider_ele, screenshot_path):
  214. distance = max(0.0, self._safe_float(raw_distance, 0.0))
  215. capt_css_width = self._get_ele_css_width(capt_ele)
  216. screenshot_width = self._get_image_width(screenshot_path)
  217. natural_width = self._safe_float(
  218. self._run_js_safe(capt_ele, "return this.naturalWidth || 0;", default=0),
  219. 0.0,
  220. )
  221. if capt_css_width > 0 and screenshot_width > 0:
  222. return distance * (capt_css_width / screenshot_width)
  223. if capt_css_width > 0 and natural_width > 0:
  224. return distance * (capt_css_width / natural_width)
  225. dpr = self._get_device_pixel_ratio()
  226. if dpr > 1.0:
  227. return distance / dpr
  228. return distance
  229. def generate_human_track(self, distance):
  230. try:
  231. distance = float(distance)
  232. except (TypeError, ValueError):
  233. return []
  234. if distance <= 0 or not math.isfinite(distance):
  235. return []
  236. tracks = []
  237. current = 0
  238. mid = distance * 0.7
  239. t = 0.2
  240. v = 0
  241. move_points = []
  242. while current < mid:
  243. a = random.uniform(2, 4)
  244. v0 = v
  245. v = v0 + a * t
  246. move = v0 * t + 0.5 * a * t * t
  247. current += move
  248. move_points.append(move)
  249. while current < distance:
  250. a = -random.uniform(0.5, 1.5)
  251. v0 = v
  252. v = v0 + a * t
  253. if v < 0.5:
  254. v = 0.5
  255. move = v0 * t + 0.5 * a * t * t
  256. current += move
  257. move_points.append(move)
  258. total_points = len(move_points)
  259. for i, move in enumerate(move_points):
  260. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  261. if i < total_points * 0.3:
  262. duration = random.uniform(0.01, 0.03)
  263. elif i > total_points * 0.7:
  264. duration = random.uniform(0.03, 0.08)
  265. else:
  266. duration = random.uniform(0.02, 0.05)
  267. if random.random() < 0.05:
  268. duration += random.uniform(0.05, 0.1)
  269. tracks.append((move, y_offset, duration))
  270. if random.random() < 0.7:
  271. tracks.append((-random.randint(1, 3), 0, 0.05))
  272. return tracks
  273. def simulate_slider_drag(self, slider_element, target_distance):
  274. if target_distance <= 0:
  275. return
  276. self.page.actions.move_to(slider_element).hold()
  277. for offset_x, offset_y, duration in self.generate_human_track(target_distance):
  278. self.page.actions.move(offset_x, offset_y, duration=duration / 1000)
  279. self.page.actions.release()
  280. def verify(self, type_num, image_path=None):
  281. """调用云码平台:type_num=1 坐标点选,2 滑块距离。"""
  282. image_path = image_path or self.screenshot_path
  283. with open(image_path, "rb") as f:
  284. image_b64 = base64.b64encode(f.read()).decode()
  285. if type_num == 1:
  286. data = {
  287. "token": self.token,
  288. "type": "30332",
  289. "direction": "top",
  290. "click_num": 3,
  291. "image": image_b64,
  292. }
  293. else:
  294. data = {
  295. "token": self.token,
  296. "type": "22222",
  297. "image": image_b64,
  298. }
  299. response = requests.post(
  300. JFBYM_API_URL,
  301. headers={"Content-Type": "application/json"},
  302. json=data,
  303. timeout=30,
  304. ).json()
  305. print(response)
  306. return response["data"]["data"]
  307. def handle_slider(
  308. self,
  309. capt_ele=None,
  310. slider_ele=None,
  311. drag_offset=1.5,
  312. inject_track_js=True,
  313. ):
  314. """
  315. 完整滑块流程:截图 -> 打码 -> 注入轨迹 -> 拖动。
  316. 成功返回 True,失败返回 False。
  317. """
  318. capt_ele = capt_ele or self.page.ele(CAPTCHA_IMG_XPATH, timeout=2)
  319. if not capt_ele:
  320. print("未找到验证码背景图")
  321. return False
  322. capt_ele.get_screenshot(self.screenshot_path)
  323. distance = self.verify(2)
  324. try:
  325. distance = float(distance)
  326. except (TypeError, ValueError):
  327. print(f"滑块距离格式异常:{distance}")
  328. return False
  329. print(f"滑块距离(接口原始值):{distance}")
  330. slider_ele = slider_ele or self.page.ele(SLIDER_IMG_XPATH, timeout=2)
  331. if not slider_ele:
  332. print("未找到滑块")
  333. return False
  334. drag_distance = self._normalize_slider_distance(
  335. distance,
  336. capt_ele=capt_ele,
  337. slider_ele=slider_ele,
  338. screenshot_path=self.screenshot_path,
  339. )
  340. drag_px = max(0.0, float(drag_distance) - drag_offset)
  341. if inject_track_js:
  342. result = simulate(math.ceil(int(drag_distance)))
  343. self.page.run_js("window.xxxll = {};".format(result))
  344. time.sleep(3)
  345. self.simulate_slider_drag(slider_ele, drag_px)
  346. return True
  347. def has_captcha_modal(self):
  348. return bool(self.page.ele(CAPTCHA_MODAL_XPATH, timeout=1))
  349. def has_moveslide_modal(self):
  350. capt_cha = "xpath://img[@class='move-img']"
  351. return bool(self.page.ele(capt_cha, timeout=1))
  352. def _wait_for_slider(self, rounds=5):
  353. if self.has_moveslide_modal():
  354. return True
  355. for _ in range(rounds):
  356. time.sleep(1)
  357. if self.has_moveslide_modal():
  358. return True
  359. return False
  360. def handle_slider_until_gone(self, max_attempts=3, wait_after=2, slider_wait_rounds=5, **handle_kwargs):
  361. """
  362. 处理滑块并在每次处理后检查验证码是否仍在页面。
  363. 验证码消失返回 True;达到 max_attempts 仍存在返回 False。
  364. """
  365. if not self.has_captcha_modal():
  366. return True
  367. for attempt in range(1, max_attempts + 1):
  368. print(f"验证码处理 第 {attempt}/{max_attempts} 次")
  369. if not self._wait_for_slider(slider_wait_rounds):
  370. print("验证码弹窗在,但滑块元素未出现(可能非滑块类型)")
  371. if attempt >= max_attempts:
  372. return False
  373. time.sleep(wait_after)
  374. continue
  375. ok = self.handle_slider(**handle_kwargs)
  376. if not ok:
  377. print("本次滑块处理失败")
  378. else:
  379. time.sleep(wait_after)
  380. if not self.has_captcha_modal():
  381. print("验证码已消失")
  382. return True
  383. print("验证码仍在页面")
  384. if attempt >= max_attempts:
  385. break
  386. time.sleep(wait_after)
  387. if self.has_captcha_modal():
  388. print(f"验证码处理失败,已尝试 {max_attempts} 次,弹窗仍在")
  389. return False
  390. return True
  391. def handle_jd_slider_captcha(
  392. page,
  393. token=None,
  394. screenshot_path=None,
  395. max_attempts=3,
  396. wait_after=2,
  397. slider_wait_rounds=5,
  398. pause_listen=True,
  399. pause_listen_clear=False,
  400. **kwargs,
  401. ):
  402. """
  403. 便捷入口:处理当前页面的京东滑块验证码,最多重试 max_attempts 次。
  404. 返回 True:无需验证码或已成功通过;False:处理失败或验证码仍在。
  405. pause_listen:auto_crawl 等已开启 listen 的场景建议 True。
  406. pause_listen_clear:暂停时是否清空监听队列;采集中应为 False,避免丢掉首屏 wareList。
  407. """
  408. handler = JdCaptchaHandler(page, token=token, screenshot_path=screenshot_path)
  409. if not handler.has_captcha_modal():
  410. return True
  411. if pause_listen:
  412. with pause_page_listen(page, clear=pause_listen_clear):
  413. return handler.handle_slider_until_gone(
  414. max_attempts=max_attempts,
  415. wait_after=wait_after,
  416. slider_wait_rounds=slider_wait_rounds,
  417. **kwargs,
  418. )
  419. return handler.handle_slider_until_gone(
  420. max_attempts=max_attempts,
  421. wait_after=wait_after,
  422. slider_wait_rounds=slider_wait_rounds,
  423. **kwargs,
  424. )