# jd_auto_crawl_snap.py
  1. import random
  2. import re
  3. import signal
  4. import socket
  5. import sys
  6. import time
  7. from decimal import Decimal, InvalidOperation
  8. from urllib.parse import quote
  9. from DrissionPage import ChromiumPage, ChromiumOptions
  10. import json
  11. import hashlib
  12. from commons.Logger import get_spider_logger
  13. from commons.conn_mysql import MySQLPoolOnline
  14. from pipelines.drug_pipelines import DrugPipeline
  15. from commons.feishu_webhook import send_text
  16. from spiders.jd.jd_captcha import handle_jd_slider_captcha
  17. from oss_upload.oss_upload import AliyunOSSUploader
# Module-wide logger obtained from the project's logging helper.
logger = get_spider_logger("jd")
# Path of the Chrome executable handed to ChromiumOptions.set_browser_path.
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Timeouts passed to driver.listen.steps() by fetch_items_once
# (presumably seconds -- confirm against the DrissionPage version in use).
FETCH_TIMEOUT_FIRST = 5
FETCH_TIMEOUT_SCROLL = 6
# Defaults for clear_listen_buffer: maximum drain rounds and the per-round timeout.
LISTEN_CLEAR_ROUNDS = 3
LISTEN_CLEAR_TIMEOUT = 0.45
# Whether the "next page" button is inside the viewport (deliberately lenient bounds).
_JS_NEXT_BTN_IN_VIEWPORT = """
var el = arguments[0];
if (!el) return false;
var r = el.getBoundingClientRect();
var h = window.innerHeight || document.documentElement.clientHeight || 800;
var w = window.innerWidth || document.documentElement.clientWidth || 1200;
return r.bottom > 80 && r.top < h - 40 && r.right > 0 && r.left < w;
"""
  33. class JdCrawlerV2:
  34. def __init__(self, drug_dict=None):
  35. self.driver = None
  36. self.register_signal_handler()
  37. self.db = MySQLPoolOnline()
  38. self.ip = None
  39. self.account_name = None
  40. self.login_username = None
  41. self.login_password = None
  42. self.platform = 2
  43. self.pipeline = DrugPipeline("jd")
  44. self.task_dict = drug_dict or {}
  45. self.ossuploader = AliyunOSSUploader()
  46. self.start_page = 1
  47. self.end_page = 1
  48. if self.task_dict:
  49. self.get_product_data()
  50. self.success = True
  51. self.is_no_prodcut = 0
  52. def get_product_data(self):
  53. self.task_id = self.task_dict["id"]
  54. self.company_id = self.task_dict["company_id"]
  55. self.product = self.task_dict["product_name"]
  56. self.product_desc = self.task_dict.get("product_specs", "")
  57. self.brand = self.task_dict.get("product_brand", "")
  58. self.product_keyword = self.task_dict.get("product_keyword", "")
  59. self.collect_task_id = self.task_dict.get("collect_task_id", "")
  60. self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
  61. self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
  62. self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
  63. self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
  64. self.account_id = self.task_dict.get("collect_equipment_account_id", "")
  65. self.collect_region_id = self.task_dict.get("collect_region_id", "")
  66. self.collect_round = self.task_dict.get("collect_round", 1)
  67. self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
  68. self.end_page = self._parse_page(self.task_dict.get("end_page"), 20)
  69. @staticmethod
  70. def _parse_page(value, default=1):
  71. try:
  72. page = int(value)
  73. return page if page >= 1 else default
  74. except (TypeError, ValueError):
  75. return default
  76. @staticmethod
  77. def _get_free_port():
  78. """获取一个当前可用的本地端口,供 Chrome 调试使用。"""
  79. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  80. s.bind(("127.0.0.1", 0))
  81. return s.getsockname()[1]
  82. def init_browser(self):
  83. co = ChromiumOptions().set_browser_path(chrome_path)
  84. debug_port = self._get_free_port()
  85. co.set_user_data_path(f"./spiders/jd/{self.account_name}")
  86. co.set_local_port(debug_port)
  87. co.set_argument(f"--remote-debugging-port={debug_port}")
  88. co.set_argument("--remote-debugging-address=127.0.0.1")
  89. # co.set_argument("--disable-blink-features=AutomationControlled")
  90. co.set_argument("--disable-dev-shm-usage")
  91. co.set_argument("--no-first-run") # 避免首次运行弹窗
  92. co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
  93. if self.ip:
  94. proxy = self.ip.strip()
  95. if not proxy.startswith(("http://", "https://")):
  96. proxy = f"http://{proxy}"
  97. co.set_argument(f"--proxy-server={proxy}")
  98. logger.info("启动浏览器: account=%s, debug_port=%s", self.account_name, debug_port)
  99. self.driver = ChromiumPage(co)
  100. self._listen_started = False
  101. def _start_listen(self):
  102. """登录完成后再开监听,避免干扰登录页/验证码拖动。"""
  103. if self._listen_started or not self.driver:
  104. return
  105. self.driver.listen.start("api?appid=search-pc-java")
  106. self._listen_started = True
  107. logger.info("已启动搜索接口监听")
  108. def register_signal_handler(self):
  109. def handler(signum, frame):
  110. print("\n⚠️ 程序退出")
  111. if self.driver:
  112. self.driver.quit()
  113. sys.exit(0)
  114. signal.signal(signal.SIGINT, handler)
  115. if hasattr(signal, "SIGTERM"):
  116. signal.signal(signal.SIGTERM, handler)
  117. def sleep(self, a, b):
  118. time.sleep(random.uniform(a, b))
  119. def _scroll_page_down(self, delta=900):
  120. self.driver.run_js(f"window.scrollBy(0, {int(delta)});")
  121. time.sleep(random.uniform(0.3, 0.6))
  122. def _scroll_next_into_view(self, el):
  123. if not el:
  124. return
  125. try:
  126. self.driver.run_js(
  127. "arguments[0].scrollIntoView({block:'center',behavior:'instant'});",
  128. el,
  129. )
  130. self.sleep(1, 2)
  131. except Exception as e:
  132. logger.warning("滚动到下一页按钮失败: %s", e)
  133. try:
  134. el.scroll.to_see()
  135. except Exception:
  136. pass
  137. def _get_scroll_info(self):
  138. return self.driver.run_js("""
  139. return {
  140. scrollY: window.scrollY || window.pageYOffset || 0,
  141. docH: Math.max(document.body.scrollHeight,
  142. document.documentElement.scrollHeight,
  143. document.body.offsetHeight),
  144. viewH: window.innerHeight || document.documentElement.clientHeight || 800
  145. };
  146. """)
  147. def _find_next_btn(self, timeout=0.3):
  148. try:
  149. return self.driver.ele("text=下一页", timeout=timeout)
  150. except Exception:
  151. return None
  152. def _is_next_btn_visible(self, btn):
  153. if not btn:
  154. return False
  155. try:
  156. return bool(self.driver.run_js(_JS_NEXT_BTN_IN_VIEWPORT, btn))
  157. except Exception:
  158. return False
  159. def _human_click(self, element):
  160. """在目标节点上触发 click,避免 move_to + 无目标 actions.click() 因布局位移点到商品链接触发详情页。"""
  161. if not element:
  162. return False
  163. try:
  164. self.sleep(0.8, 2.0)
  165. try:
  166. self.driver.run_js(
  167. "arguments[0].scrollIntoView({block:'center',behavior:'instant'});",
  168. element,
  169. )
  170. except Exception:
  171. pass
  172. self.sleep(0.2, 0.6)
  173. self.driver.run_js("arguments[0].click();", element)
  174. return True
  175. except Exception as e:
  176. logger.warning("点击失败: %s", e)
  177. try:
  178. element.click()
  179. return True
  180. except Exception:
  181. return False
  182. @staticmethod
  183. def _estimated_price(json_data):
  184. fp = json_data.get("finalPrice")
  185. if isinstance(fp, dict):
  186. return fp.get("estimatedPrice", "") or ""
  187. return ""
  188. def get_heshu(self, full_title):
  189. last_box = None
  190. last_bottle = None
  191. for match in re.finditer(r"(\d+)(盒|瓶)", full_title):
  192. if match.group(2) == '盒':
  193. last_box = match
  194. else: # 瓶
  195. last_bottle = match
  196. if last_box:
  197. return int(last_box.group(1))
  198. elif last_bottle:
  199. return int(last_bottle.group(1))
  200. else:
  201. return 1
  202. def _take_snapshot(self, upload_key, image_ele, max_retries=3):
  203. """在指定标签页截图并上传。"""
  204. for attempt in range(1, max_retries + 1):
  205. time.sleep(1)
  206. try:
  207. jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
  208. if not jpg_bytes:
  209. logger.warning(
  210. "截图为空 upload_key=%s attempt=%s/%s",
  211. upload_key, attempt, max_retries, )
  212. continue
  213. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
  214. except Exception:
  215. logger.exception(
  216. "截图或 OSS 上传失败 upload_key=%s attempt=%s/%s",
  217. upload_key, attempt, max_retries,
  218. )
  219. continue
  220. if not img_url:
  221. logger.warning(
  222. "OSS 未返回有效地址 upload_key=%s attempt=%s/%s",
  223. upload_key, attempt, max_retries,
  224. )
  225. continue
  226. logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
  227. time.sleep(random.uniform(1, 2))
  228. return img_url
  229. logger.warning("截图失败,已达最大重试次数 upload_key=%s", upload_key)
  230. return ""
  231. def get_heshu(self, full_title):
  232. last_box = None
  233. last_bottle = None
  234. for match in re.finditer(r"(\d+)(盒|瓶)", full_title):
  235. if match.group(2) == '盒':
  236. last_box = match
  237. else: # 瓶
  238. last_bottle = match
  239. if last_box:
  240. return int(last_box.group(1))
  241. elif last_bottle:
  242. return int(last_bottle.group(1))
  243. else:
  244. return 1
  245. def parse(self, ware_list):
  246. for w in ware_list:
  247. title = w.get("wareName", "")
  248. title = re.sub(r"<[^>]*>", "", title).strip()
  249. color = w.get("color", "")
  250. full_title = title + " " + color
  251. logger.info(full_title)
  252. if self.product not in full_title:
  253. self.is_no_prodcut += 1
  254. continue
  255. if self.brand not in full_title:
  256. self.is_no_prodcut += 1
  257. continue
  258. if self.product_desc:
  259. if self.product_desc in full_title:
  260. crawl_product_desc = self.product_desc
  261. else:
  262. crawl_product_desc = ""
  263. title = full_title
  264. else:
  265. crawl_product_desc = ""
  266. title = full_title
  267. if "+[" in title:
  268. continue
  269. self.is_no_prodcut = 0
  270. status = 1
  271. if self.product_keyword:
  272. search_keyword_list = self.product_keyword.split(",")
  273. for search_keyword in search_keyword_list:
  274. if search_keyword.strip() not in title:
  275. status = 0
  276. if status == 0:
  277. continue
  278. logger.info(f"商品名:{title}")
  279. sku_id = w.get("skuId", "")
  280. sales = w.get("totalSales", "")
  281. shop_id = w.get("shopId", "")
  282. shop_name = w.get("shopName", "")
  283. heshu_count = self.get_heshu(full_title)
  284. final_price = self._estimated_price(w)
  285. jd_price = w.get("jdPrice", "")
  286. item_url = f"https://item.jd.com/{sku_id}.html"
  287. low_price = final_price if final_price else jd_price
  288. # 获取列表页快照
  289. ele_xpath = "//div[@id='main_search_conter']//div[contains(@class,'_goodsContainer_')]/div[@data-sku=" + "'" + sku_id + "'" + "]"
  290. ele_screen = self.driver.ele("xpath=" + ele_xpath)
  291. upload_key = hashlib.md5(item_url.encode("utf-8")).hexdigest()
  292. snap_url = self._take_snapshot(upload_key, ele_screen)
  293. try:
  294. price = Decimal(str(low_price)).quantize(Decimal("0.00"))
  295. except (InvalidOperation, ValueError):
  296. price = Decimal("0.00")
  297. item_url = f"https://item.jd.com/{sku_id}.html"
  298. mall_url = f"https://mall.jd.com/index-{shop_id}.html?from=pc"
  299. # 字段与 yaofangwang_crawl 对齐;键顺序须与 commons.sql_data.RETRIEVE_SCRAPE_INSERT_COLUMNS 一致
  300. now_ts = time.strftime("%Y-%m-%d %H:%M:%S")
  301. product = {
  302. "platform": self.platform,
  303. "item_id": sku_id,
  304. "enterprise_id": self.company_id,
  305. "product_name": title,
  306. "spec": crawl_product_desc,
  307. "one_price": "",
  308. "detail_url": item_url,
  309. "shop_name": shop_name,
  310. "anonymous_store_name": "",
  311. "shop_url": mall_url,
  312. "city_name": "",
  313. "city_id": "",
  314. "province_name": "",
  315. "province_id": "",
  316. "shipment_city_name": "",
  317. "shipment_city_id": "",
  318. "shipment_province_name": "",
  319. "shipment_province_id": "",
  320. "area_info": "",
  321. "factory_name": "",
  322. "scrape_date": time.strftime("%Y-%m-%d"),
  323. "price": price,
  324. "sales": sales,
  325. "stock_count": "",
  326. "snapshot_url": snap_url,
  327. "approval_num": "",
  328. "produced_time": "",
  329. "deadline": "",
  330. "update_time": now_ts,
  331. "insert_time": now_ts,
  332. "number": heshu_count,
  333. "product_brand": self.brand or "",
  334. "collect_task_id": self.collect_task_id,
  335. "search_name": self.product,
  336. "company_name": "",
  337. "collect_config_info": json.dumps(
  338. {
  339. "sampling_cycle": self.sampling_cycle,
  340. "sampling_start_time": self.sampling_start_time,
  341. "sampling_end_time": self.sampling_end_time,
  342. }
  343. ),
  344. "account_id": self.account_id,
  345. "collect_region_id": self.collect_region_id,
  346. "collect_round": self.collect_round,
  347. "is_sold_out": 0
  348. }
  349. try:
  350. self.pipeline.storge_data(product)
  351. logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
  352. except Exception as e:
  353. logger.exception("写入数据库失败: %s", e)
  354. @staticmethod
  355. def _response_has_ware_list(data):
  356. if not isinstance(data, dict):
  357. return False
  358. wl = data.get("data", {}).get("wareList")
  359. return bool(wl)
  360. def fetch_items_once(self, timeout=FETCH_TIMEOUT_FIRST):
  361. n = 0
  362. for resp in self.driver.listen.steps(timeout=timeout):
  363. try:
  364. data = resp.response.body
  365. if not self._response_has_ware_list(data):
  366. continue
  367. ware_list = data["data"]["wareList"]
  368. self.parse(ware_list)
  369. n += len(ware_list)
  370. except Exception as e:
  371. logger.warning("解析监听响应失败: %s", e)
  372. return n
  373. def clear_listen_buffer(self, rounds=LISTEN_CLEAR_ROUNDS, timeout=LISTEN_CLEAR_TIMEOUT):
  374. try:
  375. for _ in range(rounds):
  376. resps = list(self.driver.listen.steps(timeout=timeout))
  377. if not resps:
  378. break
  379. logger.debug("监听缓冲已清空")
  380. except Exception as e:
  381. logger.debug("清空监听缓冲失败: %s", e)
  382. def collect_full_page_items(self, max_steps=10):
  383. """单次循环:边滑动边收数据,到底 / 看见「下一页」即停。"""
  384. n = self.fetch_items_once(timeout=FETCH_TIMEOUT_FIRST)
  385. stagnant = 0
  386. last_scroll_y = None
  387. for step in range(max_steps):
  388. next_btn = self._find_next_btn(timeout=0.3)
  389. if self._is_next_btn_visible(next_btn):
  390. n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
  391. return n, next_btn
  392. info = self._get_scroll_info()
  393. scroll_y = info["scrollY"]
  394. doc_h = info["docH"]
  395. view_h = info["viewH"]
  396. at_bottom = (scroll_y + view_h >= doc_h - 20)
  397. if last_scroll_y is not None and abs(scroll_y - last_scroll_y) < 8:
  398. stagnant += 1
  399. else:
  400. stagnant = 0
  401. last_scroll_y = scroll_y
  402. if at_bottom and stagnant >= 2:
  403. n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
  404. next_btn = self._find_next_btn(timeout=2)
  405. if next_btn:
  406. self._scroll_next_into_view(next_btn)
  407. return n, next_btn
  408. logger.info("已到页面底部且未发现下一页,停止滑动")
  409. return n, None
  410. self._scroll_page_down(random.randint(400, 800))
  411. if random.random() < 0.15:
  412. self.driver.run_js(f"window.scrollBy(0, -{random.randint(60, 140)})")
  413. self.sleep(0.5, 1.5)
  414. if step % 3 == 2:
  415. n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
  416. n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
  417. next_btn = self._find_next_btn(timeout=3)
  418. if next_btn and not self._is_next_btn_visible(next_btn):
  419. self._scroll_next_into_view(next_btn)
  420. return n, next_btn
  421. def get_account(self):
  422. sql_account = """
  423. SELECT *
  424. FROM `retrieve_collect_equipment_account`
  425. WHERE `id` = %s
  426. and `status` = 0
  427. """
  428. account_list = self.db.select_data(sql_account, self.account_id)
  429. if not account_list:
  430. return False
  431. account_dict = account_list[0]
  432. print(account_dict)
  433. self.ip = account_dict.get("ip")
  434. self.account_name = account_dict.get("username")
  435. self.login_username = account_dict.get("phone", "")
  436. self.login_password = account_dict.get("password", "")
  437. logger.info("获取到账号: %s, ip: %s", self.account_name, self.ip)
  438. return True
  439. def disable_account(self):
  440. update_sql = f""" UPDATE `retrieve_collect_equipment_account` SET `status`= %s WHERE `name` = %s; """
  441. self.db.execute(update_sql, (1, self.account_name))
  442. def _build_search_keyword(self):
  443. parts = [p for p in (self.brand, self.product, self.product_desc) if p]
  444. return " ".join(parts).strip() or self.product
  445. def _is_logged_out(self):
  446. return bool(self.driver.ele("xpath=//*[@class='link-login']", timeout=2))
  447. def perform_jd_login(self):
  448. """
  449. 使用已有浏览器实例执行京东账号密码登录(含滑块验证码)。
  450. 成功返回 True,失败返回 False。
  451. """
  452. username = self.login_username
  453. password = self.login_password
  454. login_url = "https://passport.jd.com/new/login.aspx"
  455. self.driver.get(login_url)
  456. input_name = self.driver.ele("xpath=//input[@id='loginname']", timeout=15)
  457. if not input_name:
  458. print("未找到用户名输入框")
  459. return False
  460. input_name.input(username)
  461. time.sleep(random.uniform(1.5, 2.5))
  462. input_pass = self.driver.ele("xpath://input[@name='nloginpwd']", timeout=5)
  463. if not input_pass:
  464. print("未找到密码输入框")
  465. return False
  466. input_pass.input(password)
  467. time.sleep(random.uniform(1.5, 2.5))
  468. login_btn = self.driver.ele("xpath://a[@id='loginsubmit']", timeout=5)
  469. if not login_btn:
  470. print("未找到登录按钮")
  471. return False
  472. login_btn.click()
  473. time.sleep(random.uniform(3, 5))
  474. if not handle_jd_slider_captcha(self.driver):
  475. print("滑块验证码未通过")
  476. return False
  477. return True
  478. def _ensure_logged_in(self):
  479. """未登录时自动走登录流程(账号密码 + 滑块)。"""
  480. if not self._is_logged_out():
  481. return True
  482. logger.info("检测到未登录,开始自动登录: %s", self.account_name)
  483. ok = self.perform_jd_login()
  484. if ok and not self._is_logged_out():
  485. logger.info("自动登录成功: %s", self.account_name)
  486. return True
  487. logger.error("自动登录失败: %s", self.account_name)
  488. return False
  489. def _check_page_blocked(self):
  490. html = self.driver.html or ""
  491. if "抱歉由于访问频繁导致无法搜索" in html:
  492. logger.error("账号无法搜索(访问频繁)")
  493. self.success = False
  494. return True
  495. return False
  496. def _jump_to_page(self, target_page):
  497. """跳转到指定页码,并清空跳转前的监听残留。"""
  498. to_page_input = self.driver.ele(
  499. "xpath=//div[contains(@class,'_pagination_toPageNum_')]//input[@type='text']",
  500. timeout=3,
  501. )
  502. if not to_page_input:
  503. logger.warning("未找到跳页输入框,无法跳转到第 %s 页", target_page)
  504. return False
  505. self.clear_listen_buffer()
  506. to_page_input.input(str(target_page))
  507. self.sleep(1, 2)
  508. self.driver.actions.key_down("enter").key_up("enter")
  509. self.sleep(3, 5)
  510. self.clear_listen_buffer()
  511. logger.info("已跳转到第 %s 页", target_page)
  512. return True
  513. def _go_next_page(self, next_btn):
  514. self.clear_listen_buffer()
  515. if not self._human_click(next_btn):
  516. logger.warning("点击下一页失败")
  517. return False
  518. self.sleep(2, 4)
  519. return True
  520. def crawl(self):
  521. total = 0
  522. keyword = self._build_search_keyword()
  523. self.driver.get("https://www.jd.com/", timeout=15)
  524. time.sleep(15)
  525. if self._is_logged_out():
  526. if not self.login_password or not self.login_username:
  527. return
  528. if not self._ensure_logged_in():
  529. self.disable_account()
  530. send_text(f"京东:{self.account_name}账号登录失败")
  531. self.success = False
  532. return
  533. self.driver.get("https://www.jd.com/", timeout=15)
  534. self.sleep(3, 5)
  535. kw = quote(str(keyword or ""), safe="")
  536. self._search_kw = kw
  537. # 必须先监听再打开搜索页,否则首屏 wareList(前约 30 条)在监听开启前就返回了
  538. self._start_listen()
  539. self.driver.get(
  540. f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}", timeout=15
  541. )
  542. self.sleep(5, 8)
  543. if self._check_page_blocked():
  544. return
  545. if not handle_jd_slider_captcha(self.driver, pause_listen=False):
  546. logger.warning("进入搜索页后滑块验证码处理失败")
  547. self.success = False
  548. return
  549. if self.start_page > 1:
  550. if not self._jump_to_page(self.start_page):
  551. logger.warning("跳页失败,将从第 1 页开始采集")
  552. self.start_page = 1
  553. logger.info(
  554. "采集页码范围: %s ~ %s(共 %s 页)",
  555. self.start_page,
  556. self.end_page,
  557. self.end_page - self.start_page + 1,
  558. )
  559. for page_no in range(self.start_page, self.end_page + 1):
  560. if self._is_logged_out():
  561. if not self._ensure_logged_in():
  562. self.success = False
  563. break
  564. self.driver.get(
  565. f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}",
  566. timeout=15,
  567. )
  568. self.sleep(3, 5)
  569. if page_no > 1:
  570. self._jump_to_page(page_no)
  571. if not handle_jd_slider_captcha(self.driver, pause_listen=True):
  572. logger.warning("滑块验证码处理失败,停止采集")
  573. self.success = False
  574. break
  575. if self._check_page_blocked():
  576. break
  577. logger.info("===== 正在爬取第 %s 页 =====", page_no)
  578. search_ele = self.driver.ele("xpath=//div[@id='search-condition']", timeout=10)
  579. if not search_ele:
  580. logger.warning("未找到搜索结果区域,停止采集")
  581. break
  582. page_n, _ = self.collect_full_page_items()
  583. logger.info("本页监听商品条数(含可能重复): %s", page_n)
  584. total += page_n
  585. logger.info("累计监听条数: %s", total)
  586. if self.is_no_prodcut > 20:
  587. logger.info("连续无匹配商品过多,停止采集")
  588. break
  589. if page_no >= self.end_page:
  590. break
  591. next_btn = self.driver.ele("text=下一页", timeout=2)
  592. if not next_btn:
  593. logger.info("没有下一页(未找到)")
  594. break
  595. cls_str = next_btn.attr("class") or ""
  596. if "disabled" in cls_str:
  597. logger.info("没有下一页(已禁用)")
  598. break
  599. if not self._go_next_page(next_btn):
  600. break
  601. def run(self):
  602. # 检测账号
  603. if not self.get_account():
  604. logger.info("==================当前无账号可用==================")
  605. self.success = False
  606. return self.pipeline.crawl_count, self.success
  607. logger.info("获取到账号:%s,代理ip:%s", self.account_name, self.ip)
  608. # # # 每次选取账号,立马账号使用时间
  609. update_sql = f""" UPDATE `retrieve_collect_equipment_account` SET `status`= %s, `update_time`= %s WHERE `username` = %s; """
  610. self.db.execute(update_sql, (0, int(time.time()), self.account_name))
  611. try:
  612. self.init_browser()
  613. self.crawl()
  614. except Exception as e:
  615. self.success = False
  616. logger.exception("爬取异常: %s", e)
  617. self.sleep(3, 5)
  618. finally:
  619. if self.driver:
  620. self.driver.quit()
  621. self.driver = None
  622. return self.pipeline.crawl_count, self.success