yaoex_snapshot_crawl.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642
import base64
import hashlib
import json
import os
import random
import signal
import socket
import sys
import time
from pathlib import Path
from urllib.parse import quote

import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
from DrissionPage import ChromiumPage, ChromiumOptions

from area_info.city_name_to_id import get_city
from commons.Logger import logger
from commons.config import YYC_ACCOUNT
from oss_upload.oss_upload import AliyunOSSUploader
from pipelines.drug_pipelines import DrugPipeline
  20. CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
  21. CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
  22. chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
  23. # 项目根目录 → spiders/yaoex(与从哪执行脚本无关)
  24. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  25. YAOEX_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoex"
  26. BROWSER_PROFILE_SUBDIR = "chrome_profile"
  27. SLIDER_OFFSET_FIX = 10
  28. DETAIL_GET_TIMEOUT = 15
  29. DETAIL_URL_WAIT = 10
  30. DETAIL_DOM_WAIT = 8
  31. DETAIL_NAV_RETRIES = 3
  32. DETAIL_CONTENT_XPATH = "xpath://div[contains(@class,'yaoex-product-detail__content')]"
  33. REQUEST_RETRY_COUNT = 3
  34. REQUEST_TIMEOUT_SEC = 20
  35. NOT_PRODUCT_BREAK = 15
  36. headers = {
  37. "Accept": "application/json, text/plain, */*",
  38. "Accept-Language": "zh-CN,zh;q=0.9",
  39. "Connection": "keep-alive",
  40. "Content-Type": "application/x-www-form-urlencoded",
  41. "Origin": "https://mall.yaoex.com",
  42. "Referer": "https://mall.yaoex.com/",
  43. "Sec-Fetch-Dest": "empty",
  44. "Sec-Fetch-Mode": "cors",
  45. "Sec-Fetch-Site": "cross-site",
  46. "User-Agent": (
  47. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
  48. "AppleWebKit/537.36 (KHTML, like Gecko) "
  49. "Chrome/146.0.0.0 Safari/537.36"
  50. ),
  51. "X-Request-Agent": "Axios",
  52. "X-Requested-With": "XMLHttpRequest",
  53. "sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
  54. "sec-ch-ua-mobile": "?0",
  55. "sec-ch-ua-platform": '"Windows"',
  56. }
  57. class YaoexSnapshotCrawl:
  58. def __init__(self, drug_dict=None):
  59. self.driver = None
  60. self.platform = 6
  61. self.pipeline = DrugPipeline("yaoex")
  62. self.task_dict = drug_dict or {}
  63. self.ossuploader = AliyunOSSUploader()
  64. self.start_page = 1
  65. self.end_page = 1
  66. self.account_name = YYC_ACCOUNT.get("username", "yyc_default")
  67. self._shop_cache = {}
  68. self._register_signal_handler()
  69. if self.task_dict:
  70. self.get_product_data()
  71. self.success = True
  72. self.is_not_product = 0
  73. self.user_id = YYC_ACCOUNT["user_id"]
  74. self.token = YYC_ACCOUNT["token"]
  75. def get_product_data(self):
  76. self.task_id = self.task_dict["id"]
  77. self.company_id = self.task_dict["company_id"]
  78. self.product = self.task_dict["product_name"]
  79. self.product_desc = self.task_dict.get("product_specs", "")
  80. self.brand = self.task_dict.get("product_brand", "")
  81. self.product_keyword = self.task_dict.get("product_keyword", "")
  82. self.collect_task_id = self.task_dict.get("collect_task_id", "")
  83. self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
  84. self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
  85. self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
  86. self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
  87. self.account_id = self.task_dict.get("collect_equipment_account_id", "")
  88. self.collect_region_id = self.task_dict.get("collect_region_id", "")
  89. self.collect_round = self.task_dict.get("collect_round", 1)
  90. self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
  91. self.end_page = max(
  92. self.start_page,
  93. self._parse_page(self.task_dict.get("end_page"), self.start_page),
  94. )
  95. @staticmethod
  96. def _parse_page(value, default=1):
  97. try:
  98. page = int(value)
  99. return page if page >= 1 else default
  100. except (TypeError, ValueError):
  101. return default
  102. def _register_signal_handler(self):
  103. def handler(signum, frame):
  104. logger.info("收到退出信号,正在关闭浏览器...")
  105. self._quit_browser()
  106. sys.exit(0)
  107. signal.signal(signal.SIGINT, handler)
  108. if hasattr(signal, "SIGTERM"):
  109. signal.signal(signal.SIGTERM, handler)
  110. @staticmethod
  111. def _timestamp_ms() -> str:
  112. return str(int(time.time() * 1000))
  113. def _quit_browser(self):
  114. if self.driver:
  115. try:
  116. self.driver.quit()
  117. except Exception:
  118. pass
  119. self.driver = None
  120. @staticmethod
  121. def _get_free_port():
  122. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  123. s.bind(("127.0.0.1", 0))
  124. return s.getsockname()[1]
  125. def _resolve_browser_profile_dir(self):
  126. """
  127. 浏览器数据固定落在 <项目根>/spiders/yaoex/ 下。
  128. 优先 chrome_profile/<账号>;若旧版直接在 yaoex/<账号> 已有登录态则继续沿用。
  129. """
  130. preferred = YAOEX_SPIDER_DIR / BROWSER_PROFILE_SUBDIR / self.account_name
  131. legacy_flat = YAOEX_SPIDER_DIR / self.account_name
  132. legacy_nested = YAOEX_SPIDER_DIR / "spiders" / "yaoex" / self.account_name
  133. for candidate in (preferred, legacy_flat, legacy_nested):
  134. if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
  135. logger.info("使用已有浏览器配置目录: %s", candidate)
  136. return candidate
  137. preferred.parent.mkdir(parents=True, exist_ok=True)
  138. logger.info("新建浏览器配置目录: %s", preferred)
  139. return preferred
  140. def init_browser(self):
  141. co = ChromiumOptions().set_browser_path(chrome_path)
  142. debug_port = self._get_free_port()
  143. profile_dir = self._resolve_browser_profile_dir()
  144. profile_dir.mkdir(parents=True, exist_ok=True)
  145. co.set_user_data_path(str(profile_dir))
  146. logger.info("浏览器用户目录(绝对路径): %s", profile_dir.resolve())
  147. co.set_local_port(debug_port)
  148. co.set_argument(f"--remote-debugging-port={debug_port}")
  149. co.set_argument("--remote-debugging-address=127.0.0.1")
  150. co.set_argument("--disable-dev-shm-usage")
  151. co.set_argument("--start-maximized")
  152. co.set_argument("--no-first-run")
  153. co.set_argument("--no-default-browser-check")
  154. self.driver = ChromiumPage(co)
  155. def _is_logged_in(self):
  156. return bool(self.driver.ele("xpath=//a[@id='logout']", timeout=5))
  157. def _call_captcha_api(self, image_bytes):
  158. try:
  159. b64 = base64.b64encode(image_bytes).decode()
  160. resp = requests.post(
  161. CAPTCHA_API_URL,
  162. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  163. headers={"Content-Type": "application/json"},
  164. timeout=15,
  165. ).json()
  166. logger.info("验证码 API 返回: %s", resp)
  167. return resp["data"]["data"]
  168. except Exception as e:
  169. logger.exception("验证码识别失败: %s", e)
  170. return None
  171. @staticmethod
  172. def _generate_human_track(distance):
  173. tracks = []
  174. current = 0
  175. mid = distance * 0.7
  176. t = 0.2
  177. v = 0
  178. move_points = []
  179. while current < mid:
  180. a = random.uniform(2, 4)
  181. v0 = v
  182. v = v0 + a * t
  183. move = v0 * t + 0.5 * a * t * t
  184. current += move
  185. move_points.append(move)
  186. while current < distance:
  187. a = -random.uniform(0.5, 1.5)
  188. v0 = v
  189. v = v0 + a * t
  190. if v < 0.5:
  191. v = 0.5
  192. move = v0 * t + 0.5 * a * t * t
  193. current += move
  194. move_points.append(move)
  195. total_points = len(move_points)
  196. for i, move in enumerate(move_points):
  197. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  198. if i < total_points * 0.3:
  199. duration = random.uniform(0.01, 0.03)
  200. elif i > total_points * 0.7:
  201. duration = random.uniform(0.03, 0.08)
  202. else:
  203. duration = random.uniform(0.02, 0.05)
  204. if random.random() < 0.05:
  205. duration += random.uniform(0.05, 0.1)
  206. tracks.append((move, y_offset, duration))
  207. if random.random() < 0.7:
  208. tracks.append((-random.randint(1, 3), 0, 0.05))
  209. return tracks
  210. def _simulate_slider_drag(self, slider_element, target_distance):
  211. self.driver.actions.move_to(slider_element).hold()
  212. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  213. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  214. self.driver.actions.release()
  215. def _solve_slider_if_present(self):
  216. modal = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  217. if not modal:
  218. return True
  219. logger.info("检测到滑块验证码,开始处理")
  220. jpg_bytes = modal.get_screenshot(as_bytes="jpg")
  221. distance = self._call_captcha_api(jpg_bytes)
  222. if distance is None:
  223. return False
  224. slider = self.driver.ele(
  225. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  226. )
  227. if not slider:
  228. logger.error("未找到滑块元素")
  229. return False
  230. self._simulate_slider_drag(slider, float(distance) + SLIDER_OFFSET_FIX)
  231. time.sleep(3)
  232. return True
  233. def login(self):
  234. self.driver.get("https://mall.yaoex.com/login", timeout=15)
  235. self.driver.wait.doc_loaded(timeout=10)
  236. input_name = self.driver.ele("xpath://input[@name='username']", timeout=5)
  237. if not input_name:
  238. logger.error("未找到用户名输入框")
  239. return False
  240. input_name.input(YYC_ACCOUNT["username"])
  241. time.sleep(random.uniform(1.2, 2.0))
  242. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  243. if not input_pass:
  244. logger.error("未找到密码输入框")
  245. return False
  246. input_pass.input(YYC_ACCOUNT["password"])
  247. time.sleep(random.uniform(1.2, 2.0))
  248. geetest_click = self.driver.ele(
  249. "xpath=//div[contains(@class,'geetest_btn_click')]", timeout=3
  250. )
  251. if geetest_click:
  252. geetest_click.click()
  253. time.sleep(1.5)
  254. login_button = self.driver.ele("xpath://input[@id='login-btn']", timeout=5)
  255. if not login_button:
  256. logger.error("未找到登录按钮")
  257. return False
  258. login_button.click()
  259. self.driver.wait.doc_loaded(timeout=10)
  260. time.sleep(2)
  261. if not self._solve_slider_if_present():
  262. return False
  263. return self._is_logged_in()
  264. def decrypt_price(self, ciphertext_b64):
  265. if not ciphertext_b64 or not str(ciphertext_b64).strip():
  266. return ""
  267. _KEY_FIXED = "GDLSAUO1KUMIIBCE"
  268. if not self.user_id:
  269. key = _KEY_FIXED.encode("utf-8")
  270. else:
  271. uid = str(self.user_id)[:6].rjust(6, "0")
  272. key = (_KEY_FIXED[:10] + uid).encode("utf-8")
  273. raw = base64.b64decode(ciphertext_b64.strip())
  274. cipher = AES.new(key, AES.MODE_ECB)
  275. plain = unpad(cipher.decrypt(raw), AES.block_size)
  276. return plain.decode("utf-8")
  277. def _post_with_retry(self, url, payload, retries=REQUEST_RETRY_COUNT, timeout=REQUEST_TIMEOUT_SEC):
  278. last_err = None
  279. for attempt in range(1, retries + 1):
  280. try:
  281. resp = requests.post(
  282. url,
  283. headers=headers,
  284. data=payload,
  285. timeout=timeout,
  286. )
  287. resp.raise_for_status()
  288. return resp
  289. except Exception as e:
  290. last_err = e
  291. if attempt < retries:
  292. logger.warning("请求失败,第%s/%s次重试: %s", attempt, retries, e)
  293. time.sleep(min(2 * attempt, 5))
  294. else:
  295. logger.error("请求失败,已达最大重试次数(%s): %s", retries, e)
  296. raise last_err
  297. def _shop_payload(self, enterprise_id):
  298. return {
  299. "traderName": "yaoex_pc",
  300. "trader": "pc",
  301. "closesignature": "yes",
  302. "signature_method": "md5",
  303. "signature": "****",
  304. "timestamp": self._timestamp_ms(),
  305. "token": self.token,
  306. "userToken": self.token,
  307. "enterpriseId": enterprise_id,
  308. }
  309. def _list_payload(self, keyword, page):
  310. return {
  311. "traderName": "yaoex_pc",
  312. "trader": "pc",
  313. "closesignature": "yes",
  314. "signature_method": "md5",
  315. "signature": "****",
  316. "timestamp": self._timestamp_ms(),
  317. "token": self.token,
  318. "userToken": self.token,
  319. "userId": self.user_id,
  320. "roleId": "101",
  321. "userType": "下游客户",
  322. "buyerCode": self.user_id,
  323. "nowPage": str(page),
  324. "per": "20",
  325. "keyword": keyword,
  326. "catSearchId": "",
  327. "specs": "",
  328. "factoryIds": "",
  329. "sellerCodes": "",
  330. "sellerFilterMode": "0",
  331. "sortColumn": "default",
  332. "sortMode": "default",
  333. "ver": "1",
  334. "stock_mode": "1",
  335. "showExtendCard": "true",
  336. "needDinnerPrice": "true",
  337. "limitStart": "",
  338. "limitEnd": "",
  339. "deadLineStart": "",
  340. "deadLineEnd": "",
  341. "filterDtos": "",
  342. "showWholePurchase": "true",
  343. }
  344. def fetch_list_page(self, keyword, page):
  345. list_url = "https://gateway-b2b.fangkuaiyi.com/home/search/homeSearchList"
  346. resp = self._post_with_retry(list_url, self._list_payload(keyword, page))
  347. return resp.json().get("data", {}).get("shopProducts", []) or []
  348. def fetch_shop(self, seller_code):
  349. detail_url = "https://gateway-b2b.fangkuaiyi.com/ycapp/shop/enterpriseQualification"
  350. resp = self._post_with_retry(detail_url, self._shop_payload(seller_code))
  351. shop_res = resp.json().get("data", {})
  352. base_info = shop_res.get("baseInfo", {})
  353. return base_info.get("address", ""), base_info.get("enterpriseName", "")
  354. def _get_shop_info(self, seller_code):
  355. if seller_code in self._shop_cache:
  356. return self._shop_cache[seller_code]
  357. try:
  358. shop_info = self.fetch_shop(seller_code)
  359. except Exception as e:
  360. logger.warning("fetch_shop 失败 seller_code=%s: %s", seller_code, e)
  361. shop_info = ("", "")
  362. self._shop_cache[seller_code] = shop_info
  363. return shop_info
  364. def _current_url(self):
  365. try:
  366. return self.driver.url or ""
  367. except Exception:
  368. return ""
  369. def _url_has_product(self, spu_code, seller_code):
  370. url = self._current_url()
  371. spu_code = str(spu_code or "")
  372. seller_code = str(seller_code or "")
  373. if spu_code and seller_code:
  374. return spu_code in url and seller_code in url
  375. return bool(spu_code and spu_code in url)
  376. def _wait_detail_ready(self, spu_code, seller_code, timeout=DETAIL_URL_WAIT):
  377. deadline = time.time() + timeout
  378. while time.time() < deadline:
  379. if self._url_has_product(spu_code, seller_code):
  380. if self.driver.ele(DETAIL_CONTENT_XPATH, timeout=1):
  381. time.sleep(0.3)
  382. return True
  383. time.sleep(0.4)
  384. return False
  385. def _build_detail_url(self, item):
  386. if not item.get("productId") and item.get("groupBuyProductDto"):
  387. item = item.get("groupBuyProductDto") or {}
  388. spu_code = item.get("spuCode", "")
  389. seller_code = item.get("sellerCode", "")
  390. group_buying_id = item.get("groupBuyingId", "")
  391. p_json = json.dumps(
  392. {"id": group_buying_id, "s": seller_code, "sp": spu_code},
  393. separators=(",", ":"),
  394. )
  395. detail_url = (
  396. f"https://mall.yaoex.com/groupBuying/#/productDetail?p={quote(p_json)}"
  397. )
  398. else:
  399. seller_code = item.get("sellerCode")
  400. spu_code = item.get("spuCode")
  401. detail_url = (
  402. f"https://mall.yaoex.com/v2/product/#/spuCode/{spu_code}/sellerCode/{seller_code}"
  403. )
  404. return item, detail_url, spu_code, seller_code
  405. def _goto_detail_page(self, detail_url, spu_code, seller_code):
  406. """get 后 refresh 一次,让 SPA 按当前 URL 重新渲染详情。"""
  407. for attempt in range(1, DETAIL_NAV_RETRIES + 1):
  408. try:
  409. self.driver.get(detail_url, timeout=DETAIL_GET_TIMEOUT)
  410. time.sleep(0.5)
  411. self.driver.refresh()
  412. time.sleep(2)
  413. return True
  414. except Exception as e:
  415. logger.warning(
  416. "跳转详情异常 spu=%s seller=%s attempt=%s: %s",
  417. spu_code, seller_code, attempt, e,
  418. )
  419. time.sleep(random.uniform(0.8, 1.5))
  420. return False
  421. def _take_snapshot(self, upload_key):
  422. time.sleep(1)
  423. try:
  424. detail_ele = self.driver.ele(DETAIL_CONTENT_XPATH, timeout=2)
  425. if detail_ele:
  426. jpg_bytes = detail_ele.get_screenshot(as_bytes="jpg")
  427. else:
  428. jpg_bytes = self.driver.get_screenshot(as_bytes="jpg")
  429. if not jpg_bytes:
  430. logger.warning("截图为空 upload_key=%s", upload_key)
  431. return ""
  432. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
  433. except Exception:
  434. logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
  435. return ""
  436. if not img_url:
  437. logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
  438. return ""
  439. logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
  440. time.sleep(random.uniform(1, 2))
  441. return img_url
  442. def parse_product(self, item, detail_url, snap_url):
  443. seller_code = item.get("sellerCode")
  444. spu_code = item.get("spuCode")
  445. name_part = (item.get("productName") or "").strip()
  446. short_part = (item.get("shortName") or "").strip()
  447. product_name = f"{name_part} {short_part}".strip()
  448. shop_url = f"https://mall.yaoex.com/v2/store/#/detail/{seller_code}/home"
  449. company_address, company_name = self._get_shop_info(seller_code)
  450. address = item.get("cityName", "")
  451. city_id = province_id = city = province = ""
  452. if address:
  453. city_id, province_id, city, province = get_city(address.split("市")[0])
  454. price = self.decrypt_price(item.get("price"))
  455. hash_text = f"{seller_code}_{spu_code}_{price}"
  456. item_id = hashlib.md5(hash_text.encode("utf-8")).hexdigest()
  457. is_sold_out = 1 if "商品已售罄" in (item.get("statusDescription") or "") else 0
  458. shop_name = item.get("storeName") or item.get("shopName")
  459. anonymous_store_name = ""
  460. if shop_name == "预约配送中心":
  461. anonymous_store_name = item.get("supplyName", "")
  462. inventory = item.get("currentInventory") or item.get("stockCount")
  463. now = time.strftime("%Y-%m-%d %H:%M:%S")
  464. return {
  465. "platform": self.platform,
  466. "item_id": item_id,
  467. "enterprise_id": self.company_id,
  468. "product_name": product_name,
  469. "spec": item.get("spec"),
  470. "one_price": "",
  471. "detail_url": detail_url,
  472. "shop_name": shop_name,
  473. "anonymous_store_name": anonymous_store_name,
  474. "shop_url": shop_url,
  475. "city_name": city,
  476. "city_id": city_id,
  477. "province_name": province,
  478. "province_id": province_id,
  479. "shipment_city_name": "",
  480. "shipment_city_id": "",
  481. "shipment_province_name": "",
  482. "shipment_province_id": "",
  483. "area_info": company_address or "",
  484. "factory_name": item.get("factoryName"),
  485. "scrape_date": time.strftime("%Y-%m-%d"),
  486. "price": price,
  487. "sales": "",
  488. "stock_count": inventory,
  489. "snapshot_url": snap_url,
  490. "approval_num": item.get("approvalNum"),
  491. "produced_time": item.get("productionTime"),
  492. "deadline": item.get("deadLine"),
  493. "update_time": now,
  494. "insert_time": now,
  495. "number": 1,
  496. "product_brand": self.brand or "",
  497. "collect_task_id": self.collect_task_id,
  498. "search_name": self.product,
  499. "company_name": company_name,
  500. "collect_config_info": json.dumps(
  501. {
  502. "sampling_cycle": self.sampling_cycle,
  503. "sampling_start_time": self.sampling_start_time,
  504. "sampling_end_time": self.sampling_end_time,
  505. }
  506. ),
  507. "account_id": self.account_id,
  508. "collect_region_id": self.collect_region_id,
  509. "collect_round": self.collect_round,
  510. "is_sold_out": is_sold_out,
  511. }
  512. def search(self):
  513. self.driver.get("https://mall.yaoex.com/", timeout=15)
  514. self.driver.wait.doc_loaded(timeout=10)
  515. if not self._is_logged_in():
  516. if not self.login():
  517. logger.error("登录失败")
  518. return False
  519. keyword = self.product
  520. if self.brand:
  521. keyword = (self.brand + " " + self.product).strip()
  522. if self.product_desc:
  523. keyword = (keyword + " " + self.product_desc).strip()
  524. for page in range(self.start_page, self.end_page + 1):
  525. logger.info("正在爬取 %s %s,第%s页", self.brand, self.product, page)
  526. page_items = self.fetch_list_page(keyword=keyword, page=page)
  527. if not page_items:
  528. logger.info("第%s页无数据,停止", page)
  529. break
  530. for item in page_items:
  531. item, detail_url, spu_code, seller_code = self._build_detail_url(item)
  532. name_part = (item.get("productName") or "").strip()
  533. short_part = (item.get("shortName") or "").strip()
  534. product_name = f"{name_part} {short_part}".strip()
  535. if self.product not in product_name:
  536. self.is_not_product += 1
  537. continue
  538. if self.brand not in product_name:
  539. self.is_not_product += 1
  540. continue
  541. self.is_not_product = 0
  542. if not self._goto_detail_page(detail_url, spu_code, seller_code):
  543. logger.warning(
  544. "详情页跳转失败,跳过 spu=%s seller=%s url=%s",
  545. spu_code, seller_code, detail_url,
  546. )
  547. continue
  548. upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
  549. snap_url = self._take_snapshot(upload_key)
  550. product = self.parse_product(item, detail_url, snap_url)
  551. if not product.get("item_id"):
  552. continue
  553. try:
  554. self.pipeline.storge_data(product)
  555. logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
  556. except Exception as e:
  557. logger.exception("写入数据库失败: %s", e)
  558. time.sleep(random.uniform(1, 2))
  559. if self.is_not_product > NOT_PRODUCT_BREAK:
  560. logger.info("连续不匹配商品过多,停止搜索")
  561. break
  562. time.sleep(random.uniform(1, 3))
  563. return True
  564. def run(self):
  565. try:
  566. self.init_browser()
  567. self.search()
  568. except Exception as e:
  569. logger.exception("运行异常: %s", e)
  570. finally:
  571. self._quit_browser()