# yaoex_snapshot_crawl.py
import base64
import hashlib
import json
import os
import random
import signal
import socket
import sys
import time
from pathlib import Path
from urllib.parse import quote

import requests
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
from DrissionPage import ChromiumPage, ChromiumOptions

from commons.Logger import logger
from oss_upload.oss_upload import AliyunOSSUploader
from pipelines.drug_pipelines import DrugPipeline
from area_info.city_name_to_id import get_city
from commons.config import YYC_ACCOUNT
  20. CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
  21. CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
  22. chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
  23. # 项目根目录 → spiders/yaoex(与从哪执行脚本无关)
  24. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  25. YAOEX_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoex"
  26. SLIDER_OFFSET_FIX = 10
  27. DETAIL_GET_TIMEOUT = 15
  28. DETAIL_URL_WAIT = 10
  29. DETAIL_DOM_WAIT = 8
  30. DETAIL_NAV_RETRIES = 3
  31. DETAIL_CONTENT_XPATH = "xpath://div[contains(@class,'yaoex-product-detail__content')]"
  32. REQUEST_RETRY_COUNT = 3
  33. REQUEST_TIMEOUT_SEC = 20
  34. NOT_PRODUCT_BREAK = 15
  35. headers = {
  36. "Accept": "application/json, text/plain, */*",
  37. "Accept-Language": "zh-CN,zh;q=0.9",
  38. "Connection": "keep-alive",
  39. "Content-Type": "application/x-www-form-urlencoded",
  40. "Origin": "https://mall.yaoex.com",
  41. "Referer": "https://mall.yaoex.com/",
  42. "Sec-Fetch-Dest": "empty",
  43. "Sec-Fetch-Mode": "cors",
  44. "Sec-Fetch-Site": "cross-site",
  45. "User-Agent": (
  46. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
  47. "AppleWebKit/537.36 (KHTML, like Gecko) "
  48. "Chrome/146.0.0.0 Safari/537.36"
  49. ),
  50. "X-Request-Agent": "Axios",
  51. "X-Requested-With": "XMLHttpRequest",
  52. "sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
  53. "sec-ch-ua-mobile": "?0",
  54. "sec-ch-ua-platform": '"Windows"',
  55. }
  56. class YaoexSnapshotCrawl:
  57. def __init__(self, drug_dict=None):
  58. self.driver = None
  59. self.platform = 6
  60. self.pipeline = DrugPipeline("yaoex")
  61. self.task_dict = drug_dict or {}
  62. self.ossuploader = AliyunOSSUploader()
  63. self.start_page = 1
  64. self.end_page = 1
  65. self.account_name = YYC_ACCOUNT.get("username", "yyc_default")
  66. self._shop_cache = {}
  67. self._register_signal_handler()
  68. if self.task_dict:
  69. self.get_product_data()
  70. self.success = True
  71. self.is_not_product = 0
  72. self.user_id = YYC_ACCOUNT["user_id"]
  73. self.token = YYC_ACCOUNT["token"]
  74. def get_product_data(self):
  75. self.task_id = self.task_dict["id"]
  76. self.company_id = self.task_dict["company_id"]
  77. self.product = self.task_dict["product_name"]
  78. self.product_desc = self.task_dict.get("product_specs", "")
  79. self.brand = self.task_dict.get("product_brand", "")
  80. self.product_keyword = self.task_dict.get("product_keyword", "")
  81. self.collect_task_id = self.task_dict.get("collect_task_id", "")
  82. self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
  83. self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
  84. self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
  85. self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
  86. self.account_id = self.task_dict.get("collect_equipment_account_id", "")
  87. self.collect_region_id = self.task_dict.get("collect_region_id", "")
  88. self.collect_round = self.task_dict.get("collect_round", 1)
  89. self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
  90. self.end_page = max(
  91. self.start_page,
  92. self._parse_page(self.task_dict.get("end_page"), self.start_page),
  93. )
  94. @staticmethod
  95. def _parse_page(value, default=1):
  96. try:
  97. page = int(value)
  98. return page if page >= 1 else default
  99. except (TypeError, ValueError):
  100. return default
  101. def _register_signal_handler(self):
  102. def handler(signum, frame):
  103. logger.info("收到退出信号,正在关闭浏览器...")
  104. self._quit_browser()
  105. sys.exit(0)
  106. signal.signal(signal.SIGINT, handler)
  107. if hasattr(signal, "SIGTERM"):
  108. signal.signal(signal.SIGTERM, handler)
  109. @staticmethod
  110. def _timestamp_ms() -> str:
  111. return str(int(time.time() * 1000))
  112. def _quit_browser(self):
  113. if self.driver:
  114. try:
  115. self.driver.quit()
  116. except Exception:
  117. pass
  118. self.driver = None
  119. @staticmethod
  120. def _get_free_port():
  121. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  122. s.bind(("127.0.0.1", 0))
  123. return s.getsockname()[1]
  124. def _resolve_browser_profile_dir(self):
  125. """浏览器数据目录: <项目根>/spiders/yaoex/<账号>"""
  126. profile_dir = YAOEX_SPIDER_DIR / self.account_name
  127. # 仅兼容历史误路径,新建不再使用 chrome_profile
  128. legacy_nested = YAOEX_SPIDER_DIR / "spiders" / "yaoex" / self.account_name
  129. legacy_chrome_profile = YAOEX_SPIDER_DIR / "chrome_profile" / self.account_name
  130. for candidate in (profile_dir, legacy_nested, legacy_chrome_profile):
  131. if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
  132. logger.info("使用已有浏览器配置目录: %s", candidate)
  133. return candidate
  134. profile_dir.mkdir(parents=True, exist_ok=True)
  135. logger.info("新建浏览器配置目录: %s", profile_dir)
  136. return profile_dir
  137. def init_browser(self):
  138. co = ChromiumOptions().set_browser_path(chrome_path)
  139. debug_port = self._get_free_port()
  140. profile_dir = self._resolve_browser_profile_dir()
  141. profile_dir.mkdir(parents=True, exist_ok=True)
  142. co.set_user_data_path(str(profile_dir))
  143. logger.info("浏览器用户目录(绝对路径): %s", profile_dir.resolve())
  144. co.set_local_port(debug_port)
  145. co.set_argument(f"--remote-debugging-port={debug_port}")
  146. co.set_argument("--remote-debugging-address=127.0.0.1")
  147. co.set_argument("--disable-dev-shm-usage")
  148. co.set_argument("--start-maximized")
  149. co.set_argument("--no-first-run")
  150. co.set_argument("--no-default-browser-check")
  151. self.driver = ChromiumPage(co)
  152. def _is_logged_in(self):
  153. return bool(self.driver.ele("xpath=//a[@id='logout']", timeout=5))
  154. def _call_captcha_api(self, image_bytes):
  155. try:
  156. b64 = base64.b64encode(image_bytes).decode()
  157. resp = requests.post(
  158. CAPTCHA_API_URL,
  159. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  160. headers={"Content-Type": "application/json"},
  161. timeout=15,
  162. ).json()
  163. logger.info("验证码 API 返回: %s", resp)
  164. return resp["data"]["data"]
  165. except Exception as e:
  166. logger.exception("验证码识别失败: %s", e)
  167. return None
  168. @staticmethod
  169. def _generate_human_track(distance):
  170. tracks = []
  171. current = 0
  172. mid = distance * 0.7
  173. t = 0.2
  174. v = 0
  175. move_points = []
  176. while current < mid:
  177. a = random.uniform(2, 4)
  178. v0 = v
  179. v = v0 + a * t
  180. move = v0 * t + 0.5 * a * t * t
  181. current += move
  182. move_points.append(move)
  183. while current < distance:
  184. a = -random.uniform(0.5, 1.5)
  185. v0 = v
  186. v = v0 + a * t
  187. if v < 0.5:
  188. v = 0.5
  189. move = v0 * t + 0.5 * a * t * t
  190. current += move
  191. move_points.append(move)
  192. total_points = len(move_points)
  193. for i, move in enumerate(move_points):
  194. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  195. if i < total_points * 0.3:
  196. duration = random.uniform(0.01, 0.03)
  197. elif i > total_points * 0.7:
  198. duration = random.uniform(0.03, 0.08)
  199. else:
  200. duration = random.uniform(0.02, 0.05)
  201. if random.random() < 0.05:
  202. duration += random.uniform(0.05, 0.1)
  203. tracks.append((move, y_offset, duration))
  204. if random.random() < 0.7:
  205. tracks.append((-random.randint(1, 3), 0, 0.05))
  206. return tracks
  207. def _simulate_slider_drag(self, slider_element, target_distance):
  208. self.driver.actions.move_to(slider_element).hold()
  209. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  210. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  211. self.driver.actions.release()
  212. def _solve_slider_if_present(self):
  213. modal = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  214. if not modal:
  215. return True
  216. logger.info("检测到滑块验证码,开始处理")
  217. jpg_bytes = modal.get_screenshot(as_bytes="jpg")
  218. distance = self._call_captcha_api(jpg_bytes)
  219. if distance is None:
  220. return False
  221. slider = self.driver.ele(
  222. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  223. )
  224. if not slider:
  225. logger.error("未找到滑块元素")
  226. return False
  227. self._simulate_slider_drag(slider, float(distance) + SLIDER_OFFSET_FIX)
  228. time.sleep(3)
  229. return True
  230. def login(self):
  231. self.driver.get("https://mall.yaoex.com/login", timeout=15)
  232. self.driver.wait.doc_loaded(timeout=10)
  233. input_name = self.driver.ele("xpath://input[@name='username']", timeout=5)
  234. if not input_name:
  235. logger.error("未找到用户名输入框")
  236. return False
  237. input_name.input(YYC_ACCOUNT["username"])
  238. time.sleep(random.uniform(1.2, 2.0))
  239. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  240. if not input_pass:
  241. logger.error("未找到密码输入框")
  242. return False
  243. input_pass.input(YYC_ACCOUNT["password"])
  244. time.sleep(random.uniform(1.2, 2.0))
  245. geetest_click = self.driver.ele(
  246. "xpath=//div[contains(@class,'geetest_btn_click')]", timeout=3
  247. )
  248. if geetest_click:
  249. geetest_click.click()
  250. time.sleep(1.5)
  251. login_button = self.driver.ele("xpath://input[@id='login-btn']", timeout=5)
  252. if not login_button:
  253. logger.error("未找到登录按钮")
  254. return False
  255. login_button.click()
  256. self.driver.wait.doc_loaded(timeout=10)
  257. time.sleep(2)
  258. if not self._solve_slider_if_present():
  259. return False
  260. return self._is_logged_in()
  261. def decrypt_price(self, ciphertext_b64):
  262. if not ciphertext_b64 or not str(ciphertext_b64).strip():
  263. return ""
  264. _KEY_FIXED = "GDLSAUO1KUMIIBCE"
  265. if not self.user_id:
  266. key = _KEY_FIXED.encode("utf-8")
  267. else:
  268. uid = str(self.user_id)[:6].rjust(6, "0")
  269. key = (_KEY_FIXED[:10] + uid).encode("utf-8")
  270. raw = base64.b64decode(ciphertext_b64.strip())
  271. cipher = AES.new(key, AES.MODE_ECB)
  272. plain = unpad(cipher.decrypt(raw), AES.block_size)
  273. return plain.decode("utf-8")
  274. def _post_with_retry(self, url, payload, retries=REQUEST_RETRY_COUNT, timeout=REQUEST_TIMEOUT_SEC):
  275. last_err = None
  276. for attempt in range(1, retries + 1):
  277. try:
  278. resp = requests.post(
  279. url,
  280. headers=headers,
  281. data=payload,
  282. timeout=timeout,
  283. )
  284. resp.raise_for_status()
  285. return resp
  286. except Exception as e:
  287. last_err = e
  288. if attempt < retries:
  289. logger.warning("请求失败,第%s/%s次重试: %s", attempt, retries, e)
  290. time.sleep(min(2 * attempt, 5))
  291. else:
  292. logger.error("请求失败,已达最大重试次数(%s): %s", retries, e)
  293. raise last_err
  294. def _shop_payload(self, enterprise_id):
  295. return {
  296. "traderName": "yaoex_pc",
  297. "trader": "pc",
  298. "closesignature": "yes",
  299. "signature_method": "md5",
  300. "signature": "****",
  301. "timestamp": self._timestamp_ms(),
  302. "token": self.token,
  303. "userToken": self.token,
  304. "enterpriseId": enterprise_id,
  305. }
  306. def _list_payload(self, keyword, page):
  307. return {
  308. "traderName": "yaoex_pc",
  309. "trader": "pc",
  310. "closesignature": "yes",
  311. "signature_method": "md5",
  312. "signature": "****",
  313. "timestamp": self._timestamp_ms(),
  314. "token": self.token,
  315. "userToken": self.token,
  316. "userId": self.user_id,
  317. "roleId": "101",
  318. "userType": "下游客户",
  319. "buyerCode": self.user_id,
  320. "nowPage": str(page),
  321. "per": "20",
  322. "keyword": keyword,
  323. "catSearchId": "",
  324. "specs": "",
  325. "factoryIds": "",
  326. "sellerCodes": "",
  327. "sellerFilterMode": "0",
  328. "sortColumn": "default",
  329. "sortMode": "default",
  330. "ver": "1",
  331. "stock_mode": "1",
  332. "showExtendCard": "true",
  333. "needDinnerPrice": "true",
  334. "limitStart": "",
  335. "limitEnd": "",
  336. "deadLineStart": "",
  337. "deadLineEnd": "",
  338. "filterDtos": "",
  339. "showWholePurchase": "true",
  340. }
  341. def fetch_list_page(self, keyword, page):
  342. list_url = "https://gateway-b2b.fangkuaiyi.com/home/search/homeSearchList"
  343. resp = self._post_with_retry(list_url, self._list_payload(keyword, page))
  344. data = resp.json()
  345. recall_status = data.get("data", {}).get("recallStatus", 0)
  346. if int(recall_status) == 1:
  347. return data.get("data", {}).get("shopProducts", []) or []
  348. else:
  349. return []
  350. def fetch_shop(self, seller_code):
  351. detail_url = "https://gateway-b2b.fangkuaiyi.com/ycapp/shop/enterpriseQualification"
  352. resp = self._post_with_retry(detail_url, self._shop_payload(seller_code))
  353. shop_res = resp.json().get("data", {})
  354. base_info = shop_res.get("baseInfo", {})
  355. return base_info.get("address", ""), base_info.get("enterpriseName", "")
  356. def _get_shop_info(self, seller_code):
  357. if seller_code in self._shop_cache:
  358. return self._shop_cache[seller_code]
  359. try:
  360. shop_info = self.fetch_shop(seller_code)
  361. except Exception as e:
  362. logger.warning("fetch_shop 失败 seller_code=%s: %s", seller_code, e)
  363. shop_info = ("", "")
  364. self._shop_cache[seller_code] = shop_info
  365. return shop_info
  366. def _current_url(self):
  367. try:
  368. return self.driver.url or ""
  369. except Exception:
  370. return ""
  371. def _url_has_product(self, spu_code, seller_code):
  372. url = self._current_url()
  373. spu_code = str(spu_code or "")
  374. seller_code = str(seller_code or "")
  375. if spu_code and seller_code:
  376. return spu_code in url and seller_code in url
  377. return bool(spu_code and spu_code in url)
  378. def _wait_detail_ready(self, spu_code, seller_code, timeout=DETAIL_URL_WAIT):
  379. deadline = time.time() + timeout
  380. while time.time() < deadline:
  381. if self._url_has_product(spu_code, seller_code):
  382. if self.driver.ele(DETAIL_CONTENT_XPATH, timeout=1):
  383. time.sleep(0.3)
  384. return True
  385. time.sleep(0.4)
  386. return False
  387. def _build_detail_url(self, item):
  388. if not item.get("productId") and item.get("groupBuyProductDto"):
  389. item = item.get("groupBuyProductDto") or {}
  390. spu_code = item.get("spuCode", "")
  391. seller_code = item.get("sellerCode", "")
  392. group_buying_id = item.get("groupBuyingId", "")
  393. p_json = json.dumps(
  394. {"id": group_buying_id, "s": seller_code, "sp": spu_code},
  395. separators=(",", ":"),
  396. )
  397. detail_url = (
  398. f"https://mall.yaoex.com/groupBuying/#/productDetail?p={quote(p_json)}"
  399. )
  400. else:
  401. seller_code = item.get("sellerCode")
  402. spu_code = item.get("spuCode")
  403. detail_url = (
  404. f"https://mall.yaoex.com/v2/product/#/spuCode/{spu_code}/sellerCode/{seller_code}"
  405. )
  406. return item, detail_url, spu_code, seller_code
  407. def _goto_detail_page(self, detail_url, spu_code, seller_code):
  408. """get 后 refresh 一次,让 SPA 按当前 URL 重新渲染详情。"""
  409. for attempt in range(1, DETAIL_NAV_RETRIES + 1):
  410. try:
  411. self.driver.get(detail_url, timeout=DETAIL_GET_TIMEOUT)
  412. time.sleep(0.5)
  413. self.driver.refresh()
  414. time.sleep(2)
  415. ele = self.driver.ele("xpath=//div[@class='yaoex-product-detail__product-detail']")
  416. if ele:
  417. return True
  418. else:
  419. continue
  420. except Exception as e:
  421. logger.warning(
  422. "跳转详情异常 spu=%s seller=%s attempt=%s: %s",
  423. spu_code, seller_code, attempt, e,
  424. )
  425. time.sleep(random.uniform(0.8, 1.5))
  426. return False
  427. def _take_snapshot(self, upload_key):
  428. time.sleep(1)
  429. try:
  430. detail_ele = self.driver.ele(DETAIL_CONTENT_XPATH, timeout=2)
  431. if detail_ele:
  432. jpg_bytes = detail_ele.get_screenshot(as_bytes="jpg")
  433. else:
  434. jpg_bytes = self.driver.get_screenshot(as_bytes="jpg")
  435. if not jpg_bytes:
  436. logger.warning("截图为空 upload_key=%s", upload_key)
  437. return ""
  438. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
  439. except Exception:
  440. logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
  441. return ""
  442. if not img_url:
  443. logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
  444. return ""
  445. logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
  446. time.sleep(random.uniform(1, 2))
  447. return img_url
  448. def parse_product(self, item, detail_url, snap_url):
  449. seller_code = item.get("sellerCode")
  450. spu_code = item.get("spuCode")
  451. name_part = (item.get("productName") or "").strip()
  452. short_part = (item.get("shortName") or "").strip()
  453. product_name = f"{name_part} {short_part}".strip()
  454. shop_url = f"https://mall.yaoex.com/v2/store/#/detail/{seller_code}/home"
  455. company_address, company_name = self._get_shop_info(seller_code)
  456. address = item.get("cityName", "")
  457. city_id = province_id = city = province = ""
  458. if address:
  459. city_id, province_id, city, province = get_city(address.split("市")[0])
  460. price = self.decrypt_price(item.get("price"))
  461. hash_text = f"{seller_code}_{spu_code}_{price}"
  462. item_id = hashlib.md5(hash_text.encode("utf-8")).hexdigest()
  463. is_sold_out = 1 if "商品已售罄" in (item.get("statusDescription") or "") else 0
  464. shop_name = item.get("storeName") or item.get("shopName")
  465. anonymous_store_name = ""
  466. if shop_name == "预约配送中心":
  467. anonymous_store_name = item.get("supplyName", "")
  468. inventory = item.get("currentInventory") or item.get("stockCount")
  469. now = time.strftime("%Y-%m-%d %H:%M:%S")
  470. return {
  471. "platform": self.platform,
  472. "item_id": item_id,
  473. "enterprise_id": self.company_id,
  474. "product_name": product_name,
  475. "spec": item.get("spec"),
  476. "one_price": "",
  477. "detail_url": detail_url,
  478. "shop_name": shop_name,
  479. "anonymous_store_name": anonymous_store_name,
  480. "shop_url": shop_url,
  481. "city_name": city,
  482. "city_id": city_id,
  483. "province_name": province,
  484. "province_id": province_id,
  485. "shipment_city_name": "",
  486. "shipment_city_id": "",
  487. "shipment_province_name": "",
  488. "shipment_province_id": "",
  489. "area_info": company_address or "",
  490. "factory_name": item.get("factoryName"),
  491. "scrape_date": time.strftime("%Y-%m-%d"),
  492. "price": price,
  493. "sales": "",
  494. "stock_count": inventory,
  495. "snapshot_url": snap_url,
  496. "approval_num": item.get("approvalNum"),
  497. "produced_time": item.get("productionTime"),
  498. "deadline": item.get("deadLine"),
  499. "update_time": now,
  500. "insert_time": now,
  501. "number": 1,
  502. "product_brand": self.brand or "",
  503. "collect_task_id": self.collect_task_id,
  504. "search_name": self.product,
  505. "company_name": company_name,
  506. "collect_config_info": json.dumps(
  507. {
  508. "sampling_cycle": self.sampling_cycle,
  509. "sampling_start_time": self.sampling_start_time,
  510. "sampling_end_time": self.sampling_end_time,
  511. }
  512. ),
  513. "account_id": self.account_id,
  514. "collect_region_id": self.collect_region_id,
  515. "collect_round": self.collect_round,
  516. "is_sold_out": is_sold_out,
  517. }
  518. def search(self):
  519. self.driver.get("https://mall.yaoex.com/", timeout=15)
  520. self.driver.wait.doc_loaded(timeout=10)
  521. if not self._is_logged_in():
  522. if not self.login():
  523. logger.error("登录失败")
  524. return False
  525. keyword = self.product
  526. if self.brand:
  527. keyword = (self.brand + " " + self.product).strip()
  528. if self.product_desc:
  529. keyword = (keyword + " " + self.product_desc).strip()
  530. for page in range(self.start_page, self.end_page + 1):
  531. logger.info("正在爬取 %s %s,第%s页", self.brand, self.product, page)
  532. page_items = self.fetch_list_page(keyword=keyword, page=page)
  533. if not page_items:
  534. logger.info("第%s页无数据,停止", page)
  535. break
  536. for item in page_items:
  537. item, detail_url, spu_code, seller_code = self._build_detail_url(item)
  538. name_part = (item.get("productName") or "").strip()
  539. short_part = (item.get("shortName") or "").strip()
  540. product_name = f"{name_part} {short_part}".strip()
  541. if self.product not in product_name:
  542. self.is_not_product += 1
  543. continue
  544. if self.brand not in product_name:
  545. self.is_not_product += 1
  546. continue
  547. self.is_not_product = 0
  548. if not self._goto_detail_page(detail_url, spu_code, seller_code):
  549. logger.warning(
  550. "详情页跳转失败,跳过 spu=%s seller=%s url=%s",
  551. spu_code, seller_code, detail_url,
  552. )
  553. continue
  554. upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
  555. snap_url = self._take_snapshot(upload_key)
  556. product = self.parse_product(item, detail_url, snap_url)
  557. if not product.get("item_id"):
  558. continue
  559. try:
  560. self.pipeline.storge_data(product)
  561. logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
  562. except Exception as e:
  563. logger.exception("写入数据库失败: %s", e)
  564. time.sleep(random.uniform(1, 2))
  565. if self.is_not_product > NOT_PRODUCT_BREAK:
  566. logger.info("连续不匹配商品过多,停止搜索")
  567. break
  568. time.sleep(random.uniform(1, 3))
  569. return True
  570. def run(self):
  571. try:
  572. self.init_browser()
  573. self.search()
  574. except Exception as e:
  575. logger.exception("运行异常: %s", e)
  576. finally:
  577. self._quit_browser()