ysb_snapshot_list_crawl.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667
  1. import base64
  2. import hashlib
  3. import json
  4. import math
  5. import random
  6. import signal
  7. import socket
  8. import sys
  9. import time
  10. import zlib
  11. from pathlib import Path
  12. from urllib.parse import quote
  13. import requests
  14. from Crypto.Cipher import AES
  15. from commons.conn_mysql import MySQLPoolOnline
  16. from DrissionPage import ChromiumPage, ChromiumOptions
  17. from commons.Logger import logger
  18. from oss_upload.oss_upload import AliyunOSSUploader
  19. from commons.config import YSB_ACCOUNT
  20. from pipelines.drug_pipelines import DrugPipeline
  21. from area_info.city_name_to_id import get_city
# Token sent to the third-party captcha-recognition API.
# NOTE(review): this credential is hard-coded in source; consider loading it
# from configuration or the environment instead.
CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
# Endpoint that _call_captcha_api posts the captcha screenshot to.
CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
# Added to the recognised slider distance before dragging.
SLIDER_OFFSET_FIX = 10
# Defaults for clear_listen_buffer: maximum drain rounds and the per-round
# timeout passed to listen.steps() (presumably seconds — TODO confirm).
LISTEN_CLEAR_ROUNDS = 3
LISTEN_CLEAR_TIMEOUT = 0.3
# Chrome executable handed to ChromiumOptions.set_browser_path.
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Two directory levels above the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Base directory under which browser profile directories are looked up/created.
YSB_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoshibang"
# Sub-directory of YSB_SPIDER_DIR holding the preferred per-account profiles.
BROWSER_PROFILE_SUBDIR = "chrome_profile"
  31. def pkcs7_unpad(data):
  32. if not data:
  33. raise ValueError("Empty data for PKCS7 unpad")
  34. pad_len = data[-1]
  35. if pad_len < 1 or pad_len > 16:
  36. raise ValueError("Invalid PKCS7 padding length")
  37. if data[-pad_len:] != bytes([pad_len]) * pad_len:
  38. raise ValueError("Invalid PKCS7 padding bytes")
  39. return data[:-pad_len]
  40. def derive_ysb_key():
  41. base = "BhCLxFfFhd12K4qRGPfy"
  42. md5_hex = hashlib.md5(base.encode("utf-8")).hexdigest()
  43. return md5_hex[:16].upper().encode("utf-8")
  44. def decrypt_ysb_payload(cipher_text_b64):
  45. """解密药师帮列表接口 data.o 字段,返回 JSON 对象。"""
  46. key = derive_ysb_key()
  47. cipher_bytes = base64.b64decode(cipher_text_b64)
  48. cipher = AES.new(key, AES.MODE_ECB)
  49. decrypted = cipher.decrypt(cipher_bytes)
  50. unpadded = pkcs7_unpad(decrypted)
  51. json_bytes = zlib.decompress(unpadded, zlib.MAX_WBITS | 16)
  52. return json.loads(json_bytes.decode("utf-8"))
  53. class YaoShiBangSnapshot:
  54. def __init__(self, drug_dict=None):
  55. self.driver = None
  56. self.db = MySQLPoolOnline()
  57. self.ip = None
  58. self.login_username = None
  59. self.login_password = None
  60. self.platform = 5
  61. self.pipeline = DrugPipeline("ysb")
  62. self.task_dict = drug_dict or {}
  63. self.ossuploader = AliyunOSSUploader()
  64. self.start_page = 1
  65. self.end_page = 1
  66. self.account_name = YSB_ACCOUNT.get("username", "ysb_default")
  67. self._register_signal_handler()
  68. if self.task_dict:
  69. self.get_product_data()
  70. self.success = True
  71. self.is_no_prodcut = 0
  72. self.is_product_count = 0
  73. self._listen_started = False
  74. def get_product_data(self):
  75. self.task_id = self.task_dict["id"]
  76. self.company_id = self.task_dict["company_id"]
  77. self.product = self.task_dict["product_name"]
  78. self.product_desc = self.task_dict.get("product_specs", "")
  79. self.brand = self.task_dict.get("product_brand", "")
  80. self.product_keyword = self.task_dict.get("product_keyword", "")
  81. self.collect_task_id = self.task_dict.get("collect_task_id", "")
  82. self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
  83. self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
  84. self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
  85. self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
  86. self.account_id = self.task_dict.get("collect_equipment_account_id", "")
  87. self.collect_region_id = self.task_dict.get("collect_region_id", "")
  88. self.collect_round = self.task_dict.get("collect_round", 1)
  89. self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
  90. self.end_page = max(
  91. self.start_page,
  92. self._parse_page(self.task_dict.get("end_page"), self.start_page),
  93. )
  94. @staticmethod
  95. def _parse_page(value, default=1):
  96. try:
  97. page = int(value)
  98. return page if page >= 1 else default
  99. except (TypeError, ValueError):
  100. return default
  101. def _register_signal_handler(self):
  102. def handler(signum, frame):
  103. logger.info("收到退出信号,正在关闭浏览器...")
  104. self._quit_browser()
  105. sys.exit(0)
  106. signal.signal(signal.SIGINT, handler)
  107. if hasattr(signal, "SIGTERM"):
  108. signal.signal(signal.SIGTERM, handler)
  109. def _quit_browser(self):
  110. if self.driver:
  111. try:
  112. self.driver.quit()
  113. except Exception:
  114. pass
  115. self.driver = None
  116. @staticmethod
  117. def _get_free_port():
  118. """获取一个当前可用的本地端口,供 Chrome 调试使用。"""
  119. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  120. s.bind(("127.0.0.1", 0))
  121. return s.getsockname()[1]
  122. def _resolve_browser_profile_dir(self):
  123. """
  124. 浏览器数据固定落在 <项目根>/spiders/yaoshibang/ 下。
  125. 优先 chrome_profile/<账号>;若旧版目录已有登录态则继续沿用。
  126. """
  127. preferred = YSB_SPIDER_DIR / BROWSER_PROFILE_SUBDIR / self.account_name
  128. legacy_flat = YSB_SPIDER_DIR / self.account_name
  129. legacy_nested = YSB_SPIDER_DIR / "spiders" / "yaoshibang" / self.account_name
  130. for candidate in (preferred, legacy_flat, legacy_nested):
  131. if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
  132. logger.info("使用已有浏览器配置目录: %s", candidate)
  133. return candidate
  134. preferred.parent.mkdir(parents=True, exist_ok=True)
  135. logger.info("新建浏览器配置目录: %s", preferred)
  136. return preferred
  137. def init_browser(self):
  138. co = ChromiumOptions().set_browser_path(chrome_path)
  139. debug_port = self._get_free_port()
  140. profile_dir = self._resolve_browser_profile_dir()
  141. profile_dir.mkdir(parents=True, exist_ok=True)
  142. co.set_user_data_path(str(profile_dir))
  143. logger.info("浏览器用户目录(绝对路径): %s", profile_dir.resolve())
  144. co.set_local_port(debug_port)
  145. co.set_argument(f"--remote-debugging-port={debug_port}")
  146. co.set_argument("--remote-debugging-address=127.0.0.1")
  147. # co.set_argument("--disable-blink-features=AutomationControlled")
  148. co.set_argument("--disable-dev-shm-usage")
  149. co.set_argument("--start-maximized")
  150. co.set_argument("--no-first-run") # 避免首次运行弹窗
  151. co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
  152. self.driver = ChromiumPage(co)
  153. def _solve_slider_captcha(self):
  154. """检测并处理易盾滑块验证码,成功返回 True。"""
  155. self.driver.wait.doc_loaded()
  156. time.sleep(2)
  157. yidun = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  158. if not yidun:
  159. return True
  160. logger.info("检测到滑块验证码,开始处理")
  161. jpg_bytes = yidun.get_screenshot(as_bytes="jpg")
  162. distance = self._call_captcha_api(jpg_bytes)
  163. if distance is None:
  164. logger.error("验证码识别失败")
  165. return False
  166. logger.info("滑块距离: %s", distance)
  167. slider = self.driver.ele(
  168. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  169. )
  170. if not slider:
  171. logger.error("未找到滑块元素")
  172. return False
  173. try:
  174. drag_distance = float(distance) + SLIDER_OFFSET_FIX
  175. except (TypeError, ValueError):
  176. logger.error("滑块距离非数字: %r", distance)
  177. return False
  178. if not math.isfinite(drag_distance) or drag_distance <= 0:
  179. logger.error("滑块距离无效: %s", drag_distance)
  180. return False
  181. self._simulate_slider_drag(slider, drag_distance - 5)
  182. time.sleep(3)
  183. return True
  184. def _call_captcha_api(self, image_bytes):
  185. """调用云码平台识别滑块距离,失败返回 None。"""
  186. try:
  187. b64 = base64.b64encode(image_bytes).decode()
  188. resp = requests.post(
  189. CAPTCHA_API_URL,
  190. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  191. headers={"Content-Type": "application/json"},
  192. timeout=15,
  193. ).json()
  194. logger.info("验证码 API 返回: %s", resp)
  195. if not isinstance(resp, dict):
  196. return None
  197. data = resp.get("data")
  198. if isinstance(data, dict):
  199. dist = data.get("data")
  200. else:
  201. dist = data
  202. if dist is None:
  203. logger.error("验证码 API 未返回距离字段: %s", resp)
  204. return None
  205. try:
  206. d = float(dist)
  207. except (TypeError, ValueError):
  208. logger.error("验证码距离无法解析为数字: %r", dist)
  209. return None
  210. if not math.isfinite(d):
  211. logger.error("验证码距离非有限数值: %r", dist)
  212. return None
  213. return d
  214. except Exception as e:
  215. logger.exception("验证码 API 调用失败: %s", e)
  216. return None
  217. @staticmethod
  218. def _generate_human_track(distance):
  219. try:
  220. distance = float(distance)
  221. except (TypeError, ValueError):
  222. return []
  223. if distance <= 0 or not math.isfinite(distance):
  224. return []
  225. tracks = []
  226. current = 0
  227. mid = distance * 0.7
  228. t = 0.2
  229. v = 0
  230. move_points = []
  231. while current < mid:
  232. a = random.uniform(2, 4)
  233. v0 = v
  234. v = v0 + a * t
  235. move = v0 * t + 0.5 * a * t * t
  236. current += move
  237. move_points.append(move)
  238. while current < distance:
  239. a = -random.uniform(0.5, 1.5)
  240. v0 = v
  241. v = v0 + a * t
  242. if v < 0.5:
  243. v = 0.5
  244. move = v0 * t + 0.5 * a * t * t
  245. current += move
  246. move_points.append(move)
  247. total_points = len(move_points)
  248. for i, move in enumerate(move_points):
  249. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  250. if i < total_points * 0.3:
  251. duration = random.uniform(0.01, 0.03)
  252. elif i > total_points * 0.7:
  253. duration = random.uniform(0.03, 0.08)
  254. else:
  255. duration = random.uniform(0.02, 0.05)
  256. if random.random() < 0.05:
  257. duration += random.uniform(0.05, 0.1)
  258. tracks.append((move, y_offset, duration))
  259. if random.random() < 0.7:
  260. tracks.append((-random.randint(1, 3), 0, 0.05))
  261. return tracks
  262. def _simulate_slider_drag(self, slider_element, target_distance):
  263. if target_distance <= 0:
  264. logger.warning("滑块目标距离无效: %s", target_distance)
  265. return
  266. self.driver.actions.move_to(slider_element).hold()
  267. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  268. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  269. self.driver.actions.release()
  270. def _is_logged_in(self):
  271. # 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
  272. title = self.driver.ele(
  273. "xpath=//span[@class='logout']",
  274. timeout=5,
  275. )
  276. return bool(title)
  277. def _start_listen(self):
  278. """监听列表接口 getWholesaleList。"""
  279. target = "wholesale-drug/sales/getWholesaleList/v4270"
  280. if self._listen_started and getattr(self.driver.listen, "listening", False):
  281. self.driver.listen.stop()
  282. self.driver.listen.start(target)
  283. self._listen_started = True
  284. logger.info("已启动监听: %s", target)
  285. def clear_listen_buffer(self, rounds=LISTEN_CLEAR_ROUNDS, timeout=LISTEN_CLEAR_TIMEOUT):
  286. if not self.driver:
  287. return
  288. try:
  289. for _ in range(rounds):
  290. resps = list(self.driver.listen.steps(timeout=timeout))
  291. if not resps:
  292. break
  293. except Exception as e:
  294. logger.debug("清空监听缓冲失败: %s", e)
  295. @staticmethod
  296. def _parse_listen_body(resp):
  297. body = resp.response.body
  298. if isinstance(body, str):
  299. body = json.loads(body)
  300. if not isinstance(body, dict):
  301. return None
  302. return body
  303. @staticmethod
  304. def _extract_encrypted_o(body):
  305. data_block = (body or {}).get("data") or {}
  306. if isinstance(data_block, dict):
  307. return data_block.get("o")
  308. return None
  309. def _consume_list_listen(self, page, timeout=10):
  310. """消费列表接口响应,返回解密后的 json_data。"""
  311. for resp in self.driver.listen.steps(timeout=timeout):
  312. try:
  313. body = self._parse_listen_body(resp)
  314. if not body:
  315. continue
  316. message = str(body.get("message", ""))
  317. if message and "成功" not in message:
  318. logger.warning("第%s页 message=%s", page, message)
  319. continue
  320. encrypted_o = self._extract_encrypted_o(body)
  321. if not encrypted_o:
  322. continue
  323. json_data = decrypt_ysb_payload(encrypted_o)
  324. logger.info("第%s页列表解密成功 wholesales=%s", page, len(json_data.get("wholesales", [])))
  325. return json_data
  326. except Exception as e:
  327. logger.warning("第%s页解析列表监听失败: %s", page, e)
  328. return None
  329. def login(self):
  330. logger.info("开始登录药师帮")
  331. self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
  332. self.driver.wait.doc_loaded(timeout=10)
  333. time.sleep(2)
  334. input_name = self.driver.ele("xpath://input[@name='userAccount']", timeout=5)
  335. if not input_name:
  336. logger.error("未找到账号输入框")
  337. return False
  338. input_name.input(YSB_ACCOUNT["username"])
  339. time.sleep(random.uniform(1.5, 2.5))
  340. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  341. if not input_pass:
  342. logger.error("未找到密码输入框")
  343. return False
  344. input_pass.input(YSB_ACCOUNT["password"])
  345. time.sleep(random.uniform(1.5, 2.5))
  346. login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
  347. if not login_btn:
  348. logger.error("未找到登录按钮")
  349. return False
  350. login_btn.click()
  351. time.sleep(3)
  352. for i in range(3):
  353. self._solve_slider_captcha()
  354. time.sleep(3)
  355. if self._is_logged_in():
  356. logger.info("登录成功")
  357. return True
  358. logger.error("登录后未检测到目标店铺名,登录可能失败")
  359. return False
  360. def _take_snapshot(self, upload_key, image_ele):
  361. """在当前页面截图并上传,不再重复跳转。"""
  362. time.sleep(1)
  363. self._dismiss_popup_before_screenshot()
  364. try:
  365. jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
  366. if not jpg_bytes:
  367. logger.warning("截图为空 upload_key=%s", upload_key)
  368. return ""
  369. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
  370. except Exception:
  371. logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
  372. return ""
  373. if not img_url:
  374. logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
  375. return ""
  376. logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
  377. time.sleep(random.uniform(1, 2))
  378. return img_url
  379. def _human_click(self, element):
  380. """在目标节点上触发 click,避免 move_to + 无目标 actions.click() 因布局位移点到商品链接触发详情页。"""
  381. if not element:
  382. return False
  383. try:
  384. time.sleep(random.uniform(0.8, 2.0))
  385. try:
  386. self.driver.run_js(
  387. "arguments[0].scrollIntoView({block:'center',behavior:'instant'});",
  388. element,
  389. )
  390. except Exception:
  391. pass
  392. time.sleep(random.uniform(0.3, 1))
  393. self.driver.run_js("arguments[0].click();", element)
  394. return True
  395. except Exception as e:
  396. logger.warning("点击失败: %s", e)
  397. try:
  398. element.click()
  399. return True
  400. except Exception:
  401. return False
  402. def _dismiss_popup_before_screenshot(self):
  403. """截图前关闭或隐藏营销弹窗,避免遮挡。"""
  404. close_locs = [
  405. "xpath=//div[contains(@class,'dialog')]//i[contains(@class,'close')]",
  406. "xpath=//div[contains(@class,'popup')]//i[contains(@class,'close')]",
  407. "xpath=//div[contains(@class,'modal')]//i[contains(@class,'close')]",
  408. "xpath=//button[contains(@class,'close')]",
  409. "xpath=//span[text()='×']",
  410. "xpath=//*[contains(text(),'智能采购')]/ancestor::div[1]//*[contains(@class,'close')]",
  411. ]
  412. for loc in close_locs:
  413. try:
  414. btn = self.driver.ele(loc, timeout=0.5)
  415. if btn:
  416. btn.click()
  417. time.sleep(0.2)
  418. except Exception:
  419. pass
  420. try:
  421. # 兜底:隐藏常见高层弹窗和遮罩
  422. self.driver.run_js(
  423. """
  424. const sels = [
  425. '[class*="modal"]',
  426. '[class*="popup"]',
  427. '[class*="dialog"]',
  428. '[class*="mask"]',
  429. '[class*="overlay"]'
  430. ];
  431. for (const s of sels) {
  432. document.querySelectorAll(s).forEach(el => {
  433. const style = getComputedStyle(el);
  434. const z = parseInt(style.zIndex || '0', 10);
  435. if (z >= 999 && style.display !== 'none') {
  436. el.style.display = 'none';
  437. }
  438. });
  439. }
  440. document.body.style.overflow = 'auto';
  441. """
  442. )
  443. time.sleep(0.2)
  444. except Exception:
  445. pass
  446. def to_product(self, item):
  447. now = time.strftime("%Y-%m-%d %H:%M:%S")
  448. item_id = item.get("wholesaleid", "")
  449. provider_id = item.get("providerId", "")
  450. city_id = province_id = city = province = ""
  451. city_str = item.get("warehouseCity", "")
  452. if city_str:
  453. city_id, province_id, city, province = get_city(city_str)
  454. price = item.get("disPrice", "")
  455. if not price:
  456. price = item.get("minprice", "")
  457. if not price:
  458. price = item.get("price", "")
  459. shop_name = item.get("provider_name", "")
  460. if not shop_name:
  461. shop_name = item.get("abbreviation", "")
  462. product = {
  463. "platform": self.platform,
  464. "item_id": item_id,
  465. "enterprise_id": self.company_id,
  466. "product_name": item.get("drugname", ""),
  467. "spec": item.get("specification", ""),
  468. "one_price": '',
  469. "detail_url": f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={item_id}&trafficType=1",
  470. "shop_name": shop_name,
  471. "anonymous_store_name": "",
  472. "shop_url": f"https://dian.ysbang.cn/#/supplierstore?providerId={provider_id}&trafficType=4",
  473. "city_name": city,
  474. "city_id": city_id,
  475. "province_name": province,
  476. "province_id": province_id,
  477. "area_info": "",
  478. "factory_name": item.get("manufacturer", ""),
  479. "scrape_date": time.strftime("%Y-%m-%d"),
  480. "price": price,
  481. "sales": "",
  482. "stock_count": item.get("stockAvailable", ""),
  483. "snapshot_url": "",
  484. "approval_num": "",
  485. "produced_time": item.get("prodDate", ""),
  486. "deadline": item.get("valid_date", ""),
  487. "update_time": now,
  488. "insert_time": now,
  489. "number": 1,
  490. "product_brand": self.brand or "",
  491. "collect_task_id": self.collect_task_id,
  492. "search_name": self.product,
  493. "company_name": "",
  494. "collect_config_info": json.dumps(
  495. {"sampling_cycle": self.sampling_cycle, "sampling_start_time": self.sampling_start_time,
  496. "sampling_end_time": self.sampling_end_time}),
  497. "account_id": self.account_id,
  498. "collect_region_id": self.collect_region_id,
  499. "collect_round": self.collect_round,
  500. "is_sold_out": 0
  501. }
  502. return product
  503. def search(self):
  504. self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
  505. self.driver.wait.doc_loaded(timeout=10)
  506. time.sleep(2)
  507. if not self._is_logged_in():
  508. if not self.login():
  509. return False
  510. keyword = self.product
  511. if self.brand:
  512. keyword = (self.brand + " " + self.product).strip()
  513. if self.product_desc:
  514. keyword = (keyword + " " + self.product_desc).strip()
  515. search_key = quote(keyword)
  516. page = self.start_page
  517. url = (
  518. f"https://dian.ysbang.cn/#/indexContent?lastClick=-1&page={page}"
  519. f"&pagesize=60&classify_id=&searchkey={search_key}"
  520. )
  521. self._start_listen()
  522. self.driver.get(url)
  523. for page in range(1, 100):
  524. self.driver.wait.doc_loaded(timeout=10)
  525. time.sleep(1.5)
  526. json_data = self._consume_list_listen(page)
  527. if not json_data:
  528. logger.warning("第%s页未收到列表监听数据", page)
  529. break
  530. wholesales = json_data.get("wholesales", [])
  531. if not wholesales:
  532. logger.info("第%s页无数据,停止", page)
  533. break
  534. list_items = wholesales[0:5]
  535. goods_wrappers = self.driver.eles(
  536. "xpath=//div[@class='drugListPage']//div[@class='drug-list']/div[contains(@class,'all-goods-wrapper')]"
  537. )
  538. for list_idx, item in enumerate(list_items, start=1):
  539. item_id = item.get("wholesaleid", "")
  540. logger.info(
  541. "第%s页 列表第%s/%s条 wholesaleid=%s",
  542. page,
  543. list_idx,
  544. len(list_items),
  545. item_id,
  546. )
  547. if not item_id:
  548. continue
  549. detail_url = (
  550. f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={item_id}&trafficType=1"
  551. )
  552. product = self.to_product(item)
  553. title = product.get("product_name", "")
  554. if self.brand not in title:
  555. self.is_product_count += 1
  556. if self.product not in title:
  557. self.is_product_count += 1
  558. continue
  559. if self.product in title and self.brand in title:
  560. self.is_product_count = 0
  561. if self.is_product_count >= 20:
  562. return
  563. dom_idx = list_idx - 1
  564. image_ele = goods_wrappers[dom_idx]
  565. upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
  566. product["snapshot_url"] = self._take_snapshot(upload_key, image_ele)
  567. try:
  568. self.pipeline.storge_data(product)
  569. logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
  570. except Exception as e:
  571. logger.exception("写入数据库失败: %s", e)
  572. # 检测下一页
  573. self.clear_listen_buffer()
  574. next_button = self.driver.ele("xpath=//div[@class='condition']//div[@class='btn next']")
  575. if not next_button:
  576. logger.info("没有下一页,停止")
  577. break
  578. else:
  579. self._human_click(next_button)
  580. def run(self):
  581. try:
  582. self.init_browser()
  583. self.search()
  584. except Exception as e:
  585. logger.exception("运行异常: %s", e)
  586. finally:
  587. self._quit_browser()
  588. return self.pipeline.crawl_count, self.success