# ysb_snapshot_crawl.py
import base64
import hashlib
import json
import math
import os
import random
import re
import secrets
import signal
import socket
import string
import sys
import time
import zlib
from datetime import datetime, timedelta
from pathlib import Path

import requests
from Crypto.Cipher import AES
from DrissionPage import ChromiumPage, ChromiumOptions

from area_info.city_name_to_id import get_city
from commons.Logger import logger
from commons.config import YSB_ACCOUNT
from commons.conn_mysql import MySQLPoolOnline
from oss_upload.oss_upload import AliyunOSSUploader
from pipelines.drug_pipelines import DrugPipeline
  25. CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
  26. CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
  27. SLIDER_OFFSET_FIX = 10
  28. DETAIL_GET_TIMEOUT = 15
  29. DETAIL_URL_WAIT = 10
  30. DETAIL_DOM_WAIT = 8
  31. DETAIL_NAV_RETRIES = 3
  32. DETAIL_APPROVAL_XPATH = (
  33. 'xpath://div[@class="drug-info"]//span[contains(text(),"批准文号")]'
  34. )
  35. chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
  36. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  37. YSB_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoshibang"
  38. headers = {
  39. "Accept": "*/*",
  40. "Accept-Language": "zh-CN,zh;q=0.9",
  41. "Connection": "keep-alive",
  42. "Content-Type": "application/json",
  43. "Origin": "https://dian.ysbang.cn",
  44. "Referer": "https://dian.ysbang.cn/",
  45. "Sec-Fetch-Dest": "empty",
  46. "Sec-Fetch-Mode": "cors",
  47. "Sec-Fetch-Site": "same-origin",
  48. "User-Agent": (
  49. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
  50. "(KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
  51. ),
  52. "sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
  53. "sec-ch-ua-mobile": "?0",
  54. "sec-ch-ua-platform": '"Windows"',
  55. }
  56. def pkcs7_unpad(data):
  57. if not data:
  58. raise ValueError("Empty data for PKCS7 unpad")
  59. pad_len = data[-1]
  60. if pad_len < 1 or pad_len > 16:
  61. raise ValueError("Invalid PKCS7 padding length")
  62. if data[-pad_len:] != bytes([pad_len]) * pad_len:
  63. raise ValueError("Invalid PKCS7 padding bytes")
  64. return data[:-pad_len]
  65. def derive_ysb_key():
  66. base = "BhCLxFfFhd12K4qRGPfy"
  67. md5_hex = hashlib.md5(base.encode("utf-8")).hexdigest()
  68. return md5_hex[:16].upper().encode("utf-8")
  69. def decrypt_ysb_payload(cipher_text_b64):
  70. """解密药师帮列表接口 data.o 字段,返回 JSON 对象。"""
  71. key = derive_ysb_key()
  72. cipher_bytes = base64.b64decode(cipher_text_b64)
  73. cipher = AES.new(key, AES.MODE_ECB)
  74. decrypted = cipher.decrypt(cipher_bytes)
  75. unpadded = pkcs7_unpad(decrypted)
  76. json_bytes = zlib.decompress(unpadded, zlib.MAX_WBITS | 16)
  77. return json.loads(json_bytes.decode("utf-8"))
  78. class YaoShiBangSnapshot:
  79. def __init__(self, drug_dict=None):
  80. self.driver = None
  81. self.db = MySQLPoolOnline()
  82. self.ip = None
  83. self.login_username = None
  84. self.login_password = None
  85. self.platform = 5
  86. self.pipeline = DrugPipeline("ysb")
  87. self.task_dict = drug_dict or {}
  88. self.ossuploader = AliyunOSSUploader()
  89. self.start_page = 1
  90. self.end_page = 1
  91. self.account_name = YSB_ACCOUNT.get("username", "ysb_default")
  92. self._register_signal_handler()
  93. if self.task_dict:
  94. self.get_product_data()
  95. self.success = True
  96. self.is_no_prodcut = 0
  97. self.is_product_count = 0
  98. self.token = ""
  99. self._state_value = ""
  100. self.start_date = (datetime.now() - timedelta(minutes=500)).strftime("%Y-%m-%d %H:%M")
  101. def get_product_data(self):
  102. self.task_id = self.task_dict["id"]
  103. self.company_id = self.task_dict["company_id"]
  104. self.product = self.task_dict["product_name"]
  105. self.product_desc = self.task_dict.get("product_specs", "")
  106. self.brand = self.task_dict.get("product_brand", "")
  107. self.product_keyword = self.task_dict.get("product_keyword", "")
  108. self.collect_task_id = self.task_dict.get("collect_task_id", "")
  109. self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
  110. self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
  111. self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
  112. self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
  113. self.account_id = self.task_dict.get("collect_equipment_account_id", "")
  114. self.collect_region_id = self.task_dict.get("collect_region_id", "")
  115. self.collect_round = self.task_dict.get("collect_round", 1)
  116. self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
  117. self.end_page = max(
  118. self.start_page,
  119. self._parse_page(self.task_dict.get("end_page"), self.start_page),
  120. )
  121. @staticmethod
  122. def _parse_page(value, default=1):
  123. try:
  124. page = int(value)
  125. return page if page >= 1 else default
  126. except (TypeError, ValueError):
  127. return default
  128. def _register_signal_handler(self):
  129. def handler(signum, frame):
  130. logger.info("收到退出信号,正在关闭浏览器...")
  131. self._quit_browser()
  132. sys.exit(0)
  133. signal.signal(signal.SIGINT, handler)
  134. if hasattr(signal, "SIGTERM"):
  135. signal.signal(signal.SIGTERM, handler)
  136. def _quit_browser(self):
  137. if self.driver:
  138. try:
  139. self.driver.quit()
  140. except Exception:
  141. pass
  142. self.driver = None
  143. @staticmethod
  144. def _get_free_port():
  145. """获取一个当前可用的本地端口,供 Chrome 调试使用。"""
  146. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  147. s.bind(("127.0.0.1", 0))
  148. return s.getsockname()[1]
  149. def _resolve_browser_profile_dir(self):
  150. """浏览器数据目录: <项目根>/spiders/yaoshibang/<账号>"""
  151. profile_dir = YSB_SPIDER_DIR / self.account_name
  152. # 仅兼容历史误路径,新建不再使用 chrome_profile
  153. legacy_nested = YSB_SPIDER_DIR / "spiders" / "yaoshibang" / self.account_name
  154. legacy_chrome_profile = YSB_SPIDER_DIR / "chrome_profile" / self.account_name
  155. for candidate in (profile_dir, legacy_nested, legacy_chrome_profile):
  156. if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
  157. logger.info("使用已有浏览器配置目录: %s", candidate)
  158. return candidate
  159. profile_dir.mkdir(parents=True, exist_ok=True)
  160. logger.info("新建浏览器配置目录: %s", profile_dir)
  161. return profile_dir
  162. def init_browser(self):
  163. co = ChromiumOptions().set_browser_path(chrome_path)
  164. debug_port = self._get_free_port()
  165. profile_dir = self._resolve_browser_profile_dir()
  166. profile_dir.mkdir(parents=True, exist_ok=True)
  167. co.set_user_data_path(str(profile_dir))
  168. logger.info("浏览器用户目录(绝对路径): %s", profile_dir.resolve())
  169. co.set_local_port(debug_port)
  170. co.set_argument(f"--remote-debugging-port={debug_port}")
  171. co.set_argument("--remote-debugging-address=127.0.0.1")
  172. # co.set_argument("--disable-blink-features=AutomationControlled")
  173. co.set_argument("--disable-dev-shm-usage")
  174. co.set_argument("--start-maximized")
  175. co.set_argument("--no-first-run") # 避免首次运行弹窗
  176. co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
  177. self.driver = ChromiumPage(co)
  178. def _solve_slider_captcha(self):
  179. """检测并处理易盾滑块验证码,成功返回 True。"""
  180. self.driver.wait.doc_loaded()
  181. time.sleep(2)
  182. yidun = self.driver.ele("xpath://div[@class='yidun_modal']", timeout=3)
  183. if not yidun:
  184. return True
  185. logger.info("检测到滑块验证码,开始处理")
  186. jpg_bytes = yidun.get_screenshot(as_bytes="jpg")
  187. distance = self._call_captcha_api(jpg_bytes)
  188. if distance is None:
  189. logger.error("验证码识别失败")
  190. return False
  191. logger.info("滑块距离: %s", distance)
  192. slider = self.driver.ele(
  193. "xpath://div[contains(@class,'yidun_slider--hover')]", timeout=5
  194. )
  195. if not slider:
  196. logger.error("未找到滑块元素")
  197. return False
  198. try:
  199. drag_distance = float(distance) + SLIDER_OFFSET_FIX
  200. except (TypeError, ValueError):
  201. logger.error("滑块距离非数字: %r", distance)
  202. return False
  203. if not math.isfinite(drag_distance) or drag_distance <= 0:
  204. logger.error("滑块距离无效: %s", drag_distance)
  205. return False
  206. self._simulate_slider_drag(slider, drag_distance - 5)
  207. time.sleep(3)
  208. return True
  209. def _call_captcha_api(self, image_bytes):
  210. """调用云码平台识别滑块距离,失败返回 None。"""
  211. try:
  212. b64 = base64.b64encode(image_bytes).decode()
  213. resp = requests.post(
  214. CAPTCHA_API_URL,
  215. json={"token": CAPTCHA_TOKEN, "type": "22222", "image": b64},
  216. headers={"Content-Type": "application/json"},
  217. timeout=15,
  218. ).json()
  219. logger.info("验证码 API 返回: %s", resp)
  220. if not isinstance(resp, dict):
  221. return None
  222. data = resp.get("data")
  223. if isinstance(data, dict):
  224. dist = data.get("data")
  225. else:
  226. dist = data
  227. if dist is None:
  228. logger.error("验证码 API 未返回距离字段: %s", resp)
  229. return None
  230. try:
  231. d = float(dist)
  232. except (TypeError, ValueError):
  233. logger.error("验证码距离无法解析为数字: %r", dist)
  234. return None
  235. if not math.isfinite(d):
  236. logger.error("验证码距离非有限数值: %r", dist)
  237. return None
  238. return d
  239. except Exception as e:
  240. logger.exception("验证码 API 调用失败: %s", e)
  241. return None
  242. @staticmethod
  243. def _generate_human_track(distance):
  244. try:
  245. distance = float(distance)
  246. except (TypeError, ValueError):
  247. return []
  248. if distance <= 0 or not math.isfinite(distance):
  249. return []
  250. tracks = []
  251. current = 0
  252. mid = distance * 0.7
  253. t = 0.2
  254. v = 0
  255. move_points = []
  256. while current < mid:
  257. a = random.uniform(2, 4)
  258. v0 = v
  259. v = v0 + a * t
  260. move = v0 * t + 0.5 * a * t * t
  261. current += move
  262. move_points.append(move)
  263. while current < distance:
  264. a = -random.uniform(0.5, 1.5)
  265. v0 = v
  266. v = v0 + a * t
  267. if v < 0.5:
  268. v = 0.5
  269. move = v0 * t + 0.5 * a * t * t
  270. current += move
  271. move_points.append(move)
  272. total_points = len(move_points)
  273. for i, move in enumerate(move_points):
  274. y_offset = random.randint(-2, 2) if i % random.randint(2, 4) == 0 else 0
  275. if i < total_points * 0.3:
  276. duration = random.uniform(0.01, 0.03)
  277. elif i > total_points * 0.7:
  278. duration = random.uniform(0.03, 0.08)
  279. else:
  280. duration = random.uniform(0.02, 0.05)
  281. if random.random() < 0.05:
  282. duration += random.uniform(0.05, 0.1)
  283. tracks.append((move, y_offset, duration))
  284. if random.random() < 0.7:
  285. tracks.append((-random.randint(1, 3), 0, 0.05))
  286. return tracks
  287. def _simulate_slider_drag(self, slider_element, target_distance):
  288. if target_distance <= 0:
  289. logger.warning("滑块目标距离无效: %s", target_distance)
  290. return
  291. self.driver.actions.move_to(slider_element).hold()
  292. for offset_x, offset_y, duration in self._generate_human_track(target_distance):
  293. self.driver.actions.move(offset_x, offset_y, duration=duration / 1000)
  294. self.driver.actions.release()
  295. def _is_logged_in(self):
  296. # 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
  297. title = self.driver.ele(
  298. "xpath=//span[@class='logout']",
  299. timeout=5,
  300. )
  301. return bool(title)
  302. def _current_url(self):
  303. try:
  304. return self.driver.url or ""
  305. except Exception:
  306. return ""
  307. def _goto_detail_page(self, item_id, detail_url):
  308. """get 后 refresh 一次,让 SPA 按当前 URL 重新渲染详情。"""
  309. for attempt in range(1, DETAIL_NAV_RETRIES + 1):
  310. try:
  311. self.driver.get(detail_url, timeout=5)
  312. time.sleep(1.5)
  313. eles = self.driver.eles("xpath=//div[@class='y-dialog']//button[contains(text(),'确认')]", timeout=3)
  314. if len(eles) == 2:
  315. eles[1].click()
  316. time.sleep(1)
  317. self.driver.refresh()
  318. time.sleep(1.5)
  319. ele = self.driver.ele("xpath=//div[@class='drug-pic-viewer']")
  320. if not ele:
  321. continue
  322. else:
  323. return True
  324. except Exception as e:
  325. logger.warning(
  326. "跳转详情异常 item_id=%s attempt=%s: %s",
  327. item_id, attempt, e,
  328. )
  329. time.sleep(random.uniform(0.8, 1.5))
  330. return False
  331. def login(self):
  332. logger.info("开始登录药师帮")
  333. self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
  334. self.driver.wait.doc_loaded(timeout=10)
  335. time.sleep(2)
  336. input_name = self.driver.ele("xpath://input[@name='userAccount']", timeout=5)
  337. if not input_name:
  338. logger.error("未找到账号输入框")
  339. return False
  340. input_name.input(YSB_ACCOUNT["username"])
  341. time.sleep(random.uniform(1.5, 2.5))
  342. input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
  343. if not input_pass:
  344. logger.error("未找到密码输入框")
  345. return False
  346. input_pass.input(YSB_ACCOUNT["password"])
  347. time.sleep(random.uniform(1.5, 2.5))
  348. login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
  349. if not login_btn:
  350. logger.error("未找到登录按钮")
  351. return False
  352. login_btn.click()
  353. time.sleep(3)
  354. for i in range(3):
  355. self._solve_slider_captcha()
  356. time.sleep(3)
  357. if self._is_logged_in():
  358. logger.info("登录成功")
  359. return True
  360. logger.error("登录后未检测到目标店铺名,登录可能失败")
  361. return False
  362. def _take_snapshot(self, upload_key):
  363. """在当前页面截图并上传,不再重复跳转。"""
  364. time.sleep(1)
  365. self._dismiss_popup_before_screenshot()
  366. try:
  367. jpg_bytes = self.driver.get_screenshot(as_bytes="jpg")
  368. if not jpg_bytes:
  369. logger.warning("截图为空 upload_key=%s", upload_key)
  370. return ""
  371. img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
  372. except Exception:
  373. logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
  374. return ""
  375. if not img_url:
  376. logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
  377. return ""
  378. logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
  379. time.sleep(random.uniform(1, 2))
  380. return img_url
  381. def gen_pair(self, ex1_len=9, o_raw_len=16):
  382. alphabet = string.ascii_lowercase + string.digits
  383. ex1 = "".join(secrets.choice(alphabet) for _ in range(ex1_len))
  384. o = base64.b64encode(secrets.token_bytes(o_raw_len)).decode("ascii")
  385. return {"ex1": ex1, "o": o}
  386. def build_base_payload(self, keyword, page, first_search):
  387. date_str = time.strftime("%Y-%m-%d %H:%M:%S")
  388. return {
  389. "platform": "pc",
  390. "version": "6.0.0",
  391. "ua": "Chrome146",
  392. 'ex': '{} drugInfo {} {}'.format(self.start_date, date_str, date_str),
  393. "trafficType": 1,
  394. "ex1": "",
  395. "o": "",
  396. "lastClick": -1,
  397. "page": page,
  398. "pagesize": "60",
  399. "classify_id": "",
  400. "searchkey": keyword,
  401. "onlyTcm": 0,
  402. "operationtype": 1,
  403. "qualifiedLoanee": 0,
  404. "drugId": -1,
  405. "tagId": "",
  406. "showRecentlyPurchasedFlag": True,
  407. "onlySimpleLoan": 0,
  408. "sn": "",
  409. "buttons": [],
  410. "buttonList": [],
  411. "synonymId": 0,
  412. "activityTypes": [],
  413. "provider_filter": "",
  414. "factoryNames": "",
  415. "tcmGradeNames": [],
  416. "tcmExeStandardIds": [],
  417. "specs": "",
  418. "deliverFloor": 0,
  419. "purchaseLimitFloor": 0,
  420. "nextRequestKey": "",
  421. "adConfigId": 0,
  422. "stateValue": self._state_value,
  423. "firstSearch": first_search,
  424. "token": self.token,
  425. }
  426. @staticmethod
  427. def _extract_state_value(json_data, data_block):
  428. for src in (json_data, data_block):
  429. if not isinstance(src, dict):
  430. continue
  431. val = src.get("stateValue") or src.get("state_value")
  432. if val:
  433. return str(val)
  434. return None
  435. def _dismiss_popup_before_screenshot(self):
  436. """截图前关闭或隐藏营销弹窗,避免遮挡。"""
  437. close_locs = [
  438. "xpath=//div[contains(@class,'dialog')]//i[contains(@class,'close')]",
  439. "xpath=//div[contains(@class,'popup')]//i[contains(@class,'close')]",
  440. "xpath=//div[contains(@class,'modal')]//i[contains(@class,'close')]",
  441. "xpath=//button[contains(@class,'close')]",
  442. "xpath=//span[text()='×']",
  443. "xpath=//*[contains(text(),'智能采购')]/ancestor::div[1]//*[contains(@class,'close')]",
  444. ]
  445. for loc in close_locs:
  446. try:
  447. btn = self.driver.ele(loc, timeout=0.5)
  448. if btn:
  449. btn.click()
  450. time.sleep(0.2)
  451. except Exception:
  452. pass
  453. try:
  454. # 兜底:隐藏常见高层弹窗和遮罩
  455. self.driver.run_js(
  456. """
  457. const sels = [
  458. '[class*="modal"]',
  459. '[class*="popup"]',
  460. '[class*="dialog"]',
  461. '[class*="mask"]',
  462. '[class*="overlay"]'
  463. ];
  464. for (const s of sels) {
  465. document.querySelectorAll(s).forEach(el => {
  466. const style = getComputedStyle(el);
  467. const z = parseInt(style.zIndex || '0', 10);
  468. if (z >= 999 && style.display !== 'none') {
  469. el.style.display = 'none';
  470. }
  471. });
  472. }
  473. document.body.style.overflow = 'auto';
  474. """
  475. )
  476. time.sleep(0.2)
  477. except Exception:
  478. pass
  479. def to_product(self, item):
  480. now = time.strftime("%Y-%m-%d %H:%M:%S")
  481. item_id = item.get("wholesaleid", "")
  482. provider_id = item.get("providerId", "")
  483. city_str = item.get("warehouseCity", "")
  484. city_id = province_id = city = province = ""
  485. price = item.get("disPrice", "")
  486. if not price:
  487. price = item.get("minprice", "")
  488. if not price:
  489. price = item.get("price", "")
  490. shop_name = item.get("provider_name", "")
  491. if not shop_name:
  492. shop_name = item.get("abbreviation", "")
  493. product = {
  494. "platform": self.platform,
  495. "item_id": item_id,
  496. "enterprise_id": self.company_id,
  497. "product_name": item.get("drugname", ""),
  498. "spec": item.get("specification", ""),
  499. "one_price": '',
  500. "detail_url": f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={item_id}&trafficType=1",
  501. "shop_name": shop_name,
  502. "anonymous_store_name": "",
  503. "shop_url": f"https://dian.ysbang.cn/#/supplierstore?providerId={provider_id}&trafficType=4",
  504. "city_name": city,
  505. "city_id": city_id,
  506. "province_name": province,
  507. "province_id": province_id,
  508. "area_info": "",
  509. "factory_name": item.get("manufacturer", ""),
  510. "scrape_date": time.strftime("%Y-%m-%d"),
  511. "price": price,
  512. "sales": "",
  513. "stock_count": item.get("stockAvailable", ""),
  514. "snapshot_url": "",
  515. "approval_num": "",
  516. "produced_time": item.get("prodDate", ""),
  517. "deadline": item.get("valid_date", ""),
  518. "update_time": now,
  519. "insert_time": now,
  520. "number": 1,
  521. "product_brand": self.brand or "",
  522. "collect_task_id": self.collect_task_id,
  523. "search_name": self.product,
  524. "company_name": "",
  525. "collect_config_info": json.dumps(
  526. {"sampling_cycle": self.sampling_cycle, "sampling_start_time": self.sampling_start_time,
  527. "sampling_end_time": self.sampling_end_time}),
  528. "account_id": self.account_id,
  529. "collect_region_id": self.collect_region_id,
  530. "collect_round": self.collect_round,
  531. "is_sold_out": 0
  532. }
  533. return product
  534. def parse_detail(self, product):
  535. appvolnum_ele = self.driver.ele(
  536. 'xpath://div[@class="drug-info"]//span[contains(text(),"批准文号")]/following-sibling::span[1]')
  537. appvolnum_value = appvolnum_ele.text if appvolnum_ele else ""
  538. price = ""
  539. discount_ele = self.driver.ele(
  540. 'xpath://div[@class="sale-info-wrap"]//div[@class="tooltip-content"]',
  541. timeout=2,
  542. )
  543. discount_value = discount_ele.text if discount_ele else ""
  544. if not price and discount_value:
  545. price_re = re.search(r"¥([0-9.]+)", discount_value)
  546. if price_re:
  547. price = price_re.group(1).strip()
  548. current_ele = self.driver.ele(
  549. 'xpath://div[@class="sale-info-wrap"]//span[contains(@class,"current-price")]',
  550. timeout=3,
  551. )
  552. if current_ele and not price:
  553. price = (current_ele.text or "").replace("¥", "").strip()
  554. list_price = product.get("price", "")
  555. if price:
  556. product["price"] = price
  557. if appvolnum_value:
  558. product["approval_num"] = appvolnum_value
  559. logger.info(
  560. "详情解析 wholesaleid=%s list_price=%s dom_price=%s url=%s",
  561. product.get("item_id"),
  562. list_price,
  563. product.get("price"),
  564. self._current_url(),
  565. )
  566. return product
  567. def search(self):
  568. self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
  569. self.driver.wait.doc_loaded(timeout=10)
  570. time.sleep(2)
  571. if not self._is_logged_in():
  572. if not self.login():
  573. return False
  574. time.sleep(3)
  575. cookies_list = self.driver.cookies()
  576. cookies_dict = {c['name']: c['value'] for c in cookies_list}
  577. self.token = cookies_dict.get("Token") or cookies_dict.get("token")
  578. keyword = self.product
  579. if self.brand:
  580. keyword = (self.brand + " " + self.product).strip()
  581. if self.product_desc:
  582. keyword = (keyword + " " + self.product_desc).strip()
  583. self._state_value = ""
  584. for page in range(1, 100):
  585. first_search = page == 1
  586. logger.info("药师帮爬取第%s页 firstSearch=%s stateValue=%s", page, first_search,
  587. self._state_value or "(空)")
  588. pair = self.gen_pair()
  589. payload = self.build_base_payload(keyword, page=page, first_search=first_search)
  590. payload["ex1"] = pair["ex1"]
  591. payload["o"] = pair["o"]
  592. response = None
  593. for attempt in range(3):
  594. try:
  595. response = requests.post(
  596. "https://dian.ysbang.cn/wholesale-drug/sales/getWholesaleList/v4270", headers=headers,
  597. json=payload, timeout=30
  598. )
  599. if response.status_code == 200:
  600. break
  601. except Exception as e:
  602. logger.error("第%s页请求失败 (%s/3): %s", page, attempt + 1, e)
  603. response = None
  604. time.sleep(10)
  605. if not response or response.status_code != 200:
  606. logger.error("第%s页请求失败,停止爬取", page)
  607. return False
  608. try:
  609. data_json = response.json()
  610. except json.JSONDecodeError:
  611. logger.exception("第%s页响应不是合法 JSON", page)
  612. return False
  613. data_block = data_json.get("data") or {}
  614. if str(data_json.get("message", "")) == "该操作需要登录":
  615. logger.warning("第%s页需要登录,请检查浏览器登录态", page)
  616. return False
  617. encrypted_o = data_block.get("o")
  618. if not encrypted_o:
  619. logger.warning("第%s页返回无加密 data.o: %s", page, data_json)
  620. break
  621. try:
  622. json_data = decrypt_ysb_payload(encrypted_o)
  623. except Exception as e:
  624. logger.exception("第%s页解密失败: %s", page, e)
  625. continue
  626. state_val = self._extract_state_value(json_data, data_block)
  627. if state_val:
  628. self._state_value = state_val
  629. wholesales = json_data.get("wholesales", [])
  630. if not wholesales:
  631. logger.info(f"第{page}页无数据,停止")
  632. break
  633. for item in wholesales:
  634. item_id = item.get("wholesaleid", "")
  635. if not item_id:
  636. continue
  637. detail_url = (
  638. f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={item_id}&trafficType=1"
  639. )
  640. product = self.to_product(item)
  641. title = product.get("product_name", "")
  642. if self.brand not in title:
  643. self.is_product_count += 1
  644. continue
  645. if self.product not in title:
  646. self.is_product_count += 1
  647. continue
  648. if self.product in title and self.brand in title:
  649. self.is_product_count = 0
  650. if self.is_product_count >= 20:
  651. return False
  652. if not self._goto_detail_page(item_id, detail_url):
  653. logger.warning(
  654. "详情页跳转失败,跳过 item_id=%s url=%s",
  655. item_id, detail_url,
  656. )
  657. continue
  658. product = self.parse_detail(product)
  659. upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
  660. product["snapshot_url"] = self._take_snapshot(upload_key)
  661. try:
  662. self.pipeline.storge_data(product)
  663. logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
  664. except Exception as e:
  665. logger.exception("写入数据库失败: %s", e)
  666. def run(self):
  667. try:
  668. self.init_browser()
  669. self.search()
  670. except Exception as e:
  671. logger.exception("运行异常: %s", e)
  672. finally:
  673. self._quit_browser()