|
@@ -1,33 +1,146 @@
|
|
|
|
|
+import base64
|
|
|
|
|
+import hashlib
|
|
|
|
|
+import json
|
|
|
|
|
+import math
|
|
|
import random
|
|
import random
|
|
|
|
|
+import re
|
|
|
import signal
|
|
import signal
|
|
|
import socket
|
|
import socket
|
|
|
import sys
|
|
import sys
|
|
|
import time
|
|
import time
|
|
|
-import base64
|
|
|
|
|
-import math
|
|
|
|
|
|
|
+import zlib
|
|
|
|
|
+from pathlib import Path
|
|
|
import requests
|
|
import requests
|
|
|
|
|
+import secrets
|
|
|
|
|
+import string
|
|
|
|
|
+from Crypto.Cipher import AES
|
|
|
from commons.conn_mysql import MySQLPoolOnline
|
|
from commons.conn_mysql import MySQLPoolOnline
|
|
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
|
from commons.Logger import logger
|
|
from commons.Logger import logger
|
|
|
from oss_upload.oss_upload import AliyunOSSUploader
|
|
from oss_upload.oss_upload import AliyunOSSUploader
|
|
|
-from commons.config import YSB_ACCOUNT, YSB_PASSWORD
|
|
|
|
|
|
|
+from commons.config import YSB_ACCOUNT
|
|
|
|
|
+from pipelines.drug_pipelines import DrugPipeline
|
|
|
|
|
+from datetime import datetime, timedelta
|
|
|
|
|
+from area_info.city_name_to_id import get_city
|
|
|
|
|
+
|
|
|
CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
|
|
CAPTCHA_TOKEN = "zPzmt1mG1ouCU6GTzsZN2Lmm8pdZypapPcLJTBRETco"
|
|
|
CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
|
|
CAPTCHA_API_URL = "http://api.jfbym.com/api/YmServer/customApi"
|
|
|
|
|
|
|
|
SLIDER_OFFSET_FIX = 10
|
|
SLIDER_OFFSET_FIX = 10
|
|
|
|
|
+DETAIL_GET_TIMEOUT = 15
|
|
|
|
|
+DETAIL_URL_WAIT = 10
|
|
|
|
|
+DETAIL_DOM_WAIT = 8
|
|
|
|
|
+DETAIL_NAV_RETRIES = 3
|
|
|
|
|
+DETAIL_APPROVAL_XPATH = (
|
|
|
|
|
+ 'xpath://div[@class="drug-info"]//span[contains(text(),"批准文号")]'
|
|
|
|
|
+)
|
|
|
|
|
|
|
|
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
|
|
chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
|
|
|
|
|
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
|
|
|
|
+YSB_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoshibang"
|
|
|
|
|
+BROWSER_PROFILE_SUBDIR = "chrome_profile"
|
|
|
|
|
+
|
|
|
|
|
+headers = {
|
|
|
|
|
+ "Accept": "*/*",
|
|
|
|
|
+ "Accept-Language": "zh-CN,zh;q=0.9",
|
|
|
|
|
+ "Connection": "keep-alive",
|
|
|
|
|
+ "Content-Type": "application/json",
|
|
|
|
|
+ "Origin": "https://dian.ysbang.cn",
|
|
|
|
|
+ "Referer": "https://dian.ysbang.cn/",
|
|
|
|
|
+ "Sec-Fetch-Dest": "empty",
|
|
|
|
|
+ "Sec-Fetch-Mode": "cors",
|
|
|
|
|
+ "Sec-Fetch-Site": "same-origin",
|
|
|
|
|
+ "User-Agent": (
|
|
|
|
|
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
|
|
|
+ "(KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
|
|
|
|
|
+ ),
|
|
|
|
|
+ "sec-ch-ua": '"Chromium";v="146", "Not-A.Brand";v="24", "Google Chrome";v="146"',
|
|
|
|
|
+ "sec-ch-ua-mobile": "?0",
|
|
|
|
|
+ "sec-ch-ua-platform": '"Windows"',
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def pkcs7_unpad(data):
    """Strip PKCS#7 padding from a decrypted byte string.

    Args:
        data: Padded bytes; the last byte gives the padding length.

    Returns:
        ``data`` without its trailing padding bytes.

    Raises:
        ValueError: If ``data`` is empty, the padding length is outside
            1..16, or the trailing bytes do not all equal that length.
    """
    if not data:
        raise ValueError("Empty data for PKCS7 unpad")

    pad_len = data[-1]
    if not 1 <= pad_len <= 16:
        raise ValueError("Invalid PKCS7 padding length")

    # Split once, then verify every padding byte carries the length value.
    body, tail = data[:-pad_len], data[-pad_len:]
    if tail != bytes([pad_len]) * pad_len:
        raise ValueError("Invalid PKCS7 padding bytes")
    return body
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def derive_ysb_key():
    """Build the 16-byte AES key used to decrypt list responses.

    The key is the first 16 hex characters of the MD5 digest of a fixed
    seed string, upper-cased and encoded as UTF-8.

    Returns:
        A 16-byte ``bytes`` key.
    """
    seed = "BhCLxFfFhd12K4qRGPfy".encode("utf-8")
    digest_prefix = hashlib.md5(seed).hexdigest()[:16]
    return digest_prefix.upper().encode("utf-8")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def decrypt_ysb_payload(cipher_text_b64):
    """Decrypt the ``data.o`` field of the list endpoint into a JSON object.

    Pipeline: base64-decode, AES-ECB decrypt with the derived key, strip
    PKCS#7 padding, gunzip, then parse the UTF-8 text as JSON.

    Args:
        cipher_text_b64: Base64 text taken from the response.

    Returns:
        The object produced by ``json.loads``.
    """
    aes_key = derive_ysb_key()
    encrypted = base64.b64decode(cipher_text_b64)
    padded_plain = AES.new(aes_key, AES.MODE_ECB).decrypt(encrypted)
    compressed = pkcs7_unpad(padded_plain)
    # wbits = MAX_WBITS | 16 tells zlib to expect a gzip header and trailer.
    text = zlib.decompress(compressed, zlib.MAX_WBITS | 16).decode("utf-8")
    return json.loads(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
class YaoShiBangSnapshot:
|
|
class YaoShiBangSnapshot:
|
|
|
- def __init__(self, product=None):
|
|
|
|
|
- self.product = product
|
|
|
|
|
|
|
def __init__(self, drug_dict=None):
    """Set up collaborators and per-run state for one collection task.

    Args:
        drug_dict: Optional task description; when non-empty its fields
            are copied onto the instance via ``get_product_data``.

    NOTE(review): statement nesting was reconstructed from a flattened
    diff; only ``get_product_data()`` is assumed to sit under the ``if``.
    """
    # Browser handle; stays None until a browser is started.
    self.driver = None

    self.db = MySQLPoolOnline()
    self.ip = self.login_username = self.login_password = None
    self.platform = 5
    self.pipeline = DrugPipeline("ysb")
    self.task_dict = drug_dict or {}
    self.ossuploader = AliyunOSSUploader()

    # Page range defaults; get_product_data overrides them from the task.
    self.start_page = self.end_page = 1
    self.account_name = YSB_ACCOUNT.get("username", "ysb_default")
    self._register_signal_handler()

    if self.task_dict:
        self.get_product_data()

    # Run-time bookkeeping.
    self.success = True
    self.is_no_prodcut = 0
    self.is_product_count = 0
    self.token = ""
    self._state_value = ""
    window_start = datetime.now() - timedelta(minutes=500)
    self.start_date = window_start.strftime("%Y-%m-%d %H:%M")
|
|
|
|
|
+
|
|
|
|
|
def get_product_data(self):
    """Copy the fields of ``self.task_dict`` onto instance attributes.

    ``id``, ``company_id`` and ``product_name`` are required (a missing
    key raises ``KeyError``); every other field falls back to a default.
    ``end_page`` is never smaller than ``start_page``.
    """
    task = self.task_dict

    # Required fields.
    self.task_id = task["id"]
    self.company_id = task["company_id"]
    self.product = task["product_name"]

    # (attribute name, task key, default when the key is absent)
    optional_fields = (
        ("product_desc", "product_specs", ""),
        ("brand", "product_brand", ""),
        ("product_keyword", "product_keyword", ""),
        ("collect_task_id", "collect_task_id", ""),
        ("sampling_cycle", "sampling_cycle", ""),
        ("sampling_start_time", "sampling_start_time", ""),
        ("sampling_end_time", "sampling_end_time", ""),
        ("collect_equipment_id", "collect_equipment_id", ""),
        ("account_id", "collect_equipment_account_id", ""),
        ("collect_region_id", "collect_region_id", ""),
        ("collect_round", "collect_round", 1),
    )
    for attr_name, task_key, fallback in optional_fields:
        setattr(self, attr_name, task.get(task_key, fallback))

    first_page = self._parse_page(task.get("start_page"), 1)
    self.start_page = first_page
    last_page = self._parse_page(task.get("end_page"), first_page)
    self.end_page = max(first_page, last_page)
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
+ def _parse_page(value, default=1):
|
|
|
|
|
+ try:
|
|
|
|
|
+ page = int(value)
|
|
|
|
|
+ return page if page >= 1 else default
|
|
|
|
|
+ except (TypeError, ValueError):
|
|
|
|
|
+ return default
|
|
|
|
|
|
|
|
def _register_signal_handler(self):
|
|
def _register_signal_handler(self):
|
|
|
def handler(signum, frame):
|
|
def handler(signum, frame):
|
|
@@ -54,16 +167,38 @@ class YaoShiBangSnapshot:
|
|
|
s.bind(("127.0.0.1", 0))
|
|
s.bind(("127.0.0.1", 0))
|
|
|
return s.getsockname()[1]
|
|
return s.getsockname()[1]
|
|
|
|
|
|
|
|
|
|
def _resolve_browser_profile_dir(self):
    """Pick the Chrome user-data directory for the current account.

    Candidates under ``YSB_SPIDER_DIR`` are tried in order: the preferred
    ``<BROWSER_PROFILE_SUBDIR>/<account>`` location, then two legacy
    layouts. The first one containing a ``Default`` directory or a
    ``Local State`` file is reused. Otherwise the parent of the preferred
    location is created and the preferred path is returned.

    Returns:
        A ``Path`` to the directory to use.
    """
    account = self.account_name
    preferred = YSB_SPIDER_DIR / BROWSER_PROFILE_SUBDIR / account
    search_order = (
        preferred,
        YSB_SPIDER_DIR / account,
        YSB_SPIDER_DIR / "spiders" / "yaoshibang" / account,
    )

    for profile in search_order:
        # Either marker shows Chrome has already written to this directory.
        if (profile / "Default").is_dir() or (profile / "Local State").is_file():
            logger.info("使用已有浏览器配置目录: %s", profile)
            return profile

    preferred.parent.mkdir(parents=True, exist_ok=True)
    logger.info("新建浏览器配置目录: %s", preferred)
    return preferred
|
|
|
|
|
+
|
|
|
def init_browser(self):
|
|
def init_browser(self):
|
|
|
co = ChromiumOptions().set_browser_path(chrome_path)
|
|
co = ChromiumOptions().set_browser_path(chrome_path)
|
|
|
debug_port = self._get_free_port()
|
|
debug_port = self._get_free_port()
|
|
|
- co.set_user_data_path(f"./spiders/yaoshibang/{self.account_name}")
|
|
|
|
|
|
|
+ profile_dir = self._resolve_browser_profile_dir()
|
|
|
|
|
+ profile_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
+ co.set_user_data_path(str(profile_dir))
|
|
|
|
|
+ logger.info("浏览器用户目录(绝对路径): %s", profile_dir.resolve())
|
|
|
|
|
|
|
|
co.set_local_port(debug_port)
|
|
co.set_local_port(debug_port)
|
|
|
co.set_argument(f"--remote-debugging-port={debug_port}")
|
|
co.set_argument(f"--remote-debugging-port={debug_port}")
|
|
|
co.set_argument("--remote-debugging-address=127.0.0.1")
|
|
co.set_argument("--remote-debugging-address=127.0.0.1")
|
|
|
# co.set_argument("--disable-blink-features=AutomationControlled")
|
|
# co.set_argument("--disable-blink-features=AutomationControlled")
|
|
|
co.set_argument("--disable-dev-shm-usage")
|
|
co.set_argument("--disable-dev-shm-usage")
|
|
|
|
|
+ co.set_argument("--start-maximized")
|
|
|
co.set_argument("--no-first-run") # 避免首次运行弹窗
|
|
co.set_argument("--no-first-run") # 避免首次运行弹窗
|
|
|
co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
|
|
co.set_argument("--no-default-browser-check") # 避免默认浏览器检查
|
|
|
self.driver = ChromiumPage(co)
|
|
self.driver = ChromiumPage(co)
|
|
@@ -102,7 +237,7 @@ class YaoShiBangSnapshot:
|
|
|
if not math.isfinite(drag_distance) or drag_distance <= 0:
|
|
if not math.isfinite(drag_distance) or drag_distance <= 0:
|
|
|
logger.error("滑块距离无效: %s", drag_distance)
|
|
logger.error("滑块距离无效: %s", drag_distance)
|
|
|
return False
|
|
return False
|
|
|
- self._simulate_slider_drag(slider, drag_distance)
|
|
|
|
|
|
|
+ self._simulate_slider_drag(slider, drag_distance - 5)
|
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
|
return True
|
|
return True
|
|
|
|
|
|
|
@@ -206,11 +341,41 @@ class YaoShiBangSnapshot:
|
|
|
def _is_logged_in(self):
|
|
def _is_logged_in(self):
|
|
|
# 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
|
|
# 与当前账号店铺展示文案一致;换店后需同步修改或改为配置项
|
|
|
title = self.driver.ele(
|
|
title = self.driver.ele(
|
|
|
- "xpath=//*[contains(text(),'广西好药师大药房连锁有限公司天峨远大药店')]",
|
|
|
|
|
|
|
+ "xpath=//span[@class='logout']",
|
|
|
timeout=5,
|
|
timeout=5,
|
|
|
)
|
|
)
|
|
|
return bool(title)
|
|
return bool(title)
|
|
|
|
|
|
|
|
|
|
+ def _current_url(self):
|
|
|
|
|
+ try:
|
|
|
|
|
+ return self.driver.url or ""
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ return ""
|
|
|
|
|
+
|
|
|
|
|
def _goto_detail_page(self, item_id, detail_url):
    """Open a detail page, refreshing once after the initial load.

    Each attempt loads ``detail_url``, clicks the second confirm button of
    a ``y-dialog`` when exactly two are found, refreshes the page, and
    then checks that ``item_id`` appears in the browser URL.

    NOTE(review): nesting was reconstructed from a flattened diff. The
    refresh is assumed to be unconditional and the randomized pause is
    assumed to run after every failed attempt; confirm against the file.

    Args:
        item_id: Identifier expected to appear in the final URL.
        detail_url: Address of the detail page.

    Returns:
        True as soon as an attempt ends on a URL containing ``item_id``;
        False after ``DETAIL_NAV_RETRIES`` unsuccessful attempts.
    """
    confirm_locator = "xpath=//div[@class='y-dialog']//button[contains(text(),'确认')]"

    for attempt in range(1, DETAIL_NAV_RETRIES + 1):
        try:
            self.driver.get(detail_url, timeout=5)
            time.sleep(1.5)

            confirm_buttons = self.driver.eles(confirm_locator, timeout=3)
            if len(confirm_buttons) == 2:
                confirm_buttons[1].click()
                time.sleep(1)

            # Reload so the SPA re-renders for the URL now in the address bar.
            self.driver.refresh()
            time.sleep(1.5)

            if str(item_id) in self.driver.url:
                return True
        except Exception as exc:
            logger.warning(
                "跳转详情异常 item_id=%s attempt=%s: %s",
                item_id, attempt, exc,
            )
        # Short randomized pause before the next attempt.
        time.sleep(random.uniform(0.8, 1.5))
    return False
|
|
|
|
|
+
|
|
|
def login(self):
|
|
def login(self):
|
|
|
logger.info("开始登录药师帮")
|
|
logger.info("开始登录药师帮")
|
|
|
self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
|
|
self.driver.get("https://dian.ysbang.cn/#/login", timeout=15)
|
|
@@ -221,14 +386,14 @@ class YaoShiBangSnapshot:
|
|
|
if not input_name:
|
|
if not input_name:
|
|
|
logger.error("未找到账号输入框")
|
|
logger.error("未找到账号输入框")
|
|
|
return False
|
|
return False
|
|
|
- input_name.input(YSB_ACCOUNT)
|
|
|
|
|
|
|
+ input_name.input(YSB_ACCOUNT["username"])
|
|
|
time.sleep(random.uniform(1.5, 2.5))
|
|
time.sleep(random.uniform(1.5, 2.5))
|
|
|
|
|
|
|
|
input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
|
|
input_pass = self.driver.ele("xpath://input[@name='password']", timeout=5)
|
|
|
if not input_pass:
|
|
if not input_pass:
|
|
|
logger.error("未找到密码输入框")
|
|
logger.error("未找到密码输入框")
|
|
|
return False
|
|
return False
|
|
|
- input_pass.input(YSB_PASSWORD)
|
|
|
|
|
|
|
+ input_pass.input(YSB_ACCOUNT["password"])
|
|
|
time.sleep(random.uniform(1.5, 2.5))
|
|
time.sleep(random.uniform(1.5, 2.5))
|
|
|
|
|
|
|
|
login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
|
|
login_btn = self.driver.ele("xpath://button[text()='登录']", timeout=5)
|
|
@@ -249,35 +414,83 @@ class YaoShiBangSnapshot:
|
|
|
logger.error("登录后未检测到目标店铺名,登录可能失败")
|
|
logger.error("登录后未检测到目标店铺名,登录可能失败")
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
- def get_snapshot(self, detail_url, row_id):
|
|
|
|
|
- self.driver.get(detail_url, timeout=15)
|
|
|
|
|
- self.driver.wait.doc_loaded(timeout=10)
|
|
|
|
|
- time.sleep(2)
|
|
|
|
|
|
|
def _take_snapshot(self, upload_key):
    """Screenshot the page currently shown and upload it; no navigation.

    Args:
        upload_key: Name passed (as ``str``) to the uploader.

    Returns:
        The URL returned by the uploader, or "" when the screenshot is
        empty, an exception occurs, or the uploader returns nothing.
    """
    time.sleep(1)
    self._dismiss_popup_before_screenshot()

    try:
        shot = self.driver.get_screenshot(as_bytes="jpg")
        if shot:
            img_url = self.ossuploader.upload_from_bytes(shot, str(upload_key))
        else:
            logger.warning("截图为空 upload_key=%s", upload_key)
            return ""
    except Exception:
        logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
        return ""

    if img_url:
        logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
        time.sleep(random.uniform(1, 2))
        return img_url

    logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
    return ""
|
|
|
|
|
|
|
|
|
|
def gen_pair(self, ex1_len=9, o_raw_len=16):
    """Generate a random ``ex1``/``o`` value pair for a request payload.

    Args:
        ex1_len: Number of characters in ``ex1``.
        o_raw_len: Number of random bytes encoded into ``o``.

    Returns:
        A dict with ``ex1`` (lowercase letters and digits) and ``o``
        (base64 text of ``o_raw_len`` random bytes).
    """
    charset = string.ascii_lowercase + string.digits
    ex1_chars = [secrets.choice(charset) for _ in range(ex1_len)]
    o_encoded = base64.b64encode(secrets.token_bytes(o_raw_len))
    return {"ex1": "".join(ex1_chars), "o": o_encoded.decode("ascii")}
|
|
|
|
|
+
|
|
|
|
|
def build_base_payload(self, keyword, page, first_search):
    """Assemble the JSON body for one page of the wholesale list request.

    Args:
        keyword: Search text, sent as ``searchkey``.
        page: Page number, sent as ``page``.
        first_search: Sent as ``firstSearch``.

    Returns:
        A new dict. ``ex1`` and ``o`` are left empty for the caller to
        fill; ``stateValue`` and ``token`` come from the instance.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    ex_field = "{} drugInfo {} {}".format(self.start_date, stamp, stamp)

    # Client envelope.
    payload = {
        "platform": "pc",
        "version": "6.0.0",
        "ua": "Chrome146",
        "ex": ex_field,
        "trafficType": 1,
        "ex1": "",
        "o": "",
        "lastClick": -1,
    }
    # Paging, keyword and filter fields.
    payload.update({
        "page": page,
        "pagesize": "60",
        "classify_id": "",
        "searchkey": keyword,
        "onlyTcm": 0,
        "operationtype": 1,
        "qualifiedLoanee": 0,
        "drugId": -1,
        "tagId": "",
        "showRecentlyPurchasedFlag": True,
        "onlySimpleLoan": 0,
        "sn": "",
        "buttons": [],
        "buttonList": [],
        "synonymId": 0,
        "activityTypes": [],
        "provider_filter": "",
        "factoryNames": "",
        "tcmGradeNames": [],
        "tcmExeStandardIds": [],
        "specs": "",
        "deliverFloor": 0,
        "purchaseLimitFloor": 0,
        "nextRequestKey": "",
        "adConfigId": 0,
    })
    # Session-dependent fields.
    payload.update({
        "stateValue": self._state_value,
        "firstSearch": first_search,
        "token": self.token,
    })
    return payload
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
+ def _extract_state_value(json_data, data_block):
|
|
|
|
|
+ for src in (json_data, data_block):
|
|
|
|
|
+ if not isinstance(src, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+ val = src.get("stateValue") or src.get("state_value")
|
|
|
|
|
+ if val:
|
|
|
|
|
+ return str(val)
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
def _dismiss_popup_before_screenshot(self):
|
|
def _dismiss_popup_before_screenshot(self):
|
|
|
"""截图前关闭或隐藏营销弹窗,避免遮挡。"""
|
|
"""截图前关闭或隐藏营销弹窗,避免遮挡。"""
|
|
|
close_locs = [
|
|
close_locs = [
|
|
@@ -324,19 +537,106 @@ class YaoShiBangSnapshot:
|
|
|
except Exception:
|
|
except Exception:
|
|
|
pass
|
|
pass
|
|
|
|
|
|
|
|
- def _save_snapshot_url(self, row_id, img_url):
|
|
|
|
|
- """上传成功后回写库,避免下次任务重复拉取同一批。"""
|
|
|
|
|
- if row_id is None or not img_url:
|
|
|
|
|
- return
|
|
|
|
|
- sql = (
|
|
|
|
|
- "UPDATE `retrieve_process_lowprice_product` "
|
|
|
|
|
- "SET `snapshot_url` = %s WHERE `id` = %s AND `platform` = %s"
|
|
|
|
|
|
|
def to_product(self, item):
    """Convert one raw list entry into the flat product record.

    Args:
        item: Mapping describing a single listing.

    Returns:
        A dict combining values read from ``item`` with task attributes
        held on the instance. ``snapshot_url`` and ``approval_num`` start
        empty.
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    wholesale_id = item.get("wholesaleid", "")
    supplier_id = item.get("providerId", "")

    # First non-empty value wins: disPrice, then minprice, then price.
    unit_price = item.get("disPrice", "") or item.get("minprice", "") or item.get("price", "")
    # Fall back to the abbreviation when the provider name is empty.
    seller_name = item.get("provider_name", "") or item.get("abbreviation", "")

    sampling_config = json.dumps(
        {"sampling_cycle": self.sampling_cycle, "sampling_start_time": self.sampling_start_time,
         "sampling_end_time": self.sampling_end_time})

    record = {
        "platform": self.platform,
        "item_id": wholesale_id,
        "enterprise_id": self.company_id,
        "product_name": item.get("drugname", ""),
        "spec": item.get("specification", ""),
        "one_price": '',
        "detail_url": f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={wholesale_id}&trafficType=1",
        "shop_name": seller_name,
        "anonymous_store_name": "",
        "shop_url": f"https://dian.ysbang.cn/#/supplierstore?providerId={supplier_id}&trafficType=4",
        # Location fields are emitted blank by this mapping.
        "city_name": "",
        "city_id": "",
        "province_name": "",
        "province_id": "",
        "area_info": "",
        "factory_name": item.get("manufacturer", ""),
        "scrape_date": time.strftime("%Y-%m-%d"),
        "price": unit_price,
        "sales": "",
        "stock_count": item.get("stockAvailable", ""),
        "snapshot_url": "",
        "approval_num": "",
        "produced_time": item.get("prodDate", ""),
        "deadline": item.get("valid_date", ""),
        "update_time": now,
        "insert_time": now,
        "number": 1,
        "product_brand": self.brand or "",
        "collect_task_id": self.collect_task_id,
        "search_name": self.product,
        "company_name": "",
        "collect_config_info": sampling_config,
        "account_id": self.account_id,
        "collect_region_id": self.collect_region_id,
        "collect_round": self.collect_round,
        "is_sold_out": 0
    }
    return record
|
|
|
|
|
+
|
|
|
|
|
def parse_detail(self, product):
    """Enrich a product record with values read from the open detail page.

    Reads the approval number and a price from the DOM. The price is
    taken from the discount tooltip when it contains an amount after a
    yen sign; otherwise the current-price element is used. Both the
    half-width (U+00A5) and full-width (U+FFE5) yen signs are accepted
    in either place.

    Args:
        product: Record to update in place; ``price`` and
            ``approval_num`` are overwritten only when a non-empty value
            is found on the page.

    Returns:
        The same ``product`` dict.
    """
    approval_ele = self.driver.ele(
        'xpath://div[@class="drug-info"]//span[contains(text(),"批准文号")]/following-sibling::span[1]')
    approval_num = approval_ele.text if approval_ele else ""

    price = ""
    discount_ele = self.driver.ele(
        'xpath://div[@class="sale-info-wrap"]//div[@class="tooltip-content"]',
        timeout=2,
    )
    # Guard against an element whose text is None before running the regex.
    discount_text = (discount_ele.text or "") if discount_ele else ""
    if discount_text:
        amount = re.search(r"[\u00a5\uffe5]\s*([0-9.]+)", discount_text)
        if amount:
            price = amount.group(1).strip()

    if not price:
        # Only pay for the second lookup when the tooltip gave no price.
        current_ele = self.driver.ele(
            'xpath://div[@class="sale-info-wrap"]//span[contains(@class,"current-price")]',
            timeout=3,
        )
        if current_ele:
            raw_text = current_ele.text or ""
            price = raw_text.replace("\u00a5", "").replace("\uffe5", "").strip()

    # Captured before overwriting so the log shows list price and DOM price.
    list_price = product.get("price", "")
    if price:
        product["price"] = price

    if approval_num:
        product["approval_num"] = approval_num

    logger.info(
        "详情解析 wholesaleid=%s list_price=%s dom_price=%s url=%s",
        product.get("item_id"),
        list_price,
        product.get("price"),
        self._current_url(),
    )
    return product
|
|
|
|
|
|
|
|
- def search(self, data_list):
|
|
|
|
|
|
|
+ def search(self):
|
|
|
self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
|
|
self.driver.get("https://dian.ysbang.cn/#/home", timeout=15)
|
|
|
self.driver.wait.doc_loaded(timeout=10)
|
|
self.driver.wait.doc_loaded(timeout=10)
|
|
|
time.sleep(2)
|
|
time.sleep(2)
|
|
@@ -344,44 +644,111 @@ class YaoShiBangSnapshot:
|
|
|
if not self._is_logged_in():
|
|
if not self._is_logged_in():
|
|
|
if not self.login():
|
|
if not self.login():
|
|
|
return False
|
|
return False
|
|
|
|
|
+ cookies_list = self.driver.cookies()
|
|
|
|
|
+ cookies_dict = {c['name']: c['value'] for c in cookies_list}
|
|
|
|
|
+ self.token = cookies_dict.get("Token") or cookies_dict.get("token")
|
|
|
|
|
+
|
|
|
|
|
+ keyword = self.product
|
|
|
|
|
+ if self.brand:
|
|
|
|
|
+ keyword = (self.brand + " " + self.product).strip()
|
|
|
|
|
+ if self.product_desc:
|
|
|
|
|
+ keyword = (keyword + " " + self.product_desc).strip()
|
|
|
|
|
+
|
|
|
|
|
+ self._state_value = ""
|
|
|
|
|
+ for page in range(1, 100):
|
|
|
|
|
+ first_search = page == 1
|
|
|
|
|
+ logger.info("药师帮爬取第%s页 firstSearch=%s stateValue=%s", page, first_search,
|
|
|
|
|
+ self._state_value or "(空)")
|
|
|
|
|
+ pair = self.gen_pair()
|
|
|
|
|
+ payload = self.build_base_payload(keyword, page=page, first_search=first_search)
|
|
|
|
|
+ payload["ex1"] = pair["ex1"]
|
|
|
|
|
+ payload["o"] = pair["o"]
|
|
|
|
|
+
|
|
|
|
|
+ response = None
|
|
|
|
|
+ for attempt in range(3):
|
|
|
|
|
+ try:
|
|
|
|
|
+ response = requests.post(
|
|
|
|
|
+ "https://dian.ysbang.cn/wholesale-drug/sales/getWholesaleList/v4270", headers=headers,
|
|
|
|
|
+ json=payload, timeout=30
|
|
|
|
|
+ )
|
|
|
|
|
+ if response.status_code == 200:
|
|
|
|
|
+ break
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error("第%s页请求失败 (%s/3): %s", page, attempt + 1, e)
|
|
|
|
|
+ response = None
|
|
|
|
|
+ time.sleep(10)
|
|
|
|
|
+ if not response or response.status_code != 200:
|
|
|
|
|
+ logger.error("第%s页请求失败,停止爬取", page)
|
|
|
|
|
+ return False
|
|
|
|
|
|
|
|
- ok, fail = 0, 0
|
|
|
|
|
- for data in data_list:
|
|
|
|
|
- row_id = data.get("id")
|
|
|
|
|
- link_url = data.get("link_url")
|
|
|
|
|
- if not link_url:
|
|
|
|
|
- logger.warning("缺少 link_url,跳过 id=%s", row_id)
|
|
|
|
|
- fail += 1
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ data_json = response.json()
|
|
|
|
|
+ except json.JSONDecodeError:
|
|
|
|
|
+ logger.exception("第%s页响应不是合法 JSON", page)
|
|
|
|
|
+ return False
|
|
|
|
|
+ data_block = data_json.get("data") or {}
|
|
|
|
|
+ if str(data_json.get("message", "")) == "该操作需要登录":
|
|
|
|
|
+ logger.warning("第%s页需要登录,请检查浏览器登录态", page)
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ encrypted_o = data_block.get("o")
|
|
|
|
|
+ if not encrypted_o:
|
|
|
|
|
+ logger.warning("第%s页返回无加密 data.o: %s", page, data_json)
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ json_data = decrypt_ysb_payload(encrypted_o)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.exception("第%s页解密失败: %s", page, e)
|
|
|
continue
|
|
continue
|
|
|
- img_url = self.get_snapshot(link_url, row_id)
|
|
|
|
|
- if img_url:
|
|
|
|
|
- self._save_snapshot_url(row_id, img_url)
|
|
|
|
|
- ok += 1
|
|
|
|
|
- else:
|
|
|
|
|
- fail += 1
|
|
|
|
|
- logger.info("快照任务结束 成功=%s 失败=%s 总计=%s", ok, fail, len(data_list))
|
|
|
|
|
- return ok > 0
|
|
|
|
|
|
|
|
|
|
- def run(self):
|
|
|
|
|
- date_str = time.strftime("%Y-%m-%d")
|
|
|
|
|
- sql = """
|
|
|
|
|
- SELECT `id`,`link_url` FROM `retrieve_process_lowprice_product`
|
|
|
|
|
- WHERE `platform`=%s AND `snapshot_url` IS NULL AND `scrape_date`=%s
|
|
|
|
|
- LIMIT 100 """
|
|
|
|
|
-
|
|
|
|
|
- data_list = self.db_online.select_data(sql, (self.platform, date_str))
|
|
|
|
|
- if not data_list:
|
|
|
|
|
- logger.info("当前不需要更新快照")
|
|
|
|
|
- return
|
|
|
|
|
|
|
+ state_val = self._extract_state_value(json_data, data_block)
|
|
|
|
|
+ if state_val:
|
|
|
|
|
+ self._state_value = state_val
|
|
|
|
|
+
|
|
|
|
|
+ wholesales = json_data.get("wholesales", [])
|
|
|
|
|
+ if not wholesales:
|
|
|
|
|
+ logger.info(f"第{page}页无数据,停止")
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ for item in wholesales:
|
|
|
|
|
+ item_id = item.get("wholesaleid", "")
|
|
|
|
|
+ if not item_id:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ detail_url = (
|
|
|
|
|
+ f"https://dian.ysbang.cn/#/drugInfo?wholesaleid={item_id}&trafficType=1"
|
|
|
|
|
+ )
|
|
|
|
|
+ product = self.to_product(item)
|
|
|
|
|
+ title = product.get("product_name", "")
|
|
|
|
|
+ if self.brand not in title:
|
|
|
|
|
+ self.is_product_count += 1
|
|
|
|
|
+ continue
|
|
|
|
|
+ if self.product not in title:
|
|
|
|
|
+ self.is_product_count += 1
|
|
|
|
|
+ continue
|
|
|
|
|
+ if self.product in title and self.brand in title:
|
|
|
|
|
+ self.is_product_count = 0
|
|
|
|
|
+ if self.is_product_count >= 20:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ self._goto_detail_page(item_id, detail_url)
|
|
|
|
|
+
|
|
|
|
|
+ product = self.parse_detail(product)
|
|
|
|
|
+ upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
|
|
|
|
|
+ product["snapshot_url"] = self._take_snapshot(upload_key)
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ self.pipeline.storge_data(product)
|
|
|
|
|
+ logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.exception("写入数据库失败: %s", e)
|
|
|
|
|
|
|
|
|
|
def run(self):
    """Start the browser, run the search, and always close the browser.

    Any exception raised by either step is logged and not re-raised.
    """
    try:
        self.init_browser()
        self.search()
    except Exception as exc:
        logger.exception("运行异常: %s", exc)
    finally:
        # Runs on success and on failure.
        self._quit_browser()
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-if __name__ == "__main__":
|
|
|
|
|
- YaoShiBangSnapshot().run()
|
|
|