main.py 90 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. from logger_config import logger
  3. from datetime import datetime
  4. import random
  5. import csv
  6. import os
  7. import time
  8. import json
  9. import pymysql
  10. from pymysql.err import OperationalError, ProgrammingError, DataError
  11. from config import *
  12. import re
  13. import uuid
  14. import requests
  15. import base64
  16. from io import BytesIO
  17. from PIL import Image
  18. import traceback
  19. # ===================== 工具函数:获取当前时间字符串 =====================
  20. def get_current_time():
  21. """统一日志时间格式"""
  22. return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Proxy IP pool configuration
PROXY_POOL_URL = ""  # endpoint expected to return one "host:port" proxy per GET (empty → pool disabled)
PROXY_VALIDATION_URL = ""  # URL fetched *through* a candidate proxy to confirm it works
PROXY_TIMEOUT = 10  # proxy-validation request timeout (seconds)
  27. def get_random_proxy():
  28. """从代理池获取随机代理IP"""
  29. try:
  30. response = requests.get(PROXY_POOL_URL, timeout=10)
  31. if response.status_code == 200:
  32. proxy = response.text.strip()
  33. if validate_proxy(proxy):
  34. logger.info(f"获取到有效代理: {proxy}")
  35. return proxy
  36. logger.warning(f"代理无效: {proxy}")
  37. except Exception as e:
  38. logger.error(f"获取代理失败: {str(e)}")
  39. return None
  40. def validate_proxy(proxy):
  41. """验证代理IP有效性"""
  42. try:
  43. proxies = {
  44. "http": f"http://{proxy}",
  45. "https": f"https://{proxy}"
  46. }
  47. response = requests.get(
  48. PROXY_VALIDATION_URL,
  49. proxies=proxies,
  50. timeout=PROXY_TIMEOUT
  51. )
  52. return response.status_code == 200
  53. except:
  54. return False
  55. def init_browser_with_proxy(playwright):
  56. proxy = get_random_proxy()
  57. proxy_config = None
  58. if proxy:
  59. proxy_server, proxy_port = proxy.split(":")
  60. proxy_config = {
  61. "server": f"http://{proxy_server}:{proxy_port}",
  62. # "username": "your_proxy_username",
  63. # "password": "your_proxy_password"
  64. }
  65. logger.info(f"使用代理: {proxy_server}:{proxy_port}")
  66. else:
  67. logger.warning("未获取到有效代理,将使用本地IP")
  68. # 启动浏览器(保留原有反爬配置)
  69. return playwright.chromium.launch(
  70. headless=False, # 非无头模式
  71. channel="chrome", # 使用Chrome内核
  72. slow_mo=random.randint(100, 300), # 随机操作延迟
  73. proxy=proxy_config, # 代理配置(None则不使用代理)
  74. args=[
  75. "--disable-blink-features=AutomationControlled", # 核心反检测
  76. "--enable-automation=false",
  77. "--disable-infobars",
  78. "--remote-debugging-port=0",
  79. "--start-maximized",
  80. "--disable-extensions",
  81. "--disable-plugins-discovery",
  82. "--no-sandbox",
  83. "--disable-dev-shm-usage",
  84. # 随机Chrome版本UA
  85. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36"
  86. ]
  87. )
  88. # ==================== 2. 反爬工具函数 ====================
  89. def random_delay(min_seconds, max_seconds):
  90. """生成随机延迟(核心反爬:避免固定间隔)"""
  91. delay = random.uniform(min_seconds, max_seconds)
  92. time.sleep(delay)
  93. return delay
  94. def simulate_human_typing(page, locator, text):
  95. """模拟真人打字(逐个字符输入,带随机间隔)"""
  96. try:
  97. locator.click()
  98. locator.clear()
  99. for char in text:
  100. locator.type(char, delay=random.uniform(MIN_INPUT_DELAY, MAX_INPUT_DELAY))
  101. random_delay(0.05, 0.1) # 字符间额外小延迟
  102. logger.info(f" 模拟真人输入完成:{text}")
  103. except Exception as e:
  104. logger.error(f"模拟打字失败:{e}")
  105. locator.fill(text) # 兜底:直接填充
  106. def save_cookies(context, cookie_path=COOKIE_FILE_PATH):
  107. """保存Cookie到本地JSON文件"""
  108. try:
  109. cookies = context.cookies()
  110. with open(cookie_path, "w", encoding="utf-8") as f:
  111. json.dump(cookies, f, ensure_ascii=False, indent=2)
  112. logger.info(f"Cookie已保存到:{cookie_path}")
  113. return True
  114. except Exception as e:
  115. logger.error(f" 保存Cookie失败:{e}")
  116. return False
  117. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  118. """从本地JSON文件加载Cookie到浏览器上下文"""
  119. if not os.path.exists(cookie_path):
  120. logger.warning(f" Cookie文件不存在:{cookie_path}")
  121. return False
  122. try:
  123. with open(cookie_path, "r", encoding="utf-8") as f:
  124. cookies = json.load(f)
  125. context.add_cookies(cookies)
  126. logger.info(f"✅ 已从{cookie_path}加载Cookie")
  127. return True
  128. except Exception as e:
  129. logger.error(f" 加载Cookie失败:{e}")
  130. return False
  131. def is_login(page):
  132. """验证是否已登录(核心:检测登录态)"""
  133. try:
  134. # 访问需要登录的页面
  135. page.goto(LOGIN_VALIDATE_URL, timeout=300000)
  136. page.wait_for_load_state("networkidle")
  137. # 检测是否跳转到登录页(URL包含login则未登录)
  138. if "login" in page.url.lower():
  139. logger.warning(" Cookie失效,需要重新登录")
  140. return False
  141. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  142. # if page.locator("用户中心选择器").count() > 0:
  143. # return True
  144. logger.info(" Cookie有效,已保持登录状态")
  145. return True
  146. except Exception as e:
  147. logger.error(f" 验证登录状态失败:{e}")
  148. return False
  149. # ==================== 滚动函数重构(核心修改) ====================
  150. def slow_scroll_400px(page,scroll_distance1=400):
  151. """
  152. 慢速滚动400px±50px(模拟真人滑动)
  153. :param page: 页面对象
  154. :return: 滚动是否成功
  155. """
  156. try:
  157. # 生成400±50px的随机滚动距离
  158. scroll_distance = random.randint(
  159. scroll_distance1 - SCROLL_OFFSET_RANGE,
  160. scroll_distance1 + SCROLL_OFFSET_RANGE
  161. )
  162. remaining_distance = scroll_distance
  163. total_steps = int(scroll_distance / SCROLL_STEP)
  164. logger.info(
  165. f"📜 开始慢速滚动(目标距离:{scroll_distance}px,总步数:{total_steps},总时长约{total_steps*SCROLL_INTERVAL:.2f}秒)"
  166. )
  167. # 渐进式滚动(每步50px,间隔0.05秒)
  168. for _ in range(total_steps):
  169. step = min(SCROLL_STEP, remaining_distance)
  170. page.evaluate(f"window.scrollBy(0, {step});")
  171. remaining_distance -= step
  172. time.sleep(SCROLL_INTERVAL)
  173. # 处理剩余不足一步的距离
  174. if remaining_distance > 0:
  175. page.evaluate(f"window.scrollBy(0, {remaining_distance});")
  176. time.sleep(SCROLL_INTERVAL)
  177. # 滚动后等待懒加载完成
  178. page.wait_for_load_state("networkidle", timeout=8000)
  179. random_delay(2.0, 3.0) # 滚动后额外停顿,模拟真人
  180. logger.info(f" 慢速滚动完成,实际滚动距离:{scroll_distance - remaining_distance}px")
  181. return True
  182. except Exception as e:
  183. logger.warning(f" 慢速滚动失败:{e}")
  184. return False
  185. # def check_anti_crawl(page):
  186. # """检测反爬弹窗/验证码(核心:提前识别反爬)"""
  187. # anti_crawl_selectors = [
  188. # "//div[contains(text(), '验证')]",
  189. # "//div[contains(text(), '人机验证')]",
  190. # "//div[contains(text(), '访问过于频繁')]",
  191. # "//button[contains(text(), '验证')]"
  192. # ]
  193. # for selector in anti_crawl_selectors:
  194. # if page.locator(selector).count() > 0:
  195. # logger.error("❌ 检测到反爬验证弹窗!请手动完成验证后按回车继续...")
  196. # input() # 暂停等待手动验证
  197. # return True
  198. # return False
# CSV output configuration
CSV_FILE_PATH = f"ybm_collect_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"  # timestamped output path (fixed at import time)
CSV_HEADERS = [
    "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
    "店铺名称", "公司名称",
    "有效日期", "生产日期", "批准文号", "采集时间"
]  # column headers, in write order
  206. # ==================== 登录函数 ====================
  207. def login_operation(page, username, password):
  208. """登录操作函数"""
  209. try:
  210. # 输入手机号(直接用单个变量)
  211. page.wait_for_selector(USERNAME_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  212. page.wait_for_timeout(timeout=3000)
  213. page.fill(USERNAME_SELECTOR, username)
  214. logger.info(" 已输入登录账号")
  215. # 输入密码
  216. page.wait_for_selector(PASSWORD_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  217. page.wait_for_timeout(timeout=3000)
  218. page.fill(PASSWORD_SELECTOR, password)
  219. logger.info(" 已输入登录密码")
  220. random_delay(1, 2)
  221. agree_btn = page.locator('span.el-checkbox__inner')
  222. agree_btn.click()
  223. # 点击登录按钮
  224. page.wait_for_selector(LOGIN_BTN_SELECTOR, timeout=ELEMENT_TIMEOUT)
  225. page.wait_for_timeout(timeout=3000)
  226. page.click(LOGIN_BTN_SELECTOR)
  227. logger.info(" 已点击登录按钮")
  228. page.wait_for_timeout(LOGIN_AFTER_CLICK)
  229. return True
  230. except PlaywrightTimeoutError as e:
  231. logger.error(f" 登录失败:元素定位超时 - {str(e)}")
  232. return False
  233. except Exception as e:
  234. logger.error(f" 登录异常:{str(e)}")
  235. return False
def kill_masks(page):
    """
    Forcibly strip leftover mask/overlay layers and restore body scroll/click state.

    Runs a page-side script that (1) removes well-known Element-UI overlay nodes,
    (2) neutralizes any near-full-screen fixed/absolute element with z-index >= 1000,
    and (3) re-enables scrolling and interaction on <html>/<body>.

    :param page: Playwright page object to clean up.
    """
    # NOTE: the script below is a runtime string evaluated in the browser;
    # its inline comments are intentionally left untouched.
    page.evaluate(r"""
    () => {
        const removed = [];
        const hidden = [];
        // 1) 先处理已知常见遮罩
        const knownSelectors = [
            '.v-modal',
            '.el-overlay',
            '.el-overlay-dialog',
            '.el-dialog__wrapper',
            '.el-message-box__wrapper',
            '.el-loading-mask',
            '.el-popup-parent--hidden'
        ];
        for (const sel of knownSelectors) {
            document.querySelectorAll(sel).forEach(el => {
                // v-modal / overlay 直接 remove 最省事
                removed.push(sel);
                el.remove();
            });
        }
        // 2) 再做一次“泛化兜底”:全屏 fixed/absolute + 高 z-index 的覆盖层
        // 注意:不要误删页面正常的固定导航,所以加上“近似全屏”的判断
        const all = Array.from(document.querySelectorAll('body *'));
        for (const el of all) {
            const s = window.getComputedStyle(el);
            if (!s) continue;
            const z = parseInt(s.zIndex || '0', 10);
            const pos = s.position;
            const pe = s.pointerEvents;
            if ((pos === 'fixed' || pos === 'absolute') && z >= 1000 && pe !== 'none') {
                const r = el.getBoundingClientRect();
                const nearFullScreen =
                    r.width >= window.innerWidth * 0.8 &&
                    r.height >= window.innerHeight * 0.8 &&
                    r.left <= window.innerWidth * 0.1 &&
                    r.top <= window.innerHeight * 0.1;
                // 常见遮罩是半透明背景色,或者透明但拦截点击
                const bg = s.backgroundColor || '';
                const looksLikeMask =
                    nearFullScreen && (bg.includes('rgba') || bg.includes('rgb') || s.opacity !== '1');
                if (nearFullScreen) {
                    // 不管透明不透明,只要近似全屏且高 z-index,就先让它不拦截点击
                    el.style.pointerEvents = 'none';
                    el.style.display = 'none';
                    hidden.push(el.tagName + '.' + (el.className || ''));
                }
            }
        }
        // 3) 恢复 body / html 的滚动与交互(很多弹窗会锁滚动)
        document.documentElement.style.overflow = 'auto';
        document.body.style.overflow = 'auto';
        document.body.style.position = 'static';
        document.body.style.width = 'auto';
        document.body.style.paddingRight = '0px';
        // 4) 去掉 Element-UI 常见的锁定 class
        document.body.classList.remove('el-popup-parent--hidden');
        return { removed, hiddenCount: hidden.length, hidden };
    }
    """)
  300. def force_close_popup(page):
  301. """关闭新手引导/遮罩(多步:下一步/完成/我知道了),并兜底移除遮罩层"""
  302. try:
  303. # 1) 尝试连续点“下一步/完成/我知道了/关闭”
  304. for _ in range(5): # 最多点5次,足够覆盖多步引导
  305. btn = page.locator(
  306. "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  307. ).first
  308. if btn.count() > 0 and btn.is_visible():
  309. btn.click(timeout=1500)
  310. page.wait_for_timeout(300)
  311. continue
  312. # 有些引导是右上角 X(如果存在就点)
  313. close_icon = page.locator(
  314. "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
  315. ).first
  316. if close_icon.count() > 0 and close_icon.is_visible():
  317. close_icon.click(timeout=1000)
  318. page.wait_for_timeout(300)
  319. continue
  320. break
  321. # 2) 兜底:移除常见遮罩层(element-ui / 通用 mask/overlay)
  322. page.evaluate("""
  323. const selectors = [
  324. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  325. '[class*="mask"]', '[class*="overlay"]', '[style*="z-index"]'
  326. ];
  327. for (const sel of selectors) {
  328. document.querySelectorAll(sel).forEach(el => {
  329. const s = window.getComputedStyle(el);
  330. // 只移除“覆盖层”倾向的元素:fixed/absolute 且 z-index 很高
  331. if ((s.position === 'fixed' || s.position === 'absolute') && parseInt(s.zIndex || '0', 10) >= 1000) {
  332. el.remove();
  333. }
  334. });
  335. }
  336. """)
  337. except Exception:
  338. pass
  339. # 调用方式和方案1一致:在搜索后、采集前执行
  340. # force_close_popup(page)
  341. def pick_search_input(page):
  342. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  343. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  344. cnt = inputs.count()
  345. # 优先检查前两个(你说只有两个)
  346. for i in range(min(cnt, 2)):
  347. candidate = inputs.nth(i)
  348. try:
  349. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  350. if candidate.is_enabled():
  351. return candidate
  352. except PlaywrightTimeoutError:
  353. continue
  354. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  355. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  356. candidate.wait_for(state="visible", timeout=ELEMENT_TIMEOUT)
  357. return candidate
  358. def type_slow(locator, text: str, min_delay=0.06, max_delay=0.18):
  359. """逐字输入,模拟真人打字"""
  360. for ch in text:
  361. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
# ==================== Search operation ====================
def search_operation(page, keyword, is_first_search: bool = True):
    """
    Fill the search box and submit the search.

    :param page: page object holding the search box
    :param keyword: search keyword
    :param is_first_search: first search opens results in a new tab; later
                            searches navigate within the same page
    :return: (detail_page, success) — detail_page is None on failure
    """
    try:
        # 1) Locate the search input
        search_locator = page.locator(SEARCH_INPUT_SELECTOR)
        search_locator.wait_for(timeout=ELEMENT_TIMEOUT)
        # 2) Clear it twice over: fill("") plus manual select-all + delete
        search_locator.click(force=True)  # focus first
        search_locator.fill("")
        page.keyboard.down("Control")
        page.keyboard.press("a")
        page.keyboard.up("Control")
        page.keyboard.press("Backspace")
        # 3) Type the keyword character by character (human-like)
        type_slow(search_locator, keyword, min_delay=0.06, max_delay=0.18)
        logger.info(f"📝 已输入搜索关键词:{keyword}")
        # 4) Wait for a visible search button before clicking
        btn = page.locator(f"{SEARCH_BTN_SELECTOR}")
        btn.wait_for(state="visible", timeout=SEARCH_BTN_TIMEOUT)
        page.wait_for_timeout(3000)
        detail_page = page
        if is_first_search:
            # First search: the site opens results in a new tab — start
            # listening for the page event *before* clicking.
            try:
                with page.context.expect_page(timeout=60000) as new_page_info:
                    btn.click()
                detail_page = new_page_info.value
                detail_page.wait_for_load_state("networkidle", timeout=20000)
            except PlaywrightTimeoutError:
                logger.warning(f"{get_current_time()} 未检测到新标签页")
                return None, False
            except Exception as e:
                logger.warning(f"{get_current_time()} 等待新标签页异常:{e}")
                return None, False
        else:
            btn.click()
            # Later searches navigate in place — just wait for the load to finish
            page.wait_for_load_state("networkidle", timeout=20000)
            detail_page = page
            logger.info("✅ 后续搜索:已在原页面完成跳转加载")
        # Dismiss the "first time" highlight tip if it appeared
        test_btn = detail_page.locator("div[data-v-c65c36bc].first-time-highlight-message-btn button")
        btn_count = test_btn.count()
        logger.info(f"✅ 匹配到的元素数量:{btn_count}")
        if btn_count > 0:
            test_btn.wait_for(state="attached", timeout=5000)
            test_btn.click()
        # Clean any guides/masks before the caller starts collecting
        force_close_popup(detail_page)
        kill_masks(detail_page)
        logger.info("✅ 已触发搜索")
        return detail_page, True
    except PlaywrightTimeoutError as e:
        logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
        return None, False  # failure returns (None, False)
    except Exception as e:
        logger.error(f" 搜索异常:{str(e)}")
        return None, False  # failure returns (None, False)
  442. #翻下一页
  443. def goto_next_page(page) -> bool:
  444. """
  445. 基于 button.btn-next 的 aria-disabled 属性判断是否有下一页
  446. :param page: 搜索结果页面对象(detail_page)
  447. :return: True=翻页成功,False=无下一页/翻页失败
  448. """
  449. try:
  450. next_btn = page.locator("button.btn-next").first
  451. # 2. 先等待按钮加载(确保元素存在)
  452. next_btn.wait_for(state="attached", timeout=3000)
  453. # 3. 获取 aria-disabled 属性值(核心判断依据)
  454. aria_disabled = next_btn.get_attribute("aria-disabled")
  455. logger.info(f"下一页按钮 aria-disabled 属性值:{aria_disabled}")
  456. # 4. 判断是否有下一页:aria-disabled="true" 表示无下一页
  457. if aria_disabled == "true":
  458. logger.warning("⚠️ 下一页按钮 aria-disabled=true,已无更多页面")
  459. return False
  460. page.wait_for_timeout(500)
  461. # 6. 确保按钮可见且可点击(强制点击兜底)
  462. if next_btn.is_visible() and next_btn.is_enabled():
  463. next_btn.click(timeout=5000)
  464. else:
  465. # 兜底:强制点击(避免元素不可见但实际可点击的情况)
  466. next_btn.click(force=True, timeout=5000)
  467. logger.info("✅ 翻页成功,下一页按钮 aria-disabled=false")
  468. return True
  469. except PlaywrightTimeoutError:
  470. logger.warning("⚠️ 下一页按钮加载超时,判定无更多页面")
  471. return False
  472. except Exception as e:
  473. logger.warning(f"⚠️ 翻页操作异常:{e},判定无更多页面")
  474. return False
def popup_guard(page, tag=""):
    """
    Global popup/mask guard: clicks through multi-step guides, hits close icons,
    strips leftover masks, and restores page scrolling.

    :param page: Playwright page to sanitize.
    :param tag: label meant to distinguish the call site in logs
                (currently unused in the messages below).
    """
    try:
        # Give a popup a moment to appear
        page.wait_for_timeout(300)
        # 1) Repeatedly click "next / done / got it / close" (max 6 rounds)
        for _ in range(6):
            btn = page.locator(
                "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
            ).first
            if btn.count() > 0 and btn.is_visible():
                btn.click(timeout=1500)
                page.wait_for_timeout(250)
                continue
            # 2) Common close icons (Element-UI dialogs etc.)
            close_btn = page.locator(
                "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
            ).first
            if close_btn.count() > 0 and close_btn.is_visible():
                close_btn.click(timeout=1200)
                page.wait_for_timeout(250)
                continue
            break
        # 3) Strip masks and restore scroll/interaction inside the page.
        # NOTE: the script below is a runtime string; its inline comments are left untouched.
        page.evaluate(r"""
        () => {
            // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
            const selectors = [
                '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
                '.el-message-box__wrapper', '.el-loading-mask'
            ];
            selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
            // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
            const all = Array.from(document.querySelectorAll('body *'));
            for (const el of all) {
                const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
                const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
                // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
                if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
                    const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
                    // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
                    const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
                    if (nearFull) {
                        el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
                        el.style.display = 'none'; // 隐藏元素
                    }
                }
            }
            // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
            document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
            document.body.style.overflow = 'auto'; // body标签恢复滚动
            document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
        }
        """)
        logger.info("杀除弹窗成功")
    except Exception:
        # Best-effort guard: never let popup cleanup break the main flow
        pass
  535. def open_detail_page(list_page, item, keyword, idx, *, timeout=15000):
  536. """
  537. 点击商品进入详情页,兼容:
  538. 1) 新开 tab(返回 detail_page != list_page, opened_new_tab=True)
  539. 2) 同 tab 跳转(detail_page == list_page, opened_new_tab=False)
  540. """
  541. ctx = list_page.context
  542. list_url = list_page.url
  543. detail_page = None
  544. opened_new_tab = False
  545. try:
  546. # 期望新开 tab(很多站点会这样)
  547. with ctx.expect_page(timeout=timeout) as p:
  548. item.click(delay=random.uniform(0.1, 0.3))
  549. detail_page = p.value
  550. opened_new_tab = True
  551. logger.info(f" 「{keyword}」第{idx}个商品 - 新开标签页进入详情")
  552. except PlaywrightTimeoutError:
  553. # 兜底:没新开 tab,大概率是同页跳转/弹层
  554. detail_page = list_page
  555. opened_new_tab = False
  556. logger.info(f" 「{keyword}」第{idx}个商品 - 未新开标签页,按同页进入详情处理")
  557. return detail_page, opened_new_tab, list_url
def return_to_list(list_page, detail_page, opened_new_tab, list_url, keyword, idx):
    """
    Return from a product detail view to the result list page.

    - new tab: close the detail tab, then bring_to_front the list page
    - same tab: go_back until list_url is restored; if nothing navigated
      (an in-page modal), press ESC to dismiss it

    :param list_page: the page holding the search results
    :param detail_page: page returned by open_detail_page (may equal list_page)
    :param opened_new_tab: whether the detail opened in a new tab
    :param list_url: URL of the list page captured before the click
    :param keyword/idx: used only for log messages
    """
    # Bail out early if the list page is already gone — avoids a second exception
    if list_page is None or list_page.is_closed():
        logger.warning(f" 「{keyword}」第{idx}个商品 - 列表页已关闭,无法切回")
        return
    if opened_new_tab:
        # Only close the newly opened detail tab — never list_page itself
        try:
            if detail_page and (detail_page is not list_page) and (not detail_page.is_closed()):
                detail_page.close()
                logger.info(f"📌 「{keyword}」第{idx}个商品 - 已关闭详情页标签页")
        except Exception as e:
            logger.warning(f" 「{keyword}」第{idx}个商品 - 关闭详情页失败:{e}")
        # Switch focus back to the list page
        try:
            list_page.bring_to_front()
            list_page.mouse.move(random.randint(100, 300), random.randint(200, 400))
            random_delay(0.3, 0.8)
            list_page.wait_for_load_state("networkidle")
            logger.info(f" 「{keyword}」第{idx}个商品 - 已切回列表页(新tab模式)")
        except Exception as e:
            logger.warning(f" 「{keyword}」第{idx}个商品 - 切回列表页失败:{e}")
        return
    # Same tab: detail_page == list_page
    try:
        # 1) URL changed → a real navigation happened → go_back to the list
        if list_page.url != list_url:
            for _ in range(3):  # back out at most 3 times to avoid looping forever
                list_page.go_back(timeout=15000)
                list_page.wait_for_load_state("domcontentloaded", timeout=15000)
                random_delay(0.2, 0.5)
                if list_page.url == list_url:
                    break
            logger.info(f" 「{keyword}」第{idx}个商品 - 已返回列表页(同tab跳转模式)")
        else:
            # 2) URL unchanged: probably a modal detail → try ESC to close it
            list_page.keyboard.press("Escape")
            random_delay(0.2, 0.5)
            logger.info(f" 「{keyword}」第{idx}个商品 - 已尝试关闭弹层并留在列表页(同tab弹层模式)")
        list_page.bring_to_front()
        list_page.wait_for_load_state("networkidle")
    except Exception as e:
        logger.warning(f" 「{keyword}」第{idx}个商品 - 同tab返回列表页失败:{e}")
  606. #判断店名是否已经在数据库
  607. def shop_is_exists_database(shop):
  608. try:
  609. conn = pymysql.connect(**MYSQL_CONFIG)
  610. cursor = conn.cursor(pymysql.cursors.DictCursor) # 改为字典游标
  611. query_sql = """
  612. SELECT province, city, business_license_company, qualification_number FROM ybm_shop_info_middle
  613. WHERE shop = %s
  614. """
  615. cursor.execute(query_sql, (shop,))
  616. result = cursor.fetchone()
  617. # 正确的调试方式(替代cursor._last_executed)
  618. print(f"【调试】传入的店铺名:{repr(shop)}") # repr能显示空格/隐藏字符
  619. print(f"【调试】查询参数:{shop}")
  620. print(f"【调试】查询结果:{result} → 函数返回:{bool(result)}")
  621. is_exists = bool(result)
  622. if is_exists:
  623. logger.info(f"【店铺存在校验】店铺已存在 | 店铺名:{repr(shop)} | 结果:存在(True)不要执行采集店铺")
  624. else:
  625. logger.info(f"【店铺存在校验】店铺不存在 | 店铺名:{repr(shop)} | 结果:不存在(False)")
  626. return is_exists, result
  627. except Exception as e:
  628. logger.error(f"查询店铺失败:{e}")
  629. return False, None # 异常时明确返回False,避免返回None
  630. finally:
  631. # 修复:关闭游标和连接,避免泄露
  632. if cursor:
  633. cursor.close()
  634. if conn:
  635. conn.close()
def insert_shop_info_to_db(shop, contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform, province, city, create_time, update_time):
    """
    Upsert one shop record into ybm_shop_info_middle.

    Uses a parameterized INSERT ... ON DUPLICATE KEY UPDATE so an existing row
    (matching the table's unique key) is refreshed instead of duplicated.

    :param shop: shop name
    :param contact_address: contact address
    :param qualification_number: unified social credit code
    :param business_license_company: company name on the business licence
    :param business_license_address: address on the business licence
    :param scrape_date: scrape date
    :param platform: platform name
    :param province: province
    :param city: city
    :param create_time: row creation timestamp
    :param update_time: row update timestamp
    :return: bool — True when the insert/update committed, False otherwise
    """
    # 1. Pre-bind connection and cursor so `finally` is always safe
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # 2. Parameterized INSERT (prevents SQL injection).
        # NOTE(review): confirm the column list matches the actual table schema.
        # The `#` inline comments below are MySQL comment syntax and are sent
        # to the server as part of the statement.
        sql = """
        INSERT INTO ybm_shop_info_middle (
            shop,
            contact_address,
            qualification_number,
            business_license_company,
            business_license_address,
            scrape_date,
            platform,
            province,
            city,
            create_time,
            update_time
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            contact_address = VALUES(contact_address), # 重复时更新联系地址
            qualification_number = VALUES(qualification_number), # 更新社会信用代码
            business_license_company = VALUES(business_license_company), # 更新公司名
            business_license_address = VALUES(business_license_address), # 更新地址
            scrape_date = VALUES(scrape_date),
            platform = VALUES(platform),
            province = VALUES(province),
            city = VALUES(city),
            update_time = VALUES(update_time) # 重复时更新update_time
        """
        # 3. Parameter order must match the %s placeholders above one-to-one
        params = (
            shop,                      # shop name
            contact_address,           # contact address
            qualification_number,      # social credit code
            business_license_company,  # licenced company name
            business_license_address,  # licenced address
            scrape_date,               # scrape date
            platform,                  # platform name
            province,                  # province
            city,                      # city
            create_time,               # creation timestamp
            update_time                # update timestamp
        )
        # 4. Execute and commit the transaction
        cursor.execute(sql, params)
        conn.commit()
        print(f"✅ 数据插入成功!店铺:{shop} | 公司:{business_license_company}")
        return True
    except pymysql.MySQLError as e:
        # Database-level errors (connection, SQL syntax, field mismatch, ...)
        print(f"MySQL插入失败:{e}")
        print(f"详细异常信息:{traceback.format_exc()}")  # full stack for diagnosis
        if conn:
            conn.rollback()  # roll back the failed transaction
        return False
    except Exception as e:
        # Any other unexpected error
        print(f"插入数据时发生未知错误:{e}")
        print(f"详细异常信息:{traceback.format_exc()}")
        if conn:
            conn.rollback()
        return False
    finally:
        # 5. Always release cursor and connection, success or not
        if cursor:
            cursor.close()
        if conn:
            conn.close()
  715. def insert_single_to_mysql(single_data):
  716. """
  717. 逐条插入单条数据到MySQL数据库
  718. :param single_data: 单条商品数据元组
  719. :return: 插入是否成功
  720. """
  721. conn = None
  722. cursor = None
  723. try:
  724. conn = pymysql.connect(**MYSQL_CONFIG)
  725. cursor = conn.cursor()
  726. # 2. 确保表存在(兼容表未创建的情况)
  727. # cursor.execute(CREATE_TABLE_SQL)
  728. insert_sql = """
  729. INSERT INTO ybm_drug_middle (
  730. product, my_good_price, min_price, manufacture_date, expiry_date,
  731. shop, business_license_company, province, city, manufacturer,
  732. specification, approval_number, product_link, scrape_date,
  733. scrape_province, availability, credit_code, platform, search_key,
  734. number, is_sold_out, sales, inventory, snapshot_url, update_time, create_time
  735. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
  736. """
  737. # 字段值(与SQL占位符顺序严格对应)
  738. values = (
  739. single_data["product"],
  740. single_data["my_good_price"],
  741. single_data["min_price"],
  742. single_data["manufacture_date"],
  743. single_data["expiry_date"],
  744. single_data["shop"],
  745. single_data["business_license_company"],
  746. single_data["province"],
  747. single_data["city"],
  748. single_data["manufacturer"],
  749. single_data["specification"],
  750. single_data["approval_number"],
  751. single_data["product_link"],
  752. single_data["scrape_date"],
  753. single_data["scrape_province"],
  754. single_data["availability"],
  755. single_data["credit_code"],
  756. single_data["platform"],
  757. single_data["search_key"],
  758. single_data["number"],
  759. single_data["is_sold_out"],
  760. single_data["sales"],
  761. single_data["inventory"],
  762. single_data["snapshot_url"],
  763. single_data["update_time"],
  764. single_data["create_time"]
  765. )
  766. cursor.execute(insert_sql, values)
  767. conn.commit()
  768. logger.info(f" 单条数据插入成功:...") # 仅打印标题前20字
  769. return True
  770. except OperationalError as e:
  771. logger.error(f" MySQL连接失败:{str(e)}")
  772. if conn:
  773. conn.rollback()
  774. return False
  775. except ProgrammingError as e:
  776. logger.error(f" SQL语法错误:{str(e)}")
  777. if conn:
  778. conn.rollback()
  779. return False
  780. except Exception as e:
  781. logger.error(f" 单条数据插入失败:{str(e)}")
  782. if conn:
  783. conn.rollback()
  784. return False
  785. finally:
  786. # 关闭游标和连接
  787. if cursor:
  788. cursor.close()
  789. if conn:
  790. conn.close()
  791. def clean_shop_name(raw_shop_name):
  792. """
  793. 清洗店铺名称:移除无关前缀(如【xx截单】)、多余空格/特殊符号,提取核心店名
  794. :param raw_shop_name: 原始采集的店铺名称字符串
  795. :return: 清洗后的纯店铺名称
  796. """
  797. if not raw_shop_name: #处理空值
  798. return ''
  799. # 步骤1:移除【】/()/[]包裹的所有内容(如【2月13日11点截单】)
  800. # 正则解释:匹配【任意字符】、(任意字符)、[任意字符],并替换为空
  801. pattern = r'【.*?】|\(.*?\)|\[.*?\]'
  802. cleaned = re.sub(pattern, '', raw_shop_name)
  803. # 步骤2:移除首尾空格、换行符,替换中间多余空格为单个空格
  804. cleaned = cleaned.strip().replace('\n', '').replace('\r', '')
  805. cleaned = re.sub(r'\s+', ' ', cleaned)
  806. # 步骤3:兜底处理(若清洗后为空,返回原始值避免空字符串)
  807. return cleaned if cleaned else raw_shop_name
  808. def check_dup_in_biz_db(product_link, discount_price_val, scrape_date):
  809. """直接查询业务表是否存在该商品链接+价格"""
  810. conn = None
  811. cursor = None
  812. log_context = (
  813. f"【去重校验】商品链接:{product_link.strip()} | 价格:{discount_price_val} "
  814. f"采集日期:{scrape_date.strip()}"
  815. )
  816. try:
  817. conn = pymysql.connect(**MYSQL_CONFIG)
  818. cursor = conn.cursor()
  819. sql = """
  820. SELECT * FROM ybm_drug_middle
  821. WHERE product_link = %s AND min_price = %s AND scrape_date=%s
  822. """
  823. # 先执行查询
  824. cursor.execute(sql, (product_link.strip(), discount_price_val, scrape_date.strip()))
  825. # 再判断是否有结果
  826. # 如果 fetchone() 返回元组(比如(1,))→ (1,) is not None → 结果为 True;
  827. # 如果 fetchone() 返回 None → None is not None → 结果为 False。
  828. is_dup = cursor.fetchone() is not None
  829. if is_dup:
  830. logger.warning(f"{log_context} - 表中已存在重复记录,跳过本次采集")
  831. else:
  832. logger.info(f"{log_context} - 表中无重复记录,正常采集")
  833. return is_dup
  834. except Exception as e:
  835. logger.error(f"查询业务表去重失败:{str(e)}")
  836. return False
  837. finally:
  838. if cursor:
  839. cursor.close()
  840. if conn:
  841. conn.close()
  842. # 压缩图片函数
  843. def compress_image(image_data, max_size=4*1024*1024): # 4MB上限
  844. try:
  845. img = Image.open(BytesIO(image_data))
  846. # 将RGBA模式转为RGB(兼容JPEG)
  847. if img.mode in ('RGBA', 'P'): # P是PNG的调色板模式,也需转换
  848. # 新建白色背景的RGB图片,把透明图贴上去(避免透明区域变黑)
  849. bg_img = Image.new('RGB', img.size, (255, 255, 255))
  850. bg_img.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
  851. img = bg_img
  852. # 缩小分辨率(按比例缩到宽≤1000px)
  853. if img.width > 1000:
  854. ratio = 1000 / img.width
  855. new_size = (int(img.width*ratio), int(img.height*ratio))
  856. img = img.resize(new_size, Image.Resampling.LANCZOS)
  857. # 降低质量(JPG)/压缩(PNG)
  858. output = BytesIO()
  859. img.save(output, format='JPEG', quality=80) # quality越小体积越小
  860. compressed_data = output.getvalue()
  861. # 若仍超限,继续降质量
  862. if len(compressed_data) > max_size:
  863. img.save(output, format='JPEG', quality=60)
  864. compressed_data = output.getvalue()
  865. return compressed_data
  866. except Exception as e:
  867. logger.debug(f"图片压缩失败:{e}")
  868. return image_data # 压缩失败返回原始数据
  869. def download_image_to_base64(image_url, save_dir = "./download_images"):
  870. """下载网络图片,返回图片二进制数据(BytesIO)"""
  871. try:
  872. if not os.path.exists(save_dir):
  873. os.makedirs(save_dir) # 创建多级目录(比如a/b/c)
  874. print(f"创建本地保存目录:{save_dir}")
  875. except Exception as e:
  876. print(f"创建保存目录失败:{str(e)}")
  877. return None
  878. try:
  879. # 模拟浏览器请求头,避免被服务器拦截
  880. headers = {
  881. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  882. }
  883. response = requests.get(image_url, headers=headers, timeout=15)
  884. response.raise_for_status()
  885. compressed_data = compress_image(response.content)
  886. image_base64 = base64.b64encode(compressed_data).decode("utf-8")
  887. image_data = compressed_data
  888. # 步骤3:提取图片文件名(从URL中截取,避免重复)
  889. # 示例URL:https://xxx.com/123.jpg → 文件名:123.jpg
  890. file_name = image_url.split("/")[-1]
  891. # 处理特殊字符(避免文件名非法)
  892. file_name = file_name.replace("?", "").replace("&", "").replace("=", "")
  893. save_path = os.path.join(save_dir, file_name) # 完整保存路径
  894. # 步骤4:保存图片到本地
  895. with open(save_path, "wb") as f:
  896. f.write(image_data)
  897. print(f"图片已保存到本地:{save_path}")
  898. return image_base64
  899. except requests.exceptions.Timeout:
  900. print(f"下载图片超时:{image_url}")
  901. return None
  902. except requests.exceptions.HTTPError as e:
  903. print(f"图片URL无效(状态码:{response.status_code}):{image_url}")
  904. return None
  905. except Exception as e:
  906. print(f"下载图片失败:{str(e)}")
  907. return None
  908. def get_ocr_res(img):
  909. try:
  910. #img地址
  911. print(f'开始识别图片:{img}')
  912. request_url = request_url_config
  913. img_base64 = download_image_to_base64(img)
  914. if not img_base64:
  915. print("图片下载/转Base64失败,终止OCR识别")
  916. return None
  917. # 获取access_token
  918. access_token = get_access_token()
  919. if not access_token:
  920. print("获取access_token失败,无法调用OCR接口")
  921. return None
  922. params = {"image": img_base64}
  923. request_url = request_url + "?access_token=" + access_token
  924. headers = {'content-type': 'application/x-www-form-urlencoded'}
  925. response = requests.post(request_url, data=params, headers=headers)
  926. if response:
  927. res = response.json()
  928. # 检查OCR返回是否有错误
  929. if "error_code" in res:
  930. print(f"百度OCR接口错误:{res['error_msg']}(错误码:{res['error_code']})")
  931. return None
  932. # 解析识别结果
  933. new_dic = dict()
  934. for ite in res['words_result'].keys():
  935. new_dic[ite] = res['words_result'][ite]['words']
  936. print('资质数据信息', new_dic)
  937. return new_dic
  938. else:
  939. print("OCR接口返回空响应")
  940. return None
  941. except requests.exceptions.RequestException as e:
  942. print(f"网络错误(图片下载/OCR请求失败):{str(e)}")
  943. return None
  944. except KeyError as e:
  945. print(f"OCR响应格式异常,缺失字段:{str(e)}")
  946. return None
  947. except Exception as e:
  948. print(f"OCR识别未知错误:{str(e)}")
  949. return None
  950. def get_access_token():
  951. AppKey = AppKey_config
  952. AppSrcret = AppSecret_config
  953. token_url =token_url_config
  954. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  955. payload = ""
  956. headers = {
  957. 'Content-Type': 'application/json',
  958. 'Accept': 'application/json'
  959. }
  960. try:
  961. response = requests.request("POST", url, headers=headers, data=payload)
  962. response.raise_for_status() # 触发HTTP错误
  963. return response.json()['access_token']
  964. except Exception as e:
  965. print(f"获取access_token失败:{str(e)}")
  966. return None
  967. def extract_province_city(address):
  968. """
  969. 从地址中提取省份和城市
  970. :param address: 营业执照地址(如"福建省福州市马尾区")
  971. :return: (province, city) - 提取到的省份/城市,提取失败返回空字符串
  972. """
  973. if not address: # 地址为空,直接返回空
  974. return "", ""
  975. # 正则1:匹配省份(兼容省/自治区/直辖市/特别行政区)
  976. province_pattern = re.compile(r'([^省]+省|.+自治区|北京市|上海市|天津市|重庆市|.+特别行政区)')
  977. province_match = province_pattern.search(address)
  978. province = province_match.group(1) if province_match else ""
  979. # 正则2:匹配城市(兼容市/自治州/地区/盟,且排除省份已匹配的部分)
  980. # 先去掉已匹配的省份,再匹配城市
  981. address_remain = address.replace(province, "").strip() if province else address.strip()
  982. city_pattern = re.compile(r'([^市]+市|.+自治州|.+地区|.+盟|^[^\d区县镇]+)')
  983. city_match = city_pattern.search(address_remain)
  984. city = city_match.group(1).strip() if city_match else ""
  985. # 兼容直辖市(如"北京市朝阳区"→city=北京市)
  986. if province in ["北京市", "上海市", "天津市", "重庆市"]:
  987. city = province
  988. # 兼容地址不规范的情况(如"福建福州马尾区",无"省"/"市"字)
  989. if not province and not city:
  990. # 匹配前两个地名(如"福建福州"→province=福建,city=福州)
  991. simple_pattern = re.compile(r'^([^\d区县镇]+)')
  992. simple_match = simple_pattern.search(address)
  993. if simple_match:
  994. city = simple_match.group(1).strip() # 只有城市,省份留空
  995. if city and province in city:
  996. city = city.replace(province, "").strip()
  997. return province.strip(), city.strip()
# Core data-collection routine
  999. def collect_data(store_page, keyword):
  1000. """
  1001. 1) 先获取当前页商品个数(count)
  1002. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  1003. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  1004. """
  1005. collect_result = []
  1006. # seen = set()
  1007. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  1008. store_page.wait_for_load_state("networkidle")
  1009. #没有找到商品就跳过这个商品
  1010. page_no = 1
  1011. while True:
  1012. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  1013. # 记录列表页URL(可用于你后续兜底)
  1014. list_page_url = store_page.url
  1015. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  1016. # ✅ 先获取当前页商品个数
  1017. store_page.wait_for_load_state("domcontentloaded") # 先等DOM加载
  1018. store_page.wait_for_load_state("networkidle")
  1019. store_page.wait_for_timeout(500) # 额外等待渲染稳定
  1020. total_limit = store_page.locator(PRODUCT_ITEM_SELECTOR).count()
  1021. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  1022. # 重置当前页的采集计数
  1023. collected_count = 0
  1024. # ========= 初始化无匹配计数器(记录标题不包含核心关键词的次数) =========
  1025. # no_match_count = 0 # 无匹配次数初始化为0
  1026. # MAX_NO_MATCH = 10 # 最大无匹配次数阈值
  1027. #补充没找到关键词的兜底
  1028. not_found_keywords = store_page.locator("div.filter-panel-container-empty-text")
  1029. if not_found_keywords.count() > 0:
  1030. logger.warning(f"⚠️ 关键词「{keyword}」无匹配商品,直接跳过整个关键词采集")
  1031. return []
  1032. # 获取当前页面
  1033. # store_page = context.pages[0] # 从上下文中获取当前页面
  1034. # store_page.wait_for_load_state("networkidle")
  1035. for idx in range(total_limit):
  1036. detail_page = None
  1037. try:
  1038. item = store_page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  1039. collected_count += 1 # 实际采集计数(用于日志)
  1040. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  1041. store_page.wait_for_load_state("networkidle")
  1042. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1043. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  1044. # 1. 初始化所有字段默认值
  1045. title = "无标题"
  1046. price = "0.00"
  1047. shop = "无店名"
  1048. expiry_date = "无有效期"
  1049. manufacture_date = "无生产日期"
  1050. approval_number = "无批准文号"
  1051. manufacturer = "未知公司"
  1052. # discount_price = "0.00"
  1053. spec = "未知规格"
  1054. num = 1 # ✅ 默认 1
  1055. platform = '药帮忙'
  1056. current_time = datetime.now().strftime("%Y-%m-%d")
  1057. is_sold_out = 0
  1058. # ========= 售罄不跳过 =========
  1059. sold_locator = item.locator('div.product-status')
  1060. if sold_locator.count() > 0:
  1061. is_sold_out = 1
  1062. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  1063. # if collected_count % 5 == 0 and collected_count > 0:
  1064. # logger.info("采满5个往下滑")
  1065. # slow_scroll_400px(page)
  1066. # page.wait_for_load_state("networkidle")
  1067. # continue
  1068. #提取商品ID
  1069. # product_id_elem = item.locator('div.product-card[data-product-id]')
  1070. # if product_id.count() > 0:
  1071. # product_id = product_id_elem.get_attribute("data-product-id")
  1072. # logger.info(f"✅ 提取到data-product-id:{product_id}") # 输出:5678955
  1073. # 提取商品标题(处理空值)
  1074. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  1075. if product_locator.count() > 0:
  1076. title = product_locator.inner_text(timeout=3000).strip()
  1077. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  1078. else:
  1079. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  1080. #关键词不在标题中,跳过当前商品
  1081. # core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
  1082. # if core_keyword not in title:
  1083. # no_match_count += 1
  1084. # logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含核心关键词「{core_keyword}」(无匹配次数:{no_match_count}/{MAX_NO_MATCH}),跳过本次循环")
  1085. # continue
  1086. # if no_match_count >= MAX_NO_MATCH:
  1087. # logger.error(f"❌ 关键词「{keyword}」无匹配商品次数已达{MAX_NO_MATCH}次,直接终止当前关键词采集,进入下一个关键词")
  1088. # return []
  1089. # 提取价格(带缺失日志)
  1090. # price_locator = item.locator(PRODUCT_PRICE_SELECTOR)
  1091. price_int = item.locator('//span[@class="price-int"]').text_content().strip()
  1092. # 2. 提取小数部分(注意可能为空,比如价格是整数13)
  1093. price_decimal_elem = item.locator('//span[@class="price-decimal"]')
  1094. if price_decimal_elem.count() > 0:
  1095. price_decimal = price_decimal_elem.text_content().strip()
  1096. else:
  1097. price_decimal = ''
  1098. # 3. 拼接完整价格
  1099. full_price = f"{price_int}{price_decimal}"
  1100. # 转成浮点数(便于后续计算/入库)
  1101. full_price_num = float(full_price)
  1102. logger.info(f"✅ 提取到价格:{full_price_num}")
  1103. if full_price_num is None:
  1104. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  1105. # if full_price_num > 0:
  1106. # price = price_locator.inner_text(timeout=3000).strip()
  1107. # logger.info(f"{'='*10}{keyword}」第{collected_count}个商品 - 列表页采购价格:{price}{'='*10}")
  1108. # else:
  1109. # price = "0.00" # 初始化默认值,避免后续报错
  1110. # logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  1111. # 5. 提取公司名称(带缺失日志)
  1112. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  1113. if manufacturer_locator.count() > 0:
  1114. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  1115. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  1116. else:
  1117. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  1118. #提取店铺名称
  1119. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  1120. if shop_locator.count() > 0:
  1121. raw_shop = shop_locator.inner_text(timeout=3000).strip()
  1122. # 2. 清洗店名(核心新增步骤)
  1123. shop = clean_shop_name(raw_shop)
  1124. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  1125. logger.info(f"原始店名:{raw_shop}")
  1126. logger.info(f"清洗后店名:{shop}{'='*10}")
  1127. else:
  1128. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  1129. # 提取折扣价
  1130. discount_price_val_origin = ""
  1131. discount_price = ""
  1132. discount_price_locator = item.locator('span[data-v-4cb6cc1f].discount-int').first
  1133. if discount_price_locator.count() > 0:
  1134. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  1135. discount_price_val_origin = discount_price
  1136. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  1137. discount_price_val = float(match.group()) if match else 0.00
  1138. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  1139. else:
  1140. #如果没有拿原价替换
  1141. # price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "") else "0.00"
  1142. discount_price_val = full_price_num
  1143. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  1144. merged_price = f"{full_price_num}{discount_price_val_origin}" if discount_price_val_origin else full_price_num
  1145. # 提取有效期(处理空值)
  1146. expiry_date_locator = item.locator(f"{PRODUCT_VALIDITY_SELECTOR}")
  1147. if expiry_date_locator.count() > 0:
  1148. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip().replace('-', '') #.replace('近效期','')
  1149. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  1150. else:
  1151. # 修复:替换未定义的i为collected_count
  1152. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  1153. #获取product_id
  1154. # product_id = None
  1155. # try:
  1156. # product_id = item.get_attribute("data-product-id")
  1157. # if product_id:
  1158. # product_id = product_id.strip()
  1159. # logger.info(f"✅ 「{keyword}」第{collected_count}个商品 - 提取到product_id:{product_id}")
  1160. # ========= 模拟点击商品进入详情页 =========
  1161. logger.info(
  1162. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  1163. )
  1164. # 点击商品项容器,触发详情展示
  1165. # ========== 点击商品跳详情页 ==========
  1166. # 反爬:模拟真人鼠标移动到商品上再点击(不是直接点击)
  1167. item.hover() # 先悬停
  1168. random_delay(0.2, 0.5) # 悬停后延迟
  1169. item.dispatch_event("mousedown")
  1170. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1171. item.dispatch_event("mouseup")
  1172. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1173. try:
  1174. with store_page.context.expect_page(timeout=60000) as p:
  1175. item.click(delay=random.uniform(0.1, 0.3))
  1176. detail_page = p.value
  1177. except PlaywrightTimeoutError:
  1178. logger.warning(
  1179. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  1180. )
  1181. detail_page = None # 标记为无新标签页,避免关闭列表页
  1182. # 等待详情加载(优先用新标签页,无则用列表页)
  1183. target_page = detail_page if detail_page else store_page
  1184. target_page.wait_for_load_state("networkidle", timeout=20000)
  1185. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  1186. logger.info(
  1187. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  1188. )
  1189. # 反爬:检测详情页反爬验证
  1190. # check_anti_crawl(page)
  1191. # ========== 采集详情页的专属信息(有效期/生产日期/批准文号) ==========
  1192. #获取商品详情页链接
  1193. product_link = target_page.url
  1194. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  1195. # ========= ✅ 去重逻辑,拿商品链接和折扣价和有效期和采集日期 =========
  1196. if check_dup_in_biz_db(product_link, full_price_num, current_time):
  1197. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过")
  1198. # ========== 关闭新标签页,切回列表页 ==========
  1199. if detail_page and not detail_page.is_closed():
  1200. detail_page.close() # 关闭详情页标签
  1201. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1202. # 切回原列表页(第一个标签页)
  1203. store_page.bring_to_front() # 激活列表页
  1204. store_page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1205. random_delay(0.5, 1.0) # 增加切换后延迟
  1206. store_page.wait_for_load_state("networkidle")
  1207. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1208. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1209. if collected_count % 6 == 0 and collected_count > 0:
  1210. logger.info("采满6个往下滑")
  1211. slow_scroll_400px(store_page)
  1212. store_page.wait_for_load_state("networkidle")
  1213. continue
  1214. # key = f"{product_link.strip()}|{discount_price_val}"
  1215. # if key in seen:
  1216. # logger.warning(
  1217. # f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过"
  1218. # )
  1219. # if collected_count % 5 == 0 and collected_count > 0:
  1220. # logger.info("采满15个往下滑")
  1221. # slow_scroll_400px(page)
  1222. # page.wait_for_load_state("networkidle")
  1223. # continue
  1224. # seen.add(key)
  1225. # 提取生产日期(修复完成)
  1226. manufacture_date_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="生产日期"]]//div[contains(@class, "spec-info-item-value-text")]')
  1227. if manufacture_date_locator.count() > 0:
  1228. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip()
  1229. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  1230. else:
  1231. # 修复:替换未定义的i为collected_count
  1232. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  1233. # 提取批准文号
  1234. approval_number_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="批准文号"]]//div[contains(@class, "spec-info-item-value-text")]')
  1235. if approval_number_locator.count() > 0:
  1236. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  1237. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  1238. else:
  1239. # 修复:替换未定义的i为collected_count
  1240. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  1241. #提取规格
  1242. spec_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="规格"]]//div[contains(@class, "spec-info-item-value-text")]')
  1243. if spec_locator.count() > 0:
  1244. spec = spec_locator.inner_text(timeout=3000).strip()
  1245. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  1246. else:
  1247. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1248. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  1249. # input("...")
  1250. #提取库存
  1251. storage = ''
  1252. storage_locator = target_page.locator('[data-v-51f0e85d].detail-input-num-right-title')
  1253. if storage_locator.count() > 0:
  1254. storage = storage_locator.inner_text(timeout=3000).strip()
  1255. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页库存:{storage}{'='*10}")
  1256. else:
  1257. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1258. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 库存元素数量不足,使用默认值:{storage}")
  1259. #提取销量
  1260. sell = ''
  1261. sell_locator = target_page.locator('div.detail-info-content-item-value-price-top-right div[data-v-95163d4a]',has_text='已售')
  1262. if sell_locator.count() > 0:
  1263. sell = sell_locator.inner_text(timeout=3000).strip()
  1264. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页销量:{sell}{'='*10}")
  1265. else:
  1266. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 没有销量元素,使用默认值:{sell}")
  1267. #保存快照url上传到oss
  1268. try:
  1269. local_path, oss_url = screenshot_target_page_to_local_then_oss(
  1270. target_page=target_page,
  1271. full_page=True # 截取全屏
  1272. )
  1273. print(f"最终结果:")
  1274. print(f" 本地文件路径:{local_path}")
  1275. logger.info(f" OSS访问链接:{oss_url}")
  1276. except Exception as e:
  1277. logger.warning(f"整体流程执行失败:{str(e)}")
  1278. # input("...")
  1279. province = ""
  1280. city = ""
  1281. business_license_company = ""
  1282. qualification_number = ''
  1283. #如果店名为商品预约中心
  1284. # if shop == '药店品种预约中心':
  1285. # #https://www.ybm100.com/new-front/product-info/detail?type__1241=222029ad07-tWcfAcrWtc_CSPpP_%2FtW_cfB_ETca0SugQSbgC7gAb5RAdZyTA5UdS%3DUAoogIsKBqyWgKP_tgAPItgePrBgRPrlgQP_ug0PTZgEPrugpPA5lq%3DSQPg%3Dgt2_xg%3D2FPgs0oBgYqwcg9%3DWPTuSgTHgtBsfgGEh%3D%2FXvko2R%3DGvhceloleBnCGBqcG%2F2V_uKVUBftg
  1286. # #获取pidhttps://www.ybm100.com/new-front/product-info/detail?type__1241=222029ad07-G%2FxP7PxPJgfPUgu%2FIbv7Wg6gpIgwJg5q4PfAg%2FTWZ_Q6gtHaHG%2FgWCPKsClvGsLPVsgQyuBlVVPTqgtvgQgWvG6gOPTkg5%2F_jgAvTog6vT4g5v_6gSU7vC9cggZgvPAtgZJBPgysGg_OuH%2Fg9ToPgjkBgO%2FgaCQggY7KNlo7itg%2FBGP2GrJpPV6%2FQ6f_u6qvMjPvQVIgPg
  1287. # url = 'https://www.ybm100.com/new-front/product-info/detail?type__1241=222029ad07-G%2FxP7PxPJgfPUgu%2FIbv7Wg6gpIgwJg5q4PfAg%2FTWZ_Q6gtHaHG%2FgWCPKsClvGsLPVsgQyuBlVVPTqgtvgQgWvG6gOPTkg5%2F_jgAvTog6vT4g5v_6gSU7vC9cggZgvPAtgZJBPgysGg_OuH%2Fg9ToPgjkBgO%2FgaCQggY7KNlo7itg%2FBGP2GrJpPV6%2FQ6f_u6qvMjPvQVIgPg'
  1288. # data = {
  1289. # 'id': f'{product_id}',
  1290. # 'isMainProductVirtualSupplier': 0
  1291. # }
  1292. # headers = {
  1293. # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36 Edg/144.0.0.0",
  1294. # 'Cookie': '_abfpc=48083f46aa22e0eaefbace39874e38acc7c631ea_2.0; cna=2b5bf2a0d04d0ec45367fda825d4fa6b; xyy=MjM2JjE4MDA4NjUwMzAw; JSESSIONID=922A896126C5961D09622E042CAAA01D; xyy_token=eyJhbGciOiJIUzUxMiJ9.eyJhY2NvdW50X2lkIjoyMzYsImRldmljZV9pZCI6IiIsIm9zIjoiV2luZG93cyAxMCIsImxvZ2luX3RpbWUiOjE3NjkxNjAzNDQ5MDYsImJyb3dzZXIiOiJDaHJvbWUgMTQiLCJtZXJjaGFudF9pZCI6MjM2LCJpcF9hZGRyIjoiMTEzLjk4LjYyLjE2NiIsInZlcnNpb24iOiIiLCJsb2dpbl91c2VyX2tleSI6IjM3NzQ2ZjM5LTE3MjQtNDBjYi1hNTk4LWRlYTM5MTU2NjllNSJ9.IN8gFX6p4KuClT2KysZLNVuyQuszfdNW5gz7m_u4yq60zqbvSOg1yo0f7TuKcbZVvd-t5mVsb4hoNBRNV6nsYQ; xyy_principal=236&Y2MwY2FiZGYzZjU4NzUzNGE5OWRkZTIwYmRiMmQ4NTk2ZDg5N2QxOQ&236; xyy_last_login_time=1769160344906; acw_tc=1a0c650c17694095621061999e5d6b6730068c59854298f31bdd661882a009; qt_session=KsnsuMqE_1769409754197; ssxmod_itna=1-eq0xgDnDyAeYqDKi=G0KKG7DRDIEpDpxgGDBP01G7DuExjKidtDUDQulGmFgG4G=oG7iheet3RLKNDlpLeDZDGKQDqx0Eb0iiD4Ns3ImkiT53QQGvqUdaeOENowZaTRbY9oVG6MxfXy/UDgEeDU4GnD068CY6bDYYLDBYD74G_DDeDi2rD84D_DGpdMnudxi33nDeDzqr=xG3txYpdweDgADDB_RiDKkP=hDDlGA7YREbPAcTq6PmzxGU8lCGxUeDMFxGXmikYUQy6MK4rZCSfp1EYH1aDtqD9DgbDb42zvrTbp6ebF_mbS_83r1Ki=3iifhNQ2rt0iC0_Yiofx4lxxfxx3Be5WHiTHDDW=fd1xxq05p71UdznuzuAernD=xIxRtbj=/74anQqf5Dxx4hYb0DnOGK0D3j=bGrxnD4D; ssxmod_itna2=1-eq0xgDnDyAeYqDKi=G0KKG7DRDIEpDpxgGDBP01G7DuExjKidtDUDQulGmFgG4G=oG7iheet3RLFoDiaRAqzbCD7pxTs4GNeYfb78=o8pWc0HY8dN0vO6z5i69OeF5Dg34naHHkD98UZ3tVAb=9/L3BSLIczMds0bxfCAIfG0eY3oTQym5z/oAhmi4qDLetNaD',
  1295. # 'Referer': f'https://www.ybm100.com/new/base/skuDetail?id={product_id}&combination=1&type=1',
  1296. # "Content-Type" : "application/json"
  1297. # }
  1298. # response = requests.post(url, json=data, headers=headers)
  1299. # print(response.status_code)
  1300. # try:
  1301. # response_json = response.json()
  1302. # print("✅ 成功解析JSON响应")
  1303. # if 'data' in response_json and 'detail' in response_json['data'] and 'pid' in response_json['data']['detail']:
  1304. # pid = response_json['data']['detail']['pid']
  1305. # print(f"✅ 提取到pid:{pid}")
  1306. # elif 'pid' in response_json:
  1307. # pid = response_json['pid']
  1308. # print(f"✅ 方式二提取到pid:{pid}")
  1309. # else:
  1310. # # 打印响应的前1000个字符,帮助你确认JSON结构
  1311. # print("⚠️ 未找到pid字段,响应数据预览:")
  1312. # print(json.dumps(response_json, ensure_ascii=False, indent=2)[:1000])
  1313. # pid = None
  1314. # except json.JSONDecodeError:
  1315. # # 响应不是JSON格式的情况
  1316. # print("❌ 响应不是JSON格式,无法解析")
  1317. # print("响应文本:", response.text[:1000])
  1318. # pid = None
  1319. # except Exception as e:
  1320. # # 其他异常
  1321. # print(f"❌ 提取pid时出错:{str(e)}")
  1322. # pid = None
  1323. # target_page.goto(f'https://www.ybm100.com/new/base/skuDetail?id={product_id}&combination=1&type=1')
  1324. # shop_name_elem = target_page.locator('span[data-v-5485589c]')
  1325. # shop_name = shop_name_elem.inner_text(timeout=3000).strip()
  1326. # shop_exists, shop_info = shop_is_exists_database(shop_name)
  1327. # if not shop_exists:
  1328. # if shop_info:
  1329. # province = shop_info['province']
  1330. # city = shop_info['city']
  1331. # business_license_company = shop_info['business_license_company']
  1332. # qualification_number = shop_info['qualification_number']
  1333. # #去往药店品种预约中心后面的链接
  1334. # target_page.goto(f"https://www.ybm100.com/new/base/skuDetail?id={pid}&combination=1&type=1")
  1335. # if not shop_exists:
  1336. shop_exists, shop_info = shop_is_exists_database(shop)
  1337. shop_page = None
  1338. #店铺名不是药品预约中心且店铺名不在数据库就要点击
  1339. if shop != "药店品种预约中心" and not shop_exists:
  1340. logger.info("店铺名不是药店品种预约中心且数据库没有该公司的营业执照")
  1341. # 获取营业执照图片
  1342. # 进入店铺
  1343. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1344. entershop_btn = target_page.locator('div[data-v-5485589c].shop-info-container-left-info')
  1345. # 增强:先等待进入店铺按钮可见
  1346. entershop_btn.wait_for(state="visible", timeout=10000)
  1347. entershop_btn.scroll_into_view_if_needed() # 确保按钮在视口内
  1348. entershop_btn.hover() # 先悬停
  1349. random_delay(0.2, 0.5) # 悬停后延迟
  1350. with target_page.expect_popup(timeout=15000) as pop:
  1351. entershop_btn.click()
  1352. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1353. shop_page = pop.value
  1354. shop_page.wait_for_load_state("domcontentloaded") # 比 networkidle 更
  1355. #点击店铺资质
  1356. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1357. shop_license_page = shop_page.locator('//div[contains(@class, "shop-info-container-right-btns-item") and contains(span, "资质/售后")]')
  1358. shop_license_page.wait_for(state="attached", timeout=15000) # 等待元素加载完成
  1359. shop_license_page.scroll_into_view_if_needed() # 确保在视口内
  1360. shop_license_page.hover() # 先悬停
  1361. random_delay(0.2, 0.5) # 悬停后延迟
  1362. # shop_license_page.dispatch_event("mousedown")
  1363. shop_license_page.click()
  1364. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1365. # shop_license_page.dispatch_event("mouseup")
  1366. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1367. shop_page.wait_for_load_state("networkidle")
  1368. # slow_scroll_400px(shop_page, scroll_distance1=700)
  1369. #获取药品经营许可证图片
  1370. shop_page.wait_for_load_state("load")
  1371. ocr_res = None
  1372. # shop_license_div = target_page.locator('//span[contains(text(), "营业执照")]')
  1373. shop_license_img = shop_page.locator('//span[contains(text(), "企业营业执照") or contains(text(), "营业执照(正本)")]/ancestor::div[@class="shop-info-drawer-zz-tab1-list-item"]/img').first
  1374. shop_license_img.wait_for(state="visible", timeout=60000)
  1375. try:
  1376. if shop_license_img.count() > 0:
  1377. shop_license_src = shop_license_img.get_attribute('src')
  1378. shop_license_src = shop_license_src.strip() if shop_license_src else None
  1379. ocr_res = get_ocr_res(shop_license_src)
  1380. # print(f'ocr_res:{ocr_res}')
  1381. # input(".....")
  1382. else:
  1383. shop_license_src = None
  1384. except Exception as e:
  1385. # 捕获定位/提取失败的异常,避免程序崩溃
  1386. logger.warning(f"提取营业执照图片src失败:{e}")
  1387. shop_license_src = None
  1388. print("营业执照图片链接:", shop_license_src)
  1389. # input("..")
  1390. contact_address = ''
  1391. qualification_number = ocr_res.get('社会信用代码', '') if ocr_res else ''
  1392. business_license_company = ocr_res.get('单位名称', '') if ocr_res else ''
  1393. business_license_address = ocr_res.get('地址', '') if ocr_res else ''
  1394. # scrape_date = ''
  1395. # 调用提取函数,获取省份和城市
  1396. province, city = extract_province_city(business_license_address)
  1397. logger.info(f"原始地址:{business_license_address}")
  1398. logger.info(f"提取的省份:{province} | 城市:{city}")
  1399. insert_result = insert_shop_info_to_db(
  1400. shop=shop,
  1401. contact_address=contact_address,
  1402. qualification_number=qualification_number,
  1403. business_license_company=business_license_company,
  1404. business_license_address=business_license_address,
  1405. scrape_date=current_time,
  1406. platform=platform,
  1407. province=province,
  1408. city=city,
  1409. create_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
  1410. update_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  1411. )
  1412. else:
  1413. logger.info("数据库有该店名,在数据库拿取对应字段填充ybm_drug_middle表")
  1414. if shop_info:
  1415. province = shop_info['province']
  1416. city = shop_info['city']
  1417. business_license_company = shop_info['business_license_company']
  1418. qualification_number = shop_info['qualification_number']
  1419. try:
  1420. if shop_page and not shop_page.is_closed():
  1421. random_delay(4,8)
  1422. shop_page.close()
  1423. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭店铺页标签 shop_page")
  1424. except Exception as e:
  1425. logger.warning(f"⚠️ 关闭 shop_page 失败:{e}")
  1426. # # purchase_price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "").isdigit() else 0.00
  1427. random_delay(5,8)
  1428. # ========== 关闭新标签页,切回列表页 ==========
  1429. if detail_page and not detail_page.is_closed():
  1430. detail_page.close() # 关闭详情页标签
  1431. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1432. # 切回原列表页(第一个标签页)
  1433. store_page.bring_to_front() # 激活列表页
  1434. store_page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1435. random_delay(0.5, 1.0) # 增加切换后延迟
  1436. store_page.wait_for_load_state("networkidle")
  1437. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1438. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1439. random_delay(2,4)
  1440. # credit_code = ""
  1441. availability = ""
  1442. # input(".....")
  1443. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  1444. # 构造单条数据元组(适配MySQL字段)
  1445. single_data = {
  1446. # 核心商品信息
  1447. "product": title, # 商品名称
  1448. "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  1449. "min_price": discount_price_val, # 最低价格
  1450. "manufacture_date": manufacture_date, # 生产日期
  1451. "expiry_date": expiry_date, # 有效期
  1452. "shop": shop, # 店铺名
  1453. "business_license_company": business_license_company, # 营业执照主体(公司名称)
  1454. "province": province, # 省份
  1455. "city": city, # 城市
  1456. "manufacturer": manufacturer, # 生产厂家
  1457. "specification": spec, # 规格
  1458. "approval_number": approval_number, # 批准文号
  1459. "product_link": product_link, # 商品链接
  1460. "scrape_date": current_time, # 采集日期
  1461. "scrape_province": "", # 采集省份(可留空或根据IP获取)
  1462. "availability": availability, # 库存状态
  1463. "credit_code": qualification_number, # 统一信用代码(如有可补充提取)
  1464. "platform": platform, # 平台名称(固定或动态获取)
  1465. "search_key": keyword, # 搜索关键词
  1466. "number": num, # 数量(盒数)
  1467. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  1468. "sales": sell, #销量
  1469. "inventory": storage, #库存
  1470. "snapshot_url": oss_url, #快照链接
  1471. "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  1472. "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") # 创建时间
  1473. }
  1474. # 调用逐条插入函数
  1475. insert_single_to_mysql(single_data)
  1476. collect_result.append(single_data)
  1477. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  1478. # input("....")
  1479. except Exception as e:
  1480. # 异常处理:关闭详情页,强制切回列表页
  1481. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  1482. try:
  1483. if detail_page and not detail_page.is_closed():
  1484. detail_page.close()
  1485. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  1486. if store_page and not store_page.is_closed():
  1487. store_page.bring_to_front() # 切回列表页
  1488. store_page.wait_for_load_state("networkidle")
  1489. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1490. except Exception as e2:
  1491. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  1492. continue
  1493. # ✅ 每15次滚动一次(修复:用collected_count,且排除0的情况)
  1494. if collected_count % 6 == 0 and collected_count > 0 and collected_count != total_limit:
  1495. logger.info("采满5个往下滑")
  1496. slow_scroll_400px(store_page,)
  1497. store_page.wait_for_load_state("networkidle")
  1498. # ====== 当前页采集完毕,尝试翻页 ======
  1499. delay = random_delay(1.5, 3.0)
  1500. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  1501. if goto_next_page(store_page):
  1502. logger.info(f"「{keyword}」还有下一页")
  1503. page_no += 1
  1504. continue
  1505. else:
  1506. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  1507. break
  1508. # 关键词采集完成后长延迟
  1509. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  1510. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  1511. return collect_result
  1512. # ==================== 保存到CSV函数(适配新表头) ====================
  1513. # def save_to_csv(data_list):
  1514. # """
  1515. # 保存数据到CSV(适配新表头)
  1516. # :param data_list: list - 采集到的字典数据列表
  1517. # :return: bool - 保存是否成功
  1518. # """
  1519. # if not data_list:
  1520. # logger.warning(" 无数据可保存到CSV")
  1521. # return False
  1522. # try:
  1523. # # 判断文件是否存在,不存在则写入表头
  1524. # file_exists = os.path.exists(CSV_FILE_PATH)
  1525. # # 打开CSV文件(追加模式,utf-8-sig避免Excel乱码)
  1526. # with open(CSV_FILE_PATH, "a", newline="", encoding="utf-8-sig") as f:
  1527. # # 用新表头作为字段名
  1528. # writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
  1529. # # 首次写入表头
  1530. # if not file_exists:
  1531. # writer.writeheader()
  1532. # logger.info(f" 已创建CSV文件并写入新表头:{CSV_FILE_PATH}")
  1533. # # 写入数据行
  1534. # writer.writerows(data_list)
  1535. # logger.info(f" 成功将 {len(data_list)} 条数据写入CSV")
  1536. # return True
  1537. # except Exception as e:
  1538. # logger.error(f" 保存CSV失败:{str(e)}")
  1539. # return False
  1540. # ==================== 主函数(登录+批量搜索) ====================
def main():
    """Entry point: launch a fingerprint-hardened Chrome via Playwright, reuse or
    refresh login cookies, then run the search → collect pipeline for every
    keyword in SEARCH_KEYWORDS.

    Side effects: opens a visible browser window, reads/writes the cookie file,
    writes scraped rows to MySQL (inside collect_data), and logs progress.
    Returns None; all failures are logged rather than raised (the outer
    try/except swallows everything, and `finally` always closes the browser).
    """
    logger.info("\n" + "="*50)
    logger.info("🚀 药帮忙采集程序启动")
    logger.info(f"⏰ 启动时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info("="*50)
    # Keywords to search come from the module-level SEARCH_KEYWORDS list.
    # all_collect_data = []
    with sync_playwright() as p:
        # browser = init_browser_with_proxy(p)
        # Launch a real Chrome (not headless) with anti-automation flags so the
        # site's bot detection sees an ordinary interactive browser.
        browser = p.chromium.launch(
            headless=False,                     # headless mode is easily fingerprinted; keep a real window
            channel="chrome",                   # use the installed Chrome binary, not bundled Chromium
            slow_mo=random.randint(100, 300),   # global per-action delay to mimic human pacing
            args=[
                "--disable-blink-features=AutomationControlled",  # hide the webdriver flag (key anti-detection switch)
                "--enable-automation=false",    # suppress the automation banner/flag
                "--disable-infobars",           # hide info bars
                "--remote-debugging-port=0",    # random debug port instead of a fixed one
                "--start-maximized",            # maximized window, like a real user
                "--disable-extensions",         # no extensions → fewer fingerprint signals
                "--disable-plugins-discovery",  # disable plugin discovery
                "--no-sandbox",                 # avoid sandbox-mode fingerprint
                "--disable-dev-shm-usage",      # avoid /dev/shm size issues
                f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36"  # randomized Chrome-version UA
            ]
        )
        # Build a context whose locale/timezone/geolocation all agree with a
        # mainland-China user profile (Shanghai), further masking automation.
        # NOTE(review): the context user_agent (Chrome/120 fixed) can disagree
        # with the randomized launch-arg UA above — confirm which one the site
        # actually sees and whether the mismatch matters.
        context = browser.new_context(
            locale="zh-CN",                     # Chinese locale
            timezone_id="Asia/Shanghai",        # Shanghai timezone
            geolocation={"latitude": 31.230416, "longitude": 121.473701},  # simulated Shanghai coordinates
            permissions=["geolocation"],        # pre-grant geolocation, as a real user would
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1800, "height": 1000},
            java_script_enabled=True,
            bypass_csp=True,
            # user_data_dir="./temp_user_data"  # (persistent-context option, unused here)
        )
        page = context.new_page()
        # Injected before every document load: removes navigator.webdriver and
        # fakes plugin/mimeType/chrome objects (core anti-detection script).
        # The script body is a runtime string and is kept verbatim.
        page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
            Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
            window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
            delete window.navigator.languages;
            window.navigator.languages = ['zh-CN', 'zh'];
            // 新增:模拟真实鼠标移动特征
            (() => {
                const originalAddEventListener = EventTarget.prototype.addEventListener;
                EventTarget.prototype.addEventListener = function(type, listener) {
                    if (type === 'mousemove') {
                        return originalAddEventListener.call(this, type, (e) => {
                            e._automation = undefined;
                            listener(e);
                        });
                    }
                    return originalAddEventListener.call(this, type, listener);
                };
            })();
        """)
        try:
            # ========== Cookie-reuse login flow ==========
            # 1. Load any previously saved cookies into the context.
            load_cookies(context)
            # 2. Verify the session is still valid.
            if not is_login(page):
                # 3. Cookies missing/expired: perform a fresh login.
                page.goto(TARGET_LOGIN_URL)
                page.wait_for_load_state("networkidle")
                logger.info("🔑 开始执行登录流程")
                login_success = login_operation(page, USERNAME, PASSWORD)
                if not login_success:
                    logger.error(" 登录失败,程序终止")
                    return
                # 4. Persist the new cookies for the next run.
                save_cookies(context)
                logger.info(" 登录并保存Cookie成功!")
            # store_page holds the results-list tab created by the first search
            # and is reused for every subsequent keyword.
            store_page = None
            # nums counts completed searches; 0 means "first search" (different
            # entry page is used for the very first keyword).
            nums = 0
            # Batch search + collect for each keyword.
            for keyword_idx, keyword in enumerate(SEARCH_KEYWORDS, 1):
                logger.info(f"\n=====================================")
                logger.info(f"🔍 开始处理第{keyword_idx}/{len(SEARCH_KEYWORDS)}个关键词:{keyword}")
                logger.info(f"=====================================")
                # Dismiss any popups before searching.
                # NOTE(review): on the first iteration this guard runs twice in
                # a row (here and again inside the nums == 0 branch) — likely
                # redundant; confirm popup_guard is idempotent.
                popup_guard(page, "before_search")
                if nums == 0:
                    popup_guard(store_page if store_page else page, "before_search")
                    # First search starts from the original logged-in page and
                    # opens the results in a new tab (store_page).
                    store_page, search_success = search_operation(page, keyword, is_first_search=True)
                    nums += 1
                else:
                    if store_page is None:
                        logger.error(f"{get_current_time()} ❌ 无可用的搜索页面,跳过「{keyword}」")
                        continue
                    popup_guard(store_page, "before_search")
                    # Later searches reuse the existing results tab.
                    store_page, search_success = search_operation(store_page, keyword, is_first_search=False)
                # NOTE(review): store_page may be None here (search_operation can
                # fail); this call precedes the None check below, so popup_guard
                # must tolerate a None page — confirm.
                popup_guard(store_page, "after_search")
                if store_page is None:
                    break
                if not search_success:
                    logger.warning(f" 「{keyword}」搜索失败,跳过采集")
                    continue
                # Let the results page settle before scraping (networkidle can
                # stall on some pages, hence the extra domcontentloaded wait).
                store_page.wait_for_load_state("domcontentloaded")
                store_page.wait_for_load_state('networkidle')
                # Collect data for this keyword (rows are inserted into MySQL
                # inside collect_data; the returned list is currently unused).
                data_list = collect_data(store_page, keyword)
                # # CSV export (disabled):
                # if data_list:
                #     save_to_csv(data_list)
                # else:
                #     logger.warning(f" 「{keyword}」无数据,跳过保存")
            logger.info("\n🎉 所有关键词处理完成!CSV文件路径:" + os.path.abspath(CSV_FILE_PATH))
            # input("\n按回车关闭程序...")
        except Exception as e:
            # Top-level boundary: log and fall through to cleanup.
            logger.error(f" 程序异常:{str(e)}")
        finally:
            browser.close()
            logger.info(" 浏览器已关闭,程序结束")
  1669. # ==================== 程序入口 ====================
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()