main.py 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. from logger_config import logger
  3. from datetime import datetime
  4. import random
  5. import csv
  6. import os
  7. import time
  8. import json
  9. import pymysql
  10. from pymysql.err import OperationalError, ProgrammingError, DataError
  11. from config import *
  12. import re
  13. import uuid
  14. import requests
  15. import base64
  16. from io import BytesIO
  17. from PIL import Image
  18. import traceback
  19. import oss2
  20. # from faker import Faker
# Proxy IP pool configuration
PROXY_POOL_URL = ""  # endpoint that returns one proxy (host:port) as plain text; empty = pool disabled
PROXY_VALIDATION_URL = ""  # URL probed to confirm a proxy actually works
PROXY_TIMEOUT = 10  # proxy validation timeout (seconds)
  25. def get_random_proxy():
  26. """从代理池获取随机代理IP"""
  27. try:
  28. response = requests.get(PROXY_POOL_URL, timeout=10)
  29. if response.status_code == 200:
  30. proxy = response.text.strip()
  31. if validate_proxy(proxy):
  32. logger.info(f"获取到有效代理: {proxy}")
  33. return proxy
  34. logger.warning(f"代理无效: {proxy}")
  35. except Exception as e:
  36. logger.error(f"获取代理失败: {str(e)}")
  37. return None
  38. def validate_proxy(proxy):
  39. """验证代理IP有效性"""
  40. try:
  41. proxies = {
  42. "http": f"http://{proxy}",
  43. "https": f"https://{proxy}"
  44. }
  45. response = requests.get(
  46. PROXY_VALIDATION_URL,
  47. proxies=proxies,
  48. timeout=PROXY_TIMEOUT
  49. )
  50. return response.status_code == 200
  51. except:
  52. return False
def init_browser_with_proxy(playwright):
    """Launch Chromium through a pooled proxy, falling back to the local IP.

    :param playwright: the sync_playwright handle owned by the caller
    :return: the launched Browser instance
    """
    proxy = get_random_proxy()
    proxy_config = None
    if proxy:
        proxy_server, proxy_port = proxy.split(":")
        proxy_config = {
            "server": f"http://{proxy_server}:{proxy_port}",
            # "username": "your_proxy_username",
            # "password": "your_proxy_password"
        }
        logger.info(f"使用代理: {proxy_server}:{proxy_port}")
    else:
        logger.warning("未获取到有效代理,将使用本地IP")
    # Launch the browser (keeps the original anti-bot configuration)
    return playwright.chromium.launch(
        headless=False,  # headed mode
        channel="chrome",  # use the installed Chrome build
        slow_mo=random.randint(100, 300),  # random per-action delay (ms)
        proxy=proxy_config,  # proxy config (None = direct connection)
        args=[
            "--disable-blink-features=AutomationControlled",  # core anti-detection flag
            "--enable-automation=false",
            "--disable-infobars",
            "--remote-debugging-port=0",
            "--start-maximized",
            "--disable-extensions",
            "--disable-plugins-discovery",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            # Randomized Chrome-version user agent
            f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36"
        ]
    )
  86. # ==================== 2. 反爬工具函数 ====================
  87. def random_delay(min_seconds, max_seconds):
  88. """生成随机延迟(核心反爬:避免固定间隔)"""
  89. delay = random.uniform(min_seconds, max_seconds)
  90. time.sleep(delay)
  91. return delay
  92. def simulate_human_typing(page, locator, text):
  93. """模拟真人打字(逐个字符输入,带随机间隔)"""
  94. try:
  95. locator.click()
  96. locator.clear()
  97. for char in text:
  98. locator.type(char, delay=random.uniform(MIN_INPUT_DELAY, MAX_INPUT_DELAY))
  99. random_delay(0.05, 0.1) # 字符间额外小延迟
  100. logger.info(f" 模拟真人输入完成:{text}")
  101. except Exception as e:
  102. logger.error(f"模拟打字失败:{e}")
  103. locator.fill(text) # 兜底:直接填充
  104. def save_cookies(context, cookie_path=COOKIE_FILE_PATH):
  105. """保存Cookie到本地JSON文件"""
  106. try:
  107. cookies = context.cookies()
  108. with open(cookie_path, "w", encoding="utf-8") as f:
  109. json.dump(cookies, f, ensure_ascii=False, indent=2)
  110. logger.info(f"Cookie已保存到:{cookie_path}")
  111. return True
  112. except Exception as e:
  113. logger.error(f" 保存Cookie失败:{e}")
  114. return False
  115. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  116. """从本地JSON文件加载Cookie到浏览器上下文"""
  117. if not os.path.exists(cookie_path):
  118. logger.warning(f" Cookie文件不存在:{cookie_path}")
  119. return False
  120. try:
  121. with open(cookie_path, "r", encoding="utf-8") as f:
  122. cookies = json.load(f)
  123. context.add_cookies(cookies)
  124. logger.info(f"✅ 已从{cookie_path}加载Cookie")
  125. return True
  126. except Exception as e:
  127. logger.error(f" 加载Cookie失败:{e}")
  128. return False
  129. def is_login(page):
  130. """验证是否已登录(核心:检测登录态)"""
  131. try:
  132. # 访问需要登录的页面
  133. page.goto(LOGIN_VALIDATE_URL, timeout=ELEMENT_TIMEOUT)
  134. page.wait_for_load_state("networkidle")
  135. # 检测是否跳转到登录页(URL包含login则未登录)
  136. if "login" in page.url.lower():
  137. logger.warning(" Cookie失效,需要重新登录")
  138. return False
  139. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  140. # if page.locator("用户中心选择器").count() > 0:
  141. # return True
  142. logger.info(" Cookie有效,已保持登录状态")
  143. return True
  144. except Exception as e:
  145. logger.error(f" 验证登录状态失败:{e}")
  146. return False
# ==================== Scrolling helper (reworked) ====================
def slow_scroll_400px(page, scroll_distance1=400):
    """
    Scroll slowly by ~scroll_distance1 ± SCROLL_OFFSET_RANGE px (human-like).

    :param page: Playwright page object
    :param scroll_distance1: base scroll distance in px (default 400)
    :return: True when the scroll completed, False on any error
    """
    try:
        # Randomize the target distance around the base value
        scroll_distance = random.randint(
            scroll_distance1 - SCROLL_OFFSET_RANGE,
            scroll_distance1 + SCROLL_OFFSET_RANGE
        )
        remaining_distance = scroll_distance
        total_steps = int(scroll_distance / SCROLL_STEP)
        logger.info(
            f"📜 开始慢速滚动(目标距离:{scroll_distance}px,总步数:{total_steps},总时长约{total_steps*SCROLL_INTERVAL:.2f}秒)"
        )
        # Progressive scroll: SCROLL_STEP px per step, SCROLL_INTERVAL s apart
        for _ in range(total_steps):
            step = min(SCROLL_STEP, remaining_distance)
            page.evaluate(f"window.scrollBy(0, {step});")
            remaining_distance -= step
            time.sleep(SCROLL_INTERVAL)
        # Flush any leftover distance smaller than one full step
        if remaining_distance > 0:
            page.evaluate(f"window.scrollBy(0, {remaining_distance});")
            time.sleep(SCROLL_INTERVAL)
        # Wait for lazy-loaded content triggered by the scroll
        page.wait_for_load_state("networkidle", timeout=8000)
        random_delay(2.0, 3.0)  # extra pause after scrolling, human-like
        logger.info(f" 慢速滚动完成,实际滚动距离:{scroll_distance - remaining_distance}px")
        return True
    except Exception as e:
        logger.warning(f" 慢速滚动失败:{e}")
        return False
  183. # def check_anti_crawl(page):
  184. # """检测反爬弹窗/验证码(核心:提前识别反爬)"""
  185. # anti_crawl_selectors = [
  186. # "//div[contains(text(), '验证')]",
  187. # "//div[contains(text(), '人机验证')]",
  188. # "//div[contains(text(), '访问过于频繁')]",
  189. # "//button[contains(text(), '验证')]"
  190. # ]
  191. # for selector in anti_crawl_selectors:
  192. # if page.locator(selector).count() > 0:
  193. # logger.error("❌ 检测到反爬验证弹窗!请手动完成验证后按回车继续...")
  194. # input() # 暂停等待手动验证
  195. # return True
  196. # return False
# CSV output configuration
CSV_FILE_PATH = f"yjj_collect_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"  # timestamped CSV path
# Column headers — the Chinese labels are what downstream consumers expect
CSV_HEADERS = [
    "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
    "店铺名称", "公司名称",
    "有效日期", "生产日期", "批准文号", "采集时间"
]
  204. # ==================== 登录函数 ====================
  205. def login_operation(page, username, password):
  206. """登录操作函数"""
  207. try:
  208. # 输入手机号(直接用单个变量)
  209. page.wait_for_selector(USERNAME_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  210. page.wait_for_timeout(timeout=3000)
  211. page.fill(USERNAME_SELECTOR, username)
  212. logger.info(" 已输入登录账号")
  213. # 输入密码
  214. page.wait_for_selector(PASSWORD_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  215. page.wait_for_timeout(timeout=3000)
  216. page.fill(PASSWORD_SELECTOR, password)
  217. logger.info(" 已输入登录密码")
  218. # 点击登录按钮
  219. page.wait_for_selector(LOGIN_BTN_SELECTOR, timeout=ELEMENT_TIMEOUT)
  220. page.wait_for_timeout(timeout=3000)
  221. page.click(LOGIN_BTN_SELECTOR)
  222. logger.info(" 已点击登录按钮")
  223. page.wait_for_timeout(LOGIN_AFTER_CLICK)
  224. return True
  225. except PlaywrightTimeoutError as e:
  226. logger.error(f" 登录失败:元素定位超时 - {str(e)}")
  227. return False
  228. except Exception as e:
  229. logger.error(f" 登录异常:{str(e)}")
  230. return False
def kill_masks(page):
    """
    Force-remove leftover overlay/mask layers and restore body scroll/clicks.

    Runs entirely inside the page via evaluate(): removes known Element-UI
    mask elements, neutralizes near-fullscreen high-z-index layers, then
    restores html/body overflow and removes the popup lock class.
    """
    # NOTE(review): the JS computes `looksLikeMask` but only checks
    # `nearFullScreen` — confirm whether the stricter check was intended.
    page.evaluate(r"""
    () => {
      const removed = [];
      const hidden = [];
      // 1) 先处理已知常见遮罩
      const knownSelectors = [
        '.v-modal',
        '.el-overlay',
        '.el-overlay-dialog',
        '.el-dialog__wrapper',
        '.el-message-box__wrapper',
        '.el-loading-mask',
        '.el-popup-parent--hidden'
      ];
      for (const sel of knownSelectors) {
        document.querySelectorAll(sel).forEach(el => {
          // v-modal / overlay 直接 remove 最省事
          removed.push(sel);
          el.remove();
        });
      }
      // 2) 再做一次“泛化兜底”:全屏 fixed/absolute + 高 z-index 的覆盖层
      // 注意:不要误删页面正常的固定导航,所以加上“近似全屏”的判断
      const all = Array.from(document.querySelectorAll('body *'));
      for (const el of all) {
        const s = window.getComputedStyle(el);
        if (!s) continue;
        const z = parseInt(s.zIndex || '0', 10);
        const pos = s.position;
        const pe = s.pointerEvents;
        if ((pos === 'fixed' || pos === 'absolute') && z >= 1000 && pe !== 'none') {
          const r = el.getBoundingClientRect();
          const nearFullScreen =
            r.width >= window.innerWidth * 0.8 &&
            r.height >= window.innerHeight * 0.8 &&
            r.left <= window.innerWidth * 0.1 &&
            r.top <= window.innerHeight * 0.1;
          // 常见遮罩是半透明背景色,或者透明但拦截点击
          const bg = s.backgroundColor || '';
          const looksLikeMask =
            nearFullScreen && (bg.includes('rgba') || bg.includes('rgb') || s.opacity !== '1');
          if (nearFullScreen) {
            // 不管透明不透明,只要近似全屏且高 z-index,就先让它不拦截点击
            el.style.pointerEvents = 'none';
            el.style.display = 'none';
            hidden.push(el.tagName + '.' + (el.className || ''));
          }
        }
      }
      // 3) 恢复 body / html 的滚动与交互(很多弹窗会锁滚动)
      document.documentElement.style.overflow = 'auto';
      document.body.style.overflow = 'auto';
      document.body.style.position = 'static';
      document.body.style.width = 'auto';
      document.body.style.paddingRight = '0px';
      // 4) 去掉 Element-UI 常见的锁定 class
      document.body.classList.remove('el-popup-parent--hidden');
      return { removed, hiddenCount: hidden.length, hidden };
    }
    """)
def force_close_popup(page):
    """Close onboarding guides/masks (next / done / got-it / close buttons),
    then strip any remaining overlay layers as a fallback.

    Best-effort: every failure is swallowed so collection can continue.
    """
    try:
        # 1) Repeatedly click "下一步/完成/我知道了/关闭"-style buttons
        for _ in range(5):  # up to 5 clicks — enough for multi-step guides
            btn = page.locator(
                "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
            ).first
            if btn.count() > 0 and btn.is_visible():
                btn.click(timeout=1500)
                page.wait_for_timeout(300)
                continue
            # Some guides expose a top-right X close icon instead
            close_icon = page.locator(
                "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
            ).first
            if close_icon.count() > 0 and close_icon.is_visible():
                close_icon.click(timeout=1000)
                page.wait_for_timeout(300)
                continue
            break
        # 2) Fallback: remove common overlay layers (Element-UI / generic mask)
        page.evaluate("""
        const selectors = [
            '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
            '[class*="mask"]', '[class*="overlay"]', '[style*="z-index"]'
        ];
        for (const sel of selectors) {
            document.querySelectorAll(sel).forEach(el => {
                const s = window.getComputedStyle(el);
                // 只移除“覆盖层”倾向的元素:fixed/absolute 且 z-index 很高
                if ((s.position === 'fixed' || s.position === 'absolute') && parseInt(s.zIndex || '0', 10) >= 1000) {
                    el.remove();
                }
            });
        }
        """)
    except Exception:
        # Deliberate best-effort: popup cleanup must never break the crawl
        pass
# Usage (same as plan 1): run after searching, before collecting
# force_close_popup(page)
  336. def pick_search_input(page):
  337. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  338. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  339. cnt = inputs.count()
  340. # 优先检查前两个(你说只有两个)
  341. for i in range(min(cnt, 2)):
  342. candidate = inputs.nth(i)
  343. try:
  344. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  345. if candidate.is_enabled():
  346. return candidate
  347. except PlaywrightTimeoutError:
  348. continue
  349. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  350. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  351. candidate.wait_for(state="visible", timeout=ELEMENT_TIMEOUT)
  352. return candidate
  353. def type_slow(locator, text: str, min_delay=0.06, max_delay=0.18):
  354. """逐字输入,模拟真人打字"""
  355. for ch in text:
  356. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
# ==================== Search operation ====================
def search_operation(page, keyword):
    """Fill the search box with *keyword* and submit the search.

    :param page: Playwright page on the listing site
    :param keyword: search term to type
    :return: True when the search was triggered, False on timeout/error
    """
    try:
        # 1) Pick a usable search input (first one, else the second)
        search_locator = pick_search_input(page)
        search_locator.wait_for(timeout=ELEMENT_TIMEOUT)
        # 2) Clear the box — double safety: fill("") then select-all + delete
        search_locator.click()  # focus first
        search_locator.fill("")
        page.keyboard.down("Control")
        page.keyboard.press("a")
        page.keyboard.up("Control")
        page.keyboard.press("Backspace")
        # 3) Type the keyword character by character
        type_slow(search_locator, keyword, min_delay=0.06, max_delay=0.18)
        logger.info(f"📝 已输入搜索关键词:{keyword}")
        # 4) Click the *visible* search button only
        btn = page.locator(f"{SEARCH_BTN_SELECTOR}:visible").first
        btn.wait_for(state="visible", timeout=SEARCH_BTN_TIMEOUT)
        btn.click()
        page.wait_for_timeout(600)
        try:
            page.wait_for_load_state("networkidle", timeout=10000)
        except Exception:
            pass  # networkidle may never fire on chatty pages; proceed anyway
        # 5) Kill any post-search guides/overlays before collecting
        force_close_popup(page)
        kill_masks(page)
        logger.info("✅ 已触发搜索")
        return True
    except PlaywrightTimeoutError as e:
        logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
        return False
    except Exception as e:
        logger.error(f" 搜索异常:{str(e)}")
        return False
  399. #翻下一页
  400. def goto_next_page(page) -> bool:
  401. """
  402. 尝试翻到下一页;成功返回True,没下一页/翻页失败返回False
  403. 适配常见 ElementUI: .el-pagination .btn-next / .el-pagination__next
  404. """
  405. # 多写几个候选,哪个能用就用哪个
  406. candidates = [
  407. ".el-pagination button.btn-next:not(.is-disabled)",
  408. ".el-pagination__next:not(.is-disabled)",
  409. "button:has-text('下一页'):not([disabled])",
  410. "a:has-text('下一页')",
  411. ]
  412. next_btn = None
  413. for sel in candidates:
  414. loc = page.locator(sel).first
  415. if loc.count() > 0:
  416. next_btn = loc
  417. break
  418. if not next_btn:
  419. return False
  420. # 用“当前页第一个商品标题”做翻页完成的判据(比只等networkidle更稳)
  421. first_title = page.locator(PRODUCT_TITLE_SELECTOR).first
  422. before = ""
  423. try:
  424. if first_title.count() > 0:
  425. before = first_title.inner_text(timeout=2000).strip()
  426. except:
  427. pass
  428. try:
  429. page.evaluate("window.scrollTo(0, 0);")
  430. next_btn.click(timeout=5000)
  431. page.wait_for_load_state("networkidle")
  432. # 等列表发生变化(标题变了 / 或者至少第一个标题重新出现)
  433. if before:
  434. page.wait_for_function(
  435. """(sel, oldText) => {
  436. const el = document.querySelector(sel);
  437. return el && el.innerText && el.innerText.trim() !== oldText;
  438. }""",
  439. arg=(PRODUCT_TITLE_SELECTOR, before),
  440. timeout=5000
  441. )
  442. else:
  443. first_title.wait_for(timeout=1000)
  444. return True
  445. except Exception as e:
  446. logger.warning(f" 翻页失败:{e}")
  447. return False
  448. def popup_guard(page, tag=""):
  449. """
  450. 全局弹窗/遮罩守卫:多步引导 + 关闭按钮 + 遮罩清理 + 恢复滚动
  451. tag 仅用于日志区分调用位置
  452. """
  453. try:
  454. # 给弹窗一点出现时间
  455. page.wait_for_timeout(300)
  456. # 1) 连续点“下一步/完成/我知道了/关闭”
  457. for _ in range(6):
  458. btn = page.locator(
  459. "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  460. ).first
  461. if btn.count() > 0 and btn.is_visible():
  462. btn.click(timeout=1500)
  463. page.wait_for_timeout(250)
  464. continue
  465. # 2) 常见的 close icon
  466. close_btn = page.locator(
  467. "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
  468. ).first
  469. if close_btn.count() > 0 and close_btn.is_visible():
  470. close_btn.click(timeout=1200)
  471. page.wait_for_timeout(250)
  472. continue
  473. break
  474. # 3) 清遮罩 + 恢复滚动/交互
  475. page.evaluate(r"""
  476. () => {
  477. // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
  478. const selectors = [
  479. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  480. '.el-message-box__wrapper', '.el-loading-mask'
  481. ];
  482. selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
  483. // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
  484. const all = Array.from(document.querySelectorAll('body *'));
  485. for (const el of all) {
  486. const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
  487. const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
  488. // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
  489. if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
  490. const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
  491. // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
  492. const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
  493. if (nearFull) {
  494. el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
  495. el.style.display = 'none'; // 隐藏元素
  496. }
  497. }
  498. }
  499. // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
  500. document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
  501. document.body.style.overflow = 'auto'; // body标签恢复滚动
  502. document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
  503. }
  504. """)
  505. logger.info("杀除弹窗成功")
  506. except Exception:
  507. pass
  508. def open_detail_page(list_page, item, keyword, idx, *, timeout=15000):
  509. """
  510. 点击商品进入详情页,兼容:
  511. 1) 新开 tab(返回 detail_page != list_page, opened_new_tab=True)
  512. 2) 同 tab 跳转(detail_page == list_page, opened_new_tab=False)
  513. """
  514. ctx = list_page.context
  515. list_url = list_page.url
  516. detail_page = None
  517. opened_new_tab = False
  518. try:
  519. # 期望新开 tab(很多站点会这样)
  520. with ctx.expect_page(timeout=timeout) as p:
  521. item.click(delay=random.uniform(0.1, 0.3))
  522. detail_page = p.value
  523. opened_new_tab = True
  524. logger.info(f" 「{keyword}」第{idx}个商品 - 新开标签页进入详情")
  525. except PlaywrightTimeoutError:
  526. # 兜底:没新开 tab,大概率是同页跳转/弹层
  527. detail_page = list_page
  528. opened_new_tab = False
  529. logger.info(f" 「{keyword}」第{idx}个商品 - 未新开标签页,按同页进入详情处理")
  530. return detail_page, opened_new_tab, list_url
def return_to_list(list_page, detail_page, opened_new_tab, list_url, keyword, idx):
    """
    Return from the detail view to the listing page.

    - new tab:  close the detail tab, then bring_to_front the list page
    - same tab: go_back toward list_url; if the URL never changed (layer
      popup), press ESC to dismiss the layer instead

    :param list_page: the original listing page
    :param detail_page: page returned by open_detail_page (may equal list_page)
    :param opened_new_tab: True when the detail opened in its own tab
    :param list_url: listing URL recorded before the detail click
    :param keyword/idx: log context only
    """
    # If the browser/page is already gone, bail out to avoid secondary errors
    if list_page is None or list_page.is_closed():
        logger.warning(f" 「{keyword}」第{idx}个商品 - 列表页已关闭,无法切回")
        return
    if opened_new_tab:
        # Only close the *new* detail tab — never list_page itself
        try:
            if detail_page and (detail_page is not list_page) and (not detail_page.is_closed()):
                detail_page.close()
                logger.info(f"📌 「{keyword}」第{idx}个商品 - 已关闭详情页标签页")
        except Exception as e:
            logger.warning(f" 「{keyword}」第{idx}个商品 - 关闭详情页失败:{e}")
        # Switch focus back to the listing page
        try:
            list_page.bring_to_front()
            list_page.mouse.move(random.randint(100, 300), random.randint(200, 400))
            random_delay(0.3, 0.8)
            list_page.wait_for_load_state("networkidle")
            logger.info(f" 「{keyword}」第{idx}个商品 - 已切回列表页(新tab模式)")
        except Exception as e:
            logger.warning(f" 「{keyword}」第{idx}个商品 - 切回列表页失败:{e}")
        return
    # Same tab: detail_page == list_page
    try:
        # 1) URL changed → a real navigation happened → go_back until home
        if list_page.url != list_url:
            for _ in range(3):  # at most 3 steps back, avoids endless loops
                list_page.go_back(timeout=15000)
                list_page.wait_for_load_state("domcontentloaded", timeout=15000)
                random_delay(0.2, 0.5)
                if list_page.url == list_url:
                    break
            logger.info(f" 「{keyword}」第{idx}个商品 - 已返回列表页(同tab跳转模式)")
        else:
            # 2) URL unchanged: probably a layered detail → try ESC to close it
            list_page.keyboard.press("Escape")
            random_delay(0.2, 0.5)
            logger.info(f" 「{keyword}」第{idx}个商品 - 已尝试关闭弹层并留在列表页(同tab弹层模式)")
        list_page.bring_to_front()
        list_page.wait_for_load_state("networkidle")
    except Exception as e:
        logger.warning(f" 「{keyword}」第{idx}个商品 - 同tab返回列表页失败:{e}")
  579. #判断店名是否已经在数据库
  580. def shop_is_exists_database(shop):
  581. try:
  582. conn = pymysql.connect(**MYSQL_CONFIG)
  583. cursor = conn.cursor(pymysql.cursors.DictCursor) # 改为字典游标
  584. query_sql = """
  585. SELECT province, city, business_license_company, qualification_number FROM yjj_shop_info_middle
  586. WHERE shop = %s
  587. """
  588. cursor.execute(query_sql, (shop,))
  589. result = cursor.fetchone()
  590. # 正确的调试方式(替代cursor._last_executed)
  591. print(f"【调试】传入的店铺名:{repr(shop)}") # repr能显示空格/隐藏字符
  592. print(f"【调试】查询参数:{shop}")
  593. print(f"【调试】查询结果:{result} → 函数返回:{bool(result)}")
  594. is_exists = bool(result)
  595. if is_exists:
  596. logger.info(f"【店铺存在校验】店铺已存在 | 店铺名:{repr(shop)} | 结果:存在(True)不要执行采集店铺")
  597. else:
  598. logger.info(f"【店铺存在校验】店铺不存在 | 店铺名:{repr(shop)} | 结果:不存在(False)")
  599. return is_exists, result
  600. except Exception as e:
  601. logger.error(f"查询店铺失败:{e}")
  602. return False, None # 异常时明确返回False,避免返回None
  603. finally:
  604. # 修复:关闭游标和连接,避免泄露
  605. if cursor:
  606. cursor.close()
  607. if conn:
  608. conn.close()
def insert_shop_info_to_db(shop, contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform, province, city, create_time, update_time):
    """
    Upsert one shop record into yjj_shop_info_middle.

    Uses INSERT ... ON DUPLICATE KEY UPDATE, so an existing row (by the
    table's unique key) gets its fields refreshed instead of duplicated.

    :param shop..update_time: column values (empty strings are allowed)
    :return: True on success, False on any failure (transaction rolled back)
    """
    # 1. Initialize connection and cursor handles up front so `finally`
    #    can safely test them even when connect() fails.
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # 2. Parameterized INSERT (guards against SQL injection).
        #    Verify the column list matches the actual table schema.
        #    (The `#` fragments inside the SQL are MySQL line comments.)
        sql = """
        INSERT INTO yjj_shop_info_middle (
            shop,
            contact_address,
            qualification_number,
            business_license_company,
            business_license_address,
            scrape_date,
            platform,
            province,
            city,
            create_time,
            update_time
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            contact_address = VALUES(contact_address), # 重复时更新联系地址
            qualification_number = VALUES(qualification_number), # 更新社会信用代码
            business_license_company = VALUES(business_license_company), # 更新公司名
            business_license_address = VALUES(business_license_address), # 更新地址
            scrape_date = VALUES(scrape_date),
            platform = VALUES(platform),
            province = VALUES(province),
            city = VALUES(city),
            update_time = VALUES(update_time) # 重复时更新update_time
        """
        # 3. Parameter tuple — order must match the %s placeholders exactly
        params = (
            shop,  # shop name
            contact_address,  # contact address
            qualification_number,  # unified social credit code
            business_license_company,  # company name on the business license
            business_license_address,  # address on the business license
            scrape_date,  # scrape date
            platform,  # platform name
            province,  # province
            city,  # city
            create_time,  # record creation timestamp
            update_time  # record update timestamp
        )
        # 4. Execute and commit
        cursor.execute(sql, params)
        conn.commit()
        print(f"✅ 数据插入成功!店铺:{shop} | 公司:{business_license_company}")
        return True
    except pymysql.MySQLError as e:
        # Database-level errors (connection, SQL syntax, schema mismatch, ...)
        print(f"MySQL插入失败:{e}")
        print(f"详细异常信息:{traceback.format_exc()}")  # full stack for debugging
        if conn:
            conn.rollback()  # undo the failed insert
        return False
    except Exception as e:
        # Any other unexpected error
        print(f"插入数据时发生未知错误:{e}")
        print(f"详细异常信息:{traceback.format_exc()}")
        if conn:
            conn.rollback()
        return False
    finally:
        # 5. Always release cursor and connection
        if cursor:
            cursor.close()
        if conn:
            conn.close()
def insert_single_to_mysql(single_data):
    """
    Insert one product record into yjj_drug_middle.

    :param single_data: dict keyed by column name (product, my_good_price, ...)
                        — all 26 keys referenced below must be present
    :return: True when the row was committed, False on any failure
    """
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        # (Table creation is assumed done elsewhere; see CREATE_TABLE_SQL.)
        insert_sql = """
        INSERT INTO yjj_drug_middle (
            product, my_good_price, min_price, manufacture_date, expiry_date,
            shop, business_license_company, province, city, manufacturer,
            specification, approval_number, product_link, scrape_date,
            scrape_province, availability, credit_code, platform, search_key,
            number, is_sold_out, sales, inventory, snapshot_url, update_time, create_time
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        # Values in strict placeholder order
        values = (
            single_data["product"],
            single_data["my_good_price"],
            single_data["min_price"],
            single_data["manufacture_date"],
            single_data["expiry_date"],
            single_data["shop"],
            single_data["business_license_company"],
            single_data["province"],
            single_data["city"],
            single_data["manufacturer"],
            single_data["specification"],
            single_data["approval_number"],
            single_data["product_link"],
            single_data["scrape_date"],
            single_data["scrape_province"],
            single_data["availability"],
            single_data["credit_code"],
            single_data["platform"],
            single_data["search_key"],
            single_data["number"],
            single_data["is_sold_out"],
            single_data["sales"],
            single_data["inventory"],
            single_data["snapshot_url"],
            single_data["update_time"],
            single_data["create_time"]
        )
        cursor.execute(insert_sql, values)
        conn.commit()
        logger.info(f" 单条数据插入成功:...")  # NOTE(review): message never includes the title despite the old comment
        return True
    except OperationalError as e:
        # Connection-level failure
        logger.error(f" MySQL连接失败:{str(e)}")
        if conn:
            conn.rollback()
        return False
    except ProgrammingError as e:
        # SQL syntax / schema mismatch
        logger.error(f" SQL语法错误:{str(e)}")
        if conn:
            conn.rollback()
        return False
    except Exception as e:
        # Anything else (data conversion, missing dict key, ...)
        logger.error(f" 单条数据插入失败:{str(e)}")
        if conn:
            conn.rollback()
        return False
    finally:
        # Always release cursor and connection
        if cursor:
            cursor.close()
        if conn:
            conn.close()
  764. def check_dup_in_biz_db(product_link, discount_price_val, scrape_date):
  765. """直接查询业务表是否存在该商品链接+价格"""
  766. conn = None
  767. cursor = None
  768. log_context = (
  769. f"【去重校验】商品链接:{product_link.strip()} | 价格:{discount_price_val} "
  770. f"采集日期:{scrape_date.strip()}"
  771. )
  772. try:
  773. conn = pymysql.connect(**MYSQL_CONFIG)
  774. cursor = conn.cursor()
  775. sql = """
  776. SELECT * FROM yjj_drug_middle
  777. WHERE product_link = %s AND min_price = %s AND scrape_date=%s
  778. """
  779. # 先执行查询
  780. cursor.execute(sql, (product_link.strip(), discount_price_val, scrape_date.strip()))
  781. # 再判断是否有结果
  782. # 如果 fetchone() 返回元组(比如(1,))→ (1,) is not None → 结果为 True;
  783. # 如果 fetchone() 返回 None → None is not None → 结果为 False。
  784. is_dup = cursor.fetchone() is not None
  785. if is_dup:
  786. logger.warning(f"{log_context} - 表中已存在重复记录,跳过本次采集")
  787. else:
  788. logger.info(f"{log_context} - 表中无重复记录,正常采集")
  789. return is_dup
  790. except Exception as e:
  791. logger.error(f"查询业务表去重失败:{str(e)}")
  792. return False
  793. finally:
  794. if cursor:
  795. cursor.close()
  796. if conn:
  797. conn.close()
  798. # 压缩图片函数
  799. def compress_image(image_data, max_size=4*1024*1024): # 4MB上限
  800. try:
  801. img = Image.open(BytesIO(image_data))
  802. # 将RGBA模式转为RGB(兼容JPEG)
  803. if img.mode in ('RGBA', 'P'): # P是PNG的调色板模式,也需转换
  804. # 新建白色背景的RGB图片,把透明图贴上去(避免透明区域变黑)
  805. bg_img = Image.new('RGB', img.size, (255, 255, 255))
  806. bg_img.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
  807. img = bg_img
  808. # 缩小分辨率(按比例缩到宽≤1000px)
  809. if img.width > 1000:
  810. ratio = 1000 / img.width
  811. new_size = (int(img.width*ratio), int(img.height*ratio))
  812. img = img.resize(new_size, Image.Resampling.LANCZOS)
  813. # 降低质量(JPG)/压缩(PNG)
  814. output = BytesIO()
  815. img.save(output, format='JPEG', quality=80) # quality越小体积越小
  816. compressed_data = output.getvalue()
  817. # 若仍超限,继续降质量
  818. if len(compressed_data) > max_size:
  819. img.save(output, format='JPEG', quality=60)
  820. compressed_data = output.getvalue()
  821. return compressed_data
  822. except Exception as e:
  823. logger.debug(f"图片压缩失败:{e}")
  824. return image_data # 压缩失败返回原始数据
  825. def download_image_to_base64(image_url, save_dir = "./download_images"):
  826. """下载网络图片,返回图片二进制数据(BytesIO)"""
  827. try:
  828. if not os.path.exists(save_dir):
  829. os.makedirs(save_dir) # 创建多级目录(比如a/b/c)
  830. print(f"创建本地保存目录:{save_dir}")
  831. except Exception as e:
  832. print(f"创建保存目录失败:{str(e)}")
  833. return None
  834. try:
  835. # 模拟浏览器请求头,避免被服务器拦截
  836. headers = {
  837. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  838. }
  839. response = requests.get(image_url, headers=headers, timeout=15)
  840. response.raise_for_status()
  841. compressed_data = compress_image(response.content)
  842. image_base64 = base64.b64encode(compressed_data).decode("utf-8")
  843. image_data = compressed_data
  844. # 步骤3:提取图片文件名(从URL中截取,避免重复)
  845. # 示例URL:https://xxx.com/123.jpg → 文件名:123.jpg
  846. file_name = image_url.split("/")[-1]
  847. # 处理特殊字符(避免文件名非法)
  848. file_name = file_name.replace("?", "").replace("&", "").replace("=", "")
  849. save_path = os.path.join(save_dir, file_name) # 完整保存路径
  850. # 步骤4:保存图片到本地
  851. with open(save_path, "wb") as f:
  852. f.write(image_data)
  853. print(f"图片已保存到本地:{save_path}")
  854. return image_base64
  855. except requests.exceptions.Timeout:
  856. print(f"下载图片超时:{image_url}")
  857. return None
  858. except requests.exceptions.HTTPError as e:
  859. print(f"图片URL无效(状态码:{response.status_code}):{image_url}")
  860. return None
  861. except Exception as e:
  862. print(f"下载图片失败:{str(e)}")
  863. return None
  864. def get_ocr_res(img):
  865. try:
  866. #img地址
  867. print(f'开始识别图片:{img}')
  868. request_url = request_url_config
  869. img_base64 = download_image_to_base64(img)
  870. if not img_base64:
  871. print("图片下载/转Base64失败,终止OCR识别")
  872. return None
  873. # 获取access_token
  874. access_token = get_access_token()
  875. if not access_token:
  876. print("获取access_token失败,无法调用OCR接口")
  877. return None
  878. params = {"image": img_base64}
  879. request_url = request_url + "?access_token=" + access_token
  880. headers = {'content-type': 'application/x-www-form-urlencoded'}
  881. response = requests.post(request_url, data=params, headers=headers)
  882. if response:
  883. res = response.json()
  884. # 检查OCR返回是否有错误
  885. if "error_code" in res:
  886. print(f"百度OCR接口错误:{res['error_msg']}(错误码:{res['error_code']})")
  887. return None
  888. # 解析识别结果
  889. new_dic = dict()
  890. for ite in res['words_result'].keys():
  891. new_dic[ite] = res['words_result'][ite]['words']
  892. print('资质数据信息', new_dic)
  893. return new_dic
  894. else:
  895. print("OCR接口返回空响应")
  896. return None
  897. except requests.exceptions.RequestException as e:
  898. print(f"网络错误(图片下载/OCR请求失败):{str(e)}")
  899. return None
  900. except KeyError as e:
  901. print(f"OCR响应格式异常,缺失字段:{str(e)}")
  902. return None
  903. except Exception as e:
  904. print(f"OCR识别未知错误:{str(e)}")
  905. return None
  906. def get_access_token():
  907. AppKey = AppKey_config
  908. AppSrcret = AppSecret_config
  909. token_url =token_url_config
  910. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  911. payload = ""
  912. headers = {
  913. 'Content-Type': 'application/json',
  914. 'Accept': 'application/json'
  915. }
  916. try:
  917. response = requests.request("POST", url, headers=headers, data=payload)
  918. response.raise_for_status() # 触发HTTP错误
  919. return response.json()['access_token']
  920. except Exception as e:
  921. print(f"获取access_token失败:{str(e)}")
  922. return None
  923. def extract_province_city(address):
  924. """
  925. 从地址中提取省份和城市
  926. :param address: 营业执照地址(如"福建省福州市马尾区")
  927. :return: (province, city) - 提取到的省份/城市,提取失败返回空字符串
  928. """
  929. if not address: # 地址为空,直接返回空
  930. return "", ""
  931. # 正则1:匹配省份(兼容省/自治区/直辖市/特别行政区)
  932. province_pattern = re.compile(r'([^省]+省|.+自治区|北京市|上海市|天津市|重庆市|.+特别行政区)')
  933. province_match = province_pattern.search(address)
  934. province = province_match.group(1) if province_match else ""
  935. # 正则2:匹配城市(兼容市/自治州/地区/盟,且排除省份已匹配的部分)
  936. # 先去掉已匹配的省份,再匹配城市
  937. address_remain = address.replace(province, "").strip() if province else address.strip()
  938. city_pattern = re.compile(r'([^市]+市|.+自治州|.+地区|.+盟|^[^\d区县镇]+)')
  939. city_match = city_pattern.search(address_remain)
  940. city = city_match.group(1).strip() if city_match else ""
  941. # 兼容直辖市(如"北京市朝阳区"→city=北京市)
  942. if province in ["北京市", "上海市", "天津市", "重庆市"]:
  943. city = province
  944. # 兼容地址不规范的情况(如"福建福州马尾区",无"省"/"市"字)
  945. if not province and not city:
  946. # 匹配前两个地名(如"福建福州"→province=福建,city=福州)
  947. simple_pattern = re.compile(r'^([^\d区县镇]+)')
  948. simple_match = simple_pattern.search(address)
  949. if simple_match:
  950. city = simple_match.group(1).strip() # 只有城市,省份留空
  951. if city and province in city:
  952. city = city.replace(province, "").strip()
  953. return province.strip(), city.strip()
  954. #采集数据核心
  955. def collect_data(page, keyword):
  956. """
  957. 1) 先获取当前页商品个数(count)
  958. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  959. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  960. """
  961. collect_result = []
  962. # seen = set()
  963. # ========== 初始化异常关键词存储 ==========
  964. error_keywords = []
  965. kw = '' # 单个异常关键词变量
  966. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  967. page.wait_for_load_state("networkidle")
  968. #没有找到商品就跳过这个商品
  969. page_no = 1
  970. while True:
  971. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  972. # 记录列表页URL(可用于你后续兜底)
  973. list_page_url = page.url
  974. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  975. # ✅ 先获取当前页商品个数
  976. page.wait_for_load_state("networkidle")
  977. total_limit = page.locator(PRODUCT_ITEM_SELECTOR).count()
  978. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  979. #获取该商品的总个数
  980. total_goods_nums_elem = page.locator("div.sr-page_turner-pagination-total")
  981. if total_goods_nums_elem.count() > 0:
  982. total_goods_nums = total_goods_nums_elem.inner_text().strip()
  983. logger.info(f"📌 「{keyword} 商品个数(count):{total_goods_nums}")
  984. else:
  985. logger.info(f"📌 「{keyword} 商品个数(count):不超过60个")
  986. # 重置当前页的采集计数
  987. collected_count = 0
  988. # ========= 初始化无匹配计数器(记录标题不包含核心关键词的次数) =========
  989. # no_match_count = 0 # 无匹配次数初始化为0
  990. # MAX_NO_MATCH = 10 # 最大无匹配次数阈值
  991. #补充没找到关键词的兜底
  992. not_found_keywords = page.locator("span:has-text('新品登记')")
  993. if not_found_keywords.count() > 0:
  994. logger.warning(f"⚠️ 关键词「{keyword}」无匹配商品,直接跳过整个关键词采集")
  995. return []
  996. for idx in range(total_limit):
  997. detail_page = None
  998. try:
  999. item = page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  1000. collected_count += 1 # 实际采集计数(用于日志)
  1001. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  1002. page.wait_for_load_state("networkidle")
  1003. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1004. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  1005. # 1. 初始化所有字段默认值
  1006. title = "无标题"
  1007. price = "0.00"
  1008. shop = "无店名"
  1009. expiry_date = "无有效期"
  1010. manufacture_date = "无生产日期"
  1011. approval_number = "无批准文号"
  1012. manufacturer = "未知公司"
  1013. # discount_price = "0.00"
  1014. spec = "未知规格"
  1015. num = 1 # ✅ 默认 1
  1016. platform = '药九九'
  1017. current_time = datetime.now().strftime("%Y-%m-%d")
  1018. is_sold_out = 0
  1019. # ========= 售罄不跳过 =========
  1020. sold_locator = item.locator('div.gc-l1-cirle_tip')
  1021. if sold_locator.count() > 0:
  1022. is_sold_out = 1
  1023. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  1024. # if collected_count % 5 == 0 and collected_count > 0:
  1025. # logger.info("采满5个往下滑")
  1026. # slow_scroll_400px(page)
  1027. # page.wait_for_load_state("networkidle")
  1028. # continue
  1029. # 提取商品标题(处理空值)
  1030. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  1031. if product_locator.count() > 0:
  1032. title = product_locator.inner_text(timeout=3000).strip()
  1033. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  1034. else:
  1035. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  1036. #关键词不在标题中,跳过当前商品
  1037. # core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
  1038. # if core_keyword not in title:
  1039. # no_match_count += 1
  1040. # logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含核心关键词「{core_keyword}」(无匹配次数:{no_match_count}/{MAX_NO_MATCH}),跳过本次循环")
  1041. # continue
  1042. # if no_match_count >= MAX_NO_MATCH:
  1043. # logger.error(f"❌ 关键词「{keyword}」无匹配商品次数已达{MAX_NO_MATCH}次,直接终止当前关键词采集,进入下一个关键词")
  1044. # return []
  1045. # 提取价格(带缺失日志)
  1046. price_locator = item.locator(PRODUCT_PRICE_SELECTOR).nth(0)
  1047. if price_locator.count() > 0:
  1048. price = price_locator.inner_text(timeout=3000).strip()
  1049. logger.info(f"{'='*10}{keyword}」第{collected_count}个商品 - 列表页采购价格:{price}{'='*10}")
  1050. else:
  1051. price = "0.00" # 初始化默认值,避免后续报错
  1052. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  1053. # 5. 提取公司名称(带缺失日志)
  1054. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  1055. if manufacturer_locator.count() > 0:
  1056. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  1057. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  1058. else:
  1059. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  1060. #提取店铺名称
  1061. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  1062. if shop_locator.count() > 0:
  1063. shop = shop_locator.inner_text(timeout=3000).strip()
  1064. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  1065. else:
  1066. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  1067. #提取折扣价
  1068. discount_price_val_origin = ""
  1069. discount_price = ""
  1070. discount_price_locator = item.locator('span.gc-l2-discount_price').first
  1071. if discount_price_locator.count() > 0:
  1072. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  1073. discount_price_val_origin = discount_price
  1074. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  1075. discount_price_val = float(match.group()) if match else 0.00
  1076. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  1077. else:
  1078. #如果没有拿原价替换
  1079. price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "") else "0.00"
  1080. discount_price_val = float(price)
  1081. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  1082. merged_price = f"{price}{discount_price_val_origin}" if discount_price_val_origin else price
  1083. store_id = page.locator("")
  1084. product_url = f"https://www.ybm100.com/new/base/skuDetail?id={568576512}&combination=1&type=1"
  1085. # ========= 模拟点击商品进入详情页 =========
  1086. logger.info(
  1087. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  1088. )
  1089. # 点击商品项容器,触发详情展示
  1090. # ========== 点击商品跳详情页 ==========
  1091. # 反爬:模拟真人鼠标移动到商品上再点击(不是直接点击)
  1092. item.hover() # 先悬停
  1093. random_delay(0.2, 0.5) # 悬停后延迟
  1094. item.dispatch_event("mousedown")
  1095. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1096. item.dispatch_event("mouseup")
  1097. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1098. try:
  1099. with page.context.expect_page(timeout=60000) as p:
  1100. item.click(delay=random.uniform(0.1, 0.3))
  1101. detail_page = p.value
  1102. except PlaywrightTimeoutError:
  1103. logger.warning(
  1104. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  1105. )
  1106. detail_page = None # 标记为无新标签页,避免关闭列表页
  1107. # 等待详情加载(优先用新标签页,无则用列表页)
  1108. target_page = detail_page if detail_page else page
  1109. target_page.wait_for_load_state("networkidle", timeout=20000)
  1110. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  1111. logger.info(
  1112. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  1113. )
  1114. # 反爬:检测详情页反爬验证
  1115. # check_anti_crawl(page)
  1116. # ========== 采集详情页的专属信息(有效期/生产日期/批准文号) ==========
  1117. #获取商品详情页链接
  1118. product_link = target_page.url
  1119. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  1120. #如果有需要,还可能要加兜底。
  1121. # ========= ✅ 去重逻辑,拿商品链接和折扣价和有效期和采集日期 =========
  1122. if check_dup_in_biz_db(product_link, discount_price_val, current_time):
  1123. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过")
  1124. # ========== 关闭新标签页,切回列表页 ==========
  1125. if detail_page and not detail_page.is_closed():
  1126. detail_page.close() # 关闭详情页标签
  1127. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1128. # 切回原列表页(第一个标签页)
  1129. page.bring_to_front() # 激活列表页
  1130. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1131. random_delay(0.5, 1.0) # 增加切换后延迟
  1132. page.wait_for_load_state("networkidle")
  1133. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1134. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1135. if collected_count % 5 == 0 and collected_count > 0:
  1136. logger.info("采满5个往下滑")
  1137. slow_scroll_400px(page)
  1138. page.wait_for_load_state("networkidle")
  1139. continue
  1140. # key = f"{product_link.strip()}|{discount_price_val}"
  1141. # if key in seen:
  1142. # logger.warning(
  1143. # f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过"
  1144. # )
  1145. # if collected_count % 5 == 0 and collected_count > 0:
  1146. # logger.info("采满15个往下滑")
  1147. # slow_scroll_400px(page)
  1148. # page.wait_for_load_state("networkidle")
  1149. # continue
  1150. # seen.add(key)
  1151. # 提取有效期(处理空值)
  1152. expiry_date_locator = target_page.locator("//span[contains(text(), '有效期')]/following-sibling::span[contains(@class, 'gdb-desc-value4')]")
  1153. if expiry_date_locator.count() > 0:
  1154. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip().replace('-', '') #.replace('近效期','')
  1155. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  1156. else:
  1157. # 修复:替换未定义的i为collected_count
  1158. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  1159. # 提取生产日期(修复完成)
  1160. manufacture_date_locator = target_page.locator("//span[@class='gdb-desc-label' and text()='生产日期']/following-sibling::span[1]")
  1161. if manufacture_date_locator.count() > 0:
  1162. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip().replace('-', "")
  1163. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  1164. else:
  1165. # 修复:替换未定义的i为collected_count
  1166. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  1167. # 提取批准文号
  1168. approval_number_locator = target_page.locator("//span[contains(text(), '国药准字')]").first
  1169. if approval_number_locator.count() > 0:
  1170. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  1171. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  1172. else:
  1173. # 修复:替换未定义的i为collected_count
  1174. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  1175. #提取规格
  1176. spec_locator = target_page.locator('span.gddd-params_text_line_1[title]')
  1177. if spec_locator.count() > 0:
  1178. spec = spec_locator.nth(2).inner_text(timeout=3000).strip()
  1179. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  1180. else:
  1181. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1182. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  1183. #提取库存
  1184. storage = ''
  1185. storage_locator = target_page.locator("span.gdb-desc-value7")
  1186. if storage_locator.count() > 0:
  1187. storage = storage_locator.inner_text(timeout=3000).strip()
  1188. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页库存:{storage}{'='*10}")
  1189. else:
  1190. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 库存元素数量不足,使用默认值:{storage}")
  1191. #提取销量
  1192. sell = ''
  1193. sell_locator = target_page.locator('.has-join-group span.packUnit-class')
  1194. if sell_locator.count() > 0:
  1195. sell = sell_locator.inner_text(timeout=3000).strip()
  1196. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页销量:{sell}{'='*10}")
  1197. else:
  1198. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 没有销量元素,使用默认值:{sell}")
  1199. #保存快照url上传到oss
  1200. try:
  1201. local_path, oss_url = screenshot_target_page_to_local_then_oss(
  1202. target_page=target_page,
  1203. full_page=True # 截取全屏
  1204. )
  1205. print(f"最终结果:")
  1206. print(f" 本地文件路径:{local_path}")
  1207. logger.info(f" OSS访问链接:{oss_url}")
  1208. except Exception as e:
  1209. logger.warning(f"整体流程执行失败:{str(e)}")
  1210. # input("...")
  1211. # if shop_is_exists_database(shop):
  1212. # continue
  1213. # province = ""
  1214. # city = ""
  1215. # business_license_company = ""
  1216. # qualification_number = ''
  1217. # input('....')
  1218. shop_exists, shop_info = shop_is_exists_database(shop)
  1219. #店铺名不是药品预约中心且店铺名不在数据库就要点击
  1220. if shop != "药品预约中心" and not shop_exists :
  1221. logger.info("店铺名不是药品预约中心且数据库没有该公司的营业执照")
  1222. # 获取营业执照图片 li[data-v-4f79abe8].nth(2)
  1223. # 进入店铺
  1224. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1225. entershop_btn = target_page.locator('[data-v-c5790f48].btn-text')
  1226. # 增强:先等待进入店铺按钮可见
  1227. entershop_btn.wait_for(state="visible", timeout=10000)
  1228. entershop_btn.scroll_into_view_if_needed() # 确保按钮在视口内
  1229. entershop_btn.hover() # 先悬停
  1230. random_delay(0.2, 0.5) # 悬停后延迟
  1231. entershop_btn.click()
  1232. # entershop_btn.dispatch_event("mousedown")
  1233. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1234. # entershop_btn.dispatch_event("mouseup")
  1235. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1236. target_page.wait_for_load_state("domcontentloaded") # 等DOM加载(比networkidle更适合页面内切换)
  1237. #点击店铺资质
  1238. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1239. shop_license_page = target_page.locator('li:has-text("店铺资质")')
  1240. shop_license_page.wait_for(state="visible", timeout=10000) # 等待元素加载完成
  1241. shop_license_page.hover() # 先悬停
  1242. random_delay(0.2, 0.5) # 悬停后延迟
  1243. # shop_license_page.dispatch_event("mousedown")
  1244. shop_license_page.click()
  1245. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1246. # shop_license_page.dispatch_event("mouseup")
  1247. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1248. target_page.wait_for_load_state("networkidle")
  1249. slow_scroll_400px(target_page, scroll_distance1=700)
  1250. #获取营业执照图片
  1251. target_page.wait_for_load_state("load")
  1252. ocr_res = None
  1253. shop_license_div = target_page.locator('div.shop-licensesImg').nth(0)
  1254. shop_license_div.wait_for(state="visible", timeout=60000)
  1255. shop_license_img = shop_license_div.locator('img')
  1256. try:
  1257. if shop_license_img.count() > 0:
  1258. shop_license_src = shop_license_img.get_attribute('src')
  1259. shop_license_src = shop_license_src.strip() if shop_license_src else None
  1260. ocr_res = get_ocr_res(shop_license_src)
  1261. # print(f'ocr_res:{ocr_res}')
  1262. else:
  1263. shop_license_src = None
  1264. except Exception as e:
  1265. # 捕获定位/提取失败的异常,避免程序崩溃
  1266. logger.warning(f"提取营业执照图片src失败:{e}")
  1267. shop_license_src = None
  1268. print("营业执照图片链接:", shop_license_src)
  1269. # input("..")
  1270. contact_address = ''
  1271. qualification_number = ocr_res.get('社会信用代码', '') if ocr_res else ''
  1272. business_license_company = ocr_res.get('单位名称', '') if ocr_res else ''
  1273. business_license_address = ocr_res.get('地址', '') if ocr_res else ''
  1274. # scrape_date = ''
  1275. # 调用提取函数,获取省份和城市
  1276. province, city = extract_province_city(business_license_address)
  1277. logger.info(f"原始地址:{business_license_address}")
  1278. logger.info(f"提取的省份:{province} | 城市:{city}")
  1279. insert_result = insert_shop_info_to_db(
  1280. shop=shop,
  1281. contact_address=contact_address,
  1282. qualification_number=qualification_number,
  1283. business_license_company=business_license_company,
  1284. business_license_address=business_license_address,
  1285. scrape_date=current_time,
  1286. platform=platform,
  1287. province=province,
  1288. city=city,
  1289. create_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
  1290. update_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  1291. )
  1292. else:
  1293. logger.info("数据库有该店名,在数据库拿取对应字段填充yjj_drug_middle表")
  1294. if shop_info:
  1295. province = shop_info['province']
  1296. city = shop_info['city']
  1297. business_license_company = shop_info['business_license_company']
  1298. qualification_number = shop_info['qualification_number']
  1299. # purchase_price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "").isdigit() else 0.00
  1300. # ========== 关闭新标签页,切回列表页 ==========
  1301. if detail_page and not detail_page.is_closed():
  1302. detail_page.close() # 关闭详情页标签
  1303. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1304. # 切回原列表页(第一个标签页)
  1305. page.bring_to_front() # 激活列表页
  1306. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1307. random_delay(0.5, 1.0) # 增加切换后延迟
  1308. page.wait_for_load_state("networkidle")
  1309. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1310. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1311. # credit_code = ""
  1312. availability = ""
  1313. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  1314. # 构造单条数据元组(适配MySQL字段)
  1315. single_data = {
  1316. # 核心商品信息
  1317. "product": title, # 商品名称
  1318. "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  1319. "min_price": discount_price_val, # 最低价格
  1320. "manufacture_date": manufacture_date, # 生产日期
  1321. "expiry_date": expiry_date, # 有效期
  1322. "shop": shop, # 店铺名
  1323. "business_license_company": business_license_company, # 营业执照主体(公司名称)
  1324. "province": province, # 省份
  1325. "city": city, # 城市
  1326. "manufacturer": manufacturer, # 生产厂家
  1327. "specification": spec, # 规格
  1328. "approval_number": approval_number, # 批准文号
  1329. "product_link": product_link, # 商品链接
  1330. "scrape_date": current_time, # 采集日期
  1331. "scrape_province": "", # 采集省份(可留空或根据IP获取)
  1332. "availability": availability, # 库存状态
  1333. "credit_code": qualification_number, # 统一信用代码(如有可补充提取)
  1334. "platform": platform, # 平台名称(固定或动态获取)
  1335. "search_key": keyword, # 搜索关键词
  1336. "number": num, # 数量(盒数)
  1337. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  1338. "sales": sell, #销量
  1339. "inventory": storage, #库存
  1340. "snapshot_url": oss_url, #快照链接
  1341. "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  1342. "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") # 创建时间
  1343. }
  1344. # 调用逐条插入函数
  1345. insert_single_to_mysql(single_data)
  1346. collect_result.append(single_data)
  1347. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  1348. except Exception as e:
  1349. kw = keyword
  1350. # 2. 把kw添加到异常关键词数组(去重,避免重复添加)
  1351. if kw not in error_keywords:
  1352. error_keywords.append(kw)
  1353. # 异常处理:关闭详情页,强制切回列表页
  1354. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  1355. try:
  1356. if detail_page and not detail_page.is_closed():
  1357. detail_page.close()
  1358. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  1359. if page and not page.is_closed():
  1360. page.bring_to_front() # 切回列表页
  1361. page.wait_for_load_state("networkidle")
  1362. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1363. except Exception as e2:
  1364. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  1365. continue
  1366. # ✅ 每15次滚动一次(修复:用collected_count,且排除0的情况)
  1367. if collected_count % 5 == 0 and collected_count > 0 and collected_count != total_limit:
  1368. logger.info("采满5个往下滑")
  1369. slow_scroll_400px(page,)
  1370. page.wait_for_load_state("networkidle")
  1371. # ====== 当前页采集完毕,尝试翻页 ======
  1372. delay = random_delay(1.5, 3.0)
  1373. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  1374. if goto_next_page(page):
  1375. page_no += 1
  1376. continue
  1377. else:
  1378. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  1379. break
  1380. # 关键词采集完成后长延迟
  1381. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  1382. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  1383. return collect_result
  1384. # ==================== 保存到CSV函数(适配新表头) ====================
  1385. # def save_to_csv(data_list):
  1386. # """
  1387. # 保存数据到CSV(适配新表头)
  1388. # :param data_list: list - 采集到的字典数据列表
  1389. # :return: bool - 保存是否成功
  1390. # """
  1391. # if not data_list:
  1392. # logger.warning(" 无数据可保存到CSV")
  1393. # return False
  1394. # try:
  1395. # # 判断文件是否存在,不存在则写入表头
  1396. # file_exists = os.path.exists(CSV_FILE_PATH)
  1397. # # 打开CSV文件(追加模式,utf-8-sig避免Excel乱码)
  1398. # with open(CSV_FILE_PATH, "a", newline="", encoding="utf-8-sig") as f:
  1399. # # 用新表头作为字段名
  1400. # writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
  1401. # # 首次写入表头
  1402. # if not file_exists:
  1403. # writer.writeheader()
  1404. # logger.info(f" 已创建CSV文件并写入新表头:{CSV_FILE_PATH}")
  1405. # # 写入数据行
  1406. # writer.writerows(data_list)
  1407. # logger.info(f" 成功将 {len(data_list)} 条数据写入CSV")
  1408. # return True
  1409. # except Exception as e:
  1410. # logger.error(f" 保存CSV失败:{str(e)}")
  1411. # return False
  1412. # ==================== 主函数(登录+批量搜索) ====================
  1413. def main():
  1414. logger.info("\n" + "="*50)
  1415. logger.info("🚀 药九九采集程序启动")
  1416. logger.info(f"⏰ 启动时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  1417. logger.info("="*50)
  1418. # 待搜索的关键词列表(直接写在这里,改起来更直观)
  1419. # 存储所有关键词的采集数据
  1420. # all_collect_data = []
  1421. with sync_playwright() as p:
  1422. # browser = init_browser_with_proxy(p)
  1423. # 启动浏览器(用单个配置变量)
  1424. browser = p.chromium.launch(
  1425. headless=False, # 不要用无头模式(反爬:无头模式易被识别)
  1426. channel="chrome", # 使用真实Chrome内核
  1427. slow_mo=random.randint(100, 300), # 全局操作延迟(模拟真人慢速操作)
  1428. args=[
  1429. "--disable-blink-features=AutomationControlled", # 禁用webdriver特征(核心!)
  1430. "--enable-automation=false", # 新增:禁用自动化标识
  1431. "--disable-infobars", # 新增:禁用信息栏
  1432. "--remote-debugging-port=0", # 新增:随机调试端口
  1433. "--start-maximized", # 最大化窗口(模拟真人使用)
  1434. "--disable-extensions", # 禁用扩展(避免特征)
  1435. "--disable-plugins-discovery", # 禁用插件发现
  1436. "--no-sandbox", # 避免沙箱模式特征
  1437. "--disable-dev-shm-usage", # 避免内存限制导致的异常
  1438. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36" # 随机Chrome版本的UA
  1439. ]
  1440. )
  1441. # 创建页面时伪装指纹
  1442. context = browser.new_context(
  1443. locale="zh-CN", # 中文环境
  1444. timezone_id="Asia/Shanghai", # 上海时区
  1445. geolocation={"latitude": 31.230416, "longitude": 121.473701}, # 模拟上海地理位置(可选)
  1446. permissions=["geolocation"], # 授予定位权限(模拟真人)
  1447. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  1448. viewport={"width": 1600, "height": 1400},
  1449. # 关键:隐藏自动化特征
  1450. java_script_enabled=True,
  1451. bypass_csp=True,
  1452. # user_data_dir="./temp_user_data" # 模拟真实用户数据目录
  1453. )
  1454. page = context.new_page()
  1455. # 关键:移除navigator.webdriver标识(反爬核心)
  1456. page.add_init_script("""
  1457. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  1458. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
  1459. Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
  1460. window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
  1461. delete window.navigator.languages;
  1462. window.navigator.languages = ['zh-CN', 'zh'];
  1463. // 新增:模拟真实鼠标移动特征
  1464. (() => {
  1465. const originalAddEventListener = EventTarget.prototype.addEventListener;
  1466. EventTarget.prototype.addEventListener = function(type, listener) {
  1467. if (type === 'mousemove') {
  1468. return originalAddEventListener.call(this, type, (e) => {
  1469. e._automation = undefined;
  1470. listener(e);
  1471. });
  1472. }
  1473. return originalAddEventListener.call(this, type, listener);
  1474. };
  1475. })();
  1476. """)
  1477. try:
  1478. # ========== 核心:Cookie复用逻辑 ==========
  1479. # 1. 加载本地Cookie
  1480. load_cookies(context)
  1481. # 2. 验证登录状态
  1482. if not is_login(page):
  1483. # 3. Cookie失效/不存在,执行登录
  1484. page.goto(TARGET_LOGIN_URL)
  1485. page.wait_for_load_state("networkidle")
  1486. logger.info("🔑 开始执行登录流程")
  1487. # 执行登录操作
  1488. login_success = login_operation(page, USERNAME, PASSWORD)
  1489. if not login_success:
  1490. logger.error(" 登录失败,程序终止")
  1491. return
  1492. # 4. 登录成功后保存Cookie
  1493. save_cookies(context)
  1494. logger.info(" 登录并保存Cookie成功!")
  1495. # 2. 批量搜索+采集+保存
  1496. for keyword_idx, keyword in enumerate(SEARCH_KEYWORDS, 1):
  1497. logger.info(f"\n=====================================")
  1498. logger.info(f"🔍 开始处理第{keyword_idx}/{len(SEARCH_KEYWORDS)}个关键词:{keyword}")
  1499. logger.info(f"=====================================")
  1500. # 执行搜索
  1501. popup_guard(page, "before_search")
  1502. search_success = search_operation(page, keyword)
  1503. # input("")
  1504. popup_guard(page, "after_search")
  1505. if not search_success:
  1506. logger.warning(f" 「{keyword}」搜索失败,跳过采集")
  1507. continue
  1508. # ✅ 再等页面稳定一下(networkidle 有时会等不到,建议加超时或换成 domcontentloaded)
  1509. page.wait_for_load_state("domcontentloaded")
  1510. page.wait_for_load_state('networkidle')
  1511. # 采集数据
  1512. data_list = collect_data(page, keyword)
  1513. # # 保存到CSV
  1514. # if data_list:
  1515. # save_to_csv(data_list)
  1516. # else:
  1517. # logger.warning(f" 「{keyword}」无数据,跳过保存")
  1518. logger.info("\n🎉 所有关键词处理完成!CSV文件路径:" + os.path.abspath(CSV_FILE_PATH))
  1519. # input("\n按回车关闭程序...")
  1520. except Exception as e:
  1521. logger.error(f" 程序异常:{str(e)}")
  1522. finally:
  1523. browser.close()
  1524. logger.info(" 浏览器已关闭,程序结束")
# ==================== Program entry point ====================
# Run the scraper only when this file is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    main()