main.py 78 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. from logger_config import logger
  3. from datetime import datetime
  4. import random
  5. import csv
  6. import os
  7. import time
  8. import json
  9. from config import *
  10. from conn_mysql import MySQLPoolOnline
  11. import re
  12. import uuid
  13. import requests
  14. import base64
  15. from io import BytesIO
  16. from PIL import Image
  17. import traceback
  18. import oss2
  19. import schedule
  20. # from faker import Faker
# Proxy IP pool settings. get_random_proxy() returns None while PROXY_POOL_URL is blank,
# and validate_proxy() returns False while PROXY_VALIDATION_URL is blank.
PROXY_POOL_URL = ""
PROXY_VALIDATION_URL = "" # URL requested through a proxy to check that the proxy works
PROXY_TIMEOUT = 10 # Proxy validation timeout (seconds)
# Module-level MySQL pool instance used by the database helpers in this file.
mysql_pool = MySQLPoolOnline()
  26. def _chromium_launch_args(chrome_major=None):
  27. """Playwright Chromium 启动参数(反自动化 + 随机 UA 主版本)。"""
  28. major = chrome_major if chrome_major is not None else random.randint(110, 120)
  29. return [
  30. "--disable-blink-features=AutomationControlled",
  31. "--enable-automation=false",
  32. "--disable-infobars",
  33. "--remote-debugging-port=0",
  34. "--start-maximized",
  35. "--disable-extensions",
  36. "--disable-plugins-discovery",
  37. "--no-sandbox",
  38. "--disable-dev-shm-usage",
  39. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
  40. f"(KHTML, like Gecko) Chrome/{major}.0.0.0 Safari/537.36",
  41. ]
  42. def get_random_proxy():
  43. """从代理池获取随机代理IP"""
  44. if not PROXY_POOL_URL.strip():
  45. return None
  46. try:
  47. response = requests.get(PROXY_POOL_URL, timeout=10)
  48. if response.status_code == 200:
  49. proxy = response.text.strip()
  50. if validate_proxy(proxy):
  51. logger.info(f"获取到有效代理: {proxy}")
  52. return proxy
  53. logger.warning(f"代理无效: {proxy}")
  54. except Exception as e:
  55. logger.error(f"获取代理失败: {str(e)}")
  56. return None
  57. def validate_proxy(proxy):
  58. """验证代理IP有效性"""
  59. if not PROXY_VALIDATION_URL.strip():
  60. return False
  61. try:
  62. proxies = {
  63. "http": f"http://{proxy}",
  64. "https": f"https://{proxy}"
  65. }
  66. response = requests.get(
  67. PROXY_VALIDATION_URL,
  68. proxies=proxies,
  69. timeout=PROXY_TIMEOUT
  70. )
  71. return response.status_code == 200
  72. except Exception:
  73. return False
  74. def init_browser_with_proxy(playwright):
  75. proxy = get_random_proxy()
  76. proxy_config = None
  77. if proxy:
  78. proxy_server, proxy_port = proxy.split(":")
  79. proxy_config = {
  80. "server": f"http://{proxy_server}:{proxy_port}",
  81. # "username": "your_proxy_username",
  82. # "password": "your_proxy_password"
  83. }
  84. logger.info(f"使用代理: {proxy_server}:{proxy_port}")
  85. else:
  86. logger.warning("未获取到有效代理,将使用本地IP")
  87. # 启动浏览器(保留原有反爬配置)
  88. return playwright.chromium.launch(
  89. headless=False, # 非无头模式
  90. channel="chrome", # 使用Chrome内核
  91. slow_mo=random.randint(100, 300), # 随机操作延迟
  92. proxy=proxy_config, # 代理配置(None则不使用代理)
  93. args=_chromium_launch_args(),
  94. )
  95. # ==================== 2. 反爬工具函数 ====================
  96. def random_delay(min_seconds, max_seconds):
  97. """生成随机延迟(核心反爬:避免固定间隔)"""
  98. delay = random.uniform(min_seconds, max_seconds)
  99. time.sleep(delay)
  100. return delay
  101. def simulate_human_typing(page, locator, text):
  102. """模拟真人打字(逐个字符输入,带随机间隔)"""
  103. try:
  104. locator.click()
  105. locator.clear()
  106. for char in text:
  107. locator.type(char, delay=random.uniform(MIN_INPUT_DELAY, MAX_INPUT_DELAY))
  108. random_delay(0.05, 0.1) # 字符间额外小延迟
  109. logger.info(f" 模拟真人输入完成:{text}")
  110. except Exception as e:
  111. logger.error(f"模拟打字失败:{e}")
  112. locator.fill(text) # 兜底:直接填充
  113. def save_cookies(context, cookie_path=COOKIE_FILE_PATH):
  114. """保存Cookie到本地JSON文件"""
  115. try:
  116. cookies = context.cookies()
  117. with open(cookie_path, "w", encoding="utf-8") as f:
  118. json.dump(cookies, f, ensure_ascii=False, indent=2)
  119. logger.info(f"Cookie已保存到:{cookie_path}")
  120. return True
  121. except Exception as e:
  122. logger.error(f" 保存Cookie失败:{e}")
  123. return False
  124. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  125. """从本地JSON文件加载Cookie到浏览器上下文"""
  126. if not os.path.exists(cookie_path):
  127. logger.warning(f" Cookie文件不存在:{cookie_path}")
  128. return False
  129. try:
  130. with open(cookie_path, "r", encoding="utf-8") as f:
  131. cookies = json.load(f)
  132. context.add_cookies(cookies)
  133. logger.info(f"✅ 已从{cookie_path}加载Cookie")
  134. return True
  135. except Exception as e:
  136. logger.error(f" 加载Cookie失败:{e}")
  137. return False
  138. def is_login(page):
  139. """验证是否已登录(核心:检测登录态)"""
  140. try:
  141. # 访问需要登录的页面
  142. page.goto(LOGIN_VALIDATE_URL, timeout=ELEMENT_TIMEOUT)
  143. time.sleep(5)
  144. page.wait_for_load_state("networkidle")
  145. # 检测是否跳转到登录页(URL包含login则未登录)
  146. if "login" in page.url.lower():
  147. logger.warning(" Cookie失效,需要重新登录")
  148. return False
  149. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  150. # if page.locator("用户中心选择器").count() > 0:
  151. # return True
  152. logger.info(" Cookie有效,已保持登录状态")
  153. return True
  154. except Exception as e:
  155. logger.error(f" 验证登录状态失败:{e}")
  156. return False
  157. # ==================== 滚动函数重构(核心修改) ====================
  158. def slow_scroll_400px(page,scroll_distance1=400):
  159. """
  160. 慢速滚动400px±50px(模拟真人滑动)
  161. :param page: 页面对象
  162. :return: 滚动是否成功
  163. """
  164. try:
  165. # 生成400±50px的随机滚动距离
  166. scroll_distance = random.randint(
  167. scroll_distance1 - SCROLL_OFFSET_RANGE,
  168. scroll_distance1 + SCROLL_OFFSET_RANGE
  169. )
  170. remaining_distance = scroll_distance
  171. total_steps = int(scroll_distance / SCROLL_STEP)
  172. logger.info(
  173. f"📜 开始慢速滚动(目标距离:{scroll_distance}px,总步数:{total_steps},总时长约{total_steps*SCROLL_INTERVAL:.2f}秒)"
  174. )
  175. # 渐进式滚动(每步50px,间隔0.05秒)
  176. for _ in range(total_steps):
  177. step = min(SCROLL_STEP, remaining_distance)
  178. page.evaluate(f"window.scrollBy(0, {step});")
  179. remaining_distance -= step
  180. time.sleep(SCROLL_INTERVAL)
  181. # 处理剩余不足一步的距离
  182. if remaining_distance > 0:
  183. page.evaluate(f"window.scrollBy(0, {remaining_distance});")
  184. time.sleep(SCROLL_INTERVAL)
  185. # 滚动后等待懒加载完成
  186. page.wait_for_load_state("networkidle", timeout=8000)
  187. random_delay(2.0, 3.0) # 滚动后额外停顿,模拟真人
  188. logger.info(f" 慢速滚动完成,实际滚动距离:{scroll_distance - remaining_distance}px")
  189. return True
  190. except Exception as e:
  191. logger.warning(f" 慢速滚动失败:{e}")
  192. return False
  193. # def check_anti_crawl(page):
  194. # """检测反爬弹窗/验证码(核心:提前识别反爬)"""
  195. # anti_crawl_selectors = [
  196. # "//div[contains(text(), '验证')]",
  197. # "//div[contains(text(), '人机验证')]",
  198. # "//div[contains(text(), '访问过于频繁')]",
  199. # "//button[contains(text(), '验证')]"
  200. # ]
  201. # for selector in anti_crawl_selectors:
  202. # if page.locator(selector).count() > 0:
  203. # logger.error("❌ 检测到反爬验证弹窗!请手动完成验证后按回车继续...")
  204. # input() # 暂停等待手动验证
  205. # return True
  206. # return False
# CSV export settings
CSV_FILE_PATH = f"yjj_collect_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" # CSV output path; the timestamp is evaluated once, when this statement runs
CSV_HEADERS = [
    "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
    "店铺名称", "公司名称",
    "有效日期", "生产日期", "批准文号", "采集时间"
] # Column headers
  214. # ==================== 登录函数 ====================
  215. def login_operation(page, username, password):
  216. """登录操作函数"""
  217. try:
  218. # 输入手机号(直接用单个变量)
  219. page.wait_for_selector(USERNAME_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  220. page.wait_for_timeout(timeout=3000)
  221. page.fill(USERNAME_SELECTOR, username)
  222. logger.info(" 已输入登录账号")
  223. # 输入密码
  224. page.wait_for_selector(PASSWORD_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  225. page.wait_for_timeout(timeout=3000)
  226. page.fill(PASSWORD_SELECTOR, password)
  227. logger.info(" 已输入登录密码")
  228. # 点击登录按钮
  229. page.wait_for_selector(LOGIN_BTN_SELECTOR, timeout=ELEMENT_TIMEOUT)
  230. page.wait_for_timeout(timeout=3000)
  231. page.click(LOGIN_BTN_SELECTOR)
  232. logger.info(" 已点击登录按钮")
  233. page.wait_for_timeout(LOGIN_AFTER_CLICK)
  234. return True
  235. except PlaywrightTimeoutError as e:
  236. logger.error(f" 登录失败:元素定位超时 - {str(e)}")
  237. return False
  238. except Exception as e:
  239. logger.error(f" 登录异常:{str(e)}")
  240. return False
  241. def kill_masks(page):
  242. """
  243. 强制清理残留遮罩层/覆盖层,并恢复 body 可滚动、可点击状态
  244. """
  245. page.evaluate(r"""
  246. () => {
  247. const removed = [];
  248. const hidden = [];
  249. // 1) 先处理已知常见遮罩
  250. const knownSelectors = [
  251. '.v-modal',
  252. '.el-overlay',
  253. '.el-overlay-dialog',
  254. '.el-dialog__wrapper',
  255. '.el-message-box__wrapper',
  256. '.el-loading-mask',
  257. '.el-popup-parent--hidden'
  258. ];
  259. for (const sel of knownSelectors) {
  260. document.querySelectorAll(sel).forEach(el => {
  261. // v-modal / overlay 直接 remove 最省事
  262. removed.push(sel);
  263. el.remove();
  264. });
  265. }
  266. // 2) 再做一次“泛化兜底”:全屏 fixed/absolute + 高 z-index 的覆盖层
  267. // 注意:不要误删页面正常的固定导航,所以加上“近似全屏”的判断
  268. const all = Array.from(document.querySelectorAll('body *'));
  269. for (const el of all) {
  270. const s = window.getComputedStyle(el);
  271. if (!s) continue;
  272. const z = parseInt(s.zIndex || '0', 10);
  273. const pos = s.position;
  274. const pe = s.pointerEvents;
  275. if ((pos === 'fixed' || pos === 'absolute') && z >= 1000 && pe !== 'none') {
  276. const r = el.getBoundingClientRect();
  277. const nearFullScreen =
  278. r.width >= window.innerWidth * 0.8 &&
  279. r.height >= window.innerHeight * 0.8 &&
  280. r.left <= window.innerWidth * 0.1 &&
  281. r.top <= window.innerHeight * 0.1;
  282. // 常见遮罩是半透明背景色,或者透明但拦截点击
  283. const bg = s.backgroundColor || '';
  284. const looksLikeMask =
  285. nearFullScreen && (bg.includes('rgba') || bg.includes('rgb') || s.opacity !== '1');
  286. if (nearFullScreen) {
  287. // 不管透明不透明,只要近似全屏且高 z-index,就先让它不拦截点击
  288. el.style.pointerEvents = 'none';
  289. el.style.display = 'none';
  290. hidden.push(el.tagName + '.' + (el.className || ''));
  291. }
  292. }
  293. }
  294. // 3) 恢复 body / html 的滚动与交互(很多弹窗会锁滚动)
  295. document.documentElement.style.overflow = 'auto';
  296. document.body.style.overflow = 'auto';
  297. document.body.style.position = 'static';
  298. document.body.style.width = 'auto';
  299. document.body.style.paddingRight = '0px';
  300. // 4) 去掉 Element-UI 常见的锁定 class
  301. document.body.classList.remove('el-popup-parent--hidden');
  302. return { removed, hiddenCount: hidden.length, hidden };
  303. }
  304. """)
  305. def force_close_popup(page):
  306. """关闭新手引导/遮罩(多步:下一步/完成/我知道了),并兜底移除遮罩层"""
  307. try:
  308. # 1) 尝试连续点“下一步/完成/我知道了/关闭”
  309. for _ in range(5): # 最多点5次,足够覆盖多步引导
  310. btn = page.locator(
  311. "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  312. ).first
  313. if btn.count() > 0 and btn.is_visible():
  314. btn.click(timeout=1500)
  315. page.wait_for_timeout(300)
  316. continue
  317. # 有些引导是右上角 X(如果存在就点)
  318. close_icon = page.locator(
  319. "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
  320. ).first
  321. if close_icon.count() > 0 and close_icon.is_visible():
  322. close_icon.click(timeout=1000)
  323. page.wait_for_timeout(300)
  324. continue
  325. break
  326. # 2) 兜底:移除常见遮罩层(element-ui / 通用 mask/overlay)
  327. page.evaluate("""
  328. const selectors = [
  329. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  330. '[class*="mask"]', '[class*="overlay"]', '[style*="z-index"]'
  331. ];
  332. for (const sel of selectors) {
  333. document.querySelectorAll(sel).forEach(el => {
  334. const s = window.getComputedStyle(el);
  335. // 只移除“覆盖层”倾向的元素:fixed/absolute 且 z-index 很高
  336. if ((s.position === 'fixed' || s.position === 'absolute') && parseInt(s.zIndex || '0', 10) >= 1000) {
  337. el.remove();
  338. }
  339. });
  340. }
  341. """)
  342. except Exception:
  343. pass
  344. # 调用方式和方案1一致:在搜索后、采集前执行
  345. # force_close_popup(page)
  346. def pick_search_input(page):
  347. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  348. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  349. cnt = inputs.count()
  350. # 优先检查前两个(你说只有两个)
  351. for i in range(min(cnt, 2)):
  352. candidate = inputs.nth(i)
  353. try:
  354. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  355. if candidate.is_enabled():
  356. return candidate
  357. except PlaywrightTimeoutError:
  358. continue
  359. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  360. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  361. candidate.wait_for(state="visible", timeout=ELEMENT_TIMEOUT)
  362. return candidate
  363. def type_slow(locator, text, min_delay=0.06, max_delay=0.18):
  364. """逐字输入,模拟真人打字"""
  365. for ch in text:
  366. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
  367. # ==================== 搜索操作函数 ====================
  368. def search_operation(page, keyword):
  369. """搜索框填充+提交搜索"""
  370. try:
  371. # 1) 找到“可用”的搜索框(第一个不行就用第二个)
  372. search_locator = pick_search_input(page)
  373. # 清空并填充搜索框
  374. search_locator.wait_for(timeout=ELEMENT_TIMEOUT)
  375. # 2. 清空搜索框(双重保障:先调用locator的clear,再手动全选删除)
  376. search_locator.click() # 聚焦
  377. search_locator.fill("")
  378. page.keyboard.down("Control") # 按住Control键
  379. page.keyboard.press("a") # 按a键
  380. page.keyboard.up("Control") # 松开Control键
  381. page.keyboard.press("Backspace") # 删除选中内容
  382. # 3) 逐字输入
  383. type_slow(search_locator, keyword, min_delay=0.06, max_delay=0.18)
  384. # 3. 输入搜索关键词
  385. # search_locator.fill(keyword)
  386. logger.info(f"📝 已输入搜索关键词:{keyword}")
  387. # 3) 搜索按钮也建议点可见的那个
  388. btn = page.locator(f"{SEARCH_BTN_SELECTOR}:visible").first
  389. btn.wait_for(state="visible", timeout=SEARCH_BTN_TIMEOUT)
  390. btn.click()
  391. page.wait_for_timeout(600)
  392. try:
  393. page.wait_for_load_state("networkidle", timeout=10000)
  394. except Exception:
  395. pass
  396. force_close_popup(page)
  397. kill_masks(page)
  398. logger.info("✅ 已触发搜索")
  399. return True
  400. except PlaywrightTimeoutError as e:
  401. logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
  402. return False
  403. except Exception as e:
  404. logger.error(f" 搜索异常:{str(e)}")
  405. return False
  406. #翻下一页
  407. def goto_next_page(page):
  408. """
  409. 尝试翻到下一页;成功返回True,没下一页/翻页失败返回False
  410. 适配常见 ElementUI: .el-pagination .btn-next / .el-pagination__next
  411. """
  412. # 多写几个候选,哪个能用就用哪个
  413. candidates = [
  414. ".el-pagination button.btn-next:not(.is-disabled)",
  415. ".el-pagination__next:not(.is-disabled)",
  416. "button:has-text('下一页'):not([disabled])",
  417. "a:has-text('下一页')",
  418. ]
  419. next_btn = None
  420. for sel in candidates:
  421. loc = page.locator(sel).first
  422. if loc.count() > 0:
  423. next_btn = loc
  424. break
  425. if not next_btn:
  426. return False
  427. # 用“当前页第一个商品标题”做翻页完成的判据(比只等networkidle更稳)
  428. first_title = page.locator(PRODUCT_TITLE_SELECTOR).first
  429. before = ""
  430. try:
  431. if first_title.count() > 0:
  432. before = first_title.inner_text(timeout=2000).strip()
  433. except:
  434. pass
  435. try:
  436. page.evaluate("window.scrollTo(0, 0);")
  437. next_btn.click(timeout=5000)
  438. page.wait_for_load_state("networkidle")
  439. # 等列表发生变化(标题变了 / 或者至少第一个标题重新出现)
  440. if before:
  441. page.wait_for_function(
  442. """(sel, oldText) => {
  443. const el = document.querySelector(sel);
  444. return el && el.innerText && el.innerText.trim() !== oldText;
  445. }""",
  446. arg=(PRODUCT_TITLE_SELECTOR, before),
  447. timeout=5000
  448. )
  449. else:
  450. first_title.wait_for(timeout=1000)
  451. return True
  452. except Exception as e:
  453. logger.warning(f" 翻页失败:{e}")
  454. return False
  455. def popup_guard(page, tag=""):
  456. """
  457. 全局弹窗/遮罩守卫:多步引导 + 关闭按钮 + 遮罩清理 + 恢复滚动
  458. tag 仅用于日志区分调用位置
  459. """
  460. try:
  461. # 给弹窗一点出现时间
  462. page.wait_for_timeout(300)
  463. # 1) 连续点“下一步/完成/我知道了/关闭”
  464. for _ in range(6):
  465. btn = page.locator(
  466. "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  467. ).first
  468. if btn.count() > 0 and btn.is_visible():
  469. btn.click(timeout=1500)
  470. page.wait_for_timeout(250)
  471. continue
  472. # 2) 常见的 close icon
  473. close_btn = page.locator(
  474. "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
  475. ).first
  476. if close_btn.count() > 0 and close_btn.is_visible():
  477. close_btn.click(timeout=1200)
  478. page.wait_for_timeout(250)
  479. continue
  480. break
  481. # 3) 清遮罩 + 恢复滚动/交互
  482. page.evaluate(r"""
  483. () => {
  484. // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
  485. const selectors = [
  486. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  487. '.el-message-box__wrapper', '.el-loading-mask'
  488. ];
  489. selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
  490. // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
  491. const all = Array.from(document.querySelectorAll('body *'));
  492. for (const el of all) {
  493. const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
  494. const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
  495. // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
  496. if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
  497. const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
  498. // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
  499. const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
  500. if (nearFull) {
  501. el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
  502. el.style.display = 'none'; // 隐藏元素
  503. }
  504. }
  505. }
  506. // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
  507. document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
  508. document.body.style.overflow = 'auto'; // body标签恢复滚动
  509. document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
  510. }
  511. """)
  512. logger.info("杀除弹窗成功")
  513. except Exception:
  514. pass
  515. def open_detail_page(list_page, item, keyword, idx, *, timeout=15000):
  516. """
  517. 点击商品进入详情页,兼容:
  518. 1) 新开 tab(返回 detail_page != list_page, opened_new_tab=True)
  519. 2) 同 tab 跳转(detail_page == list_page, opened_new_tab=False)
  520. """
  521. ctx = list_page.context
  522. list_url = list_page.url
  523. detail_page = None
  524. opened_new_tab = False
  525. try:
  526. # 期望新开 tab(很多站点会这样)
  527. with ctx.expect_page(timeout=timeout) as p:
  528. item.click(delay=random.uniform(0.1, 0.3))
  529. detail_page = p.value
  530. opened_new_tab = True
  531. logger.info(f" 「{keyword}」第{idx}个商品 - 新开标签页进入详情")
  532. except PlaywrightTimeoutError:
  533. # 兜底:没新开 tab,大概率是同页跳转/弹层
  534. detail_page = list_page
  535. opened_new_tab = False
  536. logger.info(f" 「{keyword}」第{idx}个商品 - 未新开标签页,按同页进入详情处理")
  537. return detail_page, opened_new_tab, list_url
  538. def return_to_list(list_page, detail_page, opened_new_tab, list_url, keyword, idx):
  539. """
  540. 从详情页返回列表页:
  541. - 新 tab:关闭 tab,然后 bring_to_front 切回
  542. - 同 tab:尽量 go_back 回到 list_url;如果没跳转而是弹层,尝试 ESC
  543. """
  544. # 如果浏览器/页面已经被关了,直接退出,避免二次异常
  545. if list_page is None or list_page.is_closed():
  546. logger.warning(f" 「{keyword}」第{idx}个商品 - 列表页已关闭,无法切回")
  547. return
  548. if opened_new_tab:
  549. # 只关“新开的详情 tab”,绝不关 list_page
  550. try:
  551. if detail_page and (detail_page is not list_page) and (not detail_page.is_closed()):
  552. detail_page.close()
  553. logger.info(f"📌 「{keyword}」第{idx}个商品 - 已关闭详情页标签页")
  554. except Exception as e:
  555. logger.warning(f" 「{keyword}」第{idx}个商品 - 关闭详情页失败:{e}")
  556. # 切回列表页
  557. try:
  558. list_page.bring_to_front()
  559. list_page.mouse.move(random.randint(100, 300), random.randint(200, 400))
  560. random_delay(0.3, 0.8)
  561. list_page.wait_for_load_state("networkidle")
  562. logger.info(f" 「{keyword}」第{idx}个商品 - 已切回列表页(新tab模式)")
  563. except Exception as e:
  564. logger.warning(f" 「{keyword}」第{idx}个商品 - 切回列表页失败:{e}")
  565. return
  566. # 同 tab:detail_page == list_page
  567. try:
  568. # 1) 如果 URL 变了,说明确实跳转了 → go_back 回去
  569. if list_page.url != list_url:
  570. for _ in range(3): # 最多退 3 次,防止死循环
  571. list_page.go_back(timeout=15000)
  572. list_page.wait_for_load_state("domcontentloaded", timeout=15000)
  573. random_delay(0.2, 0.5)
  574. if list_page.url == list_url:
  575. break
  576. logger.info(f" 「{keyword}」第{idx}个商品 - 已返回列表页(同tab跳转模式)")
  577. else:
  578. # 2) URL 没变:可能是弹层详情 → 尝试 ESC 关闭弹层
  579. list_page.keyboard.press("Escape")
  580. random_delay(0.2, 0.5)
  581. logger.info(f" 「{keyword}」第{idx}个商品 - 已尝试关闭弹层并留在列表页(同tab弹层模式)")
  582. list_page.bring_to_front()
  583. list_page.wait_for_load_state("networkidle")
  584. except Exception as e:
  585. logger.warning(f" 「{keyword}」第{idx}个商品 - 同tab返回列表页失败:{e}")
  586. #判断店名是否已经在数据库
  587. def shop_is_exists_database(shop):
  588. query_sql = """
  589. SELECT province, city, business_license_company, qualification_number
  590. FROM retrieve_yjj_shop_info_middle
  591. WHERE shop = %s
  592. """
  593. try:
  594. rows = mysql_pool.select_data(query_sql, (shop,))
  595. result = rows[0] if rows else None
  596. logger.debug("店铺存在校验 | shop=%r | result=%s", shop, result)
  597. is_exists = bool(result)
  598. if is_exists:
  599. logger.info(f"【店铺存在校验】店铺已存在 | 店铺名:{repr(shop)} | 结果:存在(True)不要执行采集店铺")
  600. else:
  601. logger.info(f"【店铺存在校验】店铺不存在 | 店铺名:{repr(shop)} | 结果:不存在(False)")
  602. return is_exists, result
  603. except Exception as e:
  604. logger.error(f"查询店铺失败:{e}")
  605. return False, None
  606. def insert_shop_info_to_db(shop,contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform, province, city, create_time, update_time):
  607. """
  608. 把字段插入到yjj_shop_info_middle表
  609. :param 各参数: 你要插入的字段值(空字符串也可)
  610. :return: bool - 插入成功返回True,失败返回False
  611. """
  612. sql = """
  613. INSERT INTO retrieve_yjj_shop_info_middle (
  614. shop,
  615. contact_address,
  616. qualification_number,
  617. business_license_company,
  618. business_license_address,
  619. scrape_date,
  620. platform,
  621. province,
  622. city,
  623. create_time,
  624. update_time
  625. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  626. ON DUPLICATE KEY UPDATE
  627. contact_address = VALUES(contact_address),
  628. qualification_number = VALUES(qualification_number),
  629. business_license_company = VALUES(business_license_company),
  630. business_license_address = VALUES(business_license_address),
  631. scrape_date = VALUES(scrape_date),
  632. platform = VALUES(platform),
  633. province = VALUES(province),
  634. city = VALUES(city),
  635. update_time = VALUES(update_time)
  636. """
  637. params = (
  638. shop,
  639. contact_address,
  640. qualification_number,
  641. business_license_company,
  642. business_license_address,
  643. scrape_date,
  644. platform,
  645. province,
  646. city,
  647. create_time,
  648. update_time,
  649. )
  650. try:
  651. n = mysql_pool.execute(sql, params)
  652. if n > 0:
  653. logger.info("店铺信息已入库 | shop=%s", shop)
  654. return True
  655. logger.error("店铺信息插入失败:受影响行数为 0 | shop=%s", shop)
  656. return False
  657. except Exception as e:
  658. logger.error("店铺信息插入失败:%s", e)
  659. return False
  660. def insert_single_to_mysql(single_data):
  661. """
  662. 逐条插入单条数据到MySQL数据库
  663. :param single_data: 单条商品数据元组
  664. :return: 插入是否成功
  665. """
  666. insert_sql = """
  667. INSERT INTO retrieve_scrape_data
  668. (enterprise_id,product_name, min_price, manufacture_date,
  669. expiry_date, store_name, company_name, province_name, city_name, manufacturer, product_specs, approval_number, link_url,
  670. scrape_date, is_sold_out, qualification_number, platform_id, number, sales,inventory,snapshot_url,search_name,product_brand,collect_config_info,task_id,insert_time,update_time,collect_equipment_account_id,collect_region_id,collect_round) VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s, %s);
  671. """
  672. values = (
  673. single_data["company_id"],
  674. single_data["product"],
  675. single_data["min_price"],
  676. single_data["manufacture_date"],
  677. single_data["expiry_date"],
  678. single_data["shop"],
  679. single_data["business_license_company"],
  680. single_data["province"],
  681. single_data["city"],
  682. single_data["manufacturer"],
  683. single_data["specification"],
  684. single_data["approval_number"],
  685. single_data["product_link"],
  686. single_data["scrape_date"],
  687. single_data["is_sold_out"],
  688. single_data["credit_code"],
  689. 7,
  690. single_data["number"],
  691. single_data["sales"],
  692. single_data["inventory"],
  693. single_data["snapshot_url"],
  694. single_data["search_name"],
  695. single_data["product_brand"],
  696. single_data["collect_config_info"],
  697. single_data["task_id"],
  698. single_data["insert_time"],
  699. single_data["update_time"],
  700. single_data["collect_equipment_account_id"],
  701. single_data["collect_region_id"],
  702. single_data["collect_round"],
  703. )
  704. try:
  705. n = mysql_pool.execute(insert_sql, values)
  706. if n > 0:
  707. logger.info("单条采集数据已入库 | product=%s", single_data.get("product", "")[:80])
  708. return True
  709. logger.error("单条数据插入失败:受影响行数为 0")
  710. return False
  711. except Exception as e:
  712. logger.error("单条数据插入失败:%s", e)
  713. return False
  714. def check_dup_in_biz_db(product_link, discount_price_val, scrape_date,keyword_dict):
  715. """直接查询业务表是否存在该商品链接+价格"""
  716. log_context = (
  717. f"【去重校验】商品链接:{product_link.strip()} | 价格:{discount_price_val} "
  718. f"采集日期:{scrape_date.strip()}"
  719. )
  720. collect_round = keyword_dict["collect_round"]
  721. collect_equipment_account_id = keyword_dict["collect_equipment_account_id"]
  722. collect_region_id = keyword_dict["collect_region_id"]
  723. sql = """
  724. SELECT 1 FROM retrieve_scrape_data
  725. WHERE link_url = %s AND min_price = %s AND scrape_date = %s AND platform_id = %s
  726. AND collect_round = %s AND collect_equipment_account_id = %s AND collect_region_id = %s
  727. LIMIT 1
  728. """
  729. params = (
  730. product_link.strip(),
  731. discount_price_val,
  732. scrape_date.strip(),
  733. 7,
  734. collect_round,
  735. collect_equipment_account_id,
  736. collect_region_id,
  737. )
  738. try:
  739. rows = mysql_pool.select_data(sql, params)
  740. is_dup = bool(rows)
  741. if is_dup:
  742. logger.warning(f"{log_context} - 表中已存在重复记录,跳过本次采集")
  743. else:
  744. logger.info(f"{log_context} - 表中无重复记录,正常采集")
  745. return is_dup
  746. except Exception as e:
  747. logger.error(f"查询业务表去重失败:{str(e)}")
  748. return False
  749. # 压缩图片函数
  750. def compress_image(image_data, max_size=4*1024*1024): # 4MB上限
  751. try:
  752. img = Image.open(BytesIO(image_data))
  753. # 将RGBA模式转为RGB(兼容JPEG)
  754. if img.mode in ('RGBA', 'P'): # P是PNG的调色板模式,也需转换
  755. # 新建白色背景的RGB图片,把透明图贴上去(避免透明区域变黑)
  756. bg_img = Image.new('RGB', img.size, (255, 255, 255))
  757. bg_img.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
  758. img = bg_img
  759. # 缩小分辨率(按比例缩到宽≤1000px)
  760. if img.width > 1000:
  761. ratio = 1000 / img.width
  762. new_size = (int(img.width*ratio), int(img.height*ratio))
  763. img = img.resize(new_size, Image.Resampling.LANCZOS)
  764. output = BytesIO()
  765. img.save(output, format='JPEG', quality=80)
  766. compressed_data = output.getvalue()
  767. if len(compressed_data) > max_size:
  768. output2 = BytesIO()
  769. img.save(output2, format='JPEG', quality=60)
  770. compressed_data = output2.getvalue()
  771. return compressed_data
  772. except Exception as e:
  773. logger.debug(f"图片压缩失败:{e}")
  774. return image_data # 压缩失败返回原始数据
  775. def download_image_to_base64(image_url, save_dir = "./download_images"):
  776. """下载网络图片,返回图片二进制数据(BytesIO)"""
  777. try:
  778. if not os.path.exists(save_dir):
  779. os.makedirs(save_dir) # 创建多级目录(比如a/b/c)
  780. print(f"创建本地保存目录:{save_dir}")
  781. except Exception as e:
  782. print(f"创建保存目录失败:{str(e)}")
  783. return None
  784. try:
  785. # 模拟浏览器请求头,避免被服务器拦截
  786. headers = {
  787. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  788. }
  789. response = requests.get(image_url, headers=headers, timeout=15)
  790. response.raise_for_status()
  791. compressed_data = compress_image(response.content)
  792. image_base64 = base64.b64encode(compressed_data).decode("utf-8")
  793. image_data = compressed_data
  794. # 步骤3:提取图片文件名(从URL中截取,避免重复)
  795. # 示例URL:https://xxx.com/123.jpg → 文件名:123.jpg
  796. file_name = image_url.split("/")[-1]
  797. # 处理特殊字符(避免文件名非法)
  798. file_name = file_name.replace("?", "").replace("&", "").replace("=", "")
  799. save_path = os.path.join(save_dir, file_name) # 完整保存路径
  800. # 步骤4:保存图片到本地
  801. with open(save_path, "wb") as f:
  802. f.write(image_data)
  803. print(f"图片已保存到本地:{save_path}")
  804. return image_base64
  805. except requests.exceptions.Timeout:
  806. print(f"下载图片超时:{image_url}")
  807. return None
  808. except requests.exceptions.HTTPError as e:
  809. code = e.response.status_code if e.response is not None else "?"
  810. logger.warning("下载图片 HTTP 错误 | url=%s | status=%s", image_url, code)
  811. return None
  812. except Exception as e:
  813. print(f"下载图片失败:{str(e)}")
  814. return None
  815. def get_ocr_res(img):
  816. try:
  817. #img地址
  818. print(f'开始识别图片:{img}')
  819. request_url = request_url_config
  820. img_base64 = download_image_to_base64(img)
  821. if not img_base64:
  822. print("图片下载/转Base64失败,终止OCR识别")
  823. return None
  824. # 获取access_token
  825. access_token = get_access_token()
  826. if not access_token:
  827. print("获取access_token失败,无法调用OCR接口")
  828. return None
  829. params = {"image": img_base64}
  830. request_url = request_url + "?access_token=" + access_token
  831. headers = {'content-type': 'application/x-www-form-urlencoded'}
  832. response = requests.post(request_url, data=params, headers=headers)
  833. if response:
  834. res = response.json()
  835. # 检查OCR返回是否有错误
  836. if "error_code" in res:
  837. print(f"百度OCR接口错误:{res['error_msg']}(错误码:{res['error_code']})")
  838. return None
  839. # 解析识别结果
  840. new_dic = dict()
  841. for ite in res['words_result'].keys():
  842. new_dic[ite] = res['words_result'][ite]['words']
  843. print('资质数据信息', new_dic)
  844. return new_dic
  845. else:
  846. print("OCR接口返回空响应")
  847. return None
  848. except requests.exceptions.RequestException as e:
  849. print(f"网络错误(图片下载/OCR请求失败):{str(e)}")
  850. return None
  851. except KeyError as e:
  852. print(f"OCR响应格式异常,缺失字段:{str(e)}")
  853. return None
  854. except Exception as e:
  855. print(f"OCR识别未知错误:{str(e)}")
  856. return None
  857. def get_access_token():
  858. AppKey = AppKey_config
  859. AppSrcret = AppSecret_config
  860. token_url =token_url_config
  861. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  862. payload = ""
  863. headers = {
  864. 'Content-Type': 'application/json',
  865. 'Accept': 'application/json'
  866. }
  867. try:
  868. response = requests.request("POST", url, headers=headers, data=payload)
  869. response.raise_for_status() # 触发HTTP错误
  870. return response.json()['access_token']
  871. except Exception as e:
  872. print(f"获取access_token失败:{str(e)}")
  873. return None
  874. def extract_province_city(address):
  875. """
  876. 从地址中提取省份和城市
  877. :param address: 营业执照地址(如"福建省福州市马尾区")
  878. :return: (province, city) - 提取到的省份/城市,提取失败返回空字符串
  879. """
  880. if not address: # 地址为空,直接返回空
  881. return "", ""
  882. # 正则1:匹配省份(兼容省/自治区/直辖市/特别行政区)
  883. province_pattern = re.compile(r'([^省]+省|.+自治区|北京市|上海市|天津市|重庆市|.+特别行政区)')
  884. province_match = province_pattern.search(address)
  885. province = province_match.group(1) if province_match else ""
  886. # 正则2:匹配城市(兼容市/自治州/地区/盟,且排除省份已匹配的部分)
  887. # 先去掉已匹配的省份,再匹配城市
  888. address_remain = address.replace(province, "").strip() if province else address.strip()
  889. city_pattern = re.compile(r'([^市]+市|.+自治州|.+地区|.+盟|^[^\d区县镇]+)')
  890. city_match = city_pattern.search(address_remain)
  891. city = city_match.group(1).strip() if city_match else ""
  892. # 兼容直辖市(如"北京市朝阳区"→city=北京市)
  893. if province in ["北京市", "上海市", "天津市", "重庆市"]:
  894. city = province
  895. # 兼容地址不规范的情况(如"福建福州马尾区",无"省"/"市"字)
  896. if not province and not city:
  897. # 匹配前两个地名(如"福建福州"→province=福建,city=福州)
  898. simple_pattern = re.compile(r'^([^\d区县镇]+)')
  899. simple_match = simple_pattern.search(address)
  900. if simple_match:
  901. city = simple_match.group(1).strip() # 只有城市,省份留空
  902. if city and province in city:
  903. city = city.replace(province, "").strip()
  904. return province.strip(), city.strip()
  905. #采集数据核心
  906. def collect_data(page, keyword, keyword_dict):
  907. """
  908. 1) 先获取当前页商品个数(count)
  909. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  910. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  911. """
  912. collect_result = []
  913. # seen = set()
  914. # ========== 初始化异常关键词存储 ==========
  915. error_keywords = []
  916. kw = '' # 单个异常关键词变量
  917. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  918. page.wait_for_load_state("networkidle")
  919. #没有找到商品就跳过这个商品
  920. page_no = 1
  921. while True:
  922. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  923. # 记录列表页URL(可用于你后续兜底)
  924. list_page_url = page.url
  925. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  926. # ✅ 先获取当前页商品个数
  927. page.wait_for_load_state("networkidle")
  928. total_limit = page.locator(PRODUCT_ITEM_SELECTOR).count()
  929. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  930. #获取该商品的总个数
  931. total_goods_nums_elem = page.locator("div.sr-page_turner-pagination-total")
  932. if total_goods_nums_elem.count() > 0:
  933. total_goods_nums = total_goods_nums_elem.inner_text().strip()
  934. logger.info(f"📌 「{keyword} 商品个数(count):{total_goods_nums}")
  935. else:
  936. logger.info(f"📌 「{keyword} 商品个数(count):不超过60个")
  937. # 重置当前页的采集计数
  938. collected_count = 0
  939. # ========= 初始化无匹配计数器(记录标题不包含核心关键词的次数) =========
  940. # no_match_count = 0 # 无匹配次数初始化为0
  941. # MAX_NO_MATCH = 10 # 最大无匹配次数阈值
  942. #补充没找到关键词的兜底
  943. not_found_keywords = page.locator("span:has-text('新品登记')")
  944. if not_found_keywords.count() > 0:
  945. logger.warning(f"⚠️ 关键词「{keyword}」无匹配商品,直接跳过整个关键词采集")
  946. return []
  947. for idx in range(total_limit):
  948. detail_page = None
  949. try:
  950. item = page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  951. collected_count += 1 # 实际采集计数(用于日志)
  952. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  953. page.wait_for_load_state("networkidle")
  954. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  955. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  956. # 1. 初始化所有字段默认值
  957. title = "无标题"
  958. price = "0.00"
  959. shop = "无店名"
  960. expiry_date = "无有效期"
  961. manufacture_date = "无生产日期"
  962. approval_number = "无批准文号"
  963. manufacturer = "未知公司"
  964. # discount_price = "0.00"
  965. spec = "未知规格"
  966. num = 1 # ✅ 默认 1
  967. platform = '药九九'
  968. current_time = datetime.now().strftime("%Y-%m-%d")
  969. is_sold_out = 0
  970. # ========= 售罄不跳过 =========
  971. sold_locator = item.locator('div.gc-l1-cirle_tip')
  972. if sold_locator.count() > 0:
  973. is_sold_out = 1
  974. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  975. # if collected_count % 5 == 0 and collected_count > 0:
  976. # logger.info("采满5个往下滑")
  977. # slow_scroll_400px(page)
  978. # page.wait_for_load_state("networkidle")
  979. # continue
  980. # 提取商品标题(处理空值)
  981. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  982. if product_locator.count() > 0:
  983. title = product_locator.inner_text(timeout=3000).strip()
  984. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  985. else:
  986. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  987. #关键词不在标题中,跳过当前商品
  988. # core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
  989. # if core_keyword not in title:
  990. # no_match_count += 1
  991. # logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含核心关键词「{core_keyword}」(无匹配次数:{no_match_count}/{MAX_NO_MATCH}),跳过本次循环")
  992. # continue
  993. # if no_match_count >= MAX_NO_MATCH:
  994. # logger.error(f"❌ 关键词「{keyword}」无匹配商品次数已达{MAX_NO_MATCH}次,直接终止当前关键词采集,进入下一个关键词")
  995. # return []
  996. # 提取价格(带缺失日志)
  997. price_locator = item.locator(PRODUCT_PRICE_SELECTOR).nth(0)
  998. if price_locator.count() > 0:
  999. price = price_locator.inner_text(timeout=3000).strip()
  1000. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页采购价格:{price}{'='*10}")
  1001. else:
  1002. price = "0.00" # 初始化默认值,避免后续报错
  1003. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  1004. # 5. 提取公司名称(带缺失日志)
  1005. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  1006. if manufacturer_locator.count() > 0:
  1007. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  1008. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  1009. else:
  1010. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  1011. #提取店铺名称
  1012. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  1013. if shop_locator.count() > 0:
  1014. shop = shop_locator.inner_text(timeout=3000).strip()
  1015. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  1016. else:
  1017. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  1018. #提取折扣价
  1019. discount_price_val_origin = ""
  1020. discount_price = ""
  1021. discount_price_locator = item.locator('span.gc-l2-discount_price').first
  1022. if discount_price_locator.count() > 0:
  1023. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  1024. discount_price_val_origin = discount_price
  1025. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  1026. discount_price_val = float(match.group()) if match else 0.00
  1027. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  1028. else:
  1029. #如果没有拿原价替换
  1030. price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "") else "0.00"
  1031. discount_price_val = float(price)
  1032. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  1033. merged_price = f"{price}{discount_price_val_origin}" if discount_price_val_origin else price
  1034. # ========= 模拟点击商品进入详情页 =========
  1035. logger.info(
  1036. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  1037. )
  1038. # 点击商品项容器,触发详情展示
  1039. # ========== 点击商品跳详情页 ==========
  1040. # 反爬:模拟真人鼠标移动到商品上再点击(不是直接点击)
  1041. item.hover() # 先悬停
  1042. random_delay(0.2, 0.5) # 悬停后延迟
  1043. item.dispatch_event("mousedown")
  1044. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1045. item.dispatch_event("mouseup")
  1046. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1047. try:
  1048. with page.context.expect_page(timeout=60000) as p:
  1049. item.click(delay=random.uniform(0.1, 0.3))
  1050. detail_page = p.value
  1051. except PlaywrightTimeoutError:
  1052. logger.warning(
  1053. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  1054. )
  1055. detail_page = None # 标记为无新标签页,避免关闭列表页
  1056. # 等待详情加载(优先用新标签页,无则用列表页)
  1057. target_page = detail_page if detail_page else page
  1058. target_page.wait_for_load_state("networkidle", timeout=20000)
  1059. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  1060. logger.info(
  1061. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  1062. )
  1063. # 反爬:检测详情页反爬验证
  1064. # check_anti_crawl(page)
  1065. # ========== 采集详情页的专属信息(有效期/生产日期/批准文号) ==========
  1066. #获取商品详情页链接
  1067. product_link = target_page.url
  1068. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  1069. #如果有需要,还可能要加兜底。
  1070. # ========= ✅ 去重逻辑,拿商品链接和折扣价和有效期和采集日期 =========
  1071. if check_dup_in_biz_db(product_link, discount_price_val, current_time,keyword_dict):
  1072. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过")
  1073. # ========== 关闭新标签页,切回列表页 ==========
  1074. if detail_page and not detail_page.is_closed():
  1075. detail_page.close() # 关闭详情页标签
  1076. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1077. # 切回原列表页(第一个标签页)
  1078. page.bring_to_front() # 激活列表页
  1079. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1080. random_delay(0.5, 1.0) # 增加切换后延迟
  1081. page.wait_for_load_state("networkidle")
  1082. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1083. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1084. if collected_count % 5 == 0 and collected_count > 0:
  1085. logger.info("采满5个往下滑")
  1086. slow_scroll_400px(page)
  1087. page.wait_for_load_state("networkidle")
  1088. continue
  1089. # key = f"{product_link.strip()}|{discount_price_val}"
  1090. # if key in seen:
  1091. # logger.warning(
  1092. # f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过"
  1093. # )
  1094. # if collected_count % 5 == 0 and collected_count > 0:
  1095. # logger.info("采满15个往下滑")
  1096. # slow_scroll_400px(page)
  1097. # page.wait_for_load_state("networkidle")
  1098. # continue
  1099. # seen.add(key)
  1100. # 提取有效期(处理空值)
  1101. expiry_date_locator = target_page.locator("//span[contains(text(), '有效期')]/following-sibling::span[contains(@class, 'gdb-desc-value4')]")
  1102. if expiry_date_locator.count() > 0:
  1103. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip().replace('-', '') #.replace('近效期','')
  1104. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  1105. else:
  1106. # 修复:替换未定义的i为collected_count
  1107. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  1108. # 提取生产日期(修复完成)
  1109. manufacture_date_locator = target_page.locator("//span[@class='gdb-desc-label' and text()='生产日期']/following-sibling::span[1]")
  1110. if manufacture_date_locator.count() > 0:
  1111. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip().replace('-', "")
  1112. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  1113. else:
  1114. # 修复:替换未定义的i为collected_count
  1115. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  1116. # 提取批准文号
  1117. approval_number_locator = target_page.locator("//span[contains(text(), '国药准字')]").first
  1118. if approval_number_locator.count() > 0:
  1119. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  1120. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  1121. else:
  1122. # 修复:替换未定义的i为collected_count
  1123. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  1124. #提取规格
  1125. spec_locator = target_page.locator('span.gddd-params_text_line_1[title]')
  1126. if spec_locator.count() > 0:
  1127. spec = spec_locator.nth(2).inner_text(timeout=3000).strip()
  1128. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  1129. else:
  1130. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1131. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  1132. #提取库存
  1133. storage = ''
  1134. storage_locator = target_page.locator("span.gdb-desc-value7")
  1135. if storage_locator.count() > 0:
  1136. storage = storage_locator.inner_text(timeout=3000).strip()
  1137. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页库存:{storage}{'='*10}")
  1138. else:
  1139. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 库存元素数量不足,使用默认值:{storage}")
  1140. #提取销量
  1141. sell = ''
  1142. sell_locator = target_page.locator('.has-join-group span.packUnit-class')
  1143. if sell_locator.count() > 0:
  1144. sell = sell_locator.inner_text(timeout=3000).strip()
  1145. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页销量:{sell}{'='*10}")
  1146. else:
  1147. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 没有销量元素,使用默认值:{sell}")
  1148. oss_url = ""
  1149. try:
  1150. local_path, oss_url = screenshot_target_page_to_local_then_oss(
  1151. target_page=target_page,
  1152. full_page=True # 截取全屏
  1153. )
  1154. logger.info("详情页快照已上传 | local=%s | oss=%s", local_path, oss_url)
  1155. except Exception as e:
  1156. logger.warning("详情页快照上传失败:%s", e)
  1157. # input("...")
  1158. # if shop_is_exists_database(shop):
  1159. # continue
  1160. # province = ""
  1161. # city = ""
  1162. # business_license_company = ""
  1163. # qualification_number = ''
  1164. # input('....')
  1165. shop_exists, shop_info = shop_is_exists_database(shop)
  1166. #店铺名不是药品预约中心且店铺名不在数据库就要点击
  1167. if shop != "药品预约中心" and not shop_exists :
  1168. logger.info("店铺名不是药品预约中心且数据库没有该公司的营业执照")
  1169. # 获取营业执照图片 li[data-v-4f79abe8].nth(2)
  1170. # 进入店铺
  1171. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1172. entershop_btn = target_page.locator('[data-v-c5790f48].btn-text')
  1173. # 增强:先等待进入店铺按钮可见
  1174. entershop_btn.wait_for(state="visible", timeout=10000)
  1175. entershop_btn.scroll_into_view_if_needed() # 确保按钮在视口内
  1176. entershop_btn.hover() # 先悬停
  1177. random_delay(0.2, 0.5) # 悬停后延迟
  1178. entershop_btn.click()
  1179. # entershop_btn.dispatch_event("mousedown")
  1180. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1181. # entershop_btn.dispatch_event("mouseup")
  1182. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1183. target_page.wait_for_load_state("domcontentloaded") # 等DOM加载(比networkidle更适合页面内切换)
  1184. #点击店铺资质
  1185. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1186. shop_license_page = target_page.locator('li:has-text("店铺资质")')
  1187. shop_license_page.wait_for(state="visible", timeout=10000) # 等待元素加载完成
  1188. shop_license_page.hover() # 先悬停
  1189. random_delay(0.2, 0.5) # 悬停后延迟
  1190. # shop_license_page.dispatch_event("mousedown")
  1191. shop_license_page.click()
  1192. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1193. # shop_license_page.dispatch_event("mouseup")
  1194. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1195. target_page.wait_for_load_state("networkidle")
  1196. slow_scroll_400px(target_page, scroll_distance1=700)
  1197. #获取营业执照图片
  1198. target_page.wait_for_load_state("load")
  1199. ocr_res = None
  1200. shop_license_div = target_page.locator('div.shop-licensesImg').nth(0)
  1201. shop_license_div.wait_for(state="visible", timeout=60000)
  1202. shop_license_img = shop_license_div.locator('img')
  1203. try:
  1204. if shop_license_img.count() > 0:
  1205. shop_license_src = shop_license_img.get_attribute('src')
  1206. shop_license_src = shop_license_src.strip() if shop_license_src else None
  1207. ocr_res = get_ocr_res(shop_license_src)
  1208. # print(f'ocr_res:{ocr_res}')
  1209. else:
  1210. shop_license_src = None
  1211. except Exception as e:
  1212. # 捕获定位/提取失败的异常,避免程序崩溃
  1213. logger.warning(f"提取营业执照图片src失败:{e}")
  1214. shop_license_src = None
  1215. print("营业执照图片链接:", shop_license_src)
  1216. # input("..")
  1217. contact_address = ''
  1218. qualification_number = ocr_res.get('社会信用代码', '') if ocr_res else ''
  1219. business_license_company = ocr_res.get('单位名称', '') if ocr_res else ''
  1220. business_license_address = ocr_res.get('地址', '') if ocr_res else ''
  1221. # scrape_date = ''
  1222. # 调用提取函数,获取省份和城市
  1223. province, city = extract_province_city(business_license_address)
  1224. logger.info(f"原始地址:{business_license_address}")
  1225. logger.info(f"提取的省份:{province} | 城市:{city}")
  1226. insert_result = insert_shop_info_to_db(
  1227. shop=shop,
  1228. contact_address=contact_address,
  1229. qualification_number=qualification_number,
  1230. business_license_company=business_license_company,
  1231. business_license_address=business_license_address,
  1232. scrape_date=current_time,
  1233. platform=platform,
  1234. province=province,
  1235. city=city,
  1236. create_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
  1237. update_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  1238. )
  1239. else:
  1240. logger.info("数据库有该店名,在数据库拿取对应字段填充yjj_drug_middle表")
  1241. province, city = "", ""
  1242. business_license_company, qualification_number = "", ""
  1243. if shop_info:
  1244. province = shop_info["province"]
  1245. city = shop_info["city"]
  1246. business_license_company = shop_info["business_license_company"]
  1247. qualification_number = shop_info["qualification_number"]
  1248. # purchase_price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "").isdigit() else 0.00
  1249. # ========== 关闭新标签页,切回列表页 ==========
  1250. if detail_page and not detail_page.is_closed():
  1251. detail_page.close() # 关闭详情页标签
  1252. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1253. # 切回原列表页(第一个标签页)
  1254. page.bring_to_front() # 激活列表页
  1255. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1256. random_delay(0.5, 1.0) # 增加切换后延迟
  1257. page.wait_for_load_state("networkidle")
  1258. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1259. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1260. # credit_code = ""
  1261. availability = ""
  1262. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  1263. # 构造单条数据元组(适配MySQL字段)
  1264. single_data = {
  1265. # 核心商品信息
  1266. "product": title, # 商品名称
  1267. "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  1268. "min_price": discount_price_val, # 最低价格
  1269. "manufacture_date": manufacture_date, # 生产日期
  1270. "expiry_date": expiry_date, # 有效期
  1271. "shop": shop, # 店铺名
  1272. "business_license_company": business_license_company, # 营业执照主体(公司名称)
  1273. "province": province, # 省份
  1274. "city": city, # 城市
  1275. "manufacturer": manufacturer, # 生产厂家
  1276. "specification": spec, # 规格
  1277. "approval_number": approval_number, # 批准文号
  1278. "product_link": product_link, # 商品链接
  1279. "scrape_date": current_time, # 采集日期
  1280. "scrape_province": "", # 采集省份(可留空或根据IP获取)
  1281. "availability": availability, # 库存状态
  1282. "credit_code": qualification_number, # 统一信用代码(如有可补充提取)
  1283. "platform": platform, # 平台名称(固定或动态获取)
  1284. "search_key": keyword, # 搜索关键词
  1285. "number": num, # 数量(盒数)
  1286. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  1287. "sales": sell, #销量
  1288. "inventory": storage, #库存
  1289. "snapshot_url": oss_url, #快照链接
  1290. "update_time": time.strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  1291. "insert_time": time.strftime("%Y-%m-%d %H:%M:%S"), # 创建时间
  1292. "task_id": keyword_dict["id"], # 任务id
  1293. "company_id": keyword_dict["company_id"], # 平台id
  1294. "product_brand": keyword_dict["product_brand"], # 创建时间
  1295. "search_name": keyword_dict["product_name"], # 创建时间
  1296. "collect_config_info": json.dumps(
  1297. {"sampling_cycle": keyword_dict["sampling_cycle"], "sampling_start_time":keyword_dict["sampling_start_time"],
  1298. "sampling_end_time":keyword_dict["sampling_end_time"]}),
  1299. "collect_equipment_account_id": keyword_dict["collect_equipment_account_id"],
  1300. "collect_region_id": keyword_dict["collect_region_id"],
  1301. "collect_round": keyword_dict["collect_round"],
  1302. }
  1303. # 调用逐条插入函数
  1304. insert_single_to_mysql(single_data)
  1305. collect_result.append(single_data)
  1306. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  1307. except Exception as e:
  1308. kw = keyword
  1309. # 2. 把kw添加到异常关键词数组(去重,避免重复添加)
  1310. if kw not in error_keywords:
  1311. error_keywords.append(kw)
  1312. # 异常处理:关闭详情页,强制切回列表页
  1313. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  1314. try:
  1315. if detail_page and not detail_page.is_closed():
  1316. detail_page.close()
  1317. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  1318. if page and not page.is_closed():
  1319. page.bring_to_front() # 切回列表页
  1320. page.wait_for_load_state("networkidle")
  1321. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1322. except Exception as e2:
  1323. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  1324. continue
  1325. # ✅ 每15次滚动一次(修复:用collected_count,且排除0的情况)
  1326. if collected_count % 5 == 0 and collected_count > 0 and collected_count != total_limit:
  1327. logger.info("采满5个往下滑")
  1328. slow_scroll_400px(page,)
  1329. page.wait_for_load_state("networkidle")
  1330. # ====== 当前页采集完毕,尝试翻页 ======
  1331. delay = random_delay(1.5, 3.0)
  1332. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  1333. if goto_next_page(page):
  1334. page_no += 1
  1335. continue
  1336. else:
  1337. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  1338. break
  1339. # 关键词采集完成后长延迟
  1340. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  1341. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  1342. result_data = {
  1343. "real_count": len(collect_result),
  1344. "end_page": page_no,
  1345. "start_page": 1,
  1346. }
  1347. return result_data
  1348. def _call_report_api(data):
  1349. """调用上报接口"""
  1350. try:
  1351. url = 'https://scheduleapi.findit.ltd/api/collect_equipment_execute/result_report'
  1352. resp = requests.post(url, json=data, timeout=10)
  1353. if resp.status_code == 200:
  1354. logger.info(f"任务 {data['collect_task_allocate_id']} 上报成功")
  1355. # self.loggerMT.info(f"任务 {data['collect_task_allocate_id']} 上报成功")
  1356. else:
  1357. logger.info(f"任务 {data['collect_task_allocate_id']} 上报失败: {resp.status_code}")
  1358. # self.loggerMT.info(f"任务 {data['collect_task_allocate_id']} 上报失败: {resp.status_code}")
  1359. except Exception as e:
  1360. logger.info(f"上报接口调用异常: {e}")
  1361. # ==================== 保存到CSV函数(适配新表头) ====================
  1362. # def save_to_csv(data_list):
  1363. # """
  1364. # 保存数据到CSV(适配新表头)
  1365. # :param data_list: list - 采集到的字典数据列表
  1366. # :return: bool - 保存是否成功
  1367. # """
  1368. # if not data_list:
  1369. # logger.warning(" 无数据可保存到CSV")
  1370. # return False
  1371. # try:
  1372. # # 判断文件是否存在,不存在则写入表头
  1373. # file_exists = os.path.exists(CSV_FILE_PATH)
  1374. # # 打开CSV文件(追加模式,utf-8-sig避免Excel乱码)
  1375. # with open(CSV_FILE_PATH, "a", newline="", encoding="utf-8-sig") as f:
  1376. # # 用新表头作为字段名
  1377. # writer = csv.DictWriter(f, fieldnames=CSV_HEADERS)
  1378. # # 首次写入表头
  1379. # if not file_exists:
  1380. # writer.writeheader()
  1381. # logger.info(f" 已创建CSV文件并写入新表头:{CSV_FILE_PATH}")
  1382. # # 写入数据行
  1383. # writer.writerows(data_list)
  1384. # logger.info(f" 成功将 {len(data_list)} 条数据写入CSV")
  1385. # return True
  1386. # except Exception as e:
  1387. # logger.error(f" 保存CSV失败:{str(e)}")
  1388. # return False
  1389. # ==================== 主函数(登录+批量搜索) ====================
  1390. def main():
  1391. logger.info("\n" + "="*50)
  1392. logger.info("🚀 药九九采集程序启动")
  1393. logger.info(f"⏰ 启动时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  1394. logger.info("="*50)
  1395. # 待搜索的关键词列表(直接写在这里,改起来更直观)
  1396. # 存储所有关键词的采集数据
  1397. # all_collect_data = []
  1398. with sync_playwright() as p:
  1399. # browser = init_browser_with_proxy(p)
  1400. # 启动浏览器(用单个配置变量)
  1401. browser = p.chromium.launch(
  1402. headless=False,
  1403. channel="chrome",
  1404. slow_mo=random.randint(100, 300),
  1405. args=_chromium_launch_args(),
  1406. )
  1407. # 创建页面时伪装指纹
  1408. context = browser.new_context(
  1409. locale="zh-CN", # 中文环境
  1410. timezone_id="Asia/Shanghai", # 上海时区
  1411. geolocation={"latitude": 31.230416, "longitude": 121.473701}, # 模拟上海地理位置(可选)
  1412. permissions=["geolocation"], # 授予定位权限(模拟真人)
  1413. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  1414. viewport={"width": 1600, "height": 1400},
  1415. # 关键:隐藏自动化特征
  1416. java_script_enabled=True,
  1417. bypass_csp=True,
  1418. # user_data_dir="./temp_user_data" # 模拟真实用户数据目录
  1419. )
  1420. page = context.new_page()
  1421. # 关键:移除navigator.webdriver标识(反爬核心)
  1422. page.add_init_script("""
  1423. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  1424. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
  1425. Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
  1426. window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
  1427. delete window.navigator.languages;
  1428. window.navigator.languages = ['zh-CN', 'zh'];
  1429. // 新增:模拟真实鼠标移动特征
  1430. (() => {
  1431. const originalAddEventListener = EventTarget.prototype.addEventListener;
  1432. EventTarget.prototype.addEventListener = function(type, listener) {
  1433. if (type === 'mousemove') {
  1434. return originalAddEventListener.call(this, type, (e) => {
  1435. e._automation = undefined;
  1436. listener(e);
  1437. });
  1438. }
  1439. return originalAddEventListener.call(this, type, listener);
  1440. };
  1441. })();
  1442. """)
  1443. keyword_dict = None
  1444. start_time = None
  1445. try:
  1446. # ========== 核心:Cookie复用逻辑 ==========
  1447. # 1. 加载本地Cookie
  1448. load_cookies(context)
  1449. # 2. 验证登录状态
  1450. if not is_login(page):
  1451. # 3. Cookie失效/不存在,执行登录
  1452. page.goto(TARGET_LOGIN_URL)
  1453. time.sleep(5)
  1454. page.wait_for_load_state("networkidle")
  1455. logger.info("🔑 开始执行登录流程")
  1456. # 执行登录操作
  1457. login_success = login_operation(page, USERNAME, PASSWORD)
  1458. if not login_success:
  1459. logger.error(" 登录失败,程序终止")
  1460. return
  1461. # 4. 登录成功后保存Cookie
  1462. save_cookies(context)
  1463. logger.info(" 登录并保存Cookie成功!")
  1464. logger.info(f"开始处理:SEARCH_KEYWORDS")
  1465. # 2. 批量搜索+采集+保存
  1466. for keyword_idx, keyword_dict in enumerate(SEARCH_KEYWORDS, 1):
  1467. # 准备上报数据
  1468. report_data = {
  1469. "collect_task_allocate_id": keyword_dict["id"],
  1470. "status": 2,
  1471. }
  1472. start_time = time.time()
  1473. logger.info(f"上报开始接口:{keyword_dict}")
  1474. _call_report_api(report_data)
  1475. keyword =keyword_dict["product_brand"] + keyword_dict["product_name"] + keyword_dict["product_specs"]
  1476. logger.info(f"\n=====================================")
  1477. logger.info(f"🔍 开始处理第{keyword_idx}/{len(SEARCH_KEYWORDS)}个关键词:{keyword}")
  1478. logger.info(f"=====================================")
  1479. # 执行搜索
  1480. popup_guard(page, "before_search")
  1481. search_success = search_operation(page, keyword)
  1482. # input("")
  1483. popup_guard(page, "after_search")
  1484. if not search_success:
  1485. logger.warning(f" 「{keyword}」搜索失败,跳过采集")
  1486. continue
  1487. # ✅ 再等页面稳定一下(networkidle 有时会等不到,建议加超时或换成 domcontentloaded)
  1488. page.wait_for_load_state("domcontentloaded")
  1489. page.wait_for_load_state('networkidle')
  1490. # 采集数据
  1491. data_list = collect_data(page, keyword, keyword_dict)
  1492. # 准备上报数据
  1493. report_data = {
  1494. "collect_task_allocate_id": keyword_dict["id"],
  1495. "status": 3,
  1496. "start_time": start_time,
  1497. "end_time": time.time(),
  1498. "finish_status": 1,
  1499. "real_count": 0,
  1500. "start_page": data_list["start_page"],
  1501. "end_page": data_list["end_page"],
  1502. }
  1503. _call_report_api(report_data)
  1504. # # 保存到CSV
  1505. # if data_list:
  1506. # save_to_csv(data_list)
  1507. # else:
  1508. # logger.warning(f" 「{keyword}」无数据,跳过保存")
  1509. logger.info("\n🎉 所有关键词处理完成!CSV文件路径:" + os.path.abspath(CSV_FILE_PATH))
  1510. # input("\n按回车关闭程序...")
  1511. except Exception as e:
  1512. if keyword_dict is not None:
  1513. report_data = {
  1514. "collect_task_allocate_id": keyword_dict["id"],
  1515. "status": 4,
  1516. "start_time": start_time if start_time is not None else time.time(),
  1517. "end_time": time.time(),
  1518. "finish_status": 0,
  1519. }
  1520. _call_report_api(report_data)
  1521. logger.exception("程序异常:%s", e)
  1522. finally:
  1523. browser.close()
  1524. logger.info(" 浏览器已关闭,程序结束")
  1525. # ==================== 程序入口 ====================
  1526. if __name__ == '__main__':
  1527. def run_collection():
  1528. """执行采集任务"""
  1529. try:
  1530. print(f"【定时任务开始】时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  1531. main()
  1532. print(f"【定时任务结束】时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  1533. except Exception as e:
  1534. print(f"【定时任务异常】: {e}")
  1535. # 设置定时任务
  1536. schedule.every(10).minutes.do(run_collection)
  1537. # 立即执行一次
  1538. run_collection()
  1539. print("定时任务已设置,每 10 分钟执行一次采集")
  1540. # 循环执行
  1541. while True:
  1542. schedule.run_pending()
  1543. time.sleep(60) # 每分钟检查一次