get_url_test.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. import os
  3. import json
  4. import random
  5. from logger_config import logger
  6. from config import *
  7. import re
  8. import pymysql
# Path of the JSON file used to store cookies.
COOKIE_FILE_PATH = "ybm_cookies.json"
# Page visited by is_login() to check whether the session is still logged in.
LOGIN_VALIDATE_URL = "https://www.ybm100.com/new/"
# Login page that main() opens when the login check fails.
TARGET_LOGIN_URL = "https://www.ybm100.com/new/login"
  12. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  13. """从本地JSON文件加载Cookie到浏览器上下文"""
  14. if not os.path.exists(cookie_path):
  15. # logger.warning(f" Cookie文件不存在:{cookie_path}")
  16. return False
  17. try:
  18. with open(cookie_path, "r", encoding="utf-8") as f:
  19. cookies = json.load(f)
  20. context.add_cookies(cookies)
  21. # logger.info(f"✅ 已从{cookie_path}加载Cookie")
  22. return True
  23. except Exception as e:
  24. # logger.error(f" 加载Cookie失败:{e}")
  25. return False
  26. def is_login(page):
  27. """验证是否已登录(核心:检测登录态)"""
  28. try:
  29. # 访问需要登录的页面
  30. page.goto(LOGIN_VALIDATE_URL, timeout=5000)
  31. page.wait_for_load_state("networkidle")
  32. # 检测是否跳转到登录页(URL包含login则未登录)
  33. if "login" in page.url.lower():
  34. # logger.warning(" Cookie失效,需要重新登录")
  35. return False
  36. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  37. # if page.locator("用户中心选择器").count() > 0:
  38. # return True
  39. # logger.info(" Cookie有效,已保持登录状态")
  40. return True
  41. except Exception as e:
  42. # logger.error(f" 验证登录状态失败:{e}")
  43. return False
  44. def popup_guard(page, tag=""):
  45. """
  46. 全局弹窗/遮罩守卫:多步引导 + 关闭按钮 + 遮罩清理 + 恢复滚动
  47. tag 仅用于日志区分调用位置
  48. """
  49. try:
  50. # 给弹窗一点出现时间
  51. page.wait_for_timeout(300)
  52. # 1) 连续点“下一步/完成/我知道了/关闭”
  53. for _ in range(6):
  54. btn = page.locator(
  55. "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  56. ).first
  57. if btn.count() > 0 and btn.is_visible():
  58. btn.click(timeout=1500)
  59. page.wait_for_timeout(250)
  60. continue
  61. # 2) 常见的 close icon
  62. close_btn = page.locator(
  63. "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
  64. ).first
  65. if close_btn.count() > 0 and close_btn.is_visible():
  66. close_btn.click(timeout=1200)
  67. page.wait_for_timeout(250)
  68. continue
  69. break
  70. # 3) 清遮罩 + 恢复滚动/交互
  71. page.evaluate(r"""
  72. () => {
  73. // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
  74. const selectors = [
  75. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  76. '.el-message-box__wrapper', '.el-loading-mask'
  77. ];
  78. selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
  79. // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
  80. const all = Array.from(document.querySelectorAll('body *'));
  81. for (const el of all) {
  82. const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
  83. const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
  84. // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
  85. if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
  86. const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
  87. // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
  88. const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
  89. if (nearFull) {
  90. el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
  91. el.style.display = 'none'; // 隐藏元素
  92. }
  93. }
  94. }
  95. // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
  96. document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
  97. document.body.style.overflow = 'auto'; // body标签恢复滚动
  98. document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
  99. }
  100. """)
  101. # logger.info("杀除弹窗成功")
  102. except Exception:
  103. pass
  104. SEARCH_INPUT_SELECTOR = "input[placeholder*='药品名称/厂家名称']"
  105. def pick_search_input(page):
  106. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  107. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  108. cnt = inputs.count()
  109. # 优先检查前两个(你说只有两个)
  110. for i in range(min(cnt, 2)):
  111. candidate = inputs.nth(i)
  112. try:
  113. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  114. if candidate.is_enabled():
  115. return candidate
  116. except PlaywrightTimeoutError:
  117. continue
  118. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  119. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  120. candidate.wait_for(state="visible", timeout=5000)
  121. return candidate
  122. def type_slow(locator, text: str, min_delay=0.06, max_delay=0.18):
  123. """逐字输入,模拟真人打字"""
  124. for ch in text:
  125. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
# CSS selector of the search button that search_operation() clicks.
SEARCH_BTN_SELECTOR = 'div.home-search-container-search-head-btn[data-scmd="text-搜索"]'
  127. def force_close_popup(page):
  128. """关闭新手引导/遮罩(多步:下一步/完成/我知道了),并兜底移除遮罩层"""
  129. try:
  130. # 1) 尝试连续点“下一步/完成/我知道了/关闭”
  131. for _ in range(5): # 最多点5次,足够覆盖多步引导
  132. btn = page.locator(
  133. "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  134. ).first
  135. if btn.count() > 0 and btn.is_visible():
  136. btn.click(timeout=1500)
  137. page.wait_for_timeout(300)
  138. continue
  139. # 有些引导是右上角 X(如果存在就点)
  140. close_icon = page.locator(
  141. "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
  142. ).first
  143. if close_icon.count() > 0 and close_icon.is_visible():
  144. close_icon.click(timeout=1000)
  145. page.wait_for_timeout(300)
  146. continue
  147. break
  148. # 2) 兜底:移除常见遮罩层(element-ui / 通用 mask/overlay)
  149. page.evaluate("""
  150. const selectors = [
  151. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  152. '[class*="mask"]', '[class*="overlay"]', '[style*="z-index"]'
  153. ];
  154. for (const sel of selectors) {
  155. document.querySelectorAll(sel).forEach(el => {
  156. const s = window.getComputedStyle(el);
  157. // 只移除“覆盖层”倾向的元素:fixed/absolute 且 z-index 很高
  158. if ((s.position === 'fixed' || s.position === 'absolute') && parseInt(s.zIndex || '0', 10) >= 1000) {
  159. el.remove();
  160. }
  161. });
  162. }
  163. """)
  164. except Exception:
  165. pass
  166. def kill_masks(page):
  167. """
  168. 强制清理残留遮罩层/覆盖层,并恢复 body 可滚动、可点击状态
  169. """
  170. page.evaluate(r"""
  171. () => {
  172. const removed = [];
  173. const hidden = [];
  174. // 1) 先处理已知常见遮罩
  175. const knownSelectors = [
  176. '.v-modal',
  177. '.el-overlay',
  178. '.el-overlay-dialog',
  179. '.el-dialog__wrapper',
  180. '.el-message-box__wrapper',
  181. '.el-loading-mask',
  182. '.el-popup-parent--hidden'
  183. ];
  184. for (const sel of knownSelectors) {
  185. document.querySelectorAll(sel).forEach(el => {
  186. // v-modal / overlay 直接 remove 最省事
  187. removed.push(sel);
  188. el.remove();
  189. });
  190. }
  191. // 2) 再做一次“泛化兜底”:全屏 fixed/absolute + 高 z-index 的覆盖层
  192. // 注意:不要误删页面正常的固定导航,所以加上“近似全屏”的判断
  193. const all = Array.from(document.querySelectorAll('body *'));
  194. for (const el of all) {
  195. const s = window.getComputedStyle(el);
  196. if (!s) continue;
  197. const z = parseInt(s.zIndex || '0', 10);
  198. const pos = s.position;
  199. const pe = s.pointerEvents;
  200. if ((pos === 'fixed' || pos === 'absolute') && z >= 1000 && pe !== 'none') {
  201. const r = el.getBoundingClientRect();
  202. const nearFullScreen =
  203. r.width >= window.innerWidth * 0.8 &&
  204. r.height >= window.innerHeight * 0.8 &&
  205. r.left <= window.innerWidth * 0.1 &&
  206. r.top <= window.innerHeight * 0.1;
  207. // 常见遮罩是半透明背景色,或者透明但拦截点击
  208. const bg = s.backgroundColor || '';
  209. const looksLikeMask =
  210. nearFullScreen && (bg.includes('rgba') || bg.includes('rgb') || s.opacity !== '1');
  211. if (nearFullScreen) {
  212. // 不管透明不透明,只要近似全屏且高 z-index,就先让它不拦截点击
  213. el.style.pointerEvents = 'none';
  214. el.style.display = 'none';
  215. hidden.push(el.tagName + '.' + (el.className || ''));
  216. }
  217. }
  218. }
  219. // 3) 恢复 body / html 的滚动与交互(很多弹窗会锁滚动)
  220. document.documentElement.style.overflow = 'auto';
  221. document.body.style.overflow = 'auto';
  222. document.body.style.position = 'static';
  223. document.body.style.width = 'auto';
  224. document.body.style.paddingRight = '0px';
  225. // 4) 去掉 Element-UI 常见的锁定 class
  226. document.body.classList.remove('el-popup-parent--hidden');
  227. return { removed, hiddenCount: hidden.length, hidden };
  228. }
  229. """)
  230. # ==================== 搜索操作函数 ====================
  231. def search_operation(page, keyword):
  232. """搜索框填充+提交搜索"""
  233. try:
  234. # 1) 找到“可用”的搜索框(第一个不行就用第二个)
  235. search_locator = pick_search_input(page)
  236. # 清空并填充搜索框
  237. search_locator.wait_for(timeout=5000)
  238. # 2. 清空搜索框(双重保障:先调用locator的clear,再手动全选删除)
  239. search_locator.click() # 聚焦
  240. search_locator.fill("")
  241. page.keyboard.down("Control") # 按住Control键
  242. page.keyboard.press("a") # 按a键
  243. page.keyboard.up("Control") # 松开Control键
  244. page.keyboard.press("Backspace") # 删除选中内容
  245. # 3) 逐字输入
  246. type_slow(search_locator, keyword, min_delay=0.25, max_delay=0.50)
  247. # 3. 输入搜索关键词
  248. # search_locator.fill(keyword)
  249. logger.info(f"📝 已输入搜索关键词:{keyword}")
  250. # 3) 搜索按钮也建议点可见的那个
  251. btn = page.locator(f"{SEARCH_BTN_SELECTOR}:visible").first
  252. btn.wait_for(state="visible", timeout=5000)
  253. # btn.click()
  254. page.wait_for_timeout(600)
  255. #获取新页面对象
  256. try:
  257. # 先开始监听新页面事件(在点击前)
  258. with page.context.expect_page(timeout=60000) as new_page_info:
  259. # 再执行点击操作
  260. btn.click()
  261. # 点击后获取新页面
  262. detail_page = new_page_info.value
  263. detail_page.wait_for_load_state("networkidle", timeout=20000)
  264. except PlaywrightTimeoutError:
  265. logger.warning(f"未检测到新标签页")
  266. return False
  267. # force_close_popup(page)
  268. # kill_masks(page)
  269. logger.info("✅ 已触发搜索")
  270. detail_page.wait_for_load_state("networkidle", timeout=20000)
  271. test_btn = detail_page.locator("div[data-v-c65c36bc].first-time-highlight-message-btn button")
  272. btn_count = test_btn.count()
  273. logger.info(f"✅ 匹配到的元素数量:{btn_count}")
  274. test_btn.wait_for(state="attached", timeout=5000)
  275. test_btn.click()
  276. input("....")
  277. return detail_page, True
  278. # 搜索后等待结果加载
  279. # page.wait_for_timeout(COLLECT_DELAY)
  280. # return True
  281. except PlaywrightTimeoutError as e:
  282. logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
  283. return None, False # 失败时返回 (None, False)
  284. except Exception as e:
  285. logger.error(f" 搜索异常:{str(e)}")
  286. return None, False # 失败时返回 (None, False)
  287. def goto_next_page(page) -> bool:
  288. """
  289. 尝试翻到下一页;成功返回True,没下一页/翻页失败返回False
  290. 适配常见 ElementUI: .el-pagination .btn-next / .el-pagination__next
  291. """
  292. # 多写几个候选,哪个能用就用哪个
  293. candidates = [
  294. ".el-pagination button.btn-next:not(.is-disabled)",
  295. ".el-pagination__next:not(.is-disabled)",
  296. "button:has-text('下一页'):not([disabled])",
  297. "a:has-text('下一页')",
  298. ]
  299. next_btn = None
  300. for sel in candidates:
  301. loc = page.locator(sel).first
  302. if loc.count() > 0:
  303. next_btn = loc
  304. break
  305. if not next_btn:
  306. return False
  307. # 用“当前页第一个商品标题”做翻页完成的判据(比只等networkidle更稳)
  308. first_title = page.locator(PRODUCT_TITLE_SELECTOR).first
  309. before = ""
  310. try:
  311. if first_title.count() > 0:
  312. before = first_title.inner_text(timeout=2000).strip()
  313. except:
  314. pass
  315. try:
  316. page.evaluate("window.scrollTo(0, 0);")
  317. next_btn.click(timeout=5000)
  318. page.wait_for_load_state("networkidle")
  319. # 等列表发生变化(标题变了 / 或者至少第一个标题重新出现)
  320. if before:
  321. page.wait_for_function(
  322. """(sel, oldText) => {
  323. const el = document.querySelector(sel);
  324. return el && el.innerText && el.innerText.trim() !== oldText;
  325. }""",
  326. arg=(PRODUCT_TITLE_SELECTOR, before),
  327. timeout=5000
  328. )
  329. else:
  330. first_title.wait_for(timeout=1000)
  331. return True
  332. except Exception as e:
  333. logger.warning(f" 翻页失败:{e}")
  334. return False
  335. PRODUCT_ITEM_SELECTOR = "div.product-list-item"
  336. def collect_data(page, keyword):
  337. collect_result = []
  338. collected_count = 0 # ✅ 初始化计数变量
  339. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  340. page.wait_for_load_state("networkidle")
  341. page_no = 1
  342. while True:
  343. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  344. # ✅ 先获取当前页商品个数
  345. page.wait_for_load_state("networkidle")
  346. total_limit = page.locator(PRODUCT_ITEM_SELECTOR).count()
  347. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  348. for idx in range(total_limit):
  349. detail_page = None
  350. # total_limit += 1
  351. try:
  352. item = page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  353. collected_count += 1 # 实际采集计数(用于日志)
  354. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  355. page.wait_for_load_state("networkidle")
  356. # delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  357. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品")
  358. #获取product_id
  359. product_id = None
  360. #这里还得改
  361. child_item = item.locator("> [data-product-id]")
  362. product_id = child_item.get_attribute("data-product-id")
  363. if product_id:
  364. product_id = product_id.strip()
  365. logger.info(f"✅ 「{keyword}」第{collected_count}个商品 - 提取到product_id:{product_id}")
  366. else:
  367. logger.warning(f"没提取到{product_id}")
  368. product_url = f"https://www.ybm100.com/new/base/skuDetail?id={product_id}"
  369. print(product_url)
  370. db_match_result = fuzzy_match_product_url_in_db_mysql(product_url)
  371. if db_match_result:
  372. logger.info(f"✅ 「{keyword}」第{collected_count}个商品 - MySQL 匹配到URL,直接返回结果:{db_match_result}")
  373. print(db_match_result)
  374. else:
  375. # 4. 匹配不存在:准备执行后续点击提取流程
  376. logger.info(f"ℹ️ 「{keyword}」第{collected_count}个商品 - MySQL 未匹配到URL,执行点击提取")
  377. except Exception as e:
  378. logger.info("该商品链接没有在数据库,进行点击提取。")
  379. continue
  380. # ====== 当前页采集完毕,尝试翻页 ======
  381. delay = page.wait_for_timeout(5000)
  382. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  383. if goto_next_page(page):
  384. page_no += 1
  385. continue
  386. else:
  387. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  388. break
  389. def main():
  390. with sync_playwright() as p:
  391. browser = p.chromium.launch(
  392. headless=False, # 不要用无头模式(反爬:无头模式易被识别)
  393. channel="chrome", # 使用真实Chrome内核
  394. slow_mo=random.randint(100, 300), # 全局操作延迟(模拟真人慢速操作)
  395. args=[
  396. "--disable-blink-features=AutomationControlled", # 禁用webdriver特征(核心!)
  397. "--enable-automation=false", # 新增:禁用自动化标识
  398. "--disable-infobars", # 新增:禁用信息栏
  399. "--remote-debugging-port=0", # 新增:随机调试端口
  400. "--start-maximized", # 最大化窗口(模拟真人使用)
  401. "--disable-extensions", # 禁用扩展(避免特征)
  402. "--disable-plugins-discovery", # 禁用插件发现
  403. "--no-sandbox", # 避免沙箱模式特征
  404. "--disable-dev-shm-usage", # 避免内存限制导致的异常
  405. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36" # 随机Chrome版本的UA
  406. ]
  407. )
  408. # 创建页面时伪装指纹
  409. context = browser.new_context(
  410. locale="zh-CN", # 中文环境
  411. timezone_id="Asia/Shanghai", # 上海时区
  412. geolocation={"latitude": 31.230416, "longitude": 121.473701}, # 模拟上海地理位置(可选)
  413. permissions=["geolocation"], # 授予定位权限(模拟真人)
  414. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  415. viewport={"width": 1600, "height": 1400},
  416. # 关键:隐藏自动化特征
  417. java_script_enabled=True,
  418. bypass_csp=True,
  419. # user_data_dir="./temp_user_data" # 模拟真实用户数据目录
  420. )
  421. page = context.new_page()
  422. # 关键:移除navigator.webdriver标识(反爬核心)
  423. page.add_init_script("""
  424. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  425. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
  426. Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
  427. window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
  428. delete window.navigator.languages;
  429. window.navigator.languages = ['zh-CN', 'zh'];
  430. // 新增:模拟真实鼠标移动特征
  431. (() => {
  432. const originalAddEventListener = EventTarget.prototype.addEventListener;
  433. EventTarget.prototype.addEventListener = function(type, listener) {
  434. if (type === 'mousemove') {
  435. return originalAddEventListener.call(this, type, (e) => {
  436. e._automation = undefined;
  437. listener(e);
  438. });
  439. }
  440. return originalAddEventListener.call(this, type, listener);
  441. };
  442. })();
  443. """)
  444. try:
  445. # ========== 核心:Cookie复用逻辑 ==========
  446. # 1. 加载本地Cookie
  447. load_cookies(context)
  448. # 2. 验证登录状态
  449. if not is_login(page):
  450. # 3. Cookie失效/不存在,执行登录
  451. page.goto(TARGET_LOGIN_URL)
  452. page.wait_for_load_state("networkidle")
  453. # logger.info("🔑 开始执行登录流程")
  454. # 执行登录操作
  455. # login_success = login_operation(page, USERNAME, PASSWORD)
  456. # if not login_success:
  457. # logger.error(" 登录失败,程序终止")
  458. # return
  459. # # 4. 登录成功后保存Cookie
  460. # save_cookies(context)
  461. # logger.info(" 登录并保存Cookie成功!")
  462. KEYWORDS = get_search_keywords_from_db()
  463. # get_search_keywords_from_db()
  464. # 执行搜索
  465. total_num = 0
  466. for kw in KEYWORDS:
  467. popup_guard(page, "before_search")
  468. detail_page, search_success = search_operation(page, kw)
  469. if not search_success:
  470. print(f"❌ 搜索失败:{kw}")
  471. continue
  472. popup_guard(page, "after_search")
  473. detail_page.wait_for_load_state('networkidle')
  474. data_list = collect_data(detail_page, kw)
  475. #找不到数据跳过判断和出现杂数据跳过
  476. # not_found_keywords = page.locator("span:has-text('新品登记')")
  477. # if not_found_keywords.count() > 0:
  478. # logger.warning(f"⚠️ 关键词「{kw}」无匹配商品,直接跳过整个关键词采集")
  479. # continue
  480. # TARGET_SELECTOR = page.locator('div[data-v-4c22c8c9].sr-page_turner-pagination-total')
  481. # total_count = 0 # ⚠️ 每一轮关键词都重置
  482. # if TARGET_SELECTOR.count() > 0:
  483. # nums = TARGET_SELECTOR.inner_text(timeout=5000).strip()
  484. # print(nums)
  485. # match = re.search(r'\d+', nums)
  486. # if match:
  487. # total_count = int(match.group())
  488. # print(total_count)
  489. # else:
  490. # itme_boxes = page.locator("div[data-v-4c22c8c9].sr-list-item[data-item_loc]")
  491. # total_count = itme_boxes.count()
  492. # print(f"【{kw}】无分页,当前页盒子数:{total_count}")
  493. # total_num += total_count
  494. # print(f"截止到这个{kw}关键词有{total_num}条数据")
  495. # page.wait_for_timeout(10000)
  496. # print(f"✅ 本次采集总数据量:{total_num}")
  497. except Exception as e:
  498. print(f" 程序异常:{str(e)}")
  499. finally:
  500. browser.close()
  501. print(" 浏览器已关闭,程序结束")
# ==================== Program entry point ====================
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()