# nextpage.py (32 KB)
import json
import os
import random
import re
import time

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

# Local imports keep their original relative order so that any name exported
# by ``config`` still resolves exactly as before.
from logger_config import logger
from config import *
COOKIE_FILE_PATH = "ybm_cookies.json"  # Path of the JSON file cookies are loaded from
LOGIN_VALIDATE_URL = "https://www.ybm100.com/new/"
TARGET_LOGIN_URL = "https://www.ybm100.com/new/login"
# ========== Scroll settings (tune as needed) ==========
# NOTE: SCROLL_STEP is assigned again (to 50) further down in this file.
# Functions read the global when they are called, so that later value is the
# one in effect at run time.
SCROLL_STEP = 200  # Pixels per scroll step; smaller is slower and more human-like
SCROLL_DELAY = 0.15  # Delay per scroll step in seconds; larger is slower
MAX_SCROLL_ATTEMPTS = 50  # Upper bound on scroll iterations (prevents an endless loop)
  15. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  16. """从本地JSON文件加载Cookie到浏览器上下文"""
  17. if not os.path.exists(cookie_path):
  18. # logger.warning(f" Cookie文件不存在:{cookie_path}")
  19. return False
  20. try:
  21. with open(cookie_path, "r", encoding="utf-8") as f:
  22. cookies = json.load(f)
  23. context.add_cookies(cookies)
  24. # logger.info(f"✅ 已从{cookie_path}加载Cookie")
  25. return True
  26. except Exception as e:
  27. # logger.error(f" 加载Cookie失败:{e}")
  28. return False
  29. def is_login(page):
  30. """验证是否已登录(核心:检测登录态)"""
  31. try:
  32. # 访问需要登录的页面
  33. page.goto(LOGIN_VALIDATE_URL, timeout=5000)
  34. page.wait_for_load_state("networkidle")
  35. # 检测是否跳转到登录页(URL包含login则未登录)
  36. if "login" in page.url.lower():
  37. # logger.warning(" Cookie失效,需要重新登录")
  38. return False
  39. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  40. # if page.locator("用户中心选择器").count() > 0:
  41. # return True
  42. # logger.info(" Cookie有效,已保持登录状态")
  43. return True
  44. except Exception as e:
  45. # logger.error(f" 验证登录状态失败:{e}")
  46. return False
  47. def popup_guard(page, tag=""):
  48. """
  49. 全局弹窗/遮罩守卫:多步引导 + 关闭按钮 + 遮罩清理 + 恢复滚动
  50. tag 仅用于日志区分调用位置
  51. """
  52. try:
  53. # 给弹窗一点出现时间
  54. page.wait_for_timeout(300)
  55. # 1) 连续点“下一步/完成/我知道了/关闭”
  56. for _ in range(6):
  57. btn = page.locator(
  58. "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  59. ).first
  60. if btn.count() > 0 and btn.is_visible():
  61. btn.click(timeout=1500)
  62. page.wait_for_timeout(250)
  63. continue
  64. # 2) 常见的 close icon
  65. close_btn = page.locator(
  66. "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
  67. ).first
  68. if close_btn.count() > 0 and close_btn.is_visible():
  69. close_btn.click(timeout=1200)
  70. page.wait_for_timeout(250)
  71. continue
  72. break
  73. # 3) 清遮罩 + 恢复滚动/交互
  74. page.evaluate(r"""
  75. () => {
  76. // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
  77. const selectors = [
  78. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  79. '.el-message-box__wrapper', '.el-loading-mask'
  80. ];
  81. selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
  82. // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
  83. const all = Array.from(document.querySelectorAll('body *'));
  84. for (const el of all) {
  85. const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
  86. const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
  87. // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
  88. if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
  89. const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
  90. // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
  91. const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
  92. if (nearFull) {
  93. el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
  94. el.style.display = 'none'; // 隐藏元素
  95. }
  96. }
  97. }
  98. // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
  99. document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
  100. document.body.style.overflow = 'auto'; // body标签恢复滚动
  101. document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
  102. }
  103. """)
  104. # logger.info("杀除弹窗成功")
  105. except Exception:
  106. pass
  107. SEARCH_INPUT_SELECTOR = "input[placeholder*='药品名称/厂家名称']"
  108. def pick_search_input(page):
  109. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  110. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  111. cnt = inputs.count()
  112. # 优先检查前两个(你说只有两个)
  113. for i in range(min(cnt, 2)):
  114. candidate = inputs.nth(i)
  115. try:
  116. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  117. if candidate.is_enabled():
  118. return candidate
  119. except PlaywrightTimeoutError:
  120. continue
  121. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  122. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  123. candidate.wait_for(state="visible", timeout=5000)
  124. return candidate
  125. def type_slow(locator, text: str, min_delay=0.06, max_delay=0.18):
  126. """逐字输入,模拟真人打字"""
  127. for ch in text:
  128. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
  129. SEARCH_BTN_SELECTOR = 'div.home-search-container-search-head-btn[data-scmd="text-搜索"]'
  130. def force_close_popup(page):
  131. """增强版:优先清理遮罩,再处理引导按钮"""
  132. try:
  133. # 第一步:先强制移除所有遮罩层(前置操作,关键!)
  134. page.evaluate("""
  135. () => {
  136. // 1. 移除所有高优先级遮罩
  137. const maskSelectors = [
  138. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  139. '.el-message-box__wrapper', '.el-loading-mask', '[class*="mask"]', '[class*="overlay"]',
  140. '[style*="position: fixed"][style*="z-index: 9999"]', '[style*="position: absolute"][style*="z-index: 9999"]'
  141. ];
  142. maskSelectors.forEach(sel => {
  143. document.querySelectorAll(sel).forEach(el => {
  144. el.remove(); // 直接删除遮罩元素
  145. });
  146. });
  147. // 2. 恢复body和列表容器的交互
  148. document.body.style.overflow = 'auto';
  149. document.body.style.pointerEvents = 'auto';
  150. // 3. 强制解除列表容器的样式限制
  151. document.querySelectorAll('.product-list-container, .list-container, .el-table__body-wrapper').forEach(el => {
  152. el.style.overflow = 'auto !important';
  153. el.style.height = 'auto !important';
  154. el.style.maxHeight = 'calc(100vh - 200px) !important';
  155. el.style.pointerEvents = 'auto !important';
  156. });
  157. }
  158. """)
  159. page.wait_for_timeout(500)
  160. # 第二步:处理引导按钮(下一步/完成/关闭)
  161. for _ in range(5):
  162. btn = page.locator(
  163. "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  164. ).first
  165. if btn.count() > 0 and btn.is_visible():
  166. btn.click(timeout=1500)
  167. page.wait_for_timeout(300)
  168. continue
  169. close_icon = page.locator(
  170. "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
  171. ).first
  172. if close_icon.count() > 0 and close_icon.is_visible():
  173. close_icon.click(timeout=1000)
  174. page.wait_for_timeout(300)
  175. continue
  176. break
  177. except Exception as e:
  178. logger.warning(f"⚠️ 强制清理弹窗时异常:{e}")
  179. def kill_masks(page):
  180. """增强版:强制解除所有样式限制"""
  181. page.evaluate(r"""
  182. () => {
  183. // 1. 移除所有已知遮罩类
  184. const knownSelectors = [
  185. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  186. '.el-message-box__wrapper', '.el-loading-mask', '.el-popup-parent--hidden'
  187. ];
  188. knownSelectors.forEach(sel => {
  189. document.querySelectorAll(sel).forEach(el => el.remove());
  190. });
  191. // 2. 泛化清理所有高z-index遮挡层
  192. const all = Array.from(document.querySelectorAll('body *'));
  193. for (const el of all) {
  194. const s = getComputedStyle(el);
  195. if (!s) continue;
  196. const z = parseInt(s.zIndex || '0', 10);
  197. const pos = s.position;
  198. const pe = s.pointerEvents;
  199. if ((pos === 'fixed' || pos === 'absolute') && z >= 1000) {
  200. el.remove(); // 直接删除高优先级遮挡元素
  201. }
  202. }
  203. // 3. 强制恢复页面所有元素的交互和滚动
  204. document.documentElement.style.overflow = 'auto !important';
  205. document.body.style.overflow = 'auto !important';
  206. document.body.style.position = 'static !important';
  207. document.body.style.width = 'auto !important';
  208. document.body.style.paddingRight = '0px !important';
  209. document.body.style.pointerEvents = 'auto !important';
  210. document.body.classList.remove('el-popup-parent--hidden');
  211. // 4. 强制恢复商品列表容器的样式(关键!)
  212. document.querySelectorAll('.product-list-container').forEach(el => {
  213. el.style.overflow = 'auto !important';
  214. el.style.height = 'auto !important';
  215. el.style.maxHeight = 'calc(100vh - 200px) !important';
  216. el.style.display = 'block !important';
  217. el.style.visibility = 'visible !important';
  218. el.style.pointerEvents = 'auto !important';
  219. });
  220. return { success: true };
  221. }
  222. """)
  223. def slow_scroll_to_bottom(page):
  224. """
  225. 模拟真人慢速滚动到页面最底部(适配全局/局部滚动容器)
  226. :param page: 页面对象
  227. :return: None
  228. """
  229. try:
  230. logger.info("📜 开始慢速滚动到页面底部...")
  231. # ========== 前置:强制刷新容器样式 ==========
  232. page.evaluate("""
  233. () => {
  234. // 1. 强制重绘商品列表容器
  235. const container = document.querySelector('.product-list-container');
  236. if (container) {
  237. container.style.overflow = 'auto !important';
  238. container.offsetHeight; // 触发重绘,更新高度
  239. }
  240. // 2. 强制刷新页面滚动高度
  241. document.documentElement.offsetHeight;
  242. }
  243. """)
  244. page.wait_for_timeout(500)
  245. # 调试日志:打印滚动容器信息(保留)
  246. container_debug = page.evaluate("""
  247. () => {
  248. const container = document.querySelector('.product-list-container') || document.documentElement;
  249. return {
  250. tag: container.tagName,
  251. className: container.className,
  252. scrollHeight: container.scrollHeight,
  253. clientHeight: container.clientHeight,
  254. scrollTop: container.scrollTop
  255. };
  256. }
  257. """)
  258. logger.info(f"滚动容器信息:{container_debug}")
  259. scroll_attempts = 0
  260. last_scroll_height = -1 # 初始值改为-1,避免首次误判
  261. while scroll_attempts < MAX_SCROLL_ATTEMPTS:
  262. # ========== 关键:每次循环都重新获取容器信息 ==========
  263. scroll_container_info = page.evaluate("""
  264. () => {
  265. // 优先找商品列表容器
  266. const container = document.querySelector('.product-list-container') || document.documentElement;
  267. return {
  268. isGlobal: container === document.documentElement,
  269. scrollHeight: container.scrollHeight,
  270. scrollTop: container.scrollTop,
  271. clientHeight: container.clientHeight
  272. };
  273. }
  274. """)
  275. current_scroll_height = scroll_container_info["scrollHeight"]
  276. current_scroll_top = scroll_container_info["scrollTop"]
  277. client_height = scroll_container_info["clientHeight"]
  278. is_global = scroll_container_info["isGlobal"]
  279. # ========== 优化到底判断逻辑 ==========
  280. # 条件1:滚动高度无变化(连续2次相同);条件2:已滚到底(留50px余量)
  281. is_height_same = current_scroll_height == last_scroll_height
  282. is_scroll_bottom = (current_scroll_top + client_height) >= (current_scroll_height - 50)
  283. if is_height_same and scroll_attempts > 2: # 至少滚动2次再判断高度无变化
  284. logger.info(f"✅ 滚动高度无变化,判定已到底部")
  285. break
  286. if is_scroll_bottom:
  287. logger.info(f"✅ 已滚动到容器底部")
  288. break
  289. # ========== 执行滚动 ==========
  290. random_delay = random.uniform(SCROLL_DELAY - 0.05, SCROLL_DELAY + 0.05)
  291. if is_global:
  292. # 全局滚动
  293. page.evaluate(f"window.scrollBy(0, {SCROLL_STEP})")
  294. else:
  295. # 局部容器滚动(核心!)
  296. page.evaluate(f"""
  297. () => {{
  298. const container = document.querySelector('.product-list-container');
  299. if (container) {{
  300. container.scrollTop += {SCROLL_STEP};
  301. // 滚动后触发重绘
  302. container.offsetHeight;
  303. }}
  304. }}
  305. """)
  306. page.wait_for_timeout(int(random_delay * 1000))
  307. # 更新状态
  308. last_scroll_height = current_scroll_height
  309. scroll_attempts += 1
  310. # 最后强制滚到底
  311. page.evaluate("""
  312. () => {
  313. const container = document.querySelector('.product-list-container') || document.documentElement;
  314. container.scrollTop = container.scrollHeight;
  315. }
  316. """)
  317. page.wait_for_timeout(500)
  318. except Exception as e:
  319. logger.warning(f"⚠️ 慢速滚动到底部时出现异常:{e}")
  320. def search_operation(page, keyword, is_first_search: bool = True):
  321. """搜索框填充+提交搜索(遮罩前置清理)"""
  322. try:
  323. # ========== 前置清理:先清遮罩,再操作搜索框 ==========
  324. force_close_popup(page)
  325. kill_masks(page)
  326. search_locator = page.locator(SEARCH_INPUT_SELECTOR)
  327. search_locator.wait_for(timeout=ELEMENT_TIMEOUT)
  328. # 清空搜索框
  329. search_locator.click(force=True)
  330. search_locator.fill("")
  331. page.keyboard.down("Control")
  332. page.keyboard.press("a")
  333. page.keyboard.up("Control")
  334. page.keyboard.press("Backspace")
  335. # 逐字输入
  336. type_slow(search_locator, keyword, min_delay=0.06, max_delay=0.18)
  337. logger.info(f"📝 已输入搜索关键词:{keyword}")
  338. # ========== 再次清理:点击搜索前再清一次 ==========
  339. force_close_popup(page)
  340. # 点击搜索按钮
  341. btn = page.locator(f"{SEARCH_BTN_SELECTOR}")
  342. btn.wait_for(state="visible", timeout=SEARCH_BTN_TIMEOUT)
  343. page.wait_for_timeout(3000)
  344. detail_page = page
  345. if is_first_search:
  346. try:
  347. with page.context.expect_page(timeout=60000) as new_page_info:
  348. btn.click()
  349. detail_page = new_page_info.value
  350. # ========== 新页面立即清遮罩 ==========
  351. detail_page.wait_for_load_state("domcontentloaded", timeout=20000)
  352. force_close_popup(detail_page)
  353. kill_masks(detail_page)
  354. detail_page.wait_for_load_state("networkidle", timeout=20000)
  355. except PlaywrightTimeoutError:
  356. logger.warning(f" 未检测到新标签页")
  357. return None, False
  358. except Exception as e:
  359. logger.warning(f" 等待新标签页异常:{e}")
  360. return None, False
  361. else:
  362. btn.click()
  363. # ========== 原页面跳转后立即清遮罩 ==========
  364. page.wait_for_load_state("domcontentloaded", timeout=20000)
  365. force_close_popup(page)
  366. kill_masks(page)
  367. page.wait_for_load_state("networkidle", timeout=20000)
  368. detail_page = page
  369. logger.info("✅ 后续搜索:已在原页面完成跳转加载")
  370. # 处理引导按钮
  371. test_btn = detail_page.locator("div[data-v-c65c36bc].first-time-highlight-message-btn button")
  372. btn_count = test_btn.count()
  373. logger.info(f"✅ 匹配到的引导按钮数量:{btn_count}")
  374. if btn_count > 0:
  375. test_btn.wait_for(state="attached", timeout=5000)
  376. test_btn.click()
  377. # 最终清理:确保无残留遮罩
  378. force_close_popup(detail_page)
  379. kill_masks(detail_page)
  380. logger.info("✅ 已触发搜索并清理弹窗")
  381. # ========== 关键:等待列表渲染 + 强制刷新容器高度 ==========
  382. # 等待商品列表容器加载
  383. detail_page.wait_for_selector(".product-list-container", timeout=5000)
  384. # 强制刷新容器样式(解决高度计算错误)
  385. detail_page.evaluate("""
  386. () => {
  387. const container = document.querySelector('.product-list-container');
  388. if (container) {
  389. container.style.overflow = 'auto !important';
  390. // 强制重绘容器
  391. container.offsetHeight; // 触发重绘
  392. }
  393. }
  394. """)
  395. detail_page.wait_for_timeout(1000)
  396. return detail_page, True
  397. except PlaywrightTimeoutError as e:
  398. logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
  399. return None, False
  400. except Exception as e:
  401. logger.error(f" 搜索异常:{str(e)}")
  402. return None, False
  403. #翻下一页
  404. def goto_next_page(page) -> bool:
  405. """
  406. 核心修改:基于 button.btn-next 的 aria-disabled 属性判断是否有下一页
  407. :param page: 搜索结果页面对象(detail_page)
  408. :return: True=翻页成功,False=无下一页/翻页失败
  409. """
  410. try:
  411. # 1. 定位下一页按钮(精准匹配你指定的元素)
  412. next_btn = page.locator("button.btn-next").first
  413. # 2. 先等待按钮加载(确保元素存在)
  414. next_btn.wait_for(state="attached", timeout=3000)
  415. # 3. 获取 aria-disabled 属性值(核心判断依据)
  416. aria_disabled = next_btn.get_attribute("aria-disabled")
  417. logger.info(f"下一页按钮 aria-disabled 属性值:{aria_disabled}")
  418. # 4. 判断是否有下一页:aria-disabled="true" 表示无下一页
  419. if aria_disabled == "true":
  420. logger.warning("⚠️ 下一页按钮 aria-disabled=true,已无更多页面")
  421. return False
  422. # 5. 按钮可用(aria-disabled="false"),先滚动到顶部(避免按钮被遮挡)
  423. page.evaluate("window.scrollTo(0, 0);")
  424. page.wait_for_timeout(500)
  425. # 6. 确保按钮可见且可点击(强制点击兜底)
  426. if next_btn.is_visible() and next_btn.is_enabled():
  427. next_btn.click(timeout=5000)
  428. else:
  429. # 兜底:强制点击(避免元素不可见但实际可点击的情况)
  430. next_btn.click(force=True, timeout=5000)
  431. # 7. 等待页面加载完成(确保翻页后内容刷新)
  432. page.wait_for_load_state("networkidle", timeout=15000)
  433. # 8. 翻页后清理遮罩(避免新页面遮罩影响)
  434. force_close_popup(page)
  435. kill_masks(page)
  436. logger.info("✅ 翻页成功,下一页按钮 aria-disabled=false")
  437. return True
  438. except PlaywrightTimeoutError:
  439. logger.warning("⚠️ 下一页按钮加载超时,判定无更多页面")
  440. return False
  441. except Exception as e:
  442. logger.warning(f"⚠️ 翻页操作异常:{e},判定无更多页面")
  443. return False
import time
import random
# from playwright.sync_api import MouseWheelDirection
# ========== Scroll settings ==========
# NOTE: this rebinds the module-level SCROLL_STEP that is set to 200 near the
# top of the file; code that reads SCROLL_STEP at call time sees 50.
SCROLL_STEP = 50  # Pixels per scroll step
SCROLL_INTERVAL = 0.05  # Pause between scroll steps, in seconds
SCROLL_OFFSET_RANGE = 50  # Random offset (+/- pixels) applied to the scroll distance
MIN_CLICK_DELAY = 0.5
MAX_CLICK_DELAY = 1.0
  453. def random_delay(min_delay, max_delay):
  454. """随机延迟(模拟真人操作)"""
  455. time.sleep(random.uniform(min_delay, max_delay))
  456. def slow_scroll_400px(page, scroll_distance1=400):
  457. """
  458. 慢速滚动400px±50px(模拟真人鼠标滚轮+强制解除滚动限制)
  459. 适配:全局滚动条动,但JS scrollBy无效的场景
  460. :param page: 页面对象
  461. :return: 滚动是否成功
  462. """
  463. try:
  464. # 1. 前置:强制解除页面所有滚动限制(核心!)
  465. page.evaluate("""
  466. () => {
  467. // 强制恢复html/body的滚动能力
  468. document.documentElement.style.overflow = 'auto !important';
  469. document.body.style.overflow = 'auto !important';
  470. document.documentElement.style.pointerEvents = 'auto !important';
  471. document.body.style.pointerEvents = 'auto !important';
  472. document.documentElement.style.position = 'static !important';
  473. document.body.style.position = 'static !important';
  474. // 移除所有可能禁用滚动的类/属性
  475. document.body.classList.remove('el-popup-parent--hidden', 'no-scroll');
  476. document.documentElement.classList.remove('el-popup-parent--hidden', 'no-scroll');
  477. // 强制刷新滚动高度(避免计算错误)
  478. document.documentElement.offsetHeight;
  479. }
  480. """)
  481. page.wait_for_timeout(300)
  482. # 2. 生成随机滚动距离(400±50px)
  483. scroll_distance = random.randint(
  484. scroll_distance1 - SCROLL_OFFSET_RANGE,
  485. scroll_distance1 + SCROLL_OFFSET_RANGE
  486. )
  487. # 转换为鼠标滚轮的“步长”(1个滚轮步长≈100px,需适配)
  488. wheel_steps = int(scroll_distance / 100)
  489. remaining_pixels = scroll_distance % 100
  490. logger.info(
  491. f"📜 开始模拟鼠标滚轮滚动(目标距离:{scroll_distance}px,滚轮步数:{wheel_steps}步 + {remaining_pixels}px)"
  492. )
  493. # 3. 第一步:用鼠标滚轮模拟真人滚动(反爬友好)
  494. # 先把鼠标移到页面中间(商品列表区域),避免滚动空白处
  495. page.mouse.move(random.randint(300, 800), random.randint(400, 600))
  496. for _ in range(wheel_steps):
  497. # 模拟鼠标滚轮向下滚动(1步≈100px)
  498. page.mouse.wheel(delta_x=0, delta_y=100)
  499. time.sleep(random.uniform(SCROLL_INTERVAL*2, SCROLL_INTERVAL*5)) # 随机间隔,更像真人
  500. # 4. 第二步:处理剩余不足1步的像素(用scrollTo兜底)
  501. if remaining_pixels > 0:
  502. current_scroll_top = page.evaluate("window.scrollY || document.documentElement.scrollTop")
  503. target_scroll_top = current_scroll_top + remaining_pixels
  504. # 用scrollTo强制滚动(比scrollBy更稳定)
  505. page.evaluate(f"window.scrollTo(0, {target_scroll_top});")
  506. time.sleep(SCROLL_INTERVAL)
  507. # 5. 验证滚动是否生效
  508. final_scroll_top = page.evaluate("window.scrollY || document.documentElement.scrollTop")
  509. logger.info(f" 滚动完成,当前全局滚动位置:{final_scroll_top}px")
  510. # 6. 滚动后等待懒加载+模拟真人停顿
  511. page.wait_for_load_state("networkidle", timeout=8000)
  512. random_delay(2.0, 3.0)
  513. return True
  514. except Exception as e:
  515. logger.warning(f" 慢速滚动失败:{e}")
  516. return False
  517. def main():
  518. with sync_playwright() as p:
  519. browser = p.chromium.launch(
  520. headless=False, # 不要用无头模式(反爬:无头模式易被识别)
  521. channel="chrome", # 使用真实Chrome内核
  522. slow_mo=random.randint(100, 300), # 全局操作延迟(模拟真人慢速操作)
  523. args=[
  524. "--disable-blink-features=AutomationControlled", # 禁用webdriver特征(核心!)
  525. "--enable-automation=false", # 新增:禁用自动化标识
  526. "--disable-infobars", # 新增:禁用信息栏
  527. "--remote-debugging-port=0", # 新增:随机调试端口
  528. "--start-maximized", # 最大化窗口(模拟真人使用)
  529. "--disable-extensions", # 禁用扩展(避免特征)
  530. "--disable-plugins-discovery", # 禁用插件发现
  531. "--no-sandbox", # 避免沙箱模式特征
  532. "--disable-dev-shm-usage", # 避免内存限制导致的异常
  533. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36" # 随机Chrome版本的UA
  534. ]
  535. )
  536. # 创建页面时伪装指纹
  537. context = browser.new_context(
  538. locale="zh-CN", # 中文环境
  539. timezone_id="Asia/Shanghai", # 上海时区
  540. geolocation={"latitude": 31.230416, "longitude": 121.473701}, # 模拟上海地理位置(可选)
  541. permissions=["geolocation"], # 授予定位权限(模拟真人)
  542. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  543. no_viewport=True,
  544. # 关键:隐藏自动化特征
  545. java_script_enabled=True,
  546. bypass_csp=True,
  547. # user_data_dir="./temp_user_data" # 模拟真实用户数据目录
  548. )
  549. # input("...")
  550. page = context.new_page()
  551. # 关键:移除navigator.webdriver标识(反爬核心)
  552. page.add_init_script("""
  553. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  554. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
  555. Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
  556. window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
  557. delete window.navigator.languages;
  558. window.navigator.languages = ['zh-CN', 'zh'];
  559. // 新增:模拟真实鼠标移动特征
  560. (() => {
  561. const originalAddEventListener = EventTarget.prototype.addEventListener;
  562. EventTarget.prototype.addEventListener = function(type, listener) {
  563. if (type === 'mousemove') {
  564. return originalAddEventListener.call(this, type, (e) => {
  565. e._automation = undefined;
  566. listener(e);
  567. });
  568. }
  569. return originalAddEventListener.call(this, type, listener);
  570. };
  571. })();
  572. """)
  573. try:
  574. # ========== 核心:Cookie复用逻辑 ==========
  575. # 1. 加载本地Cookie
  576. load_cookies(context)
  577. # 2. 验证登录状态
  578. if not is_login(page):
  579. # 3. Cookie失效/不存在,执行登录
  580. page.goto(TARGET_LOGIN_URL)
  581. page.wait_for_load_state("networkidle")
  582. # logger.info("🔑 开始执行登录流程")
  583. # 执行登录操作
  584. # login_success = login_operation(page, USERNAME, PASSWORD)
  585. # if not login_success:
  586. # logger.error(" 登录失败,程序终止")
  587. # return
  588. # # 4. 登录成功后保存Cookie
  589. # save_cookies(context)
  590. # logger.info(" 登录并保存Cookie成功!")
  591. KEYWORDS = ['999皮炎平 糠酸莫米松凝胶']
  592. # get_search_keywords_from_db()
  593. # 执行搜索
  594. total_num = 0
  595. # current_page = page
  596. detail_page = None
  597. nums = 0
  598. page_no = 1
  599. for kw in KEYWORDS:
  600. popup_guard(page, "before_search")
  601. if nums == 0:
  602. popup_guard(detail_page if detail_page else page, "before_search") # page是你的初始页面对象,需提前定义
  603. detail_page, search_success = search_operation(page, kw, is_first_search=True)
  604. nums += 1
  605. else:
  606. if detail_page is None:
  607. logger.error(f" ❌ 无可用的搜索页面,跳过「{kw}」")
  608. continue
  609. popup_guard(detail_page, "before_search")
  610. detail_page, search_success = search_operation(detail_page, kw, is_first_search=False)
  611. if not search_success:
  612. print(f"❌ 搜索失败:{kw}")
  613. continue
  614. if detail_page is None:
  615. break
  616. popup_guard(detail_page, "after_search")
  617. #找不到数据跳过判断和出现杂数据跳过
  618. not_found_keywords = detail_page.locator("div.filter-panel-container-empty-text")
  619. if not_found_keywords.count() > 0:
  620. logger.warning(f"⚠️ 关键词「{kw}」无匹配商品,直接跳过整个关键词采集")
  621. continue
  622. # detail_page.wait_for_selector("div[class*='product-list'], .el-table", timeout=5000)
  623. # slow_scroll_to_bottom(detail_page)
  624. while True:
  625. # ✅ 先获取当前页商品个数
  626. detail_page.wait_for_load_state("domcontentloaded") # 先等DOM加载
  627. detail_page.wait_for_load_state("networkidle")
  628. detail_page.wait_for_timeout(500) # 额外等待渲染稳定
  629. goods_item = detail_page.locator("div.product-list-item").count()
  630. print(f"这页商品有{goods_item}个")
  631. slow_scroll_400px(detail_page)
  632. if goto_next_page(detail_page):
  633. logger.info(f"「{kw}」还有下一页")
  634. page_no += 1
  635. continue
  636. else:
  637. logger.info(f" 「{kw}」已无下一页,关键词采集结束")
  638. break
  639. except Exception as e:
  640. print(f" 程序异常:{str(e)}")
  641. finally:
  642. browser.close()
  643. print(" 浏览器已关闭,程序结束")
# ==================== Program entry point ====================
if __name__ == '__main__':
    main()