# get_shop_licenseName.py — 药九九 (yaojiujiu) product/shop-license scraper.
# NOTE(review): this file was recovered from a web-page copy; the original
# pasted header (filename/size/page-number run) has been removed.
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. from logger_config import logger
  3. from datetime import datetime
  4. import random
  5. import csv
  6. import os
  7. import time
  8. import json
  9. import pymysql
  10. from pymysql.err import OperationalError, ProgrammingError, DataError
  11. from config import *
  12. import re
  13. import uuid
  14. import requests
  15. from main import *
  16. #采集数据核心
  17. def collect_data1(page, keyword):
  18. """
  19. 1) 先获取当前页商品个数(count)
  20. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  21. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  22. """
  23. collect_result = []
  24. seen = set()
  25. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  26. page.wait_for_load_state("networkidle")
  27. page_no = 1
  28. while True:
  29. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  30. # 记录列表页URL(可用于你后续兜底)
  31. list_page_url = page.url
  32. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  33. # ✅ 先获取当前页商品个数
  34. page.wait_for_load_state("networkidle")
  35. total_limit = page.locator(PRODUCT_ITEM_SELECTOR).count()
  36. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  37. # 重置当前页的采集计数
  38. collected_count = 0
  39. for idx in range(total_limit):
  40. detail_page = None
  41. try:
  42. item = page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  43. collected_count += 1 # 实际采集计数(用于日志)
  44. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  45. page.wait_for_load_state("networkidle")
  46. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  47. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  48. # ========= 售罄跳过 =========
  49. sold_locator = item.locator('div[data-v-480da687].gc-l1-cirle_tip')
  50. if sold_locator.count() > 0:
  51. is_sold_out = 1
  52. logger.info(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  53. # if collected_count % 5 == 0 and collected_count > 0:
  54. # logger.info("采满5个往下滑")
  55. # slow_scroll_400px(page)
  56. # page.wait_for_load_state("networkidle")
  57. # continue
  58. # 1. 初始化所有字段默认值
  59. product = "无标题"
  60. price = "0.00"
  61. shop = "无店名"
  62. expiry_date = "无有效期"
  63. manufacture_date = "无生产日期"
  64. approval_number = "无批准文号"
  65. manufacturer = "未知公司"
  66. # discount_price = "0.00"
  67. spec = "未知规格"
  68. num = 1 # ✅ 默认 1
  69. platform = '药九九'
  70. current_time = datetime.now().strftime("%Y-%m-%d")
  71. is_sold_out = 0
  72. # 提取商品标题(处理空值)
  73. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  74. if product_locator.count() > 0:
  75. title = product_locator.inner_text(timeout=3000).strip()
  76. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  77. else:
  78. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  79. # 提取价格(带缺失日志)
  80. # 4. 提取价格(带缺失日志)
  81. price_locator = item.locator(PRODUCT_PRICE_SELECTOR).nth(0)
  82. if price_locator.count() > 0:
  83. price = price_locator.inner_text(timeout=3000).strip()
  84. logger.info(f"{'='*10}{keyword}」第{collected_count}个商品 - 列表页采购价格:{price}{'='*10}")
  85. else:
  86. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  87. # 5. 提取公司名称(带缺失日志)
  88. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  89. if manufacturer_locator.count() > 0:
  90. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  91. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  92. else:
  93. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  94. #提取店铺名称
  95. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  96. if shop_locator.count() > 0:
  97. shop = shop_locator.inner_text(timeout=3000).strip()
  98. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  99. else:
  100. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  101. #提取折扣价
  102. discount_price = ""
  103. discount_price_locator = item.locator('span[data-v-480da687].gc-l2-discount_price').first
  104. if discount_price_locator.count() > 0:
  105. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  106. discount_price_val_origin = discount_price
  107. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  108. discount_price_val = float(match.group()) if match else 0.00
  109. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  110. else:
  111. #如果没有拿原价替换
  112. price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "") else "0.00"
  113. discount_price_val = price
  114. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  115. merged_price = f"{price}{discount_price_val_origin}" if discount_price_val_origin else price
  116. # ========= ✅ 去重(最小且稳:可换成 href/data-id 更稳) =========
  117. # key = f"{title.strip()}|{store.strip()}|{company_name.strip()}|{price.strip()}"
  118. # if key in seen:
  119. # logger.warning(
  120. # f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过"
  121. # )
  122. # if collected_count % 5 == 0 and collected_count > 0:
  123. # logger.info("采满15个往下滑")
  124. # slow_scroll_400px(page)
  125. # page.wait_for_load_state("networkidle")
  126. # continue
  127. # seen.add(key)
  128. # ========= 模拟点击商品进入详情页 =========
  129. logger.info(
  130. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  131. )
  132. # 点击商品项容器,触发详情展示
  133. # ========== 点击商品跳详情页 ==========
  134. # 反爬:模拟真人鼠标移动到商品上再点击(不是直接点击)
  135. logger.info(f"📌 「{keyword}」第{collected_count}个商品「{title}」- 模拟鼠标移动并点击")
  136. item.hover() # 先悬停
  137. random_delay(0.2, 0.5) # 悬停后延迟
  138. item.dispatch_event("mousedown")
  139. random_delay(0.05, 0.15) # 鼠标按下后延迟
  140. item.dispatch_event("mouseup")
  141. random_delay(0.05, 0.1) # 鼠标松开后延迟
  142. try:
  143. with page.context.expect_page(timeout=60000) as p:
  144. item.click(delay=random.uniform(0.1, 0.3))
  145. detail_page = p.value
  146. except PlaywrightTimeoutError:
  147. logger.warning(
  148. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  149. )
  150. detail_page = None # 标记为无新标签页,避免关闭列表页
  151. # 等待详情加载(优先用新标签页,无则用列表页)
  152. target_page = detail_page if detail_page else page
  153. target_page.wait_for_load_state("networkidle", timeout=20000)
  154. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  155. logger.info(
  156. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  157. )
  158. # 反爬:检测详情页反爬验证
  159. # check_anti_crawl(page)
  160. # ========== 采集详情页的专属信息(有效期/生产日期/批准文号) ==========
  161. #获取商品详情页链接
  162. product_link = target_page.url
  163. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  164. # 提取有效期(处理空值)
  165. expiry_date_locator = target_page.locator("//span[contains(text(), '有效期')]/following-sibling::span[contains(@class, 'gdb-desc-value4')]")
  166. if expiry_date_locator.count() > 0:
  167. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip()
  168. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  169. else:
  170. # 修复:替换未定义的i为collected_count
  171. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  172. # 提取生产日期(修复完成)
  173. manufacture_date_locator = target_page.locator("//span[@class='gdb-desc-label' and text()='生产日期']/following-sibling::span[1]")
  174. if manufacture_date_locator.count() > 0:
  175. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip().replace('-', "")
  176. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  177. else:
  178. # 修复:替换未定义的i为collected_count
  179. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  180. # 提取批准文号(替换为你实际的选择器)
  181. approval_number_locator = target_page.locator("//span[contains(text(), '国药准字')]").first
  182. if approval_number_locator.count() > 0:
  183. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  184. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  185. else:
  186. # 修复:替换未定义的i为collected_count
  187. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  188. #提取规格
  189. spec_locator = target_page.locator('span.gddd-params_text_line_1[title]')
  190. if spec_locator.count() > 0:
  191. spec = spec_locator.nth(2).inner_text(timeout=3000).strip()
  192. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  193. else:
  194. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  195. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  196. #获取营业执照图片 li[data-v-4f79abe8].nth(2)
  197. #进入店铺
  198. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  199. entershop_btn = target_page.locator('[data-v-c5790f48].btn-text')
  200. entershop_btn.click()
  201. target_page.wait_for_load_state("networkidle")
  202. #点击店铺资质
  203. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  204. shop_license_page = target_page.locator('li[data-v-4f79abe8]').nth(2)
  205. shop_license_page.click()
  206. target_page.wait_for_load_state("networkidle")
  207. SCROLL_TARGET_DISTANCE = 1500
  208. try:
  209. # 生成400±50px的随机滚动距离
  210. scroll_distance = random.randint(
  211. SCROLL_TARGET_DISTANCE - SCROLL_OFFSET_RANGE,
  212. SCROLL_TARGET_DISTANCE + SCROLL_OFFSET_RANGE
  213. )
  214. remaining_distance = scroll_distance
  215. total_steps = int(scroll_distance / SCROLL_STEP)
  216. logger.info(
  217. f"📜 开始慢速滚动(目标距离:{scroll_distance}px,总步数:{total_steps},总时长约{total_steps*SCROLL_INTERVAL:.2f}秒)"
  218. )
  219. # 渐进式滚动(每步50px,间隔0.05秒)
  220. for _ in range(total_steps):
  221. step = min(SCROLL_STEP, remaining_distance)
  222. page.evaluate(f"window.scrollBy(0, {step});")
  223. remaining_distance -= step
  224. time.sleep(SCROLL_INTERVAL)
  225. # 处理剩余不足一步的距离
  226. if remaining_distance > 0:
  227. page.evaluate(f"window.scrollBy(0, {remaining_distance});")
  228. time.sleep(SCROLL_INTERVAL)
  229. # 滚动后等待懒加载完成
  230. page.wait_for_load_state("networkidle", timeout=8000)
  231. random_delay(2.0, 3.0) # 滚动后额外停顿,模拟真人
  232. logger.info(f" 慢速滚动完成,实际滚动距离:{scroll_distance - remaining_distance}px")
  233. except Exception as e:
  234. logger.warning(f" 慢速滚动失败:{e}")
  235. #获取店铺资质图片
  236. shop_license_div = target_page.locator('div[data-v-7f7214f6].shop-licensesImg').nth(2)
  237. shop_license_img = shop_license_div.locator('img')
  238. try:
  239. if shop_license_img.count() > 0:
  240. shop_license_src = shop_license_img.get_attribute('src')
  241. shop_license_src = shop_license_src.strip() if shop_license_src else None
  242. else:
  243. shop_license_src = None
  244. except Exception as e:
  245. # 捕获定位/提取失败的异常,避免程序崩溃
  246. print(f"提取营业执照图片src失败:{e}")
  247. shop_license_src = None
  248. print("营业执照图片链接:", shop_license_src)
  249. # purchase_price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "").isdigit() else 0.00
  250. # ========== 关闭新标签页,切回列表页 ==========
  251. if detail_page and not detail_page.is_closed():
  252. detail_page.close() # 关闭详情页标签
  253. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  254. # 切回原列表页(第一个标签页)
  255. page.bring_to_front() # 激活列表页
  256. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  257. random_delay(0.5, 1.0) # 增加切换后延迟
  258. page.wait_for_load_state("networkidle")
  259. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  260. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  261. province = ""
  262. city = ""
  263. business_license_company = ""
  264. credit_code = ""
  265. availability = ""
  266. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  267. # 构造单条数据元组(适配MySQL字段)
  268. single_data = {
  269. # 核心商品信息
  270. "product": title, # 商品名称
  271. "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  272. "min_price": discount_price_val, # 最低价格
  273. "manufacture_date": manufacture_date, # 生产日期
  274. "expiry_date": expiry_date, # 有效期
  275. "shop": shop, # 店铺名
  276. "business_license_company": business_license_company, # 营业执照主体(公司名称)
  277. "province": province, # 省份
  278. "city": city, # 城市
  279. "manufacturer": manufacturer, # 生产厂家
  280. "specification": spec, # 规格
  281. "approval_number": approval_number, # 批准文号
  282. "product_link": product_link, # 商品链接
  283. "scrape_date": current_time, # 采集日期
  284. "scrape_province": "", # 采集省份(可留空或根据IP获取)
  285. "availability": availability, # 库存状态
  286. "credit_code": credit_code, # 统一信用代码(如有可补充提取)
  287. "platform": platform, # 平台名称(固定或动态获取)
  288. "search_key": keyword, # 搜索关键词
  289. "number": num, # 数量(盒数)
  290. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  291. "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  292. "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") # 创建时间
  293. }
  294. # 调用逐条插入函数
  295. insert_single_to_mysql(single_data)
  296. collect_result.append(single_data)
  297. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  298. except Exception as e:
  299. # 异常处理:关闭详情页,强制切回列表页
  300. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  301. try:
  302. if detail_page and not detail_page.is_closed():
  303. detail_page.close()
  304. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  305. if page and not page.is_closed():
  306. page.bring_to_front() # 切回列表页
  307. page.wait_for_load_state("networkidle")
  308. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  309. except Exception as e2:
  310. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  311. continue
  312. # ✅ 每15次滚动一次(修复:用collected_count,且排除0的情况)
  313. if collected_count % 5 == 0 and collected_count > 0 and collected_count != total_limit:
  314. logger.info("采满5个往下滑")
  315. slow_scroll_400px(page)
  316. page.wait_for_load_state("networkidle")
  317. # ====== 当前页采集完毕,尝试翻页 ======
  318. delay = random_delay(1.5, 3.0)
  319. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  320. if goto_next_page(page):
  321. page_no += 1
  322. continue
  323. else:
  324. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  325. break
  326. # 关键词采集完成后长延迟
  327. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  328. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  329. return collect_result
  330. # ==================== 主函数(登录+批量搜索) ====================
  331. def main():
  332. logger.info("\n" + "="*50)
  333. logger.info("🚀 药九九采集程序启动")
  334. logger.info(f"⏰ 启动时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  335. logger.info("="*50)
  336. # 待搜索的关键词列表(直接写在这里,改起来更直观)
  337. # 存储所有关键词的采集数据
  338. # all_collect_data = []
  339. with sync_playwright() as p:
  340. # browser = init_browser_with_proxy(p)
  341. # 启动浏览器(用单个配置变量)
  342. browser = p.chromium.launch(
  343. headless=False, # 不要用无头模式(反爬:无头模式易被识别)
  344. channel="chrome", # 使用真实Chrome内核
  345. slow_mo=random.randint(100, 300), # 全局操作延迟(模拟真人慢速操作)
  346. args=[
  347. "--disable-blink-features=AutomationControlled", # 禁用webdriver特征(核心!)
  348. "--enable-automation=false", # 新增:禁用自动化标识
  349. "--disable-infobars", # 新增:禁用信息栏
  350. "--remote-debugging-port=0", # 新增:随机调试端口
  351. "--start-maximized", # 最大化窗口(模拟真人使用)
  352. "--disable-extensions", # 禁用扩展(避免特征)
  353. "--disable-plugins-discovery", # 禁用插件发现
  354. "--no-sandbox", # 避免沙箱模式特征
  355. "--disable-dev-shm-usage", # 避免内存限制导致的异常
  356. f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36" # 随机Chrome版本的UA
  357. ]
  358. )
  359. # 创建页面时伪装指纹
  360. context = browser.new_context(
  361. locale="zh-CN", # 中文环境
  362. timezone_id="Asia/Shanghai", # 上海时区
  363. geolocation={"latitude": 31.230416, "longitude": 121.473701}, # 模拟上海地理位置(可选)
  364. permissions=["geolocation"], # 授予定位权限(模拟真人)
  365. user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  366. viewport={"width": 1920, "height": 1080},
  367. # 关键:隐藏自动化特征
  368. java_script_enabled=True,
  369. bypass_csp=True,
  370. # user_data_dir="./temp_user_data" # 模拟真实用户数据目录
  371. )
  372. page = context.new_page()
  373. # 关键:移除navigator.webdriver标识(反爬核心)
  374. page.add_init_script("""
  375. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  376. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
  377. Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
  378. window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
  379. delete window.navigator.languages;
  380. window.navigator.languages = ['zh-CN', 'zh'];
  381. // 新增:模拟真实鼠标移动特征
  382. (() => {
  383. const originalAddEventListener = EventTarget.prototype.addEventListener;
  384. EventTarget.prototype.addEventListener = function(type, listener) {
  385. if (type === 'mousemove') {
  386. return originalAddEventListener.call(this, type, (e) => {
  387. e._automation = undefined;
  388. listener(e);
  389. });
  390. }
  391. return originalAddEventListener.call(this, type, listener);
  392. };
  393. })();
  394. """)
  395. try:
  396. # ========== 核心:Cookie复用逻辑 ==========
  397. # 1. 加载本地Cookie
  398. load_cookies(context)
  399. # 2. 验证登录状态
  400. if not is_login(page):
  401. # 3. Cookie失效/不存在,执行登录
  402. page.goto(TARGET_LOGIN_URL)
  403. page.wait_for_load_state("networkidle")
  404. logger.info("🔑 开始执行登录流程")
  405. # 执行登录操作
  406. login_success = login_operation(page, USERNAME, PASSWORD)
  407. if not login_success:
  408. logger.error(" 登录失败,程序终止")
  409. return
  410. # 4. 登录成功后保存Cookie
  411. save_cookies(context)
  412. logger.info(" 登录并保存Cookie成功!")
  413. # 2. 批量搜索+采集+保存
  414. for keyword_idx, keyword in enumerate(SEARCH_KEYWORDS, 1):
  415. logger.info(f"\n=====================================")
  416. logger.info(f"🔍 开始处理第{keyword_idx}/{len(SEARCH_KEYWORDS)}个关键词:{keyword}")
  417. logger.info(f"=====================================")
  418. # 执行搜索
  419. popup_guard(page, "before_search")
  420. search_success = search_operation(page, keyword)
  421. # input("")
  422. popup_guard(page, "after_search")
  423. if not search_success:
  424. logger.warning(f" 「{keyword}」搜索失败,跳过采集")
  425. continue
  426. # ✅ 再等页面稳定一下(networkidle 有时会等不到,建议加超时或换成 domcontentloaded)
  427. page.wait_for_load_state("domcontentloaded")
  428. page.wait_for_load_state('networkidle')
  429. # 采集数据
  430. data_list = collect_data1(page, keyword)
  431. # # 保存到CSV
  432. # if data_list:
  433. # save_to_csv(data_list)
  434. # else:
  435. # logger.warning(f" 「{keyword}」无数据,跳过保存")
  436. logger.info("\n🎉 所有关键词处理完成!CSV文件路径:" + os.path.abspath(CSV_FILE_PATH))
  437. input("\n按回车关闭程序...")
  438. except Exception as e:
  439. logger.error(f" 程序异常:{str(e)}")
  440. finally:
  441. browser.close()
  442. logger.info(" 浏览器已关闭,程序结束")
  443. if __name__ == '__main__':
  444. main()