# main.py
  1. from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
  2. from logger_config import logger
  3. from datetime import datetime
  4. import random
  5. import os
  6. import time
  7. import json
  8. from config import *
  9. import re
  10. import requests
  11. import base64
  12. from io import BytesIO
  13. from PIL import Image
  14. import traceback
  15. from feishu_webhook import send_text as feishu_send_text, send_error_card as feishu_send_error_card
  16. # mysql_pool 由 config.mysql_pool(MySQLPoolOnline)提供
  17. # 加载城市JSON数据(全局只加载一次)
  18. CITY_JSON_PATH = "city.json"
  19. # 全局映射字典
  20. PROVINCE_ID_MAP = {} # 省名 -> 省ID
  21. CITY_ID_MAP = {} # (省名, 市名) -> 市ID
  22. CITY_TO_PROVINCES_MAP = {} # 市名 -> 省名集合(用于城市反推省份)
  23. DIRECT_MUNICIPALITIES = {"北京市", "上海市", "天津市", "重庆市"}
  24. DIRECT_MUNICIPALITY_BASE_NAMES = {"北京", "上海", "天津", "重庆"}
  25. DIRECT_MUNICIPALITY_ALIAS = {
  26. "北京": "北京市",
  27. "上海": "上海市",
  28. "天津": "天津市",
  29. "重庆": "重庆市",
  30. }
  31. def load_city_mapping():
  32. """加载 city.json 并构建快速查找字典"""
  33. global PROVINCE_ID_MAP, CITY_ID_MAP, CITY_TO_PROVINCES_MAP
  34. PROVINCE_ID_MAP.clear()
  35. CITY_ID_MAP.clear()
  36. CITY_TO_PROVINCES_MAP.clear()
  37. if not os.path.exists(CITY_JSON_PATH):
  38. logger.error(f"❌ 城市JSON文件不存在:{CITY_JSON_PATH}")
  39. return
  40. try:
  41. with open(CITY_JSON_PATH, "r", encoding="utf-8") as f:
  42. data = json.load(f)
  43. for province_item in data:
  44. p_name = province_item['name']
  45. p_id = province_item['id']
  46. PROVINCE_ID_MAP[p_name] = p_id
  47. for city_item in province_item.get('sons', []):
  48. c_name = city_item['name']
  49. c_id = city_item['id']
  50. CITY_ID_MAP[(p_name, c_name)] = c_id
  51. CITY_TO_PROVINCES_MAP.setdefault(c_name, set()).add(p_name)
  52. logger.info(f"✅ 城市映射加载完成,共 {len(PROVINCE_ID_MAP)} 个省份,{len(CITY_ID_MAP)} 个城市")
  53. except Exception as e:
  54. logger.error(f"❌ 加载城市JSON失败:{str(e)}")
  55. def _clean_province_name(name: str) -> str:
  56. return (name or "").replace("省", "").replace("市", "").replace("自治区", "").replace("特别行政区", "").strip()
  57. def _clean_city_name(name: str) -> str:
  58. return (name or "").replace("市", "").replace("自治州", "").replace("地区", "").replace("盟", "").strip()
  59. def normalize_province_city_names(province_name: str, city_name: str):
  60. """标准化省市名称,并在“省份缺失+城市唯一”时反推省份。"""
  61. province = (province_name or "").strip()
  62. city = (city_name or "").strip()
  63. # 标准化省份名称(保留直辖市的“北京市/上海市”等完整写法)
  64. if province and province not in DIRECT_MUNICIPALITIES and province not in PROVINCE_ID_MAP:
  65. clean_p = _clean_province_name(province)
  66. for standard_name in PROVINCE_ID_MAP.keys():
  67. if clean_p and clean_p == _clean_province_name(standard_name):
  68. province = standard_name
  69. break
  70. # 省份为空时,按“城市唯一”精确反推省份
  71. if not province and city:
  72. matched_provinces = CITY_TO_PROVINCES_MAP.get(city, set())
  73. if not matched_provinces:
  74. clean_c = _clean_city_name(city)
  75. if clean_c:
  76. matched_provinces = {
  77. p_name
  78. for (p_name, c_name) in CITY_ID_MAP.keys()
  79. if _clean_city_name(c_name) == clean_c
  80. }
  81. if len(matched_provinces) == 1:
  82. province = next(iter(matched_provinces))
  83. elif len(matched_provinces) > 1:
  84. logger.warning(
  85. f"⚠️ 城市名存在跨省重名,无法唯一反推省份: city={city}, candidates={sorted(matched_provinces)}"
  86. )
  87. if province in DIRECT_MUNICIPALITY_BASE_NAMES:
  88. province = DIRECT_MUNICIPALITY_ALIAS[province]
  89. # 标准化城市名称(仅在省份确定后做同省映射,避免跨省误匹配)
  90. if province and city and (province, city) not in CITY_ID_MAP:
  91. clean_c = _clean_city_name(city)
  92. for (p_name, c_name), _ in CITY_ID_MAP.items():
  93. if _clean_province_name(p_name) == _clean_province_name(province) and clean_c and _clean_city_name(c_name) == clean_c:
  94. city = c_name
  95. break
  96. # 直辖市兜底
  97. if province in DIRECT_MUNICIPALITIES and not city:
  98. city = province
  99. return province, city
  100. def get_province_city_ids(province_name, city_name):
  101. """
  102. 根据省份名称和城市名称返回对应的ID
  103. :return: (province_id, city_id) 若找不到返回 (0, 0)
  104. """
  105. province_name, city_name = normalize_province_city_names(province_name, city_name)
  106. # ---- 查找省份ID ----
  107. province_id = PROVINCE_ID_MAP.get(province_name) if province_name else None
  108. if province_name and province_id is None:
  109. # 尝试去掉"省"、"自治区"、"市"后缀再匹配
  110. clean_p = _clean_province_name(province_name)
  111. for name, pid in PROVINCE_ID_MAP.items():
  112. if clean_p and clean_p == _clean_province_name(name):
  113. province_id = pid
  114. province_name = name # 更新为标准名称,方便后续查城市
  115. break
  116. if province_id is None:
  117. logger.warning(f"⚠️ 未找到省份ID: {province_name}")
  118. province_id = 0
  119. elif province_id is None:
  120. province_id = 0
  121. # 直辖市兜底:省份有值但城市为空时,城市按省份补齐
  122. if _clean_province_name(province_name) in DIRECT_MUNICIPALITY_BASE_NAMES and not city_name:
  123. city_name = f"{_clean_province_name(province_name)}市"
  124. # ---- 查找城市ID ----
  125. city_id = CITY_ID_MAP.get((province_name, city_name)) if province_name and city_name else None
  126. if province_name and city_name and city_id is None:
  127. # 尝试去掉"市"、"自治州"等后缀
  128. clean_c = _clean_city_name(city_name)
  129. for (p_name, c_name), cid in CITY_ID_MAP.items():
  130. if p_name == province_name:
  131. if clean_c and clean_c == _clean_city_name(c_name):
  132. city_id = cid
  133. city_name = c_name
  134. break
  135. if city_id is None:
  136. # 直辖市特殊处理:城市ID与省份ID相同(或取第一个下属城市)
  137. if _clean_province_name(province_name) in DIRECT_MUNICIPALITY_BASE_NAMES and province_id:
  138. city_id = province_id
  139. else:
  140. logger.warning(f"⚠️ 未找到城市ID: {province_name} - {city_name}")
  141. city_id = 0
  142. elif city_id is None:
  143. city_id = 0
  144. return province_id, city_id
  145. # ===================== 工具函数:获取当前时间字符串 =====================
  146. def get_current_time():
  147. """统一日志时间格式"""
  148. return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  149. def report_start(task_id: int, keyword: str):
  150. """第一次上报:status=2 和 task_id"""
  151. payload = {
  152. "status": 2,
  153. "collect_task_allocate_id": task_id
  154. }
  155. logger.info("上报开始 task_id=%s keyword=%s payload=%s", task_id, keyword, payload)
  156. _send_report(payload, f"开始上报(status=2)关键词:{keyword}")
  157. def report_end(task_id: int, keyword: str, company_id: int, success: bool,real_count: int, start_ts: int):
  158. """
  159. 第二次上报:采集结束时调用,上传所有字段
  160. :param keyword: 关键词(用于日志)
  161. :param success: 采集是否成功
  162. :param company_id: 企业ID(从任务表获取)
  163. :param real_count: 实际采集到的商品数量
  164. :param start_ts: 开始采集时的Unix时间戳(秒)
  165. """
  166. end_ts = int(time.time())
  167. payload = {
  168. "collect_task_allocate_id": task_id, #任务ID
  169. "status": 3 if success else 4,# 3=已完成,4=失败
  170. 'finish_status': 1 if success else 0, # 1=是(采集最后的数据)0=否
  171. 'real_count': real_count,
  172. "start_time": start_ts,
  173. "end_time": end_ts,
  174. # "enterprise_id": company_id
  175. }
  176. logger.info("上报结束 task_id=%s keyword=%s payload=%s", task_id, keyword, payload)
  177. _send_report(payload, f"结束上报(全部字段)关键词:{keyword}")
  178. def _send_report(params: dict, log_msg: str):
  179. """通用上报请求,失败不中断主流程(与调度服务约定为 JSON POST)"""
  180. REPORT_URL = "https://scheduleapi.findit.ltd/api/collect_equipment_execute/result_report"
  181. try:
  182. resp = requests.post(REPORT_URL, json=params, timeout=10)
  183. if resp.status_code == 200:
  184. # 尝试解析 JSON
  185. try:
  186. data = resp.json()
  187. if data.get("code") == "success":
  188. logger.info(f"✅ 上报成功:{log_msg}")
  189. else:
  190. logger.warning(f"⚠️ 上报接口返回错误:code={data.get('code')}, msg={data.get('msg')},参数:{params}")
  191. except ValueError:
  192. # 响应不是 JSON 格式
  193. logger.warning(f"⚠️ 上报响应非 JSON:{resp.text[:200]},参数:{params}")
  194. else:
  195. logger.warning(f"⚠️ 上报 HTTP {resp.status_code},参数:{params}")
  196. except Exception as e:
  197. logger.error(f"❌ 上报失败:{log_msg},错误:{str(e)}")
  198. def notify_feishu_after_report_end(
  199. task_id: int,
  200. keyword: str,
  201. company_id: int,
  202. success: bool,
  203. spec: str,
  204. real_count: int,
  205. start_ts: int,
  206. platform_name: str = "药帮忙",
  207. drug_name: str = "",
  208. ):
  209. """在任务上报结束后发送飞书通知。"""
  210. notice_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  211. display_drug_name = (drug_name or keyword or "").strip()
  212. spec_items = [item.strip() for item in re.split(r"[|、,,\n\r]+", spec or "") if item.strip()]
  213. display_spec = "、".join(spec_items)
  214. text_msg = (
  215. f"{notice_time} 通知:\n"
  216. f"平台: {platform_name}, 药品: {display_drug_name}, 品规: {display_spec}, 爬取数据: {real_count}条"
  217. )
  218. try:
  219. if success:
  220. feishu_send_text(text_msg)
  221. else:
  222. feishu_send_error_card(
  223. task_name=display_drug_name,
  224. err_msg=(
  225. f"task_id={task_id}, company_id={company_id}, "
  226. f"real_count={real_count}, notice={text_msg}"
  227. ),
  228. mention_all=False,
  229. )
  230. except Exception as e:
  231. logger.warning(f"飞书通知发送失败:{e}")
  232. # ==================== 2. 反爬工具函数 ====================
  233. def random_delay(min_seconds, max_seconds):
  234. """生成随机延迟(核心反爬:避免固定间隔)"""
  235. delay = random.uniform(min_seconds, max_seconds)
  236. time.sleep(delay)
  237. return delay
  238. def save_cookies(context, cookie_path=COOKIE_FILE_PATH):
  239. """保存Cookie到本地JSON文件"""
  240. try:
  241. cookies = context.cookies()
  242. with open(cookie_path, "w", encoding="utf-8") as f:
  243. json.dump(cookies, f, ensure_ascii=False, indent=2)
  244. logger.info(f"Cookie已保存到:{cookie_path}")
  245. return True
  246. except Exception as e:
  247. logger.error(f" 保存Cookie失败:{e}")
  248. return False
  249. def load_cookies(context, cookie_path=COOKIE_FILE_PATH):
  250. """从本地JSON文件加载Cookie到浏览器上下文"""
  251. if not os.path.exists(cookie_path):
  252. logger.warning(f" Cookie文件不存在:{cookie_path}")
  253. return False
  254. try:
  255. with open(cookie_path, "r", encoding="utf-8") as f:
  256. cookies = json.load(f)
  257. context.add_cookies(cookies)
  258. logger.info(f"✅ 已从{cookie_path}加载Cookie")
  259. return True
  260. except Exception as e:
  261. logger.error(f" 加载Cookie失败:{e}")
  262. return False
  263. def is_login(page):
  264. """验证是否已登录(核心:检测登录态)"""
  265. try:
  266. # 访问需要登录的页面
  267. page.goto(LOGIN_VALIDATE_URL, timeout=300000)
  268. page.wait_for_load_state("networkidle")
  269. # 检测是否跳转到登录页(URL包含login则未登录)
  270. if "login" in page.url.lower():
  271. logger.warning(" Cookie失效,需要重新登录")
  272. return False
  273. # 可选:检测登录后的专属元素(比如用户名、个人中心等)
  274. # if page.locator("用户中心选择器").count() > 0:
  275. # return True
  276. logger.info(" Cookie有效,已保持登录状态")
  277. return True
  278. except Exception as e:
  279. logger.error(f" 验证登录状态失败:{e}")
  280. return False
  281. # ==================== 滚动函数重构(核心修改) ====================
  282. def slow_scroll_400px(page,scroll_distance1=400):
  283. """
  284. 慢速滚动400px±50px(模拟真人滑动)
  285. :param page: 页面对象
  286. :return: 滚动是否成功
  287. """
  288. try:
  289. # 生成400±50px的随机滚动距离
  290. scroll_distance = random.randint(
  291. scroll_distance1 - SCROLL_OFFSET_RANGE,
  292. scroll_distance1 + SCROLL_OFFSET_RANGE
  293. )
  294. remaining_distance = scroll_distance
  295. total_steps = int(scroll_distance / SCROLL_STEP)
  296. logger.info(
  297. f"📜 开始慢速滚动(目标距离:{scroll_distance}px,总步数:{total_steps},总时长约{total_steps*SCROLL_INTERVAL:.2f}秒)"
  298. )
  299. # 渐进式滚动(每步50px,间隔0.05秒)
  300. for _ in range(total_steps):
  301. step = min(SCROLL_STEP, remaining_distance)
  302. page.evaluate(f"window.scrollBy(0, {step});")
  303. remaining_distance -= step
  304. time.sleep(SCROLL_INTERVAL)
  305. # 处理剩余不足一步的距离
  306. if remaining_distance > 0:
  307. page.evaluate(f"window.scrollBy(0, {remaining_distance});")
  308. time.sleep(SCROLL_INTERVAL)
  309. # 滚动后等待懒加载完成
  310. page.wait_for_load_state("networkidle", timeout=8000)
  311. random_delay(2.0, 3.0) # 滚动后额外停顿,模拟真人
  312. logger.info(f" 慢速滚动完成,实际滚动距离:{scroll_distance - remaining_distance}px")
  313. return True
  314. except Exception as e:
  315. logger.warning(f" 慢速滚动失败:{e}")
  316. return False
  317. # ==================== 登录函数 ====================
  318. def login_operation(page, username, password):
  319. """登录操作函数"""
  320. try:
  321. # 输入手机号(直接用单个变量)
  322. page.wait_for_selector(USERNAME_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  323. page.wait_for_timeout(timeout=3000)
  324. page.fill(USERNAME_SELECTOR, username)
  325. logger.info(" 已输入登录账号")
  326. # 输入密码
  327. page.wait_for_selector(PASSWORD_SELECTOR, timeout=ELEMENT_TIMEOUT, state="visible")
  328. page.wait_for_timeout(timeout=3000)
  329. page.fill(PASSWORD_SELECTOR, password)
  330. logger.info(" 已输入登录密码")
  331. random_delay(1, 2)
  332. agree_btn = page.locator('span.el-checkbox__inner')
  333. agree_btn.click()
  334. # 点击登录按钮
  335. page.wait_for_selector(LOGIN_BTN_SELECTOR, timeout=ELEMENT_TIMEOUT)
  336. page.wait_for_timeout(timeout=3000)
  337. page.click(LOGIN_BTN_SELECTOR)
  338. logger.info(" 已点击登录按钮")
  339. page.wait_for_timeout(LOGIN_AFTER_CLICK)
  340. return True
  341. except PlaywrightTimeoutError as e:
  342. logger.error(f" 登录失败:元素定位超时 - {str(e)}")
  343. return False
  344. except Exception as e:
  345. logger.error(f" 登录异常:{str(e)}")
  346. return False
  347. def kill_masks(page):
  348. """
  349. 强制清理残留遮罩层/覆盖层,并恢复 body 可滚动、可点击状态
  350. """
  351. page.evaluate(r"""
  352. () => {
  353. const removed = [];
  354. const hidden = [];
  355. // 1) 先处理已知常见遮罩
  356. const knownSelectors = [
  357. '.v-modal',
  358. '.el-overlay',
  359. '.el-overlay-dialog',
  360. '.el-dialog__wrapper',
  361. '.el-message-box__wrapper',
  362. '.el-loading-mask',
  363. '.el-popup-parent--hidden'
  364. ];
  365. for (const sel of knownSelectors) {
  366. document.querySelectorAll(sel).forEach(el => {
  367. // v-modal / overlay 直接 remove 最省事
  368. removed.push(sel);
  369. el.remove();
  370. });
  371. }
  372. // 2) 再做一次“泛化兜底”:全屏 fixed/absolute + 高 z-index 的覆盖层
  373. // 注意:不要误删页面正常的固定导航,所以加上“近似全屏”的判断
  374. const all = Array.from(document.querySelectorAll('body *'));
  375. for (const el of all) {
  376. const s = window.getComputedStyle(el);
  377. if (!s) continue;
  378. const z = parseInt(s.zIndex || '0', 10);
  379. const pos = s.position;
  380. const pe = s.pointerEvents;
  381. if ((pos === 'fixed' || pos === 'absolute') && z >= 1000 && pe !== 'none') {
  382. const r = el.getBoundingClientRect();
  383. const nearFullScreen =
  384. r.width >= window.innerWidth * 0.8 &&
  385. r.height >= window.innerHeight * 0.8 &&
  386. r.left <= window.innerWidth * 0.1 &&
  387. r.top <= window.innerHeight * 0.1;
  388. // 常见遮罩是半透明背景色,或者透明但拦截点击
  389. const bg = s.backgroundColor || '';
  390. const looksLikeMask =
  391. nearFullScreen && (bg.includes('rgba') || bg.includes('rgb') || s.opacity !== '1');
  392. if (nearFullScreen) {
  393. // 不管透明不透明,只要近似全屏且高 z-index,就先让它不拦截点击
  394. el.style.pointerEvents = 'none';
  395. el.style.display = 'none';
  396. hidden.push(el.tagName + '.' + (el.className || ''));
  397. }
  398. }
  399. }
  400. // 3) 恢复 body / html 的滚动与交互(很多弹窗会锁滚动)
  401. document.documentElement.style.overflow = 'auto';
  402. document.body.style.overflow = 'auto';
  403. document.body.style.position = 'static';
  404. document.body.style.width = 'auto';
  405. document.body.style.paddingRight = '0px';
  406. // 4) 去掉 Element-UI 常见的锁定 class
  407. document.body.classList.remove('el-popup-parent--hidden');
  408. return { removed, hiddenCount: hidden.length, hidden };
  409. }
  410. """)
  411. def force_close_popup(page):
  412. """关闭新手引导/遮罩(多步:下一步/完成/我知道了),并兜底移除遮罩层"""
  413. try:
  414. # 1) 尝试连续点“下一步/完成/我知道了/关闭”
  415. for _ in range(5): # 最多点5次,足够覆盖多步引导
  416. btn = page.locator(
  417. "//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  418. ).first
  419. if btn.count() > 0 and btn.is_visible():
  420. btn.click(timeout=1500)
  421. page.wait_for_timeout(300)
  422. continue
  423. # 有些引导是右上角 X(如果存在就点)
  424. close_icon = page.locator(
  425. "xpath=//*[contains(@class,'close') or contains(@class,'el-icon-close') or name()='svg' or name()='i'][1]"
  426. ).first
  427. if close_icon.count() > 0 and close_icon.is_visible():
  428. close_icon.click(timeout=1000)
  429. page.wait_for_timeout(300)
  430. continue
  431. break
  432. # 2) 兜底:移除常见遮罩层(element-ui / 通用 mask/overlay)
  433. page.evaluate("""
  434. const selectors = [
  435. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  436. '[class*="mask"]', '[class*="overlay"]', '[style*="z-index"]'
  437. ];
  438. for (const sel of selectors) {
  439. document.querySelectorAll(sel).forEach(el => {
  440. const s = window.getComputedStyle(el);
  441. // 只移除“覆盖层”倾向的元素:fixed/absolute 且 z-index 很高
  442. if ((s.position === 'fixed' || s.position === 'absolute') && parseInt(s.zIndex || '0', 10) >= 1000) {
  443. el.remove();
  444. }
  445. });
  446. }
  447. """)
  448. except Exception:
  449. pass
  450. def pick_search_input(page):
  451. """优先选可见且可用的搜索输入框;第一个不行就尝试第二个"""
  452. inputs = page.locator(SEARCH_INPUT_SELECTOR)
  453. cnt = inputs.count()
  454. # 优先检查前两个(你说只有两个)
  455. for i in range(min(cnt, 2)):
  456. candidate = inputs.nth(i)
  457. try:
  458. candidate.wait_for(state="visible", timeout=1500) # 小超时快速试探
  459. if candidate.is_enabled():
  460. return candidate
  461. except PlaywrightTimeoutError:
  462. continue
  463. # 兜底:直接找任意可见的(避免命中 hidden 模板)
  464. candidate = page.locator(f"{SEARCH_INPUT_SELECTOR}:visible").first
  465. candidate.wait_for(state="visible", timeout=ELEMENT_TIMEOUT)
  466. return candidate
  467. def type_slow(locator, text: str, min_delay=0.06, max_delay=0.18):
  468. """逐字输入,模拟真人打字"""
  469. for ch in text:
  470. locator.type(ch, delay=int(random.uniform(min_delay, max_delay) * 1000))
  471. # ==================== 搜索操作函数 ====================
  472. def search_operation(page, keyword, is_first_search: bool = True):
  473. """
  474. 搜索框填充+提交搜索
  475. :param page: 页面对象
  476. :param keyword: 搜索关键词
  477. :param is_first_search: 是否是首次搜索(首次开新页面,后续原页面跳转)
  478. :return: (detail_page, 是否成功)
  479. """
  480. try:
  481. # 1) 找到“可用”的搜索框(第一个不行就用第二个)
  482. search_locator = pick_search_input(page)
  483. # 清空并填充搜索框
  484. search_locator.wait_for(timeout=ELEMENT_TIMEOUT)
  485. # 2. 清空搜索框(双重保障:先调用locator的clear,再手动全选删除)
  486. search_locator.click(force=True) # 聚焦
  487. search_locator.fill("")
  488. page.keyboard.down("Control") # 按住Control键
  489. page.keyboard.press("a") # 按a键
  490. page.keyboard.up("Control") # 松开Control键
  491. page.keyboard.press("Backspace") # 删除选中内容
  492. type_slow(search_locator, keyword, min_delay=0.06, max_delay=0.18)
  493. logger.info(f"📝 已输入搜索关键词:{keyword}")
  494. # 3) 搜索按钮也建议点可见的那个
  495. btn = page.locator(f"{SEARCH_BTN_SELECTOR}")
  496. btn.wait_for(state="visible", timeout=SEARCH_BTN_TIMEOUT)
  497. page.wait_for_timeout(3000)
  498. detail_page = page
  499. if is_first_search:
  500. #获取新页面对象
  501. try:
  502. # 先开始监听新页面事件(在点击前)
  503. with page.context.expect_page(timeout=60000) as new_page_info:
  504. # 再执行点击操作
  505. btn.click()
  506. # 点击后获取新页面
  507. detail_page = new_page_info.value
  508. detail_page.wait_for_load_state("networkidle", timeout=20000)
  509. except PlaywrightTimeoutError:
  510. logger.warning(f"{get_current_time()} 未检测到新标签页")
  511. return None, False
  512. except Exception as e:
  513. logger.warning(f"{get_current_time()} 等待新标签页异常:{e}")
  514. return None, False
  515. else:
  516. btn.click()
  517. # 等待原页面跳转并加载完成(替代新页面监听)
  518. page.wait_for_load_state("networkidle", timeout=20000)
  519. # 详情页就是原页面,无需新建
  520. detail_page = page
  521. logger.info("✅ 后续搜索:已在原页面完成跳转加载")
  522. test_btn = detail_page.locator("div[data-v-c65c36bc].first-time-highlight-message-btn button")
  523. btn_count = test_btn.count()
  524. logger.info(f"✅ 匹配到的元素数量:{btn_count}")
  525. if btn_count > 0:
  526. test_btn.wait_for(state="attached", timeout=5000)
  527. test_btn.click()
  528. force_close_popup(detail_page)
  529. kill_masks(detail_page)
  530. logger.info("✅ 已触发搜索")
  531. return detail_page, True
  532. except PlaywrightTimeoutError as e:
  533. logger.error(f" 搜索失败:元素定位超时 - {str(e)}")
  534. return None, False # 失败时返回 (None, False)
  535. except Exception as e:
  536. logger.error(f" 搜索异常:{str(e)}")
  537. return None, False # 失败时返回 (None, False)
  538. #翻下一页
  539. def goto_next_page(page) -> bool:
  540. """
  541. 基于 button.btn-next 的 aria-disabled 属性判断是否有下一页
  542. :param page: 搜索结果页面对象(detail_page)
  543. :return: True=翻页成功,False=无下一页/翻页失败
  544. """
  545. try:
  546. next_btn = page.locator("button.btn-next").first
  547. # 2. 先等待按钮加载(确保元素存在)
  548. next_btn.wait_for(state="attached", timeout=3000)
  549. # 3. 获取 aria-disabled 属性值(核心判断依据)
  550. aria_disabled = next_btn.get_attribute("aria-disabled")
  551. logger.info(f"下一页按钮 aria-disabled 属性值:{aria_disabled}")
  552. # 4. 判断是否有下一页:aria-disabled="true" 表示无下一页
  553. if aria_disabled == "true":
  554. logger.warning("⚠️ 下一页按钮 aria-disabled=true,已无更多页面")
  555. return False
  556. page.wait_for_timeout(500)
  557. # 6. 确保按钮可见且可点击(强制点击兜底)
  558. if next_btn.is_visible() and next_btn.is_enabled():
  559. next_btn.click(timeout=5000)
  560. else:
  561. # 兜底:强制点击(避免元素不可见但实际可点击的情况)
  562. next_btn.click(force=True, timeout=5000)
  563. logger.info("✅ 翻页成功,下一页按钮 aria-disabled=false")
  564. return True
  565. except PlaywrightTimeoutError:
  566. logger.warning("⚠️ 下一页按钮加载超时,判定无更多页面")
  567. return False
  568. except Exception as e:
  569. logger.warning(f"⚠️ 翻页操作异常:{e},判定无更多页面")
  570. return False
  571. def popup_guard(page, tag=""):
  572. """
  573. 全局弹窗/遮罩守卫:多步引导 + 关闭按钮 + 遮罩清理 + 恢复滚动
  574. tag 仅用于日志区分调用位置
  575. """
  576. try:
  577. # 给弹窗一点出现时间
  578. page.wait_for_timeout(300)
  579. # 1) 连续点“下一步/完成/我知道了/关闭”
  580. for _ in range(6):
  581. btn = page.locator(
  582. "xpath=//button[normalize-space()='下一步' or normalize-space()='完成' or normalize-space()='我知道了' or normalize-space()='关闭']"
  583. ).first
  584. if btn.count() > 0 and btn.is_visible():
  585. btn.click(timeout=1500)
  586. page.wait_for_timeout(250)
  587. continue
  588. # 2) 常见的 close icon
  589. close_btn = page.locator(
  590. "css=.el-dialog__headerbtn, .el-message-box__headerbtn, .close, .icon-close, .el-icon-close"
  591. ).first
  592. if close_btn.count() > 0 and close_btn.is_visible():
  593. close_btn.click(timeout=1200)
  594. page.wait_for_timeout(250)
  595. continue
  596. break
  597. # 3) 清遮罩 + 恢复滚动/交互
  598. page.evaluate(r"""
  599. () => {
  600. // 第一步:精准清理已知的遮罩/弹窗类名(Element UI框架常用)
  601. const selectors = [
  602. '.v-modal', '.el-overlay', '.el-overlay-dialog', '.el-dialog__wrapper',
  603. '.el-message-box__wrapper', '.el-loading-mask'
  604. ];
  605. selectors.forEach(sel => document.querySelectorAll(sel).forEach(e => e.remove()));
  606. // 泛化兜底:近似全屏 + 高 z-index 的层直接屏蔽
  607. const all = Array.from(document.querySelectorAll('body *'));
  608. for (const el of all) {
  609. const s = getComputedStyle(el); // 获取元素的实际样式(含CSS生效的样式)
  610. const z = parseInt(s.zIndex || '0', 10); // 取元素的层级(z-index),默认0
  611. // 条件1:元素是固定/绝对定位(弹窗/遮罩常见定位方式)+ 层级≥1000(高优先级遮挡)+ 能拦截鼠标事件
  612. if ((s.position === 'fixed' || s.position === 'absolute') && z >= 1000 && s.pointerEvents !== 'none') {
  613. const r = el.getBoundingClientRect(); // 获取元素的尺寸和位置
  614. // 条件2:元素宽度/高度≥屏幕80%(近似全屏遮罩)
  615. const nearFull = r.width >= innerWidth * 0.8 && r.height >= innerHeight * 0.8;
  616. if (nearFull) {
  617. el.style.pointerEvents = 'none'; // 让元素不拦截鼠标点击
  618. el.style.display = 'none'; // 隐藏元素
  619. }
  620. }
  621. }
  622. // 第三步:恢复页面滚动功能(弹窗常把页面设为不可滚动)
  623. document.documentElement.style.overflow = 'auto'; // html标签恢复滚动
  624. document.body.style.overflow = 'auto'; // body标签恢复滚动
  625. document.body.classList.remove('el-popup-parent--hidden'); // 移除Element UI的滚动禁用类
  626. }
  627. """)
  628. logger.info("杀除弹窗成功")
  629. except Exception:
  630. pass
  631. #判断店名是否已经在数据库
  632. def shop_is_exists_database(shop):
  633. query_sql = """
  634. SELECT province, city, business_license_company, qualification_number, business_license_address
  635. FROM retrieve_ybm_shop_info_middle
  636. WHERE shop = %s
  637. """
  638. try:
  639. rows = mysql_pool.select_data(query_sql, (shop,))
  640. result = rows[0] if rows else None
  641. logger.debug("店铺存在校验 shop=%r result=%s", shop, result)
  642. is_exists = bool(result)
  643. if is_exists:
  644. logger.info(f"【店铺存在校验】店铺已存在 | 店铺名:{repr(shop)} | 结果:存在(True)不要执行采集店铺")
  645. else:
  646. logger.info(f"【店铺存在校验】店铺不存在 | 店铺名:{repr(shop)} | 结果:不存在(False)")
  647. return is_exists, result
  648. except Exception as e:
  649. logger.error(f"查询店铺失败:{e}")
  650. return False, None
  651. def insert_shop_info_to_db(shop,contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform, province, city, create_time, update_time):
  652. """
  653. 把字段插入到ybm_shop_info_middle表
  654. :param 各参数: 你要插入的字段值(空字符串也可)
  655. :return: bool - 插入成功返回True,失败返回False
  656. """
  657. sql = """
  658. INSERT INTO retrieve_ybm_shop_info_middle (
  659. shop,
  660. contact_address,
  661. qualification_number,
  662. business_license_company,
  663. business_license_address,
  664. scrape_date,
  665. platform,
  666. province,
  667. city,
  668. create_time,
  669. update_time
  670. ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  671. ON DUPLICATE KEY UPDATE
  672. contact_address = VALUES(contact_address),
  673. qualification_number = VALUES(qualification_number),
  674. business_license_company = VALUES(business_license_company),
  675. business_license_address = VALUES(business_license_address),
  676. scrape_date = VALUES(scrape_date),
  677. platform = VALUES(platform),
  678. province = VALUES(province),
  679. city = VALUES(city),
  680. update_time = VALUES(update_time)
  681. """
  682. params = (
  683. shop,
  684. contact_address,
  685. qualification_number,
  686. business_license_company,
  687. business_license_address,
  688. scrape_date,
  689. platform,
  690. province,
  691. city,
  692. create_time,
  693. update_time,
  694. )
  695. conn = None
  696. cursor = None
  697. try:
  698. conn = mysql_pool.get_conn()
  699. cursor = conn.cursor()
  700. cursor.execute(sql, params)
  701. conn.commit()
  702. logger.info("店铺信息写入成功 shop=%s company=%s", shop, business_license_company)
  703. return True
  704. except Exception as e:
  705. logger.error("插入店铺失败:%s\n%s", e, traceback.format_exc())
  706. if conn:
  707. conn.rollback()
  708. return False
  709. finally:
  710. if cursor:
  711. cursor.close()
  712. if conn:
  713. conn.close()
  714. def insert_single_to_mysql(single_data):
  715. """
  716. 逐条插入单条数据到MySQL数据库
  717. :param single_data: 单条商品数据元组
  718. :return: 插入是否成功
  719. """
  720. conn = None
  721. cursor = None
  722. try:
  723. insert_sql = """
  724. INSERT INTO retrieve_scrape_data(
  725. enterprise_id,
  726. platform_id,
  727. platform_item_id,
  728. province_id,
  729. city_id,
  730. province_name,
  731. city_name,
  732. area_info,
  733. product_brand,
  734. product_name,
  735. product_specs,
  736. one_box_price,
  737. manufacture_date,
  738. expiry_date,
  739. manufacturer,
  740. approval_number,
  741. is_sold_out,
  742. online_posting_count,
  743. continuous_listing_count,
  744. link_url,
  745. store_name,
  746. store_url,
  747. search_name,
  748. collect_config_info,
  749. shipment_province_id,
  750. shipment_province_name,
  751. shipment_city_id,
  752. shipment_city_name,
  753. company_name,
  754. qualification_number,
  755. scrape_date,
  756. min_price,
  757. number,
  758. sales,
  759. inventory,
  760. snapshot_url,
  761. insert_time,
  762. update_time,
  763. collect_equipment_account_id,
  764. collect_region_id,
  765. collect_round
  766. ) VALUES (
  767. %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
  768. %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
  769. %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
  770. %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s
  771. )
  772. """
  773. # 字段值(与SQL占位符顺序严格对应)
  774. values = (
  775. single_data.get("enterprise_id", 0),
  776. single_data.get("platform_id", 9),
  777. single_data.get("platform_item_id", ""),
  778. single_data.get("province_id", ""),
  779. single_data.get("city_id", ""),
  780. single_data.get("province_name", ""),
  781. single_data.get("city_name", ""),
  782. single_data.get("area_info", ""),
  783. single_data.get("product_brand", ""),
  784. single_data.get("product_name", ""),
  785. single_data.get("product_specs", ""),
  786. single_data.get("one_box_price", 0.0),
  787. single_data.get("manufacture_date", ""),
  788. single_data.get("expiry_date", ""),
  789. single_data.get("manufacturer", ""),
  790. single_data.get("approval_number", ""),
  791. single_data.get("is_sold_out", 0),
  792. single_data.get("online_posting_count", ""),
  793. single_data.get("continuous_listing_count", ""),
  794. single_data.get("link_url", ""),
  795. single_data.get("store_name", ""),
  796. single_data.get("store_url", ""),
  797. single_data.get("search_name", ""),
  798. single_data.get("collect_config_info", ""),
  799. single_data.get("shipment_province_id", 0),
  800. single_data.get("shipment_province_name", ""),
  801. single_data.get("shipment_city_id", 0),
  802. single_data.get("shipment_city_name", ""),
  803. single_data.get("company_name", ""),
  804. single_data.get("qualification_number", ""),
  805. single_data.get("scrape_date", ""),
  806. single_data.get("min_price", 0.0),
  807. single_data.get("number", 1),
  808. single_data.get("sales", ""),
  809. single_data.get("inventory", ""),
  810. single_data.get("snapshot_url", ""),
  811. single_data.get("insert_time", datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
  812. single_data.get("update_time", datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
  813. single_data.get("collect_equipment_account_id", ""),
  814. single_data.get("collect_region_id", ""),
  815. single_data.get("collect_round", ""),
  816. )
  817. conn = mysql_pool.get_conn()
  818. cursor = conn.cursor()
  819. cursor.execute(insert_sql, values)
  820. conn.commit()
  821. logger.info("单条数据已入库 product=%s", single_data.get("product_name", "")[:80])
  822. return True
  823. except Exception as e:
  824. logger.error("单条数据插入失败:%s", e)
  825. if conn:
  826. conn.rollback()
  827. return False
  828. finally:
  829. if cursor:
  830. cursor.close()
  831. if conn:
  832. conn.close()
  833. def clean_shop_name(raw_shop_name):
  834. """
  835. 清洗店铺名称:移除无关前缀(如【xx截单】)、多余空格/特殊符号,提取核心店名
  836. :param raw_shop_name: 原始采集的店铺名称字符串
  837. :return: 清洗后的纯店铺名称
  838. """
  839. if not raw_shop_name: #处理空值
  840. return ''
  841. # 步骤1:移除【】/()/[]包裹的所有内容(如【2月13日11点截单】)
  842. # 正则解释:匹配【任意字符】、(任意字符)、[任意字符],并替换为空
  843. pattern = r'【.*?】|\(.*?\)|\[.*?\]'
  844. cleaned = re.sub(pattern, '', raw_shop_name)
  845. # 步骤2:移除首尾空格、换行符,替换中间多余空格为单个空格
  846. cleaned = cleaned.strip().replace('\n', '').replace('\r', '')
  847. cleaned = re.sub(r'\s+', ' ', cleaned)
  848. # 步骤3:兜底处理(若清洗后为空,返回原始值避免空字符串)
  849. return cleaned if cleaned else raw_shop_name
  850. def check_dup_in_biz_db(product_link, discount_price_val, scrape_date,collect_equipment_account_id,collect_region_id,collect_round):
  851. """直接查询业务表是否存在该商品链接+价格"""
  852. log_context = (
  853. f"【去重校验】商品链接:{product_link.strip()} | 价格:{discount_price_val} "
  854. f"采集日期:{scrape_date.strip()}"
  855. )
  856. sql = """
  857. SELECT 1 FROM retrieve_scrape_data
  858. WHERE link_url = %s AND min_price = %s AND scrape_date = %s AND platform_id = %s
  859. AND collect_equipment_account_id = %s AND collect_region_id = %s AND collect_round = %s
  860. LIMIT 1
  861. """
  862. params = (
  863. product_link.strip(),
  864. discount_price_val,
  865. scrape_date.strip(),
  866. 9,
  867. collect_equipment_account_id,
  868. collect_region_id,
  869. collect_round,
  870. )
  871. try:
  872. rows = mysql_pool.select_data(sql, params)
  873. is_dup = bool(rows)
  874. if is_dup:
  875. logger.warning(f"{log_context} - 表中已存在重复记录,跳过本次采集")
  876. else:
  877. logger.info(f"{log_context} - 表中无重复记录,正常采集")
  878. return is_dup
  879. except Exception as e:
  880. logger.error(f"查询业务表去重失败:{str(e)}")
  881. return False
  882. # 压缩图片函数
  883. def compress_image(image_data, max_size=4*1024*1024): # 4MB上限
  884. try:
  885. img = Image.open(BytesIO(image_data))
  886. # 将RGBA模式转为RGB(兼容JPEG)
  887. if img.mode in ('RGBA', 'P'): # P是PNG的调色板模式,也需转换
  888. # 新建白色背景的RGB图片,把透明图贴上去(避免透明区域变黑)
  889. bg_img = Image.new('RGB', img.size, (255, 255, 255))
  890. bg_img.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None)
  891. img = bg_img
  892. # 缩小分辨率(按比例缩到宽≤1000px)
  893. if img.width > 1000:
  894. ratio = 1000 / img.width
  895. new_size = (int(img.width*ratio), int(img.height*ratio))
  896. img = img.resize(new_size, Image.Resampling.LANCZOS)
  897. # 降低质量(JPG)/压缩(PNG)
  898. output = BytesIO()
  899. img.save(output, format='JPEG', quality=80) # quality越小体积越小
  900. compressed_data = output.getvalue()
  901. if len(compressed_data) > max_size:
  902. output2 = BytesIO()
  903. img.save(output2, format='JPEG', quality=60)
  904. compressed_data = output2.getvalue()
  905. return compressed_data
  906. except Exception as e:
  907. logger.debug(f"图片压缩失败:{e}")
  908. return image_data # 压缩失败返回原始数据
  909. def download_image_to_base64(image_url, save_dir = "./download_images"):
  910. """下载网络图片,返回图片二进制数据(BytesIO)"""
  911. try:
  912. if not os.path.exists(save_dir):
  913. os.makedirs(save_dir) # 创建多级目录(比如a/b/c)
  914. print(f"创建本地保存目录:{save_dir}")
  915. except Exception as e:
  916. print(f"创建保存目录失败:{str(e)}")
  917. return None
  918. try:
  919. # 模拟浏览器请求头,避免被服务器拦截
  920. headers = {
  921. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  922. }
  923. response = requests.get(image_url, headers=headers, timeout=15)
  924. response.raise_for_status()
  925. compressed_data = compress_image(response.content)
  926. image_base64 = base64.b64encode(compressed_data).decode("utf-8")
  927. image_data = compressed_data
  928. # 步骤3:提取图片文件名(从URL中截取,避免重复)
  929. # 示例URL:https://xxx.com/123.jpg → 文件名:123.jpg
  930. file_name = image_url.split("/")[-1]
  931. # 处理特殊字符(避免文件名非法)
  932. file_name = file_name.replace("?", "").replace("&", "").replace("=", "")
  933. save_path = os.path.join(save_dir, file_name) # 完整保存路径
  934. # 步骤4:保存图片到本地
  935. with open(save_path, "wb") as f:
  936. f.write(image_data)
  937. print(f"图片已保存到本地:{save_path}")
  938. return image_base64
  939. except requests.exceptions.Timeout:
  940. print(f"下载图片超时:{image_url}")
  941. return None
  942. except requests.exceptions.HTTPError as e:
  943. code = e.response.status_code if e.response is not None else "?"
  944. logger.warning("下载图片 HTTP 错误 url=%s status=%s", image_url, code)
  945. return None
  946. except Exception as e:
  947. print(f"下载图片失败:{str(e)}")
  948. return None
  949. def get_ocr_res(img):
  950. try:
  951. #img地址
  952. print(f'开始识别图片:{img}')
  953. request_url = request_url_config
  954. img_base64 = download_image_to_base64(img)
  955. if not img_base64:
  956. print("图片下载/转Base64失败,终止OCR识别")
  957. return None
  958. # 获取access_token
  959. access_token = get_access_token()
  960. if not access_token:
  961. print("获取access_token失败,无法调用OCR接口")
  962. return None
  963. params = {"image": img_base64}
  964. request_url = request_url + "?access_token=" + access_token
  965. headers = {'content-type': 'application/x-www-form-urlencoded'}
  966. response = requests.post(request_url, data=params, headers=headers)
  967. if response:
  968. res = response.json()
  969. # 检查OCR返回是否有错误
  970. if "error_code" in res:
  971. print(f"百度OCR接口错误:{res['error_msg']}(错误码:{res['error_code']})")
  972. return None
  973. # 解析识别结果
  974. new_dic = dict()
  975. for ite in res['words_result'].keys():
  976. new_dic[ite] = res['words_result'][ite]['words']
  977. print('资质数据信息', new_dic)
  978. return new_dic
  979. else:
  980. print("OCR接口返回空响应")
  981. return None
  982. except requests.exceptions.RequestException as e:
  983. print(f"网络错误(图片下载/OCR请求失败):{str(e)}")
  984. return None
  985. except KeyError as e:
  986. print(f"OCR响应格式异常,缺失字段:{str(e)}")
  987. return None
  988. except Exception as e:
  989. print(f"OCR识别未知错误:{str(e)}")
  990. return None
  991. def get_access_token():
  992. AppKey = AppKey_config
  993. AppSrcret = AppSecret_config
  994. token_url =token_url_config
  995. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  996. payload = ""
  997. headers = {
  998. 'Content-Type': 'application/json',
  999. 'Accept': 'application/json'
  1000. }
  1001. try:
  1002. response = requests.request("POST", url, headers=headers, data=payload)
  1003. response.raise_for_status() # 触发HTTP错误
  1004. return response.json()['access_token']
  1005. except Exception as e:
  1006. print(f"获取access_token失败:{str(e)}")
  1007. return None
  1008. def extract_province_city(address):
  1009. """
  1010. 从地址中提取省份和城市
  1011. :param address: 营业执照地址(如"福建省福州市马尾区")
  1012. :return: (province, city) - 提取到的省份/城市,提取失败返回空字符串
  1013. """
  1014. if not address: # 地址为空,直接返回空
  1015. return "", ""
  1016. # 正则1:匹配省份(兼容省/自治区/直辖市/特别行政区)
  1017. province_pattern = re.compile(r'([^省]+省|.+自治区|北京市|上海市|天津市|重庆市|.+特别行政区)')
  1018. province_match = province_pattern.search(address)
  1019. province = province_match.group(1) if province_match else ""
  1020. # 正则2:匹配城市(兼容市/自治州/地区/盟,且排除省份已匹配的部分)
  1021. # 先去掉已匹配的省份,再匹配城市
  1022. address_remain = address.replace(province, "").strip() if province else address.strip()
  1023. city_pattern = re.compile(r'([^市]+市|.+自治州|.+地区|.+盟|^[^\d区县镇]+)')
  1024. city_match = city_pattern.search(address_remain)
  1025. city = city_match.group(1).strip() if city_match else ""
  1026. # 兼容直辖市(如"北京市朝阳区"→city=北京市)
  1027. if province in ["北京市", "上海市", "天津市", "重庆市"]:
  1028. city = province
  1029. # 兼容地址不规范的情况(如"福建福州马尾区",无"省"/"市"字)
  1030. if not province and not city:
  1031. # 匹配前两个地名(如"福建福州"→province=福建,city=福州)
  1032. simple_pattern = re.compile(r'^([^\d区县镇]+)')
  1033. simple_match = simple_pattern.search(address)
  1034. if simple_match:
  1035. city = simple_match.group(1).strip() # 只有城市,省份留空
  1036. if city and province and city != province and province in city:
  1037. city = city.replace(province, "").strip()
  1038. province, city = normalize_province_city_names(province, city)
  1039. return province.strip(), city.strip()
  1040. #采集数据核心
  1041. def collect_data(store_page, brand, name, keyword, spec, company_id, collect_config_info="", collect_equipment_account_id=1,collect_region_id=1,collect_round=1):
  1042. """
  1043. 1) 先获取当前页商品个数(count)
  1044. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  1045. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  1046. """
  1047. collect_result = []
  1048. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  1049. store_page.wait_for_load_state("networkidle")
  1050. #没有找到商品就跳过这个商品
  1051. page_no = 1
  1052. while True:
  1053. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  1054. # 记录列表页URL(可用于你后续兜底)
  1055. list_page_url = store_page.url
  1056. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  1057. # ✅ 先获取当前页商品个数
  1058. store_page.wait_for_load_state("domcontentloaded") # 先等DOM加载
  1059. store_page.wait_for_load_state("networkidle")
  1060. store_page.wait_for_timeout(500) # 额外等待渲染稳定
  1061. total_limit = store_page.locator(PRODUCT_ITEM_SELECTOR).count()
  1062. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  1063. # 重置当前页的采集计数
  1064. collected_count = 0
  1065. #补充没找到关键词的兜底
  1066. not_found_keywords = store_page.locator("div.filter-panel-container-empty-text")
  1067. if not_found_keywords.count() > 0:
  1068. logger.warning(f"⚠️ 关键词「{keyword}」无匹配商品,直接跳过整个关键词采集")
  1069. return []
  1070. for idx in range(total_limit):
  1071. detail_page = None
  1072. try:
  1073. item = store_page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  1074. collected_count += 1 # 实际采集计数(用于日志)
  1075. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  1076. store_page.wait_for_load_state("networkidle")
  1077. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1078. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  1079. # 1. 初始化所有字段默认值
  1080. title = ""
  1081. shop = ""
  1082. expiry_date = "无有效期"
  1083. manufacture_date = "无生产日期"
  1084. approval_number = "无批准文号"
  1085. manufacturer = "未知公司"
  1086. spec = "未知规格"
  1087. num = 1 # ✅ 默认 1
  1088. platform = '药帮忙'
  1089. current_time = datetime.now().strftime("%Y-%m-%d")
  1090. is_sold_out = 0
  1091. business_license_address = '' #店铺地址为空
  1092. # =========1、 售罄不跳过 =========
  1093. sold_locator = item.locator('div.product-status')
  1094. if sold_locator.count() > 0:
  1095. is_sold_out = 1
  1096. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  1097. #2、提取商品ID
  1098. product_id = ''
  1099. product_id_elem = item.locator('div.product-card[data-product-id]')
  1100. if product_id_elem.count() > 0:
  1101. product_id = product_id_elem.get_attribute("data-product-id")
  1102. logger.info(f"✅ 提取到data-product-id:{product_id}") # 输出:5678955
  1103. else:
  1104. logger.warning("⚠️ 未找到商品ID,使用默认空字符串")
  1105. #3、 提取商品标题(处理空值)
  1106. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  1107. if product_locator.count() > 0:
  1108. title = product_locator.inner_text(timeout=3000).strip()
  1109. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  1110. else:
  1111. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  1112. #筛选非想要的品牌、名称、品规等等。
  1113. if not (brand in title and name in title):
  1114. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含品牌「{brand}」、名称「{name}」、规格「{spec}」,跳过本次循环")
  1115. continue
  1116. #4、 提取价格(带缺失日志)
  1117. price_int = item.locator('//span[@class="price-int"]').text_content().strip()
  1118. # 2. 提取小数部分(注意可能为空,比如价格是整数13)
  1119. price_decimal_elem = item.locator('//span[@class="price-decimal"]')
  1120. if price_decimal_elem.count() > 0:
  1121. price_decimal = price_decimal_elem.text_content().strip()
  1122. else:
  1123. price_decimal = ''
  1124. full_price = f"{price_int}{price_decimal}".strip()
  1125. try:
  1126. full_price_num = float(full_price)
  1127. except ValueError:
  1128. full_price_num = 0.0
  1129. logger.warning(
  1130. " 「%s」第%s个商品「%s」- 列表页价格无法解析 raw=%r,按 0 处理",
  1131. keyword,
  1132. collected_count,
  1133. title,
  1134. full_price,
  1135. )
  1136. logger.info(f"✅ 提取到价格:{full_price_num}")
  1137. # 5. 提取公司名称(带缺失日志)
  1138. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  1139. if manufacturer_locator.count() > 0:
  1140. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  1141. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  1142. else:
  1143. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  1144. #6、提取店铺名称
  1145. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  1146. if shop_locator.count() > 0:
  1147. raw_shop = shop_locator.inner_text(timeout=3000).strip()
  1148. # 2. 清洗店名(核心新增步骤)
  1149. shop = clean_shop_name(raw_shop)
  1150. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  1151. logger.info(f"原始店名:{raw_shop}")
  1152. logger.info(f"清洗后店名:{shop}{'='*10}")
  1153. else:
  1154. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  1155. #7、 提取折扣价
  1156. discount_price_val_origin = ""
  1157. discount_price = ""
  1158. discount_price_locator = item.locator('span[data-v-4cb6cc1f].discount-int').first
  1159. if discount_price_locator.count() > 0:
  1160. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  1161. discount_price_val_origin = discount_price
  1162. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  1163. discount_price_val = float(match.group()) if match else 0.00
  1164. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  1165. else:
  1166. discount_price_val = full_price_num
  1167. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  1168. #8、 提取有效期(处理空值)
  1169. expiry_date_locator = item.locator(f"{PRODUCT_VALIDITY_SELECTOR}")
  1170. if expiry_date_locator.count() > 0:
  1171. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip().replace('-', '') #.replace('近效期','')
  1172. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  1173. else:
  1174. # 修复:替换未定义的i为collected_count
  1175. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  1176. # ========= 模拟点击商品进入详情页 =========
  1177. logger.info(
  1178. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  1179. )
  1180. item.hover() # 先悬停
  1181. random_delay(0.2, 0.5) # 悬停后延迟
  1182. item.dispatch_event("mousedown")
  1183. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1184. item.dispatch_event("mouseup")
  1185. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1186. try:
  1187. with store_page.context.expect_page(timeout=60000) as p:
  1188. item.click(delay=random.uniform(0.1, 0.3))
  1189. detail_page = p.value
  1190. except PlaywrightTimeoutError:
  1191. logger.warning(
  1192. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  1193. )
  1194. detail_page = None # 标记为无新标签页,避免关闭列表页
  1195. # 等待详情加载(优先用新标签页,无则用列表页)
  1196. target_page = detail_page if detail_page else store_page
  1197. target_page.wait_for_load_state("networkidle", timeout=20000)
  1198. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  1199. logger.info(
  1200. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  1201. )
  1202. #点击后:1、获取商品详情页链接
  1203. product_link = target_page.url
  1204. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  1205. # ========= ✅ 去重逻辑,拿商品链接和折扣价和有效期和采集日期 =========
  1206. if check_dup_in_biz_db(
  1207. product_link,
  1208. discount_price_val,
  1209. current_time,
  1210. collect_equipment_account_id,
  1211. collect_region_id,
  1212. collect_round,
  1213. ):
  1214. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过")
  1215. # ========== 关闭新标签页,切回列表页 ==========
  1216. if detail_page and not detail_page.is_closed():
  1217. detail_page.close() # 关闭详情页标签
  1218. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1219. # 切回原列表页(第一个标签页)
  1220. store_page.bring_to_front() # 激活列表页
  1221. store_page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1222. random_delay(0.5, 1.0) # 增加切换后延迟
  1223. store_page.wait_for_load_state("networkidle")
  1224. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1225. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1226. if collected_count % 6 == 0 and collected_count > 0:
  1227. logger.info("采满6个往下滑")
  1228. slow_scroll_400px(store_page)
  1229. store_page.wait_for_load_state("networkidle")
  1230. continue
  1231. #点击后:2、提取生产日期(修复完成)
  1232. manufacture_date_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="生产日期"]]//div[contains(@class, "spec-info-item-value-text")]')
  1233. if manufacture_date_locator.count() > 0:
  1234. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip()
  1235. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  1236. else:
  1237. # 修复:替换未定义的i为collected_count
  1238. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  1239. #详情页: 3、提取批准文号
  1240. approval_number_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="批准文号"]]//div[contains(@class, "spec-info-item-value-text")]')
  1241. if approval_number_locator.count() > 0:
  1242. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  1243. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  1244. else:
  1245. # 修复:替换未定义的i为collected_count
  1246. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  1247. #详情页 4、提取规格
  1248. spec_locator = target_page.locator('//div[contains(@class, "spec-info-item") and .//div[contains(@class, "spec-info-item-label") and normalize-space(.)="规格"]]//div[contains(@class, "spec-info-item-value-text")]')
  1249. if spec_locator.count() > 0:
  1250. spec = spec_locator.inner_text(timeout=3000).strip()
  1251. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  1252. else:
  1253. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1254. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  1255. # input("...")
  1256. #详情页 5、提取库存
  1257. storage = ''
  1258. storage_locator = target_page.locator('[data-v-51f0e85d].detail-input-num-right-title')
  1259. if storage_locator.count() > 0:
  1260. storage = storage_locator.inner_text(timeout=3000).strip()
  1261. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页库存:{storage}{'='*10}")
  1262. else:
  1263. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  1264. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 库存元素数量不足,使用默认值:{storage}")
  1265. #详情页 6、提取销量
  1266. sell = ''
  1267. sell_locator = target_page.locator('div.detail-info-content-item-value-price-top-right div[data-v-95163d4a]',has_text='已售')
  1268. if sell_locator.count() > 0:
  1269. sell = sell_locator.inner_text(timeout=3000).strip()
  1270. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页销量:{sell}{'='*10}")
  1271. else:
  1272. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 没有销量元素,使用默认值:{sell}")
  1273. #详情页 7、保存快照url上传到oss
  1274. oss_url = ""
  1275. try:
  1276. local_path, oss_url = screenshot_target_page_to_local_then_oss(
  1277. target_page=target_page,
  1278. full_page=True # 截取全屏
  1279. )
  1280. logger.info("详情页快照已上传 local=%s oss=%s", local_path, oss_url)
  1281. except Exception as e:
  1282. logger.warning("详情页快照上传失败:%s", e)
  1283. province = ""
  1284. city = ""
  1285. business_license_company = ""
  1286. qualification_number = ''
  1287. shop_exists, shop_info = shop_is_exists_database(shop)
  1288. shop_page = None
  1289. store_url = ''
  1290. #店铺名不是药品预约中心且店铺名不在数据库就要点击
  1291. if shop != "药店品种预约中心" and not shop_exists:
  1292. logger.info("店铺名不是药店品种预约中心且数据库没有该公司的营业执照")
  1293. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1294. entershop_btn = target_page.locator('div[data-v-5485589c].shop-info-container-left-info')
  1295. # 增强:先等待进入店铺按钮可见
  1296. entershop_btn.wait_for(state="visible", timeout=10000)
  1297. entershop_btn.scroll_into_view_if_needed() # 确保按钮在视口内
  1298. entershop_btn.hover() # 先悬停
  1299. random_delay(0.2, 0.5) # 悬停后延迟
  1300. with target_page.expect_popup(timeout=15000) as pop:
  1301. entershop_btn.click()
  1302. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1303. shop_page = pop.value
  1304. shop_page.wait_for_load_state("domcontentloaded") # 比 networkidle 更
  1305. #在这里获取店铺链接
  1306. store_url = shop_page.url # 获取店铺链接
  1307. logger.info(f"📌 获取到店铺链接:{store_url}")
  1308. #点击店铺资质
  1309. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1310. shop_license_page = shop_page.locator('//div[contains(@class, "shop-info-container-right-btns-item") and contains(span, "资质/售后")]')
  1311. shop_license_page.wait_for(state="attached", timeout=15000) # 等待元素加载完成
  1312. shop_license_page.scroll_into_view_if_needed() # 确保在视口内
  1313. shop_license_page.hover() # 先悬停
  1314. random_delay(0.2, 0.5) # 悬停后延迟
  1315. shop_license_page.click()
  1316. random_delay(0.05, 0.15) # 鼠标按下后延迟
  1317. random_delay(0.05, 0.1) # 鼠标松开后延迟
  1318. shop_page.wait_for_load_state("networkidle")
  1319. #获取药品经营许可证图片
  1320. shop_page.wait_for_load_state("load")
  1321. ocr_res = None
  1322. shop_license_img = shop_page.locator('//span[contains(text(), "企业营业执照") or contains(text(), "营业执照(正本)")]/ancestor::div[@class="shop-info-drawer-zz-tab1-list-item"]/img').first
  1323. shop_license_img.wait_for(state="visible", timeout=60000)
  1324. shop_license_src = None
  1325. try:
  1326. if shop_license_img.count() > 0:
  1327. shop_license_src = shop_license_img.get_attribute('src')
  1328. shop_license_src = shop_license_src.strip() if shop_license_src else None
  1329. ocr_res = get_ocr_res(shop_license_src)
  1330. else:
  1331. shop_license_src = None
  1332. except Exception as e:
  1333. logger.warning(f"提取营业执照图片src失败:{e}")
  1334. shop_license_src = None
  1335. logger.debug("营业执照图片链接:%s", shop_license_src)
  1336. contact_address = ''
  1337. qualification_number = ocr_res.get('社会信用代码', '') if ocr_res else ''
  1338. business_license_company = ocr_res.get('单位名称', '') if ocr_res else ''
  1339. business_license_address = ocr_res.get('地址', '') if ocr_res else ''
  1340. # scrape_date = ''
  1341. # 调用提取函数,获取省份和城市
  1342. province, city = extract_province_city(business_license_address)
  1343. logger.info(f"原始地址:{business_license_address}")
  1344. logger.info(f"提取的省份:{province} | 城市:{city}")
  1345. insert_result = insert_shop_info_to_db(
  1346. shop=shop,
  1347. contact_address=store_url, #改为店铺链接,到时可以从数据库获取
  1348. qualification_number=qualification_number,
  1349. business_license_company=business_license_company,
  1350. business_license_address=business_license_address,
  1351. scrape_date=current_time,
  1352. platform=platform,
  1353. province=province,
  1354. city=city,
  1355. create_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
  1356. update_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  1357. )
  1358. else:
  1359. logger.info("数据库有该店名,在数据库拿取对应字段填充ybm_drug_middle表")
  1360. province, city = "", ""
  1361. business_license_company, qualification_number = "", ""
  1362. business_license_address = ""
  1363. if shop_info:
  1364. province = shop_info["province"]
  1365. city = shop_info["city"]
  1366. business_license_company = shop_info["business_license_company"]
  1367. qualification_number = shop_info["qualification_number"]
  1368. business_license_address = shop_info.get("business_license_address", "") or ""
  1369. try:
  1370. if shop_page and not shop_page.is_closed():
  1371. random_delay(4,8)
  1372. shop_page.close()
  1373. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭店铺页标签 shop_page")
  1374. except Exception as e:
  1375. logger.warning(f"⚠️ 关闭 shop_page 失败:{e}")
  1376. random_delay(5,8)
  1377. # ========== 关闭新标签页,切回列表页 ==========
  1378. if detail_page and not detail_page.is_closed():
  1379. detail_page.close() # 关闭详情页标签
  1380. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  1381. # 切回原列表页(第一个标签页)
  1382. store_page.bring_to_front() # 激活列表页
  1383. store_page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  1384. random_delay(0.5, 1.0) # 增加切换后延迟
  1385. store_page.wait_for_load_state("networkidle")
  1386. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1387. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  1388. random_delay(2,4)
  1389. # credit_code = ""
  1390. availability = ""
  1391. # input(".....")
  1392. # 调用函数获取省市ID(修复:传入province和city变量)
  1393. province_id, city_id = get_province_city_ids(province, city)
  1394. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  1395. # 构造单条数据元组(适配MySQL字段)
  1396. single_data = {
  1397. # 核心商品信息
  1398. "enterprise_id": company_id,
  1399. 'platform_id': 9,
  1400. 'platform_item_id': product_id,
  1401. 'province_id': province_id,
  1402. 'city_id': city_id,
  1403. 'province_name': province,
  1404. 'city_name': city,
  1405. 'area_info': business_license_address,
  1406. 'product_brand': brand, # 品牌
  1407. "product_name": keyword, # 搜索商品名称
  1408. 'product_specs': spec, # 规格
  1409. # "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  1410. "one_box_price": discount_price_val, # 最低价格
  1411. "manufacture_date": manufacture_date, # 生产日期
  1412. "expiry_date": expiry_date, # 有效期
  1413. "manufacturer": manufacturer, # 生产厂家
  1414. "approval_number": approval_number, # 批准文号
  1415. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  1416. 'online_posting_count': 1,
  1417. 'continuous_listing_count': 1,
  1418. 'link_url': product_link,
  1419. "store_name": shop, # 店铺名称
  1420. 'store_url': store_url, # 店铺链接
  1421. "search_name": keyword, # 搜索关键词
  1422. "collect_config_info": collect_config_info, # 采集配置 JSON 字符串
  1423. 'shipment_province_id': 0,
  1424. 'shipment_province_name': '',
  1425. 'shipment_city_id': 0,
  1426. 'shipment_city_name': '',
  1427. "company_name": business_license_company, # 营业执照主体(公司名称)
  1428. "qualification_number": qualification_number, # 统一信用代码(如有可补充提取)
  1429. "scrape_date": current_time, # 采集日期
  1430. "min_price": discount_price_val, # 最低价格
  1431. "number": num, # 数量(盒数)
  1432. "sales": sell, #销量
  1433. "inventory": storage, #库存
  1434. "snapshot_url": oss_url, #快照链接
  1435. "insert_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 创建时间
  1436. "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  1437. "collect_equipment_account_id":collect_equipment_account_id,
  1438. "collect_region_id":collect_region_id,
  1439. "collect_round":collect_round,
  1440. #可能会用到
  1441. # "province": province, # 省份
  1442. # "city": city, # 城市
  1443. # "scrape_province": "", # 采集省份(可留空或根据IP获取)
  1444. # "availability": availability, # 库存状态
  1445. #暂时用不到
  1446. # "platform": platform, # 平台名称(固定或动态获取)
  1447. # "search_key": keyword, # 搜索关键词
  1448. }
  1449. # 调用逐条插入函数
  1450. insert_single_to_mysql(single_data)
  1451. collect_result.append(single_data)
  1452. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  1453. # input("....")
  1454. except Exception as e:
  1455. # 异常处理:关闭详情页,强制切回列表页
  1456. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  1457. try:
  1458. if detail_page and not detail_page.is_closed():
  1459. detail_page.close()
  1460. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  1461. if store_page and not store_page.is_closed():
  1462. store_page.bring_to_front() # 切回列表页
  1463. store_page.wait_for_load_state("networkidle")
  1464. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  1465. except Exception as e2:
  1466. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  1467. continue
  1468. if collected_count % 6 == 0 and collected_count > 0 and collected_count != total_limit:
  1469. logger.info("采满6个往下滑")
  1470. slow_scroll_400px(store_page,)
  1471. store_page.wait_for_load_state("networkidle")
  1472. # ====== 当前页采集完毕,尝试翻页 ======
  1473. delay = random_delay(1.5, 3.0)
  1474. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  1475. if goto_next_page(store_page):
  1476. logger.info(f"「{keyword}」还有下一页")
  1477. page_no += 1
  1478. store_page.wait_for_load_state("networkidle")
  1479. total_limit = store_page.locator(PRODUCT_ITEM_SELECTOR).count()
  1480. logger.info(f"📌 「{keyword}」第{page_no}页 商品个数更新为:{total_limit}")
  1481. continue
  1482. else:
  1483. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  1484. break
  1485. # 关键词采集完成后长延迟
  1486. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  1487. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  1488. return collect_result
  1489. # ==================== 主函数(登录+批量搜索) ====================
  def main():
      """Run one collection task end to end: log in, search, collect, report.

      Steps, in order:
        1. Load the city mapping and launch Chrome through Playwright with
           anti-detection launch flags and an init script.
        2. Reuse saved cookies; when the session is not logged in, perform the
           login flow and persist the fresh cookies.
        3. Fetch a single task from the database, report its start, search for
           ``brand + name`` and collect the result pages.
        4. Report the task end and send the Feishu notification.

      Once ``report_start`` has returned, one closing report is always
      attempted: on the success path with the collected row count, otherwise
      (search failure or an exception) from the ``finally`` block with the
      current ``success`` / ``real_count`` values, so a started task is not
      left without an end report.
      NOTE(review): assumes the reporting backend expects ``report_end`` with
      ``success=False`` for failed runs -- confirm against ``report_end``.

      Returns:
          None. Exceptions raised inside the task workflow are logged, not
          propagated; the browser is closed in every case.
      """
      # Function-scope import so the block does not depend on the file header.
      from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

      # Load the city mapping once at start-up.
      load_city_mapping()
      logger.info("\n" + "=" * 50)
      logger.info("🚀 药帮忙采集程序启动")
      logger.info(f"⏰ 启动时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
      logger.info("=" * 50)

      with sync_playwright() as p:
          browser = p.chromium.launch(
              channel="chrome",  # drive the real Chrome build
              slow_mo=random.randint(100, 300),  # global per-action delay (human-like pacing)
              args=[
                  "--disable-blink-features=AutomationControlled",  # drop the webdriver automation feature
                  "--enable-automation=false",  # disable the automation marker
                  "--disable-infobars",  # hide info bars
                  "--remote-debugging-port=0",  # random debugging port
                  "--start-maximized",  # maximised window
                  "--disable-extensions",  # no extensions
                  "--disable-plugins-discovery",  # no plugin discovery
                  "--no-sandbox",  # avoid sandbox-mode fingerprint
                  "--disable-dev-shm-usage",  # avoid /dev/shm memory-limit failures
                  # UA with a random Chrome major version.
                  # NOTE(review): the context below also sets a fixed Chrome/120
                  # user_agent -- confirm which of the two is meant to apply.
                  f"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(110, 120)}.0.0.0 Safari/537.36",
              ],
          )
          # Browser context with a disguised fingerprint.
          context = browser.new_context(
              locale="zh-CN",  # Chinese locale
              timezone_id="Asia/Shanghai",  # Shanghai time zone
              geolocation={"latitude": 31.230416, "longitude": 121.473701},  # simulated Shanghai position
              permissions=["geolocation"],  # grant the geolocation permission
              user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
              viewport={"width": 1800, "height": 1000},
              java_script_enabled=True,
              bypass_csp=True,
          )
          page = context.new_page()
          # Init script: masks navigator.webdriver and fakes a few navigator /
          # window properties before page scripts run.
          page.add_init_script("""
              Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
              Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] }); // 新增:模拟插件
              Object.defineProperty(navigator, 'mimeTypes', { get: () => [1, 2, 3] }); // 新增:模拟MIME类型
              window.chrome = { runtime: {}, loadTimes: () => ({}) }; // 增强Chrome模拟
              delete window.navigator.languages;
              window.navigator.languages = ['zh-CN', 'zh'];
              // 新增:模拟真实鼠标移动特征
              (() => {
                  const originalAddEventListener = EventTarget.prototype.addEventListener;
                  EventTarget.prototype.addEventListener = function(type, listener) {
                      if (type === 'mousemove') {
                          return originalAddEventListener.call(this, type, (e) => {
                              e._automation = undefined;
                              listener(e);
                          });
                      }
                      return originalAddEventListener.call(this, type, listener);
                  };
              })();
          """)

          # Closing-report bookkeeping, read by the ``finally`` block.
          task_started = False   # True once report_start has been sent
          end_reported = False   # True once a closing report has been attempted
          success = False
          real_count = 0
          try:
              # ---------- Cookie reuse ----------
              # 1. Load locally saved cookies.
              load_cookies(context)
              # 2. Check the login state.
              if not is_login(page):
                  # 3. Cookies missing or expired: run the login flow.
                  page.goto(TARGET_LOGIN_URL)
                  page.wait_for_load_state("networkidle")
                  logger.info("🔑 开始执行登录流程")
                  login_success = login_operation(page, USERNAME, PASSWORD)
                  if not login_success:
                      logger.error(" 登录失败,程序终止")
                      return
                  # 4. Persist the cookies after a successful login.
                  save_cookies(context)
                  logger.info(" 登录并保存Cookie成功!")

              # ---------- Task configuration ----------
              task = get_search_keywords_from_db(include_collect_config=True)
              if not task:
                  logger.error("未获取到任何任务,程序退出")
                  return

              task_id = task['id']
              brand = task['product_brand']
              name = task["product_name"]
              sampling_cycle = task["sampling_cycle"]
              sampling_start_time = task["sampling_start_time"]
              sampling_end_time = task["sampling_end_time"]
              company_id = task["company_id"]
              spec = task["product_specs"]
              collect_equipment_account_id = task["collect_equipment_account_id"]
              collect_region_id = task["collect_region_id"]
              collect_round = task["collect_round"]
              # Treat a missing brand/name as empty instead of failing on None + str.
              keyword = f"{brand or ''}{name or ''}"
              collect_config_info = json.dumps({
                  "sampling_cycle": sampling_cycle,
                  "sampling_start_time": sampling_start_time,
                  "sampling_end_time": sampling_end_time,
              }, ensure_ascii=False, default=str)
              logger.info("\n=====================================")
              logger.info(f"开始处理任务 {task_id},公司ID:{company_id},关键词:{keyword}")
              logger.info("=====================================")

              # ---------- First report: task start (keyword only) ----------
              report_start(task_id, keyword)
              logger.info("上报状态")
              start_ts = int(time.time())  # start timestamp used by the closing report
              task_started = True

              popup_guard(page, "before_search")
              store_page, search_success = search_operation(page, keyword, is_first_search=True)
              if store_page is not None:
                  popup_guard(store_page, "after_search")
              if store_page is None or not search_success:
                  logger.warning(f" 「{keyword}」搜索失败,跳过采集")
                  return

              # Let the result page settle. networkidle is not always reached,
              # so a timeout there is logged and collection continues on the
              # already-loaded DOM instead of aborting the task.
              store_page.wait_for_load_state("domcontentloaded")
              try:
                  store_page.wait_for_load_state('networkidle')
              except PlaywrightTimeoutError:
                  logger.warning(f" 「{keyword}」结果页等待 networkidle 超时,继续采集")

              # ---------- Collect ----------
              data_list = collect_data(
                  store_page, brand, name, keyword, spec, company_id,
                  collect_config_info, collect_equipment_account_id,
                  collect_region_id, collect_round,
              )
              real_count = len(data_list)
              success = True
              logger.info(f"关键词「{keyword}」采集完成,共 {real_count} 条数据")

              # ---------- Second report: task end (all fields) ----------
              # Flag first so the finally block never sends a second report.
              end_reported = True
              _report_task_end(task_id, keyword, company_id, spec, name,
                               success, real_count, start_ts)
              logger.info("任务 %s 处理完成,入库条数:%s", task_id, real_count)
          except Exception as e:
              logger.exception("程序异常:%s", e)
          finally:
              if task_started and not end_reported:
                  # The task was announced but never closed (search failure or
                  # exception): send the closing report with the current state.
                  try:
                      _report_task_end(task_id, keyword, company_id, spec, name,
                                       success, real_count, start_ts)
                  except Exception as report_err:
                      # Best effort: a reporting failure must not block browser cleanup.
                      logger.exception("任务 %s 结束上报失败:%s", task_id, report_err)
              browser.close()
              logger.info(" 浏览器已关闭,程序结束")


  def _report_task_end(task_id, keyword, company_id, spec, drug_name,
                       success, real_count, start_ts):
      """Send the closing task report, then the Feishu notification."""
      report_end(task_id, keyword, company_id, success=success,
                 real_count=real_count, start_ts=start_ts)
      notify_feishu_after_report_end(
          task_id=task_id,
          keyword=keyword,
          company_id=company_id,
          success=success,
          spec=spec,
          real_count=real_count,
          start_ts=start_ts,
          platform_name="药帮忙",
          drug_name=drug_name,
      )
  1637. # ==================== 程序入口 ====================
  if __name__ == "__main__":
      # Start a collection run only when this file is executed as a script.
      main()