test.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. def collect_data(page, keyword):
  2. """
  3. 1) 先获取当前页商品个数(count)
  4. 2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
  5. 3) 当前页循环完 -> goto_next_page;有下一页继续;无下一页结束该关键词
  6. """
  7. collect_result = []
  8. # seen = set()
  9. logger.info(f"📊 开始采集「{keyword}」的商品数据")
  10. page.wait_for_load_state("networkidle")
  11. #没有找到商品就跳过这个商品
  12. page_no = 1
  13. while True:
  14. logger.info(f"\n📄 「{keyword}」开始采集第 {page_no} 页")
  15. # 记录列表页URL(可用于你后续兜底)
  16. list_page_url = page.url
  17. logger.info(f"📌 已记录商品列表页URL:{list_page_url}")
  18. # ✅ 先获取当前页商品个数
  19. page.wait_for_load_state("networkidle")
  20. total_limit = page.locator(PRODUCT_ITEM_SELECTOR).count()
  21. logger.info(f"📌 「{keyword}」第{page_no}页 初始商品个数(count):{total_limit}")
  22. # 重置当前页的采集计数
  23. collected_count = 0
  24. # ========= 初始化无匹配计数器(记录标题不包含核心关键词的次数) =========
  25. # no_match_count = 0 # 无匹配次数初始化为0
  26. # MAX_NO_MATCH = 10 # 最大无匹配次数阈值
  27. #补充没找到关键词的兜底
  28. not_found_keywords = page.locator("span:has-text('新品登记')")
  29. if not_found_keywords.count() > 0:
  30. logger.warning(f"⚠️ 关键词「{keyword}」无匹配商品,直接跳过整个关键词采集")
  31. return []
  32. for idx in range(total_limit):
  33. detail_page = None
  34. try:
  35. item = page.locator(PRODUCT_ITEM_SELECTOR).nth(idx)
  36. collected_count += 1 # 实际采集计数(用于日志)
  37. # ========= 反爬随机延迟(保留你的原逻辑也行) =========
  38. page.wait_for_load_state("networkidle")
  39. delay = random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  40. logger.info(f"📌 「{keyword}」第{page_no}页 第{collected_count}/{total_limit}个商品 - 等待{delay:.2f}秒后采集(反爬)")
  41. # 1. 初始化所有字段默认值
  42. title = "无标题"
  43. price = "0.00"
  44. shop = "无店名"
  45. expiry_date = "无有效期"
  46. manufacture_date = "无生产日期"
  47. approval_number = "无批准文号"
  48. manufacturer = "未知公司"
  49. # discount_price = "0.00"
  50. spec = "未知规格"
  51. num = 1 # ✅ 默认 1
  52. platform = '药九九'
  53. current_time = datetime.now().strftime("%Y-%m-%d")
  54. is_sold_out = 0
  55. # ========= 售罄不跳过 =========
  56. sold_locator = item.locator('div[data-v-480da687].gc-l1-cirle_tip')
  57. if sold_locator.count() > 0:
  58. is_sold_out = 1
  59. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品已售罄")
  60. # if collected_count % 5 == 0 and collected_count > 0:
  61. # logger.info("采满5个往下滑")
  62. # slow_scroll_400px(page)
  63. # page.wait_for_load_state("networkidle")
  64. # continue
  65. # 提取商品标题(处理空值)
  66. product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
  67. if product_locator.count() > 0:
  68. title = product_locator.inner_text(timeout=3000).strip()
  69. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页标题:{title}{'='*10}")
  70. else:
  71. logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
  72. #关键词不在标题中,跳过当前商品
  73. # core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
  74. # if core_keyword not in title:
  75. # no_match_count += 1
  76. # logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含核心关键词「{core_keyword}」(无匹配次数:{no_match_count}/{MAX_NO_MATCH}),跳过本次循环")
  77. # continue
  78. # if no_match_count >= MAX_NO_MATCH:
  79. # logger.error(f"❌ 关键词「{keyword}」无匹配商品次数已达{MAX_NO_MATCH}次,直接终止当前关键词采集,进入下一个关键词")
  80. # return []
  81. # 提取价格(带缺失日志)
  82. price_locator = item.locator(PRODUCT_PRICE_SELECTOR).nth(0)
  83. if price_locator.count() > 0:
  84. price = price_locator.inner_text(timeout=3000).strip()
  85. logger.info(f"{'='*10}{keyword}」第{collected_count}个商品 - 列表页采购价格:{price}{'='*10}")
  86. else:
  87. price = "0.00" # 初始化默认值,避免后续报错
  88. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页采购价格元素未找到,使用默认值:{price}")
  89. # 5. 提取公司名称(带缺失日志)
  90. manufacturer_locator = item.locator(PRODUCT_COMPANY_SELECTOR)
  91. if manufacturer_locator.count() > 0:
  92. manufacturer = manufacturer_locator.inner_text(timeout=3000).strip()
  93. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页公司名:{manufacturer}{'='*10}")
  94. else:
  95. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页公司名称元素未找到,使用默认值:{manufacturer}")
  96. #提取店铺名称
  97. shop_locator = item.locator(PRODUCT_STORE_SELECTOR)
  98. if shop_locator.count() > 0:
  99. shop = shop_locator.inner_text(timeout=3000).strip()
  100. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 列表页店名:{shop}{'='*10}")
  101. else:
  102. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 列表页店铺名称元素未找到,使用默认值:{shop}")
  103. #提取折扣价
  104. discount_price_val_origin = ""
  105. discount_price = ""
  106. discount_price_locator = item.locator('span[data-v-480da687].gc-l2-discount_price').first
  107. if discount_price_locator.count() > 0:
  108. discount_price = discount_price_locator.inner_text(timeout=3000).strip()
  109. discount_price_val_origin = discount_price
  110. match = re.search(r'\d+\.?\d*', str(discount_price_val_origin))
  111. discount_price_val = float(match.group()) if match else 0.00
  112. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页折扣价:{discount_price_val}{'='*10}")
  113. else:
  114. #如果没有拿原价替换
  115. price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "") else "0.00"
  116. discount_price_val = float(price)
  117. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 折扣价元素未找到,使用采购价兜底:{discount_price_val}")
  118. merged_price = f"{price}{discount_price_val_origin}" if discount_price_val_origin else price
  119. # ========= 模拟点击商品进入详情页 =========
  120. logger.info(
  121. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 模拟鼠标移动并点击"
  122. )
  123. # 点击商品项容器,触发详情展示
  124. # ========== 点击商品跳详情页 ==========
  125. # 反爬:模拟真人鼠标移动到商品上再点击(不是直接点击)
  126. logger.info(f"📌 「{keyword}」第{collected_count}个商品「{title}」- 模拟鼠标移动并点击")
  127. item.hover() # 先悬停
  128. random_delay(0.2, 0.5) # 悬停后延迟
  129. item.dispatch_event("mousedown")
  130. random_delay(0.05, 0.15) # 鼠标按下后延迟
  131. item.dispatch_event("mouseup")
  132. random_delay(0.05, 0.1) # 鼠标松开后延迟
  133. try:
  134. with page.context.expect_page(timeout=60000) as p:
  135. item.click(delay=random.uniform(0.1, 0.3))
  136. detail_page = p.value
  137. except PlaywrightTimeoutError:
  138. logger.warning(
  139. f" 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 未检测到新标签页,使用当前页采集详情"
  140. )
  141. detail_page = None # 标记为无新标签页,避免关闭列表页
  142. # 等待详情加载(优先用新标签页,无则用列表页)
  143. target_page = detail_page if detail_page else page
  144. target_page.wait_for_load_state("networkidle", timeout=20000)
  145. delay = random_delay(MIN_PAGE_DELAY, MAX_PAGE_DELAY)
  146. logger.info(
  147. f"📌 「{keyword}」第{page_no}页 第{collected_count}个商品「{title}」- 详情页加载完成,等待{delay:.2f}秒(反爬)"
  148. )
  149. # 反爬:检测详情页反爬验证
  150. # check_anti_crawl(page)
  151. # ========== 采集详情页的专属信息(有效期/生产日期/批准文号) ==========
  152. #获取商品详情页链接
  153. product_link = target_page.url
  154. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页链接:{product_link}{'='*10}")
  155. # ========= ✅ 去重逻辑,拿商品链接和折扣价 =========
  156. if check_dup_in_biz_db(product_link, discount_price_val):
  157. logger.warning(f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过")
  158. # ========== 关闭新标签页,切回列表页 ==========
  159. if detail_page and not detail_page.is_closed():
  160. detail_page.close() # 关闭详情页标签
  161. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  162. # 切回原列表页(第一个标签页)
  163. page.bring_to_front() # 激活列表页
  164. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  165. random_delay(0.5, 1.0) # 增加切换后延迟
  166. page.wait_for_load_state("networkidle")
  167. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  168. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  169. if collected_count % 5 == 0 and collected_count > 0:
  170. logger.info("采满5个往下滑")
  171. slow_scroll_400px(page)
  172. page.wait_for_load_state("networkidle")
  173. continue
  174. # key = f"{product_link.strip()}|{discount_price_val}"
  175. # if key in seen:
  176. # logger.warning(
  177. # f" 「{keyword}」第{page_no}页 第{collected_count}个商品(重复):{title},跳过"
  178. # )
  179. # if collected_count % 5 == 0 and collected_count > 0:
  180. # logger.info("采满15个往下滑")
  181. # slow_scroll_400px(page)
  182. # page.wait_for_load_state("networkidle")
  183. # continue
  184. # seen.add(key)
  185. # 提取有效期(处理空值)
  186. expiry_date_locator = target_page.locator("//span[contains(text(), '有效期')]/following-sibling::span[contains(@class, 'gdb-desc-value4')]")
  187. if expiry_date_locator.count() > 0:
  188. expiry_date = expiry_date_locator.inner_text(timeout=3000).strip().replace('-', '') #.replace('近效期','')
  189. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页有效期:{expiry_date}{'='*10}")
  190. else:
  191. # 修复:替换未定义的i为collected_count
  192. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 有效期元素未找到,使用默认值:{expiry_date}")
  193. # 提取生产日期(修复完成)
  194. manufacture_date_locator = target_page.locator("//span[@class='gdb-desc-label' and text()='生产日期']/following-sibling::span[1]")
  195. if manufacture_date_locator.count() > 0:
  196. manufacture_date = manufacture_date_locator.inner_text(timeout=3000).strip().replace('-', "")
  197. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页生产日期:{manufacture_date}{'='*10}")
  198. else:
  199. # 修复:替换未定义的i为collected_count
  200. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 生产日期元素未找到,使用默认值:{manufacture_date}")
  201. # 提取批准文号
  202. approval_number_locator = target_page.locator("//span[contains(text(), '国药准字')]").first
  203. if approval_number_locator.count() > 0:
  204. approval_number = approval_number_locator.inner_text(timeout=3000).strip()
  205. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页批准文号:{approval_number}{'='*10}")
  206. else:
  207. # 修复:替换未定义的i为collected_count
  208. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 批准文号元素未找到,使用默认值:{approval_number}")
  209. #提取规格
  210. spec_locator = target_page.locator('span.gddd-params_text_line_1[title]')
  211. if spec_locator.count() > 0:
  212. spec = spec_locator.nth(2).inner_text(timeout=3000).strip()
  213. logger.info(f"{'='*10}「{keyword}」第{collected_count}个商品 - 详情页规格:{spec}{'='*10}")
  214. else:
  215. # 修复:替换未定义的i为collected_count,补充规格数量不足的提示
  216. logger.warning(f" 「{keyword}」第{collected_count}个商品「{title}」- 规格元素数量不足,使用默认值:{spec}")
  217. # input("...")
  218. # if shop_is_exists_database(shop):
  219. # continue
  220. # province = ""
  221. # city = ""
  222. # business_license_company = ""
  223. # qualification_number = ''
  224. if shop != "药品预约中心" and not shop_is_exists_database(shop):
  225. # 获取营业执照图片 li[data-v-4f79abe8].nth(2)
  226. # 进入店铺
  227. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  228. entershop_btn = target_page.locator('[data-v-c5790f48].btn-text')
  229. # 增强:先等待进入店铺按钮可见
  230. entershop_btn.wait_for(state="visible", timeout=10000)
  231. entershop_btn.scroll_into_view_if_needed() # 确保按钮在视口内
  232. entershop_btn.hover() # 先悬停
  233. random_delay(0.2, 0.5) # 悬停后延迟
  234. entershop_btn.click()
  235. # entershop_btn.dispatch_event("mousedown")
  236. random_delay(0.05, 0.15) # 鼠标按下后延迟
  237. # entershop_btn.dispatch_event("mouseup")
  238. random_delay(0.05, 0.1) # 鼠标松开后延迟
  239. target_page.wait_for_load_state("domcontentloaded") # 等DOM加载(比networkidle更适合页面内切换)
  240. #点击店铺资质
  241. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  242. shop_license_page = target_page.locator('li:has-text("店铺资质")')
  243. shop_license_page.wait_for(state="visible", timeout=10000) # 等待元素加载完成
  244. shop_license_page.hover() # 先悬停
  245. random_delay(0.2, 0.5) # 悬停后延迟
  246. # shop_license_page.dispatch_event("mousedown")
  247. shop_license_page.click()
  248. random_delay(0.05, 0.15) # 鼠标按下后延迟
  249. # shop_license_page.dispatch_event("mouseup")
  250. random_delay(0.05, 0.1) # 鼠标松开后延迟
  251. target_page.wait_for_load_state("networkidle")
  252. slow_scroll_400px(target_page, scroll_distance1=700)
  253. #获取药品经营许可证图片
  254. target_page.wait_for_load_state("load")
  255. ocr_res = None
  256. shop_license_div = target_page.locator('div[data-v-7f7214f6].shop-licensesImg').nth(0)
  257. shop_license_div.wait_for(state="attached", timeout=60000)
  258. shop_license_img = shop_license_div.locator('img')
  259. try:
  260. if shop_license_img.count() > 0:
  261. shop_license_src = shop_license_img.get_attribute('src')
  262. shop_license_src = shop_license_src.strip() if shop_license_src else None
  263. ocr_res = get_ocr_res(shop_license_src)
  264. # print(f'ocr_res:{ocr_res}')
  265. else:
  266. shop_license_src = None
  267. except Exception as e:
  268. # 捕获定位/提取失败的异常,避免程序崩溃
  269. logger.warning(f"提取营业执照图片src失败:{e}")
  270. shop_license_src = None
  271. print("营业执照图片链接:", shop_license_src)
  272. # input("..")
  273. contact_address = ''
  274. qualification_number = ocr_res.get('社会信用代码', '') if ocr_res else ''
  275. business_license_company = ocr_res.get('单位名称', '') if ocr_res else ''
  276. business_license_address = ocr_res.get('地址', '') if ocr_res else ''
  277. # scrape_date = ''
  278. # 调用提取函数,获取省份和城市
  279. province, city = extract_province_city(business_license_address)
  280. logger.info(f"原始地址:{business_license_address}")
  281. logger.info(f"提取的省份:{province} | 城市:{city}")
  282. insert_result = insert_shop_info_to_db(
  283. shop=shop,
  284. contact_address=contact_address,
  285. qualification_number=qualification_number,
  286. business_license_company=business_license_company,
  287. business_license_address=business_license_address,
  288. scrape_date=current_time,
  289. platform=platform,
  290. province=province,
  291. city=city,
  292. create_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
  293. update_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  294. )
  295. # purchase_price = float(price.replace("¥", "").replace(",", "")) if price.replace("¥", "").replace(",", "").replace(".", "").isdigit() else 0.00
  296. # ========== 关闭新标签页,切回列表页 ==========
  297. if detail_page and not detail_page.is_closed():
  298. detail_page.close() # 关闭详情页标签
  299. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 已关闭详情页标签页")
  300. # 切回原列表页(第一个标签页)
  301. page.bring_to_front() # 激活列表页
  302. page.mouse.move(random.randint(100, 300), random.randint(200, 400)) # 随机移动鼠标
  303. random_delay(0.5, 1.0) # 增加切换后延迟
  304. page.wait_for_load_state("networkidle")
  305. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  306. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」- 已切回列表页")
  307. # credit_code = ""
  308. availability = ""
  309. # 组装单条数据(仅新增生产日期/批准文号字段,原有字段顺序/逻辑不变)
  310. # 构造单条数据元组(适配MySQL字段)
  311. single_data = {
  312. # 核心商品信息
  313. "product": title, # 商品名称
  314. "my_good_price": merged_price, # 自定义价格(可与min_price相同或单独提取)
  315. "min_price": discount_price_val, # 最低价格
  316. "manufacture_date": manufacture_date, # 生产日期
  317. "expiry_date": expiry_date, # 有效期
  318. "shop": shop, # 店铺名
  319. "business_license_company": business_license_company, # 营业执照主体(公司名称)
  320. "province": province, # 省份
  321. "city": city, # 城市
  322. "manufacturer": manufacturer, # 生产厂家
  323. "specification": spec, # 规格
  324. "approval_number": approval_number, # 批准文号
  325. "product_link": product_link, # 商品链接
  326. "scrape_date": current_time, # 采集日期
  327. "scrape_province": "", # 采集省份(可留空或根据IP获取)
  328. "availability": availability, # 库存状态
  329. "credit_code": qualification_number, # 统一信用代码(如有可补充提取)
  330. "platform": platform, # 平台名称(固定或动态获取)
  331. "search_key": keyword, # 搜索关键词
  332. "number": num, # 数量(盒数)
  333. "is_sold_out": is_sold_out, # 售罄标记(0/1)
  334. "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # 更新时间
  335. "create_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") # 创建时间
  336. }
  337. # 调用逐条插入函数
  338. insert_single_to_mysql(single_data)
  339. collect_result.append(single_data)
  340. logger.info(f" 「{keyword}」第{collected_count}个商品「{title}」采集完成")
  341. except Exception as e:
  342. # 异常处理:关闭详情页,强制切回列表页
  343. logger.exception(f" 「{keyword}」第{collected_count}个商品采集核心异常:{str(e)}")
  344. try:
  345. if detail_page and not detail_page.is_closed():
  346. detail_page.close()
  347. logger.info(f"📌 「{keyword}」第{collected_count}个商品 - 异常时关闭详情页标签页")
  348. if page and not page.is_closed():
  349. page.bring_to_front() # 切回列表页
  350. page.wait_for_load_state("networkidle")
  351. random_delay(MIN_CLICK_DELAY, MAX_CLICK_DELAY)
  352. except Exception as e2:
  353. logger.error(f" 「{keyword}」第{collected_count}个商品详情采集异常(处理时):{str(e2)},原异常:{str(e)}")
  354. continue
  355. # ✅ 每15次滚动一次(修复:用collected_count,且排除0的情况)
  356. if collected_count % 5 == 0 and collected_count > 0 and collected_count != total_limit:
  357. logger.info("采满5个往下滑")
  358. slow_scroll_400px(page,)
  359. page.wait_for_load_state("networkidle")
  360. # ====== 当前页采集完毕,尝试翻页 ======
  361. delay = random_delay(1.5, 3.0)
  362. logger.info(f"⏳ 翻页前随机等待 {delay:.2f}s(反爬)")
  363. if goto_next_page(page):
  364. page_no += 1
  365. continue
  366. else:
  367. logger.info(f" 「{keyword}」已无下一页,关键词采集结束")
  368. break
  369. # 关键词采集完成后长延迟
  370. long_delay = random_delay(MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY)
  371. logger.info(f" 「{keyword}」采集完成,共{len(collect_result)}条数据,等待{long_delay:.2f}秒后继续下一个关键词(反爬)")
  372. return collect_result
  373. def check_dup_in_biz_db(product_link, discount_price_val):
  374. """直接查询业务表是否存在该商品链接+价格"""
  375. conn = None
  376. cursor = None
  377. try:
  378. conn = pymysql.connect(**MYSQL_CONFIG)
  379. cursor = conn.cursor()
  380. sql = """
  381. SELECT * FROM yjj_drug_middle
  382. WHERE product_link = %s AND min_price = %s
  383. """
  384. # 先执行查询
  385. cursor.execute(sql, (product_link.strip(), discount_price_val))
  386. # 再判断是否有结果
  387. # 如果 fetchone() 返回元组(比如(1,))→ (1,) is not None → 结果为 True;
  388. # 如果 fetchone() 返回 None → None is not None → 结果为 False。
  389. is_dup = cursor.fetchone() is not None
  390. if is_dup:
  391. logger.debug(f"【去重校验】商品链接:{product_link} | 价格:{discount_price_val} - 表中已存在重复,跳过本次采集")
  392. else:
  393. logger.debug(f"【去重校验】商品链接:{product_link} | 价格:{discount_price_val} - 表中无重复,正常采集")
  394. return is_dup
  395. except Exception as e:
  396. logger.error(f"查询业务表去重失败:{str(e)}")
  397. return False
  398. finally:
  399. if cursor:
  400. cursor.close()
  401. if conn:
  402. conn.close()
  403. #判断店名是否已经在数据库
  404. def shop_is_exists_database(shop):
  405. try:
  406. conn = pymysql.connect(**MYSQL_CONFIG)
  407. cursor = conn.cursor()
  408. query_sql = """
  409. SELECT * FROM yjj_shop_info_middle
  410. WHERE shop = %s
  411. """
  412. cursor.execute(query_sql, (shop,))
  413. result = cursor.fetchone()
  414. # 正确的调试方式(替代cursor._last_executed)
  415. print(f"【调试】传入的店铺名:{repr(shop)}") # repr能显示空格/隐藏字符
  416. print(f"【调试】查询参数:{shop}")
  417. print(f"【调试】查询结果:{result} → 函数返回:{bool(result)}")
  418. is_exists = bool(result)
  419. if is_exists:
  420. logger.info(f"【店铺存在校验】店铺已存在 | 店铺名:{repr(shop)} | 结果:存在(True),跳过本次循环")
  421. else:
  422. logger.info(f"【店铺存在校验】店铺不存在 | 店铺名:{repr(shop)} | 结果:不存在(False)")
  423. return is_exists
  424. except Exception as e:
  425. print(f"MySQL 错误: {str(e)}")
  426. return False # 异常时明确返回False,避免返回None
  427. finally:
  428. # 修复:关闭游标和连接,避免泄露
  429. if cursor:
  430. cursor.close()
  431. if conn:
  432. conn.close()