1 hafta önce · 408f7305d0
--- a/spiders/yaoshibang/ysb_snapshot_list_crawl.py
+++ b/spiders/yaoshibang/ysb_snapshot_list_crawl.py
@@ -412,25 +412,36 @@ class YaoShiBangSnapshot:
 
				         logger.error("登录后未检测到目标店铺名，登录可能失败")
			
 
				         return False
			
 
				 
			
 
				-    def _take_snapshot(self, upload_key, image_ele):
			
 
				+    def _take_snapshot(self, upload_key, image_ele, max_retries=3):  # max_retries: how many screenshot+upload attempts are made before giving up
			
 
				         """在当前页面截图并上传，不再重复跳转。"""
			
 
				-        time.sleep(1)
			
 
				         self._dismiss_popup_before_screenshot()
			
 
				-        try:
			
 
				-            jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
			
 
				-            if not jpg_bytes:
			
 
				-                logger.warning("截图为空 upload_key=%s", upload_key)
			
 
				-                return ""
			
 
				-            img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
			
 
				-        except Exception:
			
 
				-            logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
			
 
				-            return ""
			
 
				-        if not img_url:
			
 
				-            logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
			
 
				-            return ""
			
 
				-        logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
			
 
				-        time.sleep(random.uniform(1, 2))
			
 
				-        return img_url
			
 
				+        for attempt in range(1, max_retries + 1):  # attempts are numbered from 1 so the logs read "attempt=1/3"
			
 
				+            time.sleep(1)  # fixed 1s pause before every attempt, including the first
			
 
				+            try:
			
 
				+                jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
			
 
				+                if not jpg_bytes:
			
 
				+                    logger.warning(
			
 
				+                        "截图为空 upload_key=%s attempt=%s/%s",
			
 
				+                        upload_key, attempt, max_retries,)
			
 
				+                    continue  # empty screenshot: go to the next attempt
			
 
				+                img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
			
 
				+            except Exception:
			
 
				+                logger.exception(
			
 
				+                    "截图或 OSS 上传失败 upload_key=%s attempt=%s/%s",
			
 
				+                    upload_key, attempt, max_retries,
			
 
				+                )
			
 
				+                continue  # failure already logged with traceback: go to the next attempt
			
 
				+            if not img_url:
			
 
				+                logger.warning(
			
 
				+                    "OSS 未返回有效地址 upload_key=%s attempt=%s/%s",
			
 
				+                    upload_key, attempt, max_retries,
			
 
				+                )
			
 
				+                continue  # uploader returned a falsy URL: go to the next attempt
			
 
				+            logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
			
 
				+            time.sleep(random.uniform(1, 2))  # random 1-2s pause after a successful upload
			
 
				+            return img_url  # success: hand back the URL returned by the uploader
			
 
				+        logger.warning("截图失败，已达最大重试次数 upload_key=%s", upload_key)
			
 
				+        return ""  # every attempt failed (or max_retries < 1): empty string, as the old code returned on failure
			
 
				 
			
 
				     def _human_click(self, element):
			
 
				         """在目标节点上触发 click，避免 move_to + 无目标 actions.click() 因布局位移点到商品链接触发详情页。"""
			
@@ -524,6 +535,7 @@ class YaoShiBangSnapshot:
 
				         if not shop_name:
			
 
				             shop_name = item.get("abbreviation", "")
			
 
				 
			
 
				+        brand = item.get("brand","")
			
 
				         product = {
			
 
				             "platform": self.platform,
			
 
				             "item_id": item_id,
			
@@ -552,7 +564,7 @@ class YaoShiBangSnapshot:
 
				             "update_time": now,
			
 
				             "insert_time": now,
			
 
				             "number": 1,
			
 
				-            "product_brand": self.brand or "",
			
 
				+            "product_brand": brand,
			
 
				             "collect_task_id": self.collect_task_id,
			
 
				             "search_name": self.product,
			
 
				             "company_name": "",
			
@@ -662,6 +674,7 @@ class YaoShiBangSnapshot:
 
				             self.search()
			
 
				         except Exception as e:
			
 
				             logger.exception("运行异常: %s", e)
			
 
				+            self.success=False
			
 
				         finally:
			
 
				             self._quit_browser()
			
 
				         return self.pipeline.crawl_count, self.success
			
--- a/spiders/yaoshibang/ysbang_crawl.py
+++ b/spiders/yaoshibang/ysbang_crawl.py
@@ -6,10 +6,13 @@ import re
 
				 import secrets
			
 
				 import string
			
 
				 import time
			
 
				+import token
			
 
				 import zlib
			
 
				 from datetime import datetime, timedelta
			
 
				 import requests
			
 
				 from Crypto.Cipher import AES
			
 
				+from openpyxl.worksheet import page
			
 
				+
			
 
				 from commons.Logger import get_spider_logger
			
 
				 from pipelines.drug_pipelines import DrugPipeline
			
 
				 from area_info.city_name_to_id import get_city
			
@@ -35,9 +38,10 @@ class YsbSpider:
 
				             self.get_product_data()
			
 
				         self.is_success = True
			
 
				         self.db_online = MySQLPoolOnline()
			
 
				+        self.is_product_count = 0
			
 
				 
			
 
				     def get_token(self, _retry_login=False):
			
 
				-        sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=1 order by `cookie_timestamp` asc limit 1 """
			
 
				+        sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=3 order by `cookie_timestamp` asc limit 1 """
			
 
				         account_list = self.db_online.select_data(sql_account)
			
 
				         if not account_list:
			
 
				             logger.error("无可用爬取账号")
			
@@ -133,51 +137,54 @@ class YsbSpider:
 
				         }
			
 
				 
			
 
				     def build_base_payload(self):
			
 
				+        """Return the base request payload dict: the search key built from brand/product/product_desc plus fixed fields and self.token."""
			
 
				         keyword = self.product
			
 
				         if self.brand:
			
 
				             keyword = self.brand + " " + self.product
			
 
				         if self.product_desc:
			
 
				             keyword = keyword + self.product_desc
			
 
				 
			
 
				-        date_str = time.strftime("%Y-%m-%d %H:%M:%S")
			
 
				-        return {
			
 
				-            "platform": "pc",
			
 
				-            "version": "6.0.0",
			
 
				-            "ua": "Chrome146",
			
 
				-            'ex': '{} drugInfo {} {}'.format(self.start_date, date_str, date_str),
			
 
				-            "trafficType": 1,
			
 
				-            "ex1": "",
			
 
				-            "o": "",
			
 
				-            "lastClick": -1,
			
 
				-            "page": 1,
			
 
				-            "pagesize": "60",
			
 
				-            "classify_id": "",
			
 
				-            "searchkey": keyword,
			
 
				-            "onlyTcm": 0,
			
 
				-            "operationtype": 1,
			
 
				-            "qualifiedLoanee": 0,
			
 
				-            "drugId": -1,
			
 
				-            "tagId": "",
			
 
				-            "showRecentlyPurchasedFlag": True,
			
 
				-            "onlySimpleLoan": 0,
			
 
				-            "sn": "",
			
 
				-            "buttons": [],
			
 
				-            "buttonList": [],
			
 
				-            "synonymId": 0,
			
 
				-            "activityTypes": [],
			
 
				-            "provider_filter": "",
			
 
				-            "factoryNames": "",
			
 
				-            "tcmGradeNames": [],
			
 
				-            "tcmExeStandardIds": [],
			
 
				-            "specs": "",
			
 
				-            "deliverFloor": 0,
			
 
				-            "purchaseLimitFloor": 0,
			
 
				-            "nextRequestKey": "",
			
 
				-            "adConfigId": 0,
			
 
				-            "stateValue": "",
			
 
				-            "firstSearch": True,
			
 
				-            "token": self.token,
			
 
				+        date_str = time.strftime("%Y-%m-%d %H:%M")
			
 
				+        json_data = {
			
 
				+            'platform': 'pc',
			
 
				+            'version': '6.1.10',
			
 
				+            'ua': 'Chrome148',
			
 
				+            'ex': f'{date_str} https://dian.ysbang.cn os=Windows 10 indexContent 05-25 11:20:53 05-27 17:27:01',  # NOTE(review): the two trailing timestamps are hard-coded literals - confirm the server accepts stale values
			
 
				+            'trafficType': 1,
			
 
				+            'ex1': '',
			
 
				+            'o': '',
			
 
				+            'lastClick': -1,
			
 
				+            'page': 1,  # first page by default, as before; a bare `page` here resolves to the imported openpyxl module, not a page number
			
 
				+            'pagesize': '60',
			
 
				+            'classify_id': '',
			
 
				+            'searchkey': keyword,
			
 
				+            'onlyTcm': 0,
			
 
				+            'operationtype': 1,
			
 
				+            'qualifiedLoanee': 0,
			
 
				+            'drugId': -1,
			
 
				+            'tagId': '',
			
 
				+            'showRecentlyPurchasedFlag': True,
			
 
				+            'onlySimpleLoan': 0,
			
 
				+            'sn': '',
			
 
				+            'buttons': [],
			
 
				+            'buttonList': [],
			
 
				+            'synonymId': 0,
			
 
				+            'activityTypes': [],
			
 
				+            'provider_filter': '',
			
 
				+            'factoryNames': '',
			
 
				+            'tcmGradeNames': [],
			
 
				+            'tcmExeStandardIds': [],
			
 
				+            'specs': '',
			
 
				+            'deliverFloor': 0,
			
 
				+            'purchaseLimitFloor': 0,
			
 
				+            'nextRequestKey': '',
			
 
				+            'adConfigId': 0,
			
 
				+            'stateValue': '',
			
 
				+            'filterLeyoProvider': False,
			
 
				+            'firstSearch': False,
			
 
				+            'token': self.token,
			
 
				         }
			
 
				+        return json_data
			
 
				 
			
 
				     def get_price(self, price_token):
			
 
				         pattern = re.compile(r'(?<!\d)(\d+\.\d{2})(?!\d)')
			
@@ -220,6 +227,7 @@ class YsbSpider:
 
				         if not shop_name:
			
 
				             shop_name = item.get("abbreviation", "")
			
 
				 
			
 
				+        brand = item.get("brand","")
			
 
				         product = {
			
 
				             "platform": self.platform,
			
 
				             "item_id": item_id,
			
@@ -248,7 +256,7 @@ class YsbSpider:
 
				             "update_time": now,
			
 
				             "insert_time": now,
			
 
				             "number": 1,
			
 
				-            "product_brand": self.brand or "",
			
 
				+            "product_brand": brand,
			
 
				             "collect_task_id": self.collect_task_id,
			
 
				             "search_name": self.product,
			
 
				             "company_name": "",
			
@@ -342,6 +350,7 @@ class YsbSpider:
 
				                     response = None
			
 
				                     time.sleep(10)
			
 
				             if not response or response.status_code != 200:
			
 
				+                self.is_success = False
			
 
				                 logger.error("第%s页请求失败，停止爬取", page)
			
 
				                 return
			
 
				 
			
@@ -349,14 +358,17 @@ class YsbSpider:
 
				                 data_json = response.json()
			
 
				             except json.JSONDecodeError:
			
 
				                 logger.exception("第%s页响应不是合法 JSON", page)
			
 
				+                self.is_success = False
			
 
				                 return
			
 
				+
			
 
				             data_block = data_json.get("data") or {}
			
 
				-            if data_json.get("message", "") == "该操作需要登录":
			
 
				+            if data_json.get("message", "") in ["该操作需要登录","需要前端行为验证！"] :
			
 
				                 logger.info("登录账号中。。。")
			
 
				                 YaoShiBangLogin().run()
			
 
				                 time.sleep(10)
			
 
				                 if not self.get_token():
			
 
				                     logger.error("登录后仍未从库中读到有效 Token，停止重试")
			
 
				+                    self.is_success = False
			
 
				                     return
			
 
				                 logger.info("token 已刷新，重试第 %s 页", page)
			
 
				                 continue
			
@@ -370,7 +382,8 @@ class YsbSpider:
 
				                 json_data = self.decrypt_payload(encrypted_o)
			
 
				             except Exception as e:
			
 
				                 logger.exception("第%s页解密失败: %s", page, e)
			
 
				-                continue
			
 
				+                self.is_success = False
			
 
				+                return
			
 
				             wholesales = json_data.get("wholesales", [])
			
 
				             if not wholesales:
			
 
				                 logger.info(f"第{page}页无数据，停止")
			
@@ -388,7 +401,17 @@ class YsbSpider:
 
				                 product = self.to_product(item, type_data)
			
 
				                 if not product.get("item_id"):
			
 
				                     continue
			
 
				+                title = product.get("product_name","")
			
 
				+                if self.brand not in title:
			
 
				+                    self.is_product_count +=1
			
 
				+                if self.product not in title:
			
 
				+                    self.is_product_count +=1
			
 
				+                    continue
			
 
				+                if self.product in title and self.brand in title:
			
 
				+                    self.is_product_count = 0
			
 
				 
			
 
				+                if self.is_product_count >=20:
			
 
				+                    return
			
 
				                 try:
			
 
				                     self.pipeline.storge_data(product)
			
 
				                     logger.info("%s", json.dumps(product, ensure_ascii=False))
			
@@ -404,6 +427,8 @@ class YsbSpider:
 
				         try:
			
 
				             self.search_data()
			
 
				         except Exception as e:
			
 
				+
			
 
				+            self.is_success = False
			
 
				             logger.error(e)
			
 
				 
			
 
				         logger.info(f"爬取总数{self.pipeline.crawl_count}")