Kaynağa Gözat

药师帮增加品牌判断

zhuoyuncheng 1 hafta önce
ebeveyn
işleme
408f7305d0

+ 31 - 18
spiders/yaoshibang/ysb_snapshot_list_crawl.py

@@ -412,25 +412,36 @@ class YaoShiBangSnapshot:
         logger.error("登录后未检测到目标店铺名,登录可能失败")
         return False
 
-    def _take_snapshot(self, upload_key, image_ele):
+    def _take_snapshot(self, upload_key, image_ele, max_retries=3):
         """在当前页面截图并上传,不再重复跳转。"""
-        time.sleep(1)
         self._dismiss_popup_before_screenshot()
-        try:
-            jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
-            if not jpg_bytes:
-                logger.warning("截图为空 upload_key=%s", upload_key)
-                return ""
-            img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
-        except Exception:
-            logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
-            return ""
-        if not img_url:
-            logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
-            return ""
-        logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
-        time.sleep(random.uniform(1, 2))
-        return img_url
+        for attempt in range(1, max_retries + 1):
+            time.sleep(1)
+            try:
+                jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
+                if not jpg_bytes:
+                    logger.warning(
+                        "截图为空 upload_key=%s attempt=%s/%s",
+                        upload_key, attempt, max_retries,)
+                    continue
+                img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
+            except Exception:
+                logger.exception(
+                    "截图或 OSS 上传失败 upload_key=%s attempt=%s/%s",
+                    upload_key, attempt, max_retries,
+                )
+                continue
+            if not img_url:
+                logger.warning(
+                    "OSS 未返回有效地址 upload_key=%s attempt=%s/%s",
+                    upload_key, attempt, max_retries,
+                )
+                continue
+            logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
+            time.sleep(random.uniform(1, 2))
+            return img_url
+        logger.warning("截图失败,已达最大重试次数 upload_key=%s", upload_key)
+        return ""
 
     def _human_click(self, element):
         """在目标节点上触发 click,避免 move_to + 无目标 actions.click() 因布局位移点到商品链接触发详情页。"""
@@ -524,6 +535,7 @@ class YaoShiBangSnapshot:
         if not shop_name:
             shop_name = item.get("abbreviation", "")
 
+        brand = item.get("brand","")
         product = {
             "platform": self.platform,
             "item_id": item_id,
@@ -552,7 +564,7 @@ class YaoShiBangSnapshot:
             "update_time": now,
             "insert_time": now,
             "number": 1,
-            "product_brand": self.brand or "",
+            "product_brand": brand,
             "collect_task_id": self.collect_task_id,
             "search_name": self.product,
             "company_name": "",
@@ -662,6 +674,7 @@ class YaoShiBangSnapshot:
             self.search()
         except Exception as e:
             logger.exception("运行异常: %s", e)
+            self.success=False
         finally:
             self._quit_browser()
         return self.pipeline.crawl_count, self.success

+ 67 - 42
spiders/yaoshibang/ysbang_crawl.py

@@ -6,10 +6,13 @@ import re
 import secrets
 import string
 import time
+import token
 import zlib
 from datetime import datetime, timedelta
 import requests
 from Crypto.Cipher import AES
+from openpyxl.worksheet import page
+
 from commons.Logger import get_spider_logger
 from pipelines.drug_pipelines import DrugPipeline
 from area_info.city_name_to_id import get_city
@@ -35,9 +38,10 @@ class YsbSpider:
             self.get_product_data()
         self.is_success = True
         self.db_online = MySQLPoolOnline()
+        self.is_product_count = 0
 
     def get_token(self, _retry_login=False):
-        sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=1 order by `cookie_timestamp` asc limit 1 """
+        sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=3 order by `cookie_timestamp` asc limit 1 """
         account_list = self.db_online.select_data(sql_account)
         if not account_list:
             logger.error("无可用爬取账号")
@@ -133,51 +137,54 @@ class YsbSpider:
         }
 
     def build_base_payload(self, page=1):
         """Build the JSON body for one search request.

         The search keyword is ``self.product``; when ``self.brand`` is set it
         is prefixed as ``"<brand> <product>"``, and when ``self.product_desc``
         is set it is appended directly, with no separator.

         Args:
             page: Value stored under the ``page`` key. Defaults to 1.

         Returns:
             dict: A new payload dict; ``self.token`` is stored under ``token``.
         """
         # `page` is an explicit argument on purpose: without a local of that
         # name, the bare identifier resolves to the `openpyxl.worksheet.page`
         # module imported at file level, which is not a page number and cannot
         # be JSON-serialized.
         keyword = self.product
         if self.brand:
             keyword = self.brand + " " + self.product
         if self.product_desc:
             keyword = keyword + self.product_desc

         date_str = time.strftime("%Y-%m-%d %H:%M")
         return {
             'platform': 'pc',
             'version': '6.1.10',
             'ua': 'Chrome148',
             # NOTE(review): only the leading timestamp is generated; the two
             # trailing timestamps are fixed literals - confirm the server
             # does not validate them.
             'ex': f'{date_str} https://dian.ysbang.cn os=Windows 10 indexContent 05-25 11:20:53 05-27 17:27:01',
             'trafficType': 1,
             'ex1': '',
             'o': '',
             'lastClick': -1,
             'page': page,
             'pagesize': '60',
             'classify_id': '',
             'searchkey': keyword,
             'onlyTcm': 0,
             'operationtype': 1,
             'qualifiedLoanee': 0,
             'drugId': -1,
             'tagId': '',
             'showRecentlyPurchasedFlag': True,
             'onlySimpleLoan': 0,
             'sn': '',
             'buttons': [],
             'buttonList': [],
             'synonymId': 0,
             'activityTypes': [],
             'provider_filter': '',
             'factoryNames': '',
             'tcmGradeNames': [],
             'tcmExeStandardIds': [],
             'specs': '',
             'deliverFloor': 0,
             'purchaseLimitFloor': 0,
             'nextRequestKey': '',
             'adConfigId': 0,
             'stateValue': '',
             'filterLeyoProvider': False,
             'firstSearch': False,
             'token': self.token,
         }
 
     def get_price(self, price_token):
         pattern = re.compile(r'(?<!\d)(\d+\.\d{2})(?!\d)')
@@ -220,6 +227,7 @@ class YsbSpider:
         if not shop_name:
             shop_name = item.get("abbreviation", "")
 
+        brand = item.get("brand","")
         product = {
             "platform": self.platform,
             "item_id": item_id,
@@ -248,7 +256,7 @@ class YsbSpider:
             "update_time": now,
             "insert_time": now,
             "number": 1,
-            "product_brand": self.brand or "",
+            "product_brand": brand,
             "collect_task_id": self.collect_task_id,
             "search_name": self.product,
             "company_name": "",
@@ -342,6 +350,7 @@ class YsbSpider:
                     response = None
                     time.sleep(10)
             if not response or response.status_code != 200:
+                self.is_success = False
                 logger.error("第%s页请求失败,停止爬取", page)
                 return
 
@@ -349,14 +358,17 @@ class YsbSpider:
                 data_json = response.json()
             except json.JSONDecodeError:
                 logger.exception("第%s页响应不是合法 JSON", page)
+                self.is_success = False
                 return
+
             data_block = data_json.get("data") or {}
-            if data_json.get("message", "") == "该操作需要登录":
+            if data_json.get("message", "") in ["该操作需要登录","需要前端行为验证!"] :
                 logger.info("登录账号中。。。")
                 YaoShiBangLogin().run()
                 time.sleep(10)
                 if not self.get_token():
                     logger.error("登录后仍未从库中读到有效 Token,停止重试")
+                    self.is_success = False
                     return
                 logger.info("token 已刷新,重试第 %s 页", page)
                 continue
@@ -370,7 +382,8 @@ class YsbSpider:
                 json_data = self.decrypt_payload(encrypted_o)
             except Exception as e:
                 logger.exception("第%s页解密失败: %s", page, e)
-                continue
+                self.is_success = False
+                return
             wholesales = json_data.get("wholesales", [])
             if not wholesales:
                 logger.info(f"第{page}页无数据,停止")
@@ -388,7 +401,17 @@ class YsbSpider:
                 product = self.to_product(item, type_data)
                 if not product.get("item_id"):
                     continue
+                title = product.get("product_name","")
+                if self.brand not in title:
+                    self.is_product_count +=1
+                if self.product not in title:
+                    self.is_product_count +=1
+                    continue
+                if self.product in title and self.brand in title:
+                    self.is_product_count = 0
 
+                if self.is_product_count >=20:
+                    return
                 try:
                     self.pipeline.storge_data(product)
                     logger.info("%s", json.dumps(product, ensure_ascii=False))
@@ -404,6 +427,8 @@ class YsbSpider:
         try:
             self.search_data()
         except Exception as e:
+
+            self.is_success = False
             logger.error(e)
 
         logger.info(f"爬取总数{self.pipeline.crawl_count}")