|
|
@@ -6,10 +6,13 @@ import re
|
|
|
import secrets
|
|
|
import string
|
|
|
import time
|
|
|
+import token
|
|
|
import zlib
|
|
|
from datetime import datetime, timedelta
|
|
|
import requests
|
|
|
from Crypto.Cipher import AES
|
|
|
+from openpyxl.worksheet import page
|
|
|
+
|
|
|
from commons.Logger import get_spider_logger
|
|
|
from pipelines.drug_pipelines import DrugPipeline
|
|
|
from area_info.city_name_to_id import get_city
|
|
|
@@ -35,9 +38,10 @@ class YsbSpider:
|
|
|
self.get_product_data()
|
|
|
self.is_success = True
|
|
|
self.db_online = MySQLPoolOnline()
|
|
|
+ self.is_product_count = 0
|
|
|
|
|
|
def get_token(self, _retry_login=False):
|
|
|
- sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=1 order by `cookie_timestamp` asc limit 1 """
|
|
|
+ sql_account = f""" select `name`,`cookie_str` from `accounts_platform` where `platform`=5 and `status`=1 and `equipment_id`=3 order by `cookie_timestamp` asc limit 1 """
|
|
|
account_list = self.db_online.select_data(sql_account)
|
|
|
if not account_list:
|
|
|
logger.error("无可用爬取账号")
|
|
|
@@ -133,51 +137,54 @@ class YsbSpider:
|
|
|
}
|
|
|
|
|
|
def build_base_payload(self, page=1):
    """Assemble the base request payload carrying the search keyword and token.

    The search keyword is ``self.product``, prefixed with ``self.brand`` and a
    space when a brand is set, and suffixed (without a separator) with
    ``self.product_desc`` when a description is set.

    Args:
        page: Value stored under the ``'page'`` key. Defaults to ``1``.
            Without this parameter the bare name ``page`` inside the dict
            would resolve to a module-level name instead of a page number,
            which is not a valid (or JSON-serializable) payload value.

    Returns:
        dict: A new payload dictionary on every call, so callers may mutate
        it freely.
    """
    keyword = self.product
    if self.brand:
        keyword = self.brand + " " + self.product
    if self.product_desc:
        keyword = keyword + self.product_desc

    # Minute-resolution local timestamp embedded at the start of 'ex'.
    date_str = time.strftime("%Y-%m-%d %H:%M")
    json_data = {
        'platform': 'pc',
        'version': '6.1.10',
        'ua': 'Chrome148',
        # NOTE(review): the trailing "05-25 11:20:53 05-27 17:27:01" values are
        # fixed literals, presumably copied from a captured request - confirm
        # the server does not expect them to track the current session.
        'ex': f'{date_str} https://dian.ysbang.cn os=Windows 10 indexContent 05-25 11:20:53 05-27 17:27:01',
        'trafficType': 1,
        'ex1': '',
        'o': '',
        'lastClick': -1,
        'page': page,
        'pagesize': '60',
        'classify_id': '',
        'searchkey': keyword,
        'onlyTcm': 0,
        'operationtype': 1,
        'qualifiedLoanee': 0,
        'drugId': -1,
        'tagId': '',
        'showRecentlyPurchasedFlag': True,
        'onlySimpleLoan': 0,
        'sn': '',
        'buttons': [],
        'buttonList': [],
        'synonymId': 0,
        'activityTypes': [],
        'provider_filter': '',
        'factoryNames': '',
        'tcmGradeNames': [],
        'tcmExeStandardIds': [],
        'specs': '',
        'deliverFloor': 0,
        'purchaseLimitFloor': 0,
        'nextRequestKey': '',
        'adConfigId': 0,
        'stateValue': '',
        'filterLeyoProvider': False,
        'firstSearch': False,
        'token': self.token,
    }
    return json_data
|
|
|
|
|
|
def get_price(self, price_token):
|
|
|
pattern = re.compile(r'(?<!\d)(\d+\.\d{2})(?!\d)')
|
|
|
@@ -220,6 +227,7 @@ class YsbSpider:
|
|
|
if not shop_name:
|
|
|
shop_name = item.get("abbreviation", "")
|
|
|
|
|
|
+ brand = item.get("brand","")
|
|
|
product = {
|
|
|
"platform": self.platform,
|
|
|
"item_id": item_id,
|
|
|
@@ -248,7 +256,7 @@ class YsbSpider:
|
|
|
"update_time": now,
|
|
|
"insert_time": now,
|
|
|
"number": 1,
|
|
|
- "product_brand": self.brand or "",
|
|
|
+ "product_brand": brand,
|
|
|
"collect_task_id": self.collect_task_id,
|
|
|
"search_name": self.product,
|
|
|
"company_name": "",
|
|
|
@@ -342,6 +350,7 @@ class YsbSpider:
|
|
|
response = None
|
|
|
time.sleep(10)
|
|
|
if not response or response.status_code != 200:
|
|
|
+ self.is_success = False
|
|
|
logger.error("第%s页请求失败,停止爬取", page)
|
|
|
return
|
|
|
|
|
|
@@ -349,14 +358,17 @@ class YsbSpider:
|
|
|
data_json = response.json()
|
|
|
except json.JSONDecodeError:
|
|
|
logger.exception("第%s页响应不是合法 JSON", page)
|
|
|
+ self.is_success = False
|
|
|
return
|
|
|
+
|
|
|
data_block = data_json.get("data") or {}
|
|
|
- if data_json.get("message", "") == "该操作需要登录":
|
|
|
+ if data_json.get("message", "") in ["该操作需要登录","需要前端行为验证!"] :
|
|
|
logger.info("登录账号中。。。")
|
|
|
YaoShiBangLogin().run()
|
|
|
time.sleep(10)
|
|
|
if not self.get_token():
|
|
|
logger.error("登录后仍未从库中读到有效 Token,停止重试")
|
|
|
+ self.is_success = False
|
|
|
return
|
|
|
logger.info("token 已刷新,重试第 %s 页", page)
|
|
|
continue
|
|
|
@@ -370,7 +382,8 @@ class YsbSpider:
|
|
|
json_data = self.decrypt_payload(encrypted_o)
|
|
|
except Exception as e:
|
|
|
logger.exception("第%s页解密失败: %s", page, e)
|
|
|
- continue
|
|
|
+ self.is_success = False
|
|
|
+ return
|
|
|
wholesales = json_data.get("wholesales", [])
|
|
|
if not wholesales:
|
|
|
logger.info(f"第{page}页无数据,停止")
|
|
|
@@ -388,7 +401,17 @@ class YsbSpider:
|
|
|
product = self.to_product(item, type_data)
|
|
|
if not product.get("item_id"):
|
|
|
continue
|
|
|
+ title = product.get("product_name","")
|
|
|
+ if self.brand not in title:
|
|
|
+ self.is_product_count +=1
|
|
|
+ if self.product not in title:
|
|
|
+ self.is_product_count +=1
|
|
|
+ continue
|
|
|
+ if self.product in title and self.brand in title:
|
|
|
+ self.is_product_count = 0
|
|
|
|
|
|
+ if self.is_product_count >=20:
|
|
|
+ return
|
|
|
try:
|
|
|
self.pipeline.storge_data(product)
|
|
|
logger.info("%s", json.dumps(product, ensure_ascii=False))
|
|
|
@@ -404,6 +427,8 @@ class YsbSpider:
|
|
|
try:
|
|
|
self.search_data()
|
|
|
except Exception as e:
|
|
|
+
|
|
|
+ self.is_success = False
|
|
|
logger.error(e)
|
|
|
|
|
|
logger.info(f"爬取总数{self.pipeline.crawl_count}")
|