| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- import json
- import requests
- import re
- import time
- import base64
- from commons.conn_mysql import MySQLPoolOnline
- from area_info.city_name_to_id import get_city_2
- from pipelines.shop_pipelines import ShopPipeline
# Browser-like request headers; GetStore.get_img_info passes them to the
# dian.ysbang.cn provider-details request.
headers = {
    # Content negotiation
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    # Same-origin markers for the ysbang web front end
    "Origin": "https://dian.ysbang.cn",
    "Referer": "https://dian.ysbang.cn/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    # Browser identification
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/147.0.0.0 Safari/537.36"
    ),
    "sec-ch-ua": '"Google Chrome";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
}
class GetStore:
    """Collect business-license data for ysbang stores (platform 5).

    For each store URL returned by the ``retrieve_scrape_data`` query, the
    provider-details endpoint is asked for the store's pictures, the picture
    titled "营业执照" (business license) is sent to Baidu's business-license
    OCR endpoint, and the recognised fields are handed to ``ShopPipeline``.
    """

    # NOTE(review): the Baidu credentials below and the ysbang token set in
    # __init__ are hard-coded in source; they should come from configuration
    # or the environment and be rotated.
    _BAIDU_APP_KEY = "tRK2RhyItCSh6BzyT4CNVXQa"
    _BAIDU_APP_SECRET = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
    _BAIDU_TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
    _BAIDU_OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
    _PROVIDER_DETAILS_URL = (
        'https://dian.ysbang.cn/ysb-provider/provider/getProviderDetails/v340'
    )

    # Compiled once instead of on every row of the store list.
    _PROVIDER_ID_RE = re.compile(r"providerId=(\d+)")

    # Pause (seconds) after each processed store.
    _REQUEST_INTERVAL = 1.2

    # Cached OAuth token and the time.monotonic() value after which it must be
    # refreshed. Class-level defaults keep instances built without __init__
    # usable.
    _TOKEN_REFRESH_MARGIN = 60
    _access_token = None
    _access_token_deadline = 0.0

    def __init__(self):
        """Set the ysbang API token and open the online MySQL pool."""
        self.token = '2052ab19263347728867e3f76a5160aa'
        self.db_online = MySQLPoolOnline()

    def get_access_token(self):
        """Return a Baidu OAuth access token, or ``None`` on failure.

        A token is reused until shortly before it expires, but only when the
        token response carries a numeric ``expires_in``; otherwise every call
        requests a fresh token. Any request, HTTP-status or decoding error is
        printed and reported as ``None`` instead of being raised.
        """
        if self._access_token and time.monotonic() < self._access_token_deadline:
            return self._access_token

        try:
            # The request itself is inside the try block so that timeouts and
            # connection errors are handled like every other failure.
            response = requests.post(
                self._BAIDU_TOKEN_URL,
                params={
                    "grant_type": "client_credentials",
                    "client_id": self._BAIDU_APP_KEY,
                    "client_secret": self._BAIDU_APP_SECRET,
                },
                headers={
                    'Content-Type': 'application/json',
                    'Accept': 'application/json',
                },
                data="",
                timeout=10,
            )
            response.raise_for_status()
            body = response.json()
            token = body.get('access_token')
            lifetime = body.get('expires_in')
        except Exception as exc:
            print(f"获取 access_token 失败: {exc}")
            return None

        margin = self._TOKEN_REFRESH_MARGIN
        if token and isinstance(lifetime, (int, float)) and lifetime > margin:
            self._access_token = token
            self._access_token_deadline = time.monotonic() + lifetime - margin
        return token

    def get_ocr_res(self, img_content):
        """Run business-license OCR on raw image bytes.

        Args:
            img_content: The image file content as bytes.

        Returns:
            A dict mapping each key of the OCR ``words_result`` to its
            ``words`` value, or ``None`` when the image is empty, no access
            token is available, the HTTP response is not OK, the response has
            no ``words_result``, or any error occurs.
        """
        if not img_content:
            # Nothing to recognise; skip the token and OCR round trips.
            print("OCR 调用失败: 图片内容为空")
            return None

        try:
            encoded_image = base64.b64encode(img_content)
            access_token = self.get_access_token()
            if not access_token:
                print("OCR 调用失败: access_token 为空")
                return None

            response = requests.post(
                self._BAIDU_OCR_URL,
                params={"access_token": access_token},
                data={"image": encoded_image},
                headers={'content-type': 'application/x-www-form-urlencoded'},
                timeout=15,
            )
            # A requests.Response is falsy for 4xx/5xx status codes.
            if not response:
                return None

            res = response.json()
            if 'words_result' not in res:
                print(f"OCR 返回异常: {res}")
                return None

            fields = {
                name: entry['words']
                for name, entry in res['words_result'].items()
            }
            print('资质数据信息', fields)
            return fields
        except Exception as exc:
            print(f"OCR 解析失败: {exc}")
            return None

    @staticmethod
    def _find_license_pic(provider_pics):
        """Return the URL of the first picture titled "营业执照", else ``None``.

        Tolerates a missing list, non-dict entries, null titles and entries
        without a URL.
        """
        for entry in provider_pics or []:
            if not isinstance(entry, dict):
                continue
            title = entry.get("picTitle") or ""
            url = entry.get("picUrl")
            if "营业执照" in title and url:
                return url
        return None

    @staticmethod
    def _city_lookup_key(address):
        """Return the part of *address* before the first "市", cut again at "区"."""
        # NOTE(review): an address that starts with an autonomous region
        # ("...自治区...市") is cut at that "区"; confirm get_city_2 copes.
        return address.split("市")[0].split("区")[0]

    @classmethod
    def _extract_provider_id(cls, shop_url):
        """Return the digits after ``providerId=`` in *shop_url*, or ``None``."""
        found = cls._PROVIDER_ID_RE.search(shop_url or "")
        return found.group(1) if found else None

    def _fetch_provider_details(self, shop_id):
        """POST to the provider-details endpoint and return the decoded JSON."""
        response = requests.post(
            self._PROVIDER_DETAILS_URL,
            headers=headers,
            json={
                'platform': 'pc',
                'version': '6.1.0',
                'ua': 'Chrome147',
                'ex': '2026-4-17 13:42 supplierstore 04-27 11:09:49 04-27 11:10:15',
                'trafficType': 4,
                'ex1': 'o5u5rr408',
                'providerId': shop_id,
                'token': self.token,
            },
            timeout=15,
        )
        response.raise_for_status()
        return response.json()

    def get_img_info(self, shop_name, shop_url, shop_id):
        """OCR one store's business license and store the extracted fields.

        Args:
            shop_name: Store name written to the ``shop`` column.
            shop_url: Store URL written to the ``shop_url`` column.
            shop_id: Provider id sent to the provider-details endpoint.

        Returns:
            Always ``None``. Progress and failures are printed; no exception
            derived from ``Exception`` escapes, so one failing store does not
            stop the caller's loop.
        """
        try:
            details = self._fetch_provider_details(shop_id)
            pic = self._find_license_pic(
                (details.get("data") or {}).get("providerPics")
            )
            if not pic:
                print(f"未找到营业执照图片,shop_id={shop_id}")
                return None
            print(pic)

            image_response = requests.get(pic, timeout=15)
            # Do not send an HTTP error page to the OCR service.
            image_response.raise_for_status()

            ocr_res = self.get_ocr_res(image_response.content)
            if not ocr_res:
                print(f"门店OCR失败,shop_id={shop_id}")
                return None

            address = ocr_res.get("地址", "")
            business_license_company = ocr_res.get("单位名称", "")
            qualification_number = ocr_res.get("社会信用代码", "")
            if not address:
                print(f"未识别到地址信息,shop_id={shop_id}")
                return None

            _city_id, _province_id, city, province = get_city_2(
                self._city_lookup_key(address)
            )

            # One timestamp for all three columns so they cannot straddle a
            # second boundary.
            now = time.strftime("%Y-%m-%d %H:%M:%S")
            # Field set for ShopPipeline -> retrieve_scrape_shop_info. It
            # differs from the yaofangwang DrugPipeline product dict, so do
            # not mix up the column names.
            product = {
                "shop": shop_name,
                "shop_url": shop_url,
                "city": city,
                "qualification_number": qualification_number,
                "business_license_company": business_license_company,
                "province": province,
                "scrape_date": now,
                "business_license_address": address,
                "create_time": now,
                "update_time": now,
                "platform": 5,
            }
            affected_rows = ShopPipeline("yaosb_shop_info").storge_data(product)
            print(f"入库结果: {affected_rows} 行, shop={shop_name}")
        except Exception as exc:
            print(f"获取门店图片失败: {exc}")
        return None

    def get_shop_list(self):
        """Process every platform-5 store that has no shipment city yet.

        Rows without a URL are skipped; URLs without a ``providerId`` are
        printed and skipped. After each processed store the loop sleeps to
        space out the requests.
        """
        # NOTE(review): store_name is selected without aggregation while
        # grouping by store_url, and MIN(store_url) equals store_url within
        # each group. MySQL's ONLY_FULL_GROUP_BY mode rejects such a query;
        # confirm against the server's sql_mode.
        sql_data = """ select `store_name`, MIN(`store_url`) as `store_url`
            from `retrieve_scrape_data`
            where `platform_id` = 5
            and `shipment_city_name` = '' and `store_name` !=''
            GROUP BY `store_url` """
        # "or []" tolerates a None result from the query helper.
        for row in self.db_online.select_data(sql_data) or []:
            shop_url = row.get("store_url")
            if not shop_url:
                continue
            shop_id = self._extract_provider_id(shop_url)
            if shop_id is None:
                print(shop_url)
                continue
            self.get_img_info(row.get("store_name"), shop_url, shop_id)
            time.sleep(self._REQUEST_INTERVAL)

    def run(self):
        """Entry point: process the whole store list."""
        self.get_shop_list()
if __name__ == "__main__":
    # Script entry point: build the crawler and process every pending store.
    GetStore().run()
|