"""Scrape supplier-store business licences from dian.ysbang.cn.

For each provider id the script fetches the provider details, locates the
business-licence picture, runs it through the Baidu business-licence OCR
endpoint and stores the recognised company/address data via ``ShopPipeline``.
"""
import base64
import json
import random
import re
import time

import requests

from commons.conn_mysql import MySQLPoolOnline
from area_info.city_name_to_id import get_city_2
from pipelines.shop_pipelines import ShopPipeline

# Browser-like headers sent with requests to dian.ysbang.cn.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Origin': 'https://dian.ysbang.cn',
    'Referer': 'https://dian.ysbang.cn/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


class GetStore:
    """Crawler that stores OCR-recognised licence data for ysbang providers."""

    # Refresh the cached OCR token this many seconds before it expires.
    _TOKEN_EXPIRY_MARGIN = 60

    def __init__(self):
        # NOTE(review): session token is hard-coded; consider loading it from
        # configuration instead of keeping it in source control.
        self.token = '2052ab19263347728867e3f76a5160aa'
        self.db_online = MySQLPoolOnline()
        # Cached Baidu access token and the monotonic time until which it may
        # be reused.
        self._access_token = None
        self._access_token_deadline = 0.0

    def get_access_token(self):
        """Return a Baidu OAuth access token, or ``None`` on failure.

        The token is cached on the instance and reused until shortly before
        the ``expires_in`` period reported by the token endpoint elapses, so
        the endpoint is not hit once per shop. If the response carries no
        ``expires_in`` the token is not reused.
        """
        # getattr keeps this safe for instances created without __init__.
        cached = getattr(self, "_access_token", None)
        deadline = getattr(self, "_access_token_deadline", 0.0)
        if cached and time.monotonic() < deadline:
            return cached

        # NOTE(review): API credentials are hard-coded; they should be moved
        # out of the source file.
        app_key = "tRK2RhyItCSh6BzyT4CNVXQa"
        app_secret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
        token_url = 'https://aip.baidubce.com/oauth/2.0/token'
        url = f"{token_url}?grant_type=client_credentials&client_id={app_key}&client_secret={app_secret}"
        token_headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }
        try:
            # The request itself is inside the try so that timeouts and
            # connection errors also yield None instead of propagating.
            response = requests.request("POST", url, headers=token_headers, data="", timeout=10)
            response.raise_for_status()
            payload = response.json()
            access_token = payload.get('access_token')
            lifetime = float(payload.get('expires_in') or 0)
        except Exception as exc:
            print(f"获取 access_token 失败: {exc}")
            return None

        if access_token:
            self._access_token = access_token
            self._access_token_deadline = (
                time.monotonic() + max(lifetime - self._TOKEN_EXPIRY_MARGIN, 0.0)
            )
        return access_token

    def get_ocr_res(self, img_content):
        """Run business-licence OCR on raw image bytes.

        Args:
            img_content: The image file content as bytes.

        Returns:
            A dict mapping each key of the OCR ``words_result`` object to its
            ``words`` value, or ``None`` when the token is unavailable, the
            request fails, or the response has no ``words_result``.
        """
        try:
            # OCR endpoint for business licences.
            request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
            # The image is sent base64-encoded as a form field.
            params = {"image": base64.b64encode(img_content)}
            access_token = self.get_access_token()
            if not access_token:
                print("OCR 调用失败: access_token 为空")
                return None
            request_url = request_url + "?access_token=" + access_token
            ocr_headers = {'content-type': 'application/x-www-form-urlencoded'}
            response = requests.post(request_url, data=params, headers=ocr_headers, timeout=15)
            # A Response is falsy for 4xx/5xx status codes.
            if not response:
                print(f"OCR 请求失败: HTTP {response.status_code}")
                return None
            res = response.json()
            if 'words_result' not in res:
                print(f"OCR 返回异常: {res}")
                return None
            new_dic = {
                field: entry['words']
                for field, entry in res['words_result'].items()
            }
            print('资质数据信息', new_dic)
            return new_dic
        except Exception as exc:
            print(f"OCR 解析失败: {exc}")
            return None

    @staticmethod
    def _find_license_pic(provider_pics):
        """Return the URL of the first picture titled as a business licence."""
        for pic_info in provider_pics:
            # "picTitle" may be missing or null; treat both as an empty title.
            if "营业执照" in (pic_info.get("picTitle") or ""):
                return pic_info.get("picUrl")
        return None

    def get_img_info(self, shop_id):
        """Fetch one provider's licence picture, OCR it and store the result.

        Every failure (network error, missing picture, OCR failure, missing
        address) is reported on stdout and swallowed so that a single bad
        provider does not stop the crawl.

        Args:
            shop_id: Provider id on dian.ysbang.cn.

        Returns:
            Always ``None``; the result is persisted through ``ShopPipeline``.
        """
        json_data = {
            'platform': 'pc',
            'version': '6.1.0',
            'ua': 'Chrome147',
            'ex': '2026-4-17 13:42 supplierstore 04-27 11:09:49 04-27 11:10:15',
            'trafficType': 4,
            'ex1': 'o5u5rr408',
            'providerId': shop_id,
            'token': self.token
        }
        try:
            # The request is inside the try: a timeout for one provider must
            # not propagate and abort the whole loop in get_shop_list.
            response = requests.post(
                'https://dian.ysbang.cn/ysb-provider/provider/getProviderDetails/v340',
                headers=headers,
                json=json_data,
                timeout=15,
            )
            response.raise_for_status()
            res = response.json()
            # "data" can be present but null, so fall back explicitly.
            data = res.get("data") or {}
            shop_name = data.get("name")
            pic = self._find_license_pic(data.get("providerPics") or [])
            if not pic:
                return None
            print(pic)

            img_response = requests.get(pic, timeout=15)
            # Do not feed an HTTP error page to the OCR service.
            img_response.raise_for_status()
            ocr_res = self.get_ocr_res(img_response.content)
            if not ocr_res:
                print(f"门店OCR失败,shop_id={shop_id}")
                return None

            address = ocr_res.get("地址", "")
            business_license_company = ocr_res.get("单位名称", "")
            qualification_number = ocr_res.get("社会信用代码", "")
            if not address:
                print(f"未识别到地址信息,shop_id={shop_id}")
                return None

            # Keep only the text before the first "市", and within that the
            # text before the first "区", as the lookup key for get_city_2.
            address_str = address.split("市")[0].split("区")[0]
            _city_id, _province_id, city, province = get_city_2(address_str)

            # One timestamp for all three columns so they cannot differ.
            now = time.strftime("%Y-%m-%d %H:%M:%S")
            # The fields below are the ShopPipeline -> retrieve_scrape_shop_info
            # column set. It differs from the `product` dict used by the
            # yaofangwang DrugPipeline; do not mix the column names.
            product = {
                "shop": shop_name,
                "shop_url": f"https://dian.ysbang.cn/#/supplierstore?providerId={shop_id}&trafficType=4",
                "city": city,
                "qualification_number": qualification_number,
                "business_license_company": business_license_company,
                "province": province,
                "scrape_date": now,
                "business_license_address": address,
                "create_time": now,
                "update_time": now,
                "platform": 5
            }
            affected_rows = ShopPipeline("yaosb_shop_info").storge_data(product)
            print(f"入库结果: {affected_rows} 行, shop={shop_name}")
            return None
        except Exception as exc:
            print(f"获取门店图片失败: {exc}")
            return None

    def get_shop_list(self, start=150, stop=300):
        """Probe provider ids in ``range(start, stop)`` one by one.

        A previous variant (now disabled) read store URLs from
        ``retrieve_scrape_data`` for ``platform_id = 5`` and extracted
        ``providerId`` from each URL; ids are currently enumerated directly.

        Args:
            start: First provider id to probe (inclusive).
            stop: Provider id at which to stop (exclusive).
        """
        for shop_id in range(start, stop):
            print(shop_id, f"https://dian.ysbang.cn/#/supplierstore?providerId={shop_id}&trafficType=4")
            # A falsy id (0) is skipped without the delay.
            if not shop_id:
                continue
            self.get_img_info(shop_id)
            # Random pause between providers to throttle the request rate.
            time.sleep(random.uniform(2, 4))

    def run(self):
        """Entry point: crawl the default provider-id range."""
        self.get_shop_list()


if __name__ == '__main__':
    get_store = GetStore()
    get_store.run()