# yaoshibang_shop.py (7.0 KB)
#
# Web-viewer residue: the run of digits that followed this header was the viewer's line-number gutter (1-183), not program text.
import base64
import json
import random
import re
import time

import requests

from commons.conn_mysql import MySQLPoolOnline
from area_info.city_name_to_id import get_city_2
from pipelines.shop_pipelines import ShopPipeline
  10. headers = {
  11. 'Accept': '*/*',
  12. 'Accept-Language': 'zh-CN,zh;q=0.9',
  13. 'Connection': 'keep-alive',
  14. 'Content-Type': 'application/json',
  15. 'Origin': 'https://dian.ysbang.cn',
  16. 'Referer': 'https://dian.ysbang.cn/',
  17. 'Sec-Fetch-Dest': 'empty',
  18. 'Sec-Fetch-Mode': 'cors',
  19. 'Sec-Fetch-Site': 'same-origin',
  20. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36',
  21. 'sec-ch-ua': '"Google Chrome";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
  22. 'sec-ch-ua-mobile': '?0',
  23. 'sec-ch-ua-platform': '"Windows"',
  24. }
  25. class GetStore:
  26. def __init__(self):
  27. self.token = '2052ab19263347728867e3f76a5160aa'
  28. self.db_online = MySQLPoolOnline()
  29. def get_access_token(self):
  30. app_key = "tRK2RhyItCSh6BzyT4CNVXQa"
  31. app_secret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  32. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  33. url = f"{token_url}?grant_type=client_credentials&client_id={app_key}&client_secret={app_secret}"
  34. payload = ""
  35. headers = {
  36. 'Content-Type': 'application/json',
  37. 'Accept': 'application/json'
  38. }
  39. response = requests.request("POST", url, headers=headers, data=payload, timeout=10)
  40. try:
  41. response.raise_for_status()
  42. return response.json().get('access_token')
  43. except Exception as exc:
  44. print(f"获取 access_token 失败: {exc}")
  45. return None
  46. def get_ocr_res(self, img_content):
  47. try:
  48. # img地址
  49. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  50. # 二进制方式打开图片文件
  51. # f = open(img, 'rb')
  52. img = base64.b64encode(img_content)
  53. params = {"image": img}
  54. access_token = self.get_access_token()
  55. if not access_token:
  56. print("OCR 调用失败: access_token 为空")
  57. return None
  58. request_url = request_url + "?access_token=" + access_token
  59. headers = {'content-type': 'application/x-www-form-urlencoded'}
  60. response = requests.post(request_url, data=params, headers=headers, timeout=15)
  61. if response:
  62. res = response.json()
  63. if 'words_result' not in res:
  64. print(f"OCR 返回异常: {res}")
  65. return None
  66. new_dic = dict()
  67. for ite in res['words_result'].keys():
  68. new_dic[ite] = res['words_result'][ite]['words']
  69. print('资质数据信息', new_dic)
  70. return new_dic
  71. else:
  72. return None
  73. except Exception as exc:
  74. print(f"OCR 解析失败: {exc}")
  75. return None
  76. def get_img_info(self, shop_id):
  77. json_data = {
  78. 'platform': 'pc',
  79. 'version': '6.1.0',
  80. 'ua': 'Chrome147',
  81. 'ex': '2026-4-17 13:42 supplierstore 04-27 11:09:49 04-27 11:10:15',
  82. 'trafficType': 4,
  83. 'ex1': 'o5u5rr408',
  84. 'providerId': shop_id,
  85. 'token': self.token
  86. }
  87. response = requests.post(
  88. 'https://dian.ysbang.cn/ysb-provider/provider/getProviderDetails/v340',
  89. headers=headers,
  90. json=json_data,
  91. timeout=15,
  92. )
  93. try:
  94. response.raise_for_status()
  95. res = response.json()
  96. pic = None
  97. provide_pics = res.get("data", {}).get("providerPics", [])
  98. shop_name = res.get("data",{}).get("name")
  99. for pics in provide_pics:
  100. if "营业执照" in pics.get("picTitle", ""):
  101. pic = pics.get("picUrl")
  102. print(pic)
  103. break
  104. if pic:
  105. img_content = requests.get(pic, timeout=15).content
  106. ocr_res = self.get_ocr_res(img_content)
  107. if not ocr_res:
  108. print(f"门店OCR失败,shop_id={shop_id}")
  109. return None
  110. address = ocr_res.get("地址", "")
  111. business_license_company = ocr_res.get("单位名称", "")
  112. qualification_number = ocr_res.get("社会信用代码", "")
  113. if not address:
  114. print(f"未识别到地址信息,shop_id={shop_id}")
  115. return None
  116. address_str = address.split("市")[0].split("区")[0]
  117. city_id, province_id, city, province = get_city_2(address_str)
  118. # 以下为 ShopPipeline -> retrieve_scrape_shop_info 字段集,与 yaofangwang DrugPipeline 的 product 不同,勿混用列名
  119. product = {
  120. "shop": shop_name,
  121. "shop_url": f"https://dian.ysbang.cn/#/supplierstore?providerId={shop_id}&trafficType=4",
  122. "city": city,
  123. "qualification_number": qualification_number,
  124. "business_license_company": business_license_company,
  125. "province": province,
  126. "scrape_date": time.strftime("%Y-%m-%d %H:%M:%S"),
  127. "business_license_address": address,
  128. "create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
  129. "update_time": time.strftime("%Y-%m-%d %H:%M:%S"),
  130. "platform": 5
  131. }
  132. affected_rows = ShopPipeline("yaosb_shop_info").storge_data(product)
  133. print(f"入库结果: {affected_rows} 行, shop={shop_name}")
  134. return None
  135. except Exception as exc:
  136. print(f"获取门店图片失败: {exc}")
  137. return None
  138. def get_shop_list(self):
  139. # sql_data = """ select `store_name`, MIN(`store_url`) as `store_url`
  140. # from `retrieve_scrape_data`
  141. # where `platform_id` = 5
  142. # and `shipment_city_name` = '' and `store_name` !=''
  143. # GROUP BY `store_url` """
  144. # data_list = self.db_online.select_data(sql_data)
  145. # for data in data_list:
  146. for i in range(150,300):
  147. # shop_name = data.get("store_name")
  148. # shop_url = data.get("store_url")
  149. #
  150. # if not shop_url:
  151. # continue
  152. # shop_id = re.search(r"providerId=(\d+)", shop_url)
  153. print(i,f"https://dian.ysbang.cn/#/supplierstore?providerId={i}&trafficType=4")
  154. shop_id = i
  155. if shop_id:
  156. # shop_id = shop_id.group(1)
  157. self.get_img_info(shop_id)
  158. else:
  159. # print(shop_url)
  160. continue
  161. time.sleep(random.uniform(2,4))
  162. def run(self):
  163. self.get_shop_list()
  164. if __name__ == '__main__':
  165. get_store = GetStore()
  166. get_store.run()