# yaoshibang_shop_info.py (6.7 KB)
  1. import json
  2. import requests
  3. import re
  4. import time
  5. import base64
  6. from commons.conn_mysql import MySQLPoolOnline
  7. from area_info.city_name_to_id import get_city_2
  8. from pipelines.shop_pipelines import ShopPipeline
  9. headers = {
  10. 'Accept': '*/*',
  11. 'Accept-Language': 'zh-CN,zh;q=0.9',
  12. 'Connection': 'keep-alive',
  13. 'Content-Type': 'application/json',
  14. 'Origin': 'https://dian.ysbang.cn',
  15. 'Referer': 'https://dian.ysbang.cn/',
  16. 'Sec-Fetch-Dest': 'empty',
  17. 'Sec-Fetch-Mode': 'cors',
  18. 'Sec-Fetch-Site': 'same-origin',
  19. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36',
  20. 'sec-ch-ua': '"Google Chrome";v="147", "Not.A/Brand";v="8", "Chromium";v="147"',
  21. 'sec-ch-ua-mobile': '?0',
  22. 'sec-ch-ua-platform': '"Windows"',
  23. }
  24. class GetStore:
  25. def __init__(self):
  26. self.token = '2052ab19263347728867e3f76a5160aa'
  27. self.db_online = MySQLPoolOnline()
  28. def get_access_token(self):
  29. app_key = "tRK2RhyItCSh6BzyT4CNVXQa"
  30. app_secret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  31. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  32. url = f"{token_url}?grant_type=client_credentials&client_id={app_key}&client_secret={app_secret}"
  33. payload = ""
  34. headers = {
  35. 'Content-Type': 'application/json',
  36. 'Accept': 'application/json'
  37. }
  38. response = requests.request("POST", url, headers=headers, data=payload, timeout=10)
  39. try:
  40. response.raise_for_status()
  41. return response.json().get('access_token')
  42. except Exception as exc:
  43. print(f"获取 access_token 失败: {exc}")
  44. return None
  45. def get_ocr_res(self, img_content):
  46. try:
  47. # img地址
  48. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  49. # 二进制方式打开图片文件
  50. # f = open(img, 'rb')
  51. img = base64.b64encode(img_content)
  52. params = {"image": img}
  53. access_token = self.get_access_token()
  54. if not access_token:
  55. print("OCR 调用失败: access_token 为空")
  56. return None
  57. request_url = request_url + "?access_token=" + access_token
  58. headers = {'content-type': 'application/x-www-form-urlencoded'}
  59. response = requests.post(request_url, data=params, headers=headers, timeout=15)
  60. if response:
  61. res = response.json()
  62. if 'words_result' not in res:
  63. print(f"OCR 返回异常: {res}")
  64. return None
  65. new_dic = dict()
  66. for ite in res['words_result'].keys():
  67. new_dic[ite] = res['words_result'][ite]['words']
  68. print('资质数据信息', new_dic)
  69. return new_dic
  70. else:
  71. return None
  72. except Exception as exc:
  73. print(f"OCR 解析失败: {exc}")
  74. return None
  75. def get_img_info(self, shop_name, shop_url, shop_id):
  76. json_data = {
  77. 'platform': 'pc',
  78. 'version': '6.1.0',
  79. 'ua': 'Chrome147',
  80. 'ex': '2026-4-17 13:42 supplierstore 04-27 11:09:49 04-27 11:10:15',
  81. 'trafficType': 4,
  82. 'ex1': 'o5u5rr408',
  83. 'providerId': shop_id,
  84. 'token': self.token
  85. }
  86. response = requests.post(
  87. 'https://dian.ysbang.cn/ysb-provider/provider/getProviderDetails/v340',
  88. headers=headers,
  89. json=json_data,
  90. timeout=15,
  91. )
  92. try:
  93. response.raise_for_status()
  94. res = response.json()
  95. pic = None
  96. provide_pics = res.get("data", {}).get("providerPics", [])
  97. for pics in provide_pics:
  98. if "营业执照" in pics.get("picTitle", ""):
  99. pic = pics.get("picUrl")
  100. print(pic)
  101. break
  102. if pic:
  103. img_content = requests.get(pic, timeout=15).content
  104. ocr_res = self.get_ocr_res(img_content)
  105. if not ocr_res:
  106. print(f"门店OCR失败,shop_id={shop_id}")
  107. return None
  108. address = ocr_res.get("地址", "")
  109. business_license_company = ocr_res.get("单位名称", "")
  110. qualification_number = ocr_res.get("社会信用代码", "")
  111. if not address:
  112. print(f"未识别到地址信息,shop_id={shop_id}")
  113. return None
  114. address_str = address.split("市")[0].split("区")[0]
  115. city_id, province_id, city, province = get_city_2(address_str)
  116. # 以下为 ShopPipeline -> retrieve_scrape_shop_info 字段集,与 yaofangwang DrugPipeline 的 product 不同,勿混用列名
  117. product = {
  118. "shop": shop_name,
  119. "shop_url": shop_url,
  120. "city": city,
  121. "qualification_number": qualification_number,
  122. "business_license_company": business_license_company,
  123. "province": province,
  124. "scrape_date": time.strftime("%Y-%m-%d %H:%M:%S"),
  125. "business_license_address": address,
  126. "create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
  127. "update_time": time.strftime("%Y-%m-%d %H:%M:%S"),
  128. "platform": 5
  129. }
  130. affected_rows = ShopPipeline("yaosb_shop_info").storge_data(product)
  131. print(f"入库结果: {affected_rows} 行, shop={shop_name}")
  132. return None
  133. except Exception as exc:
  134. print(f"获取门店图片失败: {exc}")
  135. return None
  136. def get_shop_list(self):
  137. sql_data = """ select `store_name`, MIN(`store_url`) as `store_url`
  138. from `retrieve_scrape_data`
  139. where `platform_id` = 5
  140. and `shipment_city_name` = '' and `store_name` !=''
  141. GROUP BY `store_url` """
  142. data_list = self.db_online.select_data(sql_data)
  143. for data in data_list:
  144. shop_name = data.get("store_name")
  145. shop_url = data.get("store_url")
  146. if not shop_url:
  147. continue
  148. shop_id = re.search(r"providerId=(\d+)", shop_url)
  149. if shop_id:
  150. shop_id = shop_id.group(1)
  151. self.get_img_info(shop_name, shop_url, shop_id)
  152. else:
  153. print(shop_url)
  154. continue
  155. time.sleep(1.2)
  156. def run(self):
  157. self.get_shop_list()
  158. if __name__ == '__main__':
  159. get_store = GetStore()
  160. get_store.run()