# no_search_0710.py
  1. import requests
  2. import base64
  3. import cv2
  4. import uiautomator2 as u2
  5. import time
  6. import subprocess
  7. import re
  8. import random
  9. import datetime
  10. import json
  11. from apscheduler.schedulers.blocking import BlockingScheduler
  12. from db_mysql import mysqlClient
  13. from config import Config
  14. import logging
  15. # from database import MySQLClient
  16. # 配置日志
  17. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  def get_access_token():
      """Request an OAuth access token for the Baidu AI (OCR) API.

      :return: the ``access_token`` string from the token endpoint's JSON reply,
          or ``None`` when the reply is not JSON or carries no token
      :raises requests.RequestException: when the HTTP request fails or times out
      """
      # SECURITY: API credentials are hard-coded in source. Rotate them and load
      # them from configuration or the environment instead of committing them.
      AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
      AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
      token_url = 'https://aip.baidubce.com/oauth/2.0/token'
      query = {
          'grant_type': 'client_credentials',
          'client_id': AppKey,
          'client_secret': AppSrcret,
      }
      headers = {
          'Content-Type': 'application/json',
          'Accept': 'application/json'
      }
      # A timeout keeps an unreachable endpoint from hanging the crawler forever.
      response = requests.post(token_url, params=query, headers=headers, timeout=15)
      try:
          return response.json()['access_token']
      except (ValueError, KeyError, TypeError) as e:
          # ValueError: body is not JSON; KeyError/TypeError: no token in the reply.
          logging.warning('access_token missing from token response: %s', e)
          return None
  def get_mysql():
      """Open and return a new pymysql connection built from ``Config``.

      Each call creates a fresh connection; nothing in this function closes it.
      """
      import pymysql

      connection_settings = dict(
          host=Config.DB_HOST,
          port=Config.DB_PORT,
          user=Config.DB_USER,
          password=Config.DB_PASSWORD,
          db=Config.DB_NAME,
          charset='utf8mb4',
      )
      return pymysql.connect(**connection_settings)
  46. class MT:
  def __init__(self, key):
      """Set up crawler state for one search keyword.

      :param key: keyword typed into the in-app search box (kept as ``search_key``)
      """
      self.package_name = Config.PACKAGE_NAME
      # Fetched once here; get_ocr_res reuses it for every OCR request.
      self.access_token = get_access_token()
      self.city2province = self.get_city_info()
      # Product rows and shop rows go to two separate tables.
      product_table, shop_table = Config.DB_TABLE, Config.DB_SHOP_TABLE
      self.table_name = product_table
      self.shop_table_name = shop_table
      print(f'数据库表名:table_name:{self.table_name},shop_table_name:{self.shop_table_name}')
      self.search_key = key
      # Count of results judged unrelated to the keyword.
      self.unrelated_data = 0
  def stop_app(self):
      """Stop the target app on the device, then wait five seconds."""
      package = self.package_name
      self.d.app_stop(package)
      time.sleep(5)
  def start_app(self):
      """Launch the target app on the device, then wait five seconds."""
      package = self.package_name
      self.d.app_start(package)
      time.sleep(5)
  def restart_app(self):
      """
      Restart the app: stop it, then start it again.
      :return: None
      """
      for step in (self.stop_app, self.start_app):
          step()
  77. @staticmethod
  78. def get_sleep_time():
  79. return random.randint(5, 8)
  80. @staticmethod
  81. def get_current_date():
  82. return datetime.datetime.now().strftime('%Y/%m/%d')
  83. @staticmethod
  84. def get_city_info():
  85. """
  86. 获取所有的省市数据
  87. :return:
  88. """
  89. file_path = '../kailin_city.json'
  90. with open(file_path, 'r', encoding='utf-8') as f:
  91. data = json.load(f)
  92. province = {province_one["id"]: province_one for province_one in data['province']}
  93. city2province = dict()
  94. city = data['city']
  95. for city_one in city:
  96. name = city_one['name']
  97. pid = city_one['pid']
  98. if len(str(pid)) > 2:
  99. pid = int(re.match('^\d{2}', str(pid)).group())
  100. city2province[name] = province[pid]['name']
  101. return city2province
  def get_shop_name(self):
      """
      Read the shop name from the product page.

      Two layouts are tried in order; they differ only in whether the shop card
      is the last or the second-to-last child of the scroll container.

      :return: the shop name text, or ``None`` when neither locator resolves
      """
      candidates = (
          ('获取到店铺名',
           '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView'),
          ('获取到店铺名2',
           '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView'),
      )
      last_error = None
      for label, locator in candidates:
          try:
              shop_name = self.d.xpath(locator).text
          except Exception as e:
              # Remember the failure and fall through to the next layout.
              last_error = e
              continue
          print(f'{label}:{shop_name}')
          return shop_name
      print(f'获取店铺名出错:{last_error}')
      return None
  def get_qualification_number(self):
      """
      Read the qualification number shown on the merchant licence page.

      The on-screen text carries a "资质编号" label followed by a colon; only
      the part after the label is returned.

      :return: the number with the label removed, or ``None`` when the element
          cannot be read
      """
      try:
          raw_text = self.d.xpath(
              '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]').text
      except Exception as e:
          logging.warning('qualification number not readable: %s', e)
          return None
      if raw_text is None:
          return None
      # Remove the label as a prefix. str.strip() would treat it as a set of
      # characters and could also eat matching characters of the value itself.
      # Both the ASCII and the full-width colon are accepted.
      return re.sub(r'^\s*资质编号\s*[::]?', '', raw_text).strip()
  def get_shop_address(self):
      """
      Read the shop address from the merchant tab of the shop page.

      :return: the address text, or ``None`` when the element cannot be read
      """
      try:
          shop_address = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.TextView').text
      except Exception as e:
          # Exception (not a bare except) so Ctrl-C still stops the crawler.
          print(f'获取店铺地址出错-get_shop_address')
          logging.warning('shop address not readable: %s', e)
          return None
      print(f'获取到店铺地址:{shop_address}')
      return shop_address
  def enter_detail(self):
      """Tap the first item of the result list, then wait a random pause."""
      first_item = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/recycler"]/android.widget.FrameLayout[1]')
      first_item.click()
      pause = self.get_sleep_time()
      time.sleep(pause)
  def save_to_database(self, data):
      """
      Insert one product row into the product table.

      ``scrape_date`` is always stored as today's date from
      ``get_current_date()``; the value in ``data`` is not used for that column.

      :param data: dict holding every product column named in the INSERT below
      :raises KeyError: when ``data`` lacks one of the columns
      :raises Exception: database errors are re-raised after a rollback
      """
      print(f'保存数据到数据库:{data}')
      add_sql = f"""
          INSERT INTO {self.table_name}
          (product, min_price, manufacture_date, expiry_date, shop, business_license_company, province, city, manufacturer, specification, approval_number, product_link, scrape_date, scrape_province, availability, credit_code, platform)
          VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
      """
      # Build the row first so a missing key fails before a connection is opened.
      row = (
          data['product'], data['min_price'], data['manufacture_date'], data['expiry_date'],
          data['shop'], data['business_license_company'], data['province'], data['city'],
          data['manufacturer'], data['specification'], data['approval_number'],
          data['product_link'], self.get_current_date(), data['scrape_province'],
          data['availability'], data['credit_code'], data['platform'],
      )
      conn = get_mysql()
      try:
          with conn.cursor() as cur:
              cur.execute(add_sql, row)
          conn.commit()
      except Exception:
          conn.rollback()
          raise
      finally:
          # One connection is opened per call, so it must be released here.
          conn.close()
      print(f"存入数据库成功")
  def save_shop_info_to_database(self, data):
      """
      Insert one shop row into the shop table.

      :param data: dict with ``shop``, ``contact_address``,
          ``qualification_number``, ``business_license_company``,
          ``business_license_address``, ``scrape_date`` and ``platform``
      :raises KeyError: when ``data`` lacks one of those keys
      :raises Exception: database errors are re-raised after a rollback
      """
      print(f'保存店铺数据到数据库:{data}')
      add_sql = f"""
          INSERT INTO {self.shop_table_name}
          (shop, contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform)
          VALUES (%s, %s, %s, %s, %s, %s, %s)
      """
      # Build the row first so a missing key fails before a connection is opened.
      row = (
          data['shop'], data['contact_address'], data['qualification_number'],
          data['business_license_company'], data['business_license_address'],
          data['scrape_date'], data['platform'],
      )
      conn = get_mysql()
      try:
          with conn.cursor() as cur:
              cur.execute(add_sql, row)
          conn.commit()
      except Exception:
          conn.rollback()
          raise
      finally:
          # One connection is opened per call, so it must be released here.
          conn.close()
      print(f'存入店铺信息到数据库成功')
  def swipe_up(self):
      """
      Swipe up by almost one screen height, then wait a random pause.
      :return: None
      """
      width = self.d.info['displayWidth']
      height = self.d.info['displayHeight']
      mid_x = width // 2
      swipe_duration = random.uniform(0, 0.3)
      self.d.swipe(mid_x, height - 100, mid_x, 100, duration=swipe_duration)
      # Roughly 15% of the time add a small extra nudge, because the list
      # sometimes sticks.
      if random.uniform(0, 1) > 0.85:
          self.d.swipe_ext("up", 0.1)
      time.sleep(self.get_sleep_time())
  def swipe_back(self, no):
      """
      Press the device back key repeatedly, pausing after each press.
      :param no: number of back presses
      :return: None
      """
      remaining = no
      while remaining > 0:
          self.d.press('back')
          time.sleep(self.get_sleep_time())
          remaining -= 1
  def drug_price(self):
      """
      Read the product price from the first element whose text starts with "¥".

      :return: the price as a float, or ``None`` when no price can be extracted
      """
      try:
          price_str = self.d.xpath('//*[starts-with(@text,"¥")]').text
          # Raw-string pattern that requires digits, so a stray "." cannot
          # match on its own.
          found = re.search(r'\d+(?:\.\d+)?', price_str)
          if found is None:
              raise ValueError(f'no number in {price_str!r}')
          price = float(found.group())
          print(f'获取到价格:{price}')
          return price
      except Exception as e:
          print(f'提取价格出错-->{e}')
          return None
  def restart_uiautomator_services(self, device_id):
      """
      Restart the atx-agent (uiautomator) service on the device through adb.

      :param device_id: adb serial of the target device
      :return: None
      """
      # An argument list with no shell means the serial is never parsed as
      # shell syntax on the host.
      agent_cmd = ['adb', '-s', str(device_id), 'shell',
                   '/data/local/tmp/atx-agent', 'server', '-d']
      for command in (agent_cmd + ['--stop'], agent_cmd):
          result = subprocess.run(command, capture_output=True, text=True)
          if result.returncode != 0:
              logging.warning('adb command %s exited with %s: %s',
                              command, result.returncode, result.stderr.strip())
          time.sleep(self.get_sleep_time())
  def connect_devices(self, device_id):
      """
      Connect to the device over USB and store the handle in ``self.d``.

      The on-device uiautomator service is restarted right after connecting.

      :param device_id: adb serial of the target device
      :raises Exception: the original connection error is re-raised unchanged
      """
      try:
          self.d = u2.connect_usb(device_id)
          self.restart_uiautomator_services(device_id)
          print(f'连接到设备:{device_id}')
      except Exception as e:
          print(f'{device_id} 连接错误: {e}')
          # A bare raise keeps the original exception type and traceback.
          raise
  def get_ocr_res(self, img):
      """
      Run Baidu business-licence OCR on an image file.

      :param img: path of the image file to recognise
      :return: dict mapping each recognised field name to its ``words`` text,
          or ``None`` when the token is missing, the file cannot be read, the
          request fails, or the reply has no usable ``words_result``
      """
      if not self.access_token:
          logging.warning('OCR skipped: no access token')
          return None
      request_url = ("https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
                     + "?access_token=" + self.access_token)
      headers = {'content-type': 'application/x-www-form-urlencoded'}
      try:
          # The context manager closes the file even if the read fails.
          with open(img, 'rb') as f:
              encoded = base64.b64encode(f.read())
          response = requests.post(request_url, data={"image": encoded},
                                   headers=headers, timeout=30)
          if not response:
              # A Response is falsy for HTTP error status codes.
              return None
          words_result = response.json()['words_result']
          new_dic = {field: entry['words'] for field, entry in words_result.items()}
      except (OSError, requests.RequestException, ValueError, KeyError,
              TypeError, AttributeError) as e:
          logging.warning('business licence OCR failed: %s', e)
          return None
      print('资质数据信息', new_dic)
      return new_dic
  def screenshot_the_business_license(self, qualification_number=None,
                                      archive_dir='D:\\work\\dfwy_spider\\drug_data\\mt\\screenshot\\'):
      """
      Take a screenshot and save the cropped business-licence area.

      The crop is always written to ``cropped_screenshot.png``, the file the
      licence readers pass to OCR. When a qualification number is given, a
      second copy named after it is written to ``archive_dir``.

      :param qualification_number: optional number used to name the archive copy
      :param archive_dir: directory prefix for the archive copy (must end with a
          path separator); defaults to the path used so far
      :return: path of the crop meant for OCR, or ``None`` when the screenshot
          could not be loaded
      """
      screenshot_path = 'screenshot1.png'
      ocr_path = 'cropped_screenshot.png'
      self.d.screenshot(screenshot_path)
      img = cv2.imread(screenshot_path)
      if img is None:
          # cv2.imread reports an unreadable file by returning None.
          logging.error('screenshot %s could not be read', screenshot_path)
          return None
      # Fixed crop box (left, top, right, bottom) in pixels.
      # NOTE(review): these values presumably match one specific screen size - confirm.
      left, top, right, bottom = 0, 480, 720, 1420
      cropped_img = img[top:bottom, left:right]
      if not cv2.imwrite(ocr_path, cropped_img):
          logging.warning('could not write %s', ocr_path)
      if qualification_number:
          archive_path = archive_dir + qualification_number + '.png'
          if not cv2.imwrite(archive_path, cropped_img):
              logging.warning('could not write %s', archive_path)
      return ocr_path
  def get_title(self):
      """
      Read the product title and split it into drug name and specification.

      The title is taken from the first element whose text contains the search
      keyword. It is expected to look like ``[brand]name<digits...>``: the
      bracketed part plus the following text form the name, and everything
      from the first digit onward is the specification.

      :return: ``(drugs_name, specifications)``, or ``None`` when the title does
          not have that shape
      """
      title = self.d.xpath(f'//*[contains(@text, "{self.search_key}")]').text
      print(f'获取到药品标题:{title}')
      match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
      if not match:
          print("没有匹配到预期格式")
          return None
      bracket_part, name_part, spec_part = (part.strip() for part in match.groups())
      drugs_name = bracket_part + name_part
      specifications = spec_part
      print("药品名:", drugs_name)
      print("规格:", specifications)
      print('完整药名:', drugs_name + specifications)
      return drugs_name, specifications
  def enter_shop(self):
      """
      Tap the "进店" (enter shop) button, then wait a random pause.
      :return: None
      """
      enter_button = self.d.xpath('//*[@text="进店"]')
      enter_button.click()
      time.sleep(self.get_sleep_time())
  def enter_shoper(self):
      """
      Tap the "商家" (merchant) tab, then wait a random pause.
      :return: None
      """
      merchant_tab = self.d.xpath('//*[@text="商家"]')
      merchant_tab.click()
      time.sleep(self.get_sleep_time())
  def scan_shoper_license(self):
      """Tap "查看商家资质" (view merchant qualifications), then wait a random pause."""
      license_entry = self.d.xpath('//*[@text="查看商家资质"]')
      license_entry.click()
      time.sleep(self.get_sleep_time())
  def data_is_exists(self, data):
      """
      Check whether a row matching ``data`` already exists in the product table.

      :param data: dict that must contain ``product``, ``min_price``, ``shop``,
          ``scrape_date`` and ``platform``
      :return: ``True`` if a matching row exists, ``False`` if not, ``None`` when
          a required key is missing or the query fails
      """
      required_keys = ['product', 'min_price', 'shop', 'scrape_date', 'platform']
      missing = [key for key in required_keys if key not in data]
      if missing:
          logging.error(f"缺少必要字段: {', '.join(missing)}")
          return None
      # Only existence matters, so select a constant and stop at the first hit.
      # NOTE: a None value is sent as NULL, and "= NULL" never matches a row.
      query_sql = """
          SELECT 1 FROM {}
          WHERE product = %s
          AND min_price = %s
          AND shop = %s
          AND scrape_date = %s
          AND platform = %s
          LIMIT 1
      """.format(self.table_name)
      conn = None
      try:
          conn = get_mysql()
          with conn.cursor() as cur:
              cur.execute(query_sql, tuple(data[key] for key in required_keys))
              return cur.fetchone() is not None
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # One connection is opened per call, so it must be released here.
          if conn is not None:
              conn.close()
  def shop_is_exists_database(self, shop):
      """
      Check whether a shop with this name is already in the shop table.

      :param shop: shop name to look up
      :return: ``True`` if a row exists, ``False`` if not, ``None`` when the
          query fails
      """
      query_sql = """
          SELECT 1 FROM {}
          WHERE shop = %s
          LIMIT 1
      """.format(self.shop_table_name)
      conn = None
      try:
          conn = get_mysql()
          with conn.cursor() as cur:
              # The trailing comma matters: (shop) is just the string, not a
              # one-element parameter tuple.
              cur.execute(query_sql, (shop,))
              return cur.fetchone() is not None
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # One connection is opened per call, so it must be released here.
          if conn is not None:
              conn.close()
  def get_instructions_data(self):
      """
      Open the detailed instructions of the current product and extract the
      expiry period, manufacturer and approval number.

      The page is scrolled up to five times. On each pass every non-empty text
      inside the instructions container is collected. Each wanted value is the
      text item directly after its label.

      NOTE(review): indentation was lost in the source this was restored from;
      the placement of the one-second sleeps inside the loop is a best guess
      and only affects timing.

      :return: dict with keys ``有效期``, ``生产单位`` and ``批准文号``; a field
          whose label, or the item after it, is not found maps to ``''``
      """
      load_more = '//*[@text="加载更多"]'
      content_box = (
          '//*[@resource-id="com.sankuai.meituan:id/container"]'
          '/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]'
          '/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
          '/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]'
          '/android.widget.FrameLayout[1]/android.view.ViewGroup[1]'
          '/android.view.ViewGroup[2]/android.view.ViewGroup[1]'
          '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
          '/android.view.ViewGroup[3]/android.widget.ScrollView[1]'
          '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
          '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
          '/android.view.ViewGroup[1]'
      )
      # A container shorter than this is treated as the last page.
      full_page_height = 938

      self.d.xpath('//*[@text="说明"]').click()
      time.sleep(random.randint(3, 5))
      self.d.xpath('//*[@text="查看详细说明"]').click()
      time.sleep(random.randint(3, 5))
      self.d.xpath(load_more).click_exists()

      new_list = []
      for i in range(5):
          self.d.xpath(load_more).click_exists()
          time.sleep(1)
          if i == 0:
              # The first swipe is shorter and slower than the later ones.
              self.d.swipe(200, 1000, 200, 300, 0.4)
          else:
              self.d.swipe(200, 1000, 200, 62)
          time.sleep(1)
          if self.d.xpath(load_more).exists:
              self.d.xpath(load_more).click()
              time.sleep(1)
          rows = self.d.xpath(content_box + '/android.view.ViewGroup').all()
          for idx in range(1, len(rows) + 1):
              cells = self.d.xpath(
                  content_box + f'/android.view.ViewGroup[{idx}]//android.widget.TextView').all()
              new_list.extend(cell.text for cell in cells if cell.text)
          # The first pass never stops the loop; later passes stop once the
          # container has shrunk below a full page.
          if i > 0:
              bounds = self.d.xpath(content_box).info['bounds']
              if bounds['bottom'] - bounds['top'] < full_page_height:
                  break

      # "展开全文" is an expand-button caption, not instruction text.
      new_list = [item for item in new_list if item != '展开全文']
      print(f'当前说明书列表数据:{new_list}')

      def value_after(label):
          # A missing label or a label at the very end yields '' instead of an
          # exception that would abort the crawl.
          try:
              return new_list[new_list.index(label) + 1]
          except (ValueError, IndexError):
              return ''

      res_data = {label: value_after(label) for label in ('有效期', '生产单位', '批准文号')}
      print(f'当前说明书字典数据:{res_data}')
      return res_data
  def has_instructions(self):
      """
      Wait a random pause, then report whether a "说明" (instructions) entry is
      on screen.
      :return: the ``exists`` result for that element
      """
      time.sleep(self.get_sleep_time())
      instructions_entry = self.d.xpath('//*[@text="说明"]')
      return instructions_entry.exists
  def has_shop(self):
      """
      Wait a random pause, then report whether a "进店" (enter shop) button is
      on screen.
      :return: the ``exists`` result for that element
      """
      time.sleep(self.get_sleep_time())
      enter_button = self.d.xpath('//*[@text="进店"]')
      return enter_button.exists
  def get_license_info_ex(self):
      """
      Walk from the product page into the shop's licence page and collect the
      shop's contact address, qualification number and OCR'd licence fields.

      NOTE(review): the OCR input is always 'cropped_screenshot.png'; confirm
      that screenshot_the_business_license writes that file when a
      qualification number is present.

      :return: dict with ``contact_address``, ``qualification_number``,
          ``business_license_company`` and ``business_license_address``; the
          two licence fields are ``''`` when OCR gives nothing for them
      """
      self.enter_shop()
      self.enter_shoper()
      # The address is read on the merchant tab before opening the licence page.
      contact_address = self.get_shop_address()
      self.scan_shoper_license()
      qualification_number = self.get_qualification_number()
      # Tap at a fixed relative screen position to open the licence image.
      self.d.click(0.603, 0.27)
      time.sleep(self.get_sleep_time())
      self.screenshot_the_business_license(qualification_number)
      ocr_res = self.get_ocr_res('cropped_screenshot.png')
      print(f'ocr_res:{ocr_res}')
      ocr_fields = ocr_res if ocr_res else {}
      return {
          'contact_address': contact_address,
          'qualification_number': qualification_number,
          'business_license_company': ocr_fields.get('单位名称', ''),
          'business_license_address': ocr_fields.get('地址', ''),
      }
  def get_license_info(self):
      """
      Collect licence data for the shop of the current product.

      A row already stored for the shop's qualification number is preferred;
      otherwise the licence image is opened, captured and sent to OCR.

      :return: dict with ``单位名称``, ``地址`` and ``社会信用代码`` when a
          stored row is found; otherwise the OCR result dict, or ``None``
      """
      self.enter_shop()
      self.enter_shoper()
      self.scan_shoper_license()
      qualification_number = self.get_qualification_number()
      if qualification_number:
          table_license_info = self.get_table_license_info(qualification_number)
          if table_license_info:
              return {
                  '单位名称': table_license_info[0],
                  '地址': table_license_info[1],
                  '社会信用代码': table_license_info[2]
              }
      # No stored row: tap at a fixed relative screen position to open the
      # licence image, then OCR it.
      self.d.click(0.603, 0.27)
      time.sleep(self.get_sleep_time())
      # Passing None selects the plain 'cropped_screenshot.png' output, which
      # is the file handed to OCR on the next line.
      self.screenshot_the_business_license(None)
      return self.get_ocr_res('cropped_screenshot.png')
  def distinct_target(self):
      """Report whether the probed view group exists; callers treat a truthy result as "on the search result list"."""
      marker_xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]'
      return self.d.xpath(marker_xpath).exists
  def enter_target_page(self):
      """
      Navigate from the app home page to the search results for ``search_key``.

      Each step is followed by a random pause.
      """
      search_box = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]'
      steps = (
          # Open the pharmacy channel.
          lambda: self.d.xpath('//*[@content-desc="看病买药"]').click(),
          # Tap the search carousel to open the search screen.
          lambda: self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/vf_search_carousel_text"]').click(),
          # Tap the element addressed by the long locator before typing.
          lambda: self.d.xpath(search_box).click(),
          # Type the keyword, replacing any existing text.
          lambda: self.d.send_keys(self.search_key, clear=True),
          # Submit the search.
          lambda: self.d.xpath('//*[@text="搜索"]').click(),
      )
      for step in steps:
          step()
          time.sleep(self.get_sleep_time())
  def get_table_license_info(self, qualification_number):
      """
      Look up stored licence data by credit code in the product table.

      :param qualification_number: value compared with the ``credit_code`` column
      :return: the first matching ``(business_license_company, city,
          credit_code)`` row, or ``None`` when there is no match or the query
          fails
      """
      # Parameterised query: the number comes from on-screen text and must not
      # be spliced into the SQL string.
      sql = ('select business_license_company,city,credit_code '
             'from {} where credit_code = %s limit 1').format(self.table_name)
      conn = None
      try:
          # Uses its own short-lived connection like the other DB helpers;
          # this class never creates a ``mysql_client`` attribute.
          conn = get_mysql()
          with conn.cursor() as cur:
              cur.execute(sql, (qualification_number,))
              return cur.fetchone()
      except Exception as e:
          logging.warning('licence lookup failed: %s', e)
          return None
      finally:
          if conn is not None:
              conn.close()
  def _scroll_to_shop_entry(self):
      """Scroll up in small steps (at most three) until the "进店" button is visible."""
      for _ in range(3):
          if self.d.xpath('//*[@text="进店"]').exists:
              print('开始获取店铺名1')
              break
          self.d.swipe_ext('up', 0.2)
          time.sleep(1)
          if self.d.xpath('//*[@text="进店"]').exists:
              print('开始获取店铺名2')
              break

  def _scrape_and_save_shop(self, shop, scrape_date):
      """Collect the shop's licence data, store the shop row, then go back two screens."""
      license_info = self.get_license_info_ex()
      save_shop_data = {
          'shop': shop,
          'contact_address': license_info['contact_address'],
          'qualification_number': license_info['qualification_number'],
          'scrape_date': scrape_date,
          'business_license_company': license_info['business_license_company'],
          'business_license_address': license_info['business_license_address'],
          'platform': '美团'
      }
      self.save_shop_info_to_database(save_shop_data)
      self.swipe_back(2)

  def integrate_data(self):
      """
      Scrape the product page that is currently open and store it.

      Flow: read title and price; work out the shop (a fixed name for
      self-operated items, otherwise read from the page, storing the shop's
      licence data the first time the shop is seen); skip rows already in the
      database; read the instructions; insert the product row; return towards
      the result list. Every early exit presses back once.

      NOTE(review): indentation was lost in the source this was restored from.
      The check that skips shops whose name contains "自营" is assumed to
      apply only to shops read from the page, not to the fixed self-operated
      name - confirm.

      :return: None
      """
      title_info = self.get_title()
      if not title_info:
          self.swipe_back(1)
          return
      product, specifications = title_info
      if self.search_key not in product.replace(' ', ''):
          self.swipe_back(1)
          self.unrelated_data += 1
          return

      min_price = self.drug_price()
      scrape_date = self.get_current_date()

      if self.d.xpath('//*[@text="自营"]').exists:
          # Self-operated items have no shop page to visit; a fixed name is used.
          shop = "美团自营大药房(快递电商)"
          dup_data = {'product': product, 'min_price': min_price, 'shop': shop,
                      'scrape_date': scrape_date, 'platform': '美团'}
          print(f'当前数据:{dup_data}')
          if self.data_is_exists(dup_data):
              print('存在相同数据不入库')
              self.swipe_back(1)
              return
      else:
          self._scroll_to_shop_entry()
          shop = self.get_shop_name()
          # Bail out before any use of the name: the substring tests below
          # would raise TypeError on None.
          if not shop:
              print('未获取到店铺名:开始回退')
              self.swipe_back(1)
              return
          dup_data = {'product': product, 'min_price': min_price, 'shop': shop,
                      'scrape_date': scrape_date, 'platform': '美团'}
          print(f'当前数据:{dup_data}')
          # Licence data is collected once per shop: only when the page has a
          # shop entry, the shop is not an official Meituan one, and the shop
          # is not stored yet.
          if (self.has_shop() and '美团官方' not in shop
                  and not self.shop_is_exists_database(shop)):
              self._scrape_and_save_shop(shop, scrape_date)
          if self.data_is_exists(dup_data):
              print('存在相同数据不入库')
              self.swipe_back(1)
              return
          if '自营' in shop:
              self.swipe_back(1)
              return

      time.sleep(self.get_sleep_time())

      # Products without an instructions entry are not stored.
      if not self.has_instructions():
          self.swipe_back(1)
          return
      print('开始获取说明书信息')
      instructions_info = self.get_instructions_data()

      # A usable record was found, so the unrelated-results counter starts over.
      self.unrelated_data = 0
      save_data = {
          'product': product,
          'min_price': min_price,
          'manufacture_date': '',
          'expiry_date': instructions_info['有效期'].strip('。'),
          'shop': shop,
          'business_license_company': '',
          'province': '',
          'city': '',
          'manufacturer': instructions_info['生产单位'].strip('。'),
          'specification': specifications,
          'approval_number': instructions_info['批准文号'].strip('。'),
          'product_link': '',
          'scrape_date': scrape_date,
          # The scrape province is fixed to Guangdong for now.
          'scrape_province': '广东',
          'availability': '',
          'credit_code': '',
          'platform': '美团'
      }
      self.save_to_database(save_data)

      if self.distinct_target():
          print('已到达搜索列表页')
      else:
          self.swipe_back(1)
  def main(self, device_id, *, max_pages=300, batch_size=30, batch_pause=120,
           visible_top=304, visible_bottom=1475, back_attempts=1):
      """Crawl the search-result list on one device, one product card at a time.

      Connects to the device, restarts the app and opens the search page, then
      for each screenful of the result list:

      1. taps every product card that lies completely inside the vertical
         window ``[visible_top, visible_bottom]``,
      2. calls ``integrate_data`` to collect the detail page,
      3. verifies with ``distinct_target`` that the list page is showing again,
      4. swipes the list up by its own height to reach the next screenful.

      The crawl ends when ``max_pages`` screens have been processed, when the
      end-of-list marker text is on screen, or when the list page cannot be
      reached again after a detail page.

      :param device_id: identifier passed to ``connect_devices``.
      :param max_pages: maximum number of list screens to process.
      :param batch_size: once more than this many products have been collected
          since the last pause, sleep ``batch_pause`` seconds before the next
          screen (presumably to avoid throttling -- confirm).
      :param batch_pause: length of that pause in seconds.
      :param visible_top: smallest ``top`` pixel coordinate a card may have to
          be tapped.
      :param visible_bottom: largest ``bottom`` pixel coordinate a card may
          have to be tapped.
      :param back_attempts: how many times ``swipe_back(1)`` is tried when a
          failed detail page leaves the crawler away from the list page.
      :return: None
      """
      collected_since_pause = 0
      self.connect_devices(device_id)
      time.sleep(self.get_sleep_time())
      # Restart the Meituan app.
      self.restart_app()
      # Search for the keyword.
      self.enter_target_page()

      card_xpath = '//android.support.v7.widget.RecyclerView/android.widget.FrameLayout'
      for page_index in range(max_pages):
          print(f'第{page_index + 1}页')
          if collected_since_pause > batch_size:
              time.sleep(batch_pause)
              collected_since_pause = 0
          print('目前无关数据量: ', self.unrelated_data)
          # NOTE: an early stop based on self.unrelated_data existed here but
          # was disabled by the original author; the counter is only reported.

          cards = self.d.xpath(card_xpath).all()
          print(f'当前页面共有{len(cards)}个商品')
          for card in cards:
              bounds = card.info['bounds']
              top, bottom = bounds['top'], bounds['bottom']
              print(f'当前商品高度{bottom - top}')
              # Only cards lying entirely inside the window are tapped; the
              # rest are left for a later screen.
              if not (visible_top <= top and bottom <= visible_bottom):
                  continue

              card.click()
              time.sleep(2)
              # Collect the product detail data.
              try:
                  self.integrate_data()
                  # Check that we are back on the list page.
                  if not self.distinct_target():
                      print('回退到列表页失败,终止采集')
                      return
                  print('回退到列表页', True)
                  time.sleep(self.get_sleep_time())
                  collected_since_pause += 1
              except Exception as e:
                  # Best effort: one broken detail page must not end the whole
                  # crawl, so report it and try to get back to the list.
                  print(f'采集药品详情数据出错:{e}')
                  if self.distinct_target():
                      continue
                  for _ in range(back_attempts):
                      self.swipe_back(1)
                      if self.distinct_target():
                          break
                  else:
                      # Every attempt was used without reaching the list page.
                      # Look once more before aborting, as the original did.
                      if not self.distinct_target():
                          print('页面出错,退出采集')
                          return

          if self.d.xpath('//*[@text="已经到底啦"]').exists:
              print('已经到达列表页最底部')
              return

          list_bounds = self.d.xpath('//android.support.v7.widget.RecyclerView').info['bounds']
          # Swipe upwards by the height of the list so the next screenful
          # replaces the current one.
          scroll_distance = list_bounds['bottom'] - list_bounds['top']
          start_y = 1400
          # Keep the end point at y >= 100 so the gesture stays on screen.
          end_y = max(start_y - scroll_distance, 100)
          print('滑动起点 y:', start_y, '终点 y:', end_y)
          # The fifth argument (0.4) is presumably the swipe duration in
          # seconds -- confirm against the device library in use.
          self.d.swipe(200, start_y, 200, end_y, 0.4)
          time.sleep(self.get_sleep_time())
  def unitest(self, hold_seconds=100000):
      """Save one hard-coded sample record through ``save_to_database``.

      :param hold_seconds: seconds to sleep after the record has been saved.
          The default reproduces the original behaviour (block for 100000
          seconds); pass ``0`` to return as soon as the save is done.
      :return: None
      """
      save_data = {
          'product': "[昆中药]舒肝颗粒(低糖型)",
          'min_price': 14.0,
          'manufacture_date': '',
          'expiry_date': '36个月',
          'shop': '美团自营大药房(快递电商)',
          'business_license_company': '',
          'province': '',
          'city': '',
          'manufacturer': '昆明中药厂有限公司',
          'specification': '3g*16袋/盒',
          'approval_number': '国药准字Z53021161',
          'product_link': '',
          'scrape_date': '2025/07/09',
          'scrape_province': '广东',
          'availability': '',
          'credit_code': '',
          'platform': '美团',
      }
      self.save_to_database(save_data)
      # NOTE(review): the long hold looks like a debugging leftover; it is
      # kept as the default so existing callers see no change.
      if hold_seconds and hold_seconds > 0:
          time.sleep(hold_seconds)
  def main(keyword='舒肝颗粒', device_id='fcb3c749'):
      """Run one crawl for ``keyword`` on the device ``device_id``.

      The defaults reproduce the previously hard-coded run, so calling
      ``main()`` with no arguments behaves exactly as before.

      Other values noted by the original author:
      keywords 参苓健脾胃颗粒, 清肺化痰丸, 香砂平胃颗粒; device id 95b2c764.

      :param keyword: search keyword passed to the ``MT`` constructor.
      :param device_id: device identifier passed to ``MT.main``.
      :return: None
      """
      mt = MT(keyword)
      mt.main(device_id)
  # Script entry point: run a single crawl immediately.
  if __name__ == '__main__':
      main()
      # Disabled alternative: run main() on a cron schedule every day at 21:30
      # (BlockingScheduler is presumably APScheduler's -- confirm the import
      # before re-enabling).
      # scheduler = BlockingScheduler()
      # scheduler.add_job(main, 'cron', hour=21, minute=30, misfire_grace_time=120)
      # try:
      #     scheduler.start()
      # except (KeyboardInterrupt, SystemExit):
      #     pass