no_search_bak.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639
  1. import requests
  2. import base64
  3. import cv2
  4. import uiautomator2 as u2
  5. import time
  6. import subprocess
  7. import re
  8. import random
  9. import datetime
  10. import json
  11. from apscheduler.schedulers.blocking import BlockingScheduler
  12. from db_mysql import mysqlClient
  13. def get_access_token():
  14. AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
  15. AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  16. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  17. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  18. payload = ""
  19. headers = {
  20. 'Content-Type': 'application/json',
  21. 'Accept': 'application/json'
  22. }
  23. response = requests.request("POST", url, headers=headers, data=payload)
  24. try:
  25. return response.json()['access_token']
  26. except:
  27. return None
  28. class MT:
  29. def __init__(self, key):
  30. self.package_name = 'com.sankuai.meituan'
  31. self.access_token = get_access_token()
  32. self.city2province = self.get_city_info()
  33. host = "localhost"
  34. user = "root"
  35. password = "dfwy2025"
  36. database = "drug_data"
  37. port = 3306
  38. self.table_name = "mt_drug"
  39. self.mysql_client = mysqlClient(host, user, password, database, port)
  40. self.search_key = key # 参苓健脾胃颗粒 舒肝颗粒 清肺化痰丸 香砂平胃颗粒
  41. self.unrelated_data = 0 # 无关数据数量
  42. def stop_app(self):
  43. self.d.app_stop(self.package_name)
  44. time.sleep(5)
  45. def start_app(self):
  46. self.d.app_start(self.package_name)
  47. time.sleep(5)
  48. def restart_app(self):
  49. """
  50. 重启app
  51. :return:
  52. """
  53. self.stop_app()
  54. self.start_app()
  55. @staticmethod
  56. def get_sleep_time():
  57. return random.randint(5, 8)
  58. @staticmethod
  59. def get_current_date():
  60. return datetime.datetime.now().strftime('%Y/%m/%d')
  61. @staticmethod
  62. def get_city_info():
  63. """
  64. 获取所有的省市数据
  65. :return:
  66. """
  67. file_path = '../kailin_city.json'
  68. with open(file_path, 'r', encoding='utf-8') as f:
  69. data = json.load(f)
  70. province = {province_one["id"]: province_one for province_one in data['province']}
  71. city2province = dict()
  72. city = data['city']
  73. for city_one in city:
  74. name = city_one['name']
  75. pid = city_one['pid']
  76. if len(str(pid)) > 2:
  77. pid = int(re.match('^\d{2}', str(pid)).group())
  78. city2province[name] = province[pid]['name']
  79. return city2province
  80. def get_shop_name(self):
  81. """
  82. 获取店铺名
  83. :return:
  84. """
  85. try:
  86. shop_name = self.d.xpath(
  87. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView').text
  88. print(f'获取到店铺名:{shop_name}')
  89. return shop_name
  90. except:
  91. try:
  92. shop_name = self.d.xpath(
  93. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView').text
  94. print(f'获取到店铺名:{shop_name}')
  95. return shop_name
  96. except Exception as e:
  97. print(f'获取店铺名出错:{e}')
  98. return None
  99. def get_qualification_number(self):
  100. """
  101. 获取资质编号
  102. :return:
  103. """
  104. try:
  105. qualification_number_str = self.d.xpath(
  106. '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]').text
  107. qualification_number = qualification_number_str.strip('资质编号:').strip()
  108. return qualification_number
  109. except:
  110. return None
  111. def enter_detail(self):
  112. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/recycler"]/android.widget.FrameLayout[1]').click()
  113. time.sleep(self.get_sleep_time())
  114. def save_to_database(self, data):
  115. self.mysql_client.insert(self.table_name, data)
  116. print(f'{data}-->存入数据库成功')
  117. def swipe_up(self):
  118. """
  119. 上滑
  120. :return:
  121. """
  122. screen_width = self.d.info['displayWidth']
  123. screen_height = self.d.info['displayHeight']
  124. duration_rate = random.uniform(0, 0.3)
  125. self.d.swipe(screen_width // 2, screen_height - 100, screen_width // 2, 100, duration=duration_rate)
  126. no = random.uniform(0, 1)
  127. if no > 0.85:
  128. # 有的时候卡着 再稍微往上滑一点点
  129. self.d.swipe_ext("up", 0.1)
  130. time.sleep(self.get_sleep_time())
  131. def swipe_back(self, no):
  132. """
  133. 返回
  134. :param no: 回退次数
  135. :return:
  136. """
  137. for idx in range(no):
  138. self.d.press('back')
  139. time.sleep(self.get_sleep_time())
  140. def drug_price(self):
  141. """
  142. 获取药品价格
  143. :return:
  144. """
  145. try:
  146. price_str = self.d.xpath('//*[starts-with(@text,"¥")]').text
  147. price = float(re.search('[\d\.]+', price_str).group())
  148. print(f'获取到价格:{price}')
  149. return price
  150. except Exception as e:
  151. print(f'提取价格出错-->{e}')
  152. return None
  153. def restart_uiautomator_services(self, device_id):
  154. """
  155. 重启atx的uiautomator 服务
  156. :param device_id:
  157. :return:
  158. """
  159. stop_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d --stop'
  160. start_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'
  161. # result = subprocess.run(stop_uiautomator_services, capture_output=True, text=True, shell=True)
  162. # print(result.stdout)
  163. subprocess.run(stop_uiautomator_services, capture_output=True, text=True, shell=True)
  164. time.sleep(self.get_sleep_time())
  165. subprocess.run(start_uiautomator_services, capture_output=True, text=True, shell=True)
  166. time.sleep(self.get_sleep_time())
  167. def connect_devices(self, device_id):
  168. """
  169. 连接设备
  170. :return:
  171. """
  172. try:
  173. self.d = u2.connect_usb(device_id)
  174. # 设置隐形等待时间
  175. # self.d.implicitly_wait(5)
  176. self.restart_uiautomator_services(device_id)
  177. print(f'连接到设备:{device_id}')
  178. except Exception as e:
  179. print(f'{device_id} 连接错误: {e}')
  180. raise Exception(e)
  181. def get_ocr_res(self, img):
  182. try:
  183. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  184. # 二进制方式打开图片文件
  185. f = open(img, 'rb')
  186. img = base64.b64encode(f.read())
  187. params = {"image": img}
  188. # access_token = get_access_token()
  189. request_url = request_url + "?access_token=" + self.access_token
  190. headers = {'content-type': 'application/x-www-form-urlencoded'}
  191. response = requests.post(request_url, data=params, headers=headers)
  192. if response:
  193. res = response.json()
  194. new_dic = dict()
  195. for ite in res['words_result'].keys():
  196. new_dic[ite] = res['words_result'][ite]['words']
  197. print('资质数据信息', new_dic)
  198. return new_dic
  199. except:
  200. return None
  201. def screenshot_the_business_license(self):
  202. screenshot_path = 'screenshot1.png'
  203. self.d.screenshot(screenshot_path)
  204. img = cv2.imread(screenshot_path)
  205. # 指定裁剪区域 (left, top, right, bottom)
  206. left = 0
  207. top = 480
  208. right = 720
  209. bottom = 1420
  210. cropped_img = img[top:bottom, left:right]
  211. cropped_screenshot_path = 'cropped_screenshot.png'
  212. cv2.imwrite(cropped_screenshot_path, cropped_img)
  213. def get_title(self):
  214. try:
  215. title = self.d.xpath(
  216. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]//android.widget.TextView').text
  217. except:
  218. title = self.d.xpath(
  219. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]//android.widget.TextView').text
  220. print(f'获取到药品标题:{title}')
  221. # 从里面匹配出药品名和规格
  222. # drugs_name
  223. # specifications
  224. # match = re.search(r'([^\d]+)([\d\D]+)', title)
  225. match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
  226. if match:
  227. drugs_name = match.group(1).strip() + match.group(2).strip()
  228. specifications = match.group(3).strip()
  229. print("药品名:", drugs_name)
  230. print("规格:", specifications)
  231. print('完整药名:', drugs_name + specifications)
  232. return drugs_name, specifications
  233. else:
  234. print("没有匹配到预期格式")
  235. def enter_shop(self):
  236. """
  237. 进店,方便提取资质环境
  238. :return:
  239. """
  240. self.d.xpath('//*[@text="进店"]').click()
  241. time.sleep(self.get_sleep_time())
  242. def enter_shoper(self):
  243. """
  244. 进入商家
  245. :return:
  246. """
  247. self.d.xpath('//*[@text="商家"]').click()
  248. time.sleep(self.get_sleep_time())
  249. def scan_shoper_license(self):
  250. self.d.xpath('//*[@text="查看商家资质"]').click()
  251. time.sleep(self.get_sleep_time())
  252. def data_is_exists(self, data):
  253. try:
  254. columns = data.keys()
  255. placeholders = [f"{col} = %({col})s" for col in columns]
  256. query = f"SELECT * FROM `{self.table_name}` WHERE {' AND '.join(placeholders)}"
  257. cur = self.mysql_client.cur
  258. cur.execute(query, data)
  259. exists = cur.fetchone()
  260. return exists
  261. except Exception as e:
  262. print(f"MySQL 错误: {str(e)}")
  263. return None
  264. def get_instructions_data(self):
  265. """
  266. 确定有说明书之后,提取所有的说明书数据
  267. :return:
  268. """
  269. self.d.xpath('//*[@text="说明"]').click()
  270. time.sleep(random.randint(3, 5))
  271. self.d.xpath('//*[@text="查看详细说明"]').click()
  272. time.sleep(random.randint(3, 5))
  273. self.d.xpath('//*[@text="加载更多"]').click_exists()
  274. loop_page = 5
  275. new_list = list()
  276. for i in range(loop_page):
  277. self.d.xpath('//*[@text="加载更多"]').click_exists()
  278. time.sleep(1)
  279. if i == 0:
  280. self.d.swipe(200, 1000, 200, 300, 0.4)
  281. else:
  282. self.d.swipe(200, 1000, 200, 62)
  283. time.sleep(1)
  284. if self.d.xpath('//*[@text="加载更多"]').exists:
  285. self.d.xpath('//*[@text="加载更多"]').click()
  286. time.sleep(1)
  287. all_tt = self.d.xpath(
  288. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup').all()
  289. for idx in range(1, len(all_tt) + 1):
  290. all_tt1 = self.d.xpath(
  291. f'//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[{idx}]//android.widget.TextView').all()
  292. for tt in all_tt1:
  293. if tt.text:
  294. new_list.append(tt.text)
  295. if i == 0:
  296. height = 938
  297. else:
  298. drug_box = self.d.xpath(
  299. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]').info
  300. bounds = drug_box['bounds']
  301. height = bounds['bottom'] - bounds['top']
  302. if height < 938:
  303. # print('说明书翻页到底部')
  304. break
  305. # 展开全文
  306. new_list = [item for item in new_list if item != '展开全文']
  307. print(f'当前说明书列表数据:{new_list}')
  308. expiry_date_index = next(idx for idx, i in enumerate(new_list) if i == '有效期')
  309. manufacturer_index = next(idx for idx, i in enumerate(new_list) if i == '生产单位')
  310. approval_number_index = next(idx for idx, i in enumerate(new_list) if i == '批准文号')
  311. res_data = {
  312. "有效期": new_list[expiry_date_index + 1],
  313. "生产单位": new_list[manufacturer_index + 1],
  314. "批准文号": new_list[approval_number_index + 1]
  315. }
  316. print(f'当前说明书字典数据:{res_data}')
  317. return res_data
  318. def has_instructions(self):
  319. """
  320. 是否有说明书
  321. :return:
  322. """
  323. # 没有说明书的无法采集具体数据
  324. time.sleep(self.get_sleep_time())
  325. is_has_instructions = self.d.xpath('//*[@text="说明"]').exists
  326. return is_has_instructions
  327. def has_shop(self):
  328. """
  329. 是否有进店按钮
  330. :return:
  331. """
  332. # self.d.swipe_ext('up', 0.1)
  333. time.sleep(self.get_sleep_time())
  334. is_has_enter_shop = self.d.xpath('//*[@text="进店"]').exists
  335. return is_has_enter_shop
  336. def get_license_info(self):
  337. self.enter_shop()
  338. self.enter_shoper()
  339. self.scan_shoper_license()
  340. # 获取资质编码
  341. qualification_number = self.get_qualification_number()
  342. if qualification_number:
  343. table_license_info = self.get_table_license_info(qualification_number)
  344. if table_license_info:
  345. return {
  346. '单位名称': table_license_info[0],
  347. '地址': table_license_info[1],
  348. '社会信用代码': table_license_info[2]
  349. }
  350. else:
  351. # operate_no = random.randint(0, 1)
  352. self.d.click(0.603, 0.27)
  353. # if operate_no == 0:
  354. # self.d.xpath('//*[@text="营业执照"]').click()
  355. # else:
  356. # self.d.click(0.603, 0.27)
  357. time.sleep(self.get_sleep_time())
  358. self.screenshot_the_business_license()
  359. ocr_res = self.get_ocr_res('cropped_screenshot.png')
  360. return ocr_res
  361. # operate_no = random.randint(0, 1)
  362. self.d.click(0.603, 0.27)
  363. # if operate_no == 0:
  364. # self.d.xpath('//*[@text="营业执照"]').click()
  365. # else:
  366. # self.d.click(0.603, 0.27)
  367. time.sleep(self.get_sleep_time())
  368. self.screenshot_the_business_license()
  369. ocr_res = self.get_ocr_res('cropped_screenshot.png')
  370. return ocr_res
  371. def distinct_target(self):
  372. is_position = self.d.xpath(
  373. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]').exists
  374. return is_position
  375. def enter_target_page(self):
  376. self.d.xpath('//*[@content-desc="看病买药"]').click()
  377. time.sleep(self.get_sleep_time())
  378. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/vf_search_carousel_text"]').click()
  379. time.sleep(self.get_sleep_time())
  380. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]').click()
  381. time.sleep(self.get_sleep_time())
  382. self.d.send_keys(self.search_key, clear=True)
  383. time.sleep(self.get_sleep_time())
  384. self.d.xpath('//*[@text="搜索"]').click()
  385. time.sleep(self.get_sleep_time())
  386. def get_table_license_info(self, qualification_number):
  387. try:
  388. sql = f'select business_license_company,city,credit_code from mt_drug where credit_code = "{qualification_number}"'
  389. self.mysql_client.cur.execute(sql)
  390. res = self.mysql_client.cur.fetchone()
  391. return res
  392. except:
  393. return None
  394. def integrate_data(self):
  395. """
  396. 整合数据
  397. :return:
  398. """
  399. title_info = self.get_title() # 药品,规格
  400. if title_info:
  401. product, specifications = title_info
  402. if self.search_key not in product.replace(' ', ''):
  403. self.swipe_back(1)
  404. self.unrelated_data += 1
  405. return
  406. else:
  407. self.swipe_back(1)
  408. return
  409. min_price = self.drug_price() # 最低价格
  410. for i in range(3):
  411. if self.d.xpath('//*[@text="进店"]').exists:
  412. print('开始获取店铺名')
  413. break
  414. self.d.swipe_ext('up', 0.2)
  415. time.sleep(1)
  416. # detail_info = self.d.xpath(
  417. # '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[6]').info
  418. # bounds = detail_info['bounds']
  419. # height = bounds['bottom'] - bounds['top']
  420. # if self.d.xpath('//*[@text="进店"]').exists and height > 100:
  421. if self.d.xpath('//*[@text="进店"]').exists:
  422. print('开始获取店铺名')
  423. break
  424. shop = self.get_shop_name()
  425. # 爬取日期
  426. scrape_date = self.get_current_date()
  427. dup_data = {'product': product, 'min_price': min_price, 'shop': shop, 'scrape_date': scrape_date,
  428. 'platform': '美团'}
  429. if self.data_is_exists(dup_data):
  430. print('存在相同数据不入库')
  431. self.swipe_back(1)
  432. return
  433. is_has_instructions = self.has_instructions()
  434. if not shop:
  435. print('未获取到店铺名:开始回退')
  436. self.swipe_back(1)
  437. return
  438. if not shop or '京东自营' in shop:
  439. self.swipe_back(1)
  440. return
  441. time.sleep(self.get_sleep_time())
  442. # 生产日期为空
  443. manufacture_date = ''
  444. # 执政信息
  445. # if is_has_enter_shop:
  446. # license_info = self.get_license_info()
  447. # business_license_company = license_info["单位名称"]
  448. # credit_code = license_info['社会信用代码']
  449. # city_str = license_info['地址']
  450. # # 先把省份啥的替换掉
  451. # city_sub_str = re.sub(r'[u4e00-\u9fa5]+省', '', city_str)
  452. # try:
  453. # city = re.search(r'[\u4e00-\u9fa5]+?(市|区|县)', city_sub_str).group(0)
  454. # except:
  455. # city = city_sub_str
  456. # try:
  457. # province = self.city2province[city]
  458. # except:
  459. # province = ''
  460. # self.swipe_back(2)
  461. # else:
  462. # business_license_company = ''
  463. # credit_code = ''
  464. # city = ''
  465. # province = ''
  466. business_license_company = ''
  467. credit_code = ''
  468. city = ''
  469. province = ''
  470. # 说明书等信息
  471. if is_has_instructions:
  472. print('开始获取说明书信息')
  473. instructions_info = self.get_instructions_data()
  474. expiry_date = instructions_info['有效期'].strip('。')
  475. manufacturer = instructions_info['生产单位'].strip('。')
  476. approval_number = instructions_info['批准文号'].strip('。')
  477. else:
  478. # 没有说明书不入库
  479. self.swipe_back(1)
  480. return
  481. self.unrelated_data = 0
  482. # 商品链接
  483. product_link = ''
  484. # 爬取省份
  485. scrape_province = '广东' # 这里先默认广东
  486. # 是否有货
  487. availability = ''
  488. save_data = {
  489. 'product': product,
  490. 'min_price': min_price,
  491. 'manufacture_date': manufacture_date,
  492. 'expiry_date': expiry_date,
  493. 'shop': shop,
  494. 'business_license_company': business_license_company,
  495. 'province': province,
  496. 'city': city,
  497. 'manufacturer': manufacturer,
  498. 'specification': specifications,
  499. 'approval_number': approval_number,
  500. 'product_link': product_link,
  501. 'scrape_date': scrape_date,
  502. 'scrape_province': scrape_province,
  503. 'availability': availability,
  504. 'credit_code': credit_code,
  505. 'platform': '美团'
  506. }
  507. self.save_to_database(save_data)
  508. if self.distinct_target():
  509. print('已到达搜索列表页')
  510. else:
  511. for i in range(1):
  512. self.swipe_back(1)
  513. # 最外部有个定位按钮
  514. if self.distinct_target():
  515. break
  516. def main(self, device_id):
  517. spider_no = 0
  518. self.connect_devices(device_id)
  519. time.sleep(self.get_sleep_time())
  520. # 重新开启美团应用
  521. self.restart_app()
  522. # 搜索关键字
  523. self.enter_target_page()
  524. for idx in range(100):
  525. print(f'第{idx + 1}页')
  526. if spider_no > 30:
  527. time.sleep(120)
  528. spider_no = 0
  529. print('目前无关数据量: ', self.unrelated_data)
  530. # if self.unrelated_data > 10:
  531. # # 连续超过5个不达标的数据则停止采集
  532. # break
  533. drug_lis = self.d.xpath('//android.support.v7.widget.RecyclerView/android.widget.FrameLayout').all()
  534. for drug_one in drug_lis:
  535. bounds = drug_one.info['bounds']
  536. top = bounds['top']
  537. bottom = bounds['bottom']
  538. # height = bottom - top
  539. if 304 <= top and bottom <= 1475: # 默认高度241的才行
  540. # print('目标-->', drug_one.info)
  541. drug_one.click()
  542. # print('点击目标药品完毕')
  543. time.sleep(2)
  544. # 采集药品信息
  545. try:
  546. self.integrate_data()
  547. # 检测下是否回退到列表页
  548. if self.distinct_target():
  549. print('回退到列表页', True)
  550. else:
  551. print('回退到列表页失败,终止采集')
  552. return
  553. time.sleep(self.get_sleep_time())
  554. spider_no += 1
  555. except Exception as e:
  556. print(f'采集药品详情数据出错:{e}')
  557. if not self.distinct_target():
  558. for i in range(1):
  559. self.swipe_back(1)
  560. # 最外部有个定位按钮
  561. if self.distinct_target():
  562. break
  563. if i == 0 and not self.distinct_target():
  564. print('页面出错,退出采集')
  565. return
  566. else:
  567. continue
  568. if self.d.xpath('//*[@text="已经到底啦"]').exists:
  569. print('已经到达列表页最底部')
  570. return
  571. search_list = self.d.xpath('//android.support.v7.widget.RecyclerView').info
  572. bounds = search_list['bounds']
  573. print('搜索列表高度', bounds['bottom'] - bounds['top'])
  574. self.d.swipe(200, 1400, 200, 1400 + bounds['top'] - bounds['bottom'])
  575. time.sleep(self.get_sleep_time())
  576. def unitest(self):
  577. """
  578. 单元测试
  579. :return:
  580. """
  581. pass
  582. def main():
  583. mt = MT('舒肝颗粒') # 参苓健脾胃颗粒 舒肝颗粒 清肺化痰丸 香砂平胃颗粒
  584. # mt.main('95b2c764')
  585. mt.main('fcb3c749')
# Run a single crawl when the file is executed as a script.
if __name__ == '__main__':
    main()
# Disabled alternative: run main() on a daily schedule (21:30) via APScheduler.
# scheduler = BlockingScheduler()
# scheduler.add_job(main, 'cron', hour=21, minute=30, misfire_grace_time=120)
# try:
#     scheduler.start()
# except (KeyboardInterrupt, SystemExit):
#     pass