"""Meituan drug-listing scraper driven through uiautomator2.

The script searches the Meituan Android app for a drug name, opens every
result on the list page, reads title / price / shop / package-insert data
from the UI and stores one row per product in MySQL.
"""
import base64
import datetime
import json
import random
import re
import subprocess
import time

import cv2
import requests
import uiautomator2 as u2
from apscheduler.schedulers.blocking import BlockingScheduler

from db_mysql import mysqlClient

# Timeout (seconds) applied to every outbound HTTP request.
_HTTP_TIMEOUT = 30

# Common ancestor of several deep XPath expressions used below.
_CONTAINER_ROOT = (
    '//*[@resource-id="com.sankuai.meituan:id/container"]'
    '/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]'
    '/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
    '/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]'
    '/android.widget.FrameLayout[1]/android.view.ViewGroup[1]'
)

# Node whose ViewGroup children hold the package-insert text blocks.
_INSTRUCTIONS_BOX = (
    _CONTAINER_ROOT
    + '/android.view.ViewGroup[2]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[3]/android.widget.ScrollView[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]'
)

# Element that main()/integrate_data() treat as "we are on the result list".
_LIST_PAGE_MARKER = (
    _CONTAINER_ROOT
    + '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[3]'
)

# Search input box on the search page.
_SEARCH_INPUT = (
    _CONTAINER_ROOT
    + '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[2]/android.view.ViewGroup[1]'
)

# Shop-name TextView; "{pos}" is filled with "last()" or "last()-1".
_SHOP_NAME_TEMPLATE = (
    '//android.widget.ScrollView/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[{pos}]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[3]'
    '/android.widget.FrameLayout[1]/android.widget.TextView'
)

_TITLE_ROOT = (
    '//android.widget.ScrollView/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[3]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
)
_TITLE_PRIMARY = (
    _TITLE_ROOT
    + '/android.view.ViewGroup[2]/android.view.ViewGroup[1]'
    '/android.view.ViewGroup[1]/android.widget.FrameLayout[1]'
    '//android.widget.TextView'
)
_TITLE_FALLBACK = (
    _TITLE_ROOT
    + '/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
    '//android.widget.TextView'
)

_QUALIFICATION_XPATH = (
    '//*[@resource-id="com.sankuai.meituan:id/mil_container"]'
    '/android.webkit.WebView[1]/android.webkit.WebView[1]'
    '/android.view.View[1]/android.view.View[1]'
    '/android.widget.TextView[2]'
)


def get_access_token():
    """Request an OAuth access token from the Baidu AI platform.

    :return: the ``access_token`` string, or ``None`` when the request fails
        or the response does not contain a token.
    """
    # NOTE(security): credentials are hard-coded in source; they should be
    # moved to configuration / environment and rotated.
    AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
    AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
    token_url = 'https://aip.baidubce.com/oauth/2.0/token'
    query = {
        'grant_type': 'client_credentials',
        'client_id': AppKey,
        'client_secret': AppSrcret,
    }
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    try:
        response = requests.post(token_url, params=query, headers=headers,
                                 data="", timeout=_HTTP_TIMEOUT)
        return response.json()['access_token']
    except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
        print(f'获取access_token失败:{exc}')
        return None


class MT:
    """Scraper for one search keyword in the Meituan app."""

    # Only list items fully inside this vertical band (pixels) are clicked.
    # NOTE(review): values look tuned for one specific screen size - confirm.
    LIST_TOP_MIN = 304
    LIST_BOTTOM_MAX = 1475
    # Height (pixels) of a full package-insert page; a shorter box is taken
    # to mean the bottom of the insert was reached.
    FULL_PAGE_HEIGHT = 938

    def __init__(self, key):
        """Fetch the OCR token, load city data and open the MySQL client.

        :param key: drug name typed into the in-app search box.
        """
        self.package_name = 'com.sankuai.meituan'
        # Device handle; assigned by connect_devices().
        self.d = None
        self.access_token = get_access_token()
        self.city2province = self.get_city_info()
        # NOTE(security): database credentials are hard-coded in source.
        host = "localhost"
        user = "root"
        password = "dfwy2025"
        database = "drug_data"
        port = 3306
        self.table_name = "mt_drug"
        self.mysql_client = mysqlClient(host, user, password, database, port)
        self.search_key = key
        # Counter of results whose title does not contain the keyword;
        # reset whenever a matching product is stored.
        self.unrelated_data = 0

    def stop_app(self):
        """Stop the Meituan app and wait five seconds."""
        self.d.app_stop(self.package_name)
        time.sleep(5)

    def start_app(self):
        """Start the Meituan app and wait five seconds."""
        self.d.app_start(self.package_name)
        time.sleep(5)

    def restart_app(self):
        """Restart the app (stop, then start)."""
        self.stop_app()
        self.start_app()

    @staticmethod
    def get_sleep_time():
        """Return a random pause length between 5 and 8 seconds."""
        return random.randint(5, 8)

    @staticmethod
    def get_current_date():
        """Return today's local date formatted as ``YYYY/MM/DD``."""
        return datetime.datetime.now().strftime('%Y/%m/%d')

    @staticmethod
    def get_city_info():
        """Load ``../kailin_city.json`` and map city name -> province name.

        The path is relative to the current working directory.

        :return: dict of ``{city_name: province_name}``.
        """
        file_path = '../kailin_city.json'
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        provinces = {entry["id"]: entry for entry in data['province']}
        city2province = dict()
        for city_one in data['city']:
            pid = city_one['pid']
            if len(str(pid)) > 2:
                # Longer ids carry the province id in their first two digits.
                pid = int(re.match(r'^\d{2}', str(pid)).group())
            city2province[city_one['name']] = provinces[pid]['name']
        return city2province

    def get_shop_name(self):
        """Read the shop name from the product detail page.

        Two layout variants are tried in turn.

        :return: the shop name, or ``None`` when neither variant is found.
        """
        try:
            shop_name = self.d.xpath(
                _SHOP_NAME_TEMPLATE.format(pos='last()')).text
            print(f'获取到店铺名:{shop_name}')
            return shop_name
        except Exception:
            pass
        try:
            shop_name = self.d.xpath(
                _SHOP_NAME_TEMPLATE.format(pos='last()-1')).text
            print(f'获取到店铺名:{shop_name}')
            return shop_name
        except Exception as e:
            print(f'获取店铺名出错:{e}')
            return None

    def get_qualification_number(self):
        """Read the qualification number from the merchant licence page.

        :return: the number with its "资质编号:" label removed, or ``None``
            when the element cannot be read.
        """
        try:
            raw = self.d.xpath(_QUALIFICATION_XPATH).text
            # Remove the label as a prefix (str.strip would treat it as a
            # character set); accept both ASCII and full-width colons.
            return re.sub(r'^\s*资质编号\s*[::]?', '', raw).strip()
        except Exception:
            return None

    def enter_detail(self):
        """Click the first item of the recycler list and pause."""
        self.d.xpath(
            '//*[@resource-id="com.sankuai.meituan:id/recycler"]'
            '/android.widget.FrameLayout[1]').click()
        time.sleep(self.get_sleep_time())

    def save_to_database(self, data):
        """Insert ``data`` as one row into ``self.table_name``."""
        self.mysql_client.insert(self.table_name, data)
        print(f'{data}-->存入数据库成功')

    def swipe_up(self):
        """Swipe up by almost one screen, then pause."""
        info = self.d.info
        screen_width = info['displayWidth']
        screen_height = info['displayHeight']
        duration_rate = random.uniform(0, 0.3)
        self.d.swipe(screen_width // 2, screen_height - 100,
                     screen_width // 2, 100, duration=duration_rate)
        if random.uniform(0, 1) > 0.85:
            # The list sometimes gets stuck; nudge it a little further.
            self.d.swipe_ext("up", 0.1)
        time.sleep(self.get_sleep_time())

    def swipe_back(self, no):
        """Press the back key ``no`` times, pausing after each press.

        :param no: number of back presses.
        """
        for _ in range(no):
            self.d.press('back')
            time.sleep(self.get_sleep_time())

    def drug_price(self):
        """Read the price from the first element whose text starts with "¥".

        :return: the price as ``float``, or ``None`` on any failure.
        """
        try:
            price_str = self.d.xpath('//*[starts-with(@text,"¥")]').text
            price = float(re.search(r'[\d\.]+', price_str).group())
            print(f'获取到价格:{price}')
            return price
        except Exception as e:
            print(f'提取价格出错-->{e}')
            return None

    def restart_uiautomator_services(self, device_id):
        """Stop and start the atx-agent server on the device via adb.

        :param device_id: adb serial of the target device.
        """
        agent = ['adb', '-s', device_id, 'shell',
                 '/data/local/tmp/atx-agent', 'server', '-d']
        # Argument lists (no shell) keep the device id from being
        # interpreted by a shell.
        for command in (agent + ['--stop'], agent):
            try:
                subprocess.run(command, capture_output=True, text=True)
            except OSError as exc:
                # Best effort, as before: a missing adb binary must not
                # abort the connection attempt.
                print(f'adb 调用失败: {exc}')
            time.sleep(self.get_sleep_time())

    def connect_devices(self, device_id):
        """Connect to the device over USB and restart its atx-agent.

        :param device_id: adb serial of the target device.
        :raises Exception: re-raises whatever the connection attempt raised.
        """
        try:
            self.d = u2.connect_usb(device_id)
            self.restart_uiautomator_services(device_id)
            print(f'连接到设备:{device_id}')
        except Exception as e:
            print(f'{device_id} 连接错误: {e}')
            # Bare raise keeps the original exception type and traceback.
            raise

    def get_ocr_res(self, img):
        """Send an image to Baidu's business-licence OCR endpoint.

        :param img: path of the image file.
        :return: dict of ``{field_name: recognised_text}``, or ``None`` when
            there is no token, the request fails or the reply is unusable.
        """
        if not self.access_token:
            return None
        request_url = ("https://aip.baidubce.com/rest/2.0/ocr/v1/"
                       "business_license")
        headers = {'content-type': 'application/x-www-form-urlencoded'}
        try:
            with open(img, 'rb') as f:
                encoded = base64.b64encode(f.read())
            response = requests.post(
                request_url + "?access_token=" + self.access_token,
                data={"image": encoded}, headers=headers,
                timeout=_HTTP_TIMEOUT)
            if not response:
                # Response objects are falsy for 4xx/5xx status codes.
                return None
            words_result = response.json()['words_result']
            new_dic = {field: entry['words']
                       for field, entry in words_result.items()}
            print('资质数据信息', new_dic)
            return new_dic
        except (OSError, requests.RequestException, AttributeError,
                KeyError, TypeError, ValueError) as exc:
            print(f'OCR识别出错:{exc}')
            return None

    def screenshot_the_business_license(self):
        """Screenshot the device and crop the licence area.

        Writes ``screenshot1.png`` and ``cropped_screenshot.png`` into the
        working directory.

        :return: path of the cropped image.
        :raises RuntimeError: when the screenshot cannot be read back.
        """
        screenshot_path = 'screenshot1.png'
        self.d.screenshot(screenshot_path)
        img = cv2.imread(screenshot_path)
        if img is None:
            raise RuntimeError(f'cannot read screenshot: {screenshot_path}')
        # Crop rectangle in pixels.
        # NOTE(review): presumably tuned for a 720-px-wide screen - confirm.
        left, top, right, bottom = 0, 480, 720, 1420
        cropped_screenshot_path = 'cropped_screenshot.png'
        cv2.imwrite(cropped_screenshot_path, img[top:bottom, left:right])
        return cropped_screenshot_path

    def get_title(self):
        """Read the product title and split it into name and specification.

        The title is expected to look like ``[brand]name<digits...>``.

        :return: ``(drugs_name, specifications)``, or ``None`` when the
            title is empty or does not match the expected format.
        """
        try:
            title = self.d.xpath(_TITLE_PRIMARY).text
        except Exception:
            title = self.d.xpath(_TITLE_FALLBACK).text
        print(f'获取到药品标题:{title}')
        match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title) if title else None
        if not match:
            print("没有匹配到预期格式")
            return None
        drugs_name = match.group(1).strip() + match.group(2).strip()
        specifications = match.group(3).strip()
        print("药品名:", drugs_name)
        print("规格:", specifications)
        print('完整药名:', drugs_name + specifications)
        return drugs_name, specifications

    def enter_shop(self):
        """Click "进店" (enter shop) and pause."""
        self.d.xpath('//*[@text="进店"]').click()
        time.sleep(self.get_sleep_time())

    def enter_shoper(self):
        """Click the "商家" (merchant) tab and pause."""
        self.d.xpath('//*[@text="商家"]').click()
        time.sleep(self.get_sleep_time())

    def scan_shoper_license(self):
        """Click "查看商家资质" (view merchant licence) and pause."""
        self.d.xpath('//*[@text="查看商家资质"]').click()
        time.sleep(self.get_sleep_time())

    def data_is_exists(self, data):
        """Check whether a row matching every key/value of ``data`` exists.

        Keys are interpolated as column names and must therefore come from
        trusted code; values are passed as query parameters.

        :param data: dict of ``{column: value}``.
        :return: the first matching row, or ``None`` when there is none or
            the query fails.
        """
        try:
            conditions = ' AND '.join(f"{col} = %({col})s" for col in data)
            query = f"SELECT * FROM `{self.table_name}` WHERE {conditions}"
            cur = self.mysql_client.cur
            cur.execute(query, data)
            return cur.fetchone()
        except Exception as e:
            print(f"MySQL 错误: {str(e)}")
            return None

    @staticmethod
    def _value_after(items, label):
        """Return the element following the first ``label`` in ``items``."""
        try:
            return items[items.index(label) + 1]
        except (ValueError, IndexError):
            raise ValueError(f'说明书中未找到字段:{label}') from None

    def get_instructions_data(self):
        """Open the package insert and extract three fields from its text.

        :return: dict with keys "有效期", "生产单位" and "批准文号".
        :raises ValueError: when one of the labels is missing from the
            collected text.
        """
        load_more = '//*[@text="加载更多"]'
        self.d.xpath('//*[@text="说明"]').click()
        time.sleep(random.randint(3, 5))
        self.d.xpath('//*[@text="查看详细说明"]').click()
        time.sleep(random.randint(3, 5))
        self.d.xpath(load_more).click_exists()
        loop_page = 5
        new_list = list()
        for i in range(loop_page):
            self.d.xpath(load_more).click_exists()
            time.sleep(1)
            if i == 0:
                self.d.swipe(200, 1000, 200, 300, 0.4)
            else:
                self.d.swipe(200, 1000, 200, 62)
            time.sleep(1)
            if self.d.xpath(load_more).exists:
                self.d.xpath(load_more).click()
                time.sleep(1)
            sections = self.d.xpath(
                _INSTRUCTIONS_BOX + '/android.view.ViewGroup').all()
            for idx in range(1, len(sections) + 1):
                text_nodes = self.d.xpath(
                    _INSTRUCTIONS_BOX
                    + f'/android.view.ViewGroup[{idx}]'
                    '//android.widget.TextView').all()
                new_list.extend(node.text for node in text_nodes if node.text)
            if i == 0:
                height = self.FULL_PAGE_HEIGHT
            else:
                bounds = self.d.xpath(_INSTRUCTIONS_BOX).info['bounds']
                height = bounds['bottom'] - bounds['top']
            if height < self.FULL_PAGE_HEIGHT:
                # A box shorter than a full page means the end was reached.
                break
        # Drop the "expand full text" button label.
        new_list = [item for item in new_list if item != '展开全文']
        print(f'当前说明书列表数据:{new_list}')
        res_data = {
            label: self._value_after(new_list, label)
            for label in ("有效期", "生产单位", "批准文号")
        }
        print(f'当前说明书字典数据:{res_data}')
        return res_data

    def has_instructions(self):
        """Pause, then report whether a "说明" (instructions) tab exists.

        Products without a package insert cannot be scraped in detail.
        """
        time.sleep(self.get_sleep_time())
        return self.d.xpath('//*[@text="说明"]').exists

    def has_shop(self):
        """Pause, then report whether an "进店" (enter shop) button exists."""
        time.sleep(self.get_sleep_time())
        return self.d.xpath('//*[@text="进店"]').exists

    def get_license_info(self):
        """Navigate to the merchant licence and return its details.

        A row already stored for the qualification number is preferred;
        otherwise the licence image is screenshotted and sent to OCR.

        :return: dict with "单位名称", "地址" and "社会信用代码" from the
            database, or the OCR result dict, or ``None`` when OCR fails.
        """
        self.enter_shop()
        self.enter_shoper()
        self.scan_shoper_license()
        qualification_number = self.get_qualification_number()
        if qualification_number:
            stored = self.get_table_license_info(qualification_number)
            if stored:
                # assumes the cursor returns positional rows - TODO confirm
                return {
                    '单位名称': stored[0],
                    '地址': stored[1],
                    '社会信用代码': stored[2]
                }
        # Tap the licence entry by relative screen position.
        self.d.click(0.603, 0.27)
        time.sleep(self.get_sleep_time())
        cropped_path = self.screenshot_the_business_license()
        return self.get_ocr_res(cropped_path)

    def distinct_target(self):
        """Report whether the list-page marker element is on screen."""
        return self.d.xpath(_LIST_PAGE_MARKER).exists

    def _return_to_list(self, max_backs=1):
        """Press back up to ``max_backs`` times until the list page shows.

        :return: ``True`` when the list-page marker is visible afterwards.
        """
        if self.distinct_target():
            return True
        for _ in range(max_backs):
            self.swipe_back(1)
            if self.distinct_target():
                return True
        return False

    def enter_target_page(self):
        """Open the medicine section and search for ``self.search_key``."""
        self.d.xpath('//*[@content-desc="看病买药"]').click()
        time.sleep(self.get_sleep_time())
        self.d.xpath(
            '//*[@resource-id="com.sankuai.meituan:id/'
            'vf_search_carousel_text"]').click()
        time.sleep(self.get_sleep_time())
        self.d.xpath(_SEARCH_INPUT).click()
        time.sleep(self.get_sleep_time())
        self.d.send_keys(self.search_key, clear=True)
        time.sleep(self.get_sleep_time())
        self.d.xpath('//*[@text="搜索"]').click()
        time.sleep(self.get_sleep_time())

    def get_table_license_info(self, qualification_number):
        """Look up stored licence data by credit code.

        :param qualification_number: value compared with ``credit_code``.
        :return: ``(business_license_company, city, credit_code)`` row, or
            ``None`` when nothing matches or the query fails.
        """
        # The number is scraped from the app, so it is bound as a query
        # parameter instead of being formatted into the SQL text.
        sql = (f'select business_license_company,city,credit_code '
               f'from `{self.table_name}` where credit_code = %s')
        try:
            cur = self.mysql_client.cur
            cur.execute(sql, (qualification_number,))
            return cur.fetchone()
        except Exception as e:
            print(f"MySQL 错误: {str(e)}")
            return None

    def integrate_data(self):
        """Scrape the currently open product page and store one row.

        The page is left (one back press) without storing anything when the
        title is unparsable or unrelated to the keyword, no shop name is
        found, the shop is JD self-operated, the row already exists, or
        there is no package insert.
        """
        title_info = self.get_title()
        if not title_info:
            self.swipe_back(1)
            return
        product, specifications = title_info
        if self.search_key not in product.replace(' ', ''):
            self.swipe_back(1)
            self.unrelated_data += 1
            return

        min_price = self.drug_price()

        # Scroll until the "enter shop" button is visible (at most 3 tries).
        enter_shop_button = '//*[@text="进店"]'
        for _ in range(3):
            if self.d.xpath(enter_shop_button).exists:
                print('开始获取店铺名')
                break
            self.d.swipe_ext('up', 0.2)
            time.sleep(1)
            if self.d.xpath(enter_shop_button).exists:
                print('开始获取店铺名')
                break
        shop = self.get_shop_name()
        if not shop:
            print('未获取到店铺名:开始回退')
            self.swipe_back(1)
            return

        scrape_date = self.get_current_date()
        dup_data = {'product': product, 'min_price': min_price, 'shop': shop,
                    'scrape_date': scrape_date, 'platform': '美团'}
        if self.data_is_exists(dup_data):
            print('存在相同数据不入库')
            self.swipe_back(1)
            return

        is_has_instructions = self.has_instructions()
        if '京东自营' in shop:
            self.swipe_back(1)
            return
        time.sleep(self.get_sleep_time())

        if not is_has_instructions:
            # Products without a package insert are not stored.
            self.swipe_back(1)
            return
        print('开始获取说明书信息')
        instructions_info = self.get_instructions_data()
        self.unrelated_data = 0

        # NOTE: licence extraction (get_license_info + city2province lookup)
        # is currently disabled, so these columns are stored empty.
        save_data = {
            'product': product,
            'min_price': min_price,
            'manufacture_date': '',  # not available in the app
            'expiry_date': instructions_info['有效期'].strip('。'),
            'shop': shop,
            'business_license_company': '',
            'province': '',
            'city': '',
            'manufacturer': instructions_info['生产单位'].strip('。'),
            'specification': specifications,
            'approval_number': instructions_info['批准文号'].strip('。'),
            'product_link': '',
            'scrape_date': scrape_date,
            'scrape_province': '广东',  # defaults to Guangdong for now
            'availability': '',
            'credit_code': '',
            'platform': '美团'
        }
        self.save_to_database(save_data)
        if self.distinct_target():
            print('已到达搜索列表页')
        else:
            self._return_to_list()

    def main(self, device_id):
        """Connect to the device, run the search and scrape up to 100 pages.

        :param device_id: adb serial of the target device.
        """
        spider_no = 0
        self.connect_devices(device_id)
        time.sleep(self.get_sleep_time())
        self.restart_app()
        self.enter_target_page()
        for idx in range(100):
            print(f'第{idx + 1}页')
            if spider_no > 30:
                # Long pause after a batch of scraped items.
                time.sleep(120)
                spider_no = 0
            print('目前无关数据量: ', self.unrelated_data)
            drug_lis = self.d.xpath(
                '//android.support.v7.widget.RecyclerView'
                '/android.widget.FrameLayout').all()
            for drug_one in drug_lis:
                bounds = drug_one.info['bounds']
                fully_visible = (self.LIST_TOP_MIN <= bounds['top']
                                 and bounds['bottom'] <= self.LIST_BOTTOM_MAX)
                if not fully_visible:
                    continue
                drug_one.click()
                time.sleep(2)
                try:
                    self.integrate_data()
                    if self.distinct_target():
                        print('回退到列表页', True)
                    else:
                        print('回退到列表页失败,终止采集')
                        return
                    time.sleep(self.get_sleep_time())
                    spider_no += 1
                except Exception as e:
                    print(f'采集药品详情数据出错:{e}')
                    if not self._return_to_list():
                        print('页面出错,退出采集')
                        return
            if self.d.xpath('//*[@text="已经到底啦"]').exists:
                print('已经到达列表页最底部')
                return
            search_list = self.d.xpath(
                '//android.support.v7.widget.RecyclerView').info
            bounds = search_list['bounds']
            print('搜索列表高度', bounds['bottom'] - bounds['top'])
            # Scroll up by exactly the height of the visible list.
            self.d.swipe(200, 1400, 200,
                         1400 + bounds['top'] - bounds['bottom'])
            time.sleep(self.get_sleep_time())

    def unitest(self):
        """Placeholder for ad-hoc tests; does nothing."""
        pass


def main(search_key='舒肝颗粒', device_id='fcb3c749'):
    """Scrape one keyword on one device.

    Other keywords used so far: 参苓健脾胃颗粒 清肺化痰丸 香砂平胃颗粒

    :param search_key: drug name to search for.
    :param device_id: adb serial of the device (another one: 95b2c764).
    """
    mt = MT(search_key)
    mt.main(device_id)


if __name__ == '__main__':
    main()
    # To run on a schedule instead:
    # scheduler = BlockingScheduler()
    # scheduler.add_job(main, 'cron', hour=21, minute=30, misfire_grace_time=120)
    # try:
    #     scheduler.start()
    # except (KeyboardInterrupt, SystemExit):
    #     pass