new_mt_0.py 107 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149
  1. import sys
  2. import requests
  3. import base64
  4. import cv2
  5. import uiautomator2 as u2
  6. import time
  7. import subprocess
  8. import re
  9. import random
  10. import datetime
  11. import json
  12. from aip import AipOcr
  13. from apscheduler.schedulers.blocking import BlockingScheduler
  14. # from db_mysql import mysqlClient
  15. import threading
  16. from collections import deque
  17. import numpy as np
  18. import secrets
  19. import os
  20. import oss2
  21. import urllib.parse
  22. from exceptiongroup import catch
  23. from pygments.unistring import cats
  24. # import pyperclip
  25. from config import Config
  26. from logger import setup_logger
  27. import logging
  28. # from database import MySQLClient
  29. from PIL import Image
  30. from pathlib import Path
  31. from PIL import Image, ImageDraw, ImageFont
# Logging configuration.
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
setup_logger("mt_spider")  # initialise logging via the project's logger module
  35. class SpiderMonitor(threading.Thread):
  36. """全局弹窗监控线程(增强版)"""
  37. def __init__(self, spider_instance):
  38. super().__init__(daemon=True)
  39. self.spider = spider_instance
  40. self.running = True
  41. self.pausing = threading.Event() # 主线程同步事件
  42. self.last_verification_time = 0
  43. self.verification_count = 0
  44. self.MAX_VERIFICATION_RETRY = 10
  45. self.recent_clicks = deque(maxlen=10) # 防重复点击
  46. self.logger = logging.getLogger("SpiderMonitor")
  47. # 可配置化弹窗规则
  48. self.popup_rules = {
  49. "simple": [
  50. ('//*[@text="确定"]', "点击确定"),
  51. ('//*[@text="允许"]', "点击允许"),
  52. ('//*[@text="关闭"]', "点击关闭"),
  53. ('//*[@resource-id="com.sankuai.meituan:id/close"]', "关闭按钮"),
  54. ('//*[@resource-id="com.sankuai.meituan:id/address_center_location_close"]', "关闭按钮"),
  55. ('//*[@resource-id="com.sankuai.meituan:id/location_close"]', "关闭按钮"),
  56. ('//*[@resource-id="com.sankuai.meituan:id/btn_close"]', "关闭按钮"),
  57. ],
  58. "verification": [
  59. '//*[contains(@text, "验证")]',
  60. '//*[contains(@text, "滑块")]',
  61. '//*[contains(@text, "依次点击")]',
  62. '//*[contains(@text, "请点击")]',
  63. '//*[contains(@text, "拖动滑块刚")]', # 这个需要拖动滑块至最右边,然后再截图
  64. '//*[contains(@text, "请输入图片中的内容")]',
  65. '//*[contains(@text, "用最短线连接")]',
  66. '//*[contains(@text, "请按语序依次点击")]',
  67. '//*[contains(@text, "请向右滑动滑块")]',
  68. '//*[contains(@text, "请拖动下方滑块完成拼图")]',
  69. '//*[contains(@resource-id, "captcha")]'
  70. ]
  71. }
  72. def run(self):
  73. while self.running:
  74. try:
  75. handled = self.check_and_handle_popup()
  76. time.sleep(2 if handled else 1)
  77. except Exception as e:
  78. self.logger.exception("监控线程异常: %s", e)
  79. time.sleep(1)
  80. def _is_recent_click(self, xpath):
  81. """防止重复点击同一个弹窗"""
  82. key = f"{xpath}_{int(time.time())}"
  83. if key in self.recent_clicks:
  84. return True
  85. self.recent_clicks.append(key)
  86. return False
  87. def check_and_handle_popup(self):
  88. d = self.spider.d
  89. # 1. 处理简单弹窗
  90. for xpath, desc in self.popup_rules["simple"]:
  91. if d.xpath(xpath).exists and not self._is_recent_click(xpath):
  92. self.logger.info("检测到弹窗: %s", desc)
  93. d.xpath(xpath).click()
  94. return True
  95. # 2. 处理验证码弹窗
  96. for xpath in self.popup_rules["verification"]:
  97. if d.xpath(xpath).exists:
  98. now = time.time()
  99. if now - self.last_verification_time < 30:
  100. return False # 30秒内不重复触发
  101. self.last_verification_time = now
  102. self.verification_count += 1
  103. self.logger.warning("验证码弹窗触发,等待人工处理...")
  104. if self.verification_count > self.MAX_VERIFICATION_RETRY:
  105. self.logger.error("验证码重试超限,终止任务")
  106. self.spider.stop_all()
  107. return True
  108. self.pausing.set() # 通知主线程暂停
  109. # d.toast.show("需要人工处理验证码", 120)
  110. # 等待人工处理
  111. start = time.time()
  112. # while time.time() - start < 120*60:
  113. # if not d.xpath(xpath).exists:
  114. # self.logger.info("验证码已处理")
  115. # d.toast.show("验证完成", 2)
  116. # self.pausing.clear() # 放行主线程
  117. # return True
  118. # time.sleep(5)
  119. while True:
  120. if not d.xpath(xpath).exists:
  121. self.logger.info("验证码已处理")
  122. # d.toast.show("验证完成", 2)
  123. self.pausing.clear() # 放行主线程
  124. return True
  125. time.sleep(5)
  126. self.logger.warning("验证码超时,重启APP")
  127. self.spider.restart_app()
  128. return True
  129. # 3. 处理广告弹窗(点击右上角)
  130. if d.xpath('//*[contains(@text, "广告")]').exists:
  131. w, h = d.info['displayWidth'], d.info['displayHeight']
  132. d.click(w - 50, 50)
  133. self.logger.info("关闭广告弹窗")
  134. return True
  135. return False
  136. def stop(self):
  137. self.running = False
  138. class MTScreenshot:
  139. def __init__(self, d, oss_config, search_key, title_key, scroll_times=4, compress_quality=7, resize_ratio=0.8):
  140. # 接收外部已连接好的u2设备实例
  141. self.d = d
  142. self.search_key = search_key # 添加这行
  143. self.title_key = title_key
  144. # 启动全局弹窗监控
  145. self.monitor = SpiderMonitor(self)
  146. self.monitor.start()
  147. self.loggerMT = logging.getLogger()
  148. # 日志初始化
  149. self.logger = self._init_logger()
  150. # OSS配置与初始化(核心配置,无冗余)
  151. self.oss_config = oss_config
  152. self.oss_bucket = self._init_oss_bucket()
  153. # 截图核心参数
  154. self.scroll_times = scroll_times
  155. self.compress_quality = compress_quality
  156. self.resize_ratio = resize_ratio
  157. # self.title_xpaths = [
  158. # '//*[@resource-id="com.jd.lib.productdetail.feature:id/db"]',
  159. # '//*[@resource-id="com.jd.lib.productdetail.feature:id/cx"]',
  160. # '//*[@resource-id="com.jd.lib.productdetail.feature:id/cj"]'
  161. # ]
  162. def _init_logger(self):
  163. # 极简日志配置,仅保留必要输出
  164. logger = logging.getLogger("mt_screenshot")
  165. logger.setLevel(logging.INFO)
  166. logger.handlers.clear()
  167. handler = logging.StreamHandler()
  168. handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
  169. logger.addHandler(handler)
  170. return logger
  171. def _init_oss_bucket(self):
  172. # 仅做OSS配置校验和Bucket连接,无额外功能
  173. if not all([self.oss_config.get("access_key_id"),
  174. self.oss_config.get("access_key_secret"),
  175. self.oss_config.get("endpoint"),
  176. self.oss_config.get("bucket_name")]):
  177. self.logger.warning("OSS配置不完整,无法上传")
  178. return None
  179. try:
  180. auth = oss2.Auth(self.oss_config["access_key_id"], self.oss_config["access_key_secret"])
  181. bucket = oss2.Bucket(auth, self.oss_config["endpoint"], self.oss_config["bucket_name"])
  182. bucket.get_bucket_info() # 验证连接
  183. self.logger.info("OSS Bucket连接成功")
  184. return bucket
  185. except Exception as e:
  186. self.logger.error(f"OSS Bucket连接失败: {e}")
  187. return None
  188. def _upload_to_oss(self, local_path):
  189. # 极简上传逻辑,仅返回OSS URL或None
  190. if not self.oss_bucket or not os.path.exists(local_path):
  191. return None
  192. file_name = os.path.basename(local_path)
  193. safe_name = re.sub(r'[^\w\.\-]', '_', file_name)
  194. oss_key = f"{self.oss_config.get('oss_prefix', 'scrape_data/')}{safe_name}"
  195. try:
  196. oss2.resumable_upload(self.oss_bucket, oss_key, local_path)
  197. # 生成并返回完整OSS URL
  198. oss_file_url = f"https://{self.oss_config['bucket_name']}.{self.oss_config['endpoint']}/{urllib.parse.quote(oss_key, safe='/')}"
  199. self.logger.info(f"OSS上传成功: {oss_file_url}")
  200. return oss_file_url
  201. except Exception as e:
  202. self.logger.error(f"OSS上传失败: {e}")
  203. return None
  204. # def _get_title(self):
  205. # # 仅提取标题,无冗余逻辑
  206. # for xpath in self.title_xpaths:
  207. # elem = self.d.xpath(xpath)
  208. # if elem.exists:
  209. # info = elem.info
  210. # title = (info.get("contentDescription") or info.get("content-desc") or info.get("text") or "").strip()
  211. # return title[:50] # 限制标题长度,避免文件名过长
  212. # return ""
  213. def safe_exec(self, func, *args, **kwargs):
  214. """
  215. 万能安全壳:执行 func 前检查验证码,
  216. 若监控线程已置位 pausing,则一直阻塞直到放行。
  217. """
  218. while self.monitor.pausing.is_set():
  219. time.sleep(1)
  220. # 执行真正逻辑
  221. return func(*args, **kwargs)
  222. def _get_title(self):
  223. # try:
  224. # title = self.d.xpath(
  225. # '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView').text
  226. # except:
  227. # title = self.d.xpath(
  228. # '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.TextView').text
  229. # title = self.d.xpath('//*[contains(@text, "舒肝颗粒")]').text
  230. def _inner():
  231. # elif self.search_key == '三九胃泰颗粒':
  232. # self.search_key = '三九胃泰' #兼容三九胃泰 温胃舒颗粒
  233. print(f'获取商品title时的搜索关键字:{self.title_key}')
  234. # title = self.d.xpath(f'//*[contains(@text, "{self.search_key}")]').text
  235. # 初始化
  236. drugs_name = ''
  237. specifications = ''
  238. title = ''
  239. # 循环的获取title为了有时间来处理人机验证
  240. for m in range(1, 6000):
  241. if self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').exists:
  242. title = self.safe_exec(
  243. lambda: self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').text
  244. )
  245. self.loggerMT.info(f"第{m}次获取title成功")
  246. print(f"第{m}次获取title成功")
  247. break
  248. else:
  249. time.sleep(3)
  250. # return drugs_name, specifications
  251. # drugs_name = ''
  252. # specifications = ''
  253. # try:
  254. # title_xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  255. # title_xpath_2 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  256. # if self.d.xpath(title_xpath).exists:
  257. # title = self.d.xpath(title_xpath).text
  258. # print(f"title_xpath获取的title={title}")
  259. # if temp_search_key not in title:
  260. # return drugs_name, specifications
  261. # elif self.d.xpath(title_xpath_2).exists:
  262. # title = self.d.xpath(title_xpath_2).text
  263. # print(f"title_xpath_2获取的title={title}")
  264. # if temp_search_key not in title:
  265. # return drugs_name, specifications
  266. # else:
  267. # print('title_xpath不存在,请确认')
  268. # return drugs_name, specifications
  269. # # title = self.d.xpath(f'//*[contains(@text, "{temp_search_key}")]').text
  270. # except Exception as e:
  271. # print(f"发生异常: {e}")
  272. # return drugs_name, specifications
  273. # 奇怪:有的时候title取出来的记过第一位会多一个0
  274. # title = self.safe_exec(self.d.xpath(f'//*[contains(@text, "{self.search_key}")]').text)
  275. # title = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView').text
  276. title = title[1:] if title.startswith('0') else title
  277. print(f'获取到药品标题:{title}')
  278. # 从里面匹配出药品名和规格
  279. # drugs_name
  280. # specifications
  281. # match = re.search(r'([^\d]+)([\d\D]+)', title)
  282. if self.search_key == '999赐多康大豆':
  283. return title, '1罐'
  284. if self.search_key == "999感冒清热颗粒":
  285. match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
  286. else:
  287. match = re.match(r'(\[[^\]]+\])(.*?)\s*((?:\d+\S*|\(.+))$', title)
  288. if match:
  289. # drugs_name = match.group(1).strip() + match.group(2).strip()
  290. drugs_name = title
  291. specifications = match.group(3).strip()
  292. print("药品名:", drugs_name)
  293. print("规格:", specifications)
  294. # print('完整药名:', drugs_name + specifications)
  295. return drugs_name # , specifications
  296. else:
  297. if title == '999抗病毒口服液10ml*12' or title == '999抗病毒口服液':
  298. drugs_name = title
  299. specifications = '10ml*12支/盒'
  300. return drugs_name # , specifications
  301. elif title == '999抗病毒口服液10ml*10':
  302. drugs_name = title
  303. specifications = '10ml*10支/盒'
  304. return drugs_name # , specifications
  305. elif title == '999小柴胡颗粒':
  306. drugs_name = title
  307. specifications = '10g*9袋/盒'
  308. return drugs_name # , specifications
  309. elif title == '999养胃舒颗粒':
  310. drugs_name = title
  311. specifications = '10g*10袋/盒'
  312. return drugs_name # , specifications
  313. elif title == '三九胃泰胶囊':
  314. drugs_name = title
  315. specifications = '0.5g*24粒/盒'
  316. return drugs_name # , specifications
  317. elif title == '999补脾益肠丸':
  318. drugs_name = title
  319. specifications = '6g*9袋/盒'
  320. return drugs_name # , specifications
  321. elif title == '999感冒灵颗粒':
  322. drugs_name = title
  323. specifications = '10g*9袋/盒'
  324. return drugs_name # , specifications
  325. elif title == '999感冒灵胶囊':
  326. drugs_name = title
  327. specifications = '0.5g*12粒/盒'
  328. return drugs_name # , specifications
  329. else:
  330. print("没有匹配到预期格式")
  331. drugs_name = title
  332. specifications = ''
  333. return drugs_name # , specifications
  334. # 用 safe_exec 包装内部逻辑,确保验证码阻塞
  335. return self.safe_exec(_inner)
  336. def _merge_screenshots(self, screens):
  337. # 仅拼接截图,无额外功能
  338. if len(screens) == 1:
  339. return screens[0].convert('RGB')
  340. rgb_screens = [s.convert('RGB') for s in screens]
  341. total_width = rgb_screens[0].width
  342. total_height = sum(s.height for s in rgb_screens)
  343. merged_img = Image.new('RGB', (total_width, total_height))
  344. y_offset = 0
  345. for img in rgb_screens:
  346. merged_img.paste(img, (0, y_offset))
  347. y_offset += img.height
  348. return merged_img
  349. def get_oss_url(self):
  350. """核心方法:截图+临时本地保存+上传OSS+上传成功删本地文件+返回OSS URL,可直接赋值给oss_file"""
  351. local_file_path = None
  352. try:
  353. # 1. 提取标题
  354. title = self._get_title()
  355. self.logger.info(f"获取标题: {title[:20]}..." if title else "未获取到标题")
  356. # 2. 生成本地文件路径
  357. timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  358. safe_title = re.sub(r'[\\/*?:"<>|]', '_', title)
  359. local_dir = "../scrape_data"
  360. os.makedirs(local_dir, exist_ok=True)
  361. local_file_path = os.path.join(local_dir, f"{timestamp}_{safe_title}.jpg")
  362. # 3. 滚动截图
  363. screen_list = [self.d.screenshot()]
  364. w, h = self.d.window_size()
  365. for i in range(self.scroll_times):
  366. # 可能滑动距离太短,截不到店名。原本是0.8
  367. # self.d.swipe(w // 2, h * 0.9, w // 2, h * 0.1, duration=random.uniform(0.6, 1.2))
  368. self.d.swipe(w // 2, h * 0.85, w // 2, h * 0.15, # 滑动到15%
  369. duration=random.uniform(0.8, 1.5))
  370. time.sleep(random.uniform(2.0, 4.0))
  371. screen_list.append(self.d.screenshot())
  372. if self.d(textContains='商家服务').exists:
  373. # 看情况是否需要补滑
  374. break
  375. # # ========== 自动处理“是否存储图像”弹窗 ==========
  376. # # 检测弹窗是否存在(根据弹窗的文本/控件ID定位)
  377. # # 通过“是否存储图像”文本定位弹窗
  378. # # if self.d(text="是否存储图像").exists(timeout=2):
  379. # # # 点击“取消”(不需要系统存储截图)
  380. # # self.d(text="取消").click(timeout=2)
  381. # # self.logger.info("已自动关闭“是否存储图像”弹窗")
  382. #
  383. # #出现标题 break
  384. # ========== 滑动截图完成后,滑回初始位置 ==========
  385. # self.logger.info("开始滑回初始位置")
  386. #
  387. # for i in range(self.scroll_times):
  388. # # 反向滑动(与正向滑动方向相反)
  389. # self.d.swipe_ext('down', 0.8)
  390. # time.sleep(random.uniform(1.0, 2.0))
  391. # print(f"第{i+1}次反向滑动,已滑回部分距离")
  392. # self.logger.info("✅ 已滑回初始页面位置")
  393. # 4. 拼接+压缩+保存
  394. merged_img = self._merge_screenshots(screen_list)
  395. if 0.1 < self.resize_ratio < 1.0:
  396. new_size = (int(merged_img.width * self.resize_ratio), int(merged_img.height * self.resize_ratio))
  397. resample_mode = Image.Resampling.LANCZOS if hasattr(Image, 'Resampling') else Image.LANCZOS
  398. merged_img = merged_img.resize(new_size, resample_mode)
  399. # 临时保存到本地
  400. merged_img.save(local_file_path, format='JPEG', quality=self.compress_quality)
  401. merged_img.close() # 释放长图句柄
  402. self.logger.info(f"临时本地保存: {local_file_path}")
  403. # 5. 上传OSS
  404. oss_url = self._upload_to_oss(local_file_path)
  405. # 6. 核心:OSS上传成功后,删除本地临时文件
  406. if oss_url is not None:
  407. try:
  408. # 先不删除,检查还有没有问题
  409. # os.remove(local_file_path)
  410. self.logger.info(f"✅ OSS上传成功,已删除本地临时文件: {local_file_path}")
  411. # 若本地目录为空,可删除目录(按需开启)
  412. # if not os.listdir(local_dir):
  413. # os.rmdir(local_dir)
  414. # self.logger.info(f"本地目录{local_dir}为空,已删除")
  415. except Exception as e:
  416. self.logger.warning(f"⚠️ OSS上传成功,但删除本地文件失败: {e}")
  417. return oss_url
  418. except Exception as e:
  419. self.logger.error(f"截图/上传失败: {e}")
  420. return None
  421. def get_access_token():
  422. AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
  423. AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  424. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  425. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  426. payload = ""
  427. headers = {
  428. 'Content-Type': 'application/json',
  429. 'Accept': 'application/json'
  430. }
  431. response = requests.request("POST", url, headers=headers, data=payload)
  432. try:
  433. return response.json()['access_token']
  434. except:
  435. return None
  436. def get_mysql():
  437. """
  438. 建立并返回一个到数据库的连接对象
  439. """
  440. import pymysql
  441. return pymysql.connect(
  442. host=Config.DB_HOST, # "localhost", # 修改后的主机
  443. port=Config.DB_PORT, # 3306, # 添加端口号
  444. user=Config.DB_USER, # 'root', # 修改后的用户名
  445. password=Config.DB_PASSWORD, # 修改后的密码
  446. db=Config.DB_NAME, # "drug_data", # 修改后的数据库名
  447. charset='utf8mb4'
  448. )
  449. class MT:
  450. def __init__(self, key, title_key,spec_list,brand,sort=None):
  451. # self.package_name = 'com.sankuai.meituan'
  452. self.package_name = Config.PACKAGE_NAME
  453. self.access_token = get_access_token()
  454. self.city2province = self.get_city_info()
  455. self.APP_ID = '116857964'
  456. self.API_KEY = '1gAzACJOAr7BeILKqkqPOETh'
  457. self.SECRET_KEY = 'ZNArANb9GwJYgLKg4EfYhukKBfPdl1n3'
  458. self.client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
  459. self.table_name = Config.DB_TABLE # "mt_drug_middle"
  460. self.shop_table_name = Config.DB_SHOP_TABLE
  461. # print(f'数据库表名:table_name:{self.table_name},shop_table_name:{self.shop_table_name}')
  462. # self.mysql_client = mysqlClient(host, user, password, database, port)
  463. self.loggerMT = logging.getLogger()
  464. self.search_key = key # 参苓健脾胃颗粒 舒肝颗粒 清肺化痰丸 香砂平胃颗粒
  465. self.title_key = title_key
  466. self.spec_list = spec_list
  467. self.brand = brand
  468. self.sort = sort
  469. self.sort_key = 0
  470. self.unrelated_data = 0 # 连续无关数据数量
  471. self.shop_data_num = 0 # 店铺数据数量
  472. def stop_app(self):
  473. self.d.app_stop(self.package_name)
  474. time.sleep(5)
  475. def start_app(self):
  476. self.d.app_start(self.package_name)
  477. time.sleep(5)
  478. def restart_app(self):
  479. """
  480. 重启app
  481. :return:
  482. """
  483. self.stop_app()
  484. self.start_app()
  485. # 排序采集
  486. def li_or_lo(self, key):
  487. if key == "升序":
  488. self.sort_key +=1
  489. self.d.xpath('//*[@text="价格"]').click()
  490. time.sleep(self.get_sleep_time())
  491. if key == "降序":
  492. self.sort_key +=1
  493. self.d.xpath('//*[@text="价格"]').click()
  494. time.sleep(1)
  495. self.d.xpath('//*[@text="价格"]').click()
  496. time.sleep(1)
  497. def wr_re(self, mod, device_id, title=None, prices=None,shop=None, sort=None):
  498. file_path = f'./ycwj/{device_id}_{self.title_key}.txt'
  499. if mod == "写":
  500. try:
  501. data = {"title": title if title else "","prices": prices if prices else "","shop": shop if shop else "","sort": sort if sort else ""}
  502. os.makedirs(os.path.dirname(file_path), exist_ok=True)
  503. with open(file_path, 'w', encoding='utf-8') as f:
  504. json.dump(data, f, ensure_ascii=False, indent=2)
  505. self.loggerMT.info(f"进度保存成功: {title}")
  506. except Exception as e:
  507. self.loggerMT.error(f"保存进度失败: {e}")
  508. elif mod == "读":
  509. try:
  510. if not os.path.exists(file_path):
  511. return None
  512. with open(file_path, 'r', encoding='utf-8') as f:
  513. data = json.load(f)
  514. while True:
  515. if self.d.xpath(f'//*[@text="{data["shop"]}"]').exists and self.d.xpath(f'//*[@text="¥{data["prices"]}"]').exists:
  516. if self.sort and self.sort_key == 0:
  517. self.li_or_lo(self.sort)
  518. break
  519. else:
  520. self.d.drag(300, 1400, 300, 400, 1)
  521. return data
  522. except Exception as e:
  523. self.loggerMT.error(f"读取进度失败: {e}")
  524. return None
  525. return None
  526. # 任何一个spec满足都算有效
  527. def is_link_spec_useful(self, product_title):
  528. if len(self.spec_list) == 0:
  529. return True
  530. for spec in self.spec_list:
  531. if spec in product_title:
  532. return True
  533. return False
  534. # TODO 继续优化这里的判断逻辑,可以考虑搭配config的修改
  535. def is_link_useful(self, product_title):
  536. if self.title_key != "" and self.title_key not in product_title:
  537. print(f"当前商品名称:{product_title} 不包含{self.title_key}关键字")
  538. return False
  539. if self.brand != "" and self.brand not in product_title:
  540. print(f"当前商品名称:{product_title} 不包含{self.brand}品牌")
  541. return False
  542. if not self.is_link_spec_useful(product_title):
  543. print(f"当前商品名称:{product_title} 不包含{self.spec_list}品规")
  544. return False
  545. return True
  546. @staticmethod
  547. def get_sleep_time():
  548. # return random.randint(5, 8)
  549. return random.randint(1, 2)
  550. @staticmethod
  551. def get_current_date():
  552. return datetime.datetime.now().strftime('%Y/%m/%d')
  553. @staticmethod
  554. def get_city_info():
  555. """
  556. 获取所有的省市数据
  557. :return:
  558. """
  559. file_path = '../kailin_city.json'
  560. with open(file_path, 'r', encoding='utf-8') as f:
  561. data = json.load(f)
  562. province = {province_one["id"]: province_one for province_one in data['province']}
  563. city2province = dict()
  564. city = data['city']
  565. for city_one in city:
  566. name = city_one['name']
  567. pid = city_one['pid']
  568. if len(str(pid)) > 2:
  569. pid = int(re.match('^\d{2}', str(pid)).group())
  570. city2province[name] = province[pid]['name']
  571. return city2province
  def get_shop_name(self):
      """Read the shop name from the product detail page, falling back to the shop page.

      Two detail-page locations are tried in turn: the shop row under the last
      ``ViewGroup`` of the scroll view, then under the second-to-last one.  If
      both lookups raise, the shop page is opened through ``enter_shop`` and
      the name is read from its header; the method navigates back only after a
      successful header read.

      :return: the shop name, or an empty string when no lookup succeeds
      """
      detail_lookups = (
          ('获取到店铺名',
           '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView'),
          ('获取到店铺名2',
           '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView'),
      )
      last_error = None
      for label, detail_xpath in detail_lookups:
          try:
              shop_name = self.d.xpath(detail_xpath).text
              print(f'{label}:{shop_name}')
              return shop_name
          except Exception as e:
              # Catch Exception (not a bare except) so Ctrl+C / SystemExit still
              # stop the scraper; any lookup error just moves on to the next try.
              last_error = e

      # Both detail-page lookups failed: open the shop page and read its header.
      print("点击店铺进入后获取店铺名称")
      self.enter_shop()
      shop_xpath = '//*[@resource-id="com.sankuai.meituan:id/layout_header_view"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]//android.widget.FrameLayout[2]/android.widget.FrameLayout[1]/android.widget.TextView'
      if self.d.xpath(shop_xpath).exists:
          shop_name = self.d.xpath(shop_xpath).text
          self.swipe_back(1)
          return shop_name
      print(f'获取店铺名出错:{last_error}')
      return ''
  def get_qualification_number(self):
      """Read the qualification number text from the merchant qualification view.

      :return: the text with the ``资质编号:`` label characters and surrounding
          whitespace removed, or None when the element cannot be read
      """
      number_xpath = '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]'
      try:
          qualification_number_str = self.d.xpath(number_xpath).text
          # str.strip(chars) treats its argument as a character set and trims
          # those characters from both ends; it is not a prefix removal.
          return qualification_number_str.strip('资质编号:').strip()
      except Exception:
          # Catch Exception rather than everything, so KeyboardInterrupt and
          # SystemExit are not swallowed.
          return None
  def get_shop_address(self):
      """Read the shop address text from the shop content view.

      When the first text found contains ``发货时间`` (a delivery-time line), the
      text under the second ``ViewGroup`` is read instead; if that element does
      not exist, the first text is kept.

      :return: the address text, an empty string when the first element does
          not exist, or None when any lookup raises
      """
      try:
          xpath = '//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.TextView'
          if not self.d.xpath(xpath).exists:
              shop_address = ''
              print(f'333-获取到店铺地址:{shop_address}')
              return shop_address

          shop_address = self.d.xpath(xpath).text
          print(f'111-获取到店铺地址:{shop_address}')
          if '发货时间' in shop_address:
              print(f'店铺地址包含发货时间,再次获取店铺地址')
              xpath2 = '//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.TextView'
              if self.d.xpath(xpath2).exists:
                  shop_address = self.d.xpath(xpath2).text
                  print(f'222-获取到店铺地址:{shop_address}')
              else:
                  print(f'222-xpath2获取店铺地址失败')
          return shop_address
      except Exception as e:
          # Catch Exception rather than everything so Ctrl+C still works, and
          # report the cause instead of discarding it.
          print(f'获取店铺地址出错-get_shop_address: {e}')
          return None
  def enter_detail(self):
      """Tap the first ``FrameLayout`` child of the ``recycler`` view, then pause."""
      first_entry = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/recycler"]/android.widget.FrameLayout[1]')
      first_entry.click()
      pause_seconds = self.get_sleep_time()
      time.sleep(pause_seconds)
  def save_to_database(self, data):
      """Insert one scraped product row into ``self.table_name``, retrying until it commits.

      Every failure (including a missing key in ``data``) is printed and retried
      after a short pause; the loop has no attempt limit.

      :param data: mapping that provides a value for every column named in
          ``columns`` below
      :return: None once the row has been committed
      """
      # One ordered list drives the column names, the placeholders and the
      # values, so the three can no longer drift apart.
      columns = (
          'product', 'min_price', 'manufacture_date', 'expiry_date', 'shop',
          'business_license_company', 'province', 'city', 'manufacturer',
          'specification', 'approval_number', 'product_link', 'scrape_date',
          'scrape_province', 'availability', 'credit_code', 'platform',
          'search_key', 'sales', 'inventory', 'snapshot_url',
      )
      column_list = ', '.join(columns)
      placeholders = ', '.join(['%s'] * len(columns))
      i = 0
      while True:
          cur = None
          try:
              i += 1
              print(f'第{i}次保存数据到数据库:{data}')
              conn = get_mysql()
              cur = conn.cursor()
              add_sql = f"""
                  INSERT INTO {self.table_name}
                  ({column_list})
                  VALUES ({placeholders})
              """
              cur.execute(add_sql, tuple(data[column] for column in columns))
              conn.commit()
              return
          except Exception as e:
              print(f'保存数据库异常: {e}')
              time.sleep(self.get_sleep_time())
          finally:
              # Release the cursor on every attempt; the retry loop used to leak one
              # per iteration.  The connection is left alone because this block
              # cannot tell whether get_mysql() hands out a shared connection.
              if cur is not None:
                  try:
                      cur.close()
                  except Exception as close_error:
                      print(f'关闭游标异常: {close_error}')
  def save_shop_info_to_database(self, data):
      """Insert one shop row into ``self.shop_table_name``, retrying until it commits.

      Every failure (including a missing key in ``data``) is printed and retried
      after a short pause; the loop has no attempt limit.

      :param data: mapping that provides a value for every column named in
          ``columns`` below
      :return: None once the row has been committed
      """
      # One ordered list drives the column names, the placeholders and the values.
      columns = (
          'shop', 'contact_address', 'qualification_number',
          'business_license_company', 'business_license_address',
          'scrape_date', 'platform',
      )
      column_list = ', '.join(columns)
      placeholders = ', '.join(['%s'] * len(columns))
      i = 0
      while True:
          cur = None
          try:
              i += 1
              print(f'第{i}次保存店铺数据到数据库:{data}')
              conn = get_mysql()
              cur = conn.cursor()
              add_sql = f"""
                  INSERT INTO {self.shop_table_name}
                  ({column_list})
                  VALUES ({placeholders})
              """
              cur.execute(add_sql, tuple(data[column] for column in columns))
              conn.commit()
              print(f'存入店铺信息到数据库成功')
              return
          except Exception as e:
              print(f'保存数据库异常: {e}')
              time.sleep(self.get_sleep_time())
          finally:
              # Release the cursor on every attempt; the retry loop used to leak one
              # per iteration.  The connection is left alone because this block
              # cannot tell whether get_mysql() hands out a shared connection.
              if cur is not None:
                  try:
                      cur.close()
                  except Exception as close_error:
                      print(f'关闭游标异常: {close_error}')
  def swipe_up(self):
      """Swipe from near the bottom of the screen to near the top, then pause.

      The swipe runs along the horizontal centre, from 100 px above the bottom
      edge to 100 px below the top edge, with a random duration in [0, 0.3].
      Roughly 15% of the time a small extra upward swipe follows.

      :return: None
      """
      width = self.d.info['displayWidth']
      height = self.d.info['displayHeight']
      centre_x = width // 2
      swipe_seconds = random.uniform(0, 0.3)
      self.d.swipe(centre_x, height - 100, centre_x, 100, duration=swipe_seconds)

      # The list sometimes sticks, so occasionally nudge it a little further up.
      if random.uniform(0, 1) > 0.85:
          self.d.swipe_ext("up", 0.1)
      time.sleep(self.get_sleep_time())
  def swipe_back(self, no):
      """Press the device back key ``no`` times, pausing after each press.

      :param no: number of back presses
      :return: None
      """
      for _ in range(no):
          self.d.press('back')
          pause_seconds = self.get_sleep_time()
          time.sleep(pause_seconds)
  def drug_price(self):
      """Read the price from the first element whose text starts with ``¥``.

      :return: the first run of digits/dots in that text as a float, or None
          when the element or the number cannot be read
      """
      try:
          price_text = self.d.xpath('//*[starts-with(@text,"¥")]').text
          number = re.search(r'[\d\.]+', price_text)
          price = float(number.group())
          print(f'获取到价格:{price}')
          return price
      except Exception as e:
          print(f'提取价格出错-->{e}')
          return None
  def drug_sale_num(self):
      """Read the sold-count text from the element whose text starts with ``已售``.

      :return: the text with ``已售`` removed and whitespace trimmed, or None
          when the element is absent or the lookup raises
      """
      sold_xpath = '//*[starts-with(@text,"已售")]'
      try:
          if not self.d.xpath(sold_xpath).exists:
              return None
          sold_text = self.d.xpath(sold_xpath).text
          sold_text = sold_text.replace("已售", "").strip()
          print(f'获取到已售数量:{sold_text}')
          return sold_text
      except Exception as e:
          print(f'提取已售数量出错-->{e}')
          return None
  def restart_uiautomator_services(self, device_id):
      """Stop and then start the atx-agent server on a device through adb.

      Each command runs through the shell with its output captured and
      discarded, followed by a short pause.

      :param device_id: adb serial of the target device
      :return: None
      """
      stop_command = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d --stop'
      start_command = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'
      for command in (stop_command, start_command):
          subprocess.run(command, capture_output=True, text=True, shell=True)
          time.sleep(self.get_sleep_time())
  def connect_devices(self, device_id):
      """Connect to a USB device, restart its atx-agent and set the OSS upload config.

      Sets ``self.d`` to the connected device and ``self.oss_config`` to the
      Aliyun OSS settings.

      :param device_id: adb serial of the device to connect to
      :raises Exception: wraps any error raised while connecting or restarting
          the agent; the original error is attached as ``__cause__``
      """
      try:
          self.d = u2.connect_usb(device_id)
          self.restart_uiautomator_services(device_id)
          # NOTE(review): these credentials are committed in source.  They can now
          # be overridden through the environment; the literals remain only as a
          # backward-compatible fallback and should be rotated and removed.
          self.oss_config = {
              "access_key_id": os.environ.get('OSS_ACCESS_KEY_ID', 'LTAI5tDwjfteBvivYN41r8sJ'),
              "access_key_secret": os.environ.get('OSS_ACCESS_KEY_SECRET', 'yowuOGi2nYYnrqGpO3qcz94C4brcPp'),
              "endpoint": "oss-cn-shenzhen.aliyuncs.com",  # e.g. oss-cn-beijing.aliyuncs.com
              "bucket_name": "zhijiayun-jiansuo",
              "oss_prefix": "scrape_data/"  # key prefix (virtual folder) for uploaded screenshots
          }
          print(f'连接到设备:{device_id}')
          self.loggerMT.info(f'连接到设备:{device_id}')
      except Exception as e:
          print(f'{device_id} 连接错误: {e}')
          self.loggerMT.info(f'{device_id} 连接错误: {e}')
          # Keep raising a plain Exception for existing callers, but chain the
          # original error so its type and traceback stay visible.
          raise Exception(e) from e
  def get_ocr_res(self, img):
      """Send an image to the Baidu business-license OCR endpoint.

      :param img: path of the image file to recognise
      :return: dict mapping each field name in ``words_result`` to its ``words``
          text, or None when the request fails, the response is falsy, or any
          step raises
      """
      try:
          print(f'开始识别图片:{img}')
          request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
          # Read through a context manager so the file handle is always closed.
          with open(img, 'rb') as f:
              encoded_image = base64.b64encode(f.read())
          params = {"image": encoded_image}
          request_url = request_url + "?access_token=" + self.access_token
          headers = {'content-type': 'application/x-www-form-urlencoded'}
          response = requests.post(request_url, data=params, headers=headers)
          if not response:
              return None
          res = response.json()
          new_dic = {field: value['words'] for field, value in res['words_result'].items()}
          print('资质数据信息', new_dic)
          return new_dic
      except Exception as e:
          # Still best-effort, but report why recognition failed instead of
          # swallowing the error (and no longer intercept KeyboardInterrupt).
          print(f'营业执照识别出错-->{e}')
          return None
  def remove_watermark(self, img_path):
      """Brighten an image so the watermark washes out, then re-encode it.

      Every pixel value is scaled linearly and clipped to 0..255 before the
      image is encoded again in the file's own format.

      :param img_path: path of the image file
      :return: the encoded image buffer produced by ``cv2.imencode``
      """
      raw_bytes = np.fromfile(img_path, dtype=np.uint8)
      pixels = cv2.imdecode(raw_bytes, -1)
      extension = os.path.splitext(img_path)[1]
      stretched = 1.4057577998008846 * pixels - 38.33089999653017
      cleaned = np.clip(stretched, 0, 255).astype(np.uint8)
      _, img_binary = cv2.imencode(extension, cleaned)
      return img_binary
  def get_ocr_res_image(self, img):
      """Run general OCR on an image after the watermark clean-up step.

      :param img: path of the image file
      :return: the ``words_result`` value of the OCR response (an empty string
          when the key is missing), or None when any step raises
      """
      try:
          image = self.remove_watermark(img)
          res_image = self.client.basicGeneral(image)
          data = res_image.get('words_result', '')
          print(f'百度api返回结果:{data}')
          return data
      except Exception:
          # Catch Exception rather than everything, so KeyboardInterrupt and
          # SystemExit are not swallowed.
          return None
  def screenshot_the_business_license(self, qualification_number):
      """Screenshot the device, crop a fixed region and save the crop.

      The full screenshot is written to ``screenshot1.png`` and the crop to
      ``screenshot/<qualification_number>.png``, or to ``cropped_screenshot.png``
      when no qualification number is given.

      :param qualification_number: used as the crop's file name; may be falsy
      :return: path of the cropped image (a ``Path`` when a qualification number
          is given, otherwise a plain string)
      """
      screenshot_path = 'screenshot1.png'
      self.d.screenshot(screenshot_path)
      img = cv2.imread(screenshot_path)
      # Fixed crop rectangle in pixels: (left, top, right, bottom).
      left, top, right, bottom = 0, 480, 720, 1420
      cropped_img = img[top:bottom, left:right]
      # Output directory, relative to the current working directory.
      SCREENSHOT_DIR = Path('screenshot')
      SCREENSHOT_DIR.mkdir(parents=True, exist_ok=True)
      if qualification_number:
          cropped_screenshot_path = SCREENSHOT_DIR / f'{qualification_number}.png'
      else:
          cropped_screenshot_path = 'cropped_screenshot.png'
      # cv2.imwrite expects a string filename; OpenCV builds that do not accept
      # path-like objects raise TypeError when handed a pathlib.Path.
      cv2.imwrite(str(cropped_screenshot_path), cropped_img)
      return cropped_screenshot_path
  def screenshot_instruction(self):
      """Take a device screenshot under a unique file name.

      The name is ``instructionscreenshot1-<HH-MM-SS>-<8 hex chars>.png``.

      :return: the screenshot file name
      """
      time_str = datetime.datetime.now().strftime("%H-%M-%S")
      random_str = secrets.token_hex(4)  # 4 random bytes -> 8 hex characters
      print(time_str)
      screenshot_path = f'instructionscreenshot1-{time_str}-{random_str}.png'
      self.d.screenshot(screenshot_path)
      return screenshot_path
  def extract_specification(self, text):
      """Return the part of ``text`` before the first ``【``, trimmed.

      When ``text`` is empty or starts with ``【`` there is nothing in front of
      the bracket, and ``text`` is returned unchanged.
      """
      leading_part = text.partition('【')[0]
      if not leading_part:
          return text
      return leading_part.strip()
  890. # 获取商品title
  def get_title(self):
      """Read the product title from the screen and split off its specification.

      The screen is polled (up to 5999 times, 3 s apart) for an element whose
      text contains ``self.title_key``, which leaves time for a human
      verification to be cleared.  A leading ``0`` is dropped from the title.
      The specification is the third regex group of the title; titles that do
      not match fall back to a table of known product titles.

      :return: ``(drugs_name, specifications)``; ``drugs_name`` is the full
          title and ``specifications`` is an empty string when nothing matched
      """
      # Known titles that carry no parsable specification.
      known_specs = {
          '999抗病毒口服液10ml*12': '10ml*12支/盒',
          '999抗病毒口服液': '10ml*12支/盒',
          '999抗病毒口服液10ml*10': '10ml*10支/盒',
          '999小柴胡颗粒': '10g*9袋/盒',
          '999养胃舒颗粒': '10g*10袋/盒',
          '三九胃泰胶囊': '0.5g*24粒/盒',
          '999补脾益肠丸': '6g*9袋/盒',
          '999复方感冒灵颗粒': '12.5g*15袋/盒',
      }

      def _inner():
          print(f'获取商品title时的搜索关键字:{self.title_key}')
          title = ''
          # Keep polling so there is time to get past a human verification.
          for m in range(1, 6000):
              if not self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').exists:
                  time.sleep(3)
                  continue
              title = self.safe_exec(
                  lambda: self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').text
              )
              print(f"第{m}次获取title成功")
              break

          if title.startswith('0'):
              title = title[1:]
          print(f'获取到药品标题:{title}')

          if self.search_key == '999赐多康大豆':
              return title, '1罐'

          if self.search_key == "999感冒清热颗粒":
              match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
          else:
              match = re.match(r'(\[[^\]]+\])(.*?)\s*((?:\d+\S*|\(.+))$', title)

          if not match:
              if title in known_specs:
                  return title, known_specs[title]
              print("没有匹配到预期格式")
              return title, ''

          drugs_name = title
          specifications = match.group(3).strip()
          print("药品名:", drugs_name)
          print("规格:", specifications)
          # A specification that mentions an expiry note needs one more clean-up pass.
          if '到期' in specifications:
              specifications = self.extract_specification(specifications)
          return drugs_name, specifications

      # Run through safe_exec so the whole lookup waits while a captcha is being handled.
      return self.safe_exec(_inner)
  def enter_shop(self):
      """
      Tap the element labelled ``店铺`` to open the shop page, then pause.
      :return: None
      """
      shop_entry = self.d.xpath('//*[@text="店铺"]')
      shop_entry.click()
      pause_seconds = self.get_sleep_time()
      time.sleep(pause_seconds)
  def enter_shoper(self):
      """
      Wait for the element labelled ``商家`` and tap it.

      The element is looked for up to 10 times with a pause after each miss.
      :return: True when it was found and tapped, False when it never appeared
      """
      merchant_xpath = '//*[@text="商家"]'
      for attempt in range(10):
          if self.d.xpath(merchant_xpath).exists:
              print(f'第{attempt}次商家存在')
              break
          print(f'第{attempt}次商家不存在')
          time.sleep(self.get_sleep_time())
      else:
          # All attempts missed.
          return False

      self.d.xpath(merchant_xpath).click()
      time.sleep(self.get_sleep_time())
      return True
  1004. # 点击查看商家资质
  def scan_shoper_license(self):
      """Tap ``查看商家资质`` once it appears; otherwise go back one page.

      The element is looked for up to 10 times with a pause after each miss.

      :return: None
      """
      license_xpath = '//*[@text="查看商家资质"]'
      for attempt in range(10):
          if self.d.xpath(license_xpath).exists:
              print(f'第{attempt}次查看商家资质存在')
              break
          print(f'第{attempt}次查看商家资质不存在')
          time.sleep(self.get_sleep_time())
      else:
          # Never appeared: leave the current page instead.
          self.swipe_back(1)
          return

      self.d.xpath(license_xpath).click()
      time.sleep(self.get_sleep_time())
  1020. # 验证商品的信息是否在数据库中已存在
  def data_is_exists(self, data):
      """
      Check whether a matching product row already exists in ``self.table_name``.

      A row matches when its product, min_price, shop, scrape_date and platform
      all equal the values in ``data``.

      :param data: dict holding at least the five keys listed in ``required_keys``
      :return: True when a row exists, False when none does, None when a
          required key is missing or the query fails
      """
      required_keys = ['product', 'min_price', 'shop', 'scrape_date', 'platform']
      missing = [key for key in required_keys if key not in data]
      if missing:
          logging.error(f"缺少必要字段: {', '.join(missing)}")
          return None

      cur = None
      try:
          conn = get_mysql()
          cur = conn.cursor()
          query_sql = """
              SELECT * FROM {}
              WHERE product = %s
              AND min_price = %s
              AND shop = %s
              AND scrape_date = %s
              AND platform = %s
          """.format(self.table_name)
          # required_keys is in the same order as the placeholders above.
          cur.execute(query_sql, tuple(data[key] for key in required_keys))
          return bool(cur.fetchone())
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # Release the cursor; it used to be left open on every call.
          if cur is not None:
              try:
                  cur.close()
              except Exception as close_error:
                  print(f"MySQL 错误: {str(close_error)}")
  1061. # 验证店铺信息是否在数据库中已存在
  def shop_is_exists_database(self, shop):
      """Check whether a shop with this name already exists in ``self.shop_table_name``.

      :param shop: shop name to look up
      :return: True when a row exists, False when none does, None when the
          query fails
      """
      cur = None
      try:
          conn = get_mysql()
          cur = conn.cursor()
          query_sql = """
              SELECT * FROM {}
              WHERE shop = %s
          """.format(self.shop_table_name)
          # Parameters must be a sequence: ``(shop)`` is just the string itself,
          # ``(shop,)`` is the one-element tuple the DB-API expects.
          cur.execute(query_sql, (shop,))
          return bool(cur.fetchone())
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # Release the cursor; it used to be left open on every call.
          if cur is not None:
              try:
                  cur.close()
              except Exception as close_error:
                  print(f"MySQL 错误: {str(close_error)}")
  def wait_if_verifying(self, monitor, timeout=120):
      """Block while ``monitor.pausing`` is set, for at most ``timeout`` seconds.

      :param monitor: object exposing a ``pausing`` event
      :param timeout: maximum number of seconds to wait
      """
      started_at = time.time()
      while True:
          if not monitor.pausing.is_set():
              break
          if time.time() - started_at >= timeout:
              break
          time.sleep(1)
  def wait_for_ready(self, monitor, timeout=86400):
      """Wait out any running verification, then scan once more for a popup.

      :param monitor: object exposing a ``pausing`` event and
          ``check_and_handle_popup``
      :param timeout: maximum number of seconds to wait while paused
      """
      started_at = time.time()
      while True:
          if not monitor.pausing.is_set():
              break
          if time.time() - started_at >= timeout:
              break
          time.sleep(1)
      # Extra safety: a captcha may pop up right at this moment, so scan once more.
      monitor.check_and_handle_popup()
  def safe_list(self, xpath, monitor):
      """Return every element matching ``xpath`` after waiting for the monitor.

      :param xpath: selector to query on the device
      :param monitor: passed to ``wait_for_ready`` before the query
      :return: result of ``.all()`` on the selector
      """
      self.wait_for_ready(monitor)
      selector = self.d.xpath(xpath)
      return selector.all()
  def safe_exec(self, func, *args, **kwargs):
      """
      Call ``func`` once ``self.monitor.pausing`` is no longer set.

      While the event is set this polls once per second, with no time limit.
      :return: whatever ``func(*args, **kwargs)`` returns
      """
      pausing = self.monitor.pausing
      while True:
          if not pausing.is_set():
              break
          time.sleep(1)
      # Run the real work.
      outcome = func(*args, **kwargs)
      return outcome
  def get_next_data(self, data, target):
      """Return the ``words`` of the item that follows the first usable match.

      :param data: sequence of dicts that each carry a ``words`` entry
      :param target: ``words`` value to look for
      :return: ``words`` of the item right after the first match that has a
          successor, or None when there is no such match
      """
      for position, entry in enumerate(data):
          if entry['words'] != target:
              continue
          following = position + 1
          if following >= len(data):
              # The match is the last item, so nothing follows it.
              continue
          return data[following]['words']
      return None
  1110. # ccbiao
  def delete_instruction_screenshot(self, screenshot_path):
      """Delete a screenshot file, printing the outcome instead of raising.

      :param screenshot_path: path of the file to remove
      :return: None
      """
      target = screenshot_path
      try:
          os.remove(target)
          print(f"截图文件已删除:{target}")
      except FileNotFoundError:
          print(f"文件未找到,无法删除:{target}")
      except Exception as e:
          print(f"删除文件时出错:{e}")
  def get_instructions_data(self):
      """
      Open the product's instruction sheet, screenshot it and OCR three fields.

      Called once an instruction tab is known to exist.  The sheet is opened
      through ``查看详细说明`` or, failing that, ``查看全部``; if neither is found an
      all-empty result is returned.  Otherwise the page is scrolled until the
      manufacturer and approval-number labels are visible, a screenshot is
      taken, OCR'd and then deleted.

      :return: dict with the keys "有效期", "生产单位" and "批准文号"; each value is
          the OCR line that follows the label (None when not found) or an empty
          string when OCR returned nothing
      """
      self.d.xpath('//*[@text="说明"]').click()
      # time.sleep(random.randint(3, 5))
      time.sleep(0.5)
      if self.d.xpath('//*[@text="查看详细说明"]').exists:
          self.d.xpath('//*[@text="查看详细说明"]').click()
      else:
          # No detail link: swipe (up to 8 times) until "查看全部" shows up.
          for i in range(8):
              if self.d.xpath('//*[@text="查看全部"]').exists:
                  print('开始点击查看全部')
                  break
              self.d.swipe_ext('down', 0.3)
              time.sleep(1)
              if self.d.xpath('//*[@text="查看全部"]').exists:
                  print('开始点击查看全部2')
                  break
          if self.d.xpath('//*[@text="查看全部"]').exists:
              self.d.xpath('//*[@text="查看全部"]').click()
          else:
              # Neither entry point exists: report an empty instruction sheet.
              res_data = {
                  "有效期": '',
                  "生产单位": '',
                  "批准文号": ''
              }
              self.loggerMT.info('获取到的说明书信息为空。')
              return res_data
      time.sleep(0.5)
      # Expand the sheet: scroll (up to 8 times) until "加载更多" can be tapped.
      for ii in range(8):
          if self.d.xpath('//*[@text="加载更多"]').exists:
              self.d.xpath('//*[@text="加载更多"]').click()
              time.sleep(0.2)
              break
          else:
              self.d.swipe(200, 1000, 200, 300, 0.3)
              # self.d.swipe_ext("up", scale=0.3)
      # Scroll (up to 10 times) until both labels needed by the OCR step are on screen.
      for iii in range(10):
          if self.d.xpath('//*[@text="生产单位"]').exists and self.d.xpath('//*[@text="批准文号"]').exists:
              break
          else:
              self.d.swipe(200, 1300, 200, 300, 0.3)
              # self.d.swipe_ext("up", scale=0.3)
      instruction_path = self.screenshot_instruction()
      print(f"instruction_path= {instruction_path}")
      time.sleep(2)
      ocr_res = self.get_ocr_res_image(instruction_path)
      # print(f'ocr_res:{ocr_res}')
      if ocr_res:
          # Each value is the OCR line right after its label.
          # Line following the validity-period label
          validity = self.get_next_data(ocr_res, '有效期')
          # Line following the approval-number label
          approval_number = self.get_next_data(ocr_res, '批准文号')
          # Line following the manufacturer label
          manufacturer = self.get_next_data(ocr_res, '生产单位')
      else:
          validity = ''
          approval_number = ''
          manufacturer = ''
      res_data = {
          "有效期": validity,
          "生产单位": manufacturer,
          "批准文号": approval_number
      }
      print(f"res_data={res_data}")
      time.sleep(1)
      # The screenshot is only needed for OCR; remove it afterwards.
      self.delete_instruction_screenshot(instruction_path)
      return res_data
  def has_instructions(self):
      """
      Report whether an element labelled ``说明`` can be found on the page.

      After an initial pause the page is checked up to 8 times, swiping and
      waiting one second between the two checks of each round.
      :return: True as soon as the element is seen, otherwise False
      """
      # Products without an instruction sheet cannot be scraped in detail.
      time.sleep(self.get_sleep_time())
      tab_xpath = '//*[@text="说明"]'
      for attempt in range(8):
          if self.d.xpath(tab_xpath).exists:
              print(f"第{attempt}次有说明书1")
              return True
          self.d.swipe_ext('down', 0.3)
          time.sleep(1)
          if self.d.xpath(tab_xpath).exists:
              print(f"第{attempt}次有说明书2")
              return True
      return False
  def has_shop(self):
      """
      Report whether an element labelled ``进店`` exists, after a short pause.
      :return: the ``exists`` value of that element
      """
      pause_seconds = self.get_sleep_time()
      time.sleep(pause_seconds)
      enter_button = self.d.xpath('//*[@text="进店"]')
      return enter_button.exists
  1219. # 获取商品对应的店铺信息
  def get_license_info_ex(self):
      """Collect address and business-license details for the current product's shop.

      Opens the shop and its merchant tab, reads the contact address, opens the
      qualification view and reads the qualification number.  When a number was
      read, the license image is tapped, screenshotted and OCR'd for the company
      name and address.  Every step that touches the UI goes through
      ``safe_exec``.

      :return: dict with the keys ``contact_address``, ``qualification_number``,
          ``business_license_company`` and ``business_license_address``;
          missing values are empty strings
      """
      self.safe_exec(self.enter_shop)
      entered = self.safe_exec(self.enter_shoper)
      if entered == False:
          return {'contact_address': '', 'qualification_number': '', 'business_license_company': '',
                  'business_license_address': ''}

      # Give the qualification entry up to 10 chances to appear.
      for attempt in range(10):
          if self.d.xpath('//*[@text="查看商家资质"]').exists:
              print(f"第{attempt}次有商家资质")
              break
          print(f"第{attempt}次没有商家资质")
          time.sleep(self.get_sleep_time())

      contact_address = self.safe_exec(self.get_shop_address)
      self.safe_exec(self.scan_shoper_license)
      qualification_number = self.safe_exec(self.get_qualification_number)

      if not qualification_number:
          return {'contact_address': contact_address, 'qualification_number': '',
                  'business_license_company': '', 'business_license_address': ''}

      # Tap the license image (relative screen coordinates) before capturing it.
      self.d.click(0.603, 0.27)
      time.sleep(self.get_sleep_time())
      cropped_screenshot_path = self.screenshot_the_business_license(qualification_number)
      print(f'cropped_screenshot_path:{cropped_screenshot_path}')
      ocr_res = self.get_ocr_res(cropped_screenshot_path)
      print(f'ocr_res:{ocr_res}')

      company_name = ''
      company_address = ''
      if ocr_res:
          if '单位名称' in ocr_res.keys():
              company_name = ocr_res['单位名称']
          if '地址' in ocr_res.keys():
              company_address = ocr_res['地址']
      return {'contact_address': contact_address, 'qualification_number': qualification_number,
              'business_license_company': company_name,
              'business_license_address': company_address}
  def distinct_target(self):
      """Check whether the screen is back on the search result list.

      Six layout-specific selectors are probed; the list page counts as
      detected when at least one of them exists.

      :return: True when any selector exists, otherwise False
      """
      list_page_probes = (
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]',
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]',
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
          '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
      )
      # Query every selector (no short-circuit), then combine the answers.
      found_flags = [self.d.xpath(probe).exists for probe in list_page_probes]
      result = any(found_flags)
      if result:
          print("---检测回到了列表页---")
      else:
          print("---检测没有回到列表页---")
      return result
  1300. def enter_target_page(self):
  1301. self.d.xpath('//*[@content-desc="看病买药"]').click()
  1302. time.sleep(self.get_sleep_time())
  1303. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/vf_search_carousel_text"]').click()
  1304. time.sleep(self.get_sleep_time())
  1305. self.d.xpath(
  1306. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]').click()
  1307. time.sleep(self.get_sleep_time())
  1308. self.d.send_keys(self.search_key, clear=True)
  1309. time.sleep(self.get_sleep_time())
  1310. self.d.xpath('//*[@text="搜索"]').click()
  1311. time.sleep(self.get_sleep_time())
  1312. self.click_express_send()
  1313. time.sleep(self.get_sleep_time())
  1314. def click_express_send(self):
  1315. # xpath= '//*[@resource-id="com.sankuai.meituan:id/container"]//android.widget.HorizontalScrollView[last()]'
  1316. slide_xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]'
  1317. slide_xpath2 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]'
  1318. slide_xpath3 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]'
  1319. slide_xpath4 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]'
  1320. for i in range(1, 3):
  1321. if self.d.xpath(slide_xpath).exists:
  1322. bounds = self.d.xpath(slide_xpath).info['bounds']
  1323. top = bounds['top']
  1324. bottom = bounds['bottom']
  1325. print(f'top={top}')
  1326. print(f'bottom={bottom}')
  1327. y = (top + bottom) // 2
  1328. print(f'y={y}')
  1329. self.loggerMT.info('开始滑动1')
  1330. self.d.swipe(500, y, 100, y, 0.5)
  1331. time.sleep(self.get_sleep_time())
  1332. break
  1333. elif self.d.xpath(slide_xpath2).exists:
  1334. bounds = self.d.xpath(slide_xpath2).info['bounds']
  1335. top = bounds['top']
  1336. bottom = bounds['bottom']
  1337. print(f'top={top}')
  1338. print(f'bottom={bottom}')
  1339. y = (top + bottom) // 2
  1340. print(f'y={y}')
  1341. self.loggerMT.info('开始滑动2')
  1342. self.d.swipe(500, y, 100, y, 0.5)
  1343. time.sleep(self.get_sleep_time())
  1344. break
  1345. elif self.d.xpath(slide_xpath3).exists:
  1346. bounds = self.d.xpath(slide_xpath3).info['bounds']
  1347. top = bounds['top']
  1348. bottom = bounds['bottom']
  1349. print(f'top={top}')
  1350. print(f'bottom={bottom}')
  1351. y = (top + bottom) // 2
  1352. print(f'y={y}')
  1353. self.loggerMT.info('开始滑动3')
  1354. self.d.swipe(500, y, 100, y, 0.5)
  1355. time.sleep(self.get_sleep_time())
  1356. break
  1357. elif self.d.xpath(slide_xpath4).exists:
  1358. bounds = self.d.xpath(slide_xpath4).info['bounds']
  1359. top = bounds['top']
  1360. bottom = bounds['bottom']
  1361. print(f'top={top}')
  1362. print(f'bottom={bottom}')
  1363. y = (top + bottom) // 2
  1364. print(f'y={y}')
  1365. self.loggerMT.info('开始滑动4')
  1366. self.d.swipe(500, y, 100, y, 0.5)
  1367. time.sleep(self.get_sleep_time())
  1368. break
  1369. max_retry = 5 # 最多尝试次数
  1370. for idx in range(1, max_retry + 1):
  1371. # xpath= '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
  1372. xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1373. xpath2 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1374. xpath3 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1375. xpath4 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1376. xpath5 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/androidx.recyclerview.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]'
  1377. # print(f"xpath:{xpath}")
  1378. # scroll_view = self.d(resourceId="com.sankuai.meituan:id/container") .child(className="android.widget.HorizontalScrollView")
  1379. if self.d.xpath(xpath).exists:
  1380. self.d.xpath(xpath).click()
  1381. # time.sleep(self.get_sleep_time())
  1382. print(f"第{idx}次点击xpath快递送成功")
  1383. time.sleep(self.get_sleep_time())
  1384. break
  1385. elif self.d.xpath(xpath2).exists:
  1386. self.d.xpath(xpath2).click()
  1387. print(f"第{idx}次点击xpath2快递送成功")
  1388. time.sleep(self.get_sleep_time())
  1389. break
  1390. elif self.d.xpath(xpath3).exists:
  1391. self.d.xpath(xpath3).click()
  1392. print(f"第{idx}次点击xpath3快递送成功")
  1393. time.sleep(self.get_sleep_time())
  1394. break
  1395. elif self.d.xpath(xpath4).exists:
  1396. self.d.xpath(xpath4).click()
  1397. print(f"第{idx}次点击xpath4快递送成功")
  1398. time.sleep(self.get_sleep_time())
  1399. break
  1400. elif self.d.xpath(xpath5).exists:
  1401. self.d.xpath(xpath5).click()
  1402. print(f"第{idx}次点击xpath4快递送成功")
  1403. time.sleep(self.get_sleep_time())
  1404. break
  1405. else:
  1406. print(f"第{idx}次点击xpath或xpath2或xpath3快递送都失败")
  1407. time.sleep(self.get_sleep_time())
  1408. # xpath2= '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1409. # if self.d.xpath(xpath2).exists:
  1410. # self.d.xpath(xpath2).click()
  1411. # print(f"第{idx}次点击xpath2快递送成功")
  1412. # time.sleep(self.get_sleep_time())
  1413. # break
  1414. def get_clipboard(self):
  1415. time.sleep(1)
  1416. self.loggerMT.info(f"Clipboard content:{self.d.clipboard}") # 打印调试信息
  1417. clipboard_content = self.d.clipboard
  1418. if clipboard_content is None:
  1419. return ''
  1420. return clipboard_content.strip()
  1421. # return self.d.clipboard.strip()
  1422. def clear_clipboard(self):
  1423. self.d.set_clipboard("", "text/plain")
  1424. def get_product_link(self):
  1425. product_link = ''
  1426. dots_xpaths = [
  1427. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]',
  1428. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[3]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]',
  1429. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]'
  1430. ]
  1431. max_retry = 5 # 最多尝试次数
  1432. for idx in range(1, max_retry + 1):
  1433. if product_link: # 已经拿到则退出
  1434. break
  1435. for xp in dots_xpaths:
  1436. if self.d.xpath(xp).exists:
  1437. print(f'{idx}-进入分享点点点')
  1438. self.loggerMT.info(f'{idx}-进入分享点点点')
  1439. # #先清空剪贴板的内容
  1440. # self.clear_clipboard()
  1441. # print("清空剪贴板内容成功。")
  1442. self.loggerMT.info(f'{idx}-click')
  1443. self.d.xpath(xp).click()
  1444. time.sleep(0.2)
  1445. self.loggerMT.info(f'{idx}-click_exists')
  1446. self.d.xpath('//*[@text="分享商品"]').click_exists()
  1447. time.sleep(0.2)
  1448. link_xpath = '//*[@text="复制链接"]'
  1449. if self.d.xpath(link_xpath).exists:
  1450. self.loggerMT.info(f'{idx}-link_xpath click')
  1451. self.d.xpath(link_xpath).click()
  1452. time.sleep(1)
  1453. product_link = self.get_clipboard()
  1454. time.sleep(0.5)
  1455. print(f'{idx}-商品链接:{product_link}')
  1456. self.loggerMT.info(f'{idx}-商品链接:{product_link}')
  1457. break # 找到并执行后跳出内层循环
  1458. else:
  1459. print(f'{idx}-商品链接:{product_link}')
  1460. self.loggerMT.info(f'{idx}-商品链接:{product_link}')
  1461. product_link = ''
  1462. # self.d.xpath('//*[@text="复制链接"]').click_exists()
  1463. # time.sleep(1)
  1464. # product_link = self.get_clipboard()
  1465. # time.sleep(0.5)
  1466. # print(f'{idx}-商品链接:{product_link}')
  1467. # self.loggerMT.info(f'{idx}-商品链接:{product_link}')
  1468. # break # 找到并执行后跳出内层循环
  1469. if not product_link and idx < max_retry:
  1470. time.sleep(0.5) # 最后一次不需要再等待
  1471. return product_link
    def integrate_data(self):
        """
        Collect the fields of the product detail page currently on screen and store one row.

        Reads title/specification, price and sales count, determines the shop name,
        optionally collects and stores the shop's licence data, copies the share link,
        reads the instruction-sheet fields and finally calls ``save_to_database``.
        Returns early (after ``swipe_back(1)``) when no title is found, when an identical
        row already exists, or when the shop name is empty or contains "自营".

        NOTE(review): the nesting below was reconstructed from a copy of this file whose
        indentation had been lost.  In particular the extent of the ``else`` branch
        (everything from the "进店" scroll loop down to ``get_product_link``) should be
        confirmed against the original.

        :return: None
        """
        # Title and specification; safe_exec wraps the call (its failure value is
        # not visible here — presumably something falsy for `product`).
        product, specifications = self.safe_exec(self.get_title)  # product name, specification
        if not product:
            self.swipe_back(1)
            return
        min_price = self.drug_price()  # lowest price
        sales_num = self.drug_sale_num()  # units sold
        snapshot_url = ''  # page snapshot URL (left empty)
        # Screenshot helper intended for uploading captures to OSS.
        # NOTE(review): mt_screenshot is not used again in this method.
        mt_screenshot = MTScreenshot(
            d=self.d,
            oss_config=self.oss_config,
            search_key=self.search_key,
            title_key=self.title_key,
        )
        product_link = ''
        if self.d.xpath('//*[@text="自营"]').exists:
            # Self-operated product: the shop name is fixed.
            shop = "美团自营大药房(快递电商)"
            # Scrape date
            scrape_date = self.get_current_date()
            dup_data = {'product': product, 'min_price': min_price, 'shop': shop, 'scrape_date': scrape_date,
                        'platform': '美团'}
            print(f'当前数据:{dup_data}')
            if self.data_is_exists(dup_data):
                print('存在相同数据不入库')
                self.swipe_back(1)
                return
        else:
            # Scroll up (at most 8 times) until the "进店" button is visible.
            for i in range(8):
                if self.d.xpath('//*[@text="进店"]').exists:
                    print('开始获取店铺名1')
                    break
                self.d.swipe_ext('up', 0.3)
                time.sleep(1)
                if self.d.xpath('//*[@text="进店"]').exists:
                    print('开始获取店铺名2')
                    break
            shop = self.get_shop_name()
            # Scrape date
            scrape_date = self.get_current_date()
            dup_data = {'product': product, 'min_price': min_price, 'shop': shop, 'scrape_date': scrape_date,
                        'platform': '美团'}
            print(f'当前数据:{dup_data}')
            # --- shop information: start ---
            is_has_enter_shop = self.has_shop()
            # Skip the shop page when this shop is already stored in the database.
            shop_is_exists = self.shop_is_exists_database(shop)
            # Enter the shop only if an entry exists, the name does not contain
            # "美团官方"/"美团自营", the shop is new, and fewer than 500 shops
            # have been collected so far.
            print(f"已采集{self.shop_data_num}家店铺数据")
            if is_has_enter_shop and '美团官方' not in shop and '美团自营' not in shop and not shop_is_exists and self.shop_data_num < 500:
                # NOTE(review): license_info is subscripted without a check; confirm
                # that safe_exec cannot hand back None here.
                license_info = self.safe_exec(self.get_license_info_ex)
                contact_address = license_info['contact_address']
                qualification_number = license_info['qualification_number']
                business_license_company = license_info['business_license_company']
                business_license_address = license_info['business_license_address']
                save_shop_data = {
                    'shop': shop,
                    'contact_address': contact_address,
                    'qualification_number': qualification_number,
                    'scrape_date': scrape_date,
                    'business_license_company': business_license_company,
                    'business_license_address': business_license_address,
                    'platform': '美团'
                }
                self.save_shop_info_to_database(save_shop_data)
                self.shop_data_num += 1  # one more shop collected
                self.swipe_back(2)
            else:
                print('不采集店铺信息')
            # --- shop information: end ---
            if self.data_is_exists(dup_data):
                print('存在相同数据不入库')
                self.swipe_back(1)
                return
            # Product share link
            product_link = self.get_product_link()
            print(f'获取到product_link: {product_link}')
        if not shop:
            print('未获取到店铺名:开始回退')
            self.swipe_back(1)
            return
        # `not shop` was already handled above, so this effectively only filters
        # shops whose name contains "自营" (which includes the fixed self-operated
        # name assigned in the first branch).
        if not shop or '自营' in shop:
            self.swipe_back(1)
            return
        time.sleep(self.get_sleep_time())
        # Manufacture date is left empty.
        manufacture_date = ''
        # Licence-derived fields are stored empty in the product row; the shop
        # licence data collected above goes to save_shop_info_to_database instead.
        # (An older commented-out variant derived company, credit code, city and
        # province from get_license_info() at this point.)
        business_license_company = ''
        credit_code = ''
        city = ''
        province = ''
        expiry_date = ''
        manufacturer = ''
        approval_number = ''
        # --- instruction sheet: start ---
        # Does the page offer an instruction sheet?
        is_has_instructions = self.safe_exec(self.has_instructions)
        # Fields taken from the instruction sheet
        if is_has_instructions:
            print('开始获取说明书信息')
            instructions_info = self.safe_exec(self.get_instructions_data)
            # A trailing/leading "。" is stripped from each value that is present;
            # values that are None keep the '' default set above.
            if instructions_info['有效期'] is not None:
                expiry_date = instructions_info['有效期'].strip('。')
            if instructions_info['生产单位'] is not None:
                manufacturer = instructions_info['生产单位'].strip('。')
            if instructions_info['批准文号'] is not None:
                approval_number = instructions_info['批准文号'].strip('。')
        else:
            # No instruction sheet: the three fields are stored as None rather than ''.
            expiry_date = None
            manufacturer = None
            approval_number = None
        # Province the scrape is attributed to
        scrape_province = '广东'  # hard-coded default for now
        # Stock availability (left empty)
        availability = ''
        save_data = {
            'product': product,
            'min_price': min_price,
            'manufacture_date': manufacture_date,
            'expiry_date': expiry_date,
            'shop': shop,
            'business_license_company': business_license_company,
            'province': province,
            'city': city,
            'manufacturer': manufacturer,
            'specification': specifications,
            'approval_number': approval_number,
            'product_link': product_link,
            'scrape_date': scrape_date,
            'scrape_province': scrape_province,
            'availability': availability,
            'credit_code': credit_code,
            'platform': '美团',
            'search_key': self.search_key,
            'sales': sales_num,
            'inventory': '',
            'snapshot_url': snapshot_url
        }
        self.save_to_database(save_data)
  1651. def back_to_list_page(self):
  1652. for i in range(5):
  1653. # 最外部有个定位按钮
  1654. if self.distinct_target():
  1655. return True
  1656. print(f'第{i}次尝试退回到列表页')
  1657. self.swipe_back(1)
  1658. time.sleep(self.get_sleep_time())
  1659. print('页面出错,没有退回到列表页')
  1660. return False
  1661. def get_one_drug(self, drug_idx, drug_one):
  1662. bounds = drug_one.info['bounds']
  1663. top = bounds['top']
  1664. bottom = bounds['bottom']
  1665. # height = bottom - top
  1666. print(f'当前商品bottom:{bottom}')
  1667. print(f'当前商品top:{top}')
  1668. # if 304 <= top and bottom <= 1475: # 默认高度241的才行
  1669. if 304 <= top and bottom <= 1475: # 默认高度241的才行 1559
  1670. # print('目标-->', drug_one.info)
  1671. # drug_one.click()
  1672. # 获取当前元素中的属性来判断是否要点击进入采集
  1673. print(f"这页的第几个商品:{drug_idx}")
  1674. product_title = ''
  1675. price = ''
  1676. shop_name = ''
  1677. # 商品名称的xpath
  1678. product_tittle_xpath = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1679. product_tittle_xpath2 = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1680. if self.d.xpath(product_tittle_xpath).exists:
  1681. product_title = self.d.xpath(product_tittle_xpath).text
  1682. product_title = product_title[1:] if product_title.startswith('0') else product_title
  1683. print(f"product_tittle_xpath列表当前商品名称:{product_title}")
  1684. if not self.is_link_useful(product_title):
  1685. print(f"is_link_useful 没通过1:{product_title}")
  1686. # TODO 认真确认无关数据量的条件,这里才可以设置退出
  1687. self.unrelated_data += 1
  1688. return
  1689. elif self.d.xpath(product_tittle_xpath2).exists:
  1690. product_title = self.d.xpath(product_tittle_xpath2).text
  1691. product_title = product_title[1:] if product_title.startswith('0') else product_title
  1692. print(f"product_tittle_xpath2列表当前商品名称:{product_title}")
  1693. if not self.is_link_useful(product_title):
  1694. print(f"is_link_useful 没通过2:{product_title}")
  1695. # TODO 认真确认无关数据量的条件,这里才可以设置退出
  1696. self.unrelated_data += 1
  1697. return
  1698. else:
  1699. print(f"列表当前商品名称不存在")
  1700. # TODO 认真确认无关数据量的条件,这里才可以设置退出
  1701. self.unrelated_data += 1
  1702. return
  1703. # 这里只统计连续无关链接数
  1704. self.unrelated_data = 0
  1705. # 价格
  1706. price_xpath = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1707. price_xpath3 = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1708. price_xpath1 = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1709. if self.d.xpath(price_xpath).exists:
  1710. price_str = self.d.xpath(price_xpath).text
  1711. print(f"price_xpath列表当前商品价格:{price_str}")
  1712. if price_str:
  1713. price = float(re.search(r'[\d\.]+', price_str).group())
  1714. elif self.d.xpath(price_xpath3).exists:
  1715. price_str = self.d.xpath(price_xpath3).text
  1716. print(f"price_xpath3列表当前商品价格:{price_str}")
  1717. if price_str:
  1718. price = float(re.search(r'[\d\.]+', price_str).group())
  1719. elif self.d.xpath(price_xpath1).exists:
  1720. price_str = self.d.xpath(price_xpath1).text
  1721. print(f"price_xpath1列表当前商品价格:{price_str}")
  1722. if price_str:
  1723. price = float(re.search(r'[\d\.]+', price_str).group())
  1724. else:
  1725. price_xpath2 = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView'
  1726. if self.d.xpath(price_xpath2).exists:
  1727. price_str = self.d.xpath(price_xpath2).text
  1728. print(f"price_xpath2列表当前商品价格:{price_str}")
  1729. if price_str:
  1730. price = float(re.search(r'[\d\.]+', price_str).group())
  1731. else:
  1732. print(f"列表当前商品价格不存在")
  1733. # price_str = self.d.xpath(f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]//*[starts-with(@text,"¥")]').text
  1734. print(f'列表获取到价格:{price}')
  1735. # 店铺名称的xpath
  1736. shop_name_xpath = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.FrameLayout[last()]/android.widget.TextView[1]'
  1737. shop_name_xpath2 = f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.FrameLayout[last()]/android.widget.TextView[1]'
  1738. if self.d.xpath(shop_name_xpath).exists:
  1739. shop_name = self.d.xpath(shop_name_xpath).text
  1740. print(f"shop_name_xpath列表当前商品店铺名称:{shop_name}")
  1741. elif self.d.xpath(shop_name_xpath2).exists:
  1742. shop_name = self.d.xpath(shop_name_xpath2).text
  1743. print(f"shop_name_xpath2列表当前商品店铺名称:{shop_name}")
  1744. else:
  1745. print(f"列表当前商品店铺名称不存在")
  1746. # 进度保存
  1747. self.wr_re("写",self.device_id,product_title,price,shop_name,self.sort)
  1748. # 如果商品的名称、价格和生产厂家都不存在则直接下一条数据。 跳过一些不是商品的数据。
  1749. if price == '':
  1750. print(f"列表当前价格不存在")
  1751. return
  1752. if shop_name == '':
  1753. print(f"列表当前商品店铺名称不存在")
  1754. return
  1755. scrape_date = self.get_current_date()
  1756. if product_title and price and shop_name:
  1757. dup_data = {'product': product_title, 'min_price': price, 'shop': shop_name,
  1758. 'scrape_date': scrape_date, 'platform': '美团'}
  1759. if self.data_is_exists(dup_data):
  1760. print('列表存在相同数据不入库')
  1761. return
  1762. self.safe_exec(drug_one.click)
  1763. print('点击目标药品完毕')
  1764. time.sleep(2)
  1765. try:
  1766. self.safe_exec(self.integrate_data)
  1767. print('integrate_data结束')
  1768. finally:
  1769. time.sleep(self.get_sleep_time())
  def get_cur_page(self, page_no):
      """Process every product on the visible list page, then scroll once.

      Each list item is handed to ``get_one_drug`` with up to three attempts.
      After every attempt, successful or not, ``back_to_list_page`` is called.
      When the end-of-list marker is on screen the method returns without
      scrolling; otherwise it drags the list and sleeps.

      Args:
          page_no: Zero-based page counter; only used for the progress print.

      Raises:
          RuntimeError: If ``back_to_list_page`` returns a falsy value.
              (The original code raised a bare string here, which Python 3
              rejects with ``TypeError`` and which hid the real message.)

      NOTE(review): the indentation of this code was lost before review.
      The end-of-list check is assumed to run once per page, after the
      product loop, and the counter reset is assumed to belong to the
      captcha branch -- confirm against the real file.
      """
      list_item_xpath = '//android.support.v7.widget.RecyclerView/android.widget.FrameLayout'
      max_attempts = 3

      print(f'第{page_no + 1}页')

      # Too many captcha prompts in a row: announce the pause, tap the
      # screen corner and reset the counter.
      if self.monitor.verification_count >= self.monitor.MAX_VERIFICATION_RETRY:
          print("频繁遇到验证码,暂停程序")
          # The original comment describes this as a no-op tap made while
          # waiting for the operator; nothing here actually blocks.
          self.d.click(0, 0)
          self.monitor.verification_count = 0

      # Poll once per second until at least one list item exists.
      # NOTE: there is no timeout, so this waits forever if none appears.
      while not self.d.xpath(list_item_xpath).exists:
          time.sleep(1)

      drug_lis = self.safe_exec(self.d.xpath(list_item_xpath).all)
      list_len = len(drug_lis)
      print(f'当前页面共有{list_len}个商品')

      # XPath positions are 1-based, hence start=1.
      for drug_idx, drug_one in enumerate(drug_lis, start=1):
          for _attempt in range(max_attempts):
              try:
                  self.get_one_drug(drug_idx, drug_one)
                  break
              except Exception as e:
                  print(f'get_one_drug {drug_idx} 异常 {e}')
              finally:
                  # Runs after every attempt, including a successful one
                  # that is about to ``break``.
                  res = self.back_to_list_page()
                  if not res:
                      print('back_to_list_page出错,退出采集')
                      raise RuntimeError("back_to_list_page出错,退出采集")

      # The end-of-list marker is visible: nothing more to scroll to.
      if self.d.xpath('//*[@text="已经到底啦"]').exists:
          print('已经到达列表页最底部')
          return

      print('开始滑动')
      # Drag from (300, 1400) to (300, 400); presumably this brings the
      # next batch of items into view.
      self.d.drag(300, 1400, 300, 400, 1)
      print('滑动结束')
      time.sleep(self.get_sleep_time())
  1809. # 主函数
  def main(self, device_id):
      """Run one complete collection pass on the given device.

      Connects to the device, starts a ``SpiderMonitor`` (described by the
      original comments as a global popup monitor thread), restarts the app,
      opens the target page and walks up to 300 list pages. The walk stops
      early once ``self.unrelated_data`` exceeds 15. The monitor is stopped
      and joined on every exit path.

      Args:
          device_id: Id passed to ``connect_devices`` and used as the key
              when reading saved progress through ``wr_re``.

      NOTE(review): the indentation of this code was lost before review.
      The sort step is assumed to sit beside, not inside, the
      progress-restore branch -- confirm against the real file.
      """
      max_pages = 300
      unrelated_limit = 15

      self.device_id = device_id
      self.connect_devices(device_id)
      time.sleep(self.get_sleep_time())

      # Start the global popup monitor before touching the app.
      self.monitor = SpiderMonitor(self)
      self.monitor.start()
      try:
          self.restart_app()
          # Search for the keyword and open the result list.
          self.safe_exec(self.enter_target_page)

          # Resume after an interruption: restore the saved sort order.
          has_progress = self.wr_re("读", device_id)
          if has_progress:
              self.sort = self.wr_re("读", device_id)['sort']

          needs_sort = self.sort and self.sort_key == 0
          if needs_sort:
              self.li_or_lo(self.sort)

          page_no = 0
          while page_no < max_pages:
              self.get_cur_page(page_no)
              # TODO: confirm the "unrelated data" threshold carefully;
              # exiting here is what lets the next task start.
              print('目前连续无关数据量: ', self.unrelated_data)
              if self.unrelated_data > unrelated_limit:
                  print("连续超过15个不达标的数据则停止采集")
                  break
              page_no += 1
      finally:
          # Always shut the monitor down, even when collection failed.
          self.monitor.stop()
          self.monitor.join()
  # Collection tasks keyed by device id. Each task carries the values that
  # main() passes to MT(...): search_key, title_key, spec_list, brand, sort.
  device_list = {
      "21885f5": [
          {
              "search_key": "天士力养血清脑颗粒4g*9",
              "title_key": "养血清脑颗粒",
              "spec_list": ["4g*9"],
              "brand": "天士力",
              "sort": "升序",
          },
      ],
      "97ae80e0": [
          {
              "search_key": "天士力复方丹参滴丸27mg*150",
              "title_key": "复方丹参滴丸",
              "spec_list": ["27mg*150"],
              "brand": "天士力",
              "sort": "升序",
          },
      ],
  }
  def main(device_id='97ae80e0'):
      """Run every task configured for one device, retrying until each succeeds.

      For each task in ``device_list[device_id]`` a fresh ``MT`` instance is
      built and its ``main`` is run. A round that raises is logged and the
      same task is tried again in the next round.

      Args:
          device_id: Key into ``device_list`` and the id handed to
              ``MT.main``. Defaults to the previously hard-coded value.

      Raises:
          KeyError: If ``device_id`` has no entry in ``device_list``.

      NOTE: there is no cap on the number of rounds, so a task that always
      fails is retried forever.
      """
      tasks = device_list[device_id]
      for task in tasks:
          cycle_no = 0  # round counter for the current task
          while True:
              cycle_no += 1
              logging.info(f'========== {task["search_key"]} 第 {cycle_no} 轮采集开始 ==========')
              # Reset every round so the cleanup below never sees an unbound
              # name (constructor failed in round one) or the previous
              # round's instance (constructor failed in a later round).
              mt = None
              try:
                  # Build a fresh instance for the current keyword.
                  mt = MT(task["search_key"], task["title_key"], task["spec_list"], task["brand"], task['sort'])
                  mt.main(device_id)  # one complete collection pass
                  logging.info(f'关键字 {task["search_key"]} 本轮采集完成')
                  break
              except Exception as e:
                  # Log with traceback and fall through to the next round.
                  logging.exception(f'关键字 {task["search_key"]} 采集异常:{e}')
              finally:
                  # Release this round's instance if it exposes close().
                  if mt is not None and hasattr(mt, 'close'):
                      mt.close()
  # Script entry point: start collection only when this file is run directly.
  if __name__ == '__main__':
      main()