aaa_mt1.py 104 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396
  1. import sys
  2. import sys
  3. import logging
  4. import threading
  5. from concurrent.futures import ThreadPoolExecutor
  6. import pymysql
  7. import requests
  8. import base64
  9. import cv2
  10. import uiautomator2 as u2
  11. import time
  12. import subprocess
  13. import re
  14. import random
  15. import datetime
  16. import json
  17. from aip import AipOcr
  18. import threading
  19. from collections import deque
  20. import numpy as np
  21. import secrets
  22. import os
  23. import oss2
  24. import urllib.parse
  25. from config import Config
  26. from logger import setup_logger
  27. import logging
  28. from PIL import Image
  29. from pathlib import Path
  30. from PIL import Image, ImageDraw, ImageFont
  31. import http.client
  32. _DEFAULT_PATH = Path(__file__).with_name("city.json")
  33. import city_name_to_id
  34. setup_logger("mt_spider") # 初始化日志
  35. def get_access_token():
  36. AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
  37. AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  38. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  39. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  40. payload = ""
  41. headers = {
  42. 'Content-Type': 'application/json',
  43. 'Accept': 'application/json'
  44. }
  45. response = requests.request("POST", url, headers=headers, data=payload)
  46. try:
  47. return response.json()['access_token']
  48. except:
  49. return None
  50. def get_mysql():
  51. # return pymysql.connect(
  52. # host='39.108.116.125', # "localhost", # 修改后的主机
  53. # port=3306, # 3306, # 添加端口号
  54. # user='drug_retrieve', # 'root', # 修改后的用户名
  55. # password='Pem287cwM58jNpe2', # 修改后的密码
  56. # db='drug_retrieve', #
  57. # charset='utf8mb4'
  58. # )
  59. return pymysql.connect(
  60. host='120.24.49.2', # 修改后的主机
  61. port=3306, # 添加端口号
  62. user='drug_retrieve', # 修改后的用户名
  63. password='ksCt3xm6chzdkafj', # 修改后的密码
  64. db='drug_retrieve', # 修改后的数据库名
  65. charset='utf8mb4'
  66. )
  67. SCHEDULER_INTERVAL_SECONDS = 600
  68. PLATFORM_MT = 4
  69. TASK_STATUS_PENDING = 1
  70. DEVICE_STATUS_IDLE = 0
  71. dispatch_lock = threading.Lock()
  72. running_task_ids = set()
  73. running_device_ids = set()
  74. worker_threads = {}
  75. scheduler_stop_event = threading.Event()
  76. scheduler_timer = None
  77. def parse_optional_int(value, default=None):
  78. if value in (None, ""):
  79. return default
  80. try:
  81. return int(value)
  82. except (TypeError, ValueError):
  83. return default
  84. def parse_spec_list(value):
  85. if value is None:
  86. return []
  87. if isinstance(value, (list, tuple)):
  88. return [str(item).strip() for item in value if str(item).strip()]
  89. text = str(value).strip()
  90. if not text:
  91. return []
  92. parts = re.split(r"[,,/\s]+", text)
  93. return [part.strip() for part in parts if part.strip()]
  94. def fetch_pending_tasks():
  95. conn = None
  96. try:
  97. conn = get_mysql()
  98. with conn.cursor() as cursor:
  99. sql = """
  100. SELECT *
  101. FROM retrieve_collect_task_allocate
  102. WHERE platform = %s AND status = %s
  103. ORDER BY id ASC
  104. """
  105. cursor.execute(sql, (PLATFORM_MT, TASK_STATUS_PENDING))
  106. return cursor.fetchall()
  107. except Exception as e:
  108. logging.exception(f"读取美团待执行任务失败: {e}")
  109. return []
  110. finally:
  111. if conn:
  112. conn.close()
  113. def fetch_idle_device_by_equipment_id(equipment_id):
  114. conn = None
  115. try:
  116. conn = get_mysql()
  117. with conn.cursor() as cursor:
  118. sql = """
  119. SELECT *
  120. FROM retrieve_collect_equipment
  121. WHERE name LIKE %s AND id = %s AND status = %s
  122. LIMIT 1
  123. """
  124. cursor.execute(sql, ('%mt%', equipment_id, DEVICE_STATUS_IDLE))
  125. return cursor.fetchone()
  126. except Exception as e:
  127. logging.exception(f"读取美团空闲设备失败 equipment_id={equipment_id}: {e}")
  128. return None
  129. finally:
  130. if conn:
  131. conn.close()
  132. def build_mt_task_payload(task_row, device_row):
  133. title_key = (task_row[5] if len(task_row) > 5 and task_row[5] is not None else "").strip()
  134. spec_list = parse_spec_list(task_row[6] if len(task_row) > 6 else None)
  135. brand = (task_row[7] if len(task_row) > 7 and task_row[7] is not None else "").strip()
  136. search_key = f"{brand}{title_key}".strip() or title_key
  137. start_page = parse_optional_int(task_row[9] if len(task_row) > 9 else None, None)
  138. end_page = parse_optional_int(task_row[10] if len(task_row) > 10 else None, None)
  139. page_range = []
  140. if start_page and end_page:
  141. page_range = [start_page, end_page]
  142. return {
  143. "task_id": task_row[0],
  144. "equipment_id": task_row[2],
  145. "enterprise_id": task_row[3] if len(task_row) > 3 else None,
  146. "platform": task_row[4],
  147. "title_key": title_key,
  148. "spec_list": spec_list,
  149. "brand": brand,
  150. "search_key": search_key,
  151. "sort": "升序",
  152. "collect_range": [],
  153. "page_range": page_range,
  154. "start_page": start_page,
  155. "end_page": end_page,
  156. "device_id": device_row[2],
  157. "workflow_retry_limit": {
  158. "start_app": 3,
  159. "open_product_list_page": 3,
  160. "collect_single_product": 3,
  161. },
  162. "workflow_error_action": {
  163. "start_app": "start_app",
  164. "open_product_list_page": "start_app",
  165. "collect_single_product": "back_to_list_page",
  166. },
  167. "task_row": task_row,
  168. }
  169. def fetch_runnable_mt_task_payloads():
  170. tasks = fetch_pending_tasks()
  171. if not tasks:
  172. logging.info("当前没有美团待执行任务")
  173. return []
  174. payloads = []
  175. reserved_equipment_ids = set()
  176. for task_row in tasks:
  177. task_id = task_row[0]
  178. equipment_id = task_row[2]
  179. with dispatch_lock:
  180. if task_id in running_task_ids:
  181. continue
  182. if equipment_id in reserved_equipment_ids:
  183. continue
  184. device_row = fetch_idle_device_by_equipment_id(equipment_id)
  185. if not device_row:
  186. logging.info(f"美团任务 {task_id} 对应设备 {equipment_id} 当前不空闲,跳过本轮")
  187. continue
  188. device_id = device_row[2]
  189. with dispatch_lock:
  190. if device_id in running_device_ids:
  191. logging.info(f"美团设备 {device_id} 已在本进程执行任务,跳过任务 {task_id}")
  192. continue
  193. running_task_ids.add(task_id)
  194. running_device_ids.add(device_id)
  195. reserved_equipment_ids.add(equipment_id)
  196. payloads.append(build_mt_task_payload(task_row, device_row))
  197. return payloads
  198. def cleanup_finished_workers():
  199. dead_threads = []
  200. with dispatch_lock:
  201. for device_id, thread in worker_threads.items():
  202. if not thread.is_alive():
  203. dead_threads.append(device_id)
  204. for device_id in dead_threads:
  205. worker_threads.pop(device_id, None)
  206. def run_mt_task_worker(task_payload):
  207. task_id = task_payload["task_id"]
  208. device_id = task_payload["device_id"]
  209. mt = None
  210. try:
  211. logging.info(f"[美团任务 {task_id}] 开始执行,设备: {device_id}")
  212. mt = MT(
  213. task_payload["search_key"],
  214. task_payload["title_key"],
  215. task_payload["spec_list"],
  216. task_payload["brand"],
  217. task_payload.get("sort"),
  218. task_payload.get("collect_range"),
  219. task_payload.get("page_range"),
  220. task_payload.get("workflow_retry_limit"),
  221. task_payload.get("workflow_error_action"),
  222. )
  223. mt.main(device_id)
  224. logging.info(f"[美团任务 {task_id}] 执行完成,设备: {device_id}")
  225. except Exception as e:
  226. logging.exception(f"[美团任务 {task_id}] 执行异常,设备: {device_id},错误: {e}")
  227. finally:
  228. if mt and hasattr(mt, 'close'):
  229. mt.close()
  230. with dispatch_lock:
  231. running_task_ids.discard(task_id)
  232. running_device_ids.discard(device_id)
  233. worker_threads.pop(device_id, None)
  234. def dispatch_pending_tasks():
  235. cleanup_finished_workers()
  236. task_payloads = fetch_runnable_mt_task_payloads()
  237. if not task_payloads:
  238. return
  239. for task_payload in task_payloads:
  240. device_id = task_payload["device_id"]
  241. try:
  242. thread = threading.Thread(
  243. target=run_mt_task_worker,
  244. args=(task_payload,),
  245. daemon=True,
  246. name=f"mt-{device_id}",
  247. )
  248. with dispatch_lock:
  249. worker_threads[device_id] = thread
  250. thread.start()
  251. logging.info(f"[美团任务 {task_payload['task_id']}] 已分发到设备 {device_id}")
  252. except Exception:
  253. with dispatch_lock:
  254. running_task_ids.discard(task_payload["task_id"])
  255. running_device_ids.discard(device_id)
  256. worker_threads.pop(device_id, None)
  257. raise
  258. def schedule_dispatch(delay_seconds=SCHEDULER_INTERVAL_SECONDS):
  259. global scheduler_timer
  260. if scheduler_stop_event.is_set():
  261. return
  262. scheduler_timer = threading.Timer(delay_seconds, scheduled_dispatch_job)
  263. scheduler_timer.daemon = False
  264. scheduler_timer.name = "mt-scheduler"
  265. scheduler_timer.start()
  266. def scheduled_dispatch_job():
  267. try:
  268. dispatch_pending_tasks()
  269. except Exception as e:
  270. logging.exception(f"美团定时调度异常: {e}")
  271. finally:
  272. schedule_dispatch(SCHEDULER_INTERVAL_SECONDS)
  273. class SpiderMonitor(threading.Thread):
  274. """全局弹窗监控线程(增强版)"""
  275. def __init__(self, spider_instance):
  276. super().__init__(daemon=True)
  277. self.spider = spider_instance
  278. self.running = True
  279. self.pausing = threading.Event() # 主线程同步事件
  280. self.last_verification_time = 0
  281. self.verification_count = 0
  282. self.MAX_VERIFICATION_RETRY = 10
  283. self.recent_clicks = deque(maxlen=10) # 防重复点击
  284. self.logger = logging.getLogger("SpiderMonitor")
  285. # 可配置化弹窗规则
  286. self.popup_rules = {
  287. "simple": [
  288. ('//*[@text="确定"]', "点击确定"),
  289. ('//*[@text="允许"]', "点击允许"),
  290. ('//*[@text="关闭"]', "点击关闭"),
  291. ('//*[@resource-id="com.sankuai.meituan:id/close"]', "关闭按钮"),
  292. ('//*[@resource-id="com.sankuai.meituan:id/address_center_location_close"]', "关闭按钮"),
  293. ('//*[@resource-id="com.sankuai.meituan:id/location_close"]', "关闭按钮"),
  294. ('//*[@resource-id="com.sankuai.meituan:id/btn_close"]', "关闭按钮"),
  295. ],
  296. "verification": [
  297. '//*[contains(@text, "验证")]',
  298. '//*[contains(@text, "滑块")]',
  299. '//*[contains(@text, "依次点击")]',
  300. '//*[contains(@text, "请点击")]',
  301. '//*[contains(@text, "拖动滑块刚")]', # 这个需要拖动滑块至最右边,然后再截图
  302. '//*[contains(@text, "请输入图片中的内容")]',
  303. '//*[contains(@text, "用最短线连接")]',
  304. '//*[contains(@text, "请按语序依次点击")]',
  305. '//*[contains(@text, "请向右滑动滑块")]',
  306. '//*[contains(@text, "请拖动下方滑块完成拼图")]',
  307. '//*[contains(@resource-id, "captcha")]'
  308. ]
  309. }
  310. def run(self):
  311. while self.running:
  312. try:
  313. handled = self.check_and_handle_popup()
  314. time.sleep(2 if handled else 1)
  315. except Exception as e:
  316. self.logger.exception("监控线程异常: %s", e)
  317. time.sleep(1)
  318. def _is_recent_click(self, xpath):
  319. """防止重复点击同一个弹窗"""
  320. key = f"{xpath}_{int(time.time())}"
  321. if key in self.recent_clicks:
  322. return True
  323. self.recent_clicks.append(key)
  324. return False
  325. def check_and_handle_popup(self):
  326. d = self.spider.d
  327. # 1. 处理简单弹窗
  328. for xpath, desc in self.popup_rules["simple"]:
  329. if d.xpath(xpath).exists and not self._is_recent_click(xpath):
  330. self.logger.info("检测到弹窗: %s", desc)
  331. d.xpath(xpath).click()
  332. return True
  333. # 2. 处理验证码弹窗
  334. for xpath in self.popup_rules["verification"]:
  335. if d.xpath(xpath).exists:
  336. now = time.time()
  337. if now - self.last_verification_time < 30:
  338. return False # 30秒内不重复触发
  339. self.last_verification_time = now
  340. self.verification_count += 1
  341. self.logger.warning("验证码弹窗触发,等待人工处理...")
  342. if self.verification_count > self.MAX_VERIFICATION_RETRY:
  343. self.logger.error("验证码重试超限,终止任务")
  344. self.spider.stop_all()
  345. return True
  346. self.pausing.set() # 通知主线程暂停
  347. # d.toast.show("需要人工处理验证码", 120)
  348. while True:
  349. if not d.xpath(xpath).exists:
  350. self.logger.info("验证码已处理")
  351. # d.toast.show("验证完成", 2)
  352. self.pausing.clear() # 放行主线程
  353. return True
  354. time.sleep(2)
  355. self.logger.warning("验证码超时,重启APP")
  356. self.spider.restart_app()
  357. return True
  358. # 3. 处理广告弹窗(点击右上角)
  359. if d.xpath('//*[contains(@text, "广告")]').exists:
  360. w, h = d.info['displayWidth'], d.info['displayHeight']
  361. d.click(w - 50, 50)
  362. self.logger.info("关闭广告弹窗")
  363. return True
  364. return False
  365. def stop(self):
  366. self.running = False
  367. class MTScreenshot:
  368. def __init__(self, d, oss_config, search_key, title_key, scroll_times=4, compress_quality=7, resize_ratio=0.8):
  369. # 接收外部已连接好的u2设备实例
  370. self.d = d
  371. self.search_key = search_key # 添加这行
  372. self.title_key = title_key
  373. # 启动全局弹窗监控
  374. self.monitor = SpiderMonitor(self)
  375. self.monitor.start()
  376. self.loggerMT = logging.getLogger()
  377. # 日志初始化
  378. self.logger = self._init_logger()
  379. # OSS配置与初始化(核心配置,无冗余)
  380. self.oss_config = oss_config
  381. self.oss_bucket = self._init_oss_bucket()
  382. # 截图核心参数
  383. self.scroll_times = scroll_times
  384. self.compress_quality = compress_quality
  385. self.resize_ratio = resize_ratio
  386. def _init_logger(self):
  387. # 极简日志配置,仅保留必要输出
  388. logger = logging.getLogger("mt_screenshot")
  389. logger.setLevel(logging.INFO)
  390. logger.handlers.clear()
  391. handler = logging.StreamHandler()
  392. handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
  393. logger.addHandler(handler)
  394. return logger
  395. def _init_oss_bucket(self):
  396. # 仅做OSS配置校验和Bucket连接,无额外功能
  397. if not all([self.oss_config.get("access_key_id"),
  398. self.oss_config.get("access_key_secret"),
  399. self.oss_config.get("endpoint"),
  400. self.oss_config.get("bucket_name")]):
  401. self.logger.warning("OSS配置不完整,无法上传")
  402. return None
  403. try:
  404. auth = oss2.Auth(self.oss_config["access_key_id"], self.oss_config["access_key_secret"])
  405. bucket = oss2.Bucket(auth, self.oss_config["endpoint"], self.oss_config["bucket_name"])
  406. bucket.get_bucket_info() # 验证连接
  407. self.logger.info("OSS Bucket连接成功")
  408. return bucket
  409. except Exception as e:
  410. self.logger.error(f"OSS Bucket连接失败: {e}")
  411. return None
  412. def _upload_to_oss(self, local_path):
  413. # 极简上传逻辑,仅返回OSS URL或None
  414. if not self.oss_bucket or not os.path.exists(local_path):
  415. return None
  416. file_name = os.path.basename(local_path)
  417. safe_name = re.sub(r'[^\w\.\-]', '_', file_name)
  418. oss_key = f"{self.oss_config.get('oss_prefix', 'scrape_data/')}{safe_name}"
  419. try:
  420. oss2.resumable_upload(self.oss_bucket, oss_key, local_path)
  421. # 生成并返回完整OSS URL
  422. oss_file_url = f"https://{self.oss_config['bucket_name']}.{self.oss_config['endpoint']}/{urllib.parse.quote(oss_key, safe='/')}"
  423. self.logger.info(f"OSS上传成功: {oss_file_url}")
  424. return oss_file_url
  425. except Exception as e:
  426. self.logger.error(f"OSS上传失败: {e}")
  427. return None
  428. def safe_exec(self, func, *args, **kwargs):
  429. """
  430. 万能安全壳:执行 func 前检查验证码,
  431. 若监控线程已置位 pausing,则一直阻塞直到放行。
  432. """
  433. while self.monitor.pausing.is_set():
  434. time.sleep(1)
  435. # 执行真正逻辑
  436. return func(*args, **kwargs)
  437. def _get_title(self):
  438. # try:
  439. def _inner():
  440. print(f'获取商品title时的搜索关键字:{self.title_key}')
  441. # 初始化
  442. drugs_name = ''
  443. specifications = ''
  444. title = ''
  445. # 循环的获取title为了有时间来处理人机验证
  446. for m in range(1, 6000):
  447. if self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').exists:
  448. title = self.safe_exec(
  449. lambda: self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').text
  450. )
  451. self.loggerMT.info(f"第{m}次获取title成功")
  452. print(f"第{m}次获取title成功")
  453. break
  454. else:
  455. time.sleep(1)
  456. # return drugs_name, specifications
  457. title = title[1:] if title.startswith('0') else title
  458. print(f'获取到药品标题:{title}')
  459. match = re.match(r'(\[[^\]]+\])(.*?)\s*((?:\d+\S*|\(.+))$', title)
  460. if match:
  461. drugs_name = title
  462. specifications = match.group(3).strip()
  463. print("药品名:", drugs_name)
  464. print("规格:", specifications)
  465. # print('完整药名:', drugs_name + specifications)
  466. return drugs_name # , specifications
  467. else:
  468. drugs_name = title
  469. specifications = ''
  470. return drugs_name
  471. # 用 safe_exec 包装内部逻辑,确保验证码阻塞
  472. return self.safe_exec(_inner)
  473. def _merge_screenshots(self, screens):
  474. # 仅拼接截图,无额外功能
  475. if len(screens) == 1:
  476. return screens[0].convert('RGB')
  477. rgb_screens = [s.convert('RGB') for s in screens]
  478. total_width = rgb_screens[0].width
  479. total_height = sum(s.height for s in rgb_screens)
  480. merged_img = Image.new('RGB', (total_width, total_height))
  481. y_offset = 0
  482. for img in rgb_screens:
  483. merged_img.paste(img, (0, y_offset))
  484. y_offset += img.height
  485. return merged_img
  486. def get_oss_url(self):
  487. """核心方法:截图+临时本地保存+上传OSS+上传成功删本地文件+返回OSS URL,可直接赋值给oss_file"""
  488. local_file_path = None
  489. try:
  490. # 1. 提取标题
  491. title = self._get_title()
  492. self.logger.info(f"获取标题: {title[:20]}..." if title else "未获取到标题")
  493. # 2. 生成本地文件路径
  494. timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  495. safe_title = re.sub(r'[\\/*?:"<>|]', '_', title)
  496. local_dir = "../scrape_data"
  497. os.makedirs(local_dir, exist_ok=True)
  498. local_file_path = os.path.join(local_dir, f"{timestamp}_{safe_title}.jpg")
  499. # 3. 滚动截图
  500. screen_list = [self.d.screenshot()]
  501. w, h = self.d.window_size()
  502. for i in range(self.scroll_times):
  503. # 可能滑动距离太短,截不到店名。原本是0.8
  504. # self.d.swipe(w // 2, h * 0.9, w // 2, h * 0.1, duration=random.uniform(0.6, 1.2))
  505. self.d.swipe(w // 2, h * 0.85, w // 2, h * 0.15, # 滑动到15%
  506. duration=random.uniform(0.8, 1.5))
  507. time.sleep(random.uniform(2.0, 4.0))
  508. screen_list.append(self.d.screenshot())
  509. if self.d(textContains='商家服务').exists:
  510. # 看情况是否需要补滑
  511. break
  512. # 4. 拼接+压缩+保存
  513. merged_img = self._merge_screenshots(screen_list)
  514. if 0.1 < self.resize_ratio < 1.0:
  515. new_size = (int(merged_img.width * self.resize_ratio), int(merged_img.height * self.resize_ratio))
  516. resample_mode = Image.Resampling.LANCZOS if hasattr(Image, 'Resampling') else Image.LANCZOS
  517. merged_img = merged_img.resize(new_size, resample_mode)
  518. # 临时保存到本地
  519. merged_img.save(local_file_path, format='JPEG', quality=self.compress_quality)
  520. merged_img.close() # 释放长图句柄
  521. self.logger.info(f"临时本地保存: {local_file_path}")
  522. # 5. 上传OSS
  523. oss_url = self._upload_to_oss(local_file_path)
  524. # 6. 核心:OSS上传成功后,删除本地临时文件
  525. if oss_url is not None:
  526. try:
  527. self.logger.info(f"✅ OSS上传成功,已删除本地临时文件: {local_file_path}")
  528. except Exception as e:
  529. self.logger.warning(f"⚠️ OSS上传成功,但删除本地文件失败: {e}")
  530. return oss_url
  531. except Exception as e:
  532. self.logger.error(f"截图/上传失败: {e}")
  533. return None
  534. class MT:
  535. def __init__(
  536. self,
  537. key,
  538. title_key,
  539. spec_list,
  540. brand,
  541. sort=None,
  542. collect_range=None,
  543. page_range=None,
  544. workflow_retry_limit=None,
  545. workflow_error_action=None,
  546. ):
  547. self.package_name = Config.PACKAGE_NAME
  548. self.access_token = get_access_token()
  549. self.city2province = self.get_city_info()
  550. self.APP_ID = '116857964'
  551. self.API_KEY = '1gAzACJOAr7BeILKqkqPOETh'
  552. self.SECRET_KEY = 'ZNArANb9GwJYgLKg4EfYhukKBfPdl1n3'
  553. self.client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
  554. self.city_to_name = city_name_to_id.build_city_name_to_id(_DEFAULT_PATH)
  555. self.table_name = "retrieve_scrape_data"
  556. self.shop_table_name = "mt_shop_info_middle"
  557. self.loggerMT = logging.getLogger()
  558. self.page = 0
  559. self.search_key = key
  560. self.title_key = title_key
  561. self.spec_list = spec_list
  562. self.brand = brand
  563. self.sort = sort
  564. self.collect_range = self.normalize_collect_range(collect_range)
  565. self.page_range = self.normalize_page_range(page_range)
  566. self.sort_key = 0
  567. self.unrelated_data = 0
  568. self.shop_data_num = 0
  569. self.max_unrelated_data = 15
  570. self.collection_cursor = {"page_no": 1, "item_index": 0}
  571. self.workflow_retry_limit = workflow_retry_limit or {
  572. "start_app": 3,
  573. "open_product_list_page": 3,
  574. "collect_single_product": 3,
  575. }
  576. self.workflow_error_action = workflow_error_action or {
  577. "start_app": "start_app",
  578. "open_product_list_page": "start_app",
  579. "collect_single_product": "back_to_list_page",
  580. }
  581. @staticmethod
  582. def normalize_collect_range(collect_range):
  583. if not collect_range:
  584. return None
  585. start = None
  586. end = None
  587. if isinstance(collect_range, dict):
  588. start = collect_range.get("start")
  589. end = collect_range.get("end")
  590. elif isinstance(collect_range, (list, tuple)) and len(collect_range) >= 2:
  591. start, end = collect_range[0], collect_range[1]
  592. elif isinstance(collect_range, str):
  593. matched = re.match(r"^\s*(\d+(?:\.\d+)?)\s*[-,~]\s*(\d+(?:\.\d+)?)\s*$", collect_range)
  594. if matched:
  595. start, end = matched.group(1), matched.group(2)
  596. try:
  597. start = float(start)
  598. end = float(end)
  599. except (TypeError, ValueError):
  600. return None
  601. if start < 0 or end < 0:
  602. return None
  603. if start > end:
  604. start, end = end, start
  605. return {"start": start, "end": end}
  606. @staticmethod
  607. def normalize_page_range(page_range):
  608. if not page_range:
  609. return None
  610. start = None
  611. end = None
  612. if isinstance(page_range, dict):
  613. start = page_range.get("start")
  614. end = page_range.get("end")
  615. elif isinstance(page_range, (list, tuple)) and len(page_range) >= 2:
  616. start, end = page_range[0], page_range[1]
  617. elif isinstance(page_range, str):
  618. matched = re.match(r"^\s*[\[\(]?\s*(\d+)\s*[,,\-~]\s*(\d+)\s*[\]\)]?\s*$", page_range)
  619. if matched:
  620. start, end = matched.group(1), matched.group(2)
  621. try:
  622. start = int(float(start))
  623. end = int(float(end))
  624. except (TypeError, ValueError):
  625. return None
  626. if start <= 0 or end <= 0:
  627. return None
  628. if start > end:
  629. start, end = end, start
  630. return {"start": start, "end": end}
  631. def stop_app(self):
  632. self.d.app_stop(self.package_name)
  633. time.sleep(1)
  634. def start_app(self):
  635. self.d.app_start(self.package_name)
  636. time.sleep(1)
  637. def restart_app(self):
  638. self.stop_app()
  639. self.start_app()
  640. def li_or_lo(self, key):
  641. if key == "升序":
  642. self.sort_key += 1
  643. self.d.xpath('//*[@text="价格"]').click()
  644. n = self.d.xpath('//*[@text="总价低到高"]')
  645. if n.exists:
  646. n.click()
  647. time.sleep(1)
  648. if key == "降序":
  649. self.sort_key += 1
  650. self.d.xpath('//*[@text="价格"]').click()
  651. time.sleep(2)
  652. self.d.xpath('//*[@text="价格"]').click()
  653. def wr_re(self, mod, device_id, sort=None, page=None):
  654. file_path = f'./ycwj/{device_id}_{self.title_key}.txt'
  655. if mod == "写":
  656. try:
  657. data = {
  658. "page": page if page else "",
  659. "sort": sort if sort else "",
  660. }
  661. os.makedirs(os.path.dirname(file_path), exist_ok=True)
  662. with open(file_path, 'w', encoding='utf-8') as f:
  663. json.dump(data, f, ensure_ascii=False, indent=2)
  664. print(f"进度保存成功:{sort},{page}页")
  665. except Exception as e:
  666. print("保存进度失败")
  667. elif mod == "读":
  668. try:
  669. if not os.path.exists(file_path):
  670. return None
  671. with open(file_path, 'r', encoding='utf-8') as f:
  672. data = json.load(f)
  673. print(self.sort)
  674. if self.sort and self.sort_key == 0:
  675. self.li_or_lo(self.sort)
  676. i = 0
  677. while True:
  678. if i == data['page']:
  679. self.page = data['page']
  680. print("当前页", self.page)
  681. break
  682. else:
  683. i += 1
  684. self.d.drag(300, 1400, 300, 400, 1)
  685. return data
  686. except Exception as e:
  687. print(f"读取进度失败")
  688. return None
  689. return None
  690. # 任何一个spec满足都算有效
  691. def is_link_spec_useful(self, product_title):
  692. if len(self.spec_list) == 0:
  693. return True
  694. for spec in self.spec_list:
  695. if spec in product_title:
  696. return True
  697. return False
  698. # TODO 继续优化这里的判断逻辑,可以考虑搭配config的修改
  699. def is_link_useful(self, product_title):
  700. if self.title_key != "" and self.title_key not in product_title:
  701. print(f"当前商品名称:{product_title} 不包含{self.title_key}关键字")
  702. return False
  703. if self.brand != "" and self.brand not in product_title:
  704. print(f"当前商品名称:{product_title} 不包含{self.brand}品牌")
  705. return False
  706. if not self.is_link_spec_useful(product_title):
  707. print(f"当前商品名称:{product_title} 不包含{self.spec_list}品规")
  708. return False
  709. return True
  710. @staticmethod
  711. def get_sleep_time():
  712. # return random.randint(5, 8)
  713. # return random.randint(1, 2)
  714. return 1
  715. @staticmethod
  716. def get_current_date():
  717. return datetime.datetime.now().strftime('%Y/%m/%d')
  718. @staticmethod
  719. def get_city_info():
  720. """
  721. 获取所有的省市数据
  722. :return:
  723. """
  724. file_path = '../kailin_city.json'
  725. with open(file_path, 'r', encoding='utf-8') as f:
  726. data = json.load(f)
  727. province = {province_one["id"]: province_one for province_one in data['province']}
  728. city2province = dict()
  729. city = data['city']
  730. for city_one in city:
  731. name = city_one['name']
  732. pid = city_one['pid']
  733. if len(str(pid)) > 2:
  734. pid = int(re.match('^\d{2}', str(pid)).group())
  735. city2province[name] = province[pid]['name']
  736. return city2province
  737. def get_shop_name_from_current_page(self):
  738. """
  739. 仅从当前商品详情页读取店铺名,不做任何页面跳转。
  740. """
  741. shop_name = self.get_first_text_by_xpaths([
  742. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView',
  743. '//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView',
  744. ])
  745. if shop_name:
  746. print(f'获取到店铺名:{shop_name}')
  747. return shop_name
  748. def get_shop_name(self):
  749. """
  750. 获取店铺名
  751. :return:
  752. """
  753. shop_name = self.get_shop_name_from_current_page()
  754. if shop_name:
  755. return shop_name
  756. try:
  757. # 点击店铺进入后获取店铺名称
  758. print("点击店铺进入后获取店铺名称")
  759. self.enter_shop()
  760. shop_xpath = '//*[@resource-id="com.sankuai.meituan:id/layout_header_view"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]//android.widget.FrameLayout[2]/android.widget.FrameLayout[1]/android.widget.TextView'
  761. if self.d.xpath(shop_xpath).exists:
  762. shop_name = self.d.xpath(shop_xpath).text
  763. self.swipe_back(1)
  764. return shop_name
  765. shop_name = ''
  766. return shop_name
  767. except Exception as e:
  768. print(f'获取店铺名出错:{e}')
  769. return ''
  770. def get_qualification_number(self):
  771. """
  772. 获取资质编号
  773. :return:
  774. """
  775. try:
  776. qualification_number_str = self.d.xpath(
  777. '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]').text
  778. qualification_number = qualification_number_str.strip('资质编号:').strip()
  779. return qualification_number
  780. except:
  781. return None
  782. def get_shop_address(self):
  783. try:
  784. shop_address_xpaths = [
  785. '//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.TextView',
  786. '//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.TextView',
  787. ]
  788. shop_address = self.get_first_text_by_xpaths(shop_address_xpaths)
  789. print(f'111-获取到店铺地址:{shop_address}')
  790. if '发货时间' in shop_address:
  791. print(f'店铺地址包含发货时间,再次获取店铺地址')
  792. shop_address = self._read_xpath_text(shop_address_xpaths[1])
  793. if shop_address:
  794. print(f'222-获取到店铺地址:{shop_address}')
  795. else:
  796. print(f'222-xpath2获取店铺地址失败')
  797. print(f'333-获取到店铺地址:{shop_address}')
  798. return shop_address
  799. except:
  800. print(f'获取店铺地址出错-get_shop_address')
  801. return None
  802. def execute_db_write(self, sql, params, action_desc, max_retries=5):
  803. for attempt in range(max_retries):
  804. conn = None
  805. try:
  806. conn = get_mysql()
  807. with conn.cursor() as cur:
  808. cur.execute(sql, params)
  809. conn.commit()
  810. print(f"{action_desc}成功")
  811. return True
  812. except Exception as e:
  813. print(f'{action_desc}异常 (尝试 {attempt + 1}/{max_retries}): {e}')
  814. if conn:
  815. conn.rollback()
  816. if attempt == max_retries - 1:
  817. print(f"{action_desc}失败,达到最大重试次数")
  818. return False
  819. time.sleep(2)
  820. finally:
  821. if conn:
  822. conn.close()
  823. def query_exists(self, sql, params, error_desc):
  824. conn = None
  825. try:
  826. conn = get_mysql()
  827. with conn.cursor() as cur:
  828. cur.execute(sql, params)
  829. return bool(cur.fetchone())
  830. except Exception as e:
  831. print(f"{error_desc}错误: {str(e)}")
  832. return None
  833. finally:
  834. if conn:
  835. conn.close()
  836. def save_to_database(self, data):
  837. print(f'保存数据到数据库:{data}')
  838. add_sql = f"""
  839. INSERT INTO {self.table_name} (
  840. enterprise_id, platform_id, platform_item_id, province_id, city_id,
  841. province_name, city_name, area_info, product_name, product_specs,
  842. one_box_price, manufacture_date, expiry_date, manufacturer, approval_number,
  843. is_sold_out, online_posting_count, continuous_listing_count, link_url,
  844. store_name, store_url, shipment_province_id, shipment_province_name,
  845. shipment_city_id, shipment_city_name, company_name, qualification_number,
  846. scrape_date, min_price, number, sales, inventory, snapshot_url, insert_time
  847. ) VALUES (
  848. %s, %s, %s, %s, %s,
  849. %s, %s, %s, %s, %s,
  850. %s, %s, %s, %s, %s,
  851. %s, %s, %s, %s,
  852. %s, %s, %s, %s,
  853. %s, %s, %s, %s,
  854. %s, %s, %s, %s, %s, %s, %s
  855. )
  856. """
  857. params = (
  858. data['enterprise_id'],
  859. data['platform_id'],
  860. data['platform_item_id'],
  861. data['province_id'],
  862. data['city_id'],
  863. data['province_name'],
  864. data['city_name'],
  865. data['area_info'],
  866. data['product_name'],
  867. data['product_specs'],
  868. data['one_box_price'],
  869. data['manufacture_date'],
  870. data['expiry_date'],
  871. data['manufacturer'],
  872. data['approval_number'],
  873. data['is_sold_out'],
  874. data['online_posting_count'],
  875. data['continuous_listing_count'],
  876. data['link_url'],
  877. data['store_name'],
  878. data['store_url'],
  879. data['shipment_province_id'],
  880. data['shipment_province_name'],
  881. data['shipment_city_id'],
  882. data['shipment_city_name'],
  883. data['company_name'],
  884. data['qualification_number'],
  885. data['scrape_date'],
  886. data['min_price'],
  887. data['number'],
  888. data['sales'],
  889. data['inventory'],
  890. data['snapshot_url'],
  891. data['insert_time'],
  892. )
  893. return self.execute_db_write(add_sql, params, "保存商品数据到数据库")
  894. def save_shop_info_to_database(self, data):
  895. print(f'保存店铺数据到数据库:{data}')
  896. add_sql = f"""
  897. INSERT INTO {self.shop_table_name}
  898. (shop, contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform)
  899. VALUES (%s, %s, %s, %s, %s, %s, %s)
  900. """
  901. params = (
  902. data['shop'],
  903. data['contact_address'],
  904. data['qualification_number'],
  905. data['business_license_company'],
  906. data['business_license_address'],
  907. data['scrape_date'],
  908. data['platform'],
  909. )
  910. return self.execute_db_write(add_sql, params, "保存店铺数据到数据库")
  911. def swipe_back(self, no):
  912. """
  913. 返回
  914. :param no: 回退次数
  915. :return:
  916. """
  917. for idx in range(no):
  918. self.d.press('back')
  919. time.sleep(self.get_sleep_time())
  920. def drug_price(self):
  921. """
  922. 获取药品价格
  923. :return:
  924. """
  925. try:
  926. price_str = self.d.xpath('//*[starts-with(@text,"¥")]').text
  927. price = float(re.search(r'[\d\.]+', price_str).group())
  928. print(f'获取到价格:{price}')
  929. return price
  930. except Exception as e:
  931. print(f'提取价格出错-->{e}')
  932. return None
  933. def drug_sale_num(self):
  934. """
  935. 获取药品销量
  936. :return:
  937. """
  938. try:
  939. sales_element = self.d.xpath('//*[starts-with(@text,"已售")]')
  940. if sales_element.exists:
  941. sales_num_str = self.d.xpath('//*[starts-with(@text,"已售")]').text
  942. sales_num_str = sales_num_str.replace("已售", "").strip()
  943. # price = float(re.search(r'[\d\.]+', price_str).group())
  944. print(f'获取到已售数量:{sales_num_str}')
  945. return sales_num_str
  946. return None
  947. except Exception as e:
  948. print(f'提取已售数量出错-->{e}')
  949. return None
  950. def restart_uiautomator_services(self, device_id):
  951. """
  952. 重启atx的uiautomator 服务
  953. :param device_id:
  954. :return:
  955. """
  956. stop_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d --stop'
  957. start_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'
  958. subprocess.run(stop_uiautomator_services, capture_output=True, text=True, shell=True)
  959. time.sleep(self.get_sleep_time())
  960. subprocess.run(start_uiautomator_services, capture_output=True, text=True, shell=True)
  961. time.sleep(self.get_sleep_time())
  962. def reconnect_device(self):
  963. """重启 atx-agent 并重新连接设备"""
  964. try:
  965. # 停止 atx-agent
  966. subprocess.run(["adb", "-s", self.device_id, "shell",
  967. "/data/local/tmp/atx-agent", "server", "-d", "--stop"],
  968. capture_output=True, timeout=5)
  969. time.sleep(1)
  970. # 启动 atx-agent
  971. subprocess.run(["adb", "-s", self.device_id, "shell",
  972. "/data/local/tmp/atx-agent", "server", "-d"],
  973. capture_output=True, timeout=5)
  974. time.sleep(2)
  975. # 重新连接 uiautomator2
  976. self.d = u2.connect_usb(self.device_id)
  977. self.restart_uiautomator_services(self.device_id)
  978. self.loggerMT.info("设备重连成功")
  979. return True
  980. except Exception as e:
  981. self.loggerMT.error(f"设备重连失败: {e}")
  982. return False
  983. def connect_devices(self, device_id):
  984. """
  985. 连接设备
  986. :return:
  987. """
  988. try:
  989. self.device_id = device_id
  990. self.d = u2.connect_usb(device_id)
  991. self.restart_uiautomator_services(device_id)
  992. self.oss_config = {
  993. "access_key_id": 'LTAI5tDwjfteBvivYN41r8sJ',
  994. "access_key_secret": 'yowuOGi2nYYnrqGpO3qcz94C4brcPp',
  995. "endpoint": "oss-cn-shenzhen.aliyuncs.com", # 例:oss-cn-beijing.aliyuncs.com
  996. "bucket_name": "zhijiayun-jiansuo",
  997. "oss_prefix": "scrape_data/" # OSS中存放截图的前缀(虚拟文件夹)
  998. }
  999. print(f'连接到设备:{device_id}')
  1000. self.loggerMT.info(f'连接到设备:{device_id}')
  1001. except Exception as e:
  1002. print(f'{device_id} 连接错误: {e}')
  1003. self.loggerMT.info(f'{device_id} 连接错误: {e}')
  1004. raise Exception(e)
  1005. def get_ocr_res(self, img):
  1006. try:
  1007. # img地址
  1008. print(f'开始识别图片:{img}')
  1009. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  1010. f = open(img, 'rb')
  1011. img = base64.b64encode(f.read())
  1012. params = {"image": img}
  1013. request_url = request_url + "?access_token=" + self.access_token
  1014. headers = {'content-type': 'application/x-www-form-urlencoded'}
  1015. response = requests.post(request_url, data=params, headers=headers)
  1016. if response:
  1017. res = response.json()
  1018. new_dic = dict()
  1019. for ite in res['words_result'].keys():
  1020. new_dic[ite] = res['words_result'][ite]['words']
  1021. print('资质数据信息', new_dic)
  1022. return new_dic
  1023. else:
  1024. return None
  1025. except:
  1026. return None
  1027. def remove_watermark(self, img_path):
  1028. """
  1029. 图片去水印(将水印部分变成白色背景)并将数据转化为二进制数据
  1030. :param img_path: 图片路径
  1031. :return: 二进制图片数据
  1032. """
  1033. img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
  1034. endswith = os.path.splitext(img_path)[1]
  1035. new = np.clip(1.4057577998008846 * img - 38.33089999653017, 0, 255).astype(np.uint8)
  1036. _, img_binary = cv2.imencode(endswith, new)
  1037. return img_binary
  1038. def get_ocr_res_image(self, img):
  1039. try:
  1040. image = self.remove_watermark(img)
  1041. # image_file = open(img,'wb')
  1042. # image_file.write(image)
  1043. # res_image = self.client.basicAccurate(image) # 高精度
  1044. res_image = self.client.basicGeneral(image)
  1045. data = res_image.get('words_result', '')
  1046. print(f'百度api返回结果:{data}')
  1047. return data
  1048. except:
  1049. return None
  1050. def screenshot_the_business_license(self, qualification_number):
  1051. screenshot_path = 'screenshot1.png'
  1052. self.d.screenshot(screenshot_path)
  1053. img = cv2.imread(screenshot_path)
  1054. # 指定裁剪区域 (left, top, right, bottom)
  1055. left = 0
  1056. top = 480
  1057. right = 720
  1058. bottom = 1420
  1059. cropped_img = img[top:bottom, left:right]
  1060. # 创建目录
  1061. SCREENSHOT_DIR = Path('screenshot') # 注意这里的变化和py文件同一级目录即可
  1062. SCREENSHOT_DIR.mkdir(parents=True, exist_ok=True)
  1063. if qualification_number:
  1064. # cropped_screenshot_path = 'D:\\work\\dfwy_spider\\drug_data\\mt\\screenshot\\' + qualification_number + '.png'
  1065. cropped_screenshot_path = SCREENSHOT_DIR / f'{qualification_number}.png'
  1066. else:
  1067. cropped_screenshot_path = 'cropped_screenshot.png'
  1068. cv2.imwrite(cropped_screenshot_path, cropped_img)
  1069. return cropped_screenshot_path
  1070. def screenshot_instruction(self):
  1071. # 获取当前时间
  1072. current_time = datetime.datetime.now()
  1073. # 格式化为时分秒
  1074. time_str = current_time.strftime("%H-%M-%S")
  1075. # 生成随机的 8 位字符串
  1076. random_str = secrets.token_hex(4) # 生成 4 个字节的随机字符串,转换为 8 位十六进制字符串
  1077. print(time_str)
  1078. screenshot_path = 'instructionscreenshot1-' + time_str + '-' + random_str + '.png'
  1079. self.d.screenshot(screenshot_path)
  1080. return screenshot_path
  1081. def extract_specification(self, text):
  1082. """提取药品规格信息"""
  1083. # 方法1:简单去除到期信息
  1084. pattern = r'^[^【]+'
  1085. match = re.search(pattern, text)
  1086. if match:
  1087. return match.group(0).strip()
  1088. return text
  1089. # 获取商品title
  1090. def get_title(self):
  1091. def _inner():
  1092. print(f'获取商品title时的搜索关键字:{self.title_key}')
  1093. # 初始化
  1094. drugs_name = ''
  1095. specifications = ''
  1096. title = ''
  1097. # 循环的获取title为了有时间来处理人机验证
  1098. for m in range(1, 6000):
  1099. if self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').exists:
  1100. title = self.safe_exec(
  1101. lambda: self.d.xpath(f'//*[contains(@text, "{self.title_key}")]').text
  1102. )
  1103. print(f"第{m}次获取title成功")
  1104. break
  1105. else:
  1106. time.sleep(3)
  1107. # return drugs_name, specifications
  1108. title = title[1:] if title.startswith('0') else title
  1109. print(f'获取到药品标题:{title}')
  1110. # match = re.match(r'(\[[^\]]+\])(.*?)\s*((?:\d+\S*|\(.+))$', title)
  1111. match = re.match(r'^(?:0?)?(?:\[([^\]]+)\])?\s*(.*?)\s*(\d+[^\s]+)$', title)
  1112. if match:
  1113. # drugs_name = match.group(1).strip() + match.group(2).strip()
  1114. drugs_name = title
  1115. specifications = match.group(3).strip()
  1116. print("药品名:", drugs_name)
  1117. print("规格:", specifications)
  1118. # 如果品规中包含到期则需要再次的正则处理
  1119. if '到期' in specifications:
  1120. specifications = self.extract_specification(specifications)
  1121. # print('完整药名:', drugs_name + specifications)
  1122. return drugs_name, specifications
  1123. else:
  1124. print("没有匹配到预期格式")
  1125. drugs_name = title
  1126. specifications = ''
  1127. return drugs_name, specifications
  1128. # 用 safe_exec 包装内部逻辑,确保验证码阻塞
  1129. return self.safe_exec(_inner)
  1130. def enter_shop(self):
  1131. """
  1132. 进店,方便提取资质环境
  1133. :return:
  1134. """
  1135. # self.d.xpath('//*[@text="进店"]').click()
  1136. self.d.xpath('//*[@text="店铺"]').click()
  1137. time.sleep(self.get_sleep_time())
  1138. def enter_shoper(self):
  1139. """
  1140. 进入商家
  1141. :return:
  1142. """
  1143. is_shoper_exists = 0
  1144. for i in range(10):
  1145. if self.d.xpath('//*[@text="商家"]').exists:
  1146. print(f'第{i}次商家存在')
  1147. is_shoper_exists = 1
  1148. break
  1149. else:
  1150. print(f'第{i}次商家不存在')
  1151. time.sleep(self.get_sleep_time())
  1152. if is_shoper_exists == 1:
  1153. self.d.xpath('//*[@text="商家"]').click()
  1154. time.sleep(self.get_sleep_time())
  1155. return True
  1156. else:
  1157. return False
  1158. # 点击查看商家资质
  1159. def scan_shoper_license(self):
  1160. exist_shoper = 0
  1161. for i in range(10):
  1162. if self.d.xpath('//*[@text="查看商家资质"]').exists:
  1163. print(f'第{i}次查看商家资质存在')
  1164. exist_shoper = 1
  1165. break
  1166. else:
  1167. print(f'第{i}次查看商家资质不存在')
  1168. time.sleep(self.get_sleep_time())
  1169. if exist_shoper == 1:
  1170. self.d.xpath('//*[@text="查看商家资质"]').click()
  1171. time.sleep(self.get_sleep_time())
  1172. else:
  1173. self.swipe_back(1)
  1174. # 验证商品的信息是否在数据库中已存在
  1175. def data_is_exists(self, data):
  1176. """
  1177. 检查指定数据是否已存在于数据库表中(仅检查存在性)
  1178. 参数:
  1179. data: 包含查询条件的字典,键为列名,值为条件值
  1180. 返回:
  1181. True: 数据存在
  1182. False: 数据不存在
  1183. None: 检查过程中出错
  1184. """
  1185. required_keys = ['product', 'min_price', 'shop', 'scrape_date', 'platform']
  1186. if not all(key in data for key in required_keys):
  1187. missing = [key for key in required_keys if key not in data]
  1188. logging.error(f"缺少必要字段: {', '.join(missing)}")
  1189. return None
  1190. query_sql = f"""
  1191. SELECT 1 FROM {self.table_name}
  1192. WHERE product_name = %s
  1193. AND min_price = %s
  1194. AND store_name = %s
  1195. AND scrape_date = %s
  1196. AND platform_id = %s
  1197. LIMIT 1
  1198. """
  1199. params = (
  1200. data['product'],
  1201. data['min_price'],
  1202. data['shop'],
  1203. data['scrape_date'],
  1204. data['platform']
  1205. )
  1206. return self.query_exists(query_sql, params, "商品查重")
  1207. # 验证店铺信息是否在数据库中已存在
  1208. def shop_is_exists_database(self, shop):
  1209. query_sql = f"""
  1210. SELECT 1 FROM {self.shop_table_name}
  1211. WHERE shop = %s
  1212. LIMIT 1
  1213. """
  1214. return self.query_exists(query_sql, (shop,), "店铺查重")
  1215. def wait_if_verifying(self, monitor, timeout=120):
  1216. """验证码处理期间阻塞主线程"""
  1217. start = time.time()
  1218. while monitor.pausing.is_set() and time.time() - start < timeout:
  1219. time.sleep(1)
  1220. def wait_for_ready(self, monitor, timeout=86400):
  1221. """进入每一页前都先等验证码"""
  1222. start = time.time()
  1223. while monitor.pausing.is_set() and time.time() - start < timeout:
  1224. time.sleep(1)
  1225. # 额外保险:如果验证码突然在这一秒才弹,再主动扫一次
  1226. monitor.check_and_handle_popup()
  1227. def safe_list(self, xpath, monitor):
  1228. """线程安全地拿商品列表"""
  1229. self.wait_for_ready(monitor)
  1230. return self.d.xpath(xpath).all()
  1231. def safe_exec(self, func, *args, **kwargs):
  1232. """
  1233. 万能安全壳:执行 func 前检查验证码,
  1234. 若监控线程已置位 pausing,则一直阻塞直到放行。
  1235. """
  1236. while self.monitor.pausing.is_set():
  1237. time.sleep(1)
  1238. max_retries = 3
  1239. for attempt in range(max_retries):
  1240. try:
  1241. return func(*args, **kwargs)
  1242. except http.client.RemoteDisconnected as e:
  1243. self.loggerMT.error(f"连接断开 (尝试 {attempt + 1}/{max_retries}): {e}")
  1244. if attempt == max_retries - 1:
  1245. raise # 最后一次失败,向上抛出
  1246. # 尝试重连
  1247. if self.reconnect_device():
  1248. self.loggerMT.info("重连成功,准备重试...")
  1249. time.sleep(2) # 等待设备稳定
  1250. continue
  1251. else:
  1252. self.loggerMT.error("重连失败,无法继续")
  1253. raise
  1254. except Exception as e:
  1255. # 其他异常直接抛出
  1256. raise
  1257. # 执行真正逻辑
  1258. return func(*args, **kwargs)
  1259. def get_next_data(self, data, target):
  1260. for i, item in enumerate(data):
  1261. if item['words'] == target:
  1262. if i + 1 < len(data):
  1263. return data[i + 1]['words']
  1264. return None
  1265. def delete_instruction_screenshot(self, screenshot_path):
  1266. # 删除截图文件
  1267. try:
  1268. os.remove(screenshot_path)
  1269. print(f"截图文件已删除:{screenshot_path}")
  1270. except FileNotFoundError:
  1271. print(f"文件未找到,无法删除:{screenshot_path}")
  1272. except Exception as e:
  1273. print(f"删除文件时出错:{e}")
  1274. def get_instructions_data(self):
  1275. """
  1276. 确定有说明书之后,提取所有的说明书数据
  1277. :return:
  1278. """
  1279. self.d.xpath('//*[@text="说明"]').click()
  1280. # time.sleep(random.randint(3, 5))
  1281. time.sleep(0.5)
  1282. if self.d.xpath('//*[@text="查看详细说明"]').exists:
  1283. self.d.xpath('//*[@text="查看详细说明"]').click()
  1284. else:
  1285. view_all_xpath = self.find_xpath_with_swipes(
  1286. ['//*[@text="查看全部"]'],
  1287. swipe_direction='down',
  1288. swipe_scale=0.3,
  1289. max_swipes=8,
  1290. found_desc='查看全部'
  1291. )
  1292. if view_all_xpath:
  1293. self.d.xpath(view_all_xpath).click()
  1294. else:
  1295. res_data = {
  1296. "有效期": '',
  1297. "生产单位": '',
  1298. "批准文号": ''
  1299. }
  1300. self.loggerMT.info('获取到的说明书信息为空。')
  1301. return res_data
  1302. time.sleep(0.5)
  1303. for ii in range(8):
  1304. if self.d.xpath('//*[@text="加载更多"]').exists:
  1305. self.d.xpath('//*[@text="加载更多"]').click()
  1306. time.sleep(0.2)
  1307. break
  1308. else:
  1309. self.d.swipe(200, 1000, 200, 300, 0.3)
  1310. # self.d.swipe_ext("up", scale=0.3)
  1311. for iii in range(10):
  1312. if self.d.xpath('//*[@text="生产单位"]').exists and self.d.xpath('//*[@text="批准文号"]').exists:
  1313. break
  1314. else:
  1315. self.d.swipe(200, 1300, 200, 300, 0.3)
  1316. # self.d.swipe_ext("up", scale=0.3)
  1317. instruction_path = self.screenshot_instruction()
  1318. print(f"instruction_path= {instruction_path}")
  1319. time.sleep(2)
  1320. ocr_res = self.get_ocr_res_image(instruction_path)
  1321. # print(f'ocr_res:{ocr_res}')
  1322. if ocr_res:
  1323. # 获取有效期的下一个数据
  1324. validity = self.get_next_data(ocr_res, '有效期')
  1325. # 获取批准文号的下一个数据
  1326. approval_number = self.get_next_data(ocr_res, '批准文号')
  1327. # 获取生产单位的下一个数据
  1328. manufacturer = self.get_next_data(ocr_res, '生产单位')
  1329. else:
  1330. validity = ''
  1331. approval_number = ''
  1332. manufacturer = ''
  1333. res_data = {
  1334. "有效期": validity,
  1335. "生产单位": manufacturer,
  1336. "批准文号": approval_number
  1337. }
  1338. print(f"res_data={res_data}")
  1339. time.sleep(1)
  1340. self.delete_instruction_screenshot(instruction_path)
  1341. return res_data
  1342. def has_instructions(self):
  1343. """
  1344. 是否有说明书
  1345. :return:
  1346. """
  1347. # 没有说明书的无法采集具体数据
  1348. time.sleep(self.get_sleep_time())
  1349. return bool(self.find_xpath_with_swipes(
  1350. ['//*[@text="说明"]'],
  1351. swipe_direction='down',
  1352. swipe_scale=0.3,
  1353. max_swipes=8,
  1354. found_desc='说明'
  1355. ))
  1356. def has_shop(self):
  1357. """
  1358. 是否有进店按钮
  1359. :return:
  1360. """
  1361. # self.d.swipe_ext('up', 0.1)
  1362. time.sleep(self.get_sleep_time())
  1363. is_has_enter_shop = self.d.xpath('//*[@text="进店"]').exists
  1364. return is_has_enter_shop
  1365. # 获取商品对应的店铺信息
  1366. def get_license_info_ex(self):
  1367. # self.enter_shop()
  1368. self.safe_exec(self.enter_shop)
  1369. # self.enter_shoper()
  1370. result = self.safe_exec(self.enter_shoper)
  1371. if result == False:
  1372. license_info_data = {'contact_address': '', 'qualification_number': '', 'business_license_company': '',
  1373. 'business_license_address': ''}
  1374. return license_info_data
  1375. for i in range(10):
  1376. if self.d.xpath('//*[@text="查看商家资质"]').exists:
  1377. print(f"第{i}次有商家资质")
  1378. break
  1379. else:
  1380. print(f"第{i}次没有商家资质")
  1381. time.sleep(self.get_sleep_time())
  1382. # 获取地址
  1383. # contact_address = self.get_shop_address()
  1384. contact_address = self.safe_exec(self.get_shop_address)
  1385. # time.sleep(50000)
  1386. ###
  1387. # self.scan_shoper_license()
  1388. self.safe_exec(self.scan_shoper_license)
  1389. # 获取资质编码
  1390. # qualification_number = self.get_qualification_number()
  1391. qualification_number = self.safe_exec(self.get_qualification_number)
  1392. # qualification_number 不为None继续下一步
  1393. if qualification_number:
  1394. # 营业执照公司名称
  1395. business_license_company = ''
  1396. # 营业执照地址
  1397. business_license_address = ''
  1398. self.d.click(0.603, 0.27)
  1399. time.sleep(self.get_sleep_time())
  1400. cropped_screenshot_path = self.screenshot_the_business_license(qualification_number)
  1401. print(f'cropped_screenshot_path:{cropped_screenshot_path}')
  1402. # if qualification_number:
  1403. # cropped_screenshot_path = 'D:\\work\\dfwy_spider\\drug_data\\mt\\screenshot\\' + qualification_number + '.png'
  1404. # else:
  1405. # cropped_screenshot_path = 'cropped_screenshot.png'
  1406. # ocr_res = self.get_ocr_res('cropped_screenshot.png')
  1407. ocr_res = self.get_ocr_res(cropped_screenshot_path)
  1408. print(f'ocr_res:{ocr_res}')
  1409. # 获取ocr_res 中的地址、单位名称
  1410. if ocr_res:
  1411. if '单位名称' in ocr_res.keys():
  1412. business_license_company = ocr_res['单位名称']
  1413. if '地址' in ocr_res.keys():
  1414. business_license_address = ocr_res['地址']
  1415. license_info_data = {'contact_address': contact_address, 'qualification_number': qualification_number,
  1416. 'business_license_company': business_license_company,
  1417. 'business_license_address': business_license_address}
  1418. else:
  1419. license_info_data = {'contact_address': contact_address, 'qualification_number': '',
  1420. 'business_license_company': '', 'business_license_address': ''}
  1421. return license_info_data
  1422. def distinct_target(self):
  1423. result = False
  1424. position_xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]'
  1425. position_xpath2 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]'
  1426. is_position = self.d.xpath(position_xpath).exists
  1427. is_position2 = self.d.xpath(position_xpath2).exists
  1428. xpath = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1429. xpath2 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1430. xpath3 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1431. xpath4 = '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  1432. is_position5 = self.d.xpath(xpath).exists
  1433. is_position6 = self.d.xpath(xpath2).exists
  1434. is_position7 = self.d.xpath(xpath3).exists
  1435. is_position8 = self.d.xpath(xpath4).exists
  1436. # print(f"is_position = {is_position}")
  1437. # print(f"is_position2 = {is_position2}")
  1438. if is_position or is_position2 or is_position5 or is_position6 or is_position7 or is_position8:
  1439. result = True
  1440. if result == False:
  1441. print("---检测没有回到列表页---")
  1442. else:
  1443. print("---检测回到了列表页---")
  1444. return result
  1445. # return is_position
  1446. def enter_target_page(self):
  1447. self.d.xpath('//*[@content-desc="看病买药"]').click()
  1448. time.sleep(self.get_sleep_time())
  1449. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/vf_search_carousel_text"]').click()
  1450. time.sleep(self.get_sleep_time())
  1451. self.d.xpath(
  1452. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]').click()
  1453. time.sleep(1)
  1454. self.d.send_keys(self.search_key, clear=True)
  1455. time.sleep(1)
  1456. self.d.xpath('//*[@text="搜索"]').click()
  1457. time.sleep(1)
  1458. self.click_express_send()
  1459. time.sleep(1)
  1460. self.wr_re("读", self.device_id)
  1461. def click_express_send(self):
  1462. # xpath= '//*[@resource-id="com.sankuai.meituan:id/container"]//android.widget.HorizontalScrollView[last()]'
  1463. slide_xpaths = [
  1464. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]',
  1465. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]',
  1466. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]',
  1467. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]',
  1468. ]
  1469. for i in range(1, 3):
  1470. matched_slide_xpath = self.get_first_existing_xpath(slide_xpaths)
  1471. if not matched_slide_xpath:
  1472. time.sleep(self.get_sleep_time())
  1473. continue
  1474. bounds = self.d.xpath(matched_slide_xpath).info['bounds']
  1475. top = bounds['top']
  1476. bottom = bounds['bottom']
  1477. print(f'top={top}')
  1478. print(f'bottom={bottom}')
  1479. y = (top + bottom) // 2
  1480. print(f'y={y}')
  1481. self.loggerMT.info(f'开始滑动{i}')
  1482. self.d.swipe(500, y, 100, y, 0.5)
  1483. time.sleep(self.get_sleep_time())
  1484. break
  1485. express_send_xpaths = [
  1486. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
  1487. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
  1488. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
  1489. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.HorizontalScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]',
  1490. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/androidx.recyclerview.widget.RecyclerView[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]',
  1491. ]
  1492. self.click_candidate_xpaths(
  1493. express_send_xpaths,
  1494. action_desc="点击快递送",
  1495. max_retries=5,
  1496. sleep_after=self.get_sleep_time(),
  1497. )
  1498. def get_clipboard(self):
  1499. time.sleep(1)
  1500. self.loggerMT.info(f"Clipboard content:{self.d.clipboard}") # 打印调试信息
  1501. clipboard_content = self.d.clipboard
  1502. if clipboard_content is None:
  1503. return ''
  1504. return clipboard_content.strip()
  1505. # return self.d.clipboard.strip()
  1506. def clear_clipboard(self):
  1507. self.d.set_clipboard("", "text/plain")
  1508. def get_product_link(self):
  1509. product_link = ''
  1510. dots_xpaths = [
  1511. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]',
  1512. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[3]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]',
  1513. '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ImageView[1]'
  1514. ]
  1515. max_retry = 5 # 最多尝试次数
  1516. for idx in range(1, max_retry + 1):
  1517. if product_link: # 已经拿到则退出
  1518. break
  1519. clicked_xpath = self.click_candidate_xpaths(
  1520. dots_xpaths,
  1521. action_desc=f'{idx}-进入分享点点点',
  1522. max_retries=1,
  1523. sleep_after=0.2,
  1524. )
  1525. if not clicked_xpath:
  1526. print(f'{idx}-分享入口点击失败')
  1527. time.sleep(self.get_sleep_time())
  1528. continue
  1529. self.loggerMT.info(f'{idx}-click_exists')
  1530. self.d.xpath('//*[@text="分享商品"]').click_exists()
  1531. time.sleep(0.2)
  1532. link_xpath = '//*[@text="复制链接"]'
  1533. if self.d.xpath(link_xpath).exists:
  1534. self.loggerMT.info(f'{idx}-link_xpath click')
  1535. self.d.xpath(link_xpath).click()
  1536. time.sleep(1)
  1537. product_link = self.get_clipboard()
  1538. time.sleep(0.5)
  1539. print(f'{idx}-商品链接:{product_link}')
  1540. self.loggerMT.info(f'{idx}-商品链接:{product_link}')
  1541. break
  1542. print(f'{idx}-商品链接:{product_link}')
  1543. self.loggerMT.info(f'{idx}-商品链接:{product_link}')
  1544. product_link = ''
  1545. if not product_link and idx < max_retry:
  1546. time.sleep(0.5) # 最后一次不需要再等待
  1547. return product_link
  1548. def run_parallel_tasks(self, task_map):
  1549. """
  1550. 并行执行相互独立的只读任务。
  1551. 任务本身不能包含点击、滑动、返回等会改变页面状态的操作。
  1552. """
  1553. if not task_map:
  1554. return {}
  1555. results = {}
  1556. with ThreadPoolExecutor(max_workers=len(task_map)) as executor:
  1557. future_map = {
  1558. task_name: executor.submit(self.safe_exec, task_func)
  1559. for task_name, task_func in task_map.items()
  1560. }
  1561. for task_name, future in future_map.items():
  1562. try:
  1563. results[task_name] = future.result()
  1564. except Exception as e:
  1565. print(f'并行采集任务 {task_name} 执行失败: {e}')
  1566. results[task_name] = None
  1567. return results
  1568. def get_available_xpaths(self, xpaths):
  1569. check_tasks = {
  1570. f'xpath_{idx}': (lambda xp=xp: self.d.xpath(xp).exists)
  1571. for idx, xp in enumerate(xpaths)
  1572. }
  1573. exists_results = self.run_parallel_tasks(check_tasks)
  1574. return [
  1575. xpath for idx, xpath in enumerate(xpaths)
  1576. if exists_results.get(f'xpath_{idx}')
  1577. ]
  1578. def get_first_existing_xpath(self, xpaths):
  1579. available_xpaths = self.get_available_xpaths(xpaths)
  1580. if not available_xpaths:
  1581. return None
  1582. return available_xpaths[0]
  1583. def get_first_text_by_xpaths(self, xpaths):
  1584. text_tasks = {
  1585. f'xpath_{idx}': (lambda xp=xp: self._read_xpath_text(xp))
  1586. for idx, xp in enumerate(xpaths)
  1587. }
  1588. text_results = self.run_parallel_tasks(text_tasks)
  1589. for idx, _ in enumerate(xpaths):
  1590. text = text_results.get(f'xpath_{idx}')
  1591. if text:
  1592. return text
  1593. return ''
  1594. def get_first_texts_by_xpath_groups(self, xpath_groups):
  1595. tasks = {}
  1596. group_keys = {}
  1597. for group_name, xpaths in xpath_groups.items():
  1598. group_keys[group_name] = []
  1599. for idx, xpath in enumerate(xpaths):
  1600. task_name = f'{group_name}_{idx}'
  1601. group_keys[group_name].append(task_name)
  1602. tasks[task_name] = (lambda xp=xpath: self._read_xpath_text(xp))
  1603. text_results = self.run_parallel_tasks(tasks)
  1604. grouped_results = {}
  1605. for group_name, task_names in group_keys.items():
  1606. grouped_results[group_name] = ''
  1607. for task_name in task_names:
  1608. text = text_results.get(task_name)
  1609. if text:
  1610. grouped_results[group_name] = text
  1611. break
  1612. return grouped_results
  1613. def _read_xpath_text(self, xpath):
  1614. selector = self.d.xpath(xpath)
  1615. if not selector.exists:
  1616. return ''
  1617. try:
  1618. text = selector.text
  1619. return text.strip() if isinstance(text, str) else text
  1620. except Exception:
  1621. return ''
  1622. def click_candidate_xpaths(self, xpaths, action_desc, max_retries=1, sleep_after=0):
  1623. for attempt in range(1, max_retries + 1):
  1624. available_xpaths = self.get_available_xpaths(xpaths)
  1625. if not available_xpaths:
  1626. print(f'{action_desc}失败,第{attempt}次没有匹配到可点击的xpath')
  1627. time.sleep(self.get_sleep_time())
  1628. continue
  1629. rotate_offset = (attempt - 1) % len(available_xpaths)
  1630. candidate_xpaths = available_xpaths[rotate_offset:] + available_xpaths[:rotate_offset]
  1631. for xpath in candidate_xpaths:
  1632. try:
  1633. self.safe_exec(lambda xp=xpath: self.d.xpath(xp).click())
  1634. print(f'{action_desc}成功')
  1635. if sleep_after:
  1636. time.sleep(sleep_after)
  1637. return xpath
  1638. except Exception as e:
  1639. print(f'{action_desc}点击异常: {e}')
  1640. time.sleep(self.get_sleep_time())
  1641. return None
  1642. def find_xpath_with_swipes(self, xpaths, swipe_direction='down', swipe_scale=0.3, max_swipes=8, found_desc=''):
  1643. for idx in range(max_swipes):
  1644. matched_xpath = self.get_first_existing_xpath(xpaths)
  1645. if matched_xpath:
  1646. if found_desc:
  1647. print(f'第{idx}次找到{found_desc}')
  1648. return matched_xpath
  1649. self.d.swipe_ext(swipe_direction, swipe_scale)
  1650. time.sleep(1)
  1651. matched_xpath = self.get_first_existing_xpath(xpaths)
  1652. if matched_xpath and found_desc:
  1653. print(f'第{max_swipes}次找到{found_desc}')
  1654. return matched_xpath
  1655. def build_dup_data(self, product, min_price, shop, scrape_date):
  1656. return {
  1657. 'product': product,
  1658. 'min_price': min_price,
  1659. 'shop': shop,
  1660. 'scrape_date': scrape_date,
  1661. 'platform': '4',
  1662. }
  1663. def integrate_data(self):
  1664. """
  1665. 整合数据
  1666. :return:
  1667. """
  1668. # title_info = self.get_title() # 药品,规格
  1669. # title_info = self.safe_exec(self.get_title) # 药品,规格
  1670. product, specifications = self.safe_exec(self.get_title) # 药品,规格
  1671. if not product:
  1672. self.swipe_back(1)
  1673. return
  1674. page_data = self.run_parallel_tasks({
  1675. "min_price": self.drug_price,
  1676. "sales_num": self.drug_sale_num,
  1677. "is_self_operated": lambda: self.d.xpath('//*[@text="自营"]').exists,
  1678. })
  1679. min_price = page_data.get("min_price") # 最低价格
  1680. sales_num = page_data.get("sales_num") # 销售数量
  1681. product_link = ''
  1682. if page_data.get("is_self_operated"):
  1683. shop = "美团自营大药房(快递电商)"
  1684. # 爬取日期
  1685. scrape_date = self.get_current_date()
  1686. dup_data = self.build_dup_data(product, min_price, shop, scrape_date)
  1687. print(f'当前数据:{dup_data}')
  1688. if self.data_is_exists(dup_data):
  1689. print('存在相同数据不入库')
  1690. self.back_to_list_page()
  1691. return
  1692. else:
  1693. self.find_xpath_with_swipes(
  1694. ['//*[@text="进店"]'],
  1695. swipe_direction='up',
  1696. swipe_scale=0.3,
  1697. max_swipes=8,
  1698. found_desc='进店'
  1699. )
  1700. shop_page_data = self.run_parallel_tasks({
  1701. "shop": self.get_shop_name_from_current_page,
  1702. "is_has_enter_shop": self.has_shop,
  1703. })
  1704. shop = shop_page_data.get("shop")
  1705. if not shop:
  1706. shop = self.get_shop_name()
  1707. # 爬取日期
  1708. scrape_date = self.get_current_date()
  1709. dup_data = self.build_dup_data(product, min_price, shop, scrape_date)
  1710. print(f'当前数据:{dup_data}')
  1711. if not shop:
  1712. print('未获取到店铺名:开始回退')
  1713. self.back_to_list_page()
  1714. return
  1715. if '自营' in shop:
  1716. self.back_to_list_page()
  1717. return
  1718. db_check_results = self.run_parallel_tasks({
  1719. "dup_exists": lambda: self.data_is_exists(dup_data),
  1720. "shop_exists": lambda: self.shop_is_exists_database(shop),
  1721. })
  1722. if db_check_results.get("dup_exists"):
  1723. print('存在相同数据不入库')
  1724. self.back_to_list_page()
  1725. return
  1726. # 获取店铺信息开始
  1727. is_has_enter_shop = bool(shop_page_data.get("is_has_enter_shop"))
  1728. # 需要判断shop是否已经在数据库中存在,如果存在,则不再进入店铺,直接进入下一个商品
  1729. shop_is_exists = bool(db_check_results.get("shop_exists"))
  1730. # 存在进店 并且店铺的名称不包含美团官方的字样
  1731. print(f"已采集{self.shop_data_num}家店铺数据")
  1732. if is_has_enter_shop and '美团官方' not in shop and '美团自营' not in shop and not shop_is_exists and self.shop_data_num < 500:
  1733. license_info = self.safe_exec(self.get_license_info_ex)
  1734. contact_address = license_info['contact_address']
  1735. qualification_number = license_info['qualification_number']
  1736. business_license_company = license_info['business_license_company']
  1737. business_license_address = license_info['business_license_address']
  1738. save_shop_data = {
  1739. 'shop': shop,
  1740. 'contact_address': contact_address,
  1741. 'qualification_number': qualification_number,
  1742. 'scrape_date': scrape_date,
  1743. 'business_license_company': business_license_company,
  1744. 'business_license_address': business_license_address,
  1745. 'platform': '4'
  1746. }
  1747. self.save_shop_info_to_database(save_shop_data)
  1748. self.shop_data_num += 1
  1749. self.swipe_back(2)
  1750. else:
  1751. print('不采集店铺信息')
  1752. # 获取店铺信息结束
  1753. # 商品链接
  1754. product_link = self.get_product_link()
  1755. print(f'获取到product_link: {product_link}')
  1756. time.sleep(self.get_sleep_time())
  1757. # 生产日期为空
  1758. manufacture_date = ''
  1759. expiry_date = ''
  1760. manufacturer = ''
  1761. approval_number = ''
  1762. # 暂时不获取说明书信息 start
  1763. is_has_instructions = self.safe_exec(self.has_instructions)
  1764. # 说明书等信息
  1765. if is_has_instructions:
  1766. print('开始获取说明书信息')
  1767. instructions_info = self.safe_exec(self.get_instructions_data)
  1768. if instructions_info['有效期'] is not None:
  1769. expiry_date = instructions_info['有效期'].strip('。')
  1770. if instructions_info['生产单位'] is not None:
  1771. manufacturer = instructions_info['生产单位'].strip('。')
  1772. if instructions_info['批准文号'] is not None:
  1773. approval_number = instructions_info['批准文号'].strip('。')
  1774. else:
  1775. expiry_date = "未知"
  1776. manufacturer = None
  1777. approval_number = None
  1778. province_id = 0
  1779. city_id = 0
  1780. city = ''
  1781. province = ''
  1782. if province in self.city_to_name:
  1783. province_id = self.city_to_name[province]
  1784. if city in self.city_to_name:
  1785. city_id = self.city_to_name[city]
  1786. save_data = {
  1787. 'enterprise_id': 3,
  1788. 'platform_id': 4,
  1789. 'platform_item_id': '',
  1790. 'province_id': province_id,
  1791. 'city_id': city_id,
  1792. 'province_name': '',
  1793. 'city_name': '',
  1794. 'area_info': "",
  1795. 'product_name': product,
  1796. 'product_specs': specifications,
  1797. 'one_box_price': 0.00,
  1798. 'manufacture_date': manufacture_date,
  1799. 'expiry_date': expiry_date,
  1800. 'manufacturer': manufacturer,
  1801. 'approval_number': approval_number,
  1802. 'is_sold_out': 0,
  1803. 'online_posting_count': 1,
  1804. 'continuous_listing_count': 1,
  1805. 'link_url': product_link,
  1806. 'store_name': shop,
  1807. 'store_url': '',
  1808. 'shipment_province_id': 0,
  1809. 'shipment_province_name': "",
  1810. 'shipment_city_id': 0,
  1811. 'shipment_city_name': "",
  1812. 'company_name': "",
  1813. 'qualification_number': "",
  1814. 'scrape_date': scrape_date,
  1815. 'min_price': min_price,
  1816. 'number': 0,
  1817. 'sales': sales_num,
  1818. 'inventory': "",
  1819. 'snapshot_url': "",
  1820. 'insert_time': time.strftime('%Y-%m-%d %H:%M:%S'),
  1821. 'update_time': time.strftime('%Y-%m-%d %H:%M:%S'),
  1822. }
  1823. self.save_to_database(save_data)
  1824. def back_to_list_page(self):
  1825. for i in range(5):
  1826. if self.distinct_target():
  1827. return True
  1828. print(f'第{i}次尝试退回到列表页')
  1829. self.swipe_back(1)
  1830. time.sleep(self.get_sleep_time())
  1831. print('页面出错,没有退回到列表页')
  1832. return False
  1833. def reset_collection_cursor(self):
  1834. self.collection_cursor["page_no"] = 1
  1835. self.collection_cursor["item_index"] = 0
  1836. def get_current_page_no(self):
  1837. return self.page + self.collection_cursor["page_no"]
  1838. def jump_to_page(self, target_page):
  1839. current_page = self.get_current_page_no()
  1840. if target_page <= current_page:
  1841. return
  1842. while current_page < target_page:
  1843. if self.d.xpath('//*[@text="已经到底啦"]').exists:
  1844. print(f"列表实际页数不足,当前停留在第{current_page}页,无法跳转到第{target_page}页")
  1845. return
  1846. print(f"跳过第{current_page}页,前往第{current_page + 1}页")
  1847. self.d.drag(300, 1400, 300, 400, 1)
  1848. time.sleep(1)
  1849. self.collection_cursor["page_no"] += 1
  1850. self.collection_cursor["item_index"] = 0
  1851. current_page = self.get_current_page_no()
  1852. def move_to_page_range_start(self):
  1853. if not self.page_range:
  1854. return
  1855. start_page = self.page_range["start"]
  1856. current_page = self.get_current_page_no()
  1857. if current_page < start_page:
  1858. self.jump_to_page(start_page)
  1859. def start_collection_app(self):
  1860. self.sort_key = 0
  1861. self.restart_app()
  1862. def open_product_list_page(self):
  1863. self.safe_exec(self.enter_target_page)
  1864. self.reset_collection_cursor()
  1865. if self.sort and self.sort_key == 0:
  1866. self.li_or_lo(self.sort)
  1867. self.move_to_page_range_start()
  1868. def handle_workflow_error(self, step_name):
  1869. action = self.workflow_error_action.get(step_name)
  1870. if action == "back_to_list_page":
  1871. if not self.back_to_list_page():
  1872. raise RuntimeError("退回列表页失败")
  1873. return "collect_single_product"
  1874. if action == "open_product_list_page":
  1875. return "open_product_list_page"
  1876. if action == "start_app":
  1877. return "start_app"
  1878. raise RuntimeError(f"未配置步骤 {step_name} 的错误处理动作")
  1879. def get_list_items(self):
  1880. for _ in range(10):
  1881. items = self.safe_exec(
  1882. self.d.xpath('//android.support.v7.widget.RecyclerView/android.widget.FrameLayout').all
  1883. )
  1884. if items:
  1885. return items
  1886. time.sleep(1)
  1887. raise RuntimeError("列表页商品加载失败")
  1888. def move_to_next_list_page(self):
  1889. current_page = self.get_current_page_no()
  1890. if self.page_range and current_page >= self.page_range["end"]:
  1891. self.wr_re("写", self.device_id, self.sort, current_page)
  1892. print(f'已完成第{current_page}页采集,达到结束页{self.page_range["end"]},停止采集')
  1893. return False
  1894. if self.d.xpath('//*[@text="已经到底啦"]').exists:
  1895. return False
  1896. self.wr_re("写", self.device_id, self.sort, current_page)
  1897. print(f'当前第{current_page}页采集完成,开始滑动到下一页')
  1898. self.d.drag(300, 1400, 300, 400, 1)
  1899. time.sleep(1)
  1900. self.collection_cursor["page_no"] += 1
  1901. self.collection_cursor["item_index"] = 0
  1902. return True
  1903. def _collect_list_item(self, drug_idx, drug_one):
  1904. bounds = drug_one.info['bounds']
  1905. top = bounds['top']
  1906. bottom = bounds['bottom']
  1907. print(f'当前商品bottom:{bottom}')
  1908. print(f'当前商品top:{top}')
  1909. if not (304 <= top and bottom <= 1475):
  1910. return "skip"
  1911. print(f"这页的第几个商品:{drug_idx}")
  1912. item_text_data = self.get_first_texts_by_xpath_groups({
  1913. "product_title": [
  1914. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1915. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1916. ],
  1917. "price_str": [
  1918. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1919. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1920. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1921. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView',
  1922. ],
  1923. "shop_name": [
  1924. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.FrameLayout[last()]/android.widget.TextView[1]',
  1925. f'//android.support.v7.widget.RecyclerView/android.widget.FrameLayout[{drug_idx}]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.widget.FrameLayout[last()]/android.widget.TextView[1]',
  1926. ],
  1927. })
  1928. product_title = item_text_data.get("product_title", "")
  1929. if not product_title:
  1930. print("列表当前商品名称不存在")
  1931. self.unrelated_data += 1
  1932. return "continue"
  1933. product_title = product_title[1:] if product_title.startswith('0') else product_title
  1934. print(f"列表当前商品名称:{product_title}")
  1935. if not self.is_link_useful(product_title):
  1936. print(f"is_link_useful 没通过:{product_title}")
  1937. self.unrelated_data += 1
  1938. return "continue"
  1939. self.unrelated_data = 0
  1940. price = ''
  1941. price_str = item_text_data.get("price_str", "")
  1942. print(f"列表当前商品价格:{price_str}")
  1943. if price_str:
  1944. price = float(re.search(r'[\d\.]+', price_str).group())
  1945. if price != '' and self.collect_range:
  1946. range_start = self.collect_range["start"]
  1947. range_end = self.collect_range["end"]
  1948. if not (range_start <= price <= range_end):
  1949. print(f"price {price} not in range {range_start}-{range_end}, skip")
  1950. return "continue"
  1951. shop_name = item_text_data.get("shop_name", "")
  1952. print(f"列表当前商品店铺名称:{shop_name}")
  1953. if price == '' or shop_name == '':
  1954. print("列表当前商品价格或店铺名称不存在")
  1955. return "continue"
  1956. scrape_date = self.get_current_date()
  1957. dup_data = {
  1958. 'product': product_title,
  1959. 'min_price': price,
  1960. 'shop': shop_name,
  1961. 'scrape_date': scrape_date,
  1962. 'platform': '4',
  1963. }
  1964. if self.data_is_exists(dup_data):
  1965. print('列表存在相同数据不入库')
  1966. return "continue"
  1967. self.safe_exec(drug_one.click)
  1968. print('点击目标药品完毕')
  1969. time.sleep(1)
  1970. self.safe_exec(self.integrate_data)
  1971. print('integrate_data结束')
  1972. time.sleep(1)
  1973. return "collected"
  1974. def collect_single_product(self):
  1975. if self.monitor.verification_count >= self.monitor.MAX_VERIFICATION_RETRY:
  1976. raise RuntimeError("验证码触发过多,暂停程序")
  1977. if self.page_range:
  1978. self.move_to_page_range_start()
  1979. current_page = self.get_current_page_no()
  1980. if current_page > self.page_range["end"]:
  1981. print(f"当前已在第{current_page}页,超过结束页{self.page_range['end']},停止采集")
  1982. return False
  1983. items = self.get_list_items()
  1984. print(f'当前第{self.get_current_page_no()}页,共有{len(items)}个商品')
  1985. while self.collection_cursor["item_index"] < len(items):
  1986. item_index = self.collection_cursor["item_index"]
  1987. self.collection_cursor["item_index"] += 1
  1988. result = self._collect_list_item(item_index + 1, items[item_index])
  1989. if result in {"continue", "collected"}:
  1990. self.back_to_list_page()
  1991. return True
  1992. if not self.move_to_next_list_page():
  1993. print('已经到达列表页最底部')
  1994. return False
  1995. return True
  1996. def execute_workflow_step(self, step_name):
  1997. if step_name == "start_app":
  1998. self.safe_exec(self.start_collection_app)
  1999. return "open_product_list_page"
  2000. if step_name == "open_product_list_page":
  2001. self.safe_exec(self.open_product_list_page)
  2002. return "collect_single_product"
  2003. if step_name == "collect_single_product":
  2004. has_next = self.safe_exec(self.collect_single_product)
  2005. if not has_next:
  2006. return None
  2007. print('目前连续无关数据量: ', self.unrelated_data)
  2008. if self.unrelated_data > self.max_unrelated_data:
  2009. print(f"连续超过{self.max_unrelated_data}个不达标的数据则停止采集")
  2010. return None
  2011. return "collect_single_product"
  2012. raise RuntimeError(f"未知流程步骤: {step_name}")
  2013. def main(self, device_id):
  2014. self.device_id = device_id
  2015. self.connect_devices(device_id)
  2016. time.sleep(self.get_sleep_time())
  2017. self.monitor = SpiderMonitor(self)
  2018. self.monitor.start()
  2019. current_step = "start_app"
  2020. step_failures = {step: 0 for step in self.workflow_retry_limit}
  2021. try:
  2022. while current_step:
  2023. try:
  2024. next_step = self.execute_workflow_step(current_step)
  2025. step_failures[current_step] = 0
  2026. current_step = next_step
  2027. except Exception as e:
  2028. print(f'{current_step} 执行异常: {e}')
  2029. time.sleep(5)
  2030. step_failures[current_step] += 1
  2031. if step_failures[current_step] > self.workflow_retry_limit[current_step]:
  2032. raise
  2033. current_step = self.handle_workflow_error(current_step)
  2034. finally:
  2035. self.monitor.stop()
  2036. self.monitor.join()
  2037. device_list = {
  2038. "U8ONIJJJS4CELVD6": [
  2039. {
  2040. "search_key": "喇叭牌正露丸 100粒",
  2041. "title_key": "正露丸",
  2042. "spec_list": [""],
  2043. "brand": "喇叭牌",
  2044. "sort": "升序",
  2045. "collect_range": [],
  2046. "page_range": [],
  2047. "workflow_retry_limit": {
  2048. "start_app": 3,
  2049. "open_product_list_page": 3,
  2050. "collect_single_product": 3,
  2051. },
  2052. "workflow_error_action": {
  2053. "start_app": "start_app",
  2054. "open_product_list_page": "start_app",
  2055. "collect_single_product": "back_to_list_page",
  2056. },
  2057. },
  2058. ],
  2059. }
  2060. def run_device(device_id):
  2061. """单个设备的采集任务(运行于独立线程)"""
  2062. if device_id not in device_list:
  2063. logging.error(f"设备id没有配置: {device_id}")
  2064. return
  2065. tasks = device_list[device_id]
  2066. logging.info(f"[设备 {device_id}] 开始执行,共 {len(tasks)} 个任务")
  2067. for task in tasks:
  2068. cycle_no = 0
  2069. while True:
  2070. cycle_no += 1
  2071. mt = None
  2072. logging.info(f'[设备 {device_id}] ========== {task["search_key"]} 第 {cycle_no} 轮采集开始 ==========')
  2073. try:
  2074. # 注意:MT类需要从外部导入或定义,此处假设已存在
  2075. mt = MT(
  2076. task["search_key"],
  2077. task["title_key"],
  2078. task["spec_list"],
  2079. task["brand"],
  2080. task.get("sort"),
  2081. task.get("collect_range"),
  2082. task.get("page_range"),
  2083. task.get("workflow_retry_limit"),
  2084. task.get("workflow_error_action"),
  2085. )
  2086. mt.main(device_id)
  2087. logging.info(f'[设备 {device_id}] 关键字 {task["search_key"]} 本轮采集完成')
  2088. break # 成功则跳出重试循环
  2089. except Exception as e:
  2090. logging.exception(f'[设备 {device_id}] 关键字 {task["search_key"]} 采集异常:{e}')
  2091. # 发生异常后继续循环重试
  2092. finally:
  2093. if mt and hasattr(mt, 'close'):
  2094. mt.close()
  2095. logging.info(f"[设备 {device_id}] 所有任务执行完毕")
  2096. def main():
  2097. # 配置日志格式,便于区分不同线程的输出
  2098. logging.basicConfig(
  2099. level=logging.INFO,
  2100. format='%(asctime)s [%(threadName)s] %(levelname)s: %(message)s'
  2101. )
  2102. if len(sys.argv) < 2:
  2103. logging.info(f"美团数据库调度器启动,轮询间隔 {SCHEDULER_INTERVAL_SECONDS} 秒")
  2104. dispatch_pending_tasks()
  2105. schedule_dispatch(SCHEDULER_INTERVAL_SECONDS)
  2106. return
  2107. device_ids = sys.argv[1:]
  2108. invalid_ids = [did for did in device_ids if did not in device_list]
  2109. if invalid_ids:
  2110. logging.error(f"以下设备ID未配置: {invalid_ids}")
  2111. return
  2112. logging.info(f"将运行指定的设备: {device_ids}")
  2113. if not device_ids:
  2114. logging.warning("没有可运行的设备")
  2115. return
  2116. with ThreadPoolExecutor(max_workers=len(device_ids)) as executor:
  2117. futures = [executor.submit(run_device, did) for did in device_ids]
  2118. for future in futures:
  2119. future.result()
  2120. if __name__ == '__main__':
  2121. main()