pdd_new4.py 74 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769
  1. import requests
  2. import base64
  3. from concurrent.futures import ThreadPoolExecutor
  4. import uiautomator2 as u2
  5. import time
  6. import sys
  7. import subprocess
  8. import re
  9. import random
  10. import json
  11. from aip import AipOcr
  12. import numpy as np
  13. import cv2
  14. import os
  15. from pdd_config import Config
  16. import logging
  17. from logger import setup_logger
  18. import pymysql
  19. from 拼多多盒数处理脚本.main import extract_box_number
  20. import datetime
  21. import threading
  22. setup_logger("pdd_spider") # 初始化日志
  23. def get_mysql():
  24. return pymysql.connect(
  25. host='120.24.49.2', # 修改后的主机
  26. port=3306, # 添加端口号
  27. user='drug_retrieve', # 修改后的用户名
  28. password='ksCt3xm6chzdkafj', # 修改后的密码
  29. db='drug_retrieve', # 修改后的数据库名
  30. charset='utf8mb4'
  31. )
# Default delay, in seconds, between two runs of the dispatch timer.
SCHEDULER_INTERVAL_SECONDS = 600
# Value of the `platform` column used to select tasks for this spider.
PLATFORM_PDD = 3
# Value of the task `status` column that marks a task as waiting to run.
TASK_STATUS_PENDING = 1
# Value of the equipment `status` column that marks a device as idle.
DEVICE_STATUS_IDLE = 0
# Fallback for a task's maximum item count when the task row has none.
DEFAULT_MAX_COUNTS_LIMIT = 300
# True: read and schedule tasks from the database; False: read tasks directly from device_list
USE_DB_TASK_SOURCE = False
# Guards the three shared containers below (they are touched by the
# scheduler and by every worker thread).
dispatch_lock = threading.Lock()
# Ids of tasks reserved or running in this process.
running_task_ids = set()
# Ids of devices reserved or running in this process.
running_device_ids = set()
# device_id -> worker Thread started for that device.
worker_threads = {}
# When set, schedule_dispatch() stops arming new timers.
scheduler_stop_event = threading.Event()
# The most recently armed threading.Timer (None until first scheduled).
scheduler_timer = None
  45. def parse_optional_int(value, default=None):
  46. if value in (None, ""):
  47. return default
  48. try:
  49. return int(value)
  50. except (TypeError, ValueError):
  51. return default
  52. def fetch_pending_tasks():
  53. conn = None
  54. try:
  55. conn = get_mysql()
  56. with conn.cursor() as cursor:
  57. sql = """
  58. SELECT *
  59. FROM retrieve_collect_task_allocate
  60. WHERE platform = %s AND status = %s
  61. ORDER BY id ASC
  62. """
  63. cursor.execute(sql, (PLATFORM_PDD, TASK_STATUS_PENDING))
  64. return cursor.fetchall()
  65. except Exception as e:
  66. logging.exception(f"读取待执行任务失败: {e}")
  67. return []
  68. finally:
  69. if conn:
  70. conn.close()
  71. def fetch_idle_device_by_equipment_id(equipment_id):
  72. conn = None
  73. try:
  74. conn = get_mysql()
  75. with conn.cursor() as cursor:
  76. sql = """
  77. SELECT *
  78. FROM retrieve_collect_equipment
  79. WHERE name LIKE %s AND id = %s AND status = %s
  80. LIMIT 1
  81. """
  82. cursor.execute(sql, ('%pdd%', equipment_id, DEVICE_STATUS_IDLE))
  83. return cursor.fetchone()
  84. except Exception as e:
  85. logging.exception(f"读取空闲设备失败 equipment_id={equipment_id}: {e}")
  86. return None
  87. finally:
  88. if conn:
  89. conn.close()
  90. def build_task_payload(task_row, device_row):
  91. start_page = parse_optional_int(task_row[9] if len(task_row) > 9 else None, 0)
  92. end_page = parse_optional_int(task_row[10] if len(task_row) > 10 else None, None)
  93. max_counts_limit = parse_optional_int(
  94. task_row[11] if len(task_row) > 11 else None,
  95. DEFAULT_MAX_COUNTS_LIMIT
  96. )
  97. return {
  98. "task_id": task_row[0],
  99. "equipment_id": task_row[2],
  100. "enterprise_id": task_row[3],
  101. "platform": task_row[4],
  102. "title_key": task_row[5],
  103. "spec_list": task_row[6],
  104. "brand": task_row[7],
  105. "search_key": f"{task_row[7]}{task_row[5]}",
  106. "save_search_key": f"{task_row[7]}{task_row[5]}",
  107. "start_page": start_page,
  108. "end_page": end_page,
  109. "max_counts_limit": max_counts_limit,
  110. "sort": "升序",
  111. "device_id": device_row[2],
  112. "task_row": task_row,
  113. }
  114. def fetch_runnable_task_payloads():
  115. tasks = fetch_pending_tasks()
  116. if not tasks:
  117. logging.info("当前没有待执行任务")
  118. return []
  119. payloads = []
  120. reserved_equipment_ids = set()
  121. for task_row in tasks:
  122. task_id = task_row[0]
  123. equipment_id = task_row[2]
  124. with dispatch_lock:
  125. if task_id in running_task_ids:
  126. continue
  127. if equipment_id in reserved_equipment_ids:
  128. continue
  129. device_row = fetch_idle_device_by_equipment_id(equipment_id)
  130. if not device_row:
  131. logging.info(f"任务 {task_id} 对应设备 {equipment_id} 当前不空闲,跳过本轮")
  132. continue
  133. device_id = device_row[2]
  134. with dispatch_lock:
  135. if device_id in running_device_ids:
  136. logging.info(f"设备 {device_id} 已在本进程执行任务,跳过任务 {task_id}")
  137. continue
  138. running_task_ids.add(task_id)
  139. running_device_ids.add(device_id)
  140. reserved_equipment_ids.add(equipment_id)
  141. payloads.append(build_task_payload(task_row, device_row))
  142. return payloads
  143. def cleanup_finished_workers():
  144. dead_threads = []
  145. with dispatch_lock:
  146. for device_id, thread in worker_threads.items():
  147. if not thread.is_alive():
  148. dead_threads.append(device_id)
  149. for device_id in dead_threads:
  150. worker_threads.pop(device_id, None)
  151. def run_task_worker(task_payload):
  152. task_id = task_payload["task_id"]
  153. device_id = task_payload["device_id"]
  154. pdd = None
  155. try:
  156. logging.info(f"[任务 {task_id}] 开始执行,设备: {device_id}")
  157. print(task_payload)
  158. pdd = PDD(
  159. task_payload["search_key"],
  160. device_id,
  161. title_key=task_payload.get("title_key"),
  162. spec_list=task_payload.get("spec_list"),
  163. brand=task_payload.get("brand", ""),
  164. save_search_key=task_payload.get("save_search_key"),
  165. start_page=task_payload.get("start_page"),
  166. end_page=task_payload.get("end_page"),
  167. max_counts_limit=task_payload.get("max_counts_limit"),
  168. direct_shop_lookup=task_payload.get("direct_shop_lookup", False),
  169. sort=task_payload.get("sort"),
  170. platform=task_payload.get("platform"),
  171. task_id=task_payload.get("task_id"),
  172. enterprise_id=task_payload.get("enterprise_id"),
  173. )
  174. completed_normally = pdd.main(device_id, 1, 0)
  175. if completed_normally:
  176. logging.info(f"[任务 {task_id}] 执行完成,设备: {device_id}")
  177. else:
  178. logging.info(f"[任务 {task_id}] 已结束,设备: {device_id}")
  179. except Exception as e:
  180. end_page = task_payload.get("start_page")
  181. if pdd is not None:
  182. end_page = getattr(pdd, "page", end_page)
  183. pdd.finish_task_abnormally(end_page, f"任务执行异常: {e}")
  184. else:
  185. report_api(task_id, end_page=end_page, start=4, end_time=int(time.time()),finish_status=0)
  186. logging.exception(f"[任务 {task_id}] 执行异常,设备: {device_id},错误: {e}")
  187. finally:
  188. with dispatch_lock:
  189. running_task_ids.discard(task_id)
  190. running_device_ids.discard(device_id)
  191. worker_threads.pop(device_id, None)
  192. def dispatch_pending_tasks():
  193. cleanup_finished_workers()
  194. task_payloads = fetch_runnable_task_payloads()
  195. if not task_payloads:
  196. return
  197. for task_payload in task_payloads:
  198. device_id = task_payload["device_id"]
  199. try:
  200. thread = threading.Thread(
  201. target=run_task_worker,
  202. args=(task_payload,),
  203. daemon=True,
  204. name=f"pdd-{device_id}",
  205. )
  206. with dispatch_lock:
  207. worker_threads[device_id] = thread
  208. thread.start()
  209. logging.info(f"[任务 {task_payload['task_id']}] 已分发到设备 {device_id}")
  210. except Exception:
  211. with dispatch_lock:
  212. running_task_ids.discard(task_payload["task_id"])
  213. running_device_ids.discard(device_id)
  214. worker_threads.pop(device_id, None)
  215. raise
  216. def schedule_dispatch(delay_seconds=SCHEDULER_INTERVAL_SECONDS):
  217. global scheduler_timer
  218. if scheduler_stop_event.is_set():
  219. return
  220. scheduler_timer = threading.Timer(delay_seconds, scheduled_dispatch_job)
  221. scheduler_timer.daemon = False
  222. scheduler_timer.name = "pdd-scheduler"
  223. scheduler_timer.start()
  224. def scheduled_dispatch_job():
  225. try:
  226. dispatch_pending_tasks()
  227. except Exception as e:
  228. logging.exception(f"PDD 定时调度异常: {e}")
  229. finally:
  230. schedule_dispatch(SCHEDULER_INTERVAL_SECONDS)
  231. def report_api(task_id,page=None,start=None,end_page=None,end_time=None,finish_status=None):
  232. params = {
  233. "collect_task_allocate_id": task_id,
  234. "statr_page":page if page is not None else '',
  235. "end_page": end_page if end_page is not None else '',
  236. "status": start,
  237. "finish_status": finish_status if finish_status is not None else 0,
  238. "start_time": int(time.time()),
  239. "end_time": end_time if end_time is not None else '',
  240. }
  241. print(params)
  242. url = "http://schedule.dfwy.tech/api/collect_equipment_execute/result_report"
  243. res = requests.get(url, params=params, timeout=20)
  244. print(res.text)
  245. # 获取滑块验证中滑块需要移动的距离
  246. def slide_verify(img_path):
  247. with open(img_path, 'rb') as f:
  248. b = base64.b64encode(f.read()).decode() ## 图片二进制流base64字符串
  249. url = "http://api.jfbym.com/api/YmServer/customApi"
  250. data = {
  251. ## 关于参数,一般来说有3个;不同类型id可能有不同的参数个数和参数名,找客服获取
  252. "token": "1nDVocTE2mJ0yLEYb2sZJ5uUY2VIEoGTkIpW44X7Kgk",
  253. "type": "22222",
  254. "images": b,
  255. }
  256. _headers = {
  257. "Content-Type": "application/json"
  258. }
  259. response = requests.request("POST", url, headers=_headers, json=data).json()
  260. print(response)
  261. if response.get("msg") == "识别成功":
  262. # 获取 data 中的 data 字段
  263. result = response.get("data", {}).get("data")
  264. if result:
  265. print(result) # 输出结果
  266. else:
  267. print("无法获取数据")
  268. else:
  269. print("识别未成功")
  270. return result
  271. class PDD:
  272. def __init__(
  273. self,
  274. search_key,
  275. device_id,
  276. title_key=None,
  277. spec_list=None,
  278. brand="",
  279. save_search_key=None,
  280. start_page=0,
  281. end_page=None,
  282. max_counts_limit=None,
  283. direct_shop_lookup=False,
  284. sort=None,
  285. platform = None,
  286. task_id = None,
  287. enterprise_id=None,
  288. ):
  289. self.package_name = 'com.xunmeng.pinduoduo'
  290. self.APP_ID = '116857964'
  291. self.API_KEY = '1gAzACJOAr7BeILKqkqPOETh'
  292. self.SECRET_KEY = 'ZNArANb9GwJYgLKg4EfYhukKBfPdl1n3'
  293. self.client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
  294. self.table_name = "retrieve_scrape_data_999" # "pdd_drug"
  295. self.shop_table_name = "retrieve_scrape_data" # "pdd_shop_info"
  296. self.loggerPdd = logging.getLogger()
  297. self.clipboard = "" # 初始化剪切板的内容为空
  298. self.enterprise_id = enterprise_id
  299. self.task_id = task_id
  300. self.platform = platform
  301. self.sort = sort
  302. self.sort_key = 0
  303. self.search_key = search_key # 参苓健脾胃颗粒 香砂平胃颗粒 舒肝颗粒 清肺化痰丸
  304. self.title_key = title_key if title_key is not None else search_key
  305. self.spec_list = self._normalize_rule_list(spec_list)
  306. self.brand = brand
  307. self.save_search_key = save_search_key or search_key
  308. raw_start_page = parse_optional_int(start_page, 0)
  309. raw_end_page = parse_optional_int(end_page, None)
  310. if raw_end_page is not None and raw_end_page < raw_start_page:
  311. logging.warning(
  312. f"检测到页码配置可能反了,自动交换: start_page={raw_start_page}, end_page={raw_end_page}"
  313. )
  314. raw_start_page, raw_end_page = raw_end_page, raw_start_page
  315. self.start_page = max(raw_start_page, 0)
  316. self.end_page = raw_end_page
  317. self.max_counts_limit = max_counts_limit
  318. self.direct_shop_lookup = direct_shop_lookup
  319. self.unrelated_data = 0 # 无关数据数量
  320. self.device_id = device_id
  321. self.page = self.start_page
  322. if self.end_page is not None and self.end_page < self.start_page:
  323. self.end_page = self.start_page
  324. # 统计售罄数量
  325. self.sold_out_counts = 0
  326. # 程序启动时间
  327. self.program_start_time = self.app_start_time()
  328. # 统计商品数量
  329. # 最大量数据阈值
  330. self.max_counts = 0
  331. # 统计点击商品的次数
  332. self.click_counts = 0
  333. # 商品在列表的位置
  334. self.search_key_loc = 0
  335. self.finish_reported = False
  336. # oss配置
  337. self.oss_config = {
  338. "access_key_id": Config.access_key_id,
  339. "access_key_secret": Config.access_key_secret,
  340. "endpoint": Config.endpoint, # 例: oss-cn-beijing.aliyuncs.com
  341. "bucket_name": Config.bucket_name,
  342. "oss_prefix": Config.oss_prefix # OSS中存放截图的前缀(虚拟文件夹)
  343. }
  344. # 异常处理
  345. def wr_re(self, mod, device_id, sort=None, page=None):
  346. file_path = f'./ycwj/{device_id}_{self.title_key}.txt'
  347. if mod == "写":
  348. try:
  349. data = {
  350. "page": page if page else "",
  351. "sort": sort if sort else "",
  352. }
  353. os.makedirs(os.path.dirname(file_path), exist_ok=True)
  354. with open(file_path, 'w', encoding='utf-8') as f:
  355. json.dump(data, f, ensure_ascii=False, indent=2)
  356. print(f"进度保存成功:{sort},{page}页")
  357. except Exception as e:
  358. print("保存进度失败")
  359. elif mod == "读":
  360. try:
  361. if not os.path.exists(file_path):
  362. return None
  363. with open(file_path, 'r', encoding='utf-8') as f:
  364. data = json.load(f)
  365. print(self.sort)
  366. if self.sort and self.sort_key == 0:
  367. self.li_or_lo(self.sort)
  368. if data['page'] != '':
  369. progress_page = int(data['page'])
  370. self.page = max(progress_page, self.start_page)
  371. self.scroll_to_target_page(self.page)
  372. else:
  373. return None
  374. return data
  375. except Exception as e:
  376. print(f"读取进度失败", e)
  377. return None
  378. elif mod == "删":
  379. try:
  380. if os.path.exists(file_path):
  381. os.remove(file_path)
  382. print(f"进度文件已删除:{file_path}")
  383. except Exception as e:
  384. print(f"删除进度文件失败:{e}")
  385. return None
  386. def clear_progress_file(self):
  387. # self.wr_re("删", self.device_id, self.sort)
  388. pass
  389. def is_max_count_reached(self):
  390. return bool(self.max_counts_limit and self.max_counts >= self.max_counts_limit)
  391. def scroll_to_target_page(self, target_page):
  392. target_page = int(target_page or 0)
  393. if target_page <= 0:
  394. return
  395. for _ in range(target_page):
  396. end_y = 300
  397. self.d.swipe(200, 1400, 200, end_y, 0.4)
  398. time.sleep(self.get_sleep_time())
  399. def finish_task_normally(self, end_page, reason):
  400. if not self.finish_reported:
  401. if self.task_id:
  402. report_api(self.task_id, end_page=end_page, start=3, end_time=int(time.time()),finish_status=1)
  403. self.finish_reported = True
  404. print(reason)
  405. return True
  406. def finish_task_abnormally(self, end_page, reason, finish_status=0):
  407. if not self.finish_reported:
  408. if self.task_id:
  409. report_api(
  410. self.task_id,
  411. end_page=end_page,
  412. start=4,
  413. end_time=int(time.time()),
  414. finish_status=finish_status
  415. )
  416. self.finish_reported = True
  417. print(reason)
  418. return False
  419. def finish_task_with_max_count(self, end_page):
  420. return self.finish_task_normally(
  421. end_page,
  422. f"达到最大采集数量 {self.max_counts_limit},当前已采集 {self.max_counts} 条,停止任务"
  423. )
  424. # 排序
  425. def li_or_lo(self, key):
  426. if key == "升序":
  427. self.sort_key += 1
  428. self.d.xpath('//*[@text="价格"]').click()
  429. n = self.d.xpath('//*[@text="总价低到高"]')
  430. if n.exists:
  431. n.click()
  432. time.sleep(self.get_sleep_time())
  433. if key == "降序":
  434. self.sort_key += 1
  435. self.d.xpath('//*[@text="价格"]').click()
  436. n = self.d.xpath('//*[@text="单粒价格低到高"]')
  437. if n:
  438. n.click()
  439. else:
  440. self.d.xpath('//*[@text="价格"]').click()
  441. # 返回列表页
  442. def back_to_list_page(self):
  443. for i in range(10):
  444. if self.distinct_target():
  445. return True
  446. print(f'第{i}次尝试退回到列表页')
  447. self.swipe_back(1)
  448. time.sleep(1)
  449. print('页面出错,没有退回到列表页')
  450. return False
  451. def get_drug_lis(self, idx):
  452. if idx == 0:
  453. drug_lis = self.d.xpath(
  454. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout').all()
  455. else:
  456. for i in range(1, 6):
  457. drug_lis = self.d.xpath(
  458. f'/hierarchy/android.widget.FrameLayout[{i}]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout').all()
  459. if drug_lis:
  460. break
  461. return drug_lis
  462. # 代码运行那时候的时间
  463. def app_current_time(self):
  464. return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  465. def slide_link(self):
  466. value_tag = None
  467. if self.d.xpath('//*[@text="微信"]').exists:
  468. value_tag = self.d.xpath('//*[@text="微信"]').info['bounds']
  469. self.d.swipe(400, value_tag['top'], 100, value_tag['top'], 0.3)
  470. return
  471. if self.d.xpath('//*[@text="朋友圈"]').exists:
  472. value_tag = self.d.xpath('//*[@text="朋友圈"]').info['bounds']
  473. self.d.swipe(400, value_tag['top'], 100, value_tag['top'], 0.3)
  474. return
  475. if self.d.xpath('//*[@text="QQ好友"]').exists:
  476. value_tag = self.d.xpath('//*[@text="QQ好友"]').info['bounds']
  477. self.d.swipe(400, value_tag['top'], 100, value_tag['top'], 0.3)
  478. return
  479. def app_start_time(self):
  480. """
  481. 获取app启动时间
  482. :return:
  483. """
  484. return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  485. def stop_app(self):
  486. self.d.app_stop(self.package_name)
  487. time.sleep(5)
  488. def start_app(self):
  489. self.d.app_start(self.package_name)
  490. time.sleep(5)
  491. def restart_app(self):
  492. """
  493. 重启app
  494. :return:
  495. """
  496. self.stop_app()
  497. self.start_app()
  498. @staticmethod
  499. def get_sleep_time():
  500. return random.randint(1, 2)
  501. # return random.randint(5, 8)
  502. @staticmethod
  503. def get_current_date():
  504. return datetime.datetime.now().strftime('%Y/%m/%d')
  505. @staticmethod
  506. def _normalize_rule_list(value):
  507. if value is None:
  508. return []
  509. if isinstance(value, (list, tuple, set)):
  510. raw_values = value
  511. else:
  512. raw_values = [value]
  513. result = []
  514. for item in raw_values:
  515. item_str = str(item).strip()
  516. if item_str:
  517. result.append(item_str)
  518. return result
  519. @staticmethod
  520. def _normalize_match_text(value):
  521. return re.sub(r'\s+', '', str(value or '')).lower()
  522. def _match_any_keyword(self, text, keywords):
  523. keyword_list = self._normalize_rule_list(keywords)
  524. if not keyword_list:
  525. return True
  526. normalized_text = self._normalize_match_text(text)
  527. return any(self._normalize_match_text(keyword) in normalized_text for keyword in keyword_list)
  528. def is_link_spec_useful(self, product_title, specifications=''):
  529. if not self.spec_list:
  530. return True
  531. title_text = self._normalize_match_text(product_title)
  532. spec_text = self._normalize_match_text(specifications)
  533. for spec in self.spec_list:
  534. normalized_spec = self._normalize_match_text(spec)
  535. if normalized_spec in title_text or normalized_spec in spec_text:
  536. return True
  537. return False
  538. def is_link_useful(self, product_title, specifications=''):
  539. if not self._match_any_keyword(product_title, self.title_key):
  540. print(f"当前商品名称:{product_title} 不包含{self.title_key}关键字")
  541. return False
  542. if not self._match_any_keyword(product_title, self.brand):
  543. print(f"当前商品名称:{product_title} 不包含{self.brand}品牌")
  544. return False
  545. if not self.is_link_spec_useful(product_title, specifications):
  546. print(f"当前商品名称:{product_title} 不包含{self.spec_list}品规")
  547. return False
  548. return True
  549. def remove_watermark(self, img_path):
  550. """
  551. 图片去水印(将水印部分变成白色背景)并将数据转化为二进制数据
  552. :param img_path: 图片路径
  553. :return: 二进制图片数据
  554. """
  555. img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
  556. endswith = os.path.splitext(img_path)[1]
  557. new = np.clip(1.4057577998008846 * img - 38.33089999653017, 0, 255).astype(np.uint8)
  558. _, img_binary = cv2.imencode(endswith, new)
  559. return img_binary
  560. def get_shop_name(self):
  561. """
  562. 获取店铺名
  563. :return:
  564. """
  565. try:
  566. xpath = '//*[@text="进店"]/preceding-sibling::android.view.ViewGroup/android.widget.LinearLayout/android.widget.TextView'
  567. if self.d.xpath(xpath).exists:
  568. shop_name = self.d.xpath(xpath).text
  569. self.loggerPdd.info(f'1-获取到店铺名:{shop_name}')
  570. else:
  571. # 进入店铺新页面
  572. shop_btn_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]'
  573. if self.d.xpath(shop_btn_xpath).exists:
  574. self.d.xpath(shop_btn_xpath).click()
  575. time.sleep(1)
  576. # self.d.xpath('//*[@text="店铺"]').click()
  577. xpath_shop_name = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.TextView[1]'
  578. if self.d.xpath(xpath_shop_name).exists:
  579. shop_name = self.d.xpath(xpath_shop_name).text
  580. self.loggerPdd.info(f'2-获取到店铺名:{shop_name}')
  581. else:
  582. shop_name = ''
  583. self.loggerPdd.info(f'3-获取到店铺名:{shop_name}')
  584. self.swipe_back(1) #
  585. else:
  586. shop_name = ''
  587. self.loggerPdd.info('4-因为shop_btn_xpath不存在,获取到店铺名为空')
  588. # time.sleep(10000)
  589. return shop_name
  590. except Exception as e:
  591. print(f'获取店铺名出错:{e}')
  592. self.loggerPdd.error(f'获取店铺名出错:{e}')
  593. return None
  594. def save_to_database(self, data):
  595. print(f'保存数据到数据库:{data}')
  596. max_retries = 5
  597. for attempt in range(max_retries):
  598. conn = None
  599. try:
  600. conn = get_mysql()
  601. with conn.cursor() as cur:
  602. add_sql = """
  603. INSERT INTO retrieve_scrape_data_999 (
  604. enterprise_id, platform_id, platform_item_id, province_id, city_id,
  605. province_name, city_name, area_info, product_name, product_specs,
  606. one_box_price, manufacture_date, expiry_date, manufacturer, approval_number,
  607. is_sold_out, online_posting_count, continuous_listing_count, link_url,
  608. store_name, store_url, shipment_province_id, shipment_province_name,
  609. shipment_city_id, shipment_city_name, company_name, qualification_number,
  610. scrape_date, min_price, number, sales, inventory, snapshot_url
  611. ) VALUES (
  612. %s, %s, %s, %s, %s,
  613. %s, %s, %s, %s, %s,
  614. %s, %s, %s, %s, %s,
  615. %s, %s, %s, %s,
  616. %s, %s, %s, %s,
  617. %s, %s, %s, %s,
  618. %s, %s, %s, %s, %s, %s
  619. )
  620. """
  621. cur.execute(add_sql, (
  622. data['enterprise_id'],
  623. data['platform_id'],
  624. data['platform_item_id'],
  625. data['province_id'],
  626. data['city_id'],
  627. data['province_name'],
  628. data['city_name'],
  629. data['area_info'],
  630. data['product_name'],
  631. data['product_specs'],
  632. data['one_box_price'],
  633. data['manufacture_date'],
  634. data['expiry_date'],
  635. data['manufacturer'],
  636. data['approval_number'],
  637. data['is_sold_out'],
  638. data['online_posting_count'],
  639. data['continuous_listing_count'],
  640. data['link_url'],
  641. data['store_name'],
  642. data['store_url'],
  643. data['shipment_province_id'],
  644. data['shipment_province_name'],
  645. data['shipment_city_id'],
  646. data['shipment_city_name'],
  647. data['company_name'],
  648. data['qualification_number'],
  649. data['scrape_date'],
  650. data['min_price'],
  651. data['number'],
  652. data['sales'],
  653. data['inventory'],
  654. data['snapshot_url'],
  655. ))
  656. conn.commit()
  657. self.max_counts += 1
  658. print(f"存入数据库成功,当前已采集 {self.max_counts} 条")
  659. return True
  660. except Exception as e:
  661. print(f'保存数据库异常 (尝试 {attempt + 1}/{max_retries}): {e}')
  662. if conn:
  663. conn.rollback()
  664. conn.close()
  665. if attempt == max_retries - 1:
  666. print("达到最大重试次数,保存失败")
  667. return False
  668. time.sleep(2)
  669. def click_target_product_by_search_key(self, fuzzy_match=False, timeout=10):
  670. """
  671. 动态匹配self.search_key对应的商品并点击
  672. :param fuzzy_match: 是否模糊匹配(应对商品名带额外后缀/前缀的情况) 不模糊匹配
  673. :param timeout: 等待元素出现的超时时间(秒)
  674. :return: 点击是否成功(bool)
  675. """
  676. try:
  677. # 1. 定义定位条件(动态使用self.search_key)
  678. if fuzzy_match:
  679. # 模糊匹配:包含search_key即可(推荐,适配搜索结果商品名略有差异)
  680. locator = self.d(textContains=self.search_key)
  681. print(f"🔍 模糊匹配商品:包含「{self.search_key}」的元素")
  682. else:
  683. # 精确匹配:商品名与search_key完全一致
  684. locator = self.d(text=self.search_key)
  685. print(f"🔍 精确匹配商品:「{self.search_key}」")
  686. # 2. 等待元素出现(核心:避免元素未加载就点击)
  687. if locator.wait(timeout=timeout):
  688. print(f"✅ 找到匹配的商品,准备点击")
  689. # 执行点击(优先点击可点击的元素)
  690. locator.click()
  691. print(f"✅ 成功点击「{self.search_key}」对应的商品")
  692. # 点击后等待页面加载
  693. time.sleep(self.get_sleep_time())
  694. return True
  695. else:
  696. print(f"❌ 滑动后仍未找到「{self.search_key}」对应的商品")
  697. return False
  698. except Exception as e:
  699. print(f"❌ 点击「{self.search_key}」对应商品时异常:{e}")
  700. return False
  701. def swipe_down(self):
  702. """
  703. 下滑(模拟真人操作,抗风控+设备适配+容错)
  704. 核心:起点在屏幕上方,终点在屏幕下方(和上滑相反)
  705. :return: None
  706. """
  707. try:
  708. # 1. 获取屏幕尺寸(兼容不同设备,给默认值避免获取失败)
  709. screen_width = self.d.info.get('displayWidth', 1080) # 默认1080px宽度
  710. screen_height = self.d.info.get('displayHeight', 2400) # 默认2400px高度
  711. # 2. 随机滑动时长(0.1~0.3秒,避免固定值被风控,且不设0秒)
  712. duration_rate = random.uniform(0.1, 0.3)
  713. # 3. 计算滑动坐标(用屏幕比例,适配所有设备)
  714. start_x = screen_width // 2 # 水平居中(和上滑一致,符合真人操作习惯)
  715. start_y = screen_height * 0.2 # 起点:屏幕20%高度(上方偏下)
  716. end_y = screen_height * 0.8 # 终点:屏幕80%高度(下方偏上)
  717. # 强制确保起点y < 终点y(必为向下滑,避免逻辑错误)
  718. start_y, end_y = min(start_y, end_y - 10), max(end_y, start_y + 10)
  719. # 4. 核心向下滑动操作
  720. self.d.swipe(start_x, start_y, start_x, end_y, duration=duration_rate)
  721. # 滑动后全局等待(确保页面加载,避免元素定位失败)
  722. time.sleep(self.get_sleep_time())
  723. except Exception as e:
  724. # 异常捕获:避免设备断开/滑动失败导致程序崩溃
  725. print(f"向下滑动失败:{e}")
  726. # 兜底方案:用固定坐标重试(适配主流1080x2400设备)
  727. self.d.swipe(540, 480, 540, 1920, duration=0.2)
  728. time.sleep(self.get_sleep_time())
  729. def swipe_up(self):
  730. """
  731. 上滑
  732. :return:
  733. """
  734. screen_width = self.d.info['displayWidth']
  735. screen_height = self.d.info['displayHeight']
  736. duration_rate = random.uniform(0, 0.3)
  737. self.d.swipe(screen_width // 2, screen_height - 100, screen_width // 2, 100, duration=duration_rate)
  738. no = random.uniform(0, 1)
  739. if no > 0.85:
  740. # 有的时候卡着 再稍微往上滑一点点
  741. self.d.swipe_ext("up", 0.1)
  742. time.sleep(self.get_sleep_time())
  743. def swipe_back(self, no):
  744. """
  745. 返回
  746. :param no: 回退次数
  747. :return:
  748. """
  749. if not self.distinct_target():
  750. for idx in range(no):
  751. self.d.press('back')
  752. time.sleep(self.get_sleep_time())
  753. def drug_price(self):
  754. """
  755. 获取药品价格
  756. :return:
  757. """
  758. try:
  759. xpath = '//*[@text="¥"]/following-sibling::android.widget.TextView[1]'
  760. price_str = self.d.xpath(xpath).text
  761. price = float(re.search(r'[\d\.]+', price_str).group())
  762. print(f'获取到价格:{price}')
  763. return float(price)
  764. except Exception as e:
  765. print(f'提取价格出错-->{e}')
  766. return None
  767. def drug_price_ex(self):
  768. price_str = '' # 价格初始化
  769. ext = '' # 初始化已选择的信息
  770. price = ''
  771. # 这是点击进入品规的按钮
  772. button_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[last()]'
  773. button_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[last()]'
  774. # 调试
  775. # test_button = self.d.xpath(button_xpath_1).exists
  776. # print(test_button)
  777. # test_button_2 = self.d.xpath(button_xpath_2).exists
  778. # print(test_button_2)
  779. # time.sleep(1000)
  780. # if self.d.xpath('//*[@text="发起拼单"]').exists:
  781. # self.d.xpath('//*[@text="发起拼单"]').click()
  782. # elif self.d.xpath('//*[@text="去复诊开药"]').exists:
  783. # self.d.xpath('//*[@text="去复诊开药"]').click()
  784. if self.d.xpath(button_xpath_1).exists:
  785. self.d.xpath(button_xpath_1).click()
  786. elif self.d.xpath(button_xpath_2).exists:
  787. self.d.xpath(button_xpath_2).click()
  788. else:
  789. print("button1 and button_2 all not exist")
  790. return price, ext
  791. select_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[last()]'
  792. select_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[last()]'
  793. select_xpath_3 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.TextView[last()]'
  794. select_xpath_3_2 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.TextView[last()-1]'
  795. price_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[1]'
  796. price_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[1]'
  797. price_xpath_3 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]//android.widget.TextView[1]'
  798. if self.d.xpath(select_xpath_1).exists:
  799. text1 = self.d.xpath(select_xpath_1).text
  800. print(f"select_xpath_1--text1={text1}")
  801. if '已选' in text1:
  802. if self.d.xpath(price_xpath_1).exists:
  803. price_str = self.d.xpath(price_xpath_1).text
  804. print(f"select_xpath_1--price_str-1={price_str}")
  805. else:
  806. print("select_xpath_1--price_xpath_1-1 not exist")
  807. ext = text1
  808. elif '请选择' in text1:
  809. # 需要再下面点击选择
  810. scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
  811. scroll_xpath_2 = ''
  812. if self.d.xpath(scroll_xpath_1).exists:
  813. self.d.xpath(scroll_xpath_1).click()
  814. time.sleep(2) # 延时2秒钟,选择了之后价格会刷新
  815. if self.d.xpath(select_xpath_1).exists:
  816. text2 = self.d.xpath(select_xpath_1).text
  817. if '已选' in text2:
  818. print(f"select_xpath_1--已选择2:text2={text2}")
  819. if self.d.xpath(price_xpath_1).exists:
  820. price_str = self.d.xpath(price_xpath_1).text
  821. print(f"select_xpath_1--price_str-2={price_str}")
  822. else:
  823. print("select_xpath_1--price_xpath_1-2 not exist")
  824. ext = text2
  825. else:
  826. print("select_xpath_1--scroll_xpath_1 not exist")
  827. elif self.d.xpath(select_xpath_2).exists:
  828. text1 = self.d.xpath(select_xpath_2).text
  829. print(f"xpath2--text1={text1}")
  830. if '已选' in text1:
  831. ext = text1
  832. if self.d.xpath(price_xpath_2).exists:
  833. price_str = self.d.xpath(price_xpath_2).text
  834. print(f"select_xpath_2--price_str-2={price_str}")
  835. else:
  836. print("select_xpath_2--price_xpath_2-1 not exist")
  837. elif '请选择' in text1:
  838. print('come in here')
  839. # 需要再下面点击选择
  840. scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
  841. if self.d.xpath(scroll_xpath_1).exists:
  842. print("scroll_xpath_1 exists")
  843. self.d.xpath(scroll_xpath_1).click()
  844. time.sleep(2) # 延时2秒钟,选择了之后价格可能会刷新
  845. if self.d.xpath(select_xpath_2).exists:
  846. text2 = self.d.xpath(select_xpath_2).text
  847. if '已选' in text2:
  848. ext = text2
  849. print(f"select_xpath_2--已选择2:text2={text2}")
  850. if self.d.xpath(price_xpath_2).exists:
  851. price_str = self.d.xpath(price_xpath_2).text
  852. print(f"select_xpath_2--price_str-2={price_str}")
  853. else:
  854. print("select_xpath_2--price_xpath_2-2 not exist")
  855. else:
  856. print("scroll_xpath_1 not exists")
  857. else:
  858. print("not exist 请选择 or 已选")
  859. elif self.d.xpath(select_xpath_3).exists:
  860. text1 = self.d.xpath(select_xpath_3).text
  861. print(f"xpath3--text1-1={text1}")
  862. if ('请选择' not in text1) and ('已选' not in text1):
  863. text1 = self.d.xpath(select_xpath_3_2).text
  864. print(f"xpath3--text1-2={text1}")
  865. if '已选' in text1:
  866. ext = text1
  867. if self.d.xpath(price_xpath_3).exists:
  868. price_str = self.d.xpath(price_xpath_3).text
  869. print(f"select_xpath_3--price_str-3-3-1={price_str}")
  870. else:
  871. print("select_xpath_3--price_xpath_3-3-1 not exist")
  872. elif '请选择' in text1:
  873. print('come in here')
  874. # 需要再下面点击选择
  875. scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
  876. recycler_view_xpath = '//*[@resource-id="android:id/content"]//android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
  877. if self.d.xpath(scroll_xpath_1).exists:
  878. print("scroll_xpath_1 exists")
  879. self.d.xpath(scroll_xpath_1).click()
  880. time.sleep(2) # 延时2秒钟,选择了之后价格可能会刷新
  881. if self.d.xpath(select_xpath_3).exists:
  882. text2 = self.d.xpath(select_xpath_3).text
  883. if '已选' in text2:
  884. ext = text2
  885. print(f"select_xpath_3--已选择2:text2={text2}")
  886. if self.d.xpath(price_xpath_3).exists:
  887. price_str = self.d.xpath(price_xpath_3).text
  888. print(f"select_xpath_3--price_str-3-2={price_str}")
  889. else:
  890. print("select_xpath_3--price_xpath_3-3-2 not exist")
  891. elif self.d.xpath(recycler_view_xpath).exists:
  892. self.d.xpath(recycler_view_xpath).click()
  893. time.sleep(2) # 延时2秒钟,选择了之后价格可能会刷新
  894. if self.d.xpath(select_xpath_3).exists:
  895. text2 = self.d.xpath(select_xpath_3).text
  896. if '已选' in text2:
  897. ext = text2
  898. print(f"select_xpath_3--已选择2:text2={text2}")
  899. if self.d.xpath(price_xpath_3).exists:
  900. price_str = self.d.xpath(price_xpath_3).text
  901. print(f"select_xpath_3--price_str-3-3={price_str}")
  902. else:
  903. print("select_xpath_3--price_xpath_3-3-3 not exist")
  904. else:
  905. print("scroll_xpath_1 not exists")
  906. else:
  907. print(f"xpath3--text1-不包含请选择和已选择")
  908. else:
  909. print("select_xpath_1 and select_xpath_2 and select_xpath_3 all not exist")
  910. if price_str:
  911. # price = float(re.search('[\d\.]+', price_str).group())
  912. match = re.search(r'¥([\d\.]+)', price_str)
  913. if match:
  914. price = float(match.group(1))
  915. else:
  916. price = ''
  917. # price = float(re.search(r'¥([\d\.]+)', price_str).group(1))
  918. print(f'获取到价格:{price}')
  919. print(f"ext={ext}")
  920. self.swipe_back(1) #
  921. return price, ext
  922. def restart_uiautomator_services(self, device_id):
  923. """
  924. 重启atx的uiautomator 服务
  925. :param device_id:
  926. :return:
  927. """
  928. stop_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d --stop'
  929. start_uiautomator_services = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'
  930. subprocess.run(stop_uiautomator_services, capture_output=True, text=True, shell=True)
  931. time.sleep(self.get_sleep_time())
  932. subprocess.run(start_uiautomator_services, capture_output=True, text=True, shell=True)
  933. time.sleep(self.get_sleep_time())
  934. def connect_devices(self, device_id):
  935. """
  936. 连接设备
  937. :return:
  938. """
  939. try:
  940. self.d = u2.connect_usb(device_id)
  941. # 设置隐形等待时间
  942. # self.d.implicitly_wait(5)
  943. self.restart_uiautomator_services(device_id)
  944. print(f'[{self.program_start_time}]连接到设备:{device_id}')
  945. except Exception as e:
  946. print(f'{device_id} 连接错误: {e}')
  947. raise Exception(e)
  948. def get_ocr_res(self, img):
  949. try:
  950. image = self.remove_watermark(img)
  951. res_image = self.client.basicGeneral(image)
  952. data = res_image.get('words_result', '')
  953. print(f'百度api返回结果:{data}')
  954. return data
  955. except:
  956. return None
  957. def get_title(self):
  958. try:
  959. print('开始提取标题')
  960. time.sleep(self.get_sleep_time())
  961. title_xpath = '//*[@resource-id="com.xunmeng.pinduoduo:id/tv_title"]'
  962. if self.d.xpath(title_xpath).exists:
  963. title = self.d.xpath(title_xpath).info['contentDescription'].strip()
  964. else:
  965. return None
  966. # title = self.d.xpath('//*[@resource-id="com.xunmeng.pinduoduo:id/tv_title"]').info['contentDescription'].strip()
  967. print(f'提取到标题:{title}')
  968. return title
  969. except Exception as e:
  970. print(f'获取标题出错:{e}')
  971. return None
  972. # 从里面匹配出药品名和规格
  973. # drugs_name
  974. # specifications
  975. # match = re.search(r'([^\d]+)([\d\D]+)', title)
  976. # match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
  977. # if match:
  978. # drugs_name = match.group(1).strip() + match.group(2).strip()
  979. # specifications = match.group(3).strip()
  980. # print("药品名:", drugs_name)
  981. # print("规格:", specifications)
  982. # print('完整药名:', drugs_name + specifications)
  983. # return drugs_name, specifications
  984. # else:
  985. # print("没有匹配到预期格式")
  986. def enter_shop(self):
  987. """
  988. 进店,方便提取资质环境
  989. :return:
  990. """
  991. # self.d.xpath('//*[@text="进店"]').click()
  992. self.d.xpath('//*[@text="店铺"]').click()
  993. time.sleep(self.get_sleep_time())
  994. def data_is_exists(self, data):
  995. # 1. 验证必要字段
  996. required_keys = ['min_price', 'shop', 'scrape_date', 'platform']
  997. if not all(key in data for key in required_keys):
  998. missing = [key for key in required_keys if key not in data]
  999. print(f"缺少必要字段: {', '.join(missing)}")
  1000. return None
  1001. conn = None
  1002. try:
  1003. conn = get_mysql()
  1004. with conn.cursor() as cur:
  1005. query_sql = """
  1006. SELECT * FROM {}
  1007. WHERE min_price = %s
  1008. AND store_name = %s
  1009. AND scrape_date = %s
  1010. AND platform_id = %s
  1011. """.format(self.table_name)
  1012. cur.execute(query_sql, (
  1013. data['min_price'],
  1014. data['shop'],
  1015. data['scrape_date'],
  1016. data['platform']
  1017. ))
  1018. result = cur.fetchone()
  1019. return bool(result) # 如果存在返回True,否则False
  1020. except Exception as e:
  1021. print(f"MySQL 错误: {str(e)}")
  1022. finally:
  1023. if conn:
  1024. conn.close()
  1025. def get_instructions_data(self):
  1026. """
  1027. 确定有详情页之后之后,提取所有的详情页数据
  1028. :return:
  1029. """
  1030. for i in range(8):
  1031. if self.d.xpath('//*[@text="品牌"]').exists or self.d.xpath('//*[@text="药品通用名"]').exists:
  1032. self.d.swipe_ext("up", scale=0.1)
  1033. print('开始采集详情数据')
  1034. break
  1035. self.d.swipe_ext("up", scale=0.5)
  1036. time.sleep(self.get_sleep_time())
  1037. # 点击查看全部
  1038. if self.d.xpath('//*[@text="品牌"]').exists:
  1039. self.d.xpath('//*[@text="品牌"]').click()
  1040. else:
  1041. self.d.xpath('//*[@text="药品通用名"]').click()
  1042. time.sleep(self.get_sleep_time())
  1043. attr = dict()
  1044. # # 获取详情页信息
  1045. xpath = '//*[starts-with(@text,"商品参数")]/parent::*/parent::*/following-sibling::*/*/*/android.view.ViewGroup//android.widget.TextView'
  1046. ddd = self.d.xpath(xpath).all()
  1047. for i in range(0, len(ddd), 2):
  1048. group = ddd[i:i + 2]
  1049. attr[group[0].text] = group[1].text
  1050. # 截图获取未获取到的数据
  1051. # if not all(i in ['有效期', '生产企业', '批准文号', '药品规格', '产品规格'] for i in attr.keys()):
  1052. if not all(i in ['有效期', '生产企业', '批准文号', '药品规格'] for i in attr.keys()):
  1053. self.d.swipe_ext("up", 0.4)
  1054. time.sleep(self.get_sleep_time())
  1055. xpath = '//*[starts-with(@text,"商品参数")]/parent::*/parent::*/following-sibling::*/*/*/android.view.ViewGroup//android.widget.TextView'
  1056. ddd = self.d.xpath(xpath).all()
  1057. for i in range(0, len(ddd), 2):
  1058. group = ddd[i:i + 2]
  1059. attr[group[0].text] = group[1].text
  1060. print(f'当前说明书规格参数:{attr}')
  1061. res_data = {
  1062. # "有效期": attr.get('有效期',''),
  1063. # "生产单位": attr['生产企业'],
  1064. # "批准文号": attr['批准文号'],
  1065. # "产品规格": attr.get('药品规格') if attr.get('药品规格', '') else attr.get('药品规格')
  1066. "有效期": attr.get('有效期', ''),
  1067. "生产单位": attr.get('生产企业', ''),
  1068. "批准文号": attr.get('批准文号', ''),
  1069. "产品规格": attr.get('药品规格', '')
  1070. }
  1071. print(f'当前规格参数字典数据:{res_data}')
  1072. return res_data
  1073. def has_instructions(self):
  1074. """
  1075. 是否有详情页
  1076. :return:如果有详情页返回True,否则返回False
  1077. """
  1078. # 没有说明书的无法采集具体数据
  1079. max_attempts = 12 # 最大尝试次数
  1080. attempt = 0 # 当前尝试次数
  1081. while attempt < max_attempts:
  1082. time.sleep(0.5)
  1083. xpath = '//*[@text="商品详情"]'
  1084. is_has_instructions = self.d.xpath(xpath).exists
  1085. if is_has_instructions:
  1086. return True # 如果找到“商品详情”,则返回True
  1087. self.d.swipe_ext("up", 0.3)
  1088. attempt += 1
  1089. return False # 如果尝试次数达到最大次数,则返回False
  1090. def distinct_target(self):
  1091. result = False
  1092. is_position = self.d.xpath('//*[@content-desc="拍照搜索"]').exists
  1093. is_position2 = self.d.xpath('//*[@text="年货节大促"]').exists
  1094. is_position3 = self.d.xpath('//*[@text="筛选"]').exists
  1095. is_position4 = self.d.xpath('//*[@text="回头客常拼"]').exists
  1096. list_page_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]//android.support.v7.widget.RecyclerView[1]'
  1097. is_position_new = self.d.xpath(list_page_xpath).exists
  1098. print(f'is_position_new={is_position_new}')
  1099. if is_position or is_position2 or is_position3 or is_position4 or is_position_new:
  1100. result = True
  1101. return result
  1102. def enter_target_page(self):
  1103. self.d.xpath(
  1104. '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]').click()
  1105. time.sleep(self.get_sleep_time())
  1106. self.d(className='android.widget.EditText').click()
  1107. time.sleep(self.get_sleep_time())
  1108. self.d.send_keys(self.search_key, clear=True)
  1109. time.sleep(self.get_sleep_time())
  1110. self.d.xpath('//*[@text="搜索"]').click()
  1111. time.sleep(self.get_sleep_time())
  1112. if self.sort and self.sort_key == 0:
  1113. self.li_or_lo(self.sort)
  1114. # progress = self.wr_re("读", self.device_id)
  1115. progress = None
  1116. if not progress and self.page > 0:
  1117. self.scroll_to_target_page(self.page)
  1118. def get_clipboard(self):
  1119. self.loggerPdd.info(f"Clipboard content:{self.d.clipboard}") # 打印调试信息
  1120. clipboard_content = self.d.clipboard
  1121. if clipboard_content is None:
  1122. return ''
  1123. return clipboard_content.strip()
  1124. def get_product_link(self):
  1125. product_link = ''
  1126. print('开始获取商品链接')
  1127. content_frame = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]').exists
  1128. print(content_frame)
  1129. relative_layout = self.d.xpath(
  1130. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]').exists
  1131. print(relative_layout)
  1132. relative_layout2 = self.d.xpath(
  1133. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]').exists
  1134. print(relative_layout2)
  1135. Frame_Layout = self.d.xpath(
  1136. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[2]').exists
  1137. print(Frame_Layout)
  1138. ImageView = self.d.xpath(
  1139. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[2]/android.view.View[1]').exists
  1140. print(ImageView)
  1141. ImageView2 = self.d.xpath(
  1142. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[3]/android.view.View[1]').exists
  1143. print(ImageView2)
  1144. # 多种可能的“分享”按钮
  1145. dots_xpaths = [
  1146. # '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[2]/android.view.View[1]',
  1147. '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[last()]/android.view.View[1]',
  1148. # '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[2]/android.view.View[1]',
  1149. # '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[2]/android.widget.RelativeLayout[2]/android.widget.FrameLayout[3]/android.widget.ImageView[1]',
  1150. ]
  1151. max_retry = 5 # 最多尝试次数
  1152. for idx in range(1, max_retry + 1):
  1153. if product_link: # 已经拿到则退出
  1154. break
  1155. for xp in dots_xpaths:
  1156. if self.d.xpath(xp).exists:
  1157. # print(f'{idx}-进入分享点点点')
  1158. self.loggerPdd.info(f'{idx}-进入分享点点点')
  1159. self.d.xpath(xp).click()
  1160. time.sleep(1)
  1161. self.loggerPdd.info('开始滑动')
  1162. self.slide_link()
  1163. time.sleep(0.2)
  1164. self.d.xpath('//*[@text="复制链接"]').click_exists()
  1165. time.sleep(1)
  1166. product_link = self.get_clipboard()
  1167. time.sleep(0.5)
  1168. self.loggerPdd.info(f'{idx}-商品链接:{product_link}')
  1169. break # 找到并执行后跳出内层循环
  1170. if not product_link and idx < max_retry:
  1171. time.sleep(0.5) # 最后一次不需要再等待
  1172. # time.sleep(100000)
  1173. return product_link
  1174. def integrate_data_v2(self):
  1175. """
  1176. 基于入口配置统一校验标题、品牌和品规,替代内部大量硬编码分支。
  1177. """
  1178. min_price, ext = self.drug_price_ex()
  1179. title_info = self.get_title()
  1180. if not title_info:
  1181. print('标题获取为空')
  1182. self.swipe_back(1)
  1183. return
  1184. if not self.is_link_useful(title_info):
  1185. self.swipe_back(1)
  1186. self.unrelated_data += 1
  1187. return
  1188. if not min_price:
  1189. min_price = self.drug_price()
  1190. if not min_price:
  1191. print('提取价格出错,回退到列表页')
  1192. self.swipe_back(1)
  1193. self.unrelated_data += 1
  1194. return
  1195. product_link = self.get_product_link()
  1196. time.sleep(2)
  1197. if self.direct_shop_lookup:
  1198. shop = self.get_shop_name()
  1199. else:
  1200. for _ in range(15):
  1201. if self.d(textStartsWith="进店").exists:
  1202. print('开始获取店铺名')
  1203. break
  1204. self.d.swipe_ext("up", scale=0.3)
  1205. time.sleep(self.get_sleep_time())
  1206. if self.d(textStartsWith="进店").exists:
  1207. print('可以开始获取店铺名')
  1208. shop = self.get_shop_name()
  1209. if not shop:
  1210. print('当前店铺名称为空')
  1211. self.swipe_back(1)
  1212. self.unrelated_data += 1
  1213. return
  1214. scrape_date = self.get_current_date()
  1215. dup_data = {
  1216. 'min_price': min_price,
  1217. 'shop': shop,
  1218. 'scrape_date': scrape_date,
  1219. 'platform': '3'
  1220. }
  1221. if self.data_is_exists(dup_data):
  1222. print('存在相同数据不入库')
  1223. self.back_to_list_page()
  1224. return
  1225. is_has_instructions = self.has_instructions()
  1226. self.loggerPdd.info(f'是否有说明书:{is_has_instructions}')
  1227. manufacture_date = ''
  1228. credit_code = ext
  1229. if is_has_instructions:
  1230. try:
  1231. instructions_info = self.get_instructions_data()
  1232. expiry_date = instructions_info['有效期'].strip('。')
  1233. manufacturer = instructions_info['生产单位'].strip('。')
  1234. approval_number = instructions_info['批准文号'].strip('。')
  1235. specifications = instructions_info['产品规格'].strip('。')
  1236. except Exception as e:
  1237. print(f'获取详情页规格参数出错:{e}')
  1238. self.swipe_back(2)
  1239. return
  1240. else:
  1241. expiry_date = ''
  1242. manufacturer = ''
  1243. approval_number = ''
  1244. specifications = ''
  1245. if not self.is_link_useful(title_info, specifications):
  1246. self.swipe_back(1)
  1247. self.unrelated_data += 1
  1248. return
  1249. self.unrelated_data = 0
  1250. if extract_box_number(credit_code):
  1251. one_box_price = min_price / extract_box_number(credit_code)
  1252. else:
  1253. print("单瓶药品价格没处理成功")
  1254. one_box_price = 0
  1255. save_data = {
  1256. 'enterprise_id': 6,
  1257. 'platform_id': 3,
  1258. 'platform_item_id': '',
  1259. 'province_id': 0,
  1260. 'city_id': 0,
  1261. 'province_name': '',
  1262. 'city_name': '',
  1263. 'area_info': "",
  1264. 'product_name': title_info,
  1265. 'product_specs': specifications,
  1266. 'one_box_price': one_box_price,
  1267. 'manufacture_date': manufacture_date,
  1268. 'expiry_date': expiry_date,
  1269. 'manufacturer': manufacturer,
  1270. 'approval_number': approval_number,
  1271. 'is_sold_out': 0,
  1272. 'online_posting_count': 1,
  1273. 'continuous_listing_count': 1,
  1274. 'link_url': product_link,
  1275. 'store_name': shop,
  1276. 'store_url': '',
  1277. 'shipment_province_id': 0,
  1278. 'shipment_province_name': "",
  1279. 'shipment_city_id': 0,
  1280. 'shipment_city_name': "",
  1281. 'company_name': "",
  1282. 'qualification_number': "",
  1283. 'scrape_date': scrape_date,
  1284. 'min_price': min_price,
  1285. 'number': 0,
  1286. 'sales': "",
  1287. 'inventory': "",
  1288. 'snapshot_url': "",
  1289. 'insert_time': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  1290. }
  1291. self.save_to_database(save_data)
  1292. def main(self, device_id, search_key_length, keyword_idx):
  1293. completed_normally = False
  1294. stop_by_max_count = False
  1295. spider_no = 0
  1296. current_page = self.page
  1297. self.connect_devices(device_id)
  1298. time.sleep(self.get_sleep_time())
  1299. if keyword_idx == 0:
  1300. print("搜索前,先重启APP")
  1301. self.restart_app()
  1302. # 搜索关键字
  1303. self.enter_target_page()
  1304. else:
  1305. print("清空前面的文字,再输入关键词")
  1306. self.d.send_keys(self.search_key, clear=True)
  1307. time.sleep(1)
  1308. print("点击搜索")
  1309. self.d.xpath('//*[@text="搜索"]').click()
  1310. time.sleep(1)
  1311. # 上报状态
  1312. if self.task_id:
  1313. report_api(self.task_id, self.page, 2,finish_status=0)
  1314. for idx in range(300):
  1315. print(f'第{current_page}页')
  1316. # self.wr_re("写", self.device_id, self.sort, current_page)
  1317. if spider_no > 30:
  1318. time.sleep(300)
  1319. spider_no = 0
  1320. if self.unrelated_data > 30:
  1321. print(f'[{self.program_start_time}]----{self.search_key}----连续超过30个不达标的数据则停止采集')
  1322. print(
  1323. f"[程序启动时间:{self.program_start_time}-----程序结束时间:{self.app_current_time()}]----搜索关键词:{self.search_key}----点击了{self.click_counts}个商品")
  1324. self.swipe_down()
  1325. time.sleep(self.get_sleep_time()) # 下滑后等待页面稳定
  1326. click_success = self.click_target_product_by_search_key(fuzzy_match=False)
  1327. if not click_success:
  1328. self.finish_task_abnormally(
  1329. current_page,
  1330. f"连续超过30个不达标的数据后,关键词「{self.search_key}」商品点击失败",
  1331. finish_status=1
  1332. )
  1333. return
  1334. print("点击搜索框")
  1335. self.d(className='android.widget.EditText').click()
  1336. time.sleep(self.get_sleep_time())
  1337. if keyword_idx == search_key_length - 1:
  1338. print("程序最后一个品规采集完毕,返回主屏幕")
  1339. completed_normally = self.finish_task_normally(
  1340. current_page,
  1341. '连续超过30个不达标的数据,结束采集'
  1342. )
  1343. else:
  1344. completed_normally = True
  1345. break
  1346. if self.is_max_count_reached():
  1347. completed_normally = self.finish_task_with_max_count(current_page)
  1348. # 向下滑
  1349. self.swipe_down()
  1350. time.sleep(self.get_sleep_time())
  1351. # 点击搜索框
  1352. click_success = self.click_target_product_by_search_key(fuzzy_match=False)
  1353. if not click_success:
  1354. print(f"关键词「{self.search_key}」商品点击失败")
  1355. return
  1356. print("点击搜索框")
  1357. self.d(className='android.widget.EditText').click()
  1358. time.sleep(self.get_sleep_time())
  1359. break
  1360. # 售罄次数大于4基本就是号废了但是如果下次点击不会出现这种情况就要重置为0
  1361. if self.sold_out_counts > 4:
  1362. self.finish_task_abnormally(
  1363. current_page,
  1364. "====商品已售罄4次,结束采集(号不能用)",
  1365. finish_status=1
  1366. )
  1367. print(
  1368. f"[程序启动时间:{self.program_start_time}-----程序结束时间:{self.app_current_time()}]----搜索关键词:{self.search_key}----点击了{self.click_counts}个商品")
  1369. break
  1370. drug_lis = self.get_drug_lis(idx)
  1371. print('数量', len(drug_lis))
  1372. for idd, drug_one in enumerate(drug_lis):
  1373. print(idd + 1, drug_one.info)
  1374. time.sleep(self.get_sleep_time())
  1375. top = drug_one.info['bounds']['top']
  1376. bottom = drug_one.info['bounds']['bottom']
  1377. if bottom <= 1524 and top >= 258:
  1378. drug_one.click()
  1379. self.click_counts += 1
  1380. time.sleep(self.get_sleep_time())
  1381. # 先判断是否售罄次数是否大于4
  1382. if self.sold_out_counts >= 4:
  1383. print(
  1384. f"[程序启动时间:{self.program_start_time}-----程序结束时间:{self.app_current_time()}]----搜索关键词:{self.search_key}----点击了{self.click_counts}个商品")
  1385. self.finish_task_abnormally(
  1386. current_page,
  1387. "====这是在第一页有两个,商品已售罄4次,结束采集(号不能用)====",
  1388. finish_status=1
  1389. )
  1390. time.sleep(self.get_sleep_time())
  1391. self.d.press('home')
  1392. return
  1393. if self.d.xpath('//*[contains(@text, "商品已售罄")]').wait(timeout=5):
  1394. print("======商品已售罄======")
  1395. self.sold_out_counts += 1
  1396. if self.back_to_list_page():
  1397. continue
  1398. # 采集药品信息
  1399. try:
  1400. # 重置商品售罄次数
  1401. self.sold_out_counts = 0
  1402. self.integrate_data_v2()
  1403. # 检测下是否回退到列表页
  1404. if self.back_to_list_page():
  1405. print('回退到列表页', True)
  1406. else:
  1407. print(f'[{self.app_current_time()}] 回退到列表页失败')
  1408. print(
  1409. f"[程序启动时间:{self.program_start_time}-----结束时间:{self.app_current_time()}]----搜索关键词:{self.search_key}----点击了{self.click_counts}个商品")
  1410. self.finish_task_abnormally(current_page, "回退到列表页失败,结束采集")
  1411. return
  1412. time.sleep(self.get_sleep_time())
  1413. spider_no += 1
  1414. if self.is_max_count_reached():
  1415. completed_normally = self.finish_task_with_max_count(current_page)
  1416. stop_by_max_count = True
  1417. break
  1418. except Exception as e:
  1419. self.loggerPdd.error(f'采集药品详情数据出错:{e}')
  1420. if not self.back_to_list_page():
  1421. self.finish_task_abnormally(current_page, '采集药品详情数据出错且无法回到列表页,结束采集')
  1422. return
  1423. else:
  1424. continue
  1425. if stop_by_max_count:
  1426. break
  1427. if self.end_page is not None and current_page >= self.end_page:
  1428. completed_normally = self.finish_task_normally(
  1429. current_page,
  1430. f"已采集到结束页 {self.end_page},结束任务"
  1431. )
  1432. break
  1433. if self.d(textStartsWith="抱歉,没有更多商品啦~").exists:
  1434. completed_normally = self.finish_task_normally(current_page, '已经到达列表页最底部')
  1435. break
  1436. print('开始滑入下一页')
  1437. end_y = 300
  1438. self.d.swipe(200, 1400, 200, end_y, 0.4)
  1439. time.sleep(self.get_sleep_time())
  1440. current_page += 1
  1441. self.page = current_page
  1442. if completed_normally:
  1443. self.clear_progress_file()
  1444. elif not self.finish_reported:
  1445. self.finish_task_abnormally(current_page, "采集流程异常结束")
  1446. return completed_normally
  1447. def run_device(device_id):
  1448. """单个设备的采集任务(运行于独立线程)"""
  1449. if device_id not in device_list:
  1450. logging.error(f"设备id没有配置: {device_id}")
  1451. return
  1452. tasks = device_list[device_id]
  1453. logging.info(f"[设备 {device_id}] 开始执行,共 {len(tasks)} 个任务")
  1454. for task in tasks:
  1455. cycle_no = 0
  1456. while True:
  1457. cycle_no += 1
  1458. pdd = None
  1459. logging.info(f'[设备 {device_id}] ========== {task.get("search_key")} 第 {cycle_no} 轮采集开始 ==========')
  1460. try:
  1461. pdd = PDD(
  1462. task["search_key"],
  1463. device_id,
  1464. title_key=task.get("title_key"),
  1465. spec_list=task.get("spec_list"),
  1466. brand=task.get("brand", ""),
  1467. save_search_key=task.get("save_search_key"),
  1468. start_page=task.get("start_page", 1),
  1469. end_page=task.get("end_page"),
  1470. max_counts_limit=task.get("max_counts_limit", DEFAULT_MAX_COUNTS_LIMIT),
  1471. direct_shop_lookup=task.get("direct_shop_lookup", False),
  1472. sort=task.get("sort", "升序"),
  1473. platform=task.get("platform"),
  1474. task_id=task.get("task_id"),
  1475. enterprise_id=task.get("enterprise_id"),
  1476. )
  1477. pdd.main(device_id, 1, 0)
  1478. logging.info(f'[设备 {device_id}] 关键字 {task.get("search_key")} 本轮采集完成')
  1479. break
  1480. except Exception as e:
  1481. logging.exception(f'[设备 {device_id}] 关键字 {task.get("search_key")} 采集异常:{e}')
  1482. finally:
  1483. if pdd and hasattr(pdd, 'close'):
  1484. pdd.close()
  1485. logging.info(f"[设备 {device_id}] 所有任务执行完毕")
  1486. def run_device_list_tasks(device_ids=None):
  1487. if device_ids is None:
  1488. device_ids = []
  1489. if not device_ids:
  1490. device_ids = list(device_list.keys())
  1491. logging.info("未指定设备ID,默认运行 device_list 中全部设备")
  1492. invalid_ids = [did for did in device_ids if did not in device_list]
  1493. if invalid_ids:
  1494. logging.error(f"以下设备ID未配置: {invalid_ids}")
  1495. return
  1496. if not device_ids:
  1497. logging.warning("device_list 为空,没有可运行的设备")
  1498. return
  1499. logging.info(f"将运行指定的设备: {device_ids}")
  1500. with ThreadPoolExecutor(max_workers=len(device_ids)) as executor:
  1501. futures = [executor.submit(run_device, did) for did in device_ids]
  1502. for future in futures:
  1503. future.result()
  1504. # pdd
  1505. def main(use_db_task_source=USE_DB_TASK_SOURCE):
  1506. if use_db_task_source:
  1507. logging.info(f"PDD 调度器启动,轮询间隔 {SCHEDULER_INTERVAL_SECONDS} 秒")
  1508. dispatch_pending_tasks()
  1509. schedule_dispatch(SCHEDULER_INTERVAL_SECONDS)
  1510. scheduler_stop_event.wait()
  1511. return
  1512. device_ids = sys.argv[1:]
  1513. run_device_list_tasks(device_ids)
  1514. device_list = {
  1515. "2e58510": [
  1516. {
  1517. "search_key": "999小儿感冒颗粒", # 必填
  1518. "title_key": "小儿感冒颗粒",
  1519. "spec_list": ["6g*24"], # 列表可以,代码会自动归一化
  1520. "brand": "999",
  1521. "save_search_key": "小儿感冒颗粒",
  1522. "start_page": 0,
  1523. "end_page": 200,
  1524. "max_counts_limit": 300,
  1525. "sort": "升序",
  1526. },
  1527. ],
  1528. "U47HZDRG8XJBBURW": [
  1529. {
  1530. "search_key": "999小儿感冒颗粒", # 必填
  1531. "title_key": "小儿感冒颗粒", #作为筛选依据
  1532. "spec_list": ["6g*24"], # 列表可以,代码会自动归一化
  1533. "brand": "999",
  1534. "save_search_key": "小儿感冒颗粒",
  1535. "start_page": 0,
  1536. "end_page": 200,
  1537. "max_counts_limit": 300,
  1538. "sort": "升序",
  1539. },
  1540. ],
  1541. "e2899b34": [
  1542. {
  1543. "search_key": "999小儿感冒颗粒", # 必填
  1544. "title_key": "小儿感冒颗粒", #作为筛选依据
  1545. "spec_list": ["6g*24"], # 列表可以,代码会自动归一化
  1546. "brand": "999",
  1547. "save_search_key": "小儿感冒颗粒",
  1548. "start_page": 0,
  1549. "end_page": 200,
  1550. "max_counts_limit": 300,
  1551. "sort": "升序",
  1552. },
  1553. ],
  1554. # "ZDQWUSSWBEDI896T":[
  1555. # {
  1556. # "search_key": "维生素D滴剂",
  1557. # "title_key": "维生素D滴剂",
  1558. # "spec_list": ["40粒"],
  1559. # "brand": "澳诺",
  1560. # "save_search_key": "维生素D滴剂",
  1561. # "start_page": 0,
  1562. # "end_page": 200,
  1563. # "max_counts_limit": 300,
  1564. # "sort": "升序",
  1565. # },
  1566. # ],
  1567. # "U47HZDRG8XJBBURW": [
  1568. # {
  1569. # "search_key": "维生素D滴剂",
  1570. # "title_key": "维生素D滴剂",
  1571. # "spec_list": ["10粒"],
  1572. # "brand": "澳诺",
  1573. # "save_search_key": "维生素D滴剂",
  1574. # "start_page": 0,
  1575. # "end_page": 200,
  1576. # "max_counts_limit": 300,
  1577. # "sort": "升序",
  1578. # },
  1579. # ],
  1580. }
  1581. if __name__ == '__main__':
  1582. main()