pdd_auto_scrape.py 163 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527
  1. import requests
  2. import base64
  3. import cv2
  4. import uiautomator2 as u2
  5. import time
  6. import subprocess
  7. import re
  8. import random
  9. import datetime
  10. import json
  11. from apscheduler.schedulers.blocking import BlockingScheduler
  12. from aip import AipOcr
  13. import numpy as np
  14. import cv2
  15. import os
  16. from config import Config
  17. import logging
  18. from logger import setup_logger
  19. import xml.etree.ElementTree as ET
  20. import secrets
  21. import threading
  22. from collections import deque
  23. from typing import Dict, Any
  24. import schedule
setup_logger("pdd_spider") # Initialise logging for the spider
  26. class SpiderMonitor(threading.Thread):
  27. """全局弹窗监控线程(增强版)"""
  28. def __init__(self, spider_instance):
  29. super().__init__(daemon=True)
  30. self.spider = spider_instance
  31. self.running = True
  32. self.pausing = threading.Event() # 主线程同步事件
  33. self.last_verification_time = 0
  34. self.verification_count = 0
  35. self.MAX_VERIFICATION_RETRY = 10
  36. self.recent_clicks = deque(maxlen=10) # 防重复点击
  37. self.logger = logging.getLogger("SpiderMonitor")
  38. # 可配置化弹窗规则
  39. self.popup_rules = {
  40. "simple": [
  41. ('//*[@text="确定"]', "点击确定"),
  42. ('//*[@text="允许"]', "点击允许"),
  43. ('//*[@text="关闭"]', "点击关闭"),
  44. ('//*[@resource-id="com.sankuai.meituan:id/close"]', "关闭按钮"),
  45. ('//*[@resource-id="com.sankuai.meituan:id/address_center_location_close"]', "关闭按钮"),
  46. ('//*[@resource-id="com.sankuai.meituan:id/location_close"]', "关闭按钮"),
  47. ],
  48. "verification": [
  49. '//*[contains(@text, "验证")]',
  50. '//*[contains(@text, "滑块")]',
  51. '//*[contains(@text, "依次点击")]',
  52. '//*[contains(@text, "请点击")]',
  53. '//*[contains(@text, "拖动滑块刚")]', #这个需要拖动滑块至最右边,然后再截图
  54. '//*[contains(@text, "请输入图片中的内容")]',
  55. '//*[contains(@text, "用最短线连接")]',
  56. '//*[contains(@text, "请按语序依次点击")]',
  57. '//*[contains(@text, "请向右滑动滑块")]',
  58. '//*[contains(@text, "请拖动下方滑块完成拼图")]',
  59. '//*[contains(@resource-id, "captcha")]'
  60. ]
  61. }
  62. def run(self):
  63. while self.running:
  64. try:
  65. handled = self.check_and_handle_popup()
  66. time.sleep(2 if handled else 1)
  67. except Exception as e:
  68. self.logger.exception("监控线程异常: %s", e)
  69. time.sleep(3)
  70. def _is_recent_click(self, xpath):
  71. """防止重复点击同一个弹窗"""
  72. key = f"{xpath}_{int(time.time())}"
  73. if key in self.recent_clicks:
  74. return True
  75. self.recent_clicks.append(key)
  76. return False
  77. def check_and_handle_popup(self):
  78. d = self.spider.d
  79. # 1. 处理简单弹窗
  80. for xpath, desc in self.popup_rules["simple"]:
  81. if d.xpath(xpath).exists and not self._is_recent_click(xpath):
  82. self.logger.info("检测到弹窗: %s", desc)
  83. d.xpath(xpath).click()
  84. return True
  85. # 2. 处理验证码弹窗
  86. for xpath in self.popup_rules["verification"]:
  87. if d.xpath(xpath).exists:
  88. now = time.time()
  89. if now - self.last_verification_time < 30:
  90. return False # 30秒内不重复触发
  91. self.last_verification_time = now
  92. self.verification_count += 1
  93. self.logger.warning("验证码弹窗触发,等待人工处理...")
  94. if self.verification_count > self.MAX_VERIFICATION_RETRY:
  95. self.logger.error("验证码重试超限,终止任务")
  96. self.spider.stop_all()
  97. return True
  98. self.pausing.set() # 通知主线程暂停
  99. d.toast.show("需要人工处理验证码", 120)
  100. # 等待人工处理
  101. start = time.time()
  102. # while time.time() - start < 120*60:
  103. # if not d.xpath(xpath).exists:
  104. # self.logger.info("验证码已处理")
  105. # d.toast.show("验证完成", 2)
  106. # self.pausing.clear() # 放行主线程
  107. # return True
  108. # time.sleep(5)
  109. while True:
  110. if not d.xpath(xpath).exists:
  111. self.logger.info("验证码已处理")
  112. d.toast.show("验证完成", 2)
  113. self.pausing.clear() # 放行主线程
  114. return True
  115. time.sleep(5)
  116. self.logger.warning("验证码超时,重启APP")
  117. self.spider.restart_app()
  118. return True
  119. # 3. 处理广告弹窗(点击右上角)
  120. if d.xpath('//*[contains(@text, "广告")]').exists:
  121. w, h = d.info['displayWidth'], d.info['displayHeight']
  122. d.click(w - 50, 50)
  123. self.logger.info("关闭广告弹窗")
  124. return True
  125. return False
  126. def stop(self):
  127. self.running = False
  128. def get_access_token():
  129. AppKey = "tRK2RhyItCSh6BzyT4CNVXQa"
  130. AppSrcret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  131. token_url = 'https://aip.baidubce.com/oauth/2.0/token'
  132. url = f"{token_url}?grant_type=client_credentials&client_id={AppKey}&client_secret={AppSrcret}"
  133. payload = ""
  134. headers = {
  135. 'Content-Type': 'application/json',
  136. 'Accept': 'application/json'
  137. }
  138. response = requests.request("POST", url, headers=headers, data=payload)
  139. try:
  140. return response.json()['access_token']
  141. except:
  142. return None
  143. def get_mysql():
  144. """
  145. 建立并返回一个到数据库的连接对象
  146. """
  147. import pymysql
  148. return pymysql.connect(
  149. host = Config.DB_HOST, #"localhost", # 修改后的主机
  150. port = Config.DB_PORT, #3306, # 添加端口号
  151. user = Config.DB_USER, #'root', # 修改后的用户名
  152. password = Config.DB_PASSWORD, # 修改后的密码
  153. db = Config.DB_NAME, #"drug_data", # 修改后的数据库名
  154. charset='utf8mb4'
  155. )
  156. #获取滑块验证中滑块需要移动的距离
  157. def slide_verify(img_path):
  158. with open(img_path, 'rb') as f:
  159. b = base64.b64encode(f.read()).decode() ## 图片二进制流base64字符串
  160. url = "http://api.jfbym.com/api/YmServer/customApi"
  161. data = {
  162. ## 关于参数,一般来说有3个;不同类型id可能有不同的参数个数和参数名,找客服获取
  163. "token": "1nDVocTE2mJ0yLEYb2sZJ5uUY2VIEoGTkIpW44X7Kgk",
  164. "type": "22222",
  165. "image": b,
  166. }
  167. _headers = {
  168. "Content-Type": "application/json"
  169. }
  170. response = requests.request("POST", url, headers=_headers, json=data).json()
  171. print(response)
  172. if response.get("msg") == "识别成功":
  173. # 获取 data 中的 data 字段
  174. result = response.get("data", {}).get("data")
  175. if result:
  176. print(result) # 输出结果
  177. else:
  178. print("无法获取数据")
  179. else:
  180. print("识别未成功")
  181. return result
  182. class TaskReporter:
  183. """任务上报管理器(线程安全)"""
  184. def __init__(self):
  185. self.tasks_data = {} # 存储每个任务的数据
  186. self.lock = threading.Lock()
  187. def start_task(self, task_id: int, start_page: int, end_page: int):
  188. """记录任务开始"""
  189. with self.lock:
  190. self.tasks_data[task_id] = {
  191. 'task_id': task_id,
  192. 'start_time': int(time.time()),
  193. 'end_time': None,
  194. 'start_page': start_page,
  195. 'end_page': end_page,
  196. 'actual_end_page': start_page, # 实际结束页数
  197. 'real_count': 0, # 实际采集数量
  198. 'status': 'running', # running, completed, failed
  199. 'finish_status': 0, # 0:未完成,1:已完成
  200. }
  201. def update_task_progress(self, task_id: int,
  202. actual_end_page: int = None,
  203. real_count: int = None):
  204. """更新任务进度(线程安全)"""
  205. with self.lock:
  206. if task_id in self.tasks_data:
  207. if actual_end_page is not None:
  208. self.tasks_data[task_id]['actual_end_page'] = actual_end_page
  209. if real_count is not None:
  210. self.tasks_data[task_id]['real_count'] = real_count
  211. def end_task(self, task_id: int, status: str = 'completed',
  212. finish_status: int = 0, force_end_page: int = None):
  213. """记录任务结束并上报"""
  214. with self.lock:
  215. if task_id in self.tasks_data:
  216. data = self.tasks_data[task_id]
  217. data['end_time'] = int(time.time())
  218. data['status'] = status
  219. data['finish_status'] = finish_status
  220. if force_end_page is not None:
  221. data['actual_end_page'] = force_end_page
  222. # 准备上报数据
  223. report_data = {
  224. "collect_task_allocate_id": data['task_id'],
  225. "status": 3 if data['status'] == 'completed' else 4,
  226. "finish_status": data['finish_status'],
  227. 'real_count': data['real_count'],
  228. 'start_time': data['start_time'],
  229. 'end_time': data['end_time'],
  230. 'start_page': data['start_page'],
  231. 'end_page': data['actual_end_page']
  232. }
  233. # 调用上报接口
  234. self._call_report_api(report_data)
  235. def _call_report_api(self, data: Dict[str, Any]):
  236. """调用上报接口"""
  237. try:
  238. url = 'http://schedule.dfwy.tech/api/collect_equipment_execute/result_report'
  239. resp = requests.post(url, json=data, timeout=10)
  240. if resp.status_code == 200:
  241. print(f"任务 {data['collect_task_allocate_id']} 上报成功")
  242. # self.loggerPdd.info(f"任务 {data['collect_task_allocate_id']} 上报成功")
  243. else:
  244. print(f"任务 {data['collect_task_allocate_id']} 上报失败: {resp.status_code}")
  245. # self.loggerPdd.info(f"任务 {data['collect_task_allocate_id']} 上报失败: {resp.status_code}")
  246. except Exception as e:
  247. print(f"上报接口调用异常: {e}")
  248. # 全局上报管理器
  249. reporter = TaskReporter()
  250. class PDD:
  251. def __init__(self, search_key, device_id):
  252. self.package_name = 'com.xunmeng.pinduoduo'
  253. self.APP_ID = '116857964'
  254. self.API_KEY = '1gAzACJOAr7BeILKqkqPOETh'
  255. self.SECRET_KEY = 'ZNArANb9GwJYgLKg4EfYhukKBfPdl1n3'
  256. self.client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
  257. self.city2province = self.get_city_info()
  258. # host = "localhost"
  259. # user = "root"
  260. # password = "dfwy2025"
  261. # database = "drug_data"
  262. # port = 3306
  263. # self.table_name = "mt_drug"
  264. self.table_name = Config.DB_PDD_AUTO_TABLE #"pdd_drug" #pdd_auto_drug_test
  265. self.shop_table_name = Config.DB_PDD_SHOP_TABLE #"pdd_shop_info"
  266. self.loggerPdd = logging.getLogger()
  267. self.clipboard = "" #初始化剪切板的内容为空
  268. self.access_token = get_access_token()
  269. self.search_key = search_key # 参苓健脾胃颗粒 香砂平胃颗粒 舒肝颗粒 清肺化痰丸
  270. self.unrelated_data = 0 # 无关数据数量
  271. self.device_id = device_id
  272. # === 新增:采集统计 ===
  273. self.collected_count = 0 # 实际采集的商品数量
  274. self.task_id = None # 任务ID
  275. self.start_time = None # 任务开始时间
  276. self.current_page = 0 # 当前页码
  277. self.task_start_page = 0 # 任务开始页码
  278. self.task_end_page = 0 # 任务结束页码
  279. # ====================
  280. def update_task_status(self, status):
  281. """更新任务状态到数据库"""
  282. if not self.task_id:
  283. return
  284. try:
  285. retrieve_conn = get_retrieve_mysql()
  286. cursor = retrieve_conn.cursor()
  287. update_time = time.time()
  288. update_sql = """
  289. UPDATE retrieve_collect_task_allocate
  290. SET status = %s, update_time = %s
  291. WHERE id = %s
  292. """
  293. cursor.execute(update_sql, (status,update_time, self.task_id))
  294. retrieve_conn.commit()
  295. self.loggerPdd.info(f"任务 {self.task_id} 状态更新为 {status}")
  296. except Exception as e:
  297. self.loggerPdd.error(f"更新任务状态失败: {e}")
  298. finally:
  299. if 'cursor' in locals():
  300. cursor.close()
  301. if 'retrieve_conn' in locals():
  302. retrieve_conn.close()
  303. def stop_app(self):
  304. self.d.app_stop(self.package_name)
  305. time.sleep(5)
  306. def start_app(self):
  307. self.d.app_start(self.package_name)
  308. time.sleep(5)
  309. def restart_app(self):
  310. """
  311. 重启app
  312. :return:
  313. """
  314. self.stop_app()
  315. self.start_app()
  316. @staticmethod
  317. def get_sleep_time():
  318. return random.randint(1, 3)
  319. # return random.randint(5, 8)
  320. @staticmethod
  321. def get_current_date():
  322. return datetime.datetime.now().strftime('%Y/%m/%d')
  323. @staticmethod
  324. def get_city_info():
  325. """
  326. 获取所有的省市数据
  327. :return:
  328. """
  329. file_path = '../kailin_city.json'
  330. with open(file_path, 'r', encoding='utf-8') as f:
  331. data = json.load(f)
  332. province = {province_one["id"]: province_one for province_one in data['province']}
  333. city2province = dict()
  334. city = data['city']
  335. for city_one in city:
  336. name = city_one['name']
  337. pid = city_one['pid']
  338. if len(str(pid)) > 2:
  339. pid = int(re.match('^\d{2}', str(pid)).group())
  340. city2province[name] = province[pid]['name']
  341. return city2province
  342. def remove_watermark(self, img_path):
  343. """
  344. 图片去水印(将水印部分变成白色背景)并将数据转化为二进制数据
  345. :param img_path: 图片路径
  346. :return: 二进制图片数据
  347. """
  348. img = cv2.imdecode(np.fromfile(img_path, dtype=np.uint8), -1)
  349. endswith = os.path.splitext(img_path)[1]
  350. new = np.clip(1.4057577998008846 * img - 38.33089999653017, 0, 255).astype(np.uint8)
  351. _, img_binary = cv2.imencode(endswith, new)
  352. return img_binary
def human_slide(self,start_x, start_y, end_x, end_y):
    """Generate a jittered 50-point path between two points and replay it as touch events.

    :param start_x: x coordinate of the starting point
    :param start_y: y coordinate of the starting point
    :param end_x: x coordinate of the end point
    :param end_y: y coordinate of the end point
    :return: None
    """
    # Build the trajectory with a varying speed profile.
    points = []
    total_steps = 50
    distance_x = end_x - start_x
    distance_y = end_y - start_y
    previous_x = start_x # last x value emitted, used by the monotonicity check below
    for i in range(total_steps):
        # Non-linear progress (described by the author as slow-fast-slow).
        ratio = (i / total_steps)
        if ratio < 0.3:
            progress = 0.5 * (ratio / 0.3)**2
        elif ratio < 0.7:
            progress = 0.5 + (ratio - 0.3) * 1.25
        else:
            progress = 0.9 + 0.5 * ((ratio - 0.7)/0.3)**0.5
        # Add random jitter.
        # offset_x = np.random.randint(-2, 3)
        # offset_y = np.random.randint(-2, 3)
        # np.random.randint excludes the upper bound, so each offset is -1 or 0.
        offset_x = np.random.randint(-1, 1) # limits the jitter range
        offset_y = np.random.randint(-1, 1)
        # progress is capped at 0.99, so the un-jittered point stops just short of the end.
        x = start_x + distance_x * min(progress, 0.99) + offset_x
        y = start_y + distance_y * min(progress, 0.99) + offset_y
        # Keep x from moving backwards, and never let it pass end_x.
        if x < previous_x and x < end_x:
            x = previous_x + 1
        if x > end_x:
            x = end_x
        previous_x = x
        points.append((x, y))
        # Per-step delay: 0.007 s at ratio 0 rising to 0.012 s at ratio 0.5.
        # NOTE(review): this sleep runs while the points are being computed,
        # before any touch event is sent - confirm that is intended.
        delay = 0.002 + 0.01 * (1 - abs(0.5 - ratio))
        time.sleep(delay)
    print(f"points: {points}")
    self.loggerPdd.info(f"points: {points}")
    # Replay the trajectory on the device.
    # NOTE(review): the touch.down call below is commented out, so move/up are
    # sent without a preceding down event from this method - confirm the
    # caller has already pressed down.
    # self.d.touch.down(points[0][0], points[0][1])
    for point in points[1:]:
        self.d.touch.move(point[0], point[1])
    self.d.touch.up(points[-1][0], points[-1][1])
    # Alternative that was tried previously:
    # self.d.swipe_points(points, duration=0.05)
  397. def get_shop_name(self):
  398. """
  399. 获取店铺名
  400. :return:
  401. """
  402. try:
  403. xpath = '//*[@text="进店"]/preceding-sibling::android.view.ViewGroup/android.widget.LinearLayout/android.widget.TextView'
  404. if self.d.xpath(xpath).exists:
  405. shop_name = self.d.xpath(xpath).text
  406. self.loggerPdd.info(f'1-获取到店铺名:{shop_name}')
  407. else:
  408. #进入店铺新页面
  409. shop_btn_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]'
  410. if self.d.xpath(shop_btn_xpath).exists:
  411. self.d.xpath(shop_btn_xpath).click()
  412. time.sleep(1)
  413. # self.d.xpath('//*[@text="店铺"]').click()
  414. xpath_shop_name = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.TextView[1]'
  415. if self.d.xpath(xpath_shop_name).exists:
  416. shop_name = self.d.xpath(xpath_shop_name).text
  417. self.loggerPdd.info(f'2-获取到店铺名:{shop_name}')
  418. else:
  419. shop_name = ''
  420. self.loggerPdd.info(f'3-获取到店铺名:{shop_name}')
  421. self.swipe_back(1) #
  422. else:
  423. shop_name = ''
  424. self.loggerPdd.info('4-因为shop_btn_xpath不存在,获取到店铺名为空')
  425. # time.sleep(10000)
  426. #进入店铺新页面 测试代码
  427. # self.d.xpath('//*[@text="店铺"]').click()
  428. # time.sleep(1)
  429. # content_frame = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]').exists
  430. # print(content_frame)
  431. # ViewGroup3 = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]').exists
  432. # print(ViewGroup3)
  433. # LinearLayout = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]').exists
  434. # print(LinearLayout)
  435. # RelativeLayout = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]').exists
  436. # print(RelativeLayout)
  437. # LinearLayout2 = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]').exists
  438. # print(LinearLayout2)
  439. # RecyclerView = self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]').exists
  440. # print(RecyclerView)
  441. # xpath2 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.TextView[1]'
  442. # if self.d.xpath(xpath2).exists:
  443. # shop_name = self.d.xpath(xpath2).text
  444. # self.loggerPdd.info(f'2-获取到店铺名:{shop_name}')
  445. # else:
  446. # shop_name = ''
  447. # self.loggerPdd.info(f'3-获取到店铺名:{shop_name}')
  448. # self.swipe_back(1) #返回
  449. return shop_name
  450. except Exception as e:
  451. print(f'获取店铺名出错:{e}')
  452. self.loggerPdd.error(f'获取店铺名出错:{e}')
  453. return None
  454. def get_qualification_number(self):
  455. """
  456. 获取资质编号
  457. :return:
  458. """
  459. try:
  460. qualification_number_str = self.d.xpath(
  461. '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]').text
  462. qualification_number = qualification_number_str.strip('资质编号:').strip()
  463. return qualification_number
  464. except:
  465. return None
  466. def enter_detail(self):
  467. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/recycler"]/android.widget.FrameLayout[1]').click()
  468. time.sleep(self.get_sleep_time())
  def save_to_database(self, data):
      """
      Insert one scraped record into ``self.table_name`` and report progress.

      :param data: mapping that must contain every key listed in ``columns``
      :return: None
      :raises Exception: any driver error is re-raised after the transaction
          has been rolled back
      """
      print(f'保存数据到数据库:{data}')
      # A single source of truth for column order keeps the column list,
      # the placeholders and the bound values in step with each other.
      columns = (
          'product', 'min_price', 'manufacture_date', 'expiry_date', 'shop',
          'business_license_company', 'province', 'city', 'manufacturer',
          'specification', 'approval_number', 'product_link', 'scrape_date',
          'scrape_province', 'availability', 'credit_code', 'platform',
          'search_key', 'number',
      )
      add_sql = f"""
      INSERT INTO {self.table_name}
      ({', '.join(columns)})
      VALUES ({', '.join(['%s'] * len(columns))})
      """
      values = tuple(data[column] for column in columns)
      # NOTE(review): the connection itself is not closed here because it is
      # unclear whether get_mysql() hands out a shared connection - confirm.
      conn = get_mysql()
      cur = conn.cursor()
      try:
          cur.execute(add_sql, values)
          conn.commit()
      except Exception:
          # Do not leave a half-open transaction behind on failure.
          conn.rollback()
          raise
      finally:
          cur.close()
      print(f"存入数据库成功")
      # Count the stored record and push the new total to the reporter.
      self.collected_count += 1
      if self.task_id:
          reporter.update_task_progress(
              task_id=self.task_id,
              real_count=self.collected_count
          )
  def swipe_up(self):
      """
      Swipe the screen upwards along its vertical centre line.

      The swipe runs from 100 px above the bottom edge to 100 px below the
      top edge with a random duration in [0, 0.3]. With a probability of
      roughly 15% a small extra upward swipe follows, then the method sleeps
      for self.get_sleep_time().

      :return: None
      """
      width = self.d.info['displayWidth']
      height = self.d.info['displayHeight']
      centre_x = width // 2
      swipe_duration = random.uniform(0, 0.3)
      self.d.swipe(centre_x, height - 100, centre_x, 100, duration=swipe_duration)
      # The page sometimes sticks; occasionally nudge it a little further up.
      if random.uniform(0, 1) > 0.85:
          self.d.swipe_ext("up", 0.1)
      time.sleep(self.get_sleep_time())
  def swipe_back(self, no):
      """
      Press the Android "back" key repeatedly.

      Nothing is pressed when self.distinct_target() returns a truthy value.
      After every key press the method sleeps for self.get_sleep_time().

      :param no: number of times to press "back"
      :return: None
      """
      if self.distinct_target():
          return
      for _ in range(no):
          self.d.press('back')
          time.sleep(self.get_sleep_time())
  def drug_price(self):
      """
      Read the drug price shown next to the currency sign.

      The first TextView sibling following the "¥" element is read and the
      first decimal number in its text is parsed.

      :return: the price as a float, or None when the element is missing or
          its text contains no number
      """
      try:
          xpath = '//*[@text="¥"]/following-sibling::android.widget.TextView[1]'
          price_str = self.d.xpath(xpath).text
          # Match one well-formed decimal number; the old character class
          # '[\d\.]+' also accepted runs such as "12.5." that float() rejects.
          match = re.search(r'\d+(?:\.\d+)?|\.\d+', price_str)
          # A missing match raises AttributeError, handled below like any
          # other extraction failure.
          price = float(match.group())
          print(f'获取到价格:{price}')
          return price
      except Exception as e:
          print(f'提取价格出错-->{e}')
          return None
  def drug_price_ex(self):
      """
      Open the option sheet of the current product and read the price of the
      selected option.

      Steps:
        1. Click the last ViewGroup button of the bottom bar (two layouts
           are tried). If neither exists, return immediately.
        2. Find the label that contains "已选" (already selected) or
           "请选择" (please choose); three page layouts are tried in order.
        3. If the label says "请选择", click an option, wait 2 seconds for
           the price to refresh and read the label again.
        4. Read the price text that belongs to the matched layout.
        5. Press back once (via self.swipe_back) and return.

      :return: tuple ``(price, ext)`` where ``price`` is a float, or '' when
          no price could be parsed, and ``ext`` is the "已选..." label text,
          or '' when no selection was detected
      """
      price_str = ''  # raw price text taken from the option sheet
      ext = ''  # text of the "已选..." label
      price = ''
      button_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[last()]'
      button_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[last()]'
      if self.d.xpath(button_xpath_1).exists:
          self.d.xpath(button_xpath_1).click()
      elif self.d.xpath(button_xpath_2).exists:
          self.d.xpath(button_xpath_2).click()
      else:
          print("button1 and button_2 all not exist")
          return price, ext
      # Labels that say either "已选" or "请选择", one per known layout.
      select_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[last()]'
      select_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[last()]'
      select_xpath_3 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.TextView[last()]'
      select_xpath_3_2 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.TextView[last()-1]'
      # Price text views matching the three layouts above.
      price_xpath_1 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[1]'
      price_xpath_2 = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]/android.widget.TextView[1]'
      price_xpath_3 = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]//android.widget.TextView[1]'
      if self.d.xpath(select_xpath_1).exists:
          text1 = self.d.xpath(select_xpath_1).text
          print(f"select_xpath_1--text1={text1}")
          if '已选' in text1:
              if self.d.xpath(price_xpath_1).exists:
                  price_str = self.d.xpath(price_xpath_1).text
                  print(f"select_xpath_1--price_str-1={price_str}")
              else:
                  print("select_xpath_1--price_xpath_1-1 not exist")
              ext = text1
          elif '请选择' in text1:
              # Nothing chosen yet: pick an option from the list below.
              scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]'
              if self.d.xpath(scroll_xpath_1).exists:
                  # Bug fix: the call parentheses were missing, so the
                  # option was never actually clicked.
                  self.d.xpath(scroll_xpath_1).click()
                  time.sleep(2)  # the price refreshes after a selection
                  if self.d.xpath(select_xpath_1).exists:
                      text2 = self.d.xpath(select_xpath_1).text
                      if '已选' in text2:
                          print(f"select_xpath_1--已选择2:text2={text2}")
                          if self.d.xpath(price_xpath_1).exists:
                              price_str = self.d.xpath(price_xpath_1).text
                              print(f"select_xpath_1--price_str-2={price_str}")
                          else:
                              print("select_xpath_1--price_xpath_1-2 not exist")
                          ext = text2
              else:
                  print("select_xpath_1--scroll_xpath_1 not exist")
      elif self.d.xpath(select_xpath_2).exists:
          text1 = self.d.xpath(select_xpath_2).text
          print(f"xpath2--text1={text1}")
          if '已选' in text1:
              ext = text1
              if self.d.xpath(price_xpath_2).exists:
                  price_str = self.d.xpath(price_xpath_2).text
                  print(f"select_xpath_2--price_str-2={price_str}")
              else:
                  print("select_xpath_2--price_xpath_2-1 not exist")
          elif '请选择' in text1:
              print('come in here')
              # Nothing chosen yet: pick the first option from the list.
              scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
              if self.d.xpath(scroll_xpath_1).exists:
                  print("scroll_xpath_1 exists")
                  self.d.xpath(scroll_xpath_1).click()
                  time.sleep(2)  # the price may refresh after a selection
                  if self.d.xpath(select_xpath_2).exists:
                      text2 = self.d.xpath(select_xpath_2).text
                      if '已选' in text2:
                          ext = text2
                          print(f"select_xpath_2--已选择2:text2={text2}")
                          if self.d.xpath(price_xpath_2).exists:
                              price_str = self.d.xpath(price_xpath_2).text
                              print(f"select_xpath_2--price_str-2={price_str}")
                          else:
                              print("select_xpath_2--price_xpath_2-2 not exist")
              else:
                  print("scroll_xpath_1 not exists")
          else:
              print("not exist 请选择 or 已选")
      elif self.d.xpath(select_xpath_3).exists:
          text1 = self.d.xpath(select_xpath_3).text
          print(f"xpath3--text1-1={text1}")
          if ('请选择' not in text1) and ('已选' not in text1):
              # The last TextView is something else; try the one before it.
              text1 = self.d.xpath(select_xpath_3_2).text
              print(f"xpath3--text1-2={text1}")
          if '已选' in text1:
              ext = text1
              if self.d.xpath(price_xpath_3).exists:
                  price_str = self.d.xpath(price_xpath_3).text
                  print(f"select_xpath_3--price_str-3-3-1={price_str}")
              else:
                  print("select_xpath_3--price_xpath_3-3-1 not exist")
          elif '请选择' in text1:
              print('come in here')
              # Nothing chosen yet: pick the first option; two list layouts
              # are known for this page variant.
              scroll_xpath_1 = '//*[@resource-id="android:id/content"]//android.widget.ScrollView[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
              recycler_view_xpath = '//*[@resource-id="android:id/content"]//android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]'
              if self.d.xpath(scroll_xpath_1).exists:
                  print("scroll_xpath_1 exists")
                  self.d.xpath(scroll_xpath_1).click()
                  time.sleep(2)  # the price may refresh after a selection
                  if self.d.xpath(select_xpath_3).exists:
                      text2 = self.d.xpath(select_xpath_3).text
                      if '已选' in text2:
                          ext = text2
                          print(f"select_xpath_3--已选择2:text2={text2}")
                          if self.d.xpath(price_xpath_3).exists:
                              price_str = self.d.xpath(price_xpath_3).text
                              print(f"select_xpath_3--price_str-3-2={price_str}")
                          else:
                              print("select_xpath_3--price_xpath_3-3-2 not exist")
              elif self.d.xpath(recycler_view_xpath).exists:
                  self.d.xpath(recycler_view_xpath).click()
                  time.sleep(2)  # the price may refresh after a selection
                  if self.d.xpath(select_xpath_3).exists:
                      text2 = self.d.xpath(select_xpath_3).text
                      if '已选' in text2:
                          ext = text2
                          print(f"select_xpath_3--已选择2:text2={text2}")
                          if self.d.xpath(price_xpath_3).exists:
                              price_str = self.d.xpath(price_xpath_3).text
                              print(f"select_xpath_3--price_str-3-3={price_str}")
                          else:
                              print("select_xpath_3--price_xpath_3-3-3 not exist")
              else:
                  print("scroll_xpath_1 not exists")
          else:
              print(f"xpath3--text1-不包含请选择和已选择")
      else:
          print("select_xpath_1 and select_xpath_2 and select_xpath_3 all not exist")
      if price_str:
          # Only the number that directly follows the currency sign counts.
          match = re.search(r'¥([\d\.]+)', price_str)
          if match:
              price = float(match.group(1))
          else:
              price = ''
          print(f'获取到价格:{price}')
      print(f"ext={ext}")
      self.swipe_back(1)  # close the option sheet
      return price, ext
  def restart_uiautomator_services(self, device_id):
      """
      Restart the atx-agent (uiautomator) service on a device through adb.

      The agent is stopped first and then started again; after each adb
      command the method sleeps for self.get_sleep_time(). Command output is
      captured and discarded.

      :param device_id: adb serial of the target device
      :return: None
      """
      start_command = f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'
      stop_command = f'{start_command} --stop'
      for command in (stop_command, start_command):
          subprocess.run(command, capture_output=True, text=True, shell=True)
          time.sleep(self.get_sleep_time())
  def connect_devices(self, device_id):
      """
      Connect to a device over USB and restart its uiautomator service.

      The connected device handle is stored in ``self.d``.

      :param device_id: adb serial of the device to connect to
      :return: None
      :raises Exception: wraps any error raised while connecting or
          restarting the service
      """
      try:
          device = u2.connect_usb(device_id)
          self.d = device
          self.restart_uiautomator_services(device_id)
          print(f'连接到设备:{device_id}')
      except Exception as e:
          print(f'{device_id} 连接错误: {e}')
          raise Exception(e)
  def get_ocr_res(self, img):
      """
      Run general OCR on an image and return the recognised text lines.

      The image is first passed through self.remove_watermark(); its result
      is sent to ``self.client.basicGeneral``.

      :param img: image reference accepted by self.remove_watermark()
      :return: the ``words_result`` value of the OCR response ('' when the
          key is absent), or None when any step fails
      """
      try:
          image = self.remove_watermark(img)
          res_image = self.client.basicGeneral(image)
          data = res_image.get('words_result', '')
          print(f'百度api返回结果:{data}')
          return data
      except Exception as e:
          # Catch Exception rather than everything so Ctrl-C / SystemExit
          # still propagate, and make the failure visible instead of silent.
          print(f'OCR识别出错:{e}')
          return None
  753. # def get_ocr_res(self, img):
  754. # try:
  755. # #img地址
  756. # print(f'开始识别图片:{img}')
  757. # request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  758. # # 二进制方式打开图片文件
  759. # f = open(img, 'rb')
  760. # img = base64.b64encode(f.read())
  761. # # img = self.remove_watermark(img)
  762. # # print(f'图片转base64成功:{img}')
  763. # params = {"image": img}
  764. # # access_token = get_access_token()
  765. # request_url = request_url + "?access_token=" + self.access_token
  766. # headers = {'content-type': 'application/x-www-form-urlencoded'}
  767. # response = requests.post(request_url, data=params, headers=headers)
  768. # print(f'请求百度api成功{response}')
  769. # if response:
  770. # res = response.json()
  771. # print(f'百度api返回结果{res}')
  772. # new_dic = dict()
  773. # for ite in res['words_result'].keys():
  774. # new_dic[ite] = res['words_result'][ite]['words']
  775. # print('资质数据信息', new_dic)
  776. # return new_dic
  777. # else:
  778. # return None
  779. # except:
  780. # return None
  def screenshot_the_business_license(self):
      """
      Take a screenshot and save a fixed crop of it.

      The full screenshot is written to 'screenshot1.png'; the region
      x 0..720, y 480..1420 of it is written to 'cropped_screenshot.png'.
      """
      full_path = 'screenshot1.png'
      self.d.screenshot(full_path)
      frame = cv2.imread(full_path)
      # Crop rectangle as (left, top, right, bottom) in pixels.
      left, top, right, bottom = 0, 480, 720, 1420
      cv2.imwrite('cropped_screenshot.png', frame[top:bottom, left:right])
  def screenshot_the_shop_qualifications(self):
      """
      Save a full-screen screenshot to 'screenshot2.png'.

      The previous version also loaded the image and computed a crop
      rectangle (0, 480, 720, 1420) without ever applying it; that dead
      code is removed. has_shop_qualifications() runs OCR on the uncropped
      'screenshot2.png', so the file written here is unchanged.
      """
      self.d.screenshot('screenshot2.png')
  def first_screenshot_the_verify(self):
      """
      Screenshot the verification popup and detect which kind it is.

      The full screenshot is saved as 'first_screenshot_verify.png' and run
      through OCR. When any recognised line contains "拖动滑块完成" the
      event is a slider verification. A fixed crop of the popup is saved as
      'first_cropped_verify_screenshot.png'.

      :return: '滑块验证' for a slider verification, otherwise ''
      """
      screenshot_verify_path = 'first_screenshot_verify.png'
      self.d.screenshot(screenshot_verify_path)
      img = cv2.imread(screenshot_verify_path)
      ocr_res = self.get_ocr_res('first_screenshot_verify.png')
      event = ''  # event type: slider verification or calculation input
      if ocr_res and any('拖动滑块完成' in item['words'] for item in ocr_res):
          print("滑块验证")
          event = '滑块验证'
      # Crop rectangle (left, top, right, bottom); currently the same region
      # is used for every event type.
      left, top, right, bottom = 118, 478, 602, 722
      cropped_verify_img = img[top:bottom, left:right]
      cv2.imwrite('first_cropped_verify_screenshot.png', cropped_verify_img)
      return event
  def slide_second_screenshot_the_verify(self):
      """
      Screenshot the slider popup again and save a fixed crop of it.

      The full screenshot goes to 'slide_second_screenshot_verify.png', the
      region x 118..602, y 478..722 goes to
      'second_slide_cropped_verify_screenshot.png'.

      :return: None
      """
      full_path = 'slide_second_screenshot_verify.png'
      self.d.screenshot(full_path)
      frame = cv2.imread(full_path)
      # Crop rectangle as (left, top, right, bottom) in pixels.
      left, top, right, bottom = 118, 478, 602, 722
      cv2.imwrite('second_slide_cropped_verify_screenshot.png', frame[top:bottom, left:right])
  def second_screenshot_the_verify(self):
      """
      Screenshot the screen after the slide and decide whether it passed.

      The screenshot is saved as 'second_screenshot_verify.png' and run
      through OCR. The first recognised line containing "验证成功" or
      "验证不成功" decides the outcome; when OCR returned lines but none of
      them contains either phrase the result defaults to success.

      :return: '验证成功', '验证不成功', or '' when OCR returned nothing
      """
      screenshot_verify_path = 'second_screenshot_verify.png'
      self.d.screenshot(screenshot_verify_path)
      img = cv2.imread(screenshot_verify_path)
      ocr_res = self.get_ocr_res('second_screenshot_verify.png')
      print(f'second_ocr_res:{ocr_res}')
      if not ocr_res:
          return ''
      for item in ocr_res:
          words = item['words']
          if '验证成功' in words:
              return '验证成功'
          if '验证不成功' in words:
              return '验证不成功'
      # No explicit verdict on screen counts as success.
      return '验证成功'
  def screenshot_business_license(self, shop_name):
      """
      Screenshot the business-license page and save a crop named after the
      shop.

      The full screenshot is written to 'license_screenshot.png'; the region
      x 0..720, y 160..1000 is written to the screenshot directory as
      '<shop_name>.png'. Characters that are illegal in Windows file names
      are replaced with '_' so the crop can always be written.

      :param shop_name: shop name used as the file name of the crop
      :return: path of the cropped image
      """
      screenshot_lincense_path = 'license_screenshot.png'
      self.d.screenshot(screenshot_lincense_path)
      img = cv2.imread(screenshot_lincense_path)
      # Crop rectangle as (left, top, right, bottom) in pixels.
      left, top, right, bottom = 0, 160, 720, 1000
      cropped_verify_img = img[top:bottom, left:right]
      # A shop name containing \ / : * ? " < > | would make the write fail.
      safe_name = re.sub(r'[\\/:*?"<>|]', '_', shop_name)
      cropped_screenshot_path = 'D:\\work\\dfwy_spider\\drug_data\\pdd\\screenshot\\' + safe_name + '.png'
      if not cv2.imwrite(cropped_screenshot_path, cropped_verify_img):
          # cv2.imwrite reports failure through its return value only.
          print(f'保存营业执照截图失败:{cropped_screenshot_path}')
      return cropped_screenshot_path
  def drug_slide(self, distance):
      """
      Swipe horizontally starting at the top-left corner of the WebView.

      The UI hierarchy is dumped and searched for the first node whose class
      is 'meco.webkit.WebView'. Its ``bounds`` attribute ("[x1,y1][x2,y2]")
      gives the start point; the swipe ends ``distance`` pixels to the right
      on the same row.

      The previous version looked the WebView up a second time through
      self.d.xpath() and then indexed the selector and ``info['bounds']``
      positionally. NOTE(review): per the uiautomator2 docs the selector is
      not subscriptable and ``info['bounds']`` is a dict, so that path could
      not work - the bounds already parsed from the dump are used instead.

      :param distance: horizontal swipe distance in pixels
      :return: None
      """
      print(f"滑动的distance= {distance}")
      # Dump the window hierarchy of the device as XML.
      dump = self.d.dump_hierarchy()
      print(f"drug_slide-111:{dump}")
      root = ET.fromstring(dump)
      print("drug_slide-222")
      webview_elements = root.findall(".//node[@class='meco.webkit.WebView']")
      print("drug_slide-333")
      if not webview_elements:
          print("未找到 WebView 元素")
          return
      print("找到 WebView 元素:", webview_elements[0].attrib)
      webview_bounds = webview_elements[0].attrib.get('bounds', '')
      print("WebView bounds:", webview_bounds)
      corners = re.findall(r'-?\d+', webview_bounds)
      if len(corners) < 2:
          print("未找到需要拖动的元素")
          return
      start_x, start_y = int(corners[0]), int(corners[1])
      self.d.swipe(start_x, start_y, start_x + distance, start_y)
      print("拖动成功")
  def get_title(self):
      """
      Read the product title from the detail page.

      After sleeping for self.get_sleep_time(), the content description of
      the 'tv_title' element is read and stripped of surrounding whitespace.

      :return: the title string, or None when the element does not exist or
          reading it fails
      """
      title_xpath = '//*[@resource-id="com.xunmeng.pinduoduo:id/tv_title"]'
      try:
          print('开始提取标题')
          time.sleep(self.get_sleep_time())
          if not self.d.xpath(title_xpath).exists:
              return None
          description = self.d.xpath(title_xpath).info['contentDescription']
          title = description.strip()
          print(f'提取到标题:{title}')
          return title
      except Exception as e:
          print(f'获取标题出错:{e}')
          return None
  918. # 从里面匹配出药品名和规格
  919. # drugs_name
  920. # specifications
  921. # match = re.search(r'([^\d]+)([\d\D]+)', title)
  922. # match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
  923. # if match:
  924. # drugs_name = match.group(1).strip() + match.group(2).strip()
  925. # specifications = match.group(3).strip()
  926. # print("药品名:", drugs_name)
  927. # print("规格:", specifications)
  928. # print('完整药名:', drugs_name + specifications)
  929. # return drugs_name, specifications
  930. # else:
  931. # print("没有匹配到预期格式")
  def enter_shop(self):
      """
      Open the shop page by clicking the element whose text is "店铺", then
      sleep for self.get_sleep_time(). The shop page is the starting point
      for reading the shop's qualification data.

      :return: None
      """
      shop_entry = '//*[@text="店铺"]'
      self.d.xpath(shop_entry).click()
      pause = self.get_sleep_time()
      time.sleep(pause)
  def enter_shoper(self):
      """
      Open the merchant tab by clicking the element whose text is "商家",
      then sleep for self.get_sleep_time().

      :return: None
      """
      merchant_tab = '//*[@text="商家"]'
      self.d.xpath(merchant_tab).click()
      pause = self.get_sleep_time()
      time.sleep(pause)
  def scan_shoper_license(self):
      """
      Click the element whose text is "查看商家资质" (view merchant
      qualifications), then sleep for self.get_sleep_time().
      """
      license_entry = '//*[@text="查看商家资质"]'
      self.d.xpath(license_entry).click()
      pause = self.get_sleep_time()
      time.sleep(pause)
  def data_is_exists(self, data):
      """
      Check whether a matching record already exists in ``self.table_name``.

      A record matches when search_key, min_price, shop, scrape_date and
      platform are all equal to the values in ``data``.

      :param data: mapping holding at least the five keys named above
      :return: True when a matching row exists, False when none does, None
          when a required key is missing or the query fails
      """
      # 1. Validate the required fields; their order matches the
      #    placeholders of the query below.
      required_keys = ['search_key', 'min_price', 'shop', 'scrape_date', 'platform']
      missing = [key for key in required_keys if key not in data]
      if missing:
          print(f"缺少必要字段: {', '.join(missing)}")
          return None
      cur = None
      try:
          # NOTE(review): the connection is left open because it is unclear
          # whether get_mysql() returns a shared connection - confirm.
          conn = get_mysql()
          cur = conn.cursor()
          query_sql = """
          SELECT * FROM {}
          WHERE search_key = %s
          AND min_price = %s
          AND shop = %s
          AND scrape_date = %s
          AND platform = %s
          """.format(self.table_name)
          cur.execute(query_sql, tuple(data[key] for key in required_keys))
          return bool(cur.fetchone())
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # Release the cursor on every path; a failing close must not mask
          # the result that is already being returned.
          if cur is not None:
              try:
                  cur.close()
              except Exception:
                  pass
  def shop_is_exists_database(self, shop):
      """
      Check whether the shop is already stored in ``self.shop_table_name``.

      :param shop: shop name to look up
      :return: True when a row with this shop exists, False when none does,
          None when the query fails
      """
      cur = None
      try:
          # NOTE(review): the connection is left open because it is unclear
          # whether get_mysql() returns a shared connection - confirm.
          conn = get_mysql()
          cur = conn.cursor()
          query_sql = """
          SELECT * FROM {}
          WHERE shop = %s
          """.format(self.shop_table_name)
          # Parameters must be a sequence: "(shop)" is just the string
          # itself, "(shop,)" is the 1-tuple the DB-API expects.
          cur.execute(query_sql, (shop,))
          return bool(cur.fetchone())
      except Exception as e:
          print(f"MySQL 错误: {str(e)}")
          return None
      finally:
          # Release the cursor on every path; a failing close must not mask
          # the result that is already being returned.
          if cur is not None:
              try:
                  cur.close()
              except Exception:
                  pass
  def wait_if_verifying(self, monitor, timeout=120):
      """
      Block the calling thread while a captcha is being handled.

      Polls once per second until ``monitor.pausing`` is no longer set or
      ``timeout`` seconds have elapsed.

      :param monitor: object exposing a ``pausing`` event-like attribute
      :param timeout: maximum number of seconds to wait
      :return: None
      """
      started_at = time.time()
      while True:
          if not monitor.pausing.is_set():
              break
          if time.time() - started_at >= timeout:
              break
          time.sleep(1)
  1007. # def safe_xpath(self, xpath, timeout=10):
  1008. # """线程安全 xpath 查找"""
  1009. # self.wait_if_verifying(self.monitor)
  1010. # return self.d.xpath(xpath).wait(timeout=timeout)
  def wait_for_ready(self, monitor, timeout=86400):
      """
      Wait for any captcha handling to finish before entering a page.

      Polls once per second until ``monitor.pausing`` is no longer set or
      ``timeout`` seconds have elapsed, then asks the monitor to scan for a
      popup once more in case a captcha appeared at the last moment.

      :param monitor: object exposing ``pausing`` and
          ``check_and_handle_popup()``
      :param timeout: maximum number of seconds to wait
      :return: None
      """
      started_at = time.time()
      while True:
          if not monitor.pausing.is_set():
              break
          if time.time() - started_at >= timeout:
              break
          time.sleep(1)
      # Extra safety net: scan once more for a popup that has just shown up.
      monitor.check_and_handle_popup()
  def safe_list(self, xpath, monitor):
      """
      Fetch all elements matching ``xpath`` once captcha handling is done.

      :param xpath: XPath expression to query
      :param monitor: captcha monitor passed on to self.wait_for_ready()
      :return: whatever ``self.d.xpath(xpath).all()`` returns
      """
      self.wait_for_ready(monitor)
      selector = self.d.xpath(xpath)
      return selector.all()
  def safe_exec(self, func, *args, **kwargs):
      """
      Run ``func`` only while no captcha is being handled.

      Polls ``self.monitor.pausing`` once per second and blocks for as long
      as it is set, then calls ``func(*args, **kwargs)``.

      :param func: callable to execute
      :return: the return value of ``func``
      """
      pausing = self.monitor.pausing
      while True:
          if not pausing.is_set():
              break
          time.sleep(1)
      # The monitor has released us: run the real work.
      outcome = func(*args, **kwargs)
      return outcome
  def get_instructions_data(self):
      """
      Scrape the product-parameter table of the detail page.

      Scrolls (at most 8 times) until "品牌" or "药品通用名" is visible,
      clicks it to expand the parameters, and reads the TextViews under
      "商品参数" as alternating name/value pairs. If one of the wanted
      fields is still missing, the page is scrolled once more and the table
      is read again.

      :return: dict with the keys "有效期", "生产单位", "批准文号" and
          "产品规格"; a field that was not found maps to ''
      """
      wanted_keys = ('有效期', '生产企业', '批准文号', '药品规格')
      brand_xpath = '//*[@text="品牌"]'
      generic_name_xpath = '//*[@text="药品通用名"]'
      params_xpath = '//*[starts-with(@text,"商品参数")]/parent::*/parent::*/following-sibling::*/*/*/android.view.ViewGroup//android.widget.TextView'
      # This loop already scrolls, so no initial swipe is done before it.
      for _ in range(8):
          if self.d.xpath(brand_xpath).exists or self.d.xpath(generic_name_xpath).exists:
              self.d.swipe_ext("up", scale=0.1)
              print('开始采集详情数据')
              break
          self.d.swipe_ext("up", scale=0.5)
          time.sleep(self.get_sleep_time())
      # Expand the parameter section.
      if self.d.xpath(brand_xpath).exists:
          self.d.xpath(brand_xpath).click()
      else:
          self.d.xpath(generic_name_xpath).click()
      time.sleep(self.get_sleep_time())
      attr = dict()

      def read_pairs():
          # TextViews alternate name, value, name, value...; zip() drops a
          # trailing name without a value instead of raising IndexError.
          texts = [node.text for node in self.d.xpath(params_xpath).all()]
          attr.update(zip(texts[0::2], texts[1::2]))

      read_pairs()
      # Scroll and read again only when a wanted field is still missing.
      # (The old test checked the scraped keys against the wanted list, so
      # any extra key such as "品牌" forced the extra scroll every time.)
      if not all(key in attr for key in wanted_keys):
          self.d.swipe_ext("up", 0.4)
          time.sleep(self.get_sleep_time())
          read_pairs()
      print(f'当前说明书规格参数:{attr}')
      res_data = {
          "有效期": attr.get('有效期', ''),
          "生产单位": attr.get('生产企业', ''),
          "批准文号": attr.get('批准文号', ''),
          "产品规格": attr.get('药品规格', '')
      }
      print(f'当前规格参数字典数据:{res_data}')
      return res_data
  def has_instructions(self):
      """
      Check whether the page has a "商品详情" (product details) section.

      Up to 12 attempts are made; each one waits 0.5 s, looks for an element
      with the text "商品详情", and swipes up when it is not found. Without
      that section no detailed data can be collected.

      :return: True when the section is found, False after 12 failed tries
      """
      details_xpath = '//*[@text="商品详情"]'
      for _ in range(12):
          time.sleep(0.5)
          if self.d.xpath(details_xpath).exists:
              return True
          self.d.swipe_ext("up", 0.3)
      return False
  1102. # time.sleep(self.get_sleep_time())
  1103. # xpath = '//*[@text="商品详情"]'
  1104. # is_has_instructions = self.d.xpath(xpath).exists
  1105. # return
  def has_shop_qualifications(self):
      """
      Locate the "查看全部" entry, open the image below it and OCR the page.

      Up to 3 attempts are made; each one waits 0.5 s and looks for elements
      with the text "查看全部". When found, the second match is used (or the
      only one), it is dragged up to y=500 if it sits lower than that, the
      point 80 px below its centre is clicked, and a screenshot of the
      result is run through OCR.

      Fixes over the previous version:
        * after the drag the freshly queried elements are used instead of
          the stale pre-drag list;
        * a leftover debugging ``time.sleep(100000)`` that froze the method
          before it could return has been removed.

      :return: True when "查看全部" was found and processed, False after 3
          failed attempts
      """
      xpath = '//*[@text="查看全部"]'

      def target_bounds(prefix):
          # Bounds of the second match when there are several, otherwise of
          # the single match.
          elements = self.d.xpath(xpath).all()
          print(f"{prefix}页面上共有 {len(elements)} 个 '查看全部' 元素")
          if len(elements) >= 2:
              return elements[1].bounds
          return self.d.xpath(xpath).get().bounds

      max_attempts = 3
      attempt = 0
      while attempt < max_attempts:
          time.sleep(0.5)
          if self.d.xpath(xpath).exists:
              bounds = target_bounds('')
              print(f'bounds:{bounds}')
              # bounds is (left, top, right, bottom); work with the centre.
              x1, y1, x2, y2 = bounds
              element_x = (x1 + x2) / 2
              element_y = (y1 + y2) / 2
              if element_y > 500:
                  self.d.swipe(element_x, element_y, element_x, 500, 1)
                  # The element moved: read its position again.
                  bounds_ex = target_bounds('第二次')
                  print(f'bounds_ex:{bounds_ex}')
                  x1, y1, x2, y2 = bounds_ex
                  element_x = (x1 + x2) / 2
                  element_y = (y1 + y2) / 2
              time.sleep(self.get_sleep_time())
              target_x = element_x
              target_y = element_y + 80
              print(f'目标坐标:{target_x}, {target_y}')
              # Click the image below the label.
              self.d.click(target_x, target_y)
              time.sleep(self.get_sleep_time())
              # Read the content of the opened image.
              self.screenshot_the_shop_qualifications()
              ocr_res = self.get_ocr_res('screenshot2.png')
              print(f'ocr_res:{ocr_res}')
              return True
          self.d.swipe_ext("up", 0.1)
          attempt += 1
      return False
  def get_license_info_ex(self, shop_name):
      """
      Open the shop's qualification page, solve the slider verification and
      OCR the business license.

      Steps:
        1. Enter the shop and click the shop image if it exists.
        2. Scroll (at most 10 times) until "店铺资质" is visible, then click
           "已上传" if present, otherwise "店铺资质".
        3. Wait for the verification WebView popup, screenshot it and
           detect the verification type.
        4. For a slider verification: press the handle, ask slide_verify()
           for the distance, drag with self.human_slide() and check the
           outcome.
        5. On success, screenshot the license, OCR it and print the
           registration number, company name and address.

      OCR lines are split at the first ASCII or full-width colon; a line
      without any colon yields '' instead of raising IndexError.

      :param shop_name: shop name, used as the file name of the license crop
      :return: None (the extracted values are only printed)
      """

      def value_after_colon(words):
          # OCR may emit ':' or ':' and occasionally drops the colon.
          parts = re.split(r'[::]', words, maxsplit=1)
          return parts[1].strip() if len(parts) > 1 else ''

      self.enter_shop()  # click "店铺"
      # Click the shop image.
      xpath_shop_image = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.ImageView[1]'
      if self.d.xpath(xpath_shop_image).exists:
          self.d.xpath(xpath_shop_image).click()
          time.sleep(self.get_sleep_time())
      # Scroll the popup page until the "店铺资质" entry is visible.
      for i in range(10):
          if self.d.xpath('//*[@text="店铺资质"]').exists:
              print('店铺资质存在1')
              break
          self.d.swipe_ext('up', 0.3)
          time.sleep(1)
          if self.d.xpath('//*[@text="店铺资质"]').exists:
              print('店铺资质存在2')
              break
      if self.d.xpath('//*[@text="已上传"]').exists:
          self.d.xpath('//*[@text="已上传"]').click()
      else:
          self.d.xpath('//*[@text="店铺资质"]').click()
      time.sleep(self.get_sleep_time())
      xpath_pop_window = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/meco.webkit.WebView[1]'
      # Wait for the verification popup to appear.
      for i in range(10):
          if self.d.xpath(xpath_pop_window).exists:
              print(f'第{i}次安全验证弹窗存在')
              break
          else:
              print(f'第{i}次安全验证弹窗不存在')
              time.sleep(self.get_sleep_time())
      time.sleep(10)
      # First screenshot: detect the verification type.
      event = self.first_screenshot_the_verify()
      if event == '滑块验证':
          # Press and hold the slider handle, then screenshot while it is
          # held.
          self.d.touch.down(110, 780)
          time.sleep(0.5)
          # The helper returns nothing, so its result is not kept.
          self.slide_second_screenshot_the_verify()
          result = slide_verify('second_slide_cropped_verify_screenshot.png')
          result = int(result)
          print(f'滑动距离:{result}')
          print('开始滑动')
          self.human_slide(110, 780, 110 + result, 780)
          print('滑动结束')
          time.sleep(self.get_sleep_time())
          # Check whether the verification succeeded.
          second_resut = self.second_screenshot_the_verify()
          if second_resut == '验证成功':
              time.sleep(8)
              cropped_screenshot_path = self.screenshot_business_license(shop_name)
              ocr_res = self.get_ocr_res(cropped_screenshot_path)
              print(f'ocr_res:{ocr_res}')
              company_name = ''
              reg_number = ''
              address = ''
              if ocr_res:
                  for item in ocr_res:
                      words = item['words']
                      if '企业注册号' in words:
                          reg_number = value_after_colon(words)
                      elif '企业名称' in words:
                          company_name = value_after_colon(words)
                      elif '所:' in words:
                          address = value_after_colon(words)
              # Print the extracted values.
              print("企业注册号:", reg_number)
              print("企业名称:", company_name)
              print("住所:", address)
              print("yanzhenghcenggong")
  1249. # 截取图片保存 第三次截屏,保存到本地
  1250. #将图片传给第三方verify接口得到需要移动的距离
  1251. #result = slide_verify('cropped_verify_screenshot.png')
  1252. #print(f'滑动距离:{result}')
  1253. # self.drug_slide(340)
  1254. # print('开始滑动')
  1255. # self.d.swipe(120, 760, 460, 760, 0.3)
  1256. # print('滑动结束')
  1257. # time.sleep(1000000)
  1258. #截图获取需要验证的内容
  def get_license_info(self):
      """Navigate to the shop's licence view and return its licence details.

      The method walks shop page -> shop owner page -> licence scan, then
      asks ``get_qualification_number`` for the qualification code. When a
      code is available and ``get_table_license_info`` yields a record for
      it, that record is returned as a dict. In every other case the licence
      is captured from the screen and run through OCR.

      Returns:
          Either a dict with the keys '单位名称', '地址' and '社会信用代码'
          (taken from positions 0, 1 and 2 of the table record), or whatever
          ``get_ocr_res('cropped_screenshot.png')`` returns.
      """
      self.enter_shop()
      self.enter_shoper()
      self.scan_shoper_license()

      # Qualification code; only used as the lookup key for the table record.
      code = self.get_qualification_number()
      record = self.get_table_license_info(code) if code else None
      if record:
          return {
              '单位名称': record[0],
              '地址': record[1],
              '社会信用代码': record[2],
          }

      # No usable record: tap a fixed relative screen position (presumably the
      # "营业执照" entry -- confirm on device), wait, screenshot and OCR it.
      self.d.click(0.603, 0.27)
      time.sleep(self.get_sleep_time())
      self.screenshot_the_business_license()
      return self.get_ocr_res('cropped_screenshot.png')
  def distinct_target(self):
      """Tell whether any known marker of the target page is on screen.

      Five UI probes are made: four text / content-desc markers and one
      structural xpath ending in a RecyclerView (presumably the result list
      -- confirm on device). The structural probe's result is printed for
      debugging.

      Returns:
          True if at least one probe reports the element exists, else False.
      """
      marker_xpaths = (
          '//*[@content-desc="拍照搜索"]',
          '//*[@text="年货节大促"]',
          '//*[@text="筛选"]',
          '//*[@text="回头客常拼"]',
      )
      # Evaluate every probe eagerly so the device is always queried for all
      # markers, regardless of earlier hits.
      marker_hits = [self.d.xpath(marker).exists for marker in marker_xpaths]

      list_page_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]//android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]'
      list_present = self.d.xpath(list_page_xpath).exists
      print(f'is_position_new={list_present}')

      return any([*marker_hits, list_present])
  def click_element_with_retry(self, xpath, max_retries=5, timeout=5):
      """Click the element matched by ``xpath``, retrying while it is absent.

      Args:
          xpath: XPath expression handed to ``self.d.xpath``.
          max_retries: Maximum number of attempts.
          timeout: Accepted for interface compatibility; this implementation
              does not use it.

      Returns:
          True as soon as one attempt finds and clicks the element; False
          once every attempt has failed (missing element or an exception).
      """
      final_attempt = max_retries - 1
      for tries in range(max_retries):
          try:
              if not self.d.xpath(xpath).exists:
                  print(f"第{tries+1}次尝试:元素不存在")
              else:
                  self.d.xpath(xpath).click()
                  print(f"第{tries+1}次尝试点击成功")
                  return True
          except Exception as err:
              # Best-effort: any lookup/click failure just counts as a miss.
              print(f"第{tries+1}次尝试失败: {err}")

          # Wait one second between attempts, but not after the last one.
          if tries < final_attempt:
              time.sleep(1)

      print(f"经过{max_retries}次尝试后点击失败")
      return False
  def enter_target_page(self):
      """Search for ``self.search_key`` and tap the "价格" control.

      Steps, each followed by a pause of ``self.get_sleep_time()`` seconds:
      tap the entry element, tap the EditText, type the search key (clearing
      existing text), tap "搜索", then tap "价格" through
      ``click_element_with_retry``. The result of that last tap is not
      checked.
      """
      def pause():
          time.sleep(self.get_sleep_time())

      # Presumably the search box on the start page -- confirm on device.
      entry_xpath = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
      self.d.xpath(entry_xpath).click()
      pause()

      self.d(className='android.widget.EditText').click()
      pause()

      self.d.send_keys(self.search_key, clear=True)
      pause()

      self.d.xpath('//*[@text="搜索"]').click()
      pause()

      # Tap "价格" (price); the retry helper tolerates it not being there yet.
      self.click_element_with_retry('//*[@text="价格"]')
      pause()
  1337. """暂不用该功能
  1338. def get_table_license_info(self, qualification_number):
  1339. try:
  1340. sql = f'select business_license_company,city,credit_code from mt_drug where credit_code = "{qualification_number}"'
  1341. self.mysql_client.cur.execute(sql)
  1342. res = self.mysql_client.cur.fetchone()
  1343. return res
  1344. except:
  1345. return None
  1346. """
  def get_clipboard(self):
      """Return the device clipboard text with surrounding whitespace removed.

      The clipboard is read exactly once, so the value written to the log is
      the value that is returned, and only one device query is made.

      Returns:
          The stripped clipboard text, or '' when the clipboard is ``None``.
      """
      clipboard_content = self.d.clipboard
      self.loggerPdd.info(f"Clipboard content:{clipboard_content}")  # debug trace
      if clipboard_content is None:
          return ''
      return clipboard_content.strip()
  def clear_clipboard(self):
      """Overwrite the device clipboard with an empty string."""
      device = self.d
      device.set_clipboard("")
  def get_product_link(self):
      """Copy the current product's share link and return it.

      Flow: print a few layout probes for debugging, then up to five times:
      tap the share ("...") button if it exists, swipe the share sheet from
      right to left at a device-specific height, tap "复制链接" if present,
      and read the clipboard through ``get_clipboard``. The loop stops as
      soon as a non-empty link has been obtained.

      Returns:
          The value returned by ``get_clipboard`` after the copy step, or ''
          when the share button was never found.
      """
      product_link = ''
      print('开始获取商品链接')

      # Debug-only probes: print whether each level of the expected layout
      # exists. The results are not used for any decision.
      content_root = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]'
      inner_layout = content_root + '/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]'
      probe_xpaths = (
          content_root,
          content_root + '/android.widget.RelativeLayout[1]',
          inner_layout,
          inner_layout + '/android.widget.FrameLayout[2]',
          inner_layout + '/android.widget.FrameLayout[2]/android.view.View[1]',
          inner_layout + '/android.widget.FrameLayout[3]/android.view.View[1]',
      )
      for probe in probe_xpaths:
          print(self.d.xpath(probe).exists)

      # Candidate share buttons. An earlier version tried FrameLayout[2] and
      # FrameLayout[3] explicitly; the last() selector covers both.
      share_buttons = [
          inner_layout + '/android.widget.FrameLayout[last()]/android.view.View[1]',
      ]

      # Vertical position of the share-sheet swipe, per device serial.
      swipe_heights = (
          (1350, ('2e58510', 'fcb3c749', 'ea4e4eb8', '95b2c764', '369dcf96',
                  'ZDQWUSSWBEDI896T', 'IRLZAAQCDMHYWKTS', 'U47HZDRG8XJBBURW',
                  'WWRO9LTGG6KFGQCM', 'OVFETO8PCY45E6A6', 'U8ONIJJJS4CELVD6')),
          (1300, ('KNNNEMNVWCJZQOLZ', 'CMKFUSSG99ROR489', '656DTOPRZDEALZ5X',
                  'UCQGF6CQFMU8WKHI', '4TZDUGTOAIFMJVGU', 'DEZXWKUC7DJBLVPJ',
                  'GQIRKB7LVOONM7VW')),
          (1050, ('e2899b34',)),
      )
      default_height = 1250

      max_retry = 5  # maximum number of attempts
      attempt = 0
      while attempt < max_retry and not product_link:
          attempt += 1
          for button in share_buttons:
              if not self.d.xpath(button).exists:
                  continue
              self.loggerPdd.info(f'{attempt}-进入分享点点点')
              self.d.xpath(button).click()
              time.sleep(1)

              self.loggerPdd.info('开始滑动')
              swipe_y = next(
                  (height for height, serials in swipe_heights
                   if self.device_id in serials),
                  default_height,
              )
              self.d.swipe(400, swipe_y, 100, swipe_y, 0.3)
              time.sleep(0.2)

              # NOTE(review): the clipboard is read even when "复制链接" was
              # not found, so a stale clipboard value could be returned --
              # confirm callers clear the clipboard beforehand.
              self.d.xpath('//*[@text="复制链接"]').click_exists()
              time.sleep(1)
              product_link = self.get_clipboard()
              time.sleep(0.5)
              self.loggerPdd.info(f'{attempt}-商品链接:{product_link}')
              break  # one button handled; leave the inner loop

          # No pause needed after the final attempt.
          if not product_link and attempt < max_retry:
              time.sleep(0.5)

      return product_link
  1467. def integrate_data(self):
  1468. """
  1469. 整合数据
  1470. :return:
  1471. """
  1472. #测试通过点击店铺获取店铺的名称:
  1473. # shop = self.get_shop_name()
  1474. # print(f'店铺名称:{shop}')
  1475. # time.sleep(100000)
  1476. #测试点击店铺进入获取店铺资质
  1477. # license_info = self.get_license_info_ex('1234')#店铺名称
  1478. # time.sleep(100000)
  1479. #首先判断是否存在:商品已售罄,推荐以下相似商品 的文本
  1480. # if self.d.xpath('//*[contains(@text, "商品已售罄,推荐以下相似商品")]'):
  1481. # self.loggerPdd.info(f'商品已售罄:{self.search_key}')
  1482. # self.swipe_back(1)
  1483. # return
  1484. #获取价格和盒数备注
  1485. min_price, ext = self.safe_exec(self.drug_price_ex) # 最低价格和盒数备注
  1486. title_info = self.safe_exec(self.get_title) # 药品名字
  1487. if title_info:
  1488. if '999' in self.search_key:
  1489. temp_search_key = self.search_key.replace('999', '')
  1490. if self.search_key == '999强力枇杷露225ml':
  1491. temp_search_key = temp_search_key.replace('225ml', '')
  1492. elif self.search_key == '999糠酸莫米松凝胶15':
  1493. temp_search_key = temp_search_key.replace('15', '')
  1494. elif self.search_key == '999养胃舒颗粒10g*6':
  1495. temp_search_key = temp_search_key.replace('10g*6', '')
  1496. elif self.search_key == '999曲安奈德益康唑乳膏30g':
  1497. temp_search_key = temp_search_key.replace ('30g', '')
  1498. elif self.search_key == '999抗病毒口服液10ml*6支':
  1499. temp_search_key = temp_search_key.replace('10ml*6支', '')
  1500. elif self.search_key == '999复方板蓝根颗粒15袋':
  1501. temp_search_key = temp_search_key.replace('15袋', '')
  1502. elif self.search_key == '999可调式生理性海水鼻腔喷雾50':
  1503. temp_search_key = temp_search_key.replace('50', '')
  1504. elif self.search_key == '999维生素E.C颗粒9袋':
  1505. temp_search_key = temp_search_key.replace('9袋', '')
  1506. elif self.search_key == '999复方氨酚烷胺胶囊6粒':
  1507. temp_search_key = temp_search_key.replace('6粒', '')
  1508. elif self.search_key == '999复方板蓝根颗粒15g*15袋':
  1509. temp_search_key = temp_search_key.replace('15g*15袋', '')
  1510. elif self.search_key == '999止泻利颗粒15g*8':
  1511. temp_search_key = temp_search_key.replace('15g*8', '')
  1512. elif self.search_key == '999三蛇胆川贝膏138':
  1513. temp_search_key = temp_search_key.replace('138', '')
  1514. elif self.search_key == '999强力枇杷露16袋':
  1515. temp_search_key = temp_search_key.replace('16袋', '')
  1516. elif self.search_key == '999复方苦参肠炎康片12片':
  1517. temp_search_key = temp_search_key.replace('12片', '')
  1518. elif self.search_key == '999必无忧盐酸特比萘芬乳膏15':
  1519. temp_search_key = temp_search_key.replace('必无忧', '')
  1520. temp_search_key = temp_search_key.replace('15', '')
  1521. elif self.search_key == '999速复康布洛芬缓释胶囊':
  1522. temp_search_key = temp_search_key.replace('速复康', '')
  1523. elif self.search_key == '999强力枇杷露120ml':
  1524. temp_search_key = temp_search_key.replace('120ml', '')
  1525. elif self.search_key == '999强力枇杷露150ml':
  1526. temp_search_key = temp_search_key.replace('150ml', '')
  1527. elif self.search_key == '999抗病毒口服液10ml*10':
  1528. temp_search_key = temp_search_key.replace('10ml*10', '')
  1529. elif self.search_key == '999抗病毒口服液10ml*12':
  1530. temp_search_key = temp_search_key.replace('10ml*12', '')
  1531. elif self.search_key == '999感冒清热颗粒6g*10':
  1532. temp_search_key = temp_search_key.replace('6g*10', '')
  1533. elif self.search_key == '999选平硝酸咪康唑乳膏20g':
  1534. temp_search_key = temp_search_key.replace('选平', '')
  1535. temp_search_key = temp_search_key.replace('20g', '')
  1536. elif self.search_key == '999糠酸莫米松乳膏10g':
  1537. temp_search_key = temp_search_key.replace('10g', '')
  1538. elif self.search_key == '999壮骨关节丸6g*20':
  1539. temp_search_key = temp_search_key.replace('6g*20', '')
  1540. elif self.search_key == '999正天丸6g*15':
  1541. temp_search_key = temp_search_key.replace('6g*15', '')
  1542. elif self.search_key == '999藿香正气合剂10ml*6':
  1543. temp_search_key = temp_search_key.replace('10ml*6', '')
  1544. elif self.search_key == '999藿香正气合剂10ml*10':
  1545. temp_search_key = temp_search_key.replace('10ml*10', '')
  1546. elif self.search_key == '999小儿止咳糖浆120':
  1547. temp_search_key = temp_search_key.replace('120', '')
  1548. elif self.search_key == '999小儿止咳糖浆225':
  1549. temp_search_key = temp_search_key.replace('225', '')
  1550. elif self.search_key == '999小儿感冒颗粒6g*10':
  1551. temp_search_key = temp_search_key.replace('6g*10', '')
  1552. elif self.search_key == '999小儿感冒颗粒6g*24':
  1553. temp_search_key = temp_search_key.replace('6g*24', '')
  1554. elif self.search_key == '999小儿氨酚黄那敏颗粒6g*10袋':
  1555. temp_search_key = temp_search_key.replace('6g*10袋', '')
  1556. elif self.search_key == '999小儿氨酚黄那敏颗粒6g*20袋':
  1557. temp_search_key = temp_search_key.replace('6g*20袋', '')
  1558. elif self.search_key == '999感冒灵颗粒10g*9袋':
  1559. temp_search_key = temp_search_key.replace('10g*9袋', '')
  1560. elif self.search_key == '999皮炎平复方醋酸地塞米松乳膏20':
  1561. temp_search_key = temp_search_key.replace('皮炎平', '')
  1562. temp_search_key = temp_search_key.replace('20', '')
  1563. elif self.search_key == '999糠酸莫米松凝胶10':
  1564. temp_search_key = temp_search_key.replace('10', '')
  1565. elif self.search_key == '999板蓝根颗粒10g*20':
  1566. temp_search_key = temp_search_key.replace('10g*20', '')
  1567. elif self.search_key == '999咽炎片0.26g*12片':
  1568. temp_search_key = temp_search_key.replace('0.26g*12片', '')
  1569. elif self.search_key == '999小儿咽扁颗粒8g*10袋':
  1570. temp_search_key = temp_search_key.replace('8g*10袋', '')
  1571. elif self.search_key == '999感冒清热颗粒12g*18':
  1572. temp_search_key = temp_search_key.replace('12g*18', '')
  1573. # print (f"temp_search_key={temp_search_key}")
  1574. if self.search_key == '999抗病毒口服液': #如果标题不包含 999 或 抗病毒口服液 或 (10ml*12 和 10ml*18) 则退出
  1575. if '999' not in title_info or temp_search_key not in title_info:
  1576. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1577. self.swipe_back(1)
  1578. self.unrelated_data += 1
  1579. return
  1580. elif '10ml*12' not in title_info and '10ml*10' not in title_info:
  1581. print(f"当前商品名称:{title_info} 不包含10*12或10*10品规")
  1582. self.swipe_back(1)
  1583. self.unrelated_data += 1
  1584. return
  1585. elif self.search_key == '999抗病毒口服液10ml*6支': #如果标题不包含 999 或 抗病毒口服液 或 (10ml*12 和 10ml*18) 则退出
  1586. if '999' not in title_info or temp_search_key not in title_info:
  1587. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1588. self.swipe_back(1)
  1589. self.unrelated_data += 1
  1590. return
  1591. elif '10ml*6' not in title_info:
  1592. print(f"当前商品名称:{title_info} 不包含10ml*6品规")
  1593. self.swipe_back(1)
  1594. self.unrelated_data += 1
  1595. return
  1596. elif self.search_key == '999曲安奈德益康唑乳膏30g':
  1597. if '999' not in title_info or temp_search_key not in title_info:
  1598. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1599. self.swipe_back(1)
  1600. self.unrelated_data += 1
  1601. return
  1602. elif '30' not in title_info:
  1603. print(f"当前商品名称:{title_info} 不包含30品规")
  1604. self.swipe_back(1)
  1605. self.unrelated_data += 1
  1606. return
  1607. elif self.search_key == '999复方感冒灵颗粒':
  1608. if '999' not in title_info or temp_search_key not in title_info :
  1609. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1610. self.swipe_back(1)
  1611. self.unrelated_data += 1
  1612. return
  1613. elif '14g*15' not in title_info and '14g*9' not in title_info:
  1614. print(f"当前商品名称:{title_info} 不包含14g*15 和 14g*9品规")
  1615. self.swipe_back(1)
  1616. self.unrelated_data += 1
  1617. return
  1618. elif self.search_key == '999养胃舒颗粒10g*6':
  1619. if '999' not in title_info or temp_search_key not in title_info :
  1620. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1621. self.swipe_back(1)
  1622. self.unrelated_data += 1
  1623. return
  1624. elif '10g*6' not in title_info:
  1625. print(f"当前商品名称:{title_info} 不包含10g*6品规")
  1626. self.swipe_back(1)
  1627. self.unrelated_data += 1
  1628. return
  1629. elif self.search_key == '999糠酸莫米松凝胶15':
  1630. if '999' not in title_info or temp_search_key not in title_info :
  1631. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1632. self.swipe_back(1)
  1633. self.unrelated_data += 1
  1634. return
  1635. elif '15' not in title_info:
  1636. print(f"当前商品名称:{title_info} 不包含15品规")
  1637. self.swipe_back(1)
  1638. self.unrelated_data += 1
  1639. return
  1640. elif self.search_key == '999强力枇杷露225ml':
  1641. if '999' not in title_info or temp_search_key not in title_info :
  1642. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1643. self.swipe_back(1)
  1644. self.unrelated_data += 1
  1645. return
  1646. elif '225' not in title_info:
  1647. print(f"当前商品名称:{title_info} 不包含225品规")
  1648. self.swipe_back(1)
  1649. self.unrelated_data += 1
  1650. return
  1651. elif self.search_key == '999强力枇杷露120ml':
  1652. if '999' not in title_info or temp_search_key not in title_info :
  1653. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1654. self.swipe_back(1)
  1655. self.unrelated_data += 1
  1656. return
  1657. elif '120' not in title_info:
  1658. print(f"当前商品名称:{title_info} 不包含120品规")
  1659. self.swipe_back(1)
  1660. self.unrelated_data += 1
  1661. return
  1662. elif self.search_key == '999强力枇杷露150ml':
  1663. if '999' not in title_info or temp_search_key not in title_info :
  1664. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1665. self.swipe_back(1)
  1666. self.unrelated_data += 1
  1667. return
  1668. elif '150' not in title_info:
  1669. print(f"当前商品名称:{title_info} 不包含150品规")
  1670. self.swipe_back(1)
  1671. self.unrelated_data += 1
  1672. return
  1673. elif self.search_key == '999抗病毒口服液10ml*10':
  1674. if '999' not in title_info or temp_search_key not in title_info :
  1675. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1676. self.swipe_back(1)
  1677. self.unrelated_data += 1
  1678. return
  1679. elif '10ml*10' not in title_info:
  1680. print(f"当前商品名称:{title_info} 不包含10ml*10品规")
  1681. self.swipe_back(1)
  1682. self.unrelated_data += 1
  1683. return
  1684. elif self.search_key == '999抗病毒口服液10ml*12':
  1685. if '999' not in title_info or temp_search_key not in title_info :
  1686. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  1687. self.swipe_back(1)
  1688. self.unrelated_data += 1
  1689. return
  1690. elif '10ml*12' not in title_info:
  1691. print(f"当前商品名称:{title_info} 不包含10ml*12品规")
  1692. self.swipe_back(1)
  1693. self.unrelated_data += 1
  1694. return
  1695. elif self.search_key == '999复方板蓝根颗粒15袋':
  1696. if '999' not in title_info or temp_search_key not in title_info :
  1697. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1698. self.swipe_back(1)
  1699. self.unrelated_data += 1
  1700. return
  1701. elif '15袋' not in title_info:
  1702. print(f"当前商品名称:{title_info} 不包含15袋品规")
  1703. self.swipe_back(1)
  1704. self.unrelated_data += 1
  1705. return
  1706. elif self.search_key == '999可调式生理性海水鼻腔喷雾50':
  1707. if '999' not in title_info or temp_search_key not in title_info :
  1708. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1709. self.swipe_back(1)
  1710. self.unrelated_data += 1
  1711. return
  1712. elif self.search_key == '999维生素E.C颗粒9袋':
  1713. if '999' not in title_info or temp_search_key not in title_info :
  1714. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1715. self.swipe_back(1)
  1716. self.unrelated_data += 1
  1717. return
  1718. elif self.search_key == '999复方氨酚烷胺胶囊6粒':
  1719. if '999' not in title_info or temp_search_key not in title_info :
  1720. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1721. self.swipe_back(1)
  1722. self.unrelated_data += 1
  1723. return
  1724. elif '6粒' not in title_info:
  1725. print(f"当前商品名称:{title_info} 不包含6粒品规")
  1726. self.swipe_back(1)
  1727. self.unrelated_data += 1
  1728. return
  1729. elif self.search_key == '999复方板蓝根颗粒15g*15袋':
  1730. if '999' not in title_info or temp_search_key not in title_info :
  1731. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1732. self.swipe_back(1)
  1733. self.unrelated_data += 1
  1734. return
  1735. elif '15g*15' not in title_info:
  1736. print(f"当前商品名称:{title_info} 不包含15g*15品规")
  1737. self.swipe_back(1)
  1738. self.unrelated_data += 1
  1739. return
  1740. elif self.search_key == '999止泻利颗粒15g*8':
  1741. if '999' not in title_info or temp_search_key not in title_info :
  1742. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1743. self.swipe_back(1)
  1744. self.unrelated_data += 1
  1745. return
  1746. elif '15g*8' not in title_info:
  1747. print(f"当前商品名称:{title_info} 不包含15g*8品规")
  1748. self.swipe_back(1)
  1749. self.unrelated_data += 1
  1750. return
  1751. elif self.search_key == '999三蛇胆川贝膏138':
  1752. if '999' not in title_info or temp_search_key not in title_info :
  1753. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1754. self.swipe_back(1)
  1755. self.unrelated_data += 1
  1756. return
  1757. elif '138' not in title_info:
  1758. print(f"当前商品名称:{title_info} 不包含138品规")
  1759. self.swipe_back(1)
  1760. self.unrelated_data += 1
  1761. return
  1762. elif self.search_key == '999强力枇杷露16袋':
  1763. if '999' not in title_info or temp_search_key not in title_info :
  1764. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1765. self.swipe_back(1)
  1766. self.unrelated_data += 1
  1767. return
  1768. elif '16袋' not in title_info:
  1769. print(f"当前商品名称:{title_info} 不包含16袋品规")
  1770. self.swipe_back(1)
  1771. self.unrelated_data += 1
  1772. return
  1773. elif self.search_key == '999复方苦参肠炎康片12片':
  1774. if '999' not in title_info or temp_search_key not in title_info :
  1775. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1776. self.swipe_back(1)
  1777. self.unrelated_data += 1
  1778. return
  1779. elif '12片' not in title_info:
  1780. print(f"当前商品名称:{title_info} 不包含12片品规")
  1781. self.swipe_back(1)
  1782. self.unrelated_data += 1
  1783. return
  1784. elif self.search_key == '999必无忧盐酸特比萘芬乳膏15':
  1785. if temp_search_key not in title_info :
  1786. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1787. self.swipe_back(1)
  1788. self.unrelated_data += 1
  1789. return
  1790. elif ('999' not in title_info) and ('必无忧' not in title_info):
  1791. print(f"当前商品名称:{title_info} 不包含关键字:999或 必无忧")
  1792. self.swipe_back(1)
  1793. self.unrelated_data += 1
  1794. return
  1795. elif '15' not in title_info:
  1796. print(f"当前商品名称:{title_info} 不包含15品规")
  1797. self.swipe_back(1)
  1798. self.unrelated_data += 1
  1799. return
  1800. elif self.search_key == '999速复康布洛芬缓释胶囊':
  1801. if temp_search_key not in title_info :
  1802. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1803. self.swipe_back(1)
  1804. self.unrelated_data += 1
  1805. return
  1806. elif ('999' not in title_info) and ('速复康' not in title_info):
  1807. print(f"当前商品名称:{title_info} 不包含关键字:999或 速复康")
  1808. self.swipe_back(1)
  1809. self.unrelated_data += 1
  1810. return
  1811. elif self.search_key == '999维生素C咀嚼片':
  1812. if '999' not in title_info or temp_search_key not in title_info :
  1813. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1814. self.swipe_back(1)
  1815. self.unrelated_data += 1
  1816. return
  1817. elif '80' not in title_info:
  1818. print(f"当前商品名称:{title_info} 不包含80品规")
  1819. self.swipe_back(1)
  1820. self.unrelated_data += 1
  1821. return
  1822. elif self.search_key == '999精氨酸布洛芬颗粒':
  1823. if '999' not in title_info or temp_search_key not in title_info :
  1824. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1825. self.swipe_back(1)
  1826. self.unrelated_data += 1
  1827. return
  1828. elif '9' not in title_info:
  1829. print(f"当前商品名称:{title_info} 不包含9品规")
  1830. self.swipe_back(1)
  1831. self.unrelated_data += 1
  1832. return
  1833. elif self.search_key == '999阿奇霉素片':
  1834. if '999' not in title_info or temp_search_key not in title_info :
  1835. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1836. self.swipe_back(1)
  1837. self.unrelated_data += 1
  1838. return
  1839. elif '0.25g*6' not in title_info:
  1840. print(f"当前商品名称:{title_info} 不包含0.25g*6品规")
  1841. self.swipe_back(1)
  1842. self.unrelated_data += 1
  1843. return
  1844. elif self.search_key == '999感冒清热颗粒6g*10':
  1845. if '999' not in title_info or temp_search_key not in title_info :
  1846. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1847. self.swipe_back(1)
  1848. self.unrelated_data += 1
  1849. return
  1850. elif '6g*10' not in title_info:
  1851. print(f"当前商品名称:{title_info} 不包含6g*10品规")
  1852. self.swipe_back(1)
  1853. self.unrelated_data += 1
  1854. return
  1855. elif self.search_key == '999选平硝酸咪康唑乳膏20g':
  1856. if temp_search_key not in title_info :
  1857. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1858. self.swipe_back(1)
  1859. self.unrelated_data += 1
  1860. return
  1861. elif ('999' not in title_info) and ('选平' not in title_info):
  1862. print(f"当前商品名称:{title_info} 不包含关键字:999或 选平")
  1863. self.swipe_back(1)
  1864. self.unrelated_data += 1
  1865. return
  1866. elif '20g' not in title_info:
  1867. print(f"当前商品名称:{title_info} 不包含20g品规")
  1868. self.swipe_back(1)
  1869. self.unrelated_data += 1
  1870. return
  1871. elif self.search_key == '999糠酸莫米松乳膏10g':
  1872. if '999' not in title_info or temp_search_key not in title_info :
  1873. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1874. self.swipe_back(1)
  1875. self.unrelated_data += 1
  1876. return
  1877. elif '10g' not in title_info:
  1878. print(f"当前商品名称:{title_info} 不包含10g品规")
  1879. self.swipe_back(1)
  1880. self.unrelated_data += 1
  1881. return
  1882. elif self.search_key == '999补脾益肠丸':
  1883. if temp_search_key not in title_info :
  1884. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1885. self.swipe_back(1)
  1886. self.unrelated_data += 1
  1887. return
  1888. elif ('999' not in title_info) and ('三九' not in title_info):
  1889. print(f"当前商品名称:{title_info} 不包含关键字:999或 三九")
  1890. self.swipe_back(1)
  1891. self.unrelated_data += 1
  1892. return
  1893. elif self.search_key == '999壮骨关节丸6g*20':
  1894. if '999' not in title_info or temp_search_key not in title_info :
  1895. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1896. self.swipe_back(1)
  1897. self.unrelated_data += 1
  1898. return
  1899. elif '6g*20' not in title_info:
  1900. print(f"当前商品名称:{title_info} 不包含6g*20品规")
  1901. self.swipe_back(1)
  1902. self.unrelated_data += 1
  1903. return
  1904. elif self.search_key == '999正天丸6g*15':
  1905. if '999' not in title_info or temp_search_key not in title_info :
  1906. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1907. self.swipe_back(1)
  1908. self.unrelated_data += 1
  1909. return
  1910. elif '6g*15' not in title_info:
  1911. print(f"当前商品名称:{title_info} 不包含6g*15品规")
  1912. self.swipe_back(1)
  1913. self.unrelated_data += 1
  1914. return
  1915. elif self.search_key == '999藿香正气合剂10ml*6':
  1916. if '999' not in title_info or temp_search_key not in title_info :
  1917. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1918. self.swipe_back(1)
  1919. self.unrelated_data += 1
  1920. return
  1921. elif '10ml*6' not in title_info:
  1922. print(f"当前商品名称:{title_info} 不包含10ml*6品规")
  1923. self.swipe_back(1)
  1924. self.unrelated_data += 1
  1925. return
  1926. elif self.search_key == '999藿香正气合剂10ml*10':
  1927. if '999' not in title_info or temp_search_key not in title_info :
  1928. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1929. self.swipe_back(1)
  1930. self.unrelated_data += 1
  1931. return
  1932. elif '10ml*10' not in title_info:
  1933. print(f"当前商品名称:{title_info} 不包含10ml*10品规")
  1934. self.swipe_back(1)
  1935. self.unrelated_data += 1
  1936. return
  1937. elif self.search_key == '999小儿止咳糖浆120':
  1938. if '999' not in title_info or temp_search_key not in title_info :
  1939. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1940. self.swipe_back(1)
  1941. self.unrelated_data += 1
  1942. return
  1943. elif '120' not in title_info:
  1944. print(f"当前商品名称:{title_info} 不包含120品规")
  1945. self.swipe_back(1)
  1946. self.unrelated_data += 1
  1947. return
  1948. elif self.search_key == '999小儿止咳糖浆225':
  1949. if '999' not in title_info or temp_search_key not in title_info :
  1950. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1951. self.swipe_back(1)
  1952. self.unrelated_data += 1
  1953. return
  1954. elif '225' not in title_info:
  1955. print(f"当前商品名称:{title_info} 不包含225品规")
  1956. self.swipe_back(1)
  1957. self.unrelated_data += 1
  1958. return
  1959. elif self.search_key == '999小儿感冒颗粒6g*10':
  1960. if '999' not in title_info or temp_search_key not in title_info :
  1961. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1962. self.swipe_back(1)
  1963. self.unrelated_data += 1
  1964. return
  1965. elif '6g*10' not in title_info:
  1966. print(f"当前商品名称:{title_info} 不包含6g*10品规")
  1967. self.swipe_back(1)
  1968. self.unrelated_data += 1
  1969. return
  1970. elif self.search_key == '999小儿感冒颗粒6g*24':
  1971. if '999' not in title_info or temp_search_key not in title_info :
  1972. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1973. self.swipe_back(1)
  1974. self.unrelated_data += 1
  1975. return
  1976. elif '6g*24' not in title_info:
  1977. print(f"当前商品名称:{title_info} 不包含6g*24品规")
  1978. self.swipe_back(1)
  1979. self.unrelated_data += 1
  1980. return
  1981. elif self.search_key == '999小儿氨酚黄那敏颗粒6g*10袋':
  1982. if '999' not in title_info or temp_search_key not in title_info :
  1983. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1984. self.swipe_back(1)
  1985. self.unrelated_data += 1
  1986. return
  1987. elif '6g*10' not in title_info:
  1988. print(f"当前商品名称:{title_info} 不包含6g*10品规")
  1989. self.swipe_back(1)
  1990. self.unrelated_data += 1
  1991. return
  1992. elif self.search_key == '999小儿氨酚黄那敏颗粒6g*20袋':
  1993. if '999' not in title_info or temp_search_key not in title_info :
  1994. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  1995. self.swipe_back(1)
  1996. self.unrelated_data += 1
  1997. return
  1998. elif '6g*20' not in title_info:
  1999. print(f"当前商品名称:{title_info} 不包含6g*20品规")
  2000. self.swipe_back(1)
  2001. self.unrelated_data += 1
  2002. return
  2003. elif self.search_key == '999感冒灵颗粒':
  2004. if '999' not in title_info or temp_search_key not in title_info :
  2005. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  2006. self.swipe_back(1)
  2007. self.unrelated_data += 1
  2008. return
  2009. elif '10g*9' not in title_info:
  2010. print(f"当前商品名称:{title_info} 不包含10g*9品规")
  2011. self.swipe_back(1)
  2012. self.unrelated_data += 1
  2013. return
  2014. elif self.search_key == '999皮炎平复方醋酸地塞米松乳膏20':
  2015. if temp_search_key not in title_info :
  2016. print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
  2017. self.swipe_back(1)
  2018. self.unrelated_data += 1
  2019. return
  2020. elif ('999' not in title_info) and ('皮炎平' not in title_info):
  2021. print(f"当前商品名称:{title_info} 不包含关键字:999或 皮炎平")
  2022. self.swipe_back(1)
  2023. self.unrelated_data += 1
  2024. return
  2025. elif '20g' not in title_info:
  2026. print(f"当前商品名称:{title_info} 不包含20g品规")
  2027. self.swipe_back(1)
  2028. self.unrelated_data += 1
  2029. return
  2030. elif self.search_key == '999糠酸莫米松凝胶10':
  2031. if '999' not in title_info or temp_search_key not in title_info :
  2032. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2033. self.swipe_back(1)
  2034. self.unrelated_data += 1
  2035. return
  2036. elif '10' not in title_info:
  2037. print(f"当前商品名称:{title_info} 不包含10品规")
  2038. self.swipe_back(1)
  2039. self.unrelated_data += 1
  2040. return
  2041. elif self.search_key == '999板蓝根颗粒10g*20':
  2042. if '999' not in title_info or temp_search_key not in title_info :
  2043. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2044. self.swipe_back(1)
  2045. self.unrelated_data += 1
  2046. return
  2047. elif '10g*20' not in title_info:
  2048. print(f"当前商品名称:{title_info} 不包含10g*20品规")
  2049. self.swipe_back(1)
  2050. self.unrelated_data += 1
  2051. return
  2052. elif self.search_key == '999咽炎片0.26g*12片':
  2053. if '999' not in title_info or temp_search_key not in title_info :
  2054. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2055. self.swipe_back(1)
  2056. self.unrelated_data += 1
  2057. return
  2058. elif '0.26g*12' not in title_info and '0.26g*24' not in title_info:
  2059. print(f"当前商品名称:{title_info} 不包含0.26g*12 和 0.26g*24品规")
  2060. self.swipe_back(1)
  2061. self.unrelated_data += 1
  2062. return
  2063. elif self.search_key == '999感冒清热颗粒12g*18':
  2064. if '999' not in title_info or temp_search_key not in title_info :
  2065. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2066. self.swipe_back(1)
  2067. self.unrelated_data += 1
  2068. return
  2069. elif '12g*18' not in title_info:
  2070. print(f"当前商品名称:{title_info} 不包含12g*18品规")
  2071. self.swipe_back(1)
  2072. self.unrelated_data += 1
  2073. return
  2074. elif self.search_key == '999小儿咽扁颗粒8g*10袋':
  2075. if '999' not in title_info or temp_search_key not in title_info :
  2076. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2077. self.swipe_back(1)
  2078. self.unrelated_data += 1
  2079. return
  2080. elif '8g*10' not in title_info:
  2081. print(f"当前商品名称:{title_info} 不包含8g*10品规")
  2082. self.swipe_back(1)
  2083. self.unrelated_data += 1
  2084. return
  2085. else:
  2086. if '999' not in title_info or temp_search_key not in title_info:
  2087. print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
  2088. self.swipe_back(1)
  2089. self.unrelated_data += 1
  2090. return
  2091. else:
  2092. if self.search_key == '史达功右美沙芬愈创甘油醚糖浆120':
  2093. temp_search_key = self.search_key.replace('史达功', '')
  2094. temp_search_key = temp_search_key.replace('120', '')
  2095. if '史达功' not in title_info or temp_search_key not in title_info:
  2096. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2097. self.swipe_back(1)
  2098. self.unrelated_data += 1
  2099. return
  2100. elif '120' not in title_info:
  2101. print(f"当前商品名称:{title_info} 不包含120品规")
  2102. self.swipe_back(1)
  2103. self.unrelated_data += 1
  2104. return
  2105. elif self.search_key == '三九胃泰养胃舒颗粒8袋':
  2106. temp_search_key = self.search_key.replace('三九胃泰', '')
  2107. temp_search_key = temp_search_key.replace('8袋', '')
  2108. if '三九胃泰' not in title_info or temp_search_key not in title_info:
  2109. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2110. self.swipe_back(1)
  2111. self.unrelated_data += 1
  2112. return
  2113. elif '8袋' not in title_info:
  2114. print(f"当前商品名称:{title_info} 不包含8袋品规")
  2115. self.swipe_back(1)
  2116. self.unrelated_data += 1
  2117. return
  2118. elif self.search_key == '三九复方金银花颗粒10g*8袋':
  2119. temp_search_key = self.search_key.replace('三九', '')
  2120. temp_search_key = temp_search_key.replace('10g*8袋', '')
  2121. if '三九' not in title_info or temp_search_key not in title_info:
  2122. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2123. self.swipe_back(1)
  2124. self.unrelated_data += 1
  2125. return
  2126. elif '10g*8' not in title_info:
  2127. print(f"当前商品名称:{title_info} 不包含10g*8品规")
  2128. self.swipe_back(1)
  2129. self.unrelated_data += 1
  2130. return
  2131. elif self.search_key == '必无忧盐酸特比萘芬喷雾剂30ml':
  2132. temp_search_key = self.search_key.replace('必无忧', '')
  2133. temp_search_key = temp_search_key.replace('30ml', '')
  2134. if '必无忧' not in title_info or temp_search_key not in title_info:
  2135. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2136. self.swipe_back(1)
  2137. self.unrelated_data += 1
  2138. return
  2139. elif '30' not in title_info:
  2140. print(f"当前商品名称:{title_info} 不包含30品规")
  2141. self.swipe_back(1)
  2142. self.unrelated_data += 1
  2143. return
  2144. elif self.search_key == '佳美舒阿奇霉素肠溶胶囊4':
  2145. temp_search_key = self.search_key.replace('佳美舒', '')
  2146. temp_search_key = temp_search_key.replace('4', '')
  2147. if '佳美舒' not in title_info or temp_search_key not in title_info:
  2148. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2149. self.swipe_back(1)
  2150. self.unrelated_data += 1
  2151. return
  2152. elif '4' not in title_info and '8' not in title_info:
  2153. print(f"当前商品名称:{title_info} 不包含4品规或8品规")
  2154. self.swipe_back(1)
  2155. self.unrelated_data += 1
  2156. return
  2157. elif self.search_key == '三九胃泰颗粒20g*10':
  2158. temp_search_key = self.search_key.replace('20g*10', '')
  2159. if temp_search_key not in title_info:
  2160. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2161. self.swipe_back(1)
  2162. self.unrelated_data += 1
  2163. return
  2164. elif '20g*10' not in title_info:
  2165. print(f"当前商品名称:{title_info} 不包含20g*10品规")
  2166. self.swipe_back(1)
  2167. self.unrelated_data += 1
  2168. return
  2169. elif self.search_key == '三九胃泰颗粒2.5g*6':
  2170. temp_search_key = self.search_key.replace('2.5g*6', '')
  2171. if temp_search_key not in title_info:
  2172. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2173. self.swipe_back(1)
  2174. self.unrelated_data += 1
  2175. return
  2176. elif '2.5g*6' not in title_info:
  2177. print(f"当前商品名称:{title_info} 不包含2.5g*6品规")
  2178. self.swipe_back(1)
  2179. self.unrelated_data += 1
  2180. return
  2181. elif self.search_key == '三九胃泰颗粒20g*6袋':
  2182. temp_search_key = self.search_key.replace('20g*6袋', '')
  2183. if temp_search_key not in title_info:
  2184. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2185. self.swipe_back(1)
  2186. self.unrelated_data += 1
  2187. return
  2188. elif '20g*6' not in title_info:
  2189. print(f"当前商品名称:{title_info} 不包含20g*6品规")
  2190. self.swipe_back(1)
  2191. self.unrelated_data += 1
  2192. return
  2193. elif self.search_key == '顺峰酮康他索乳膏':
  2194. temp_search_key = self.search_key.replace('顺峰', '')
  2195. if temp_search_key not in title_info or '顺峰' not in title_info:
  2196. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2197. self.swipe_back(1)
  2198. self.unrelated_data += 1
  2199. return
  2200. elif self.search_key == '速复康磷酸奥司他韦胶囊75mg*10':
  2201. temp_search_key = self.search_key.replace('速复康', '')
  2202. temp_search_key = temp_search_key.replace('75mg*10', '')
  2203. if '佳美舒' not in title_info or temp_search_key not in title_info:
  2204. print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2205. self.swipe_back(1)
  2206. self.unrelated_data += 1
  2207. return
  2208. elif '75mg*10' not in title_info:
  2209. print(f"当前商品名称:{title_info} 不包含75mg*10品规")
  2210. self.swipe_back(1)
  2211. self.unrelated_data += 1
  2212. return
  2213. else:
  2214. if self.search_key not in title_info:
  2215. print(f'药品标题未包含药品关键字:-->{self.search_key}')
  2216. self.swipe_back(1)
  2217. self.unrelated_data += 1
  2218. return
  2219. # temp_search_key = self.search_key
  2220. # if self.search_key == '三九胃泰颗粒':
  2221. # temp_search_key = '三九胃泰' #兼容三九胃泰 温胃舒颗粒
  2222. # elif '999' in self.search_key:
  2223. # temp_search_key = self.search_key.replace('999', '')
  2224. # if '999' in self.search_key:
  2225. # if ('999' not in title_info) or (temp_search_key not in title_info):
  2226. # print(f'药品标题未包含药品关键字:-->{temp_search_key}和未包含999')
  2227. # self.swipe_back(1)
  2228. # self.unrelated_data += 1
  2229. # return
  2230. # else:
  2231. # if temp_search_key not in title_info:
  2232. # print(f'药品标题未包含药品关键字:-->{temp_search_key}')
  2233. # self.swipe_back(1)
  2234. # self.unrelated_data += 1
  2235. # return
  2236. else:
  2237. print('标题获取为空')
  2238. self.swipe_back(1)
  2239. return
  2240. #第一次没有获取到价格
  2241. if not min_price:
  2242. min_price = self.drug_price() # 最低价格 第二次获取
  2243. if not min_price:
  2244. print('提取价格出错,回退到列表页')
  2245. self.swipe_back(1)
  2246. self.unrelated_data += 1
  2247. return
  2248. # 商品链接 分享链接
  2249. product_link = self.get_product_link()
  2250. time.sleep(2)
  2251. if self.search_key == '999小儿止咳糖浆' or self.search_key == '999小儿氨酚黄那敏颗粒' or self.search_key == '999小儿感冒颗粒':
  2252. shop = self.get_shop_name()
  2253. else:
  2254. for i in range(15):
  2255. if self.d(textStartsWith="进店").exists:
  2256. print('开始获取店铺名')
  2257. break
  2258. screen_width = self.d.info['displayWidth']
  2259. screen_height = self.d.info['displayHeight']
  2260. # self.d.swipe(screen_width // 2, screen_height - 400, screen_width // 2, 400, duration=0.2)
  2261. self.d.swipe_ext("up", scale=0.3)
  2262. time.sleep(self.get_sleep_time())
  2263. if self.d(textStartsWith="进店").exists:
  2264. print('可以开始获取店铺名')
  2265. # shop = self.get_shop_name()
  2266. shop = self.get_shop_name()
  2267. if not shop:
  2268. print('当前店铺名称为空')
  2269. self.swipe_back(1)
  2270. self.unrelated_data += 1
  2271. return
  2272. # 爬取日期
  2273. scrape_date = self.get_current_date()
  2274. dup_data = {'search_key': self.search_key, 'min_price': min_price, 'shop': shop, 'scrape_date': scrape_date,
  2275. 'platform': '拼多多'}
  2276. if self.data_is_exists(dup_data):
  2277. print('存在相同数据不入库')
  2278. self.swipe_back(1)
  2279. return
  2280. is_has_instructions = self.has_instructions()
  2281. # print(f'是否有说明书:{is_has_instructions}')
  2282. self.loggerPdd.info(f'是否有说明书:{is_has_instructions}')
  2283. # 生产日期为空
  2284. manufacture_date = ''
  2285. # 执政信息
  2286. # if is_has_enter_shop:
  2287. # license_info = self.get_license_info()
  2288. # business_license_company = license_info["单位名称"]
  2289. # credit_code = license_info['社会信用代码']
  2290. # city_str = license_info['地址']
  2291. # # 先把省份啥的替换掉
  2292. # city_sub_str = re.sub(r'[u4e00-\u9fa5]+省', '', city_str)
  2293. # try:
  2294. # city = re.search(r'[\u4e00-\u9fa5]+?(市|区|县)', city_sub_str).group(0)
  2295. # except:
  2296. # city = city_sub_str
  2297. # try:
  2298. # province = self.city2province[city]
  2299. # except:
  2300. # province = ''
  2301. # self.swipe_back(2)
  2302. # else:
  2303. # business_license_company = ''
  2304. # credit_code = ''
  2305. # city = ''
  2306. # province = ''
  2307. business_license_company = ''
  2308. # credit_code = ''
  2309. credit_code = ext
  2310. city = ''
  2311. province = ''
  2312. # 说明书等信息
  2313. if is_has_instructions:
  2314. try:
  2315. instructions_info = self.safe_exec(self.get_instructions_data)
  2316. # print('说明书相关信息:', instructions_info)
  2317. expiry_date = instructions_info['有效期'].strip('。')
  2318. manufacturer = instructions_info['生产单位'].strip('。')
  2319. approval_number = instructions_info['批准文号'].strip('。')
  2320. specifications = instructions_info['产品规格'].strip('。')
  2321. except Exception as e:
  2322. print(f'获取详情页规格参数出错:{e}')
  2323. self.swipe_back(2)
  2324. return
  2325. else:
  2326. expiry_date = ''
  2327. manufacturer = ''
  2328. approval_number = ''
  2329. specifications = ''
  2330. # if self.search_key == '999小柴胡颗粒':
  2331. # if '10g*9' in specifications or '10克x9' in specifications or '10g*15' in specifications or '10克/袋*9' in specifications:
  2332. # print("111")
  2333. # else:
  2334. # self.swipe_back(1)
  2335. # return
  2336. # elif self.search_key == '':
  2337. # if '10ml*12' in specifications or '10ml*18' in specifications:
  2338. # print(222)
  2339. # else:
  2340. # self.swipe_back(1)
  2341. # return
  2342. self.unrelated_data = 0
  2343. # 商品链接
  2344. # product_link = ''
  2345. # 爬取省份
  2346. scrape_province = '广东' # 这里先默认广东
  2347. # 是否有货
  2348. availability = ''
  2349. save_data = {
  2350. 'product': title_info,
  2351. 'min_price': min_price,
  2352. 'manufacture_date': manufacture_date,
  2353. 'expiry_date': expiry_date,
  2354. 'shop': shop,
  2355. 'business_license_company': business_license_company,
  2356. 'province': province,
  2357. 'city': city,
  2358. 'manufacturer': manufacturer,
  2359. 'specification': specifications,
  2360. 'approval_number': approval_number,
  2361. 'product_link': product_link,
  2362. 'scrape_date': scrape_date,
  2363. 'scrape_province': scrape_province,
  2364. 'availability': availability,
  2365. 'credit_code': credit_code,
  2366. 'platform': '拼多多',
  2367. 'search_key': self.search_key,
  2368. 'number' : 1
  2369. }
  2370. # print(f'待插入数据:{save_data}')
  2371. self.save_to_database(save_data)
  2372. self.swipe_back(1)
  2373. #获取店铺信息start 2025-07-28
  2374. '''
  2375. #不获取店铺信息
  2376. #1、判断店铺名称是否已存在
  2377. shop_is_exists = self.shop_is_exists_database(shop)
  2378. #2、获取店铺资质 是否存在
  2379. # is_has_shop_qualifications = self.has_shop_qualifications()
  2380. if not shop_is_exists :
  2381. print('开始获取店铺信息')
  2382. #点击店铺,点击店铺标题,点击店铺资质
  2383. # license_info = self.get_license_info_ex()
  2384. else:
  2385. #日志中加上店铺名称
  2386. self.loggerPdd.info(f'店铺{shop}信息已存在数据库')
  2387. #获取店铺信息end
  2388. '''
  2389. if self.distinct_target():
  2390. print('已到达搜索列表页')
  2391. else:
  2392. for i in range(2):
  2393. self.swipe_back(1)
  2394. # 最外部有个定位按钮
  2395. if self.distinct_target():
  2396. break
  def main(self, device_id, start_page, end_page, task_id, max_duration_minutes=None, retry_count=0):
      """Scrape the search-result pages ``start_page``..``end_page`` for one task.

      The method stores the task parameters on the instance, marks the task as
      running, connects to the device, starts a ``SpiderMonitor`` and restarts
      the app. It then swipes past the first ``start_page - 1`` pages and, for
      every page in the inclusive range, clicks each product card that lies
      inside a fixed vertical band of the screen and calls
      ``self.integrate_data`` for it. Progress and completion are reported
      through the module-level ``reporter`` whenever ``task_id`` is truthy.

      Args:
          device_id: Passed unchanged to ``self.connect_devices``.
          start_page: First page to scrape (page 1 uses a different list xpath).
          end_page: Last page to scrape, inclusive.
          task_id: Task identifier; a falsy value disables all reporting.
          max_duration_minutes: Optional time budget. It is checked once at the
              start of every page, not while a page is being processed.
          retry_count: Number of restarts already performed. Used by the
              recursive retry when the app fails to return to the list page.

      Returns:
          None. When a restart is triggered, the result of the nested call
          (also None) is returned.

      Raises:
          Exception: Any exception raised inside the scraping section is
              re-raised after the task has been reported as ``failed``.
      """
      # Remember the task parameters on the instance.
      self.task_id = task_id
      self.task_start_page = start_page
      self.task_end_page = end_page
      self.start_time = time.time()

      # Mark the task as running (status 2); a failure here is logged only.
      if self.task_id:
          try:
              self.update_task_status(2)
              self.loggerPdd.info(f"任务 {task_id} 线程启动成功,状态已更新为2")
          except Exception as e:
              self.loggerPdd.error(f"更新任务状态失败: {e}")

      # Report the task start.
      if task_id:
          reporter.start_task(task_id, start_page, end_page)

      # Time budget in seconds (None means unlimited).
      timeout_seconds = max_duration_minutes * 60 if max_duration_minutes else None
      MAX_RETRY = 3  # maximum number of recursive restarts
      spider_no = 0  # products scraped since the last long pause

      self.connect_devices(device_id)
      time.sleep(self.get_sleep_time())

      # Start the global popup monitor.
      self.monitor = SpiderMonitor(self)
      self.monitor.start()

      try:
          # Restart the app and navigate to the search-result list.
          self.restart_app()
          self.safe_exec(self.enter_target_page)

          # Swipe past the pages before start_page.
          if start_page > 1:
              self.loggerPdd.info(f"跳过前 {start_page-1} 页,从第 {start_page} 页开始采集")
              current_page = 1
              while current_page < start_page:
                  # Honour a pause requested by the monitor.
                  if self.monitor.pausing.is_set():
                      self.wait_for_ready(self.monitor)
                  # The list ended before the requested start page.
                  if self.d.xpath('//*[@text="已经到底啦"]').exists:
                      self.loggerPdd.info(f"在第 {current_page} 页已到达底部,无法继续翻页")
                      self.loggerPdd.warning(f"未能到达目标页码 {start_page},实际只到达第 {current_page} 页")
                      if task_id:
                          # Fix: the original passed ``idx`` here, which is not
                          # bound until the scraping loop below and therefore
                          # raised UnboundLocalError.
                          reporter.end_task(
                              task_id=task_id,
                              status='completed',
                              finish_status=1,
                              force_end_page=current_page,
                          )
                      return
                  # Swipe to the next page.
                  self.d.swipe(200, 1400, 200, 300, 0.4)
                  time.sleep(self.get_sleep_time())
                  current_page += 1
                  self.loggerPdd.debug(f"已翻到第 {current_page} 页")
              # The loop only ends normally once start_page has been reached.
              self.loggerPdd.info(f"成功翻到第 {start_page} 页,开始采集")

          for idx in range(start_page, end_page + 1):
              # Stop when the time budget is used up and report "not finished".
              if timeout_seconds and (time.time() - self.start_time) > timeout_seconds:
                  print(f"任务 {task_id} 达到时间限制 {max_duration_minutes} 分钟,停止采集")
                  self.loggerPdd.info(f"任务 {task_id} 达到时间限制 {max_duration_minutes} 分钟,停止采集")
                  if task_id:
                      reporter.end_task(
                          task_id=task_id,
                          status='completed',
                          finish_status=0,  # 0: not finished
                          # current_page is first assigned below; fall back to
                          # start_page if the timeout fires before that.
                          force_end_page=getattr(self, 'current_page', start_page),
                      )
                  return

              print(f'第{idx}页(指定范围: {start_page}-{end_page})')
              self.current_page = idx  # page currently being scraped

              # Report progress.
              if task_id:
                  reporter.update_task_progress(
                      task_id=task_id,
                      actual_end_page=self.current_page,
                  )

              # Long pause after more than 30 scraped products.
              if spider_no > 30:
                  time.sleep(300)
                  spider_no = 0

              # Too many captchas: notify on the device and reset the counter.
              if self.monitor.verification_count >= self.monitor.MAX_VERIFICATION_RETRY:
                  print("频繁遇到验证码,暂停程序")
                  self.d.toast("请处理验证码后点击继续", 30)
                  # NOTE(review): the original comment says this waits for the
                  # user, but the call is just a click at (0, 0) — confirm.
                  self.d.click(0, 0)
                  self.monitor.verification_count = 0

              # Stop after more than 30 consecutive non-matching products.
              if self.unrelated_data > 30:
                  print('连续超过30个不达标的数据则停止采集')
                  # Fix: the log message said 20 while the threshold is 30.
                  self.loggerPdd.info(f"连续超过30个数据不达标,品规:{self.search_key}")
                  if task_id:
                      reporter.end_task(
                          task_id=task_id,
                          status='completed',
                          finish_status=1,  # 1: finished
                          force_end_page=end_page,
                      )
                  # NOTE(review): after this break the end_task call below the
                  # loop runs as well, so the task end is reported twice —
                  # confirm that the reporter tolerates this.
                  break

              # Locate the product cards of the current page.
              if idx == 1:
                  drug_lis = self.safe_exec(self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout').all)
              else:
                  # The root FrameLayout index varies; try 1..5 until one matches.
                  for i in range(1, 6):
                      drug_lis = self.safe_exec(self.d.xpath(
                          f'/hierarchy/android.widget.FrameLayout[{i}]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout').all)
                      if drug_lis:
                          break
              print('数量', len(drug_lis))

              for idd, drug_one in enumerate(drug_lis):
                  print(idd+1, drug_one.info)
                  time.sleep(self.get_sleep_time())
                  top = drug_one.info['bounds']['top']
                  bottom = drug_one.info['bounds']['bottom']
                  # Only click cards that lie fully inside this vertical band.
                  if not (bottom <= 1524 and top >= 258):
                      continue

                  self.safe_exec(drug_one.click)
                  time.sleep(self.get_sleep_time())
                  # Scrape the product detail page.
                  try:
                      self.safe_exec(self.integrate_data)
                      # Check that we are back on the list page.
                      if self.distinct_target():
                          print('回退到列表页', True)
                      elif retry_count < MAX_RETRY:
                          # Stop the current monitor, then restart the whole task.
                          self.monitor.stop()
                          self.monitor.join()
                          return self.main(device_id, start_page, end_page, task_id, max_duration_minutes, retry_count+1)
                      else:
                          # NOTE(review): this exit does not call end_task —
                          # confirm whether the task should be reported here.
                          print("超过最大重试次数,终止程序")
                          return
                      time.sleep(self.get_sleep_time())
                      spider_no += 1
                  except Exception as e:
                      print(f'采集药品详情数据出错:{e}')
                      self.loggerPdd.error(f'采集药品详情数据出错:{e}')
                      if not self.distinct_target():
                          self.swipe_back(1)
                          # Probed twice, as in the original; presumably the
                          # second probe lets the UI settle — TODO confirm.
                          if not self.distinct_target() and not self.distinct_target():
                              print('页面出错,退出采集')
                              return

              # Move to the next page unless this was the last requested one.
              if idx < end_page:
                  if self.d(textStartsWith="抱歉,没有更多商品啦~").exists:
                      self.loggerPdd.info(f'在第 {idx} 页已到达列表最底部')
                      if task_id:
                          reporter.end_task(
                              task_id=task_id,
                              status='completed',
                              finish_status=1,
                              force_end_page=idx,
                          )
                      return
                  print('开始滑动')
                  self.d.drag(200, 1400, 200, 300, 0.4)
                  print('滑动结束')
                  time.sleep(self.get_sleep_time())

          # Scraping finished: report completion.
          # NOTE(review): the original indentation was lost; this block and the
          # print below are assumed to follow the page loop — confirm.
          if task_id:
              reporter.end_task(
                  task_id=task_id,
                  status='completed',
                  finish_status=1,
                  force_end_page=end_page,
              )
          print('开始滑入下一页')
      except Exception as e:
          print(f"采集任务异常: {e}")
          # Report the abnormal end, then let the exception propagate.
          if task_id:
              reporter.end_task(
                  task_id=task_id,
                  status='failed',
                  finish_status=0,  # not finished
                  # Fall back to start_page if no page was started yet, so this
                  # handler cannot raise AttributeError and hide the real error.
                  force_end_page=getattr(self, 'current_page', start_page),
              )
          raise
      finally:
          # Always stop the monitor thread.
          self.monitor.stop()
          self.monitor.join()
  def unitest(self):
      """Ad-hoc manual check; not an automated unit test.

      Runs OCR on the local image ``ceshi1.jpg`` and prints the result, prints
      a sample screenshot file name built from the current time and a random
      hex suffix, sleeps for 100000 seconds, and finally prints the result of
      ``self.data_is_exists`` for a hard-coded sample record.

      Earlier experiments (business-licence screenshot cropping, parsing OCR
      fields, slider-captcha detection) used to live here as commented-out
      code.

      :return: None
      """
      recognised = self.get_ocr_res('ceshi1.jpg')
      print(f'ocr_res:{recognised}')

      # File name pattern: instructionscreenshot1-<HH-MM-SS>-<8 hex chars>.png
      stamp = datetime.datetime.now().strftime("%H-%M-%S")
      suffix = secrets.token_hex(4)  # 4 random bytes -> 8 hex characters
      print(f'instructionscreenshot1-{stamp}-{suffix}.png')

      # Blocks for a very long time; the duplicate check below only runs after it.
      time.sleep(100000)

      sample_record = dict(
          product='云南白药 参苓健脾胃颗粒10袋 补脾健胃利湿止泻 脾胃虚弱 饮食不消 或泻或吐 形瘦色萎 神疲乏力 5盒装(补脾健胃)',
          min_price=85,
          shop='堂鹭北大药房旗舰店',
          scrape_date='2025-03-19',
      )
      print(self.data_is_exists(sample_record))
  2760. #pdd
  2761. '''
  2762. def main():
  2763. # search_key = '999板蓝根颗粒10g*20' # 参苓健脾胃颗粒 香砂平胃颗粒 舒肝颗粒 清肺化痰丸
  2764. keys_list = [
  2765. #999感冒清热颗粒、 三九胃泰颗粒
  2766. #暂时不需要的:
  2767. # '999小儿止咳糖浆'
  2768. # '999小儿氨酚黄那敏颗粒'
  2769. # '999小儿感冒颗粒'
  2770. # '999抗病毒口服液10ml*6支'
  2771. # '今维多赐多康牌蛋白粉',
  2772. # '必无忧盐酸特比萘芬喷雾剂30ml'
  2773. # '999冰连清咽'
  2774. # '999复方苦参肠炎康片12片'
  2775. # '999强力枇杷露16袋'
  2776. # '999三蛇胆川贝膏138'
  2777. # '999维生素E.C颗粒9袋'
  2778. # '三九胃泰养胃舒颗粒8袋'
  2779. # '999止泻利颗粒15g*8'
  2780. # '史达功右美沙芬愈创甘油醚糖浆120'
  2781. # '999复方氨酚烷胺胶囊6粒'
  2782. # '999可调式生理性海水鼻腔喷雾50'
  2783. # '999小儿止咳糖浆120' #不低于19.8
  2784. # '999小儿止咳糖浆225' #禁止挂网
  2785. # '999小儿感冒颗粒6g*10' #不低于24.9
  2786. # '999小儿感冒颗粒6g*24' #禁止挂网
  2787. # '999小儿氨酚黄那敏颗粒6g*10袋' #不低于15.8
  2788. # '999小儿氨酚黄那敏颗粒6g*20袋' #禁止挂网
  2789. # '999小儿咽扁颗粒8g*10袋' #仅限999官旗店
  2790. # '999阿奇霉素片'
  2791. #2025-08-01最新 其中 藿香正气合剂两种规格 10支和6支 抗病毒口服液 12支和18支 蒲地蓝 24片 36片和44片 枇杷露225ml 小柴胡颗粒9袋和15袋 养胃舒 6袋 复方感冒灵颗粒15袋,
  2792. #曲安奈德益康唑乳膏 30g 葡萄糖酸锌口服溶液 12支 18支 24支和30支,
  2793. # 1、999止泻利颗粒15g*8 没有数据 2、三九胃泰养胃舒颗粒8袋 没有数据 3、999三蛇胆川贝膏138 没有数据 4、999强力枇杷露16袋 没有数据 5、999复方苦参肠炎康片12片 6、999冰连清咽 没有数据
  2794. # '999藿香正气合剂'
  2795. # '999藿香正气合剂10ml*6',
  2796. # '999藿香正气合剂10ml*10',
  2797. # '999糠酸莫米松凝胶15',
  2798. # '999抗病毒口服液',
  2799. # '999抗病毒口服液10ml*10'
  2800. # '999抗病毒口服液10ml*12'
  2801. # '999蒲地蓝消炎片',
  2802. # '999强力枇杷露225ml',
  2803. # '999小柴胡颗粒',
  2804. # '999养胃舒颗粒10g*6',
  2805. # '999复方感冒灵颗粒',
  2806. # '999黄芪精',
  2807. # '999曲安奈德益康唑乳膏30g',
  2808. # '999葡萄糖酸锌口服溶液',
  2809. # '佳美舒阿奇霉素肠溶胶囊4'
  2810. # '三九复方金银花颗粒10g*8袋'
  2811. # '999必无忧盐酸特比萘芬乳膏15'
  2812. # '999复方板蓝根颗粒15g*15袋'
  2813. # '999速复康布洛芬缓释胶囊'
  2814. # '999维生素C咀嚼片'
  2815. # '999精氨酸布洛芬颗粒'
  2816. # '999强力枇杷露120ml'
  2817. # '999强力枇杷露150ml'
  2818. # '999强力枇杷露' #同时支持120,150和225ml
  2819. #OTC
  2820. # '999银菊清咽颗粒' #没有数据
  2821. # '999感冒清热颗粒6g*10'
  2822. # '999选平硝酸咪康唑乳膏20g'
  2823. # '999糠酸莫米松乳膏10g'
  2824. # '999表虚感冒颗粒' #没有数据
  2825. # '999补脾益肠丸'
  2826. # '999壮骨关节胶囊'
  2827. # '999壮骨关节丸6g*20'
  2828. # '999正天丸6g*15'
  2829. # '999正天胶囊'
  2830. # '三九胃泰胶囊'
  2831. # '三九胃泰颗粒20g*10'
  2832. # '三九胃泰颗粒2.5g*6'
  2833. #10.31 new add
  2834. '999感冒灵颗粒' #不低于15.5
  2835. # '999皮炎平复方醋酸地塞米松乳膏20' #不低于12.5
  2836. # '三九胃泰颗粒20g*6袋' #不低于13.5
  2837. # '顺峰酮康他索乳膏' #包含10g和20g两个规格 10g 不低于7.5 20g 不低于12.5 '顺峰康王酮康他索乳膏'
  2838. # '999糠酸莫米松凝胶10' #不低于26.9
  2839. # '999板蓝根颗粒10g*20' #不低于26.9
  2840. # '999复方氨酚烷胺胶囊12粒' #不低于17.9 #统一成:999复方氨酚烷胺胶囊
  2841. # '999复方氨酚烷胺胶囊10粒' #禁止挂网 #统一成:999复方氨酚烷胺胶囊
  2842. # '999复方氨酚烷胺胶囊6粒' #禁止挂网 #统一成:999复方氨酚烷胺胶囊
  2843. # '999复方氨酚烷胺胶囊'
  2844. # '999咽炎片0.26g*12片' #不低于13.5 #999咽炎片0.26g*12片*2板改成 999咽炎片0.26g*12片
  2845. # '999感冒灵胶囊' #仅限999官旗店
  2846. # '999荆防颗粒' #美团没有数据 #禁止挂网 拼多多也没数据
  2847. # '999小儿感冒宁颗粒' #禁止挂网 999小儿感冒宁颗粒2.5g*10袋 改成 999小儿感冒宁颗粒
  2848. # '速复康磷酸奥司他韦胶囊75mg*10' #美团没数据 # 禁止挂网 999磷酸奥司他韦胶囊75mg*10 改成 速复康磷酸奥司他韦胶囊75mg*10
  2849. # '史达功右美沙芬愈创甘油醚糖浆120' #仅限999官旗店
  2850. # '999感冒清热颗粒12g*18' #禁止挂网
  2851. ]
  2852. # 设备序列号
  2853. # device_id = 'e2899b34'
  2854. # device_id = '2e58510'
  2855. # device_id = '369dcf96'
  2856. # device_id = 'ea4e4eb8'
  2857. # device_id = 'IZTOWWDQT45D49BU'
  2858. # device_id = 'INXCDAIR75FMMFGU'
  2859. # device_id = 'CMKFUSSG99ROR489'
  2860. # device_id = '95b2c764'
  2861. # device_id = 'UCQGF6CQFMU8WKHI'
  2862. device_id = 'U8ONIJJJS4CELVD6'
  2863. # device_id = 'OVFETO8PCY45E6A6'
  2864. # device_id = 'IRLZAAQCDMHYWKTS'
  2865. # device_id = 'DEZXWKUC7DJBLVPJ'
  2866. # device_id = 'U47HZDRG8XJBBURW'
  2867. # device_id = 'WWRO9LTGG6KFGQCM'
  2868. # device_id = 'GQIRKB7LVOONM7VW'
  2869. # device_id = 'ZDQWUSSWBEDI896T'
  2870. # device_id = '656DTOPRZDEALZ5X'
  2871. # device_id = 'N7ZXBITOSOGMYXQS'
  2872. # device_id = '1462a51f'
  2873. # device_id = '4TZDUGTOAIFMJVGU'
  2874. # device_id = 'GIOFIBRKZTUGJJAE'
  2875. # device_id = 'fcb3c749'
  2876. cycle_no = 0 # 轮次计数
  2877. while True:
  2878. cycle_no += 1
  2879. logging.info(f'========== 第 {cycle_no} 轮采集开始 ==========')
  2880. for idx, key in enumerate(keys_list, 1):
  2881. logging.info(f'[{idx}/{len(keys_list)}] 开始采集关键字:{key}')
  2882. try:
  2883. # mt = MT(key) # 用当前关键字实例化
  2884. # mt.main(device_id) # 执行一次完整采集
  2885. pdd = PDD(key, device_id)
  2886. pdd.main(device_id)
  2887. logging.info(f'关键字 {key} 本轮采集完成')
  2888. except Exception as e:
  2889. # 发生异常直接跳过该关键字,继续下一轮
  2890. logging.exception(f'关键字 {key} 采集异常:{e}')
  2891. finally:
  2892. # 关闭当前 MT 实例资源(如有需要)
  2893. if hasattr(pdd, 'close'):
  2894. pdd.close()
  2895. # logging.info('本轮全部关键字采集完成,等待 2 小时后下一轮...')
  2896. # time.sleep(1 * 3600) # 2 小时 = 7200 秒
  2897. # pdd = PDD(search_key, device_id)
  2898. # pdd.unitest()
  2899. # pdd.main('369dcf96')
  2900. # pdd.main(device_id)
  2901. '''
  2902. def get_retrieve_mysql():
  2903. """
  2904. 建立远端连接并返回一个到数据库的连接对象
  2905. """
  2906. import pymysql
  2907. return pymysql.connect(
  2908. host='39.108.116.125', # 修改后的主机
  2909. port=3306, # 添加端口号
  2910. user='drug_retrieve', # 修改后的用户名
  2911. password='Pem287cwM58jNpe2', # 修改后的密码
  2912. db='drug_retrieve', # 修改后的数据库名
  2913. charset='utf8mb4'
  2914. )
  2915. class TimeoutException(Exception):
  2916. pass
  2917. # 如果需要并行处理(提高效率),可以使用线程池:
  2918. def process_tasks_in_parallel(max_workers=12):
  2919. """使用线程池并行处理多个任务""" """使用线程池并行处理多个任务,每个任务最多执行30分钟"""
  2920. from concurrent.futures import ThreadPoolExecutor, as_completed
  2921. import concurrent.futures # ← 新增导入
  2922. retrieve_conn = get_retrieve_mysql()
  2923. cursor = retrieve_conn.cursor()
  2924. query = """
  2925. SELECT id, collect_equipment_id, product_name, start_page, end_page, duration
  2926. FROM retrieve_collect_task_allocate
  2927. WHERE status = 1 AND platform = 3
  2928. """
  2929. cursor.execute(query)
  2930. results = cursor.fetchall()
  2931. print(f"获取到的任务结果={results}")
  2932. if not results:
  2933. print("PDD 没有要采集的品规")
  2934. return
  2935. # 准备任务列表
  2936. tasks = []
  2937. device_map = {}
  2938. for result in results:
  2939. task_id = result[0]
  2940. collect_equipment_id = result[1]
  2941. product_name = result[2]
  2942. start_page = result[3]
  2943. end_page = result[4]
  2944. duration = result[5]
  2945. if collect_equipment_id != 0 and product_name and product_name.strip():
  2946. # 缓存设备查询
  2947. if collect_equipment_id not in device_map:
  2948. device_query = "SELECT device_id FROM retrieve_collect_equipment WHERE id = %s AND status = 0"
  2949. cursor.execute(device_query, (collect_equipment_id,))
  2950. device_result = cursor.fetchone()
  2951. device_map[collect_equipment_id] = device_result[0] if device_result else None
  2952. if device_map[collect_equipment_id]:
  2953. # ↓ 使用数据库中的duration,如果没有设置则用默认值30分钟
  2954. duration_minutes = duration if duration is not None else 30
  2955. tasks.append({
  2956. 'task_id': task_id,
  2957. 'device_id': device_map[collect_equipment_id],
  2958. 'key': product_name.strip(),
  2959. 'start_page': start_page,
  2960. 'end_page': end_page,
  2961. 'duration_minutes': duration_minutes, # 存储执行时间限制(分钟)
  2962. })
  2963. cursor.close()
  2964. retrieve_conn.close()
  2965. if not tasks:
  2966. print("没有有效的采集任务")
  2967. return
  2968. print(f"准备并行处理 {len(tasks)} 个任务")
  2969. def process_single_task(task):
  2970. """处理单个任务的函数"""
  2971. task_start_time = time.time() # ← 记录开始时间
  2972. # start_time = time.time()
  2973. try:
  2974. pdd = PDD(task['key'], task['device_id'])
  2975. # 执行采集,获取采集数量 关键数据:实际采集的数量,实际的页数
  2976. pdd.main(
  2977. device_id = task['device_id'],
  2978. start_page = task['start_page'],
  2979. end_page = task['end_page'],
  2980. task_id = task['task_id'],
  2981. max_duration_minutes = task['duration_minutes'] # 传入时间限制
  2982. )
  2983. return {
  2984. 'task_id': task['task_id'],
  2985. 'success': True,
  2986. 'collected_count': pdd.collected_count,
  2987. 'final_page': pdd.current_page
  2988. }
  2989. except Exception as e:
  2990. print(f"任务 {task['task_id']} 执行异常: {e}")
  2991. return {
  2992. 'task_id': task['task_id'],
  2993. 'success': False,
  2994. 'error': str(e)
  2995. }
  2996. finally:
  2997. if 'mt' in locals() and hasattr(pdd, 'close'):
  2998. try:
  2999. pdd.close()
  3000. except:
  3001. pass
  3002. # 使用线程池并行执行
  3003. successful_tasks = 0
  3004. failed_tasks = 0
  3005. # total_execution_time = 0 # 初始化总执行时间变量
  3006. with ThreadPoolExecutor(max_workers=max_workers) as executor:
  3007. # 提交所有任务
  3008. future_to_task = {
  3009. executor.submit(process_single_task, task): task
  3010. for task in tasks
  3011. }
  3012. # 处理完成的任务
  3013. for future in as_completed(future_to_task):
  3014. task = future_to_task[future]
  3015. try:
  3016. task_timeout = (task['duration_minutes'] + 5) * 60 # 加5分钟缓冲
  3017. result = future.result(timeout=task_timeout) # 使用动态超时时间
  3018. if result['success']:
  3019. successful_tasks += 1
  3020. print(f"任务 {result['task_id']}: 完成,采集 {result['collected_count']} 条数据")
  3021. else:
  3022. failed_tasks += 1
  3023. print(f"任务 {result['task_id']}: 失败,错误: {result['error']}")
  3024. except concurrent.futures.TimeoutError: # ← 捕获超时异常
  3025. failed_tasks += 1
  3026. timeout_tasks += 1
  3027. print(f"任务 {task['task_id']}: 超时(限制 {task['duration_minutes']} 分钟)")
  3028. # ↓ 超时后上报数据
  3029. if task['task_id']:
  3030. # 这里需要调用上报,但reporter可能没有这个任务的数据
  3031. # 更好的方式是在MT.main中已经上报了
  3032. pass
  3033. except Exception as e:
  3034. failed_tasks += 1
  3035. print(f"任务 {task['task_id']}: 执行异常 {e}")
  3036. # if (successful_tasks + failed_tasks) > 0:
  3037. # avg_time = total_execution_time / (successful_tasks + failed_tasks)
  3038. # avg_minutes = avg_time / 60
  3039. # else:
  3040. # avg_minutes = 0
  3041. # total_minutes = total_execution_time / 60
  3042. print(f"\n并行采集完成:")
  3043. print(f"成功: {successful_tasks} 个")
  3044. print(f"失败: {failed_tasks} 个")
  3045. if __name__ == '__main__':
  3046. # process_tasks_in_parallel(max_workers=10) # 可以同时处理10个任务
  3047. def run_collection():
  3048. """执行采集任务"""
  3049. try:
  3050. print(f"【定时任务开始】时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  3051. process_tasks_in_parallel(max_workers=12)
  3052. print(f"【定时任务结束】时间: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  3053. except Exception as e:
  3054. print(f"【定时任务异常】: {e}")
  3055. # 设置定时任务
  3056. schedule.every(130).minutes.do(run_collection)
  3057. # 立即执行一次
  3058. run_collection()
  3059. print("定时任务已设置,每130分钟执行一次采集")
  3060. # 循环执行
  3061. while True:
  3062. schedule.run_pending()
  3063. time.sleep(60) # 每分钟检查一次
  3064. # main()
  3065. # scheduler = BlockingScheduler()
  3066. # scheduler.add_job(main, 'cron', hour=11, minute=1, misfire_grace_time=120)
  3067. # try:
  3068. # scheduler.start()
  3069. # except (KeyboardInterrupt, SystemExit):
  3070. # pass