| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
77727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763
2773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527 |
- import requests
- import base64
- import cv2
- import uiautomator2 as u2
- import time
- import subprocess
- import re
- import random
- import datetime
- import json
- from apscheduler.schedulers.blocking import BlockingScheduler
- from aip import AipOcr
- import numpy as np
- import cv2
- import os
- from config import Config
- import logging
- from logger import setup_logger
- import xml.etree.ElementTree as ET
- import secrets
- import threading
- from collections import deque
- from typing import Dict, Any
- import schedule
- setup_logger("pdd_spider") # 初始化日志
class SpiderMonitor(threading.Thread):
    """Daemon thread that polls the device for pop-ups and handles them.

    Three kinds of pop-up are handled, in this order:

    1. "simple" dialogs, which are dismissed by clicking a matching element;
    2. verification (captcha) dialogs, which pause the main thread through
       the ``pausing`` event until a human has cleared the dialog;
    3. ad overlays, which are dismissed by tapping near the top-right corner.
    """

    # Seconds between two checks while waiting for a human to clear a captcha.
    VERIFICATION_POLL_INTERVAL = 5
    # A captcha seen again within this many seconds is not handled a second time.
    VERIFICATION_COOLDOWN = 30

    def __init__(self, spider_instance):
        """Create the monitor.

        :param spider_instance: object exposing ``d`` (the UI-automation
            device) and ``stop_all()``.
        """
        super().__init__(daemon=True)
        self.spider = spider_instance
        self.running = True
        # Set while a captcha is waiting for manual handling.
        self.pausing = threading.Event()
        self.last_verification_time = 0
        self.verification_count = 0
        self.MAX_VERIFICATION_RETRY = 10
        # Recently clicked keys, used to suppress duplicate clicks.
        self.recent_clicks = deque(maxlen=10)
        self.logger = logging.getLogger("SpiderMonitor")
        # Configurable pop-up rules.
        self.popup_rules = {
            "simple": [
                ('//*[@text="确定"]', "点击确定"),
                ('//*[@text="允许"]', "点击允许"),
                ('//*[@text="关闭"]', "点击关闭"),
                ('//*[@resource-id="com.sankuai.meituan:id/close"]', "关闭按钮"),
                ('//*[@resource-id="com.sankuai.meituan:id/address_center_location_close"]', "关闭按钮"),
                ('//*[@resource-id="com.sankuai.meituan:id/location_close"]', "关闭按钮"),
            ],
            "verification": [
                '//*[contains(@text, "验证")]',
                '//*[contains(@text, "滑块")]',
                '//*[contains(@text, "依次点击")]',
                '//*[contains(@text, "请点击")]',
                # This one needs the slider dragged fully right before a screenshot.
                '//*[contains(@text, "拖动滑块刚")]',
                '//*[contains(@text, "请输入图片中的内容")]',
                '//*[contains(@text, "用最短线连接")]',
                '//*[contains(@text, "请按语序依次点击")]',
                '//*[contains(@text, "请向右滑动滑块")]',
                '//*[contains(@text, "请拖动下方滑块完成拼图")]',
                '//*[contains(@resource-id, "captcha")]'
            ]
        }

    def run(self):
        """Poll for pop-ups until ``stop()`` is called.

        Sleeps 2 s after a handled pop-up, 1 s otherwise, and 3 s after an
        unexpected exception (which is logged, not propagated).
        """
        while self.running:
            try:
                handled = self.check_and_handle_popup()
                time.sleep(2 if handled else 1)
            except Exception as e:
                self.logger.exception("监控线程异常: %s", e)
                time.sleep(3)

    def _is_recent_click(self, xpath):
        """Return True if ``xpath`` was already clicked during this second.

        The key combines the xpath with the current whole second, so only a
        repeat within the same wall-clock second counts as a duplicate.
        NOTE(review): if a longer de-duplication window was intended, the key
        needs to change - confirm with the author.
        """
        key = f"{xpath}_{int(time.time())}"
        if key in self.recent_clicks:
            return True
        self.recent_clicks.append(key)
        return False

    def check_and_handle_popup(self):
        """Handle at most one pop-up currently on screen.

        :return: True if a pop-up was handled, False if nothing was found or
            a captcha was seen inside the cooldown window.
        """
        d = self.spider.d

        # 1. Simple pop-ups: click the matching element.
        for xpath, desc in self.popup_rules["simple"]:
            if d.xpath(xpath).exists and not self._is_recent_click(xpath):
                self.logger.info("检测到弹窗: %s", desc)
                d.xpath(xpath).click()
                return True

        # 2. Verification (captcha) pop-ups: the first matching rule decides.
        for xpath in self.popup_rules["verification"]:
            if d.xpath(xpath).exists:
                return self._handle_verification(d, xpath)

        # 3. Ad pop-ups: tap near the top-right corner.
        if d.xpath('//*[contains(@text, "广告")]').exists:
            width = d.info['displayWidth']
            d.click(width - 50, 50)
            self.logger.info("关闭广告弹窗")
            return True
        return False

    def _handle_verification(self, d, xpath):
        """Pause the main thread until the captcha matched by ``xpath`` is gone.

        :return: False when called again inside the cooldown window, True
            otherwise.
        """
        now = time.time()
        if now - self.last_verification_time < self.VERIFICATION_COOLDOWN:
            return False  # do not re-trigger inside the cooldown window
        self.last_verification_time = now
        self.verification_count += 1
        self.logger.warning("验证码弹窗触发,等待人工处理...")
        if self.verification_count > self.MAX_VERIFICATION_RETRY:
            self.logger.error("验证码重试超限,终止任务")
            self.spider.stop_all()
            return True

        self.pausing.set()  # tell the main thread to pause
        d.toast.show("需要人工处理验证码", 120)
        # Wait for manual handling. Checking self.running lets stop() end the
        # wait; the previous `while True` could never be interrupted and made
        # the code after it unreachable.
        while self.running:
            if not d.xpath(xpath).exists:
                self.logger.info("验证码已处理")
                d.toast.show("验证完成", 2)
                self.pausing.clear()  # release the main thread
                return True
            time.sleep(self.VERIFICATION_POLL_INTERVAL)

        # Monitor was stopped while waiting: nobody else will clear the event.
        self.logger.warning("监控线程已停止,结束验证码等待")
        self.pausing.clear()
        return True

    def stop(self):
        """Ask the polling loop (and any captcha wait) to finish."""
        self.running = False
def get_access_token():
    """Request an OAuth access token from the Baidu AI platform.

    :return: the ``access_token`` string, or None when the request fails or
        the response does not contain a token.
    """
    # SECURITY(review): credentials are hard-coded in source; they should be
    # moved to configuration or the environment and rotated.
    app_key = "tRK2RhyItCSh6BzyT4CNVXQa"
    app_secret = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
    token_url = 'https://aip.baidubce.com/oauth/2.0/token'
    url = f"{token_url}?grant_type=client_credentials&client_id={app_key}&client_secret={app_secret}"
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }
    try:
        # The request is inside the try so that network failures also yield
        # None; the timeout keeps a dead connection from blocking forever.
        response = requests.request("POST", url, headers=headers, data="", timeout=10)
        return response.json()['access_token']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        logging.getLogger(__name__).warning("get_access_token failed: %s", e)
        return None
-
def get_mysql():
    """Open and return a new MySQL connection built from ``Config``.

    Host, port, user, password and database name come from the ``Config``
    class; the connection charset is ``utf8mb4``.
    """
    import pymysql

    settings = {
        "host": Config.DB_HOST,
        "port": Config.DB_PORT,
        "user": Config.DB_USER,
        "password": Config.DB_PASSWORD,
        "db": Config.DB_NAME,
        "charset": 'utf8mb4',
    }
    return pymysql.connect(**settings)
def slide_verify(img_path):
    """Send a captcha image to the recognition service and return its answer.

    The image at ``img_path`` is base64-encoded and POSTed as JSON. For a
    slider captcha the answer is the distance the slider has to move.

    :param img_path: path of the captcha screenshot.
    :return: the ``data.data`` field of the response when recognition
        succeeded, otherwise None.
    """
    with open(img_path, 'rb') as f:
        image_b64 = base64.b64encode(f.read()).decode()  # base64 text of the image bytes
    url = "http://api.jfbym.com/api/YmServer/customApi"
    # The parameter set depends on the recognition type id.
    # SECURITY(review): the API token is hard-coded; move it to configuration.
    data = {
        "token": "1nDVocTE2mJ0yLEYb2sZJ5uUY2VIEoGTkIpW44X7Kgk",
        "type": "22222",
        "image": image_b64,
    }
    _headers = {
        "Content-Type": "application/json"
    }
    response = requests.request("POST", url, headers=_headers, json=data, timeout=30).json()
    print(response)

    # Default so the failure branches return None; previously `result` was
    # unbound there and the final return raised UnboundLocalError.
    result = None
    if response.get("msg") == "识别成功":
        payload = response.get("data")
        if isinstance(payload, dict):
            result = payload.get("data")
        if result:
            print(result)
        else:
            print("无法获取数据")
    else:
        print("识别未成功")

    return result
class TaskReporter:
    """Collects per-task progress and reports the result when a task ends.

    All access to ``tasks_data`` is guarded by ``lock``.
    """

    def __init__(self):
        self.tasks_data = {}  # task_id -> progress record
        self.lock = threading.Lock()

    def start_task(self, task_id: int, start_page: int, end_page: int):
        """Create (or reset) the progress record for ``task_id``."""
        with self.lock:
            self.tasks_data[task_id] = {
                'task_id': task_id,
                'start_time': int(time.time()),
                'end_time': None,
                'start_page': start_page,
                'end_page': end_page,
                'actual_end_page': start_page,  # page actually reached so far
                'real_count': 0,                # items actually collected
                'status': 'running',            # running, completed, failed
                'finish_status': 0,             # 0: unfinished, 1: finished
            }

    def update_task_progress(self, task_id: int,
                             actual_end_page: int = None,
                             real_count: int = None):
        """Update the reached page and/or collected count of a known task.

        Arguments left as None are not changed; unknown task ids are ignored.
        """
        with self.lock:
            record = self.tasks_data.get(task_id)
            if record is None:
                return
            if actual_end_page is not None:
                record['actual_end_page'] = actual_end_page
            if real_count is not None:
                record['real_count'] = real_count

    def end_task(self, task_id: int, status: str = 'completed',
                 finish_status: int = 0, force_end_page: int = None):
        """Mark a task as ended and send its report.

        :param status: stored on the record; reported as 3 when it equals
            ``'completed'`` and 4 otherwise.
        :param force_end_page: when given, overrides the recorded end page.

        Unknown task ids are ignored.
        """
        with self.lock:
            data = self.tasks_data.get(task_id)
            if data is None:
                return
            data['end_time'] = int(time.time())
            data['status'] = status
            data['finish_status'] = finish_status
            if force_end_page is not None:
                data['actual_end_page'] = force_end_page

            # Snapshot the report while the record cannot change.
            report_data = {
                "collect_task_allocate_id": data['task_id'],
                "status": 3 if data['status'] == 'completed' else 4,
                "finish_status": data['finish_status'],
                'real_count': data['real_count'],
                'start_time': data['start_time'],
                'end_time': data['end_time'],
                'start_page': data['start_page'],
                'end_page': data['actual_end_page']
            }

        # The HTTP call runs outside the lock so a slow or unreachable server
        # (up to the 10 s timeout) does not block other threads' updates.
        self._call_report_api(report_data)

    def _call_report_api(self, data: Dict[str, Any]):
        """POST ``data`` to the result-report endpoint.

        Best effort: failures are printed and never raised.
        """
        try:
            url = 'http://schedule.dfwy.tech/api/collect_equipment_execute/result_report'
            resp = requests.post(url, json=data, timeout=10)

            if resp.status_code == 200:
                print(f"任务 {data['collect_task_allocate_id']} 上报成功")
            else:
                print(f"任务 {data['collect_task_allocate_id']} 上报失败: {resp.status_code}")
        except Exception as e:
            print(f"上报接口调用异常: {e}")


# Global reporter instance.
reporter = TaskReporter()
- class PDD:
def __init__(self, search_key, device_id):
    """Set up OCR, lookup tables and counters for one search on one device.

    :param search_key: keyword to search for.
    :param device_id: identifier of the device this instance drives.
    """
    self.package_name = 'com.xunmeng.pinduoduo'

    # Baidu OCR client.
    # SECURITY(review): credentials are hard-coded; move them to Config.
    self.APP_ID, self.API_KEY, self.SECRET_KEY = (
        '116857964',
        '1gAzACJOAr7BeILKqkqPOETh',
        'ZNArANb9GwJYgLKg4EfYhukKBfPdl1n3',
    )
    self.client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)

    # City name -> province name.
    self.city2province = self.get_city_info()

    # Target tables come from configuration.
    self.table_name = Config.DB_PDD_AUTO_TABLE
    self.shop_table_name = Config.DB_PDD_SHOP_TABLE

    self.loggerPdd = logging.getLogger()
    self.clipboard = ""  # clipboard content starts empty
    self.access_token = get_access_token()

    self.search_key = search_key
    self.unrelated_data = 0  # number of unrelated records
    self.device_id = device_id

    # Collection statistics.
    self.collected_count = 0  # items actually collected
    self.task_id = None       # task id
    self.start_time = None    # task start time
    self.current_page = 0     # current page number
    self.task_start_page = 0  # first page of the task
    self.task_end_page = 0    # last page of the task
def update_task_status(self, status):
    """Write ``status`` and the current time to this task's allocation row.

    Does nothing when no task id is set. Database errors are logged and
    swallowed; cursor and connection are closed if they were opened.
    """
    if not self.task_id:
        return

    connection = None
    db_cursor = None
    try:
        connection = get_retrieve_mysql()
        db_cursor = connection.cursor()
        now = time.time()
        statement = """
            UPDATE retrieve_collect_task_allocate
            SET status = %s, update_time = %s
            WHERE id = %s
        """
        db_cursor.execute(statement, (status, now, self.task_id))
        connection.commit()

        self.loggerPdd.info(f"任务 {self.task_id} 状态更新为 {status}")

    except Exception as e:
        self.loggerPdd.error(f"更新任务状态失败: {e}")
    finally:
        # Close only what was actually opened.
        if db_cursor is not None:
            db_cursor.close()
        if connection is not None:
            connection.close()
def stop_app(self):
    """Stop the target app on the device, then wait 5 seconds."""
    device = self.d
    device.app_stop(self.package_name)
    time.sleep(5)
def start_app(self):
    """Start the target app on the device, then wait 5 seconds."""
    device = self.d
    device.app_start(self.package_name)
    time.sleep(5)
def restart_app(self):
    """Restart the app: stop it, then start it again."""
    for step in (self.stop_app, self.start_app):
        step()
- @staticmethod
- def get_sleep_time():
- return random.randint(1, 3)
- # return random.randint(5, 8)
- @staticmethod
- def get_current_date():
- return datetime.datetime.now().strftime('%Y/%m/%d')
- @staticmethod
- def get_city_info():
- """
- 获取所有的省市数据
- :return:
- """
- file_path = '../kailin_city.json'
- with open(file_path, 'r', encoding='utf-8') as f:
- data = json.load(f)
- province = {province_one["id"]: province_one for province_one in data['province']}
- city2province = dict()
- city = data['city']
- for city_one in city:
- name = city_one['name']
- pid = city_one['pid']
- if len(str(pid)) > 2:
- pid = int(re.match('^\d{2}', str(pid)).group())
- city2province[name] = province[pid]['name']
- return city2province
def remove_watermark(self, img_path):
    """Wash out the watermark of an image and re-encode it.

    Every pixel value is scaled and shifted, then clipped to 0..255, which
    is meant to turn the watermark into white background. The result is
    encoded again in the format given by the file extension.

    :param img_path: path of the image.
    :return: the encoded image buffer produced by ``cv2.imencode``.
    """
    gain = 1.4057577998008846
    shift = 38.33089999653017

    raw_bytes = np.fromfile(img_path, dtype=np.uint8)
    img = cv2.imdecode(raw_bytes, -1)
    extension = os.path.splitext(img_path)[1]
    adjusted = gain * img - shift
    cleaned = np.clip(adjusted, 0, 255).astype(np.uint8)
    _, img_binary = cv2.imencode(extension, cleaned)
    return img_binary
def human_slide(self, start_x, start_y, end_x, end_y, total_steps=50):
    """Move an already-pressed touch along a non-linear, jittered path.

    The path is computed first, then replayed with ``touch.move`` and ended
    with ``touch.up``. No ``touch.down`` is sent here.
    NOTE(review): presumably the caller has already pressed down - confirm.

    :param start_x, start_y: start of the gesture.
    :param end_x, end_y: end of the gesture.
    :param total_steps: number of path points to generate (default 50).
    :raises ValueError: if ``total_steps`` is smaller than 1.
    """
    if total_steps < 1:
        raise ValueError("total_steps must be at least 1")

    points = []
    delays = []
    distance_x = end_x - start_x
    distance_y = end_y - start_y

    previous_x = start_x  # last x emitted, used to keep x from going backwards
    for i in range(total_steps):
        # Non-linear progress curve.
        ratio = i / total_steps
        if ratio < 0.3:
            progress = 0.5 * (ratio / 0.3) ** 2
        elif ratio < 0.7:
            progress = 0.5 + (ratio - 0.3) * 1.25
        else:
            progress = 0.9 + 0.5 * ((ratio - 0.7) / 0.3) ** 0.5

        # Small random jitter; randint's upper bound is exclusive, so -1 or 0.
        offset_x = np.random.randint(-1, 1)
        offset_y = np.random.randint(-1, 1)

        # Progress is capped at 99% of the distance.
        x = start_x + distance_x * min(progress, 0.99) + offset_x
        y = start_y + distance_y * min(progress, 0.99) + offset_y

        # Keep x from moving backwards and from overshooting the target.
        if x < previous_x and x < end_x:
            x = previous_x + 1
        if x > end_x:
            x = end_x

        previous_x = x
        points.append((x, y))
        # Per-step pause, largest around the middle of the gesture.
        delays.append(0.002 + 0.01 * (1 - abs(0.5 - ratio)))

    print(f"points: {points}")
    self.loggerPdd.info(f"points: {points}")

    # Replay the path. The pause is applied between moves; previously it was
    # slept while the points were being computed, so the moves themselves
    # were sent back-to-back.
    for point, delay in zip(points[1:], delays[1:]):
        self.d.touch.move(point[0], point[1])
        time.sleep(delay)
    self.d.touch.up(points[-1][0], points[-1][1])
def get_shop_name(self):
    """Read the shop name from the current page, or from the shop page.

    The name is first looked for next to the "进店" element. If it is not
    there, the shop entry is clicked, the name is read from the shop page,
    and one "back" is performed.

    :return: the shop name, ``''`` when it cannot be located, or None when
        an exception occurred.
    """
    inline_name_xpath = '//*[@text="进店"]/preceding-sibling::android.view.ViewGroup/android.widget.LinearLayout/android.widget.TextView'
    shop_btn_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]'
    shop_page_name_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.TextView[1]'
    try:
        # Case 1: the name is shown on the current page.
        if self.d.xpath(inline_name_xpath).exists:
            shop_name = self.d.xpath(inline_name_xpath).text
            self.loggerPdd.info(f'1-获取到店铺名:{shop_name}')
            return shop_name

        # Case 4: there is no shop entry to click.
        if not self.d.xpath(shop_btn_xpath).exists:
            shop_name = ''
            self.loggerPdd.info('4-因为shop_btn_xpath不存在,获取到店铺名为空')
            return shop_name

        # Cases 2/3: open the shop page and read the name there.
        self.d.xpath(shop_btn_xpath).click()
        time.sleep(1)
        if self.d.xpath(shop_page_name_xpath).exists:
            shop_name = self.d.xpath(shop_page_name_xpath).text
            self.loggerPdd.info(f'2-获取到店铺名:{shop_name}')
        else:
            shop_name = ''
            self.loggerPdd.info(f'3-获取到店铺名:{shop_name}')

        self.swipe_back(1)  # leave the shop page again
        return shop_name
    except Exception as e:
        print(f'获取店铺名出错:{e}')
        self.loggerPdd.error(f'获取店铺名出错:{e}')
        return None
def get_qualification_number(self):
    """Read the qualification number shown on the current page.

    The element text is expected to start with the label "资质编号:"; the
    label is removed and the rest is returned stripped of whitespace.

    :return: the qualification number, or None if it cannot be read.
    """
    xpath = '//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]'
    try:
        text = self.d.xpath(xpath).text.strip()
        # Remove the label as a prefix. str.strip('资质编号:') treats its
        # argument as a character set and would also eat a trailing "号"
        # that belongs to the number itself.
        for label in ('资质编号:', '资质编号:'):
            if text.startswith(label):
                text = text[len(label):]
                break
        return text.strip()
    except Exception:
        return None
def enter_detail(self):
    """Click the first item of the result list, then pause briefly."""
    first_item = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/recycler"]/android.widget.FrameLayout[1]')
    first_item.click()
    pause = self.get_sleep_time()
    time.sleep(pause)
def save_to_database(self, data):
    """Insert one scraped record and update the collection counters.

    :param data: dict containing a value for every column listed below.
    :raises KeyError: if ``data`` lacks one of the columns.

    Database errors propagate to the caller; the connection and cursor are
    closed in every case.
    """
    print(f'保存数据到数据库:{data}')
    columns = (
        'product', 'min_price', 'manufacture_date', 'expiry_date', 'shop',
        'business_license_company', 'province', 'city', 'manufacturer',
        'specification', 'approval_number', 'product_link', 'scrape_date',
        'scrape_province', 'availability', 'credit_code', 'platform',
        'search_key', 'number',
    )
    # Only the table name (from configuration) is interpolated; all values
    # are passed as query parameters.
    add_sql = f"""
        INSERT INTO {self.table_name}
        ({', '.join(columns)})
        VALUES ({', '.join(['%s'] * len(columns))})
    """
    values = tuple(data[column] for column in columns)

    conn = get_mysql()
    try:
        cur = conn.cursor()
        try:
            cur.execute(add_sql, values)
            conn.commit()
        finally:
            cur.close()
    finally:
        # Previously the connection was never closed, leaking one per row.
        conn.close()
    print("存入数据库成功")

    # Update the collected-item counter and the reported progress.
    self.collected_count += 1
    if self.task_id:
        reporter.update_task_progress(
            task_id=self.task_id,
            real_count=self.collected_count
        )
def swipe_up(self):
    """Swipe up along the vertical centre line of the screen, then pause.

    The swipe runs from 100 px above the bottom edge to 100 px below the
    top edge with a random duration. About 15% of the time a second, short
    upward swipe follows, for lists that get stuck.
    """
    width = self.d.info['displayWidth']
    height = self.d.info['displayHeight']
    centre_x = width // 2
    swipe_seconds = random.uniform(0, 0.3)
    self.d.swipe(centre_x, height - 100, centre_x, 100, duration=swipe_seconds)
    if random.uniform(0, 1) > 0.85:
        # Sometimes the list sticks; nudge it up a little more.
        self.d.swipe_ext("up", 0.1)
    time.sleep(self.get_sleep_time())
def swipe_back(self, no):
    """Press the "back" key ``no`` times, pausing after each press.

    Nothing is pressed when ``self.distinct_target()`` returns a true value.

    :param no: number of times to go back.
    """
    if self.distinct_target():
        return
    for _ in range(no):
        self.d.press('back')
        time.sleep(self.get_sleep_time())
def drug_price(self):
    """Read the price shown after the "¥" sign on the current page.

    :return: the price as a float, or None if it cannot be read or parsed.
    """
    try:
        xpath = '//*[@text="¥"]/following-sibling::android.widget.TextView[1]'
        price_str = self.d.xpath(xpath).text
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        price = float(re.search(r'[\d.]+', price_str).group())
        print(f'获取到价格:{price}')
        return price
    except Exception as e:
        print(f'提取价格出错-->{e}')
        return None

def drug_price_ex(self):
    """
    Open the option (SKU) panel and read the price and the selected option.

    The method taps the last button of the bottom bar, then looks for the
    selection label in one of three known layouts.  If the label contains
    "已选" the price is read directly; if it contains "请选择" an option is
    tapped first and label and price are re-read after a 2 s pause.  Finally
    one "back" step is performed through ``swipe_back(1)``.

    :return: ``(price, ext)``; ``price`` is a float parsed from the text after
        "¥" or ``''`` when no price was found, ``ext`` is the "已选..." label
        text or ``''``.
    """
    price_str = ''  # raw price text read from the panel
    ext = ''        # text of the "已选..." (selected) label
    price = ''

    content = '//*[@resource-id="android:id/content"]'
    bottom_bar = (content + '/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
                  '/android.widget.LinearLayout[1]/android.widget.LinearLayout[2]'
                  '/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]')
    button_xpath_1 = bottom_bar + '/android.view.ViewGroup[last()]'
    button_xpath_2 = bottom_bar + '/android.widget.LinearLayout[1]/android.view.ViewGroup[last()]'

    # Layouts 1 and 2 share one panel root, layout 3 uses another.
    panel_a = (content + '/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]'
               '/android.widget.RelativeLayout[1]')
    panel_b = (content + '/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
               '/android.view.ViewGroup[2]/android.widget.LinearLayout[1]/android.view.ViewGroup[1]'
               '/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]'
               '/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]')
    select_xpath_1 = panel_a + '/android.widget.TextView[last()]'
    select_xpath_2 = panel_a + '/android.widget.RelativeLayout[1]/android.widget.TextView[last()]'
    select_xpath_3 = panel_b + '/android.widget.TextView[last()]'
    select_xpath_3_2 = panel_b + '/android.widget.TextView[last()-1]'
    price_xpath_1 = panel_a + '/android.widget.TextView[1]'
    price_xpath_2 = panel_a + '/android.widget.RelativeLayout[1]/android.widget.TextView[1]'
    price_xpath_3 = panel_b + '//android.widget.TextView[1]'

    option_row = (content + '//android.widget.ScrollView[1]/android.widget.LinearLayout[1]'
                  '/android.support.v7.widget.RecyclerView[1]/android.widget.LinearLayout[last()]'
                  '/android.view.ViewGroup[1]')
    last_option_xpath = option_row + '/android.view.ViewGroup[last()]'
    first_option_xpath = option_row + '/android.view.ViewGroup[1]'
    recycler_view_xpath = (content + '//android.support.v7.widget.RecyclerView[1]'
                           '/android.widget.LinearLayout[1]/android.widget.LinearLayout[last()]'
                           '/android.view.ViewGroup[1]/android.view.ViewGroup[1]')

    def read_price(price_xpath, found_label, missing_msg):
        """Return the price text at price_xpath, or '' when it is absent."""
        if self.d.xpath(price_xpath).exists:
            text = self.d.xpath(price_xpath).text
            print(f"{found_label}={text}")
            return text
        print(missing_msg)
        return ''

    def pick_option(option_xpath, select_xpath, price_xpath, tag, found_label, missing_msg):
        """Tap an option, wait for the refresh and return (price_text, label)."""
        # The call parentheses matter: a bare ``.click`` never taps anything.
        self.d.xpath(option_xpath).click()
        time.sleep(2)  # the price may refresh after an option is chosen
        if self.d.xpath(select_xpath).exists:
            text2 = self.d.xpath(select_xpath).text or ''
            if '已选' in text2:
                print(f"{tag}--已选择2:text2={text2}")
                return read_price(price_xpath, found_label, missing_msg), text2
        return '', ''

    # Open the option panel.
    if self.d.xpath(button_xpath_1).exists:
        self.d.xpath(button_xpath_1).click()
    elif self.d.xpath(button_xpath_2).exists:
        self.d.xpath(button_xpath_2).click()
    else:
        print("button1 and button_2 all not exist")
        return price, ext

    if self.d.xpath(select_xpath_1).exists:
        text1 = self.d.xpath(select_xpath_1).text or ''
        print(f"select_xpath_1--text1={text1}")
        if '已选' in text1:
            price_str = read_price(price_xpath_1, "select_xpath_1--price_str-1",
                                   "select_xpath_1--price_xpath_1-1 not exist")
            ext = text1
        elif '请选择' in text1:
            # Nothing selected yet: choose the last option of the last row.
            if self.d.xpath(last_option_xpath).exists:
                price_str, ext = pick_option(
                    last_option_xpath, select_xpath_1, price_xpath_1, "select_xpath_1",
                    "select_xpath_1--price_str-2", "select_xpath_1--price_xpath_1-2 not exist")
            else:
                print("select_xpath_1--scroll_xpath_1 not exist")
    elif self.d.xpath(select_xpath_2).exists:
        text1 = self.d.xpath(select_xpath_2).text or ''
        print(f"xpath2--text1={text1}")
        if '已选' in text1:
            ext = text1
            price_str = read_price(price_xpath_2, "select_xpath_2--price_str-2",
                                   "select_xpath_2--price_xpath_2-1 not exist")
        elif '请选择' in text1:
            print('come in here')
            # Nothing selected yet: choose the first option of the last row.
            if self.d.xpath(first_option_xpath).exists:
                print("scroll_xpath_1 exists")
                price_str, ext = pick_option(
                    first_option_xpath, select_xpath_2, price_xpath_2, "select_xpath_2",
                    "select_xpath_2--price_str-2", "select_xpath_2--price_xpath_2-2 not exist")
            else:
                print("scroll_xpath_1 not exists")
        else:
            print("not exist 请选择 or 已选")
    elif self.d.xpath(select_xpath_3).exists:
        text1 = self.d.xpath(select_xpath_3).text or ''
        print(f"xpath3--text1-1={text1}")
        if ('请选择' not in text1) and ('已选' not in text1):
            # In this layout the label may be the second-to-last TextView.
            text1 = self.d.xpath(select_xpath_3_2).text or ''
            print(f"xpath3--text1-2={text1}")
        if '已选' in text1:
            ext = text1
            price_str = read_price(price_xpath_3, "select_xpath_3--price_str-3-3-1",
                                   "select_xpath_3--price_xpath_3-3-1 not exist")
        elif '请选择' in text1:
            print('come in here')
            if self.d.xpath(first_option_xpath).exists:
                print("scroll_xpath_1 exists")
                price_str, ext = pick_option(
                    first_option_xpath, select_xpath_3, price_xpath_3, "select_xpath_3",
                    "select_xpath_3--price_str-3-2", "select_xpath_3--price_xpath_3-3-2 not exist")
            elif self.d.xpath(recycler_view_xpath).exists:
                price_str, ext = pick_option(
                    recycler_view_xpath, select_xpath_3, price_xpath_3, "select_xpath_3",
                    "select_xpath_3--price_str-3-3", "select_xpath_3--price_xpath_3-3-3 not exist")
            else:
                print("scroll_xpath_1 not exists")
        else:
            print(f"xpath3--text1-不包含请选择和已选择")
    else:
        print("select_xpath_1 and select_xpath_2 and select_xpath_3 all not exist")

    if price_str:
        # Only a number directly preceded by "¥" counts as the price.
        match = re.search(r'¥([\d\.]+)', price_str)
        price = float(match.group(1)) if match else ''
        print(f'获取到价格:{price}')

    print(f"ext={ext}")
    self.swipe_back(1)  # close the option panel
    return price, ext
def restart_uiautomator_services(self, device_id):
    """
    Restart the atx-agent (uiautomator) service on a device through adb.

    The agent is stopped and then started again; each adb call is followed
    by a pause of ``self.get_sleep_time()`` seconds.  Output and exit codes of
    the adb commands are captured but not inspected.

    :param device_id: adb serial of the target device
    :return: None
    """
    commands = (
        f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d --stop',
        f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d',
    )
    for command in commands:
        subprocess.run(command, capture_output=True, text=True, shell=True)
        time.sleep(self.get_sleep_time())
-
def connect_devices(self, device_id):
    """
    Connect to a device over USB and restart its uiautomator service.

    The connection is stored in ``self.d``.

    :param device_id: adb serial of the device
    :return: None
    :raises Exception: the original connection/restart error is re-raised
        after being printed.
    """
    try:
        self.d = u2.connect_usb(device_id)
        self.restart_uiautomator_services(device_id)
        print(f'连接到设备:{device_id}')
    except Exception as e:
        print(f'{device_id} 连接错误: {e}')
        # Bare raise keeps the original exception type and traceback;
        # callers catching ``Exception`` are unaffected.
        raise
def get_ocr_res(self, img):
    """
    Run OCR (``self.client.basicGeneral``) on an image after watermark removal.

    :param img: image reference handed to ``self.remove_watermark``; callers
        in this file pass a file path
    :return: the ``words_result`` value of the OCR response (``''`` when the
        key is absent), or None when any step fails
    """
    try:
        image = self.remove_watermark(img)
        res_image = self.client.basicGeneral(image)
        data = res_image.get('words_result', '')
        print(f'百度api返回结果:{data}')
        return data
    except Exception as e:
        # Still best effort, but the failure is reported and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by a bare except.
        print(f'OCR failed for {img}: {e}')
        return None
-
- # def get_ocr_res(self, img):
- # try:
- # #img地址
- # print(f'开始识别图片:{img}')
- # request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
- # # 二进制方式打开图片文件
- # f = open(img, 'rb')
- # img = base64.b64encode(f.read())
- # # img = self.remove_watermark(img)
- # # print(f'图片转base64成功:{img}')
- # params = {"image": img}
- # # access_token = get_access_token()
- # request_url = request_url + "?access_token=" + self.access_token
- # headers = {'content-type': 'application/x-www-form-urlencoded'}
- # response = requests.post(request_url, data=params, headers=headers)
- # print(f'请求百度api成功{response}')
- # if response:
- # res = response.json()
- # print(f'百度api返回结果{res}')
- # new_dic = dict()
- # for ite in res['words_result'].keys():
- # new_dic[ite] = res['words_result'][ite]['words']
- # print('资质数据信息', new_dic)
- # return new_dic
- # else:
- # return None
- # except:
- # return None
def screenshot_the_business_license(self):
    """
    Take a screenshot and save a fixed crop of it for OCR.

    The full screenshot goes to ``screenshot1.png``; rows 480-1420 and
    columns 0-720 of it are written to ``cropped_screenshot.png``.

    :return: None
    """
    full_path = 'screenshot1.png'
    self.d.screenshot(full_path)
    frame = cv2.imread(full_path)
    # Crop box as (left, top, right, bottom).
    left, top, right, bottom = 0, 480, 720, 1420
    cv2.imwrite('cropped_screenshot.png', frame[top:bottom, left:right])
-
def screenshot_the_shop_qualifications(self):
    """
    Save a full screenshot of the current screen to ``screenshot2.png``.

    The previous version also decoded the file and assigned a crop box, but
    never used either; that dead code is removed.  The uncropped file is what
    has_shop_qualifications() passes to OCR.

    :return: None
    """
    screenshot_path = 'screenshot2.png'
    self.d.screenshot(screenshot_path)
-
-
def first_screenshot_the_verify(self):
    """
    Screenshot the verification popup, detect its type and save a crop.

    The full screenshot is written to ``first_screenshot_verify.png`` and
    OCR'd; when any recognised line contains "拖动滑块完成" the event is
    "滑块验证" (slider verification).  Rows 478-722 / columns 118-602 are
    saved to ``first_cropped_verify_screenshot.png`` in either case.

    :return: ``'滑块验证'`` for a slider challenge, otherwise ``''``
    """
    shot_path = 'first_screenshot_verify.png'
    self.d.screenshot(shot_path)
    frame = cv2.imread(shot_path)
    ocr_res = self.get_ocr_res('first_screenshot_verify.png')

    event = ''  # challenge type; only the slider is recognised here
    if ocr_res and any('拖动滑块完成' in item['words'] for item in ocr_res):
        print("滑块验证")
        event = '滑块验证'

    # Both challenge types currently use the same crop box
    # (left, top, right, bottom) = (118, 478, 602, 722).
    cv2.imwrite('first_cropped_verify_screenshot.png', frame[478:722, 118:602])
    return event
def slide_second_screenshot_the_verify(self):
    """
    Screenshot the slider challenge again and save the puzzle-area crop.

    The full screenshot goes to ``slide_second_screenshot_verify.png``; rows
    478-722 / columns 118-602 are written to
    ``second_slide_cropped_verify_screenshot.png``.

    :return: None
    """
    shot_path = 'slide_second_screenshot_verify.png'
    self.d.screenshot(shot_path)
    frame = cv2.imread(shot_path)
    left, top, right, bottom = 118, 478, 602, 722
    puzzle_area = frame[top:bottom, left:right]
    cv2.imwrite('second_slide_cropped_verify_screenshot.png', puzzle_area)
-
def second_screenshot_the_verify(self):
    """
    Screenshot the screen after a verification attempt and judge the outcome.

    The screenshot is saved to ``second_screenshot_verify.png`` and OCR'd.
    Any non-empty OCR result counts as success unless the first line that
    mentions an outcome says "验证不成功".

    :return: ``'验证成功'``, ``'验证不成功'``, or ``''`` when OCR returned
        nothing
    """
    screenshot_verify_path = 'second_screenshot_verify.png'
    self.d.screenshot(screenshot_verify_path)
    # The file is only consumed by OCR; the unused cv2.imread decode is gone.
    ocr_res = self.get_ocr_res(screenshot_verify_path)
    print(f'second_ocr_res:{ocr_res}')

    if not ocr_res:
        return ''
    for item in ocr_res:
        words = item['words']
        if '验证成功' in words:
            return '验证成功'
        if '验证不成功' in words:
            return '验证不成功'
    # NOTE(review): success is the default whenever OCR saw any text,
    # presumably because the popup disappears on success - confirm.
    return '验证成功'
-
def screenshot_business_license(self, shop_name):
    """
    Screenshot the business-license page and store a crop named after the shop.

    The full screenshot is written to ``license_screenshot.png``; rows
    160-1000 / columns 0-720 are saved as ``<shop_name>.png`` in the
    hard-coded screenshot directory.

    :param shop_name: shop name used as the file name of the saved crop
    :return: path of the saved crop
    """
    full_path = 'license_screenshot.png'
    self.d.screenshot(full_path)
    frame = cv2.imread(full_path)
    # Crop box as (left, top, right, bottom).
    left, top, right, bottom = 0, 160, 720, 1000
    license_area = frame[top:bottom, left:right]
    cropped_screenshot_path = 'D:\\work\\dfwy_spider\\drug_data\\pdd\\screenshot\\' + shop_name + '.png'
    cv2.imwrite(cropped_screenshot_path, license_area)
    return cropped_screenshot_path
def drug_slide(self, distance):
    """
    Swipe horizontally by ``distance``, starting from the WebView element.

    The UI hierarchy dump is parsed as XML to check for a
    ``meco.webkit.WebView`` node; when one exists the element is looked up
    again through ``self.d.xpath`` and a swipe of ``distance`` along x is
    issued from the first two values of its ``bounds``.

    :param distance: horizontal swipe length in screen coordinates
    :return: None
    """
    print(f"滑动的distance= {distance}")
    dump = self.d.dump_hierarchy()
    print(f"drug_slide-111:{dump}")
    root = ET.fromstring(dump)
    print("drug_slide-222")
    webview_nodes = root.findall(".//node[@class='meco.webkit.WebView']")
    print("drug_slide-333")

    if not webview_nodes:
        print("未找到 WebView 元素")
        return

    print("找到 WebView 元素:", webview_nodes[0].attrib)
    print("WebView bounds:", webview_nodes[0].attrib['bounds'])

    matches = self.d.xpath('//node[@class="meco.webkit.WebView"]')
    if not matches:
        print("未找到需要拖动的元素")
        return

    # NOTE(review): assumes info['bounds'] is indexable by position - confirm.
    start_x = matches[0].info['bounds'][0]
    start_y = matches[0].info['bounds'][1]
    self.d.swipe(start_x, start_y, start_x + distance, start_y)
    print("拖动成功")
def get_title(self):
    """
    Read the product title from the detail page.

    The title is taken from the ``contentDescription`` of the element with
    resource id ``com.xunmeng.pinduoduo:id/tv_title`` and stripped.

    :return: the title, or None when the element is absent or reading fails
    """
    title_xpath = '//*[@resource-id="com.xunmeng.pinduoduo:id/tv_title"]'
    try:
        print('开始提取标题')
        time.sleep(self.get_sleep_time())
        if not self.d.xpath(title_xpath).exists:
            return None
        title = self.d.xpath(title_xpath).info['contentDescription'].strip()
        print(f'提取到标题:{title}')
        return title
    except Exception as e:
        print(f'获取标题出错:{e}')
        return None
- # 从里面匹配出药品名和规格
- # drugs_name
- # specifications
- # match = re.search(r'([^\d]+)([\d\D]+)', title)
- # match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
- # if match:
- # drugs_name = match.group(1).strip() + match.group(2).strip()
- # specifications = match.group(3).strip()
- # print("药品名:", drugs_name)
- # print("规格:", specifications)
- # print('完整药名:', drugs_name + specifications)
- # return drugs_name, specifications
- # else:
- # print("没有匹配到预期格式")
def enter_shop(self):
    """
    Tap the "店铺" (shop) element to open the shop page, then pause.

    Opening the shop is the first step towards its qualification pages.

    :return: None
    """
    shop_entry = self.d.xpath('//*[@text="店铺"]')
    shop_entry.click()
    pause_seconds = self.get_sleep_time()
    time.sleep(pause_seconds)
def enter_shoper(self):
    """
    Tap the "商家" (merchant) element, then pause.

    :return: None
    """
    merchant_entry = self.d.xpath('//*[@text="商家"]')
    merchant_entry.click()
    pause_seconds = self.get_sleep_time()
    time.sleep(pause_seconds)
def scan_shoper_license(self):
    """
    Tap "查看商家资质" (view merchant qualifications), then pause.

    :return: None
    """
    license_entry = self.d.xpath('//*[@text="查看商家资质"]')
    license_entry.click()
    pause_seconds = self.get_sleep_time()
    time.sleep(pause_seconds)
def data_is_exists(self, data):
    """
    Check whether a scraped record is already stored in ``self.table_name``.

    Rows are matched on search_key, min_price, shop, scrape_date and platform.

    :param data: mapping that holds at least the five fields above
    :return: True when a matching row exists, False when none does, None when
        required fields are missing or the query fails (the reason is printed)
    """
    # 1. Validate the required fields (single pass).
    required_keys = ['search_key', 'min_price', 'shop', 'scrape_date', 'platform']
    missing = [key for key in required_keys if key not in data]
    if missing:
        print(f"缺少必要字段: {', '.join(missing)}")
        return None

    cur = None
    try:
        conn = get_mysql()
        cur = conn.cursor()
        # Only the table name is interpolated; all values go through driver
        # placeholders.  One row is enough to decide existence.
        query_sql = """
            SELECT 1 FROM {}
            WHERE search_key = %s
            AND min_price = %s
            AND shop = %s
            AND scrape_date = %s
            AND platform = %s
            LIMIT 1
        """.format(self.table_name)
        cur.execute(query_sql, tuple(data[key] for key in required_keys))
        return bool(cur.fetchone())
    except Exception as e:
        print(f"MySQL 错误: {str(e)}")
        return None
    finally:
        # Release the cursor, which used to be leaked on every call.
        # NOTE(review): the connection is left open because get_mysql() may
        # return a shared connection - confirm and close it here if not.
        if cur is not None:
            cur.close()
-
- #验证店铺信息是否在数据库中已存在
def shop_is_exists_database(self, shop):
    """
    Check whether a shop already has a row in ``self.shop_table_name``.

    :param shop: shop name matched against the ``shop`` column
    :return: True when a row exists, False when none does, None when the
        query fails (the error is printed)
    """
    cur = None
    try:
        conn = get_mysql()
        cur = conn.cursor()
        query_sql = """
            SELECT 1 FROM {}
            WHERE shop = %s
            LIMIT 1
        """.format(self.shop_table_name)
        # Parameters must be a sequence: ``(shop)`` is just the string itself,
        # a one-element tuple needs the trailing comma.
        cur.execute(query_sql, (shop,))
        return bool(cur.fetchone())
    except Exception as e:
        print(f"MySQL 错误: {str(e)}")
        return None
    finally:
        # Release the cursor, which used to be leaked on every call.
        # NOTE(review): the connection is left open because get_mysql() may
        # return a shared connection - confirm and close it here if not.
        if cur is not None:
            cur.close()
def wait_if_verifying(self, monitor, timeout=120):
    """
    Block while the monitor is handling a verification popup.

    Polls ``monitor.pausing`` once per second and returns as soon as the flag
    is cleared or ``timeout`` seconds have elapsed.

    :param monitor: object exposing a ``pausing`` event
    :param timeout: maximum time to wait, in seconds
    :return: None
    """
    started_at = time.time()
    while True:
        if not monitor.pausing.is_set():
            break
        if not (time.time() - started_at < timeout):
            break
        time.sleep(1)
- # def safe_xpath(self, xpath, timeout=10):
- # """线程安全 xpath 查找"""
- # self.wait_if_verifying(self.monitor)
- # return self.d.xpath(xpath).wait(timeout=timeout)
-
def wait_for_ready(self, monitor, timeout=86400):
    """
    Wait for any running verification to finish before entering a page.

    Polls ``monitor.pausing`` once per second until it is cleared or
    ``timeout`` seconds have passed, then asks the monitor for one more
    popup scan in case a challenge appeared at the last moment.

    :param monitor: object exposing ``pausing`` and ``check_and_handle_popup()``
    :param timeout: maximum time to wait, in seconds
    :return: None
    """
    started_at = time.time()
    while True:
        if not monitor.pausing.is_set():
            break
        if not (time.time() - started_at < timeout):
            break
        time.sleep(1)
    # Extra safety net: scan once more right before continuing.
    monitor.check_and_handle_popup()
def safe_list(self, xpath, monitor):
    """
    Fetch all elements matching ``xpath`` once no verification is pending.

    :param xpath: XPath of the product list items
    :param monitor: verification monitor passed to ``wait_for_ready``
    :return: the result of ``self.d.xpath(xpath).all()``
    """
    self.wait_for_ready(monitor)
    selector = self.d.xpath(xpath)
    return selector.all()
def safe_exec(self, func, *args, **kwargs):
    """
    Call ``func`` only after the verification monitor has released the flow.

    While ``self.monitor.pausing`` is set the method sleeps in one-second
    steps (without a timeout); afterwards ``func(*args, **kwargs)`` runs.

    :param func: callable to execute
    :return: whatever ``func`` returns
    """
    while True:
        if not self.monitor.pausing.is_set():
            break
        time.sleep(1)
    # The actual work.
    outcome = func(*args, **kwargs)
    return outcome
def get_instructions_data(self):
    """
    Collect the product parameters shown on the detail page.

    The page is scrolled (at most 8 steps) until "品牌" or "药品通用名" is
    visible, that entry is tapped, and the label/value TextViews following
    "商品参数" are read in pairs.  When a required field is still missing the
    page is scrolled once more and the parameters are read again.

    :return: dict with the keys "有效期", "生产单位", "批准文号" and
        "产品规格"; fields that were not found map to ``''``
    """
    brand_xpath = '//*[@text="品牌"]'
    generic_name_xpath = '//*[@text="药品通用名"]'
    params_xpath = '//*[starts-with(@text,"商品参数")]/parent::*/parent::*/following-sibling::*/*/*/android.view.ViewGroup//android.widget.TextView'
    required_fields = ('有效期', '生产企业', '批准文号', '药品规格')

    def read_visible_params():
        """Return the visible label -> value pairs of the parameter table."""
        nodes = self.d.xpath(params_xpath).all()
        # zip() drops a trailing label without a value instead of raising
        # IndexError on an odd number of TextViews.
        return {label.text: value.text for label, value in zip(nodes[0::2], nodes[1::2])}

    # The loop does the scrolling itself; do not scroll before entering it.
    for _ in range(8):
        if self.d.xpath(brand_xpath).exists or self.d.xpath(generic_name_xpath).exists:
            self.d.swipe_ext("up", scale=0.1)
            print('开始采集详情数据')
            break
        self.d.swipe_ext("up", scale=0.5)
        time.sleep(self.get_sleep_time())

    # Open the parameter list through whichever entry is present.
    if self.d.xpath(brand_xpath).exists:
        self.d.xpath(brand_xpath).click()
    else:
        self.d.xpath(generic_name_xpath).click()
    time.sleep(self.get_sleep_time())

    attr = read_visible_params()
    # Scroll again only when a required field is still missing.  The old test
    # asked whether every collected key was a required one, which is false
    # whenever any other parameter is listed.
    if not all(field in attr for field in required_fields):
        self.d.swipe_ext("up", 0.4)
        time.sleep(self.get_sleep_time())
        attr.update(read_visible_params())
    print(f'当前说明书规格参数:{attr}')

    res_data = {
        "有效期": attr.get('有效期', ''),
        "生产单位": attr.get('生产企业', ''),
        "批准文号": attr.get('批准文号', ''),
        "产品规格": attr.get('药品规格', ''),
    }
    print(f'当前规格参数字典数据:{res_data}')
    return res_data
def has_instructions(self):
    """
    Tell whether the product page has a "商品详情" (product details) section.

    Up to 12 attempts are made; each waits 0.5 s, checks for the element and,
    if it is not there yet, scrolls up a little.  Products without this
    section cannot be scraped in detail.

    :return: True as soon as the section is found, False after 12 misses
    """
    detail_xpath = '//*[@text="商品详情"]'
    for _ in range(12):
        time.sleep(0.5)
        if self.d.xpath(detail_xpath).exists:
            return True
        self.d.swipe_ext("up", 0.3)
    return False
- # time.sleep(self.get_sleep_time())
- # xpath = '//*[@text="商品详情"]'
- # is_has_instructions = self.d.xpath(xpath).exists
- # return
-
def has_shop_qualifications(self):
    """
    Find a "查看全部" entry, open the image below it and OCR the screen.

    Up to three attempts are made, scrolling slightly between them.  When
    the entry is found (the second match is preferred when there are several)
    and lies below y=500 it is dragged up to y=500 and located again.  The
    point 80 units below its centre is tapped, a screenshot is saved and OCR'd.

    :return: True once a "查看全部" element was found and processed, False
        when none showed up within three attempts
    """
    xpath = '//*[@text="查看全部"]'
    max_attempts = 3

    def locate_centre(count_prefix, bounds_label):
        """Return the centre (x, y) of the preferred "查看全部" match."""
        matches = self.d.xpath(xpath).all()
        print(f"{count_prefix}页面上共有 {len(matches)} 个 '查看全部' 元素")
        if len(matches) >= 2:
            bounds = matches[1].bounds
        else:
            bounds = self.d.xpath(xpath).get().bounds
        print(f'{bounds_label}:{bounds}')
        x1, y1, x2, y2 = bounds  # top-left and bottom-right corners
        return (x1 + x2) / 2, (y1 + y2) / 2

    for _ in range(max_attempts):
        time.sleep(0.5)
        if self.d.xpath(xpath).exists:
            element_x, element_y = locate_centre('', 'bounds')
            if element_y > 500:
                self.d.swipe(element_x, element_y, element_x, 500, 1)
                # Re-query after the drag: the earlier element list is stale,
                # so its bounds must not be reused.
                element_x, element_y = locate_centre('第二次', 'bounds_ex')

            time.sleep(self.get_sleep_time())
            target_x = element_x
            target_y = element_y + 80
            print(f'目标坐标:{target_x}, {target_y}')
            # Tap the image below the entry.
            self.d.click(target_x, target_y)
            time.sleep(self.get_sleep_time())
            # Capture and OCR the opened image.
            self.screenshot_the_shop_qualifications()
            ocr_res = self.get_ocr_res('screenshot2.png')
            print(f'ocr_res:{ocr_res}')
            # A leftover debug time.sleep(100000) used to block here for more
            # than a day before returning; it has been removed.
            return True
        self.d.swipe_ext("up", 0.1)
    return False
- def get_license_info_ex(self, shop_name):
- # Open the shop's qualification page, try to pass the slider verification,
- # then screenshot and OCR the business license.
- #
- # :param shop_name: forwarded to screenshot_business_license(), which uses
- #     it as the file name of the saved license crop.
- # :return: None - the registration number, company name and address parsed
- #     from the OCR result are only printed, not returned or stored.
- # NOTE(review): indentation is missing in this copy of the file, so the
- #     nesting of the statements below cannot be confirmed from here.
- # print('开始获取商家资质信息')
- self.enter_shop() # tap the "店铺" (shop) entry
- # Tap the shop image
- xpath_shop_image = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.RelativeLayout[1]/android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.RelativeLayout[1]/android.view.ViewGroup[1]/android.widget.LinearLayout[1]/android.widget.LinearLayout[1]/android.widget.ImageView[1]'
- if self.d.xpath(xpath_shop_image).exists:
- self.d.xpath(xpath_shop_image).click()
- time.sleep(self.get_sleep_time())
- # Scroll the opened page (at most 10 steps) until "店铺资质" (shop qualifications) is visible
- for i in range(10):
- if self.d.xpath('//*[@text="店铺资质"]').exists:
- print('店铺资质存在1')
- break
- self.d.swipe_ext('up', 0.3)
- time.sleep(1)
- if self.d.xpath('//*[@text="店铺资质"]').exists:
- print('店铺资质存在2')
- break
- # Prefer the "已上传" (uploaded) element, otherwise tap "店铺资质" itself
- if self.d.xpath('//*[@text="已上传"]').exists:
- self.d.xpath('//*[@text="已上传"]').click()
- else:
- self.d.xpath('//*[@text="店铺资质"]').click()
- time.sleep(self.get_sleep_time())
-
- xpath_pop_window = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/meco.webkit.WebView[1]'
- # Wait (up to 10 polls) for the security-verification popup, a WebView, to appear
- for i in range(10):
- if self.d.xpath(xpath_pop_window).exists:
- print(f'第{i}次安全验证弹窗存在')
- break
- else:
- print(f'第{i}次安全验证弹窗不存在')
- time.sleep(self.get_sleep_time())
- time.sleep(10)
-
- # First screenshot: first_screenshot_the_verify() returns '滑块验证' for a slider challenge
- event = self.first_screenshot_the_verify()
- # Earlier inline version of the OCR-based challenge detection (now done in the call above):
- # ocr_res = self.get_ocr_res('screenshot_verify.png')
- # event = ''
- # if ocr_res:
- # for item in ocr_res:
- # if '拖动滑块完成' in item['words']:
- # print("滑块验证")
- # event = '滑块验证'
- # break
-
- # print("ocr_res end")
- if event == '滑块验证':
- # Touch down on the slider handle at (110, 780) and hold it without moving, then take a screenshot
- self.d.touch.down(110, 780)
- time.sleep(0.5)
- # Screenshot; the call has no return statement, so event becomes None here
- event = self.slide_second_screenshot_the_verify()
- # result = slide_verify('first_cropped_verify_screenshot.png') second_slide_cropped_verify_screenshot
- # slide_verify() is defined outside this chunk; its result is used as the slide distance
- result = slide_verify('second_slide_cropped_verify_screenshot.png')
- result = int(result)
- print(f'滑动距离:{result}')
- print('开始滑动')
- # internel = 1000
- # self.d.touch.move_to(110 + result, 780, duration=internel)
- # self.d.touch.up(110 + result, 780)
- self.human_slide(110, 780, 110 + result, 780)
- print('滑动结束')
- time.sleep(self.get_sleep_time())
- # After sliding, decide whether verification succeeded by OCR-ing a fresh screenshot
- second_resut = self.second_screenshot_the_verify()
- if second_resut == '验证成功':
- time.sleep(8)
- cropped_screenshot_path = self.screenshot_business_license(shop_name)
- ocr_res = self.get_ocr_res(cropped_screenshot_path)
- print(f'ocr_res:{ocr_res}')
- company_name = ''
- reg_number = ''
- address = ''
- if ocr_res:
- # Each field is the text after the first full-width colon of its OCR line;
- # a matching line without that colon raises IndexError.
- for item in ocr_res:
- if '企业注册号' in item['words']:
- # print('come in 111')
- reg_number = item['words'].split(':', 1)[1].strip()
- elif '企业名称' in item['words']:
- # print('come in 222')
- company_name = item['words'].split(':', 1)[1].strip()
- elif '所:' in item['words']:
- # print('come in 333')
- address = item['words'].split(':', 1)[1].strip()
- # Print the parsed fields
- print("企业注册号:", reg_number)
- print("企业名称:", company_name)
- print("住所:", address)
- print("yanzhenghcenggong")
- # Third screenshot: crop the image and save it locally
-
-
- # Pass the image to the third-party verify API to get the distance to slide
- #result = slide_verify('cropped_verify_screenshot.png')
- #print(f'滑动距离:{result}')
- # self.drug_slide(340)
- # print('开始滑动')
- # self.d.swipe(120, 760, 460, 760, 0.3)
- # print('滑动结束')
- # time.sleep(1000000)
- # Screenshot to capture the content that needs verification
-
-
def get_license_info(self):
    """
    Obtain the merchant's license data from stored records or through OCR.

    Navigates shop -> merchant -> qualification page and reads the
    qualification number.  When ``get_table_license_info`` returns a row for
    that number it is mapped to a dict; in every other case the license image
    is tapped, a cropped screenshot is taken and its OCR result is returned.

    :return: dict with "单位名称", "地址" and "社会信用代码", or the value
        returned by ``get_ocr_res('cropped_screenshot.png')``
    """
    self.enter_shop()
    self.enter_shoper()
    self.scan_shoper_license()

    # Qualification number shown on the page.
    qualification_number = self.get_qualification_number()
    if qualification_number:
        stored_row = self.get_table_license_info(qualification_number)
        if stored_row:
            name, address, credit_code = stored_row[0], stored_row[1], stored_row[2]
            return {
                '单位名称': name,
                '地址': address,
                '社会信用代码': credit_code,
            }

    # No number or no stored row: tap the license image at relative
    # coordinates (0.603, 0.27) and OCR a cropped screenshot of it.
    self.d.click(0.603, 0.27)
    time.sleep(self.get_sleep_time())
    self.screenshot_the_business_license()
    return self.get_ocr_res('cropped_screenshot.png')
def distinct_target(self):
    """
    Tell whether the current screen looks like the search/list page.

    Four text or content-desc markers and one structural XPath (the list's
    RecyclerView) are all queried; any hit counts.

    :return: True when at least one marker is present, else False
    """
    marker_xpaths = (
        '//*[@content-desc="拍照搜索"]',
        '//*[@text="年货节大促"]',
        '//*[@text="筛选"]',
        '//*[@text="回头客常拼"]',
    )
    list_page_xpath = '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.view.ViewGroup[1]//android.widget.LinearLayout[1]/android.support.v7.widget.RecyclerView[1]'

    # Every marker is queried (no short-circuit) before the verdict is formed.
    marker_hits = [self.d.xpath(marker).exists for marker in marker_xpaths]
    is_position_new = self.d.xpath(list_page_xpath).exists
    print(f'is_position_new={is_position_new}')
    return any(marker_hits) or bool(is_position_new)
def click_element_with_retry(self, xpath, max_retries=5, timeout=5):
    """
    Click an element, retrying when it is missing or the click fails.

    One second is waited between attempts (not after the last one).

    :param xpath: XPath of the element to click
    :param max_retries: maximum number of attempts
    :param timeout: accepted for compatibility; not used by this method
    :return: True once a click succeeded, False when all attempts failed
    """
    attempt = 0
    while attempt < max_retries:
        try:
            if not self.d.xpath(xpath).exists:
                print(f"第{attempt+1}次尝试:元素不存在")
            else:
                self.d.xpath(xpath).click()
                print(f"第{attempt+1}次尝试点击成功")
                return True
        except Exception as e:
            print(f"第{attempt+1}次尝试失败: {e}")

        if attempt < max_retries - 1:
            time.sleep(1)  # wait one second before the next attempt
        attempt += 1

    print(f"经过{max_retries}次尝试后点击失败")
    return False
def enter_target_page(self):
    """
    Search for ``self.search_key`` and tap the "价格" (price) control.

    Steps: tap the entry element, focus the EditText, type the keyword
    (clearing previous input), tap "搜索" (search), then tap "价格" with
    retries.  Each step is followed by a pause of ``self.get_sleep_time()``
    seconds.

    :return: None
    """
    def pause():
        time.sleep(self.get_sleep_time())

    # NOTE(review): presumably the search box of the home page - confirm.
    entry_xpath = '//*[@resource-id="android:id/content"]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]'
    self.d.xpath(entry_xpath).click()
    pause()
    self.d(className='android.widget.EditText').click()
    pause()
    self.d.send_keys(self.search_key, clear=True)
    pause()
    self.d.xpath('//*[@text="搜索"]').click()
    pause()
    # Tap "价格"; the control may not be there immediately, hence the retries.
    self.click_element_with_retry('//*[@text="价格"]')
    pause()
- """暂不用该功能
- def get_table_license_info(self, qualification_number):
- try:
- sql = f'select business_license_company,city,credit_code from mt_drug where credit_code = "{qualification_number}"'
- self.mysql_client.cur.execute(sql)
- res = self.mysql_client.cur.fetchone()
- return res
- except:
- return None
- """
def get_clipboard(self):
    """
    Return the device clipboard text without surrounding whitespace.

    The raw content is logged through ``self.loggerPdd`` first.

    :return: the stripped clipboard text, or ``''`` when the clipboard is None
    """
    self.loggerPdd.info(f"Clipboard content:{self.d.clipboard}")  # debug output
    clipboard_content = self.d.clipboard
    return '' if clipboard_content is None else clipboard_content.strip()
-
def clear_clipboard(self):
    """
    Overwrite the device clipboard with an empty string.

    :return: None
    """
    empty_text = ""
    self.d.set_clipboard(empty_text)
def get_product_link(self):
    """Copy the current product's share link through the share panel.

    For at most five attempts, or until a non-empty link is obtained:

    1. tap the share button if one of the candidate locators matches;
    2. swipe the share panel horizontally from x=400 to x=100 (right to
       left) -- presumably to bring the "复制链接" (copy link) entry into
       view;
    3. tap "复制链接";
    4. read the link with ``self.get_clipboard()``.

    The clipboard is read only when the "复制链接" tap really happened.
    Without that guard, a failed tap would return whatever an earlier
    product left in the clipboard as this product's link.

    Returns:
        str: The copied link, or ``''`` when no link could be obtained.
    """
    print('开始获取商品链接')

    # Candidate locators for the share button; the first one present wins.
    share_button_xpaths = [
        '//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]'
        '/android.widget.RelativeLayout[1]/android.widget.RelativeLayout[1]'
        '/android.widget.FrameLayout[last()]/android.view.View[1]',
    ]

    # Swipe height (y coordinate) per device serial. Presumably the share
    # panel's icon row sits at a different height on each handset model.
    swipe_rows = (
        (1350, (
            '2e58510', 'fcb3c749', 'ea4e4eb8', '95b2c764', '369dcf96',
            'ZDQWUSSWBEDI896T', 'IRLZAAQCDMHYWKTS', 'U47HZDRG8XJBBURW',
            'WWRO9LTGG6KFGQCM', 'OVFETO8PCY45E6A6', 'U8ONIJJJS4CELVD6',
        )),
        (1300, (
            'KNNNEMNVWCJZQOLZ', 'CMKFUSSG99ROR489', '656DTOPRZDEALZ5X',
            'UCQGF6CQFMU8WKHI', '4TZDUGTOAIFMJVGU', 'DEZXWKUC7DJBLVPJ',
            'GQIRKB7LVOONM7VW',
        )),
        (1050, ('e2899b34',)),
    )
    default_swipe_y = 1250
    swipe_y = next(
        (y for y, device_ids in swipe_rows if self.device_id in device_ids),
        default_swipe_y,
    )

    product_link = ''
    max_retry = 5  # maximum number of attempts
    for idx in range(1, max_retry + 1):
        for xp in share_button_xpaths:
            share_button = self.d.xpath(xp)
            if not share_button.exists:
                continue

            self.loggerPdd.info(f'{idx}-进入分享点点点')
            share_button.click()
            time.sleep(1)

            self.loggerPdd.info('开始滑动')
            self.d.swipe(400, swipe_y, 100, swipe_y, 0.3)
            time.sleep(0.2)

            copied = self.d.xpath('//*[@text="复制链接"]').click_exists()
            time.sleep(1)
            if copied:
                product_link = self.get_clipboard()
                time.sleep(0.5)
                self.loggerPdd.info(f'{idx}-商品链接:{product_link}')
            else:
                # The tap did not happen, so the clipboard still holds old
                # content; do not mistake it for this product's link.
                self.loggerPdd.info(f'{idx}-未找到复制链接,跳过读取剪贴板')
            break  # one share button handled per attempt

        if product_link:  # link obtained, stop retrying
            break
        if idx < max_retry:
            time.sleep(0.5)  # no need to wait after the final attempt

    return product_link
- def integrate_data(self):
- """
- 整合数据
- :return:
- """
- #测试通过点击店铺获取店铺的名称:
- # shop = self.get_shop_name()
- # print(f'店铺名称:{shop}')
- # time.sleep(100000)
- #测试点击店铺进入获取店铺资质
- # license_info = self.get_license_info_ex('1234')#店铺名称
- # time.sleep(100000)
- #首先判断是否存在:商品已售罄,推荐以下相似商品 的文本
- # if self.d.xpath('//*[contains(@text, "商品已售罄,推荐以下相似商品")]'):
- # self.loggerPdd.info(f'商品已售罄:{self.search_key}')
- # self.swipe_back(1)
- # return
-
- #获取价格和盒数备注
- min_price, ext = self.safe_exec(self.drug_price_ex) # 最低价格和盒数备注
-
- title_info = self.safe_exec(self.get_title) # 药品名字
-
- if title_info:
-
- if '999' in self.search_key:
- temp_search_key = self.search_key.replace('999', '')
- if self.search_key == '999强力枇杷露225ml':
- temp_search_key = temp_search_key.replace('225ml', '')
- elif self.search_key == '999糠酸莫米松凝胶15':
- temp_search_key = temp_search_key.replace('15', '')
- elif self.search_key == '999养胃舒颗粒10g*6':
- temp_search_key = temp_search_key.replace('10g*6', '')
- elif self.search_key == '999曲安奈德益康唑乳膏30g':
- temp_search_key = temp_search_key.replace ('30g', '')
- elif self.search_key == '999抗病毒口服液10ml*6支':
- temp_search_key = temp_search_key.replace('10ml*6支', '')
- elif self.search_key == '999复方板蓝根颗粒15袋':
- temp_search_key = temp_search_key.replace('15袋', '')
- elif self.search_key == '999可调式生理性海水鼻腔喷雾50':
- temp_search_key = temp_search_key.replace('50', '')
- elif self.search_key == '999维生素E.C颗粒9袋':
- temp_search_key = temp_search_key.replace('9袋', '')
- elif self.search_key == '999复方氨酚烷胺胶囊6粒':
- temp_search_key = temp_search_key.replace('6粒', '')
- elif self.search_key == '999复方板蓝根颗粒15g*15袋':
- temp_search_key = temp_search_key.replace('15g*15袋', '')
- elif self.search_key == '999止泻利颗粒15g*8':
- temp_search_key = temp_search_key.replace('15g*8', '')
- elif self.search_key == '999三蛇胆川贝膏138':
- temp_search_key = temp_search_key.replace('138', '')
- elif self.search_key == '999强力枇杷露16袋':
- temp_search_key = temp_search_key.replace('16袋', '')
- elif self.search_key == '999复方苦参肠炎康片12片':
- temp_search_key = temp_search_key.replace('12片', '')
- elif self.search_key == '999必无忧盐酸特比萘芬乳膏15':
- temp_search_key = temp_search_key.replace('必无忧', '')
- temp_search_key = temp_search_key.replace('15', '')
- elif self.search_key == '999速复康布洛芬缓释胶囊':
- temp_search_key = temp_search_key.replace('速复康', '')
- elif self.search_key == '999强力枇杷露120ml':
- temp_search_key = temp_search_key.replace('120ml', '')
- elif self.search_key == '999强力枇杷露150ml':
- temp_search_key = temp_search_key.replace('150ml', '')
- elif self.search_key == '999抗病毒口服液10ml*10':
- temp_search_key = temp_search_key.replace('10ml*10', '')
- elif self.search_key == '999抗病毒口服液10ml*12':
- temp_search_key = temp_search_key.replace('10ml*12', '')
- elif self.search_key == '999感冒清热颗粒6g*10':
- temp_search_key = temp_search_key.replace('6g*10', '')
- elif self.search_key == '999选平硝酸咪康唑乳膏20g':
- temp_search_key = temp_search_key.replace('选平', '')
- temp_search_key = temp_search_key.replace('20g', '')
- elif self.search_key == '999糠酸莫米松乳膏10g':
- temp_search_key = temp_search_key.replace('10g', '')
- elif self.search_key == '999壮骨关节丸6g*20':
- temp_search_key = temp_search_key.replace('6g*20', '')
- elif self.search_key == '999正天丸6g*15':
- temp_search_key = temp_search_key.replace('6g*15', '')
- elif self.search_key == '999藿香正气合剂10ml*6':
- temp_search_key = temp_search_key.replace('10ml*6', '')
- elif self.search_key == '999藿香正气合剂10ml*10':
- temp_search_key = temp_search_key.replace('10ml*10', '')
- elif self.search_key == '999小儿止咳糖浆120':
- temp_search_key = temp_search_key.replace('120', '')
- elif self.search_key == '999小儿止咳糖浆225':
- temp_search_key = temp_search_key.replace('225', '')
- elif self.search_key == '999小儿感冒颗粒6g*10':
- temp_search_key = temp_search_key.replace('6g*10', '')
- elif self.search_key == '999小儿感冒颗粒6g*24':
- temp_search_key = temp_search_key.replace('6g*24', '')
- elif self.search_key == '999小儿氨酚黄那敏颗粒6g*10袋':
- temp_search_key = temp_search_key.replace('6g*10袋', '')
- elif self.search_key == '999小儿氨酚黄那敏颗粒6g*20袋':
- temp_search_key = temp_search_key.replace('6g*20袋', '')
- elif self.search_key == '999感冒灵颗粒10g*9袋':
- temp_search_key = temp_search_key.replace('10g*9袋', '')
- elif self.search_key == '999皮炎平复方醋酸地塞米松乳膏20':
- temp_search_key = temp_search_key.replace('皮炎平', '')
- temp_search_key = temp_search_key.replace('20', '')
- elif self.search_key == '999糠酸莫米松凝胶10':
- temp_search_key = temp_search_key.replace('10', '')
- elif self.search_key == '999板蓝根颗粒10g*20':
- temp_search_key = temp_search_key.replace('10g*20', '')
- elif self.search_key == '999咽炎片0.26g*12片':
- temp_search_key = temp_search_key.replace('0.26g*12片', '')
- elif self.search_key == '999小儿咽扁颗粒8g*10袋':
- temp_search_key = temp_search_key.replace('8g*10袋', '')
- elif self.search_key == '999感冒清热颗粒12g*18':
- temp_search_key = temp_search_key.replace('12g*18', '')
-
- # print (f"temp_search_key={temp_search_key}")
-
- if self.search_key == '999抗病毒口服液': #如果标题不包含 999 或 抗病毒口服液 或 (10ml*12 和 10ml*18) 则退出
- if '999' not in title_info or temp_search_key not in title_info:
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*12' not in title_info and '10ml*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10*12或10*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999抗病毒口服液10ml*6支': #如果标题不包含 999 或 抗病毒口服液 或 (10ml*12 和 10ml*18) 则退出
- if '999' not in title_info or temp_search_key not in title_info:
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10ml*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999曲安奈德益康唑乳膏30g':
- if '999' not in title_info or temp_search_key not in title_info:
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '30' not in title_info:
- print(f"当前商品名称:{title_info} 不包含30品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999复方感冒灵颗粒':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '14g*15' not in title_info and '14g*9' not in title_info:
- print(f"当前商品名称:{title_info} 不包含14g*15 和 14g*9品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999养胃舒颗粒10g*6':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10g*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10g*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999糠酸莫米松凝胶15':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '15' not in title_info:
- print(f"当前商品名称:{title_info} 不包含15品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
-
- elif self.search_key == '999强力枇杷露225ml':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '225' not in title_info:
- print(f"当前商品名称:{title_info} 不包含225品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999强力枇杷露120ml':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '120' not in title_info:
- print(f"当前商品名称:{title_info} 不包含120品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999强力枇杷露150ml':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '150' not in title_info:
- print(f"当前商品名称:{title_info} 不包含150品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999抗病毒口服液10ml*10':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10ml*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999抗病毒口服液10ml*12':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*12' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10ml*12品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999复方板蓝根颗粒15袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '15袋' not in title_info:
- print(f"当前商品名称:{title_info} 不包含15袋品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999可调式生理性海水鼻腔喷雾50':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999维生素E.C颗粒9袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999复方氨酚烷胺胶囊6粒':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6粒' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6粒品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999复方板蓝根颗粒15g*15袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '15g*15' not in title_info:
- print(f"当前商品名称:{title_info} 不包含15g*15品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999止泻利颗粒15g*8':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '15g*8' not in title_info:
- print(f"当前商品名称:{title_info} 不包含15g*8品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999三蛇胆川贝膏138':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '138' not in title_info:
- print(f"当前商品名称:{title_info} 不包含138品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999强力枇杷露16袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '16袋' not in title_info:
- print(f"当前商品名称:{title_info} 不包含16袋品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999复方苦参肠炎康片12片':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '12片' not in title_info:
- print(f"当前商品名称:{title_info} 不包含12片品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999必无忧盐酸特比萘芬乳膏15':
- if temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif ('999' not in title_info) and ('必无忧' not in title_info):
- print(f"当前商品名称:{title_info} 不包含关键字:999或 必无忧")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '15' not in title_info:
- print(f"当前商品名称:{title_info} 不包含15品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999速复康布洛芬缓释胶囊':
- if temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif ('999' not in title_info) and ('速复康' not in title_info):
- print(f"当前商品名称:{title_info} 不包含关键字:999或 速复康")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999维生素C咀嚼片':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '80' not in title_info:
- print(f"当前商品名称:{title_info} 不包含80品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999精氨酸布洛芬颗粒':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '9' not in title_info:
- print(f"当前商品名称:{title_info} 不包含9品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999阿奇霉素片':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '0.25g*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含0.25g*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999感冒清热颗粒6g*10':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999选平硝酸咪康唑乳膏20g':
- if temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif ('999' not in title_info) and ('选平' not in title_info):
- print(f"当前商品名称:{title_info} 不包含关键字:999或 选平")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '20g' not in title_info:
- print(f"当前商品名称:{title_info} 不包含20g品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999糠酸莫米松乳膏10g':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10g' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10g品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999补脾益肠丸':
- if temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif ('999' not in title_info) and ('三九' not in title_info):
- print(f"当前商品名称:{title_info} 不包含关键字:999或 三九")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999壮骨关节丸6g*20':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*20' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*20品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999正天丸6g*15':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*15' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*15品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999藿香正气合剂10ml*6':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10ml*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999藿香正气合剂10ml*10':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10ml*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10ml*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿止咳糖浆120':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '120' not in title_info:
- print(f"当前商品名称:{title_info} 不包含120品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿止咳糖浆225':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '225' not in title_info:
- print(f"当前商品名称:{title_info} 不包含225品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿感冒颗粒6g*10':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿感冒颗粒6g*24':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*24' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*24品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿氨酚黄那敏颗粒6g*10袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿氨酚黄那敏颗粒6g*20袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '6g*20' not in title_info:
- print(f"当前商品名称:{title_info} 不包含6g*20品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999感冒灵颗粒':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10g*9' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10g*9品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999皮炎平复方醋酸地塞米松乳膏20':
- if temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{temp_search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif ('999' not in title_info) and ('皮炎平' not in title_info):
- print(f"当前商品名称:{title_info} 不包含关键字:999或 皮炎平")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '20g' not in title_info:
- print(f"当前商品名称:{title_info} 不包含20g品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999糠酸莫米松凝胶10':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999板蓝根颗粒10g*20':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10g*20' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10g*20品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999咽炎片0.26g*12片':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '0.26g*12' not in title_info and '0.26g*24' not in title_info:
- print(f"当前商品名称:{title_info} 不包含0.26g*12 和 0.26g*24品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999感冒清热颗粒12g*18':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '12g*18' not in title_info:
- print(f"当前商品名称:{title_info} 不包含12g*18品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '999小儿咽扁颗粒8g*10袋':
- if '999' not in title_info or temp_search_key not in title_info :
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '8g*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含8g*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- else:
- if '999' not in title_info or temp_search_key not in title_info:
- print(f"当前商品名称:{title_info} 不包含关键字:{self.search_key}")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- else:
- if self.search_key == '史达功右美沙芬愈创甘油醚糖浆120':
- temp_search_key = self.search_key.replace('史达功', '')
- temp_search_key = temp_search_key.replace('120', '')
- if '史达功' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '120' not in title_info:
- print(f"当前商品名称:{title_info} 不包含120品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '三九胃泰养胃舒颗粒8袋':
- temp_search_key = self.search_key.replace('三九胃泰', '')
- temp_search_key = temp_search_key.replace('8袋', '')
- if '三九胃泰' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '8袋' not in title_info:
- print(f"当前商品名称:{title_info} 不包含8袋品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '三九复方金银花颗粒10g*8袋':
- temp_search_key = self.search_key.replace('三九', '')
- temp_search_key = temp_search_key.replace('10g*8袋', '')
- if '三九' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '10g*8' not in title_info:
- print(f"当前商品名称:{title_info} 不包含10g*8品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '必无忧盐酸特比萘芬喷雾剂30ml':
- temp_search_key = self.search_key.replace('必无忧', '')
- temp_search_key = temp_search_key.replace('30ml', '')
- if '必无忧' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '30' not in title_info:
- print(f"当前商品名称:{title_info} 不包含30品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '佳美舒阿奇霉素肠溶胶囊4':
- temp_search_key = self.search_key.replace('佳美舒', '')
- temp_search_key = temp_search_key.replace('4', '')
- if '佳美舒' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '4' not in title_info and '8' not in title_info:
- print(f"当前商品名称:{title_info} 不包含4品规或8品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '三九胃泰颗粒20g*10':
- temp_search_key = self.search_key.replace('20g*10', '')
- if temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '20g*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含20g*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '三九胃泰颗粒2.5g*6':
- temp_search_key = self.search_key.replace('2.5g*6', '')
- if temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '2.5g*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含2.5g*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '三九胃泰颗粒20g*6袋':
- temp_search_key = self.search_key.replace('20g*6袋', '')
- if temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '20g*6' not in title_info:
- print(f"当前商品名称:{title_info} 不包含20g*6品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '顺峰酮康他索乳膏':
- temp_search_key = self.search_key.replace('顺峰', '')
- if temp_search_key not in title_info or '顺峰' not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif self.search_key == '速复康磷酸奥司他韦胶囊75mg*10':
- temp_search_key = self.search_key.replace('速复康', '')
- temp_search_key = temp_search_key.replace('75mg*10', '')
- if '佳美舒' not in title_info or temp_search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- elif '75mg*10' not in title_info:
- print(f"当前商品名称:{title_info} 不包含75mg*10品规")
- self.swipe_back(1)
- self.unrelated_data += 1
- return
-
-
- else:
- if self.search_key not in title_info:
- print(f'药品标题未包含药品关键字:-->{self.search_key}')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- # temp_search_key = self.search_key
- # if self.search_key == '三九胃泰颗粒':
- # temp_search_key = '三九胃泰' #兼容三九胃泰 温胃舒颗粒
- # elif '999' in self.search_key:
- # temp_search_key = self.search_key.replace('999', '')
-
- # if '999' in self.search_key:
- # if ('999' not in title_info) or (temp_search_key not in title_info):
- # print(f'药品标题未包含药品关键字:-->{temp_search_key}和未包含999')
- # self.swipe_back(1)
- # self.unrelated_data += 1
- # return
- # else:
- # if temp_search_key not in title_info:
- # print(f'药品标题未包含药品关键字:-->{temp_search_key}')
- # self.swipe_back(1)
- # self.unrelated_data += 1
- # return
- else:
- print('标题获取为空')
- self.swipe_back(1)
- return
-
- #第一次没有获取到价格
- if not min_price:
- min_price = self.drug_price() # 最低价格 第二次获取
- if not min_price:
- print('提取价格出错,回退到列表页')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
-
- # 商品链接 分享链接
-
- product_link = self.get_product_link()
- time.sleep(2)
- if self.search_key == '999小儿止咳糖浆' or self.search_key == '999小儿氨酚黄那敏颗粒' or self.search_key == '999小儿感冒颗粒':
- shop = self.get_shop_name()
- else:
- for i in range(15):
- if self.d(textStartsWith="进店").exists:
- print('开始获取店铺名')
- break
- screen_width = self.d.info['displayWidth']
- screen_height = self.d.info['displayHeight']
- # self.d.swipe(screen_width // 2, screen_height - 400, screen_width // 2, 400, duration=0.2)
- self.d.swipe_ext("up", scale=0.3)
- time.sleep(self.get_sleep_time())
- if self.d(textStartsWith="进店").exists:
- print('可以开始获取店铺名')
-
- # shop = self.get_shop_name()
- shop = self.get_shop_name()
- if not shop:
- print('当前店铺名称为空')
- self.swipe_back(1)
- self.unrelated_data += 1
- return
- # 爬取日期
- scrape_date = self.get_current_date()
- dup_data = {'search_key': self.search_key, 'min_price': min_price, 'shop': shop, 'scrape_date': scrape_date,
- 'platform': '拼多多'}
- if self.data_is_exists(dup_data):
- print('存在相同数据不入库')
- self.swipe_back(1)
- return
- is_has_instructions = self.has_instructions()
- # print(f'是否有说明书:{is_has_instructions}')
- self.loggerPdd.info(f'是否有说明书:{is_has_instructions}')
- # 生产日期为空
- manufacture_date = ''
- # 执政信息
- # if is_has_enter_shop:
- # license_info = self.get_license_info()
- # business_license_company = license_info["单位名称"]
- # credit_code = license_info['社会信用代码']
- # city_str = license_info['地址']
- # # 先把省份啥的替换掉
- # city_sub_str = re.sub(r'[u4e00-\u9fa5]+省', '', city_str)
- # try:
- # city = re.search(r'[\u4e00-\u9fa5]+?(市|区|县)', city_sub_str).group(0)
- # except:
- # city = city_sub_str
- # try:
- # province = self.city2province[city]
- # except:
- # province = ''
- # self.swipe_back(2)
- # else:
- # business_license_company = ''
- # credit_code = ''
- # city = ''
- # province = ''
- business_license_company = ''
- # credit_code = ''
- credit_code = ext
- city = ''
- province = ''
- # 说明书等信息
- if is_has_instructions:
- try:
- instructions_info = self.safe_exec(self.get_instructions_data)
- # print('说明书相关信息:', instructions_info)
- expiry_date = instructions_info['有效期'].strip('。')
- manufacturer = instructions_info['生产单位'].strip('。')
- approval_number = instructions_info['批准文号'].strip('。')
- specifications = instructions_info['产品规格'].strip('。')
- except Exception as e:
- print(f'获取详情页规格参数出错:{e}')
- self.swipe_back(2)
- return
- else:
- expiry_date = ''
- manufacturer = ''
- approval_number = ''
- specifications = ''
- # if self.search_key == '999小柴胡颗粒':
- # if '10g*9' in specifications or '10克x9' in specifications or '10g*15' in specifications or '10克/袋*9' in specifications:
- # print("111")
- # else:
- # self.swipe_back(1)
- # return
- # elif self.search_key == '':
- # if '10ml*12' in specifications or '10ml*18' in specifications:
- # print(222)
- # else:
- # self.swipe_back(1)
- # return
- self.unrelated_data = 0
- # 商品链接
- # product_link = ''
- # 爬取省份
- scrape_province = '广东' # 这里先默认广东
- # 是否有货
- availability = ''
- save_data = {
- 'product': title_info,
- 'min_price': min_price,
- 'manufacture_date': manufacture_date,
- 'expiry_date': expiry_date,
- 'shop': shop,
- 'business_license_company': business_license_company,
- 'province': province,
- 'city': city,
- 'manufacturer': manufacturer,
- 'specification': specifications,
- 'approval_number': approval_number,
- 'product_link': product_link,
- 'scrape_date': scrape_date,
- 'scrape_province': scrape_province,
- 'availability': availability,
- 'credit_code': credit_code,
- 'platform': '拼多多',
- 'search_key': self.search_key,
- 'number' : 1
- }
- # print(f'待插入数据:{save_data}')
- self.save_to_database(save_data)
- self.swipe_back(1)
- #获取店铺信息start 2025-07-28
- '''
- #不获取店铺信息
- #1、判断店铺名称是否已存在
- shop_is_exists = self.shop_is_exists_database(shop)
- #2、获取店铺资质 是否存在
- # is_has_shop_qualifications = self.has_shop_qualifications()
- if not shop_is_exists :
- print('开始获取店铺信息')
- #点击店铺,点击店铺标题,点击店铺资质
- # license_info = self.get_license_info_ex()
- else:
- #日志中加上店铺名称
- self.loggerPdd.info(f'店铺{shop}信息已存在数据库')
- #获取店铺信息end
- '''
- if self.distinct_target():
- print('已到达搜索列表页')
- else:
- for i in range(2):
- self.swipe_back(1)
- # 最外部有个定位按钮
- if self.distinct_target():
- break
def main(self, device_id, start_page, end_page, task_id, max_duration_minutes=None, retry_count=0):
    """Collect product data from result pages ``start_page``..``end_page`` on one device.

    The method connects to the device, starts the popup monitor, restarts the
    app, runs the search, scrolls past the pages before ``start_page`` and then
    walks the result list page by page, opening every product card that lies
    inside the clickable vertical band and delegating to ``integrate_data``.

    :param device_id: serial passed to ``connect_devices``.
    :param start_page: first page to collect (pages are counted from 1).
    :param end_page: last page to collect (inclusive).
    :param task_id: task identifier used for progress/end reporting; when it
        is falsy nothing is reported to ``reporter``.
    :param max_duration_minutes: optional wall-clock budget; checked once at
        the start of every page.
    :param retry_count: internal recursion depth used when the UI fails to
        return to the list page; callers normally leave it at 0.
    :return: ``None`` (or whatever the recursive retry call returns).
    :raises Exception: any error escaping the collection loop is reported as a
        failed task and re-raised.
    """
    self.task_id = task_id
    self.task_start_page = start_page
    self.task_end_page = end_page
    self.start_time = time.time()

    def report_end(status, finish_status, page):
        # Single place for the end-of-task report; skipped without a task id.
        if task_id:
            reporter.end_task(
                task_id=task_id,
                status=status,
                finish_status=finish_status,
                force_end_page=page,
            )

    def last_page():
        # current_page is only assigned inside the page loop below; fall back
        # to start_page so the timeout/failure reports cannot raise
        # AttributeError and hide the real problem.
        # NOTE(review): if __init__ already sets current_page this is a no-op.
        return getattr(self, 'current_page', start_page)

    # Mark the task as running (status 2) once the worker has started.
    if self.task_id:
        try:
            self.update_task_status(2)
            self.loggerPdd.info(f"任务 {task_id} 线程启动成功,状态已更新为2")
        except Exception as e:
            self.loggerPdd.error(f"更新任务状态失败: {e}")

    if task_id:
        reporter.start_task(task_id, start_page, end_page)

    timeout_seconds = max_duration_minutes * 60 if max_duration_minutes else None
    MAX_RETRY = 3  # maximum number of recursive restarts
    spider_no = 0  # products collected since the last long pause

    self.connect_devices(device_id)
    time.sleep(self.get_sleep_time())
    # Start the global popup monitor.
    self.monitor = SpiderMonitor(self)
    self.monitor.start()
    try:
        self.restart_app()
        self.safe_exec(self.enter_target_page)

        # Scroll past the pages that precede start_page.
        if start_page > 1:
            self.loggerPdd.info(f"跳过前 {start_page-1} 页,从第 {start_page} 页开始采集")
            current_page = 1
            while current_page < start_page:
                if self.monitor.pausing.is_set():
                    self.wait_for_ready(self.monitor)
                if self.d.xpath('//*[@text="已经到底啦"]').exists:
                    self.loggerPdd.info(f"在第 {current_page} 页已到达底部,无法继续翻页")
                    self.loggerPdd.warning(f"未能到达目标页码 {start_page},实际只到达第 {current_page} 页")
                    # Fix: the original reported `idx`, which is not bound
                    # yet at this point and raised NameError.
                    report_end('completed', 1, current_page)
                    return
                self.d.swipe(200, 1400, 200, 300, 0.4)
                time.sleep(self.get_sleep_time())
                current_page += 1
                self.loggerPdd.debug(f"已翻到第 {current_page} 页")
            # The loop only ends normally once current_page == start_page.
            self.loggerPdd.info(f"成功翻到第 {start_page} 页,开始采集")

        for idx in range(start_page, end_page + 1):
            # Stop when the time budget is used up and report "unfinished".
            if timeout_seconds and (time.time() - self.start_time) > timeout_seconds:
                print(f"任务 {task_id} 达到时间限制 {max_duration_minutes} 分钟,停止采集")
                self.loggerPdd.info(f"任务 {task_id} 达到时间限制 {max_duration_minutes} 分钟,停止采集")
                report_end('completed', 0, last_page())
                return

            print(f'第{idx}页(指定范围: {start_page}-{end_page})')
            self.current_page = idx
            if task_id:
                reporter.update_task_progress(
                    task_id=task_id,
                    actual_end_page=self.current_page,
                )

            # Long pause after more than 30 collected products.
            if spider_no > 30:
                time.sleep(300)
                spider_no = 0

            # Too many verification prompts: notify and reset the counter.
            if self.monitor.verification_count >= self.monitor.MAX_VERIFICATION_RETRY:
                print("频繁遇到验证码,暂停程序")
                self.d.toast("请处理验证码后点击继续", 30)
                self.d.click(0, 0)
                self.monitor.verification_count = 0

            # More than 30 consecutive unrelated products: finish the task.
            if self.unrelated_data > 30:
                print('连续超过30个不达标的数据则停止采集')
                self.loggerPdd.info(f"连续超过30个数据不达标,品规:{self.search_key}")
                report_end('completed', 1, end_page)
                # Return instead of break so the completion report after the
                # loop is not sent a second time.
                return

            # Locate the product cards of the current screen.
            if idx == 1:
                drug_lis = self.safe_exec(self.d.xpath('//*[@resource-id="android:id/content"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[2]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout').all)
            else:
                # The root FrameLayout index varies, so probe 1..5 and keep
                # the first non-empty match.
                for i in range(1, 6):
                    drug_xpath = f'/hierarchy/android.widget.FrameLayout[{i}]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.support.v7.widget.RecyclerView[1]/android.widget.FrameLayout'
                    drug_lis = self.safe_exec(self.d.xpath(drug_xpath).all)
                    if drug_lis:
                        break
            print('数量', len(drug_lis))

            for idd, drug_one in enumerate(drug_lis):
                print(idd + 1, drug_one.info)
                time.sleep(self.get_sleep_time())
                top = drug_one.info['bounds']['top']
                bottom = drug_one.info['bounds']['bottom']
                # Only cards fully inside the vertical band 258..1524 are opened.
                if not (bottom <= 1524 and top >= 258):
                    continue

                self.safe_exec(drug_one.click)
                time.sleep(self.get_sleep_time())
                try:
                    self.safe_exec(self.integrate_data)
                    if not self.distinct_target():
                        # Not back on the list page: restart the whole run,
                        # at most MAX_RETRY times.
                        if retry_count < MAX_RETRY:
                            self.monitor.stop()
                            self.monitor.join()
                            return self.main(device_id, start_page, end_page, task_id,
                                             max_duration_minutes, retry_count + 1)
                        # NOTE(review): no end-of-task report is sent on this path.
                        print("超过最大重试次数,终止程序")
                        return
                    print('回退到列表页', True)
                    time.sleep(self.get_sleep_time())
                    spider_no += 1
                except Exception as e:
                    print(f'采集药品详情数据出错:{e}')
                    self.loggerPdd.error(f'采集药品详情数据出错:{e}')
                    if not self.distinct_target():
                        self.swipe_back(1)
                        # The original probes the list page twice after the
                        # back gesture; kept so a slow transition still gets
                        # the same second chance.
                        if not (self.distinct_target() or self.distinct_target()):
                            print('页面出错,退出采集')
                            return

            # Move to the next page unless this was the last requested one.
            if idx < end_page:
                if self.d(textStartsWith="抱歉,没有更多商品啦~").exists:
                    self.loggerPdd.info(f'在第 {idx} 页已到达列表最底部')
                    report_end('completed', 1, idx)
                    return
                print('开始滑动')
                self.d.drag(200, 1400, 200, 300, 0.4)
                print('滑动结束')
                time.sleep(self.get_sleep_time())

        # All requested pages were processed.
        report_end('completed', 1, end_page)
        print('开始滑入下一页')
    except Exception as e:
        print(f"采集任务异常: {e}")
        report_end('failed', 0, last_page())
        raise
    finally:
        # Always stop the popup monitor thread.
        self.monitor.stop()
        self.monitor.join()
-
def unitest(self):
    """
    Manual scratch check, not an automated test.

    Runs OCR on the sample image ``ceshi1.jpg``, prints a generated screenshot
    file name, sleeps for 100000 seconds and finally prints the result of the
    duplicate lookup for one fixed sample record.

    :return: None
    """
    ocr_res = self.get_ocr_res('ceshi1.jpg')
    print(f'ocr_res:{ocr_res}')

    # Current wall-clock time rendered as HH-MM-SS for the file name.
    time_str = datetime.datetime.now().strftime("%H-%M-%S")
    # 4 random bytes rendered as 8 hexadecimal characters.
    random_str = secrets.token_hex(4)
    screenshot_path = f'instructionscreenshot1-{time_str}-{random_str}.png'
    print(screenshot_path)

    # Everything below only runs after this very long pause.
    time.sleep(100000)

    dup_data = {
        'product': '云南白药 参苓健脾胃颗粒10袋 补脾健胃利湿止泻 脾胃虚弱 饮食不消 或泻或吐 形瘦色萎 神疲乏力 5盒装(补脾健胃)',
        'min_price': 85,
        'shop': '堂鹭北大药房旗舰店',
        'scrape_date': '2025-03-19',
    }
    print(self.data_is_exists(dup_data))
- #pdd
-
- '''
- def main():
-
- # search_key = '999板蓝根颗粒10g*20' # 参苓健脾胃颗粒 香砂平胃颗粒 舒肝颗粒 清肺化痰丸
- keys_list = [
- #999感冒清热颗粒、 三九胃泰颗粒
- #暂时不需要的:
- # '999小儿止咳糖浆'
- # '999小儿氨酚黄那敏颗粒'
- # '999小儿感冒颗粒'
- # '999抗病毒口服液10ml*6支'
- # '今维多赐多康牌蛋白粉',
- # '必无忧盐酸特比萘芬喷雾剂30ml'
- # '999冰连清咽'
- # '999复方苦参肠炎康片12片'
- # '999强力枇杷露16袋'
- # '999三蛇胆川贝膏138'
- # '999维生素E.C颗粒9袋'
- # '三九胃泰养胃舒颗粒8袋'
- # '999止泻利颗粒15g*8'
- # '史达功右美沙芬愈创甘油醚糖浆120'
- # '999复方氨酚烷胺胶囊6粒'
- # '999可调式生理性海水鼻腔喷雾50'
- # '999小儿止咳糖浆120' #不低于19.8
- # '999小儿止咳糖浆225' #禁止挂网
- # '999小儿感冒颗粒6g*10' #不低于24.9
- # '999小儿感冒颗粒6g*24' #禁止挂网
- # '999小儿氨酚黄那敏颗粒6g*10袋' #不低于15.8
- # '999小儿氨酚黄那敏颗粒6g*20袋' #禁止挂网
- # '999小儿咽扁颗粒8g*10袋' #仅限999官旗店
- # '999阿奇霉素片'
- #2025-08-01最新 其中 藿香正气合剂两种规格 10支和6支 抗病毒口服液 12支和18支 蒲地蓝 24片 36片和44片 枇杷露225ml 小柴胡颗粒9袋和15袋 养胃舒 6袋 复方感冒灵颗粒15袋,
- #曲安奈德益康唑乳膏 30g 葡萄糖酸锌口服溶液 12支 18支 24支和30支,
- # 1、999止泻利颗粒15g*8 没有数据 2、三九胃泰养胃舒颗粒8袋 没有数据 3、999三蛇胆川贝膏138 没有数据 4、999强力枇杷露16袋 没有数据 5、999复方苦参肠炎康片12片 6、999冰连清咽 没有数据
- # '999藿香正气合剂'
- # '999藿香正气合剂10ml*6',
- # '999藿香正气合剂10ml*10',
- # '999糠酸莫米松凝胶15',
- # '999抗病毒口服液',
- # '999抗病毒口服液10ml*10'
- # '999抗病毒口服液10ml*12'
- # '999蒲地蓝消炎片',
- # '999强力枇杷露225ml',
- # '999小柴胡颗粒',
- # '999养胃舒颗粒10g*6',
- # '999复方感冒灵颗粒',
- # '999黄芪精',
- # '999曲安奈德益康唑乳膏30g',
- # '999葡萄糖酸锌口服溶液',
- # '佳美舒阿奇霉素肠溶胶囊4'
- # '三九复方金银花颗粒10g*8袋'
- # '999必无忧盐酸特比萘芬乳膏15'
- # '999复方板蓝根颗粒15g*15袋'
- # '999速复康布洛芬缓释胶囊'
- # '999维生素C咀嚼片'
- # '999精氨酸布洛芬颗粒'
- # '999强力枇杷露120ml'
- # '999强力枇杷露150ml'
- # '999强力枇杷露' #同时支持120,150和225ml
- #OTC
-
- # '999银菊清咽颗粒' #没有数据
- # '999感冒清热颗粒6g*10'
- # '999选平硝酸咪康唑乳膏20g'
- # '999糠酸莫米松乳膏10g'
- # '999表虚感冒颗粒' #没有数据
- # '999补脾益肠丸'
- # '999壮骨关节胶囊'
- # '999壮骨关节丸6g*20'
- # '999正天丸6g*15'
- # '999正天胶囊'
- # '三九胃泰胶囊'
- # '三九胃泰颗粒20g*10'
- # '三九胃泰颗粒2.5g*6'
- #10.31 new add
- '999感冒灵颗粒' #不低于15.5
- # '999皮炎平复方醋酸地塞米松乳膏20' #不低于12.5
- # '三九胃泰颗粒20g*6袋' #不低于13.5
- # '顺峰酮康他索乳膏' #包含10g和20g两个规格 10g 不低于7.5 20g 不低于12.5 '顺峰康王酮康他索乳膏'
- # '999糠酸莫米松凝胶10' #不低于26.9
- # '999板蓝根颗粒10g*20' #不低于26.9
- # '999复方氨酚烷胺胶囊12粒' #不低于17.9 #统一成:999复方氨酚烷胺胶囊
- # '999复方氨酚烷胺胶囊10粒' #禁止挂网 #统一成:999复方氨酚烷胺胶囊
- # '999复方氨酚烷胺胶囊6粒' #禁止挂网 #统一成:999复方氨酚烷胺胶囊
- # '999复方氨酚烷胺胶囊'
- # '999咽炎片0.26g*12片' #不低于13.5 #999咽炎片0.26g*12片*2板改成 999咽炎片0.26g*12片
- # '999感冒灵胶囊' #仅限999官旗店
- # '999荆防颗粒' #美团没有数据 #禁止挂网 拼多多也没数据
- # '999小儿感冒宁颗粒' #禁止挂网 999小儿感冒宁颗粒2.5g*10袋 改成 999小儿感冒宁颗粒
- # '速复康磷酸奥司他韦胶囊75mg*10' #美团没数据 # 禁止挂网 999磷酸奥司他韦胶囊75mg*10 改成 速复康磷酸奥司他韦胶囊75mg*10
- # '史达功右美沙芬愈创甘油醚糖浆120' #仅限999官旗店
- # '999感冒清热颗粒12g*18' #禁止挂网
- ]
-
- # 设备序列号
- # device_id = 'e2899b34'
- # device_id = '2e58510'
- # device_id = '369dcf96'
- # device_id = 'ea4e4eb8'
- # device_id = 'IZTOWWDQT45D49BU'
- # device_id = 'INXCDAIR75FMMFGU'
- # device_id = 'CMKFUSSG99ROR489'
- # device_id = '95b2c764'
- # device_id = 'UCQGF6CQFMU8WKHI'
- device_id = 'U8ONIJJJS4CELVD6'
- # device_id = 'OVFETO8PCY45E6A6'
- # device_id = 'IRLZAAQCDMHYWKTS'
- # device_id = 'DEZXWKUC7DJBLVPJ'
- # device_id = 'U47HZDRG8XJBBURW'
- # device_id = 'WWRO9LTGG6KFGQCM'
- # device_id = 'GQIRKB7LVOONM7VW'
- # device_id = 'ZDQWUSSWBEDI896T'
- # device_id = '656DTOPRZDEALZ5X'
- # device_id = 'N7ZXBITOSOGMYXQS'
- # device_id = '1462a51f'
- # device_id = '4TZDUGTOAIFMJVGU'
- # device_id = 'GIOFIBRKZTUGJJAE'
- # device_id = 'fcb3c749'
- cycle_no = 0 # 轮次计数
- while True:
- cycle_no += 1
- logging.info(f'========== 第 {cycle_no} 轮采集开始 ==========')
- for idx, key in enumerate(keys_list, 1):
- logging.info(f'[{idx}/{len(keys_list)}] 开始采集关键字:{key}')
- try:
- # mt = MT(key) # 用当前关键字实例化
- # mt.main(device_id) # 执行一次完整采集
- pdd = PDD(key, device_id)
- pdd.main(device_id)
- logging.info(f'关键字 {key} 本轮采集完成')
- except Exception as e:
- # 发生异常直接跳过该关键字,继续下一轮
- logging.exception(f'关键字 {key} 采集异常:{e}')
- finally:
- # 关闭当前 MT 实例资源(如有需要)
- if hasattr(pdd, 'close'):
- pdd.close()
- # logging.info('本轮全部关键字采集完成,等待 2 小时后下一轮...')
- # time.sleep(1 * 3600) # 2 小时 = 7200 秒
-
- # pdd = PDD(search_key, device_id)
- # pdd.unitest()
- # pdd.main('369dcf96')
-
- # pdd.main(device_id)
- '''
def get_retrieve_mysql():
    """
    Open a connection to the remote ``drug_retrieve`` MySQL database.

    Every connection parameter can be overridden with an environment variable
    (``DRUG_RETRIEVE_DB_HOST``, ``DRUG_RETRIEVE_DB_PORT``,
    ``DRUG_RETRIEVE_DB_USER``, ``DRUG_RETRIEVE_DB_PASSWORD``,
    ``DRUG_RETRIEVE_DB_NAME``). When a variable is not set the previous
    built-in value is used, so existing deployments keep working unchanged.

    :return: a new ``pymysql`` connection object; the caller must close it.
    """
    import os

    import pymysql

    env = os.environ
    # SECURITY(review): the fallback credentials below are committed to source
    # control. Rotate them and supply them only through the environment, then
    # remove these defaults.
    return pymysql.connect(
        host=env.get('DRUG_RETRIEVE_DB_HOST', '39.108.116.125'),
        port=int(env.get('DRUG_RETRIEVE_DB_PORT', '3306')),
        user=env.get('DRUG_RETRIEVE_DB_USER', 'drug_retrieve'),
        password=env.get('DRUG_RETRIEVE_DB_PASSWORD', 'Pem287cwM58jNpe2'),
        db=env.get('DRUG_RETRIEVE_DB_NAME', 'drug_retrieve'),
        charset='utf8mb4',
    )
class TimeoutException(Exception):
    """Exception type for timed-out operations; adds nothing to ``Exception``."""
- # 如果需要并行处理(提高效率),可以使用线程池:
def process_tasks_in_parallel(max_workers=12):
    """Load pending PDD collection tasks and run them concurrently in a thread pool.

    Tasks are rows of ``retrieve_collect_task_allocate`` with ``status = 1``
    and ``platform = 3``. A row is only scheduled when it has a non-zero
    equipment id, a non-blank product name and its equipment row (with
    ``status = 0``) yields a device id. A task's ``duration`` column is the
    time limit in minutes passed to ``PDD.main``; 30 is used when it is NULL.

    :param max_workers: maximum number of tasks executed at the same time.
    :return: None; a per-task line and a summary are printed.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from concurrent.futures import TimeoutError as FutureTimeoutError

    query = """
        SELECT id, collect_equipment_id, product_name, start_page, end_page, duration
        FROM retrieve_collect_task_allocate
        WHERE status = 1 AND platform = 3
    """
    device_query = "SELECT device_id FROM retrieve_collect_equipment WHERE id = %s AND status = 0"

    tasks = []
    device_map = {}  # equipment id -> device id (None when unavailable)

    # The connection and cursor are released on every path, including the
    # "no tasks" early return and query errors.
    retrieve_conn = get_retrieve_mysql()
    try:
        cursor = retrieve_conn.cursor()
        try:
            cursor.execute(query)
            results = cursor.fetchall()
            print(f"获取到的任务结果={results}")

            if not results:
                print("PDD 没有要采集的品规")
                return

            for task_id, collect_equipment_id, product_name, start_page, end_page, duration in results:
                if collect_equipment_id == 0 or not product_name or not product_name.strip():
                    continue

                # Look every equipment id up only once.
                if collect_equipment_id not in device_map:
                    cursor.execute(device_query, (collect_equipment_id,))
                    device_result = cursor.fetchone()
                    device_map[collect_equipment_id] = device_result[0] if device_result else None

                device_id = device_map[collect_equipment_id]
                if not device_id:
                    continue

                tasks.append({
                    'task_id': task_id,
                    'device_id': device_id,
                    'key': product_name.strip(),
                    'start_page': start_page,
                    'end_page': end_page,
                    # Time limit in minutes; default 30 when not configured.
                    'duration_minutes': duration if duration is not None else 30,
                })
        finally:
            cursor.close()
    finally:
        retrieve_conn.close()

    if not tasks:
        print("没有有效的采集任务")
        return

    print(f"准备并行处理 {len(tasks)} 个任务")

    def process_single_task(task):
        """Run one task and return a result dict; exceptions are captured, not raised."""
        pdd = None
        try:
            pdd = PDD(task['key'], task['device_id'])
            pdd.main(
                device_id=task['device_id'],
                start_page=task['start_page'],
                end_page=task['end_page'],
                task_id=task['task_id'],
                max_duration_minutes=task['duration_minutes'],
            )
            return {
                'task_id': task['task_id'],
                'success': True,
                'collected_count': pdd.collected_count,
                'final_page': pdd.current_page,
            }
        except Exception as e:
            print(f"任务 {task['task_id']} 执行异常: {e}")
            return {
                'task_id': task['task_id'],
                'success': False,
                'error': str(e),
            }
        finally:
            # Fix: the original guard looked for a local named 'mt', so the
            # scraper instance was never closed.
            if pdd is not None and hasattr(pdd, 'close'):
                try:
                    pdd.close()
                except Exception as close_err:
                    print(f"任务 {task['task_id']} 关闭资源失败: {close_err}")

    successful_tasks = 0
    failed_tasks = 0
    timeout_tasks = 0  # fix: was incremented without ever being initialised

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_task = {
            executor.submit(process_single_task, task): task
            for task in tasks
        }

        for future in as_completed(future_to_task):
            task = future_to_task[future]
            try:
                # Limit plus a 5 minute buffer. Futures yielded by
                # as_completed() are already finished, so this is only a
                # safety net; the real limit is enforced inside PDD.main.
                task_timeout = (task['duration_minutes'] + 5) * 60
                result = future.result(timeout=task_timeout)
                if result['success']:
                    successful_tasks += 1
                    print(f"任务 {result['task_id']}: 完成,采集 {result['collected_count']} 条数据")
                else:
                    failed_tasks += 1
                    print(f"任务 {result['task_id']}: 失败,错误: {result['error']}")
            except FutureTimeoutError:
                failed_tasks += 1
                timeout_tasks += 1
                print(f"任务 {task['task_id']}: 超时(限制 {task['duration_minutes']} 分钟)")
            except Exception as e:
                failed_tasks += 1
                print(f"任务 {task['task_id']}: 执行异常 {e}")

    print(f"\n并行采集完成:")
    print(f"成功: {successful_tasks} 个")
    print(f"失败: {failed_tasks} 个")
    if timeout_tasks:
        print(f"其中超时: {timeout_tasks} 个")
if __name__ == '__main__':
    def run_collection():
        """Run one collection round, printing start/end timestamps and any error."""
        stamp_format = '%Y-%m-%d %H:%M:%S'
        try:
            started_at = datetime.datetime.now().strftime(stamp_format)
            print(f"【定时任务开始】时间: {started_at}")
            process_tasks_in_parallel(max_workers=12)
            finished_at = datetime.datetime.now().strftime(stamp_format)
            print(f"【定时任务结束】时间: {finished_at}")
        except Exception as e:
            # A failed round must not stop the scheduler loop below.
            print(f"【定时任务异常】: {e}")

    # Register the recurring job, then run the first round right away.
    schedule.every(130).minutes.do(run_collection)
    run_collection()
    print("定时任务已设置,每130分钟执行一次采集")

    # Poll the scheduler once per minute, forever.
    while True:
        schedule.run_pending()
        time.sleep(60)
- # main()
- # scheduler = BlockingScheduler()
- # scheduler.add_job(main, 'cron', hour=11, minute=1, misfire_grace_time=120)
- # try:
- # scheduler.start()
- # except (KeyboardInterrupt, SystemExit):
- # pass
|