spider.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. # mt_spider/spider.py
  2. import re, time, random, datetime, json, logging, requests, base64, cv2, uiautomator2 as u2, subprocess
  3. from monitor import SpiderMonitor
  4. from config import Config
  5. from db import get_mysql
  6. # ------------------ 装饰器 ------------------
  7. def safe_method(func):
  8. import functools
  9. @functools.wraps(func)
  10. def wrapper(self, *args, **kwargs):
  11. return self.safe_exec(func, self, *args, **kwargs)
  12. return wrapper
  13. # ------------------ 主类 ------------------
  14. class MT:
  15. def __init__(self, key: str):
  16. self.package_name = Config.PACKAGE_NAME
  17. self.search_key = key
  18. self.unrelated_data = 0
  19. self.monitor = None
  20. self.d = None
  21. # ------------------ 通用 ------------------
  22. def safe_exec(self, func, *args, **kwargs):
  23. while self.monitor and self.monitor.pausing.is_set():
  24. time.sleep(1)
  25. return func(*args, **kwargs)
  26. @staticmethod
  27. def get_sleep_time():
  28. return random.randint(5, 8)
  29. @staticmethod
  30. def get_current_date():
  31. return datetime.datetime.now().strftime('%Y-%m-%d')
  32. def stop_all(self):
  33. logging.warning("收到停止信号,准备退出")
  34. if self.monitor:
  35. self.monitor.stop()
  36. # ------------------ 设备/APP ------------------
  37. def connect_devices(self, device_id):
  38. """
  39. 连接设备
  40. :return:
  41. """
  42. # try:
  43. # self.d = u2.connect_usb(device_id)
  44. # # 设置隐形等待时间
  45. # # self.d.implicitly_wait(5)
  46. # self.restart_uiautomator_services(device_id)
  47. # print(f'连接到设备:{device_id}')
  48. # except Exception as e:
  49. # print(f'{device_id} 连接错误: {e}')
  50. # raise Exception(e)
  51. self.d = u2.connect_usb(device_id)
  52. print(f'连接到设备:{device_id}')
  53. subprocess.run(
  54. f'adb -s {device_id} shell /data/local/tmp/atx-agent server -d'.split(),
  55. capture_output=True
  56. )
  57. time.sleep(3)
  58. def restart_app(self):
  59. self.d.app_stop(self.package_name)
  60. time.sleep(2)
  61. self.d.app_start(self.package_name)
  62. time.sleep(5)
  63. # ------------------ 页面操作 ------------------
  64. def enter_target_page(self):
  65. self.d.xpath('//*[@content-desc="看病买药"]').click()
  66. self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/vf_search_carousel_text"]').click()
  67. self.d.xpath('//*[@text="搜索"]').click()
  68. self.d.send_keys(self.search_key, clear=True)
  69. self.d.xpath('//*[@text="搜索"]').click()
  70. # ------------------ 数据抓取 ------------------
  71. @safe_method
  72. def get_title(self):
  73. if "999" in self.search_key:
  74. self.search_key = self.search_key.replace("999", "")
  75. title = self.d.xpath(f'//*[contains(@text, "{self.search_key}")]').text
  76. # title = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.FrameLayout[1]/android.widget.TextView').text
  77. print(f'获取到药品标题:{title}')
  78. match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title)
  79. if match:
  80. drugs_name = match.group(1).strip() + match.group(2).strip()
  81. specifications = match.group(3).strip()
  82. print("药品名:", drugs_name)
  83. print("规格:", specifications)
  84. print('完整药名:', drugs_name + specifications)
  85. return drugs_name, specifications
  86. else:
  87. print("没有匹配到预期格式")
  88. return None, None
  89. # match = re.search(r'(\[[^\]]+\])(.+?)(\d+.*)', title or "")
  90. # return (match.group(1) + match.group(2), match.group(3)) if match else (None, None)
  91. @safe_method
  92. def swipe_up(self):
  93. """
  94. 上滑
  95. :return:
  96. """
  97. screen_width = self.d.info['displayWidth']
  98. screen_height = self.d.info['displayHeight']
  99. duration_rate = random.uniform(0, 0.3)
  100. self.d.swipe(screen_width // 2, screen_height - 100, screen_width // 2, 100, duration=duration_rate)
  101. no = random.uniform(0, 1)
  102. if no > 0.85:
  103. # 有的时候卡着 再稍微往上滑一点点
  104. self.d.swipe_ext("up", 0.1)
  105. time.sleep(self.get_sleep_time())
  106. @safe_method
  107. def swipe_back(self, no):
  108. """
  109. 返回
  110. :param no: 回退次数
  111. :return:
  112. """
  113. for idx in range(no):
  114. self.d.press('back')
  115. time.sleep(self.get_sleep_time())
  116. @safe_method
  117. def drug_price(self):
  118. """
  119. 获取药品价格
  120. :return:
  121. """
  122. try:
  123. price_str = self.d.xpath('//*[starts-with(@text,"¥")]').text
  124. price = float(re.search('[\d\.]+', price_str).group())
  125. print(f'获取到价格:{price}')
  126. return price
  127. except Exception as e:
  128. print(f'提取价格出错-->{e}')
  129. return None
  130. # txt = self.d.xpath('//*[starts-with(@text,"¥")]').text
  131. # return float(re.search(r'[\d.]+', txt).group()) if txt else None
  132. @safe_method
  133. def get_shop_name(self):
  134. try:
  135. return self.d.xpath('//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView').text
  136. except:
  137. try:
  138. return self.d.xpath('//android.widget.ScrollView/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[last()-1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.FrameLayout[1]/android.widget.TextView').text
  139. except Exception as e:
  140. logging.error('获取店铺名出错: %s', e)
  141. return None
  142. @safe_method
  143. def get_shop_address(self):
  144. try:
  145. shop_address = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/wm_sc_drug_shop_content_mrn_container_id_2"]/android.widget.FrameLayout[1]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.widget.TextView').text
  146. print(f'获取到店铺地址:{shop_address}')
  147. return shop_address
  148. except:
  149. return None
  150. @safe_method
  151. def get_qualification_number(self):
  152. try:
  153. qualification_number_str = self.d.xpath('//*[@resource-id="com.sankuai.meituan:id/mil_container"]/android.webkit.WebView[1]/android.webkit.WebView[1]/android.view.View[1]/android.view.View[1]/android.widget.TextView[2]').text
  154. return qualification_number_str.strip('资质编号:').strip()
  155. except:
  156. return None
  157. # ------------------ OCR ------------------
  158. def get_ocr_res(self, img):
  159. try:
  160. #img地址
  161. print(f'开始识别图片:{img}')
  162. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  163. # 二进制方式打开图片文件
  164. f = open(img, 'rb')
  165. img = base64.b64encode(f.read())
  166. params = {"image": img}
  167. # access_token = get_access_token()
  168. request_url = request_url + "?access_token=" + self.access_token
  169. headers = {'content-type': 'application/x-www-form-urlencoded'}
  170. response = requests.post(request_url, data=params, headers=headers)
  171. if response:
  172. res = response.json()
  173. new_dic = dict()
  174. for ite in res['words_result'].keys():
  175. new_dic[ite] = res['words_result'][ite]['words']
  176. print('资质数据信息', new_dic)
  177. return new_dic
  178. else:
  179. return None
  180. except:
  181. return None
  182. def screenshot_the_business_license(self, qualification_number: str):
  183. screenshot_path = 'screenshot1.png'
  184. self.d.screenshot(screenshot_path)
  185. img = cv2.imread(screenshot_path)
  186. # 指定裁剪区域 (left, top, right, bottom)
  187. left = 0
  188. top = 480
  189. right = 720
  190. bottom = 1420
  191. cropped_img = img[top:bottom, left:right]
  192. if qualification_number:
  193. cropped_screenshot_path = 'D:\\work\\dfwy_spider\\drug_data\\mt\\screenshot\\' + qualification_number + '.png'
  194. else:
  195. cropped_screenshot_path = 'cropped_screenshot.png'
  196. cv2.imwrite(cropped_screenshot_path, cropped_img)
  197. return cropped_screenshot_path
  198. # ------------------ 说明书 ------------------
    @safe_method
    def get_instructions_data(self):
        """Open the detailed instructions view and extract three fields from it.

        The view is scrolled for up to ``loop_page`` rounds; in every round all
        TextView texts under the instruction container are appended to one flat
        list (so text that stays visible across rounds can appear repeatedly).
        For each wanted label the list entry right after the label's first
        occurrence is taken as its value.

        :return: dict with keys "有效期", "生产单位" and "批准文号"; a value is ""
                 when its label was not found or was the last entry.
        """
        self.d.xpath('//*[@text="说明"]').click()
        time.sleep(1)
        self.d.xpath('//*[@text="查看详细说明"]').click()
        time.sleep(1)
        self.d.xpath('//*[@text="加载更多"]').click_exists()
        loop_page = 5
        new_list = []
        for i in range(loop_page):
            self.d.xpath('//*[@text="加载更多"]').click_exists()
            time.sleep(0.2)
            # The first round uses a shorter, timed swipe; later rounds swipe
            # almost a full screen.
            if i == 0:
                self.d.swipe(200, 1000, 200, 300, 0.4)
            else:
                self.d.swipe(200, 1000, 200, 62)
            time.sleep(0.2)
            if self.d.xpath('//*[@text="加载更多"]').exists:
                self.d.xpath('//*[@text="加载更多"]').click()
            time.sleep(0.2)
            # Count the child groups of the instruction container, then read
            # the TextViews of each child by position.
            all_tt = self.d.xpath(
                '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup').all()
            for idx in range(1, len(all_tt) + 1):
                all_tt1 = self.d.xpath(
                    f'//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[{idx}]//android.widget.TextView').all()
                print(f'当前说明书列表数据:{all_tt1}')
                for tt in all_tt1:
                    if tt.text and tt.text != '展开全文':
                        new_list.append(tt.text)
            # Decide whether to keep scrolling: stop once the visible container
            # is shorter than 938 px.  The first round never stops here.
            # NOTE(review): 938 looks tied to one screen size - confirm.
            if i == 0:
                height = 938
            else:
                drug_box = self.d.xpath(
                    '//*[@resource-id="com.sankuai.meituan:id/container"]/android.widget.FrameLayout[1]/android.widget.RelativeLayout[1]/android.widget.FrameLayout[1]/android.widget.LinearLayout[1]/android.widget.FrameLayout[1]/android.widget.FrameLayout[3]/android.widget.FrameLayout[1]/android.view.ViewGroup[1]/android.view.ViewGroup[2]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[3]/android.widget.ScrollView[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]/android.view.ViewGroup[1]').info
                bounds = drug_box['bounds']
                height = bounds['bottom'] - bounds['top']
            if height < 938:
                # Reached the bottom of the instructions.
                break
        # Drop any "expand full text" button captions (already filtered while
        # collecting, so this is a second safety pass).
        new_list = [item for item in new_list if item != '展开全文']
        print(f'当前说明书列表数据:{new_list}')
        res_data = {
            "有效期": (new_list[new_list.index("有效期") + 1]) if "有效期" in new_list and new_list.index("有效期") + 1 < len(new_list) else "",
            "生产单位": (new_list[new_list.index("生产单位") + 1]) if "生产单位" in new_list and new_list.index("生产单位") + 1 < len(new_list) else "",
            "批准文号": (new_list[new_list.index("批准文号") + 1]) if "批准文号" in new_list and new_list.index("批准文号") + 1 < len(new_list) else ""
        }
        print(f'当前说明书字典数据:{res_data}')
        return res_data
  248. # ------------------ 店铺资质 ------------------
  249. def enter_shop(self):
  250. self.d.xpath('//*[@text="店铺"]').click()
  251. time.sleep(self.get_sleep_time())
  252. def enter_shoper(self):
  253. self.d.xpath('//*[@text="商家"]').click()
  254. time.sleep(self.get_sleep_time())
  255. def scan_shoper_license(self):
  256. self.d.xpath('//*[@text="查看商家资质"]').click()
  257. time.sleep(self.get_sleep_time())
  258. @safe_method
  259. def get_license_info_ex(self):
  260. self.enter_shop()
  261. self.enter_shoper()
  262. contact_address = self.get_shop_address()
  263. self.scan_shoper_license()
  264. qualification_number = self.get_qualification_number()
  265. if qualification_number:
  266. business_license_company = ''
  267. business_license_address = ''
  268. self.d.click(0.603, 0.27)
  269. time.sleep(self.get_sleep_time())
  270. img_path = self.screenshot_the_business_license(qualification_number)
  271. print(f'cropped_screenshot_path:{img_path}')
  272. ocr_res = self.get_ocr_res(img_path)
  273. print(f'ocr_res:{ocr_res}')
  274. if ocr_res:
  275. if '单位名称' in ocr_res.keys():
  276. business_license_company = ocr_res['单位名称']
  277. if '地址' in ocr_res.keys():
  278. business_license_address = ocr_res['地址']
  279. license_info_data = {'contact_address': contact_address, 'qualification_number': qualification_number, 'business_license_company': business_license_company, 'business_license_address': business_license_address}
  280. else:
  281. license_info_data = {'contact_address': contact_address, 'qualification_number': '', 'business_license_company': '', 'business_license_address': ''}
  282. return license_info_data
  283. # ------------------ 数据库 ------------------
  284. def data_is_exists(self, data):
  285. try:
  286. conn = get_mysql()
  287. cur = conn.cursor()
  288. query_sql = """
  289. SELECT * FROM {}
  290. WHERE product = %s
  291. AND min_price = %s
  292. AND shop = %s
  293. AND scrape_date = %s
  294. AND platform = %s
  295. """.format(Config.DB_TABLE)
  296. cur.execute(query_sql, (
  297. data['product'],
  298. data['min_price'],
  299. data['shop'],
  300. data['scrape_date'],
  301. data['platform']
  302. ))
  303. result = cur.fetchone()
  304. return bool(result) # 如果存在返回True,否则False
  305. except Exception as e:
  306. print(f"MySQL 错误: {str(e)}")
  307. logging.error('检查商品存在性失败: %s', e)
  308. return False
  309. def shop_is_exists_database(self, shop):
  310. try:
  311. conn = get_mysql()
  312. cur = conn.cursor()
  313. query_sql = """
  314. SELECT * FROM {}
  315. WHERE shop = %s
  316. """.format(Config.DB_SHOP_TABLE)
  317. cur.execute(query_sql, (
  318. shop
  319. ))
  320. result = cur.fetchone()
  321. return bool(result) # 如果存在返回True,否则False
  322. except Exception as e:
  323. print(f"MySQL 错误: {str(e)}")
  324. logging.error('检查店铺存在性失败: %s', e)
  325. return False
  326. def save_to_database(self, data):
  327. print(f'保存数据到数据库:{data}')
  328. try:
  329. conn = get_mysql()
  330. cur = conn.cursor()
  331. add_sql = f"""
  332. INSERT INTO {Config.DB_TABLE}
  333. (product, min_price, manufacture_date, expiry_date, shop, business_license_company, province, city, manufacturer, specification, approval_number, product_link, scrape_date, scrape_province, availability, credit_code, platform)
  334. VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
  335. """
  336. cur.execute(add_sql, (
  337. data['product'], data['min_price'], data['manufacture_date'], data['expiry_date'], data['shop'],
  338. data['business_license_company'], data['province'], data['city'], data['manufacturer'],
  339. data['specification'], data['approval_number'], data['product_link'], data['scrape_date'],
  340. data['scrape_province'], data['availability'], data['credit_code'], data['platform']
  341. ))
  342. conn.commit() # 提交数据
  343. print(f"存入数据库成功")
  344. logging.info('商品数据已入库')
  345. except Exception as e:
  346. logging.error('写入商品数据失败: %s', e)
  347. def save_shop_info_to_database(self, data):
  348. print(f'保存店铺数据到数据库:{data}')
  349. try:
  350. conn = get_mysql()
  351. cur = conn.cursor()
  352. add_sql = f"""
  353. INSERT INTO {Config.DB_TABLE}
  354. (shop, contact_address, qualification_number, business_license_company, business_license_address, scrape_date, platform)
  355. VALUES (%s, %s, %s, %s, %s, %s, %s)
  356. """
  357. cur.execute(add_sql, (
  358. data['shop'], data['contact_address'], data['qualification_number'],
  359. data['business_license_company'], data['business_license_address'],
  360. data['scrape_date'], data['platform']
  361. ))
  362. conn.commit() # 提交数据
  363. print(f'存入店铺信息到数据库成功')
  364. logging.info('店铺数据已入库')
  365. except Exception as e:
  366. logging.error('写入店铺数据失败: %s', e)
  367. # ------------------ 采集一条完整商品数据 ------------------
  368. @safe_method
  369. def integrate_data(self):
  370. """
  371. 采集一条完整商品 + 店铺信息并入库
  372. """
  373. logger = logging.getLogger()
  374. logger.info('开始采集当前商品详情')
  375. # 1. 商品名 + 规格
  376. title_info = self.get_title()
  377. if not title_info:
  378. logger.warning('未获取到标题,跳过')
  379. self.swipe_back(1)
  380. return
  381. product, specifications = title_info
  382. if self.search_key not in product.replace(' ', ''):
  383. logger.info('无关商品,跳过')
  384. self.unrelated_data += 1
  385. self.swipe_back(1)
  386. return
  387. # 2. 价格
  388. min_price = self.drug_price()
  389. if min_price is None:
  390. logger.warning('未获取到价格,跳过')
  391. self.swipe_back(1)
  392. return
  393. # 3. 自营判断
  394. if self.d.xpath('//*[@text="自营"]').exists:
  395. shop = "美团自营大药房(快递电商)"
  396. scrape_date = self.get_current_date()
  397. dup_data = {
  398. 'product': product, 'min_price': min_price,
  399. 'shop': shop, 'scrape_date': scrape_date, 'platform': '美团'
  400. }
  401. if self.data_is_exists(dup_data):
  402. logger.info('自营商品已存在,跳过')
  403. self.swipe_back(1)
  404. return
  405. else:
  406. # 4. 非自营:找进店
  407. for i in range(3):
  408. if self.d.xpath('//*[@text="进店"]').exists:
  409. print('开始获取店铺名1')
  410. break
  411. self.d.swipe_ext('up', 0.2)
  412. time.sleep(1)
  413. if self.d.xpath('//*[@text="进店"]').exists:
  414. print('开始获取店铺名2')
  415. break
  416. shop = self.get_shop_name()
  417. scrape_date = self.get_current_date()
  418. dup_data = {
  419. 'product': product, 'min_price': min_price,
  420. 'shop': shop, 'scrape_date': scrape_date, 'platform': '美团'
  421. }
  422. if self.data_is_exists(dup_data):
  423. logger.info('商品已存在,跳过')
  424. self.swipe_back(1)
  425. return
  426. if not shop or '自营' in shop:
  427. logger.info('店铺为自营或空,跳过')
  428. self.swipe_back(1)
  429. return
  430. # 5. 采集店铺资质(仅新店铺)
  431. if self.d.xpath('//*[@text="进店"]').exists and '美团官方' not in shop and not self.shop_is_exists_database(shop):
  432. lic = self.get_license_info_ex()
  433. save_shop_data = {
  434. 'shop': shop,
  435. 'contact_address': lic['contact_address'],
  436. 'qualification_number': lic['qualification_number'],
  437. 'business_license_company': lic['business_license_company'],
  438. 'business_license_address': lic['business_license_address'],
  439. 'scrape_date': scrape_date,
  440. 'platform': '美团'
  441. }
  442. self.save_shop_info_to_database(save_shop_data)
  443. self.swipe_back(2) # 返回两次:资质页 -> 店铺 -> 列表
  444. # 6. 说明书信息
  445. if not self.has_instructions():
  446. logger.info('无说明书,跳过')
  447. self.swipe_back(1)
  448. return
  449. instructions = self.get_instructions_data()
  450. expiry_date = instructions.get('有效期', '').strip('。')
  451. manufacturer = instructions.get('生产单位', '').strip('。')
  452. approval_number = instructions.get('批准文号', '').strip('。')
  453. # 7. 组装入库数据
  454. save_data = {
  455. 'product': product,
  456. 'min_price': min_price,
  457. 'manufacture_date': '',
  458. 'expiry_date': expiry_date,
  459. 'shop': shop,
  460. 'business_license_company': '',
  461. 'province': '',
  462. 'city': '',
  463. 'manufacturer': manufacturer,
  464. 'specification': specifications,
  465. 'approval_number': approval_number,
  466. 'product_link': '',
  467. 'scrape_date': scrape_date,
  468. 'scrape_province': '广东',
  469. 'availability': '',
  470. 'credit_code': '',
  471. 'platform': '美团'
  472. }
  473. self.save_to_database(save_data)
  474. logger.info('商品数据已入库:%s', product)
  475. self.unrelated_data = 0
  476. self.swipe_back(1)
  477. # ------------------ 主流程 ------------------
    def main(self, device_id, retry_count=0):
        """Run the crawl loop on one device.

        Connects to the device, starts a ``SpiderMonitor``, restarts the app,
        opens the search results and then walks up to 300 result pages,
        opening each listed product that lies inside the accepted vertical
        band and scraping it with ``integrate_data``.

        :param device_id: adb serial of the device to drive.
        :param retry_count: number of restarts already performed; the method
            calls itself with ``retry_count + 1`` when the app is found back
            on the search page, up to ``MAX_RETRY`` times.
        :return: None
        """
        MAX_RETRY = 3
        logger = logging.getLogger()
        spider_no = 0
        self.connect_devices(device_id)
        time.sleep(self.get_sleep_time())
        self.monitor = SpiderMonitor(self)
        self.monitor.start()
        try:
            self.restart_app()
            self.enter_target_page()
            for idx in range(300):
                logger.info('========== 第 %s 页 ==========', idx + 1)
                # Throttle: the comparison is "> 30", so the pause happens once
                # the counter has reached 31 although the message says 30.
                if spider_no > 30:
                    logger.info('已采集 30 条,休息 120 秒')
                    time.sleep(120)
                    spider_no = 0
                if self.monitor.verification_count >= self.monitor.MAX_VERIFICATION_RETRY:
                    logger.warning('验证码重试超限,等待人工处理')
                    # NOTE(review): confirm that d.toast is callable in the
                    # installed uiautomator2 version.
                    self.d.toast('请处理验证码后点击继续', 30)
                    self.monitor.verification_count = 0
                    continue
                drug_lis = self.safe_exec(
                    self.d.xpath('//android.support.v7.widget.RecyclerView/android.widget.FrameLayout').all
                )
                lis_len = len(drug_lis)
                logger.info('当前页面共有 %s 个商品', lis_len)
                for drug_one in drug_lis:
                    bounds = drug_one.info['bounds']
                    top, bottom = bounds['top'], bounds['bottom']
                    # Only open cards that lie completely inside the vertical
                    # band 304..1559 px.
                    if not (304 <= top and bottom <= 1559):
                        continue
                    self.safe_exec(drug_one.click)
                    time.sleep(2)
                    try:
                        self.integrate_data()
                        spider_no += 1
                    except Exception as e:
                        logger.exception('采集详情异常: %s', e)
                        self.swipe_back(1)
                        continue
                    if self.safe_exec(self.distinct_target):
                        logger.debug('已返回列表页')
                    else:
                        # Not on the result list: restart when the search page
                        # is showing, otherwise give up.
                        if self.d.xpath('//*[@text="搜索"]').exists:
                            logger.warning('已回到搜索页,重新开始流程')
                            if retry_count < MAX_RETRY:
                                # Stop this run's monitor before the nested run
                                # creates its own.
                                self.monitor.stop()
                                self.monitor.join()
                                return self.main(device_id, retry_count + 1)
                            else:
                                logger.error('超过最大重试次数,终止')
                                return
                        else:
                            logger.error('无法恢复页面,终止')
                            return
                    time.sleep(self.get_sleep_time())
                if self.d.xpath('//*[@text="已经到底啦"]').exists:
                    logger.info('已到底')
                    break
                # Drag up to bring the next batch of results into view.
                self.d.drag(300, 1400, 300, 400, 1)
                time.sleep(self.get_sleep_time())
        finally:
            # Runs on every exit path, including after a nested retry returns;
            # at that point self.monitor is the nested run's monitor.
            self.monitor.stop()
            self.monitor.join()