|
|
@@ -1,9 +1,8 @@
|
|
|
"""定时从调度库拉取任务、执行爬虫并上报状态的通用入口。"""
|
|
|
-
|
|
|
+import random
|
|
|
import time
|
|
|
import json
|
|
|
import requests
|
|
|
-import schedule
|
|
|
|
|
|
from commons.Logger import logger
|
|
|
from commons.conn_mysql import MySQLPoolOnline
|
|
|
@@ -74,35 +73,54 @@ class CollectScheduleRunner:
|
|
|
logger.info(json.dumps(task_dict))
|
|
|
return task_dict
|
|
|
|
|
|
def heartbeat_task(self):
    """Send a best-effort heartbeat for the current task to the schedule API.

    Issues a GET request to the heartbeat endpoint with ``self.task_id``
    passed as ``collect_task_allocate_id``. Any failure (network error,
    timeout, or non-2xx response) is logged and swallowed so that a
    heartbeat problem never aborts the surrounding crawl run.
    """
    url = "https://scheduleapi.findit.ltd/api/collect_equipment_execute/heartbeat"
    params = {
        "collect_task_allocate_id": self.task_id,
    }
    try:
        # Use the heartbeat endpoint defined above, not the result-report URL.
        res = requests.get(url, params=params, timeout=20)
        # A 4xx/5xx answer means the heartbeat was not accepted; surface it
        # as a failure instead of logging a false success.
        res.raise_for_status()
    except Exception as e:
        # Deliberately broad: the heartbeat is best-effort and must not
        # propagate errors into the caller.
        logger.warning("心跳任务上报失败: %s", e)
    else:
        logger.info("心跳任务上报成功")
|
|
|
+
|
|
|
def run(self):
    """Fetch one task, crawl it, and report the outcome.

    Steps, in order:
      1. Pull a task via ``get_task``; log and return when there is none.
      2. Report status 2 before the spider starts.
      3. Run ``spider_cls(task_dict).run()``, which yields the crawl count
         and a success flag.
      4. Send a heartbeat, push a text notification with the crawl count,
         and report status 3 on success or 4 otherwise.

    NOTE(review): the meaning of status codes 2/3/4 is defined by the
    schedule API, not by this method; confirm against ``_report_status``.
    """
    task = self.get_task()
    self.task_dict = task
    if not task:
        logger.info("%s暂无任务", self.platform_name)
        return

    self._report_status(2)

    spider = self.spider_cls(self.task_dict)
    self.crawl_count, succeeded = spider.run()

    self.heartbeat_task()

    notice = (
        f"{time.strftime('%Y-%m-%d %H:%M:%S')} 通知:\n"
        f"平台: {self.platform_name}, 药品: {self.task_dict.get('product_name')}, "
        f"爬取数据: {self.crawl_count}条"
    )
    send_text(notice)

    if succeeded:
        final_status = 3
    else:
        final_status = 4
    self._report_status(final_status)
|
|
|
|
|
|
def run_scheduled_loop(
    platform_name,
    platform_id,
    spider_cls,
    *,
    interval_minutes=5,
    sleep_seconds=3,
):
    """Run the crawler in an endless loop, pausing between rounds.

    Each round builds a fresh ``CollectScheduleRunner`` and calls its
    ``run()``. Exceptions raised by a round are logged and do not stop
    the loop.

    The pause between rounds is chosen once, at start-up, as a random whole
    number of seconds between 60% and 100% of ``interval_minutes``. With the
    default of 5 minutes this is the 180-300 second window.

    Args:
        platform_name: Platform label, used in log messages and passed to
            ``CollectScheduleRunner``.
        platform_id: Platform identifier passed to ``CollectScheduleRunner``.
        spider_cls: Spider class passed to ``CollectScheduleRunner``.
        interval_minutes: Upper bound of the pause between rounds, in minutes.
        sleep_seconds: Unused; kept only so existing callers that pass it
            keep working.

    This function never returns.
    """
    # Derive the pause window from interval_minutes instead of hard-coding it.
    # Integer arithmetic keeps the default at exactly 180..300 seconds, and the
    # floor of one second prevents a busy loop for tiny or zero intervals.
    max_idle = max(1, int(interval_minutes * 60))
    min_idle = max(1, max_idle * 3 // 5)
    idle_seconds = random.randint(min_idle, max_idle)
    logger.info(
        "循环任务已启动,平台=%s,每轮间隔 %s 秒",
        platform_name,
        idle_seconds,
    )
    while True:
        try:
            logger.info("开始执行%s爬虫任务", platform_name)
            CollectScheduleRunner(platform_name, platform_id, spider_cls).run()
            logger.info("%s爬虫任务执行完成", platform_name)
        except Exception as e:
            # Top-level boundary: a failed round must not kill the loop.
            logger.error("%s爬虫任务执行失败: %s", platform_name, e, exc_info=True)
        time.sleep(idle_seconds)
|