config.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
# config.py - scraper configuration for the YaoBangMang (药帮忙) platform
from datetime import datetime
from dotenv import load_dotenv
import os
import oss2
from PIL import Image
from commons.Logger import logger
from conn_mysql import MySQLPoolOnline
# Step: load the .env file (if needed).
# load_dotenv() reads .env from the current directory by default; pass an
# explicit path if the file lives elsewhere.
# load_dotenv()
# MySQL: uses conn_mysql.MySQLPoolOnline as-is (per the original note, it
# defaults to the online drug_retrieve database).
# To override, set the environment variables MYSQL_DRUG_HOST / MYSQL_DRUG_PORT /
# MYSQL_DRUG_USER / MYSQL_DRUG_PASSWORD / MYSQL_DRUG_DB.
# NOTE: the pool is instantiated at import time, so importing this module
# constructs a MySQLPoolOnline object as a side effect.
mysql_pool = MySQLPoolOnline()
  16. # 模糊匹配 URL
  17. def fuzzy_match_product_url_in_db_mysql(product_url):
  18. # 先做非空校验
  19. if not product_url:
  20. logger.warning("鈿狅笍 寰呭尮閰嶇殑 product_url 涓虹┖锛岃烦杩囨暟鎹烘煡")
  21. return None
  22. # 如有需要可转义 % / _,避免 LIKE 通配符影响
  23. # escaped_product_url = product_url.replace("%", "\%").replace("_", "\_")
  24. try:
  25. sql = "SELECT * FROM ybm_drug_middle WHERE product_link LIKE %s"
  26. rows = mysql_pool.select_data(sql, (f"%{product_url}%",))
  27. return rows[0] if rows else None
  28. except Exception as e:
  29. logger.error(f"MySQL 模糊匹配失败:{str(e)}")
  30. return None
  31. # ==================== 从数据库提取任务 ====================
  32. def get_search_keywords_from_db(platform= 9, include_collect_config=False):
  33. """读取待执行任务(status=3),返回单条任务字典;无任务返回空 dict。"""
  34. sql = " SELECT * FROM retrieve_collect_task_allocate WHERE status = 1 AND platform = %s LIMIT 1 "
  35. try:
  36. rows = mysql_pool.select_data(sql, (platform,))
  37. data = rows[0] if rows else {}
  38. logger.debug(
  39. "读取待执行任务完成 platform=%s task_id=%s",
  40. platform,
  41. data.get("id") if data else None,
  42. )
  43. except Exception as e:
  44. logger.error(f"读取待执行任务失败,platform={platform},错误:{str(e)}")
  45. data = {}
  46. print(data)
  47. return data
  48. def has_running_task(platform: int = 9) -> bool:
  49. """检查指定 platform 在当天是否存在执行中任务(status=2)。"""
  50. try:
  51. day_start_ts = int(datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp())
  52. next_day_ts = day_start_ts + 24 * 60 * 60
  53. sql = (
  54. "SELECT 1 AS one FROM retrieve_collect_task_allocate "
  55. "WHERE status = 2 AND platform = %s "
  56. "AND update_time >= %s AND update_time < %s LIMIT 1"
  57. )
  58. rows = mysql_pool.select_data(sql, (platform, day_start_ts, next_day_ts))
  59. return bool(rows)
  60. except Exception as e:
  61. logger.error(f"检查执行中任务失败,platform={platform},错误:{str(e)}")
  62. return False
# Manual smoke test: when the module is run as a script, fetch one task
# (get_search_keywords_from_db also prints the row it returns).
# NOTE(review): this guard sits in the middle of the file, so it runs before
# the constants and helper functions below it have been defined.
if __name__ == '__main__':
    get_search_keywords_from_db()
# ==================== 2. Anti-scraping configuration ====================
# Random delay ranges (to mimic a human operator)
MIN_CLICK_DELAY = 1.5  # minimum delay around a click (seconds)
MAX_CLICK_DELAY = 3.5  # maximum delay around a click (seconds)
MIN_INPUT_DELAY = 0.1  # minimum delay while typing (seconds)
MAX_INPUT_DELAY = 0.3  # maximum delay while typing (seconds)
MIN_PAGE_DELAY = 2.0  # minimum page wait (seconds)
MAX_PAGE_DELAY = 4.0  # maximum page wait (seconds)
# Random delay between two search keywords (seconds)
MIN_KEYWORD_DELAY = 8.0
MAX_KEYWORD_DELAY = 15.0
# Scrolling (target of 400px plus a random offset)
SCROLL_TARGET_DISTANCE = 400  # target scroll distance
SCROLL_OFFSET_RANGE = 50  # random offset range
SCROLL_STEP = 50  # distance scrolled per step
SCROLL_INTERVAL = 0.05  # pause between steps (seconds)
  81. # ==================== 3. Cookie & 登录配置 ====================
  82. COOKIE_FILE_PATH = "ybm_cookies.json" # Cookie 保存路径
  83. # Cookie 有效性验证页面
  84. LOGIN_VALIDATE_URL = "https://www.ybm100.com/new/"
  85. # 登录账号密码
  86. USERNAME = "18008650300"
  87. PASSWORD = "12345678"
  88. # USERNAME = "yjj112031"
  89. # PASSWORD = "123456"
  90. # 登录URL
  91. TARGET_LOGIN_URL = "https://www.ybm100.com/new/login"
  92. # "https://www.yyjzt.com/login?redirect=%2FgoodDetail%3FladderNum%26itemStoreId%3D124250306%26sourceProdetail%3D%252Fsearch%26is_store%3D0"
# ==================== 4. Element selector configuration ====================
# Basic selectors (login form and search box)
USERNAME_SELECTOR = "input[placeholder*=请输入账号]"
PASSWORD_SELECTOR = "input[placeholder*=请输入密码]"
LOGIN_BTN_SELECTOR = "button:has(span:text('登录'))"
SEARCH_INPUT_SELECTOR = "input[placeholder*='药品名称/厂家名称']"
SEARCH_INPUT_SELECTOR2 = "div.home-search-container-search-head"
SEARCH_BTN_SELECTOR = "div.home-search-container-search-head-btn[data-scmd=\"text-搜索\"]"
# The scraping selectors below can be adjusted to the actual page.
# If the page structure changes, update these selectors first.
PRODUCT_ITEM_SELECTOR = "div.product-list-item"  # product item container
PRODUCT_TITLE_SELECTOR = "div.product-name"  # product title
PRODUCT_PRICE_SELECTOR = "div.main-price"  # product price
# NOTE(review): "prduct" looks like a typo but may mirror the site's real class name -- confirm before changing.
PRODUCT_STORE_SELECTOR = 'div.prduct-shop-name div.shop-name'  # store name
PRODUCT_COMPANY_SELECTOR = "div.product-manufacturer"  # company / manufacturer name
PRODUCT_VALIDITY_SELECTOR = "div.product-period"  # validity period
# div.shop-info-container-left-info-name span
# ==================== 5. Wait / timeout configuration ====================
# NOTE(review): the original header labelled these values as seconds, but the
# magnitudes (and the literal timeout=10000 passed to page.screenshot later in
# this file) suggest milliseconds -- confirm against the callers.
ELEMENT_TIMEOUT = 10000
LOGIN_AFTER_CLICK = 5000
SEARCH_BTN_TIMEOUT = 5000
COLLECT_DELAY = 3000
DETAIL_LOAD_TIMEOUT = 5000  # wait for the detail view to load after clicking a product
# ==================== 6. Browser configuration ====================
BROWSER_HEADLESS = False
BROWSER_CHANNEL = "chrome"
# Lower / upper bound for the browser slow-motion setting
# (presumably milliseconds -- TODO confirm against the launcher code).
SLOW_MO_MIN = 50
SLOW_MO_MAX = 100
# ==================== 7. CSV configuration ====================
# Column headers, in order: product title, purchase price, discounted price,
# specification, box count, store name, company name, expiry date,
# production date, approval number, collection time.
CSV_HEADERS = [
    "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
    "店铺名称", "公司名称",
    "有效日期", "生产日期", "批准文号", "采集时间"
]  # CSV header row
  127. # 存放营业执照截图路径(如需)
  128. # cropped_screenshot_path =
  129. # 百度 OCR 配置
  130. request_url_config = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  131. AppKey_config = "tRK2RhyItCSh6BzyT4CNVXQa"
  132. AppSecret_config = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  133. token_url_config = 'https://aip.baidubce.com/oauth/2.0/token'
  134. # ---------------------- OSS 配置 ----------------------
  135. OSS_ACCESS_KEY_ID = 'LTAI5tDwjfteBvivYN41r8sJ'
  136. OSS_ACCESS_KEY_SECRET = 'yowuOGi2nYYnrqGpO3qcz94C4brcPp'
  137. OSS_ENDPOINT = "oss-cn-shenzhen.aliyuncs.com"
  138. OSS_BUCKET_NAME = "zhijiayun-jiansuo"
  139. OSS_PREFIX = "scrape_data/"
# Local screenshot configuration
LOCAL_SCREENSHOT_DIR = "local_screenshots"  # directory for raw screenshots
LOCAL_SCREENSHOT_NAME = None  # None means the file name is generated automatically
LOCAL_CROPPED_DIR = "./local_cropped_screenshots"  # directory for cropped screenshots
# Image compression configuration
IMAGE_COMPRESS_ENABLE = True  # whether compression is enabled (True/False)
IMAGE_COMPRESS_QUALITY = 30  # JPEG quality (1-95)
IMAGE_COMPRESS_PNG_LEVEL = 9  # PNG compression level (0-9)
  148. # ---------------------- 宸ュ叿鍑芥暟 ----------------------
  149. def init_local_screenshot_dir():
  150. """初始化本地截图目录(不存在则创建)。"""
  151. if not os.path.exists(LOCAL_SCREENSHOT_DIR):
  152. os.makedirs(LOCAL_SCREENSHOT_DIR)
  153. logger.info(f"本地截图目录已创建: {LOCAL_SCREENSHOT_DIR}")
  154. else:
  155. logger.debug(f"本地截图目录已存在: {LOCAL_SCREENSHOT_DIR}")
  156. def init_oss_bucket():
  157. """初始化 OSS Bucket 对象。"""
  158. try:
  159. auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
  160. bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET_NAME)
  161. bucket.get_bucket_info()
  162. logger.info("OSS Bucket 初始化成功")
  163. return bucket
  164. except Exception as e:
  165. logger.error(f"OSS Bucket 初始化失败: {str(e)}")
  166. raise
  167. def upload_local_screenshot_to_oss(bucket, local_file_path, oss_file_path=None):
  168. """将本地截图上传到 OSS,返回公网访问链接。"""
  169. if not os.path.exists(local_file_path):
  170. raise FileNotFoundError(f"本地截图文件不存在: {local_file_path}")
  171. if not oss_file_path:
  172. local_file_name = os.path.basename(local_file_path)
  173. oss_file_path = f"screenshots/{local_file_name}"
  174. try:
  175. bucket.put_object_from_file(oss_file_path, local_file_path)
  176. oss_file_url = f"https://{OSS_BUCKET_NAME}.{OSS_ENDPOINT}/{oss_file_path}"
  177. logger.info(f"截图上传 OSS 成功: {oss_file_url}")
  178. return oss_file_url
  179. except Exception as e:
  180. logger.error(f"截图上传 OSS 失败: {str(e)}")
  181. raise
  182. # ---------------------- 图片裁剪与压缩 ----------------------
  183. def crop_local_screenshot(local_file_path, cropped_file_path=None, crop_region=None):
  184. """裁剪本地截图并保存,成功后删除原图,返回裁剪后文件路径。"""
  185. if not os.path.exists(local_file_path):
  186. raise FileNotFoundError(f"原始截图文件不存在: {local_file_path}")
  187. os.makedirs(LOCAL_CROPPED_DIR, exist_ok=True)
  188. if not cropped_file_path:
  189. file_name = os.path.basename(local_file_path)
  190. file_name_no_ext, file_ext = os.path.splitext(file_name)
  191. cropped_file_name = f"{file_name_no_ext}_cropped{file_ext}"
  192. cropped_file_path = os.path.join(LOCAL_CROPPED_DIR, cropped_file_name)
  193. with Image.open(local_file_path) as img:
  194. img_width, img_height = img.size
  195. logger.debug(f"原图尺寸: width={img_width}, height={img_height}")
  196. if not crop_region:
  197. left = 0
  198. upper = 0
  199. right = int(img_width)
  200. lower = int(img_height * 0.3)
  201. crop_region = (left, upper, right, lower)
  202. logger.debug(f"未指定裁剪区域,使用默认区域: {crop_region}")
  203. c_left, c_upper, c_right, c_lower = crop_region
  204. if c_right > img_width or c_lower > img_height or c_left < 0 or c_upper < 0:
  205. raise ValueError(
  206. f"裁剪区域超出图片范围,图片尺寸=({img_width}, {img_height}),裁剪区域={crop_region}"
  207. )
  208. cropped_img = img.crop(crop_region)
  209. file_ext = os.path.splitext(cropped_file_path)[1].lower()
  210. try:
  211. if IMAGE_COMPRESS_ENABLE:
  212. if file_ext in ['.jpg', '.jpeg']:
  213. cropped_img.save(
  214. cropped_file_path,
  215. format='JPEG',
  216. quality=IMAGE_COMPRESS_QUALITY,
  217. optimize=True,
  218. progressive=True,
  219. )
  220. else:
  221. cropped_img.save(cropped_file_path)
  222. logger.info(f"裁剪图片已保存(压缩开启): {cropped_file_path}")
  223. else:
  224. cropped_img.save(cropped_file_path, format='JPEG')
  225. logger.info(f"裁剪图片已保存(压缩关闭): {cropped_file_path}")
  226. except Exception as e:
  227. cropped_img.save(cropped_file_path, format='JPEG')
  228. logger.warning(f"图片压缩失败,已按普通 JPEG 保存: {str(e)}")
  229. try:
  230. if os.path.exists(cropped_file_path):
  231. os.remove(local_file_path)
  232. logger.debug(f"已删除原始截图: {local_file_path}")
  233. else:
  234. logger.warning(f"裁剪文件不存在,跳过删除原图: {cropped_file_path}")
  235. except OSError as e:
  236. logger.warning(f"删除原始截图失败: {str(e)}")
  237. return cropped_file_path
  238. def screenshot_target_page_to_local_then_oss(target_page, local_file_path=None, oss_file_path=None, full_page=True, crop_region=None):
  239. """页面截图到本地后裁剪,再上传 OSS,返回(裁剪路径, OSS链接)。"""
  240. os.makedirs(LOCAL_SCREENSHOT_DIR, exist_ok=True)
  241. if not local_file_path:
  242. current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
  243. local_file_name = f"{current_time}_target_page.jpg"
  244. local_file_path = os.path.join(LOCAL_SCREENSHOT_DIR, local_file_name)
  245. logger.info(f"开始页面截图: {local_file_path}")
  246. target_page.screenshot(
  247. path=local_file_path,
  248. full_page=full_page,
  249. omit_background=False,
  250. timeout=10000,
  251. )
  252. logger.debug("页面截图完成")
  253. cropped_file_path = crop_local_screenshot(
  254. local_file_path=local_file_path,
  255. crop_region=crop_region,
  256. )
  257. bucket = init_oss_bucket()
  258. oss_file_url = upload_local_screenshot_to_oss(bucket, cropped_file_path, oss_file_path)
  259. return cropped_file_path, oss_file_url