config.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. # config.py - 药帮忙采集配置
  2. from datetime import datetime
  3. import pymysql
  4. from dotenv import load_dotenv
  5. import os
  6. import oss2
  7. from PIL import Image
  8. from logger_config import logger
  9. # 步骤:加载 .env 文件(如需)
  10. # load_dotenv() 默认读取当前目录下的 .env;如在其他路径可手动指定
  11. # load_dotenv()
  12. # MySQL 配置(与 MYSQL_CONFIG 结构一致)
  13. # MYSQL_CONFIG = {
  14. # "host": "47.119.164.65", # MySQL 地址
  15. # "port": 3306, # 端口
  16. # "user": "test_c", # 用户名
  17. # "password": "Dfwy@2025", # 密码
  18. # "database": "test2", # 数据库名
  19. # "charset": "utf8mb4" # 字符集
  20. # }
  21. # 测试环境
  22. # MYSQL_CONFIG = {
  23. # "host": "39.108.116.125", # MySQL 地址
  24. # "port": 3306, # 端口
  25. # "user": "drug_retrieve", # 用户名
  26. # "password": "Pem287...", # 密码
  27. # "database": "drug_retrieve", # 数据库名
  28. # "charset": "utf8mb4" # 字符集
  29. # }
  30. # 线上环境
  31. MYSQL_CONFIG = {
  32. "host": "120.24.49.2", # MySQL 地址
  33. "port": 3306, # 端口
  34. "user": "drug_retrieve", # 用户名
  35. "password": "ksCt3xm6chzdkafj", # 密码
  36. "database": "drug_retrieve", # 数据库名
  37. "charset": "utf8mb4" # 字符集
  38. }
  39. # MYSQL_CONFIG = {
  40. # "host": os.getenv("MYSQL_HOST"), # 从 .env 读取 MYSQL_HOST
  41. # "user": os.getenv("MYSQL_USER"),
  42. # "password": os.getenv("MYSQL_PASSWORD"), # 敏感信息建议放在 .env
  43. # "database": os.getenv("MYSQL_DATABASE"),
  44. # "port": int(os.getenv("MYSQL_PORT", 3306)), # 默认端口 3306
  45. # "charset": "utf8mb4"
  46. # }
  47. # 模糊匹配 URL
  48. def fuzzy_match_product_url_in_db_mysql(product_url):
  49. # 先做非空校验
  50. if not product_url:
  51. logger.warning("鈿狅笍 寰呭尮閰嶇殑 product_url 涓虹┖锛岃烦杩囨暟鎹烘煡")
  52. return None
  53. # 如有需要可转义 % / _,避免 LIKE 通配符影响
  54. # escaped_product_url = product_url.replace("%", "\%").replace("_", "\_")
  55. try:
  56. conn = pymysql.connect(**MYSQL_CONFIG)
  57. cursor = conn.cursor()
  58. # 使用 LIKE 做模糊匹配
  59. # 例如:%product_url%
  60. sql = "SELECT * FROM ybm_drug_middle WHERE product_link LIKE %s"
  61. match_value = f"%{product_url}%"
  62. cursor.execute(sql, (match_value,))
  63. # 取第一条匹配记录并格式化为字典
  64. result = cursor.fetchone() # 返回元组,如 (id, product_url, price, ...)
  65. if result:
  66. # 将查询结果转换为字典,便于后续按字段名取值
  67. column_names = [desc[0] for desc in cursor.description]
  68. result_dict = dict(zip(column_names, result))
  69. return result_dict # 匹配成功
  70. else:
  71. return None # 未匹配到记录
  72. except Exception as e:
  73. logger.error(f"MySQL 模糊匹配失败:{str(e)}")
  74. return None
  75. finally:
  76. # 无论成功与否都关闭连接,避免连接泄漏
  77. if 'conn' in locals() and conn:
  78. conn.close()
  79. # ==================== 从数据库提取任务 ====================
  80. def get_search_keywords_from_db(platform: int = 9):
  81. """读取待执行任务(status=1),返回 [(task_id, brand, keyword, company_id), ...]。"""
  82. keywords = []
  83. conn = None
  84. cursor = None
  85. try:
  86. required_configs = ['host', 'user', 'password', 'database']
  87. for cfg in required_configs:
  88. if cfg not in MYSQL_CONFIG:
  89. raise ValueError(f"MYSQL_CONFIG 缺少必要字段: {cfg}")
  90. conn = pymysql.connect(**MYSQL_CONFIG)
  91. cursor = conn.cursor()
  92. sql = (
  93. 'SELECT id, product_brand, product_name, product_specs, company_id '
  94. 'FROM retrieve_collect_task_allocate '
  95. 'WHERE status = 1 AND platform = %s'
  96. )
  97. cursor.execute(sql, (platform,))
  98. results = cursor.fetchall()
  99. for row in results:
  100. task_id = row[0]
  101. brand = (row[1] or '').strip()
  102. name = (row[2] or '').strip()
  103. company_id = row[4] if row[4] is not None else 0
  104. parts = [p for p in [brand, name] if p]
  105. if parts:
  106. keyword = ''.join(parts)
  107. keywords.append((task_id, brand, keyword, company_id))
  108. logger.debug(f"读取待执行任务完成,platform={platform},数量={len(keywords)}")
  109. except Exception as e:
  110. logger.error(f"读取待执行任务失败,platform={platform},错误:{str(e)}")
  111. keywords = []
  112. finally:
  113. if cursor:
  114. try:
  115. cursor.close()
  116. except Exception:
  117. pass
  118. if conn:
  119. try:
  120. conn.close()
  121. except Exception:
  122. pass
  123. return keywords
  124. def has_running_task(platform: int = 9) -> bool:
  125. """检查指定 platform 在当天是否存在执行中任务(status=2)。"""
  126. conn = None
  127. cursor = None
  128. try:
  129. required_configs = ['host', 'user', 'password', 'database']
  130. for cfg in required_configs:
  131. if cfg not in MYSQL_CONFIG:
  132. raise ValueError(f"MYSQL_CONFIG 缺少必要字段: {cfg}")
  133. day_start_ts = int(datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp())
  134. next_day_ts = day_start_ts + 24 * 60 * 60
  135. conn = pymysql.connect(**MYSQL_CONFIG)
  136. cursor = conn.cursor()
  137. sql = (
  138. 'SELECT 1 FROM retrieve_collect_task_allocate '
  139. 'WHERE status = 2 AND platform = %s '
  140. 'AND update_time >= %s AND update_time < %s LIMIT 1'
  141. )
  142. cursor.execute(sql, (platform, day_start_ts, next_day_ts))
  143. return cursor.fetchone() is not None
  144. except Exception as e:
  145. logger.error(f"检查执行中任务失败,platform={platform},错误:{str(e)}")
  146. return False
  147. finally:
  148. if cursor:
  149. try:
  150. cursor.close()
  151. except Exception:
  152. pass
  153. if conn:
  154. try:
  155. conn.close()
  156. except Exception:
  157. pass
  158. # 以下历史示例注释已保留为空,避免乱码干扰
  159. # ==================== 2. 反爬配置 ====================
  160. # 随机延迟范围(模拟真人操作)
  161. MIN_CLICK_DELAY = 1.5 # 点击最小延迟(秒)
  162. MAX_CLICK_DELAY = 3.5 # 点击最大延迟(秒)
  163. MIN_INPUT_DELAY = 0.1 # 输入最小延迟(秒)
  164. MAX_INPUT_DELAY = 0.3 # 输入最大延迟(秒)
  165. MIN_PAGE_DELAY = 2.0 # 页面最小等待(秒)
  166. MAX_PAGE_DELAY = 4.0 # 页面最大等待(秒)
  167. # 关键词之间的随机延迟(秒)
  168. MIN_KEYWORD_DELAY = 8.0
  169. MAX_KEYWORD_DELAY = 15.0
  170. # 滚动配置(目标 400px,含随机偏移)
  171. SCROLL_TARGET_DISTANCE = 400 # 目标滚动距离
  172. SCROLL_OFFSET_RANGE = 50 # 随机偏移范围
  173. SCROLL_STEP = 50 # 每次滚动步长
  174. SCROLL_INTERVAL = 0.05 # 步长间隔(秒)
  175. # ==================== 3. Cookie & 登录配置 ====================
  176. COOKIE_FILE_PATH = "ybm_cookies.json" # Cookie 保存路径
  177. # Cookie 有效性验证页面
  178. LOGIN_VALIDATE_URL = "https://www.ybm100.com/new/"
  179. # 登录账号密码
  180. USERNAME = "18008650300"
  181. PASSWORD = "12345678"
  182. # USERNAME = "yjj112031"
  183. # PASSWORD = "123456"
  184. # 登录URL
  185. TARGET_LOGIN_URL = "https://www.ybm100.com/new/login"
  186. # "https://www.yyjzt.com/login?redirect=%2FgoodDetail%3FladderNum%26itemStoreId%3D124250306%26sourceProdetail%3D%252Fsearch%26is_store%3D0"
  187. # ==================== 4. 元素选择器配置 ====================
  188. # 基础选择器
  189. USERNAME_SELECTOR = "input[placeholder*=请输入账号]"
  190. PASSWORD_SELECTOR = "input[placeholder*=请输入密码]"
  191. LOGIN_BTN_SELECTOR = "button:has(span:text('登录'))"
  192. SEARCH_INPUT_SELECTOR = "input[placeholder*='药品名称/厂家名称']"
  193. SEARCH_INPUT_SELECTOR2 = "div.home-search-container-search-head"
  194. SEARCH_BTN_SELECTOR = "div.home-search-container-search-head-btn[data-scmd=\"text-搜索\"]"
  195. # 采集元素选择器可根据页面实际情况调整
  196. # 如页面结构变化,请优先更新以上选择器
  197. PRODUCT_ITEM_SELECTOR = "div.product-list-item" # 商品项容器
  198. PRODUCT_TITLE_SELECTOR = "div.product-name" # 商品标题
  199. PRODUCT_PRICE_SELECTOR = "div.main-price" # 商品价格
  200. PRODUCT_STORE_SELECTOR = 'div.prduct-shop-name div.shop-name' # 店铺名称
  201. PRODUCT_COMPANY_SELECTOR = "div.product-manufacturer" # 公司名称
  202. PRODUCT_VALIDITY_SELECTOR = "div.product-period" # 有效期
  203. # div.shop-info-container-left-info-name span
  204. # ==================== 5. 等待时间配置(秒) ====================
  205. ELEMENT_TIMEOUT = 10000
  206. LOGIN_AFTER_CLICK = 5000
  207. SEARCH_BTN_TIMEOUT = 5000
  208. COLLECT_DELAY = 3000
  209. DETAIL_LOAD_TIMEOUT = 5000 # 点击商品后等待详情加载时间
  210. # ==================== 6. 浏览器配置 ====================
  211. BROWSER_HEADLESS = False
  212. BROWSER_CHANNEL = "chrome"
  213. SLOW_MO_MIN = 50
  214. SLOW_MO_MAX = 100
  215. # ==================== 7. CSV 配置 ====================
  216. CSV_HEADERS = [
  217. "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
  218. "店铺名称", "公司名称",
  219. "有效日期", "生产日期", "批准文号", "采集时间"
  220. ] # CSV 表头
  221. # 存放营业执照截图路径(如需)
  222. # cropped_screenshot_path =
  223. # 百度 OCR 配置
  224. request_url_config = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  225. AppKey_config = "tRK2RhyItCSh6BzyT4CNVXQa"
  226. AppSecret_config = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  227. token_url_config = 'https://aip.baidubce.com/oauth/2.0/token'
  228. # ---------------------- OSS 配置 ----------------------
  229. OSS_ACCESS_KEY_ID = 'LTAI5tDwjfteBvivYN41r8sJ'
  230. OSS_ACCESS_KEY_SECRET = 'yowuOGi2nYYnrqGpO3qcz94C4brcPp'
  231. OSS_ENDPOINT = "oss-cn-shenzhen.aliyuncs.com"
  232. OSS_BUCKET_NAME = "zhijiayun-jiansuo"
  233. OSS_PREFIX = "scrape_data/"
  234. # 本地截图配置
  235. LOCAL_SCREENSHOT_DIR = "local_screenshots" # 本地截图目录
  236. LOCAL_SCREENSHOT_NAME = None # 自动生成文件名
  237. LOCAL_CROPPED_DIR = "./local_cropped_screenshots" # 裁剪后目录
  238. # 图片压缩配置
  239. IMAGE_COMPRESS_ENABLE = True # 是否启用压缩(True/False)
  240. IMAGE_COMPRESS_QUALITY = 30 # JPG 质量(1-95)
  241. IMAGE_COMPRESS_PNG_LEVEL = 9 # PNG 压缩级别(0-9)
# ---------------------- Utility functions ----------------------
  243. def init_local_screenshot_dir():
  244. """初始化本地截图目录(不存在则创建)。"""
  245. if not os.path.exists(LOCAL_SCREENSHOT_DIR):
  246. os.makedirs(LOCAL_SCREENSHOT_DIR)
  247. logger.info(f"本地截图目录已创建: {LOCAL_SCREENSHOT_DIR}")
  248. else:
  249. logger.debug(f"本地截图目录已存在: {LOCAL_SCREENSHOT_DIR}")
  250. def init_oss_bucket():
  251. """初始化 OSS Bucket 对象。"""
  252. try:
  253. auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
  254. bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET_NAME)
  255. bucket.get_bucket_info()
  256. logger.info("OSS Bucket 初始化成功")
  257. return bucket
  258. except Exception as e:
  259. logger.error(f"OSS Bucket 初始化失败: {str(e)}")
  260. raise
  261. def upload_local_screenshot_to_oss(bucket, local_file_path, oss_file_path=None):
  262. """将本地截图上传到 OSS,返回公网访问链接。"""
  263. if not os.path.exists(local_file_path):
  264. raise FileNotFoundError(f"本地截图文件不存在: {local_file_path}")
  265. if not oss_file_path:
  266. local_file_name = os.path.basename(local_file_path)
  267. oss_file_path = f"screenshots/{local_file_name}"
  268. try:
  269. bucket.put_object_from_file(oss_file_path, local_file_path)
  270. oss_file_url = f"https://{OSS_BUCKET_NAME}.{OSS_ENDPOINT}/{oss_file_path}"
  271. logger.info(f"截图上传 OSS 成功: {oss_file_url}")
  272. return oss_file_url
  273. except Exception as e:
  274. logger.error(f"截图上传 OSS 失败: {str(e)}")
  275. raise
  276. # ---------------------- 图片裁剪与压缩 ----------------------
  277. def crop_local_screenshot(local_file_path, cropped_file_path=None, crop_region=None):
  278. """裁剪本地截图并保存,成功后删除原图,返回裁剪后文件路径。"""
  279. if not os.path.exists(local_file_path):
  280. raise FileNotFoundError(f"原始截图文件不存在: {local_file_path}")
  281. os.makedirs(LOCAL_CROPPED_DIR, exist_ok=True)
  282. if not cropped_file_path:
  283. file_name = os.path.basename(local_file_path)
  284. file_name_no_ext, file_ext = os.path.splitext(file_name)
  285. cropped_file_name = f"{file_name_no_ext}_cropped{file_ext}"
  286. cropped_file_path = os.path.join(LOCAL_CROPPED_DIR, cropped_file_name)
  287. with Image.open(local_file_path) as img:
  288. img_width, img_height = img.size
  289. logger.debug(f"原图尺寸: width={img_width}, height={img_height}")
  290. if not crop_region:
  291. left = 0
  292. upper = 0
  293. right = int(img_width)
  294. lower = int(img_height * 0.3)
  295. crop_region = (left, upper, right, lower)
  296. logger.debug(f"未指定裁剪区域,使用默认区域: {crop_region}")
  297. c_left, c_upper, c_right, c_lower = crop_region
  298. if c_right > img_width or c_lower > img_height or c_left < 0 or c_upper < 0:
  299. raise ValueError(
  300. f"裁剪区域超出图片范围,图片尺寸=({img_width}, {img_height}),裁剪区域={crop_region}"
  301. )
  302. cropped_img = img.crop(crop_region)
  303. file_ext = os.path.splitext(cropped_file_path)[1].lower()
  304. try:
  305. if IMAGE_COMPRESS_ENABLE:
  306. if file_ext in ['.jpg', '.jpeg']:
  307. cropped_img.save(
  308. cropped_file_path,
  309. format='JPEG',
  310. quality=IMAGE_COMPRESS_QUALITY,
  311. optimize=True,
  312. progressive=True,
  313. )
  314. else:
  315. cropped_img.save(cropped_file_path)
  316. logger.info(f"裁剪图片已保存(压缩开启): {cropped_file_path}")
  317. else:
  318. cropped_img.save(cropped_file_path, format='JPEG')
  319. logger.info(f"裁剪图片已保存(压缩关闭): {cropped_file_path}")
  320. except Exception as e:
  321. cropped_img.save(cropped_file_path, format='JPEG')
  322. logger.warning(f"图片压缩失败,已按普通 JPEG 保存: {str(e)}")
  323. try:
  324. if os.path.exists(cropped_file_path):
  325. os.remove(local_file_path)
  326. logger.debug(f"已删除原始截图: {local_file_path}")
  327. else:
  328. logger.warning(f"裁剪文件不存在,跳过删除原图: {cropped_file_path}")
  329. except OSError as e:
  330. logger.warning(f"删除原始截图失败: {str(e)}")
  331. return cropped_file_path
  332. def screenshot_target_page_to_local_then_oss(target_page, local_file_path=None, oss_file_path=None, full_page=True, crop_region=None):
  333. """页面截图到本地后裁剪,再上传 OSS,返回(裁剪路径, OSS链接)。"""
  334. os.makedirs(LOCAL_SCREENSHOT_DIR, exist_ok=True)
  335. if not local_file_path:
  336. current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
  337. local_file_name = f"{current_time}_target_page.jpg"
  338. local_file_path = os.path.join(LOCAL_SCREENSHOT_DIR, local_file_name)
  339. logger.info(f"开始页面截图: {local_file_path}")
  340. target_page.screenshot(
  341. path=local_file_path,
  342. full_page=full_page,
  343. omit_background=False,
  344. timeout=10000,
  345. )
  346. logger.debug("页面截图完成")
  347. cropped_file_path = crop_local_screenshot(
  348. local_file_path=local_file_path,
  349. crop_region=crop_region,
  350. )
  351. bucket = init_oss_bucket()
  352. oss_file_url = upload_local_screenshot_to_oss(bucket, cropped_file_path, oss_file_path)
  353. return cropped_file_path, oss_file_url