# config.py
  1. # config.py - 药帮忙采集配置
  2. from datetime import datetime
  3. import pymysql
  4. from dotenv import load_dotenv
  5. import os
  6. import oss2
  7. from PIL import Image
  8. from logger_config import logger
  9. # 步骤:加载 .env 文件(如需)
  10. # load_dotenv() 默认读取当前目录下的 .env;如在其他路径可手动指定
  11. # load_dotenv()
  12. # MySQL 配置(与 MYSQL_CONFIG 结构一致)
  13. # MYSQL_CONFIG = {
  14. # "host": "47.119.164.65", # MySQL 地址
  15. # "port": 3306, # 端口
  16. # "user": "test_c", # 用户名
  17. # "password": "Dfwy@2025", # 密码
  18. # "database": "test2", # 数据库名
  19. # "charset": "utf8mb4" # 字符集
  20. # }
  21. # 测试环境
  22. # MYSQL_CONFIG = {
  23. # "host": "39.108.116.125", # MySQL 地址
  24. # "port": 3306, # 端口
  25. # "user": "drug_retrieve", # 用户名
  26. # "password": "Pem287...", # 密码
  27. # "database": "drug_retrieve", # 数据库名
  28. # "charset": "utf8mb4" # 字符集
  29. # }
  30. # 线上环境
  31. MYSQL_CONFIG = {
  32. "host": "120.24.49.2", # MySQL 地址
  33. "port": 3306, # 端口
  34. "user": "drug_retrieve", # 用户名
  35. "password": "ksCt3xm6chzdkafj", # 密码
  36. "database": "drug_retrieve", # 数据库名
  37. "charset": "utf8mb4" # 字符集
  38. }
  39. # MYSQL_CONFIG = {
  40. # "host": os.getenv("MYSQL_HOST"), # 从 .env 读取 MYSQL_HOST
  41. # "user": os.getenv("MYSQL_USER"),
  42. # "password": os.getenv("MYSQL_PASSWORD"), # 敏感信息建议放在 .env
  43. # "database": os.getenv("MYSQL_DATABASE"),
  44. # "port": int(os.getenv("MYSQL_PORT", 3306)), # 默认端口 3306
  45. # "charset": "utf8mb4"
  46. # }
  47. # 模糊匹配 URL
  48. def fuzzy_match_product_url_in_db_mysql(product_url):
  49. # 先做非空校验
  50. if not product_url:
  51. logger.warning("鈿狅笍 寰呭尮閰嶇殑 product_url 涓虹┖锛岃烦杩囨暟鎹烘煡")
  52. return None
  53. # 如有需要可转义 % / _,避免 LIKE 通配符影响
  54. # escaped_product_url = product_url.replace("%", "\%").replace("_", "\_")
  55. try:
  56. conn = pymysql.connect(**MYSQL_CONFIG)
  57. cursor = conn.cursor()
  58. # 使用 LIKE 做模糊匹配
  59. # 例如:%product_url%
  60. sql = "SELECT * FROM ybm_drug_middle WHERE product_link LIKE %s"
  61. match_value = f"%{product_url}%"
  62. cursor.execute(sql, (match_value,))
  63. # 取第一条匹配记录并格式化为字典
  64. result = cursor.fetchone() # 返回元组,如 (id, product_url, price, ...)
  65. if result:
  66. # 将查询结果转换为字典,便于后续按字段名取值
  67. column_names = [desc[0] for desc in cursor.description]
  68. result_dict = dict(zip(column_names, result))
  69. return result_dict # 匹配成功
  70. else:
  71. return None # 未匹配到记录
  72. except Exception as e:
  73. logger.error(f"MySQL 模糊匹配失败:{str(e)}")
  74. return None
  75. finally:
  76. # 无论成功与否都关闭连接,避免连接泄漏
  77. if 'conn' in locals() and conn:
  78. conn.close()
  79. # ==================== 从数据库提取任务 ====================
  80. def get_search_keywords_from_db(platform: int = 9):
  81. """读取待执行任务(status=1),返回 [(task_id, brand, keyword, company_id, product_specs), ...]。"""
  82. keywords = []
  83. conn = None
  84. cursor = None
  85. try:
  86. required_configs = ['host', 'user', 'password', 'database']
  87. for cfg in required_configs:
  88. if cfg not in MYSQL_CONFIG:
  89. raise ValueError(f"MYSQL_CONFIG 缺少必要字段: {cfg}")
  90. conn = pymysql.connect(**MYSQL_CONFIG)
  91. cursor = conn.cursor()
  92. sql = (
  93. 'SELECT id, product_brand, product_name, product_specs, company_id '
  94. 'FROM retrieve_collect_task_allocate '
  95. 'WHERE status = 1 AND platform = %s'
  96. )
  97. cursor.execute(sql, (platform,))
  98. results = cursor.fetchall()
  99. for row in results:
  100. task_id = row[0]
  101. brand = (row[1] or '').strip()
  102. name = (row[2] or '').strip()
  103. specs = (row[3] or '').strip()
  104. company_id = row[4] if row[4] is not None else 0
  105. keywords.append((task_id, brand, name, specs, company_id))
  106. logger.debug(f"读取待执行任务完成,platform={platform},数量={len(keywords)}")
  107. except Exception as e:
  108. logger.error(f"读取待执行任务失败,platform={platform},错误:{str(e)}")
  109. keywords = []
  110. finally:
  111. if cursor:
  112. try:
  113. cursor.close()
  114. except Exception:
  115. pass
  116. if conn:
  117. try:
  118. conn.close()
  119. except Exception:
  120. pass
  121. return keywords
  122. def has_running_task(platform: int = 9) -> bool:
  123. """检查指定 platform 在当天是否存在执行中任务(status=2)。"""
  124. conn = None
  125. cursor = None
  126. try:
  127. required_configs = ['host', 'user', 'password', 'database']
  128. for cfg in required_configs:
  129. if cfg not in MYSQL_CONFIG:
  130. raise ValueError(f"MYSQL_CONFIG 缺少必要字段: {cfg}")
  131. day_start_ts = int(datetime.now().replace(hour=0, minute=0, second=0, microsecond=0).timestamp())
  132. next_day_ts = day_start_ts + 24 * 60 * 60
  133. conn = pymysql.connect(**MYSQL_CONFIG)
  134. cursor = conn.cursor()
  135. sql = (
  136. 'SELECT 1 FROM retrieve_collect_task_allocate '
  137. 'WHERE status = 2 AND platform = %s '
  138. 'AND update_time >= %s AND update_time < %s LIMIT 1'
  139. )
  140. cursor.execute(sql, (platform, day_start_ts, next_day_ts))
  141. return cursor.fetchone() is not None
  142. except Exception as e:
  143. logger.error(f"检查执行中任务失败,platform={platform},错误:{str(e)}")
  144. return False
  145. finally:
  146. if cursor:
  147. try:
  148. cursor.close()
  149. except Exception:
  150. pass
  151. if conn:
  152. try:
  153. conn.close()
  154. except Exception:
  155. pass
  156. # 以下历史示例注释已保留为空,避免乱码干扰
  157. # ==================== 2. 反爬配置 ====================
  158. # 随机延迟范围(模拟真人操作)
  159. MIN_CLICK_DELAY = 1.5 # 点击最小延迟(秒)
  160. MAX_CLICK_DELAY = 3.5 # 点击最大延迟(秒)
  161. MIN_INPUT_DELAY = 0.1 # 输入最小延迟(秒)
  162. MAX_INPUT_DELAY = 0.3 # 输入最大延迟(秒)
  163. MIN_PAGE_DELAY = 2.0 # 页面最小等待(秒)
  164. MAX_PAGE_DELAY = 4.0 # 页面最大等待(秒)
  165. # 关键词之间的随机延迟(秒)
  166. MIN_KEYWORD_DELAY = 8.0
  167. MAX_KEYWORD_DELAY = 15.0
  168. # 滚动配置(目标 400px,含随机偏移)
  169. SCROLL_TARGET_DISTANCE = 400 # 目标滚动距离
  170. SCROLL_OFFSET_RANGE = 50 # 随机偏移范围
  171. SCROLL_STEP = 50 # 每次滚动步长
  172. SCROLL_INTERVAL = 0.05 # 步长间隔(秒)
  173. # ==================== 3. Cookie & 登录配置 ====================
  174. COOKIE_FILE_PATH = "ybm_cookies.json" # Cookie 保存路径
  175. # Cookie 有效性验证页面
  176. LOGIN_VALIDATE_URL = "https://www.ybm100.com/new/"
  177. # 登录账号密码
  178. USERNAME = "18008650300"
  179. PASSWORD = "12345678"
  180. # USERNAME = "yjj112031"
  181. # PASSWORD = "123456"
  182. # 登录URL
  183. TARGET_LOGIN_URL = "https://www.ybm100.com/new/login"
  184. # "https://www.yyjzt.com/login?redirect=%2FgoodDetail%3FladderNum%26itemStoreId%3D124250306%26sourceProdetail%3D%252Fsearch%26is_store%3D0"
  185. # ==================== 4. 元素选择器配置 ====================
  186. # 基础选择器
  187. USERNAME_SELECTOR = "input[placeholder*=请输入账号]"
  188. PASSWORD_SELECTOR = "input[placeholder*=请输入密码]"
  189. LOGIN_BTN_SELECTOR = "button:has(span:text('登录'))"
  190. SEARCH_INPUT_SELECTOR = "input[placeholder*='药品名称/厂家名称']"
  191. SEARCH_INPUT_SELECTOR2 = "div.home-search-container-search-head"
  192. SEARCH_BTN_SELECTOR = "div.home-search-container-search-head-btn[data-scmd=\"text-搜索\"]"
  193. # 采集元素选择器可根据页面实际情况调整
  194. # 如页面结构变化,请优先更新以上选择器
  195. PRODUCT_ITEM_SELECTOR = "div.product-list-item" # 商品项容器
  196. PRODUCT_TITLE_SELECTOR = "div.product-name" # 商品标题
  197. PRODUCT_PRICE_SELECTOR = "div.main-price" # 商品价格
  198. PRODUCT_STORE_SELECTOR = 'div.prduct-shop-name div.shop-name' # 店铺名称
  199. PRODUCT_COMPANY_SELECTOR = "div.product-manufacturer" # 公司名称
  200. PRODUCT_VALIDITY_SELECTOR = "div.product-period" # 有效期
  201. # div.shop-info-container-left-info-name span
  202. # ==================== 5. 等待时间配置(秒) ====================
  203. ELEMENT_TIMEOUT = 10000
  204. LOGIN_AFTER_CLICK = 5000
  205. SEARCH_BTN_TIMEOUT = 5000
  206. COLLECT_DELAY = 3000
  207. DETAIL_LOAD_TIMEOUT = 5000 # 点击商品后等待详情加载时间
  208. # ==================== 6. 浏览器配置 ====================
  209. BROWSER_HEADLESS = False
  210. BROWSER_CHANNEL = "chrome"
  211. SLOW_MO_MIN = 50
  212. SLOW_MO_MAX = 100
  213. # ==================== 7. CSV 配置 ====================
  214. CSV_HEADERS = [
  215. "商品标题", "商品采购价格", "商品折扣价格", "规格", "盒数",
  216. "店铺名称", "公司名称",
  217. "有效日期", "生产日期", "批准文号", "采集时间"
  218. ] # CSV 表头
  219. # 存放营业执照截图路径(如需)
  220. # cropped_screenshot_path =
  221. # 百度 OCR 配置
  222. request_url_config = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
  223. AppKey_config = "tRK2RhyItCSh6BzyT4CNVXQa"
  224. AppSecret_config = "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
  225. token_url_config = 'https://aip.baidubce.com/oauth/2.0/token'
  226. # ---------------------- OSS 配置 ----------------------
  227. OSS_ACCESS_KEY_ID = 'LTAI5tDwjfteBvivYN41r8sJ'
  228. OSS_ACCESS_KEY_SECRET = 'yowuOGi2nYYnrqGpO3qcz94C4brcPp'
  229. OSS_ENDPOINT = "oss-cn-shenzhen.aliyuncs.com"
  230. OSS_BUCKET_NAME = "zhijiayun-jiansuo"
  231. OSS_PREFIX = "scrape_data/"
  232. # 本地截图配置
  233. LOCAL_SCREENSHOT_DIR = "local_screenshots" # 本地截图目录
  234. LOCAL_SCREENSHOT_NAME = None # 自动生成文件名
  235. LOCAL_CROPPED_DIR = "./local_cropped_screenshots" # 裁剪后目录
  236. # 图片压缩配置
  237. IMAGE_COMPRESS_ENABLE = True # 是否启用压缩(True/False)
  238. IMAGE_COMPRESS_QUALITY = 30 # JPG 质量(1-95)
  239. IMAGE_COMPRESS_PNG_LEVEL = 9 # PNG 压缩级别(0-9)
  240. # ---------------------- 宸ュ叿鍑芥暟 ----------------------
  241. def init_local_screenshot_dir():
  242. """初始化本地截图目录(不存在则创建)。"""
  243. if not os.path.exists(LOCAL_SCREENSHOT_DIR):
  244. os.makedirs(LOCAL_SCREENSHOT_DIR)
  245. logger.info(f"本地截图目录已创建: {LOCAL_SCREENSHOT_DIR}")
  246. else:
  247. logger.debug(f"本地截图目录已存在: {LOCAL_SCREENSHOT_DIR}")
  248. def init_oss_bucket():
  249. """初始化 OSS Bucket 对象。"""
  250. try:
  251. auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
  252. bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET_NAME)
  253. bucket.get_bucket_info()
  254. logger.info("OSS Bucket 初始化成功")
  255. return bucket
  256. except Exception as e:
  257. logger.error(f"OSS Bucket 初始化失败: {str(e)}")
  258. raise
  259. def upload_local_screenshot_to_oss(bucket, local_file_path, oss_file_path=None):
  260. """将本地截图上传到 OSS,返回公网访问链接。"""
  261. if not os.path.exists(local_file_path):
  262. raise FileNotFoundError(f"本地截图文件不存在: {local_file_path}")
  263. if not oss_file_path:
  264. local_file_name = os.path.basename(local_file_path)
  265. oss_file_path = f"screenshots/{local_file_name}"
  266. try:
  267. bucket.put_object_from_file(oss_file_path, local_file_path)
  268. oss_file_url = f"https://{OSS_BUCKET_NAME}.{OSS_ENDPOINT}/{oss_file_path}"
  269. logger.info(f"截图上传 OSS 成功: {oss_file_url}")
  270. return oss_file_url
  271. except Exception as e:
  272. logger.error(f"截图上传 OSS 失败: {str(e)}")
  273. raise
  274. # ---------------------- 图片裁剪与压缩 ----------------------
  275. def crop_local_screenshot(local_file_path, cropped_file_path=None, crop_region=None):
  276. """裁剪本地截图并保存,成功后删除原图,返回裁剪后文件路径。"""
  277. if not os.path.exists(local_file_path):
  278. raise FileNotFoundError(f"原始截图文件不存在: {local_file_path}")
  279. os.makedirs(LOCAL_CROPPED_DIR, exist_ok=True)
  280. if not cropped_file_path:
  281. file_name = os.path.basename(local_file_path)
  282. file_name_no_ext, file_ext = os.path.splitext(file_name)
  283. cropped_file_name = f"{file_name_no_ext}_cropped{file_ext}"
  284. cropped_file_path = os.path.join(LOCAL_CROPPED_DIR, cropped_file_name)
  285. with Image.open(local_file_path) as img:
  286. img_width, img_height = img.size
  287. logger.debug(f"原图尺寸: width={img_width}, height={img_height}")
  288. if not crop_region:
  289. left = 0
  290. upper = 0
  291. right = int(img_width)
  292. lower = int(img_height * 0.3)
  293. crop_region = (left, upper, right, lower)
  294. logger.debug(f"未指定裁剪区域,使用默认区域: {crop_region}")
  295. c_left, c_upper, c_right, c_lower = crop_region
  296. if c_right > img_width or c_lower > img_height or c_left < 0 or c_upper < 0:
  297. raise ValueError(
  298. f"裁剪区域超出图片范围,图片尺寸=({img_width}, {img_height}),裁剪区域={crop_region}"
  299. )
  300. cropped_img = img.crop(crop_region)
  301. file_ext = os.path.splitext(cropped_file_path)[1].lower()
  302. try:
  303. if IMAGE_COMPRESS_ENABLE:
  304. if file_ext in ['.jpg', '.jpeg']:
  305. cropped_img.save(
  306. cropped_file_path,
  307. format='JPEG',
  308. quality=IMAGE_COMPRESS_QUALITY,
  309. optimize=True,
  310. progressive=True,
  311. )
  312. else:
  313. cropped_img.save(cropped_file_path)
  314. logger.info(f"裁剪图片已保存(压缩开启): {cropped_file_path}")
  315. else:
  316. cropped_img.save(cropped_file_path, format='JPEG')
  317. logger.info(f"裁剪图片已保存(压缩关闭): {cropped_file_path}")
  318. except Exception as e:
  319. cropped_img.save(cropped_file_path, format='JPEG')
  320. logger.warning(f"图片压缩失败,已按普通 JPEG 保存: {str(e)}")
  321. try:
  322. if os.path.exists(cropped_file_path):
  323. os.remove(local_file_path)
  324. logger.debug(f"已删除原始截图: {local_file_path}")
  325. else:
  326. logger.warning(f"裁剪文件不存在,跳过删除原图: {cropped_file_path}")
  327. except OSError as e:
  328. logger.warning(f"删除原始截图失败: {str(e)}")
  329. return cropped_file_path
  330. def screenshot_target_page_to_local_then_oss(target_page, local_file_path=None, oss_file_path=None, full_page=True, crop_region=None):
  331. """页面截图到本地后裁剪,再上传 OSS,返回(裁剪路径, OSS链接)。"""
  332. os.makedirs(LOCAL_SCREENSHOT_DIR, exist_ok=True)
  333. if not local_file_path:
  334. current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
  335. local_file_name = f"{current_time}_target_page.jpg"
  336. local_file_path = os.path.join(LOCAL_SCREENSHOT_DIR, local_file_name)
  337. logger.info(f"开始页面截图: {local_file_path}")
  338. target_page.screenshot(
  339. path=local_file_path,
  340. full_page=full_page,
  341. omit_background=False,
  342. timeout=10000,
  343. )
  344. logger.debug("页面截图完成")
  345. cropped_file_path = crop_local_screenshot(
  346. local_file_path=local_file_path,
  347. crop_region=crop_region,
  348. )
  349. bucket = init_oss_bucket()
  350. oss_file_url = upload_local_screenshot_to_oss(bucket, cropped_file_path, oss_file_path)
  351. return cropped_file_path, oss_file_url