# config.py - configuration for the Yaojiujiu (药九九) data-collection scraper
from datetime import datetime
import os

import oss2
import pymysql
from dotenv import load_dotenv
from PIL import Image

# Loading a .env file must happen before any configuration value is read.
# load_dotenv() reads ".env" from the current directory by default; pass an
# explicit path (load_dotenv("/path/to/.env")) when the file lives elsewhere.
# The call is currently disabled, so only real environment variables are seen:
# load_dotenv()

# MySQL connection settings, passed as keyword arguments to pymysql.connect().
# Each value can be overridden through the environment variable named below;
# when the variable is unset or empty, the historical literal is used so that
# existing deployments keep working unchanged.
# SECURITY: the fallback literals are real credentials committed to source
# control. Move them to the environment/.env and rotate them.
MYSQL_CONFIG = {
    "host": os.getenv("MYSQL_HOST") or "47.119.164.65",  # MySQL server address
    "port": int(os.getenv("MYSQL_PORT") or 3306),  # server port
    "user": os.getenv("MYSQL_USER") or "test_c",  # MySQL user name
    "password": os.getenv("MYSQL_PASSWORD") or "Dfwy@2025",  # MySQL password
    "database": os.getenv("MYSQL_DATABASE") or "test2",  # existing database
    "charset": "utf8mb4",  # avoids garbled Chinese text
}


# ==================== Load search keywords from the database ====================
def get_search_keywords_from_db():
    """Read the enabled search keywords from the database.

    Connects with ``MYSQL_CONFIG``, selects ``scrape_name`` from
    ``yjj_scape_name_config`` where ``status = 1``, strips surrounding
    whitespace and drops NULL or blank names.

    Any failure (incomplete config, connection error, query error) is printed
    and swallowed on purpose: the function then returns an empty list instead
    of raising, so importing this module never fails because of the database.

    Returns:
        list: the cleaned keyword strings, possibly empty.
    """
    keywords = []
    conn = None
    cursor = None
    try:
        # Fail early with a readable message if a mandatory key is missing.
        required_configs = ('host', 'user', 'password', 'database')
        for cfg in required_configs:
            if cfg not in MYSQL_CONFIG:
                raise ValueError(f"MYSQL_CONFIG缺失必要配置:{cfg}")

        conn = pymysql.connect(**MYSQL_CONFIG)
        cursor = conn.cursor()
        sql = 'SELECT scrape_name FROM yjj_scape_name_config WHERE status = 1'
        cursor.execute(sql)

        results = cursor.fetchall()
        # A NULL scrape_name arrives as None; treat it like a blank name
        # instead of letting .strip() raise and discard every keyword.
        stripped_names = ((row[0] or "").strip() for row in results)
        keywords = [name for name in stripped_names if name]
        print(f"成功从数据库读取 {len(keywords)} 个关键词(status=1)")
    except Exception as e:
        print(f"读取数据库关键词失败:{str(e)}")
        # Deliberate best effort: fall back to an empty keyword list.
        keywords = []
    finally:
        print("读取到的关键词示例:")
        print(keywords[:5])
        # Best-effort cleanup. Catch Exception rather than using a bare
        # except so KeyboardInterrupt/SystemExit are not swallowed.
        if cursor:
            try:
                cursor.close()
            except Exception:
                pass
        if conn:
            try:
                conn.close()
            except Exception:
                pass
    return keywords
# ==================== 1. Core business configuration ====================
# Search keywords, fetched from the database once, when this module is
# imported. For local testing a literal list such as ['999 感冒灵颗粒'] can be
# assigned instead.
SEARCH_KEYWORDS = get_search_keywords_from_db()

# Reference only: the expected layout of the MySQL result table. It is kept
# verbatim so it can be compared with the real table; this module never
# executes it.
# CREATE_TABLE_SQL = """
# CREATE TABLE IF NOT EXISTS yjj_medicine_data (
#     id INT AUTO_INCREMENT PRIMARY KEY COMMENT '自增主键',
#     product_title VARCHAR(500) COMMENT '商品标题',
#     product_url VARCHAR(1000) COMMENT '商品详情页链接',
#     purchase_price DECIMAL(10,2) DEFAULT 0.00 COMMENT '采购价格',
#     discount_price DECIMAL(10,2) DEFAULT 0.00 COMMENT '折扣价格',
#     spec VARCHAR(200) DEFAULT '未知规格' COMMENT '规格',
#     box_count INT DEFAULT 1 COMMENT '盒数',
#     store_name VARCHAR(200) DEFAULT '未知店铺' COMMENT '店铺名称',
#     company_name VARCHAR(200) DEFAULT '未知公司' COMMENT '公司名称',
#     validity_date VARCHAR(100) DEFAULT '无有效期' COMMENT '有效日期',
#     production_date VARCHAR(100) DEFAULT '无生产日期' COMMENT '生产日期',
#     approval_number VARCHAR(100) DEFAULT '无批准文号' COMMENT '批准文号',
#     keyword VARCHAR(100) DEFAULT '无搜素关键词' COMMENT '搜素关键词',
#     collect_time DATETIME COMMENT '采集时间'
# ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='药九九采集数据';
# """

# ==================== 2. Anti-scraping configuration ====================
# Random delay ranges in seconds, meant to imitate the pacing of a human user.
MIN_CLICK_DELAY, MAX_CLICK_DELAY = 1.5, 3.5  # pause between clicks
MIN_INPUT_DELAY, MAX_INPUT_DELAY = 0.1, 0.3  # pause per typed character
MIN_PAGE_DELAY, MAX_PAGE_DELAY = 2.0, 4.0  # wait after a page has loaded

# Longer pause between two keywords (longer than the per-product delays).
MIN_KEYWORD_DELAY, MAX_KEYWORD_DELAY = 8.0, 15.0

# Scrolling behaviour.
# NOTE(review): the previous comment described this as "fixed 1400px ± 50px",
# but the configured target distance is 400 - confirm which one is intended.
SCROLL_TARGET_DISTANCE = 400  # target scroll distance
SCROLL_OFFSET_RANGE = 50  # random offset applied around the target
SCROLL_STEP = 50  # distance per scroll step (smaller = slower, more human-like)
SCROLL_INTERVAL = 0.05  # pause between two steps, in seconds
# ==================== 3. Cookie & login configuration ====================
COOKIE_FILE_PATH = "yjj_cookies.json"  # file the session cookies are saved to
# Page that needs a logged-in session; used to check whether cookies are valid.
LOGIN_VALIDATE_URL = "https://www.yyjzt.com/login"

# Account credentials. Set YJJ_USERNAME / YJJ_PASSWORD in the environment to
# override them; when a variable is unset or empty the historical value is
# used, so existing deployments behave exactly as before.
# SECURITY: the fallback literals are real credentials committed to source
# control. Move them to the environment/.env and rotate them.
USERNAME = os.getenv("YJJ_USERNAME") or "18971731507"
PASSWORD = os.getenv("YJJ_PASSWORD") or "Jzt000000"

# URL opened to start the login flow. A deep-link alternative that was used
# previously:
# "https://www.yyjzt.com/login?redirect=%2FgoodDetail%3FladderNum%26itemStoreId%3D124250306%26sourceProdetail%3D%252Fsearch%26is_store%3D0"
TARGET_LOGIN_URL = "https://www.yyjzt.com/"

# ==================== 4. Element selector configuration ====================
# Login and search controls
USERNAME_SELECTOR = "input[placeholder*=请输入手机号]"
PASSWORD_SELECTOR = "input[placeholder*=请填写登录密码]"
LOGIN_BTN_SELECTOR = "button:has(span:text('账号登录'))"
SEARCH_INPUT_SELECTOR = "input[placeholder*='药名/店铺/品牌/厂家']"
SEARCH_BTN_SELECTOR = ".el-button.ph-si-btn"

# Selectors for the scraped fields (adjust them when the page markup changes)
PRODUCT_ITEM_SELECTOR = "div.sr-list-item[data-item_loc]"  # product item container
PRODUCT_TITLE_SELECTOR = "span.gc-l3-name"  # product title
PRODUCT_PRICE_SELECTOR = "span.gc-l2-price"  # product price (first match is used)
PRODUCT_STORE_SELECTOR = 'span.gc-l7-shop-store-name'  # store name
PRODUCT_COMPANY_SELECTOR = "div.gc-l4"  # company name
PRODUCT_VALIDITY_SELECTOR = "span.el-tooltip"  # expiry date

# ==================== 5. Wait-time configuration (milliseconds) ====================
ELEMENT_TIMEOUT = 10000
LOGIN_AFTER_CLICK = 5000
SEARCH_BTN_TIMEOUT = 5000
COLLECT_DELAY = 3000
DETAIL_LOAD_TIMEOUT = 5000  # wait for the detail view after clicking a product

# ==================== 6. Browser configuration ====================
BROWSER_HEADLESS = False
BROWSER_CHANNEL = "chrome"
SLOW_MO_MIN = 50
SLOW_MO_MAX = 100
# ==================== 7. CSV configuration ====================
CSV_HEADERS = [
    "商品标题",
    "商品采购价格",
    "商品折扣价格",
    "规格",
    "盒数",
    "店铺名称",
    "公司名称",
    "有效日期",
    "生产日期",
    "批准文号",
    "采集时间",
]  # CSV header row
# NOTE: CSV_FILE_PATH embeds a dynamic timestamp, so it is defined in the
# main script rather than here.

# Path for storing business-license images (currently unset)
# cropped_screenshot_path =

# ---------------------- Baidu OCR configuration ----------------------
# Secrets can be overridden with YJJ_BAIDU_OCR_APP_KEY / YJJ_BAIDU_OCR_APP_SECRET;
# when a variable is unset or empty the historical value is used.
# SECURITY: the fallback literals are real secrets committed to source
# control. Move them to the environment/.env and rotate them.
request_url_config = "https://aip.baidubce.com/rest/2.0/ocr/v1/business_license"
AppKey_config = os.getenv("YJJ_BAIDU_OCR_APP_KEY") or "tRK2RhyItCSh6BzyT4CNVXQa"
AppSecret_config = os.getenv("YJJ_BAIDU_OCR_APP_SECRET") or "TDgKiPo94i2mOM1sDqOuDnlcK1bG66jh"
token_url_config = 'https://aip.baidubce.com/oauth/2.0/token'

# ---------------------- OSS configuration ----------------------
# Secrets can be overridden with YJJ_OSS_ACCESS_KEY_ID / YJJ_OSS_ACCESS_KEY_SECRET
# (same fallback rule and the same SECURITY caveat as above).
OSS_ACCESS_KEY_ID = os.getenv("YJJ_OSS_ACCESS_KEY_ID") or 'LTAI5tDwjfteBvivYN41r8sJ'
OSS_ACCESS_KEY_SECRET = os.getenv("YJJ_OSS_ACCESS_KEY_SECRET") or 'yowuOGi2nYYnrqGpO3qcz94C4brcPp'
OSS_ENDPOINT = "oss-cn-shenzhen.aliyuncs.com"
OSS_BUCKET_NAME = "zhijiayun-jiansuo"
# NOTE(review): not referenced by the upload helper below, which defaults to
# the "screenshots/" prefix - confirm which prefix is intended.
OSS_PREFIX = "scrape_data/"

# Local screenshot settings
LOCAL_SCREENSHOT_DIR = "local_screenshots"  # directory for raw screenshots
LOCAL_SCREENSHOT_NAME = None  # file names are generated automatically
LOCAL_CROPPED_DIR = "./local_cropped_screenshots"  # directory for cropped images

# Image compression settings
IMAGE_COMPRESS_ENABLE = True  # True = compress cropped images, False = save as is
IMAGE_COMPRESS_QUALITY = 30  # JPEG quality (1-95; higher = better image, larger file)
IMAGE_COMPRESS_PNG_LEVEL = 9  # PNG zlib level (0-9; higher = smaller file, slower)


# ---------------------- Helper functions ----------------------
def init_local_screenshot_dir():
    """Create ``LOCAL_SCREENSHOT_DIR`` if it is missing and print the outcome."""
    if not os.path.exists(LOCAL_SCREENSHOT_DIR):
        # exist_ok guards against another process creating the directory
        # between the check above and this call.
        os.makedirs(LOCAL_SCREENSHOT_DIR, exist_ok=True)
        print(f"本地截图目录【{LOCAL_SCREENSHOT_DIR}】创建成功")
    else:
        print(f"本地截图目录【{LOCAL_SCREENSHOT_DIR}】已存在")


def init_oss_bucket():
    """Build the OSS bucket client used for uploads.

    Returns:
        The ``oss2.Bucket`` for ``OSS_BUCKET_NAME`` at ``OSS_ENDPOINT``.

    Raises:
        Exception: whatever ``oss2`` raises while creating the client or
            fetching the bucket info; it is printed and re-raised.
    """
    try:
        auth = oss2.Auth(OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET)
        bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET_NAME)
        # Fetch the bucket info up front so bad credentials or a wrong bucket
        # surface here rather than at the first upload.
        bucket.get_bucket_info()
        print("OSS Bucket 初始化成功")
        return bucket
    except Exception as e:
        print(f"OSS Bucket 初始化失败:{str(e)}")
        raise


def upload_local_screenshot_to_oss(bucket, local_file_path, oss_file_path=None):
    """Upload a local file to OSS and return its URL.

    :param bucket: bucket object exposing ``put_object_from_file(key, path)``
        (as returned by ``init_oss_bucket``)
    :param local_file_path: path of the local file to upload
    :param oss_file_path: object key inside the bucket; defaults to
        ``screenshots/<local file name>``
    :return: ``https://<bucket>.<endpoint>/<key>`` for the uploaded object
    :raises FileNotFoundError: if ``local_file_path`` does not exist
    :raises Exception: upload errors are printed and re-raised
    """
    if not os.path.exists(local_file_path):
        raise FileNotFoundError(f"本地截图文件不存在:{local_file_path}")

    if not oss_file_path:
        # Reuse the local file name so local and remote copies are easy to match.
        local_file_name = os.path.basename(local_file_path)
        oss_file_path = f"screenshots/{local_file_name}"

    try:
        bucket.put_object_from_file(oss_file_path, local_file_path)
        oss_file_url = f"https://{OSS_BUCKET_NAME}.{OSS_ENDPOINT}/{oss_file_path}"
        print(f"本地截图上传OSS成功,访问链接:{oss_file_url}")
        return oss_file_url
    except Exception as e:
        print(f"本地截图上传OSS失败:{str(e)}")
        raise


def _default_crop_region(img_width, img_height):
    """Return the default box: middle 80% of the width, top 30% of the height."""
    return (int(img_width * 0.1), 0, int(img_width * 0.9), int(img_height * 0.3))


def _validate_crop_region(crop_region, img_width, img_height):
    """Raise ValueError unless the box lies inside the image and is non-empty."""
    c_left, c_upper, c_right, c_lower = crop_region
    if c_right > img_width or c_lower > img_height or c_left < 0 or c_upper < 0:
        raise ValueError(f"裁剪区域超出图片尺寸!图片尺寸:({img_width}, {img_height}),裁剪区域:{crop_region}")
    if c_left >= c_right or c_upper >= c_lower:
        raise ValueError(f"裁剪区域无效(需满足 left < right 且 upper < lower):{crop_region}")


def _save_cropped_image(cropped_img, cropped_file_path):
    """Write the cropped image as PNG for ``.png`` targets and as JPEG otherwise."""
    file_ext = os.path.splitext(cropped_file_path)[1].lower()
    save_as_png = file_ext == '.png'
    image_format = 'PNG' if save_as_png else 'JPEG'
    if not save_as_png and cropped_img.mode not in ('RGB', 'L', 'CMYK'):
        # JPEG cannot store alpha or palette data (RGBA/LA/P screenshots).
        cropped_img = cropped_img.convert('RGB')

    try:
        if not IMAGE_COMPRESS_ENABLE:
            cropped_img.save(cropped_file_path, format=image_format)
            print(f"未开启压缩,裁剪图片直接保存到:{cropped_file_path}")
        elif save_as_png:
            # optimize=True would force level 9, so pass only the level to
            # honour the configured value.
            cropped_img.save(
                cropped_file_path,
                format='PNG',
                compress_level=IMAGE_COMPRESS_PNG_LEVEL,
            )
            print(f"PNG图片压缩保存成功,压缩级别:{IMAGE_COMPRESS_PNG_LEVEL},保存到:{cropped_file_path}")
        else:
            cropped_img.save(
                cropped_file_path,
                format='JPEG',
                quality=IMAGE_COMPRESS_QUALITY,  # lossy quality setting
                optimize=True,  # extra encoder pass for a smaller file
                progressive=True,  # progressive JPEG, friendlier for web viewing
            )
            print(f"JPG图片压缩保存成功,压缩质量:{IMAGE_COMPRESS_QUALITY},保存到:{cropped_file_path}")
    except Exception as e:
        # Fallback: one plain save so a compression problem does not abort the
        # pipeline; an error from this second attempt propagates to the caller.
        cropped_img.save(cropped_file_path, format=image_format)
        print(f"JPG图片压缩失败,已直接保存未压缩版本:{str(e)}")


def crop_local_screenshot(local_file_path, cropped_file_path=None, crop_region=None):
    """Crop a local screenshot, save the result, then delete the original.

    :param local_file_path: path of the original screenshot
    :param cropped_file_path: where to save the cropped image; defaults to
        ``LOCAL_CROPPED_DIR/<name>_cropped<ext>``
    :param crop_region: ``(left, upper, right, lower)`` box in pixels;
        defaults to the middle 80% of the width and the top 30% of the height
    :return: path of the cropped image
    :raises FileNotFoundError: if ``local_file_path`` does not exist
    :raises ValueError: if the crop box is empty, inverted or outside the image
    """
    if not os.path.exists(local_file_path):
        raise FileNotFoundError(f"原始截图文件不存在:{local_file_path}")

    os.makedirs(LOCAL_CROPPED_DIR, exist_ok=True)

    if not cropped_file_path:
        # "<name>_cropped<ext>" keeps the output distinct from the original.
        file_name = os.path.basename(local_file_path)
        file_name_no_ext, file_ext = os.path.splitext(file_name)
        cropped_file_name = f"{file_name_no_ext}_cropped{file_ext}"
        cropped_file_path = os.path.join(LOCAL_CROPPED_DIR, cropped_file_name)
    else:
        # A caller-supplied target may live outside LOCAL_CROPPED_DIR.
        target_dir = os.path.dirname(cropped_file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    with Image.open(local_file_path) as img:
        img_width, img_height = img.size
        print(f"获取截图尺寸:宽={img_width},高={img_height}")

        if not crop_region:
            crop_region = _default_crop_region(img_width, img_height)
            print(f"未指定裁剪区域,默认裁剪中间30%区域:{crop_region}")

        _validate_crop_region(crop_region, img_width, img_height)
        # crop() loads the pixel data, so the result stays usable after the
        # source file is closed.
        cropped_img = img.crop(crop_region)

    # Save after the source file is closed, so writing to (or deleting) the
    # original path cannot collide with an open handle.
    _save_cropped_image(cropped_img, cropped_file_path)

    # Delete the original only once the cropped file really exists, and never
    # when both paths point at the same file (that would delete the result).
    same_file = os.path.abspath(cropped_file_path) == os.path.abspath(local_file_path)
    try:
        if same_file:
            print(f"裁剪文件与原始截图为同一路径,跳过删除:{local_file_path}")
        elif os.path.exists(cropped_file_path):
            os.remove(local_file_path)
            print(f"原始截图文件已删除:{local_file_path}")
        else:
            print(f"裁剪文件未生成,暂不删除原始截图:{local_file_path}")
    except OSError as e:
        print(f"删除原始截图文件失败(文件可能被占用):{str(e)}")

    return cropped_file_path


def screenshot_target_page_to_local_then_oss(target_page, local_file_path=None, oss_file_path=None, full_page=True, crop_region=None):
    """Screenshot a page to disk, crop it (removing the original) and upload the crop.

    :param target_page: page object exposing ``screenshot(path=..., full_page=...,
        omit_background=..., timeout=...)`` (a Playwright ``Page``)
    :param local_file_path: full path for the raw screenshot; defaults to
        ``LOCAL_SCREENSHOT_DIR/<YYYYmmdd_HHMMSS>_target_page.jpg``
    :param oss_file_path: object key on OSS (optional)
    :param full_page: True = whole page, False = current viewport only
    :param crop_region: ``(left, upper, right, lower)`` crop box (optional)
    :return: tuple ``(cropped_file_path, oss_file_url)``
    """
    # 1. Make sure the screenshot directory exists before writing into it.
    os.makedirs(LOCAL_SCREENSHOT_DIR, exist_ok=True)

    # 2. Build a timestamped default path when the caller gave none.
    if not local_file_path:
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        local_file_name = f"{current_time}_target_page.jpg"
        local_file_path = os.path.join(LOCAL_SCREENSHOT_DIR, local_file_name)

    # 3. Take the screenshot straight into the local file.
    print(f"正在对target_page截图,将保存到:{local_file_path}")
    target_page.screenshot(
        path=local_file_path,
        full_page=full_page,
        omit_background=False,
        timeout=10000,  # screenshot timeout in milliseconds
    )
    print(f"本地截图保存成功")

    # 4. Crop the screenshot; the raw file is deleted by the crop helper.
    cropped_file_path = crop_local_screenshot(
        local_file_path=local_file_path,
        crop_region=crop_region,
    )

    # 5. Create the OSS client.
    bucket = init_oss_bucket()

    # 6. Upload the cropped image (not the raw screenshot).
    oss_file_url = upload_local_screenshot_to_oss(bucket, cropped_file_path, oss_file_path)

    # 7. Hand back both the local path and the OSS URL.
    return cropped_file_path, oss_file_url