from .config import * import pymysql import re from .logger_config import logger from datetime import datetime # 1. 中文数字/量词转阿拉伯数字的映射 CHN_NUM_MAP = { # 基础数字 '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, # 十位数字 '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, '三十': 30, '四十': 40, '五十': 50, '六十': 60, '七十': 70, '八十': 80, '九十': 90, # 量词(单=1) '单': 1, "两": 2, } # 检测+号(最高优先级,出现则跳过处理) PLUS_SIGN_PATTERN = re.compile(r'\+', re.IGNORECASE) # 检测“本品”关键词(+号场景的例外) BEN_PIN_PATTERN = re.compile(r'本品', re.IGNORECASE) # 2. 匹配“拍几/发几”(最高优先级,只要出现就强制设1) PAI_FA_PATTERN = re.compile(r'(?=.*(拍|选)几)(?=.*发几)', re.IGNORECASE) # 匹配“需要多少盒下多少件”相关文本 NEED_BOX_PATTERN = re.compile(r'需要多少盒下多少件|要多少盒下多少件|需多少盒下多少件', re.IGNORECASE) # 正则表达式:匹配“盒”前面的数字(支持 1盒、1 盒、1*盒、1×盒 等格式) #忽略大小写 BOX_NUM_PATTERN = re.compile( r'(单|[零一二三四五六七八九十百]+|[- ]*\d+)\s*[×*]?\s*(盒|瓶)', re.IGNORECASE ) # 需排除的非盒数单位(可根据业务扩展) EXCLUDE_UNITS = {'%', 'g', 'mg', 'ml', 'kg', 'l', 'μl', '片', '粒', '袋'} # DEFAULT_BOX_PATTERN = re.compile(r'标准装|无规格', re.IGNORECASE) def chn_num_to_arab(chn_num): """将中文数字转为阿拉伯数字(支持一到九十九)""" if not chn_num: return None # 先处理直接映射的情况(如“十”“二十”) if chn_num in CHN_NUM_MAP: return CHN_NUM_MAP[chn_num] # 处理特殊情况(如“一十一”→“十一”) chn_num = chn_num.replace('一十一', '十一').replace('一十二', '十二') return CHN_NUM_MAP.get(chn_num, None) def is_pure_box_number(num_str, credit_code, match_pos): """ 判断数字是否为纯盒数(非单位相关数字) :param num_str: 匹配到的数字字符串 :param credit_code: 原始文本 :param match_pos: 数字在文本中的起始位置 :return: True=纯盒数,False=带单位的数字 """ num_len = len(num_str) end_pos = match_pos + num_len max_pos = len(credit_code) # 检查数字前1个字符是否是排除单位 if match_pos > 0: prev_char = credit_code[match_pos-1].lower() if prev_char in EXCLUDE_UNITS: return False # 检查数字后1-2个字符是否是排除单位(兼容mg/ml等双字符单位) for i in range(1, 3): if end_pos + i <= max_pos: next_chars = credit_code[end_pos:end_pos+i].lower() if next_chars in EXCLUDE_UNITS: return False # 检查数字后紧跟的单个字符 if end_pos < max_pos: next_char = credit_code[end_pos].lower() if next_char in EXCLUDE_UNITS: return False # 检查数字前后是否有空格分隔的单位(如“10 g”) # 截取数字前后5个字符的上下文 context_start = max(0, match_pos - 5) context_end = min(max_pos, end_pos + 5) context = credit_code[context_start:context_end].lower() for unit in EXCLUDE_UNITS: if f" {unit}" in context or f"{unit} " in context: return False return True def extract_box_number(credit_code): """ 提取数字规则(优先级从高到低): 1. 含+号但不含本品 → 返回SKIP(跳过) 2. 含+号且含本品 / 不含+号 → 按正常规则提取数值: 3. 含“拍几/发几” → 强制返回1 4. 数字/单+盒/瓶 → 返回对应数字 5. 标准装/无规格 → 返回1 6. 其他 → 返回None """ # 盒数超过1000时返回None,触发“跳过更新”逻辑 MAX_BOX_NUM = 1000 #标准装和无规格的,你不用去更新number和availability #不好处理的直接跳过 # 先判断是否为有效字符串 if not isinstance(credit_code, str) or credit_code.strip() == '': return None # 第一步:处理+号场景 # 如果search()找到 + 号(返回匹配对象)→ 匹配对象 is not None → 结果是True(真); # 如果search()没找到 + 号(返回None)→ None is not None → 结果是False(假)。 # has_plus = PLUS_SIGN_PATTERN.search(credit_code) is not None # has_ben_pin = BEN_PIN_PATTERN.search(credit_code) is not None # if has_plus and not has_ben_pin: # return None # 含+号无本品 → 跳过 # (最高优先级):匹配“拍几发几”或“需要多少盒下多少件”,强制返回1 if PAI_FA_PATTERN.search(credit_code) or NEED_BOX_PATTERN.search(credit_code): logger.debug(f"文本[{credit_code}]匹配'拍几发几'或'需要多少盒下多少件',强制返回盒数1") return 1 # 用正则匹配数字 num_match = BOX_NUM_PATTERN.search(credit_code) if num_match: # 第一步:匹配数字+盒/瓶/支(原有逻辑) num_part = num_match.group(1).strip() if num_part.isdigit(): # 阿拉伯数字直接转 box_num = int(num_part) else: # 中文数字转阿拉伯数字 box_num = chn_num_to_arab(num_part) if box_num and box_num <= MAX_BOX_NUM: return box_num # 仅匹配 1-20 的单独数字 num_only_pattern = re.compile(r'\b(1\d|20|[1-9])\b', re.IGNORECASE) num_matches = list(num_only_pattern.finditer(credit_code)) # 找到所有1~20的数字 for match in num_matches: num_str = match.group() match_pos = match.start() # 判断是否为纯盒数(非单位相关) if is_pure_box_number(num_str, credit_code, match_pos): box_num = int(num_str) if 1 <= box_num <= MAX_BOX_NUM: logger.debug(f"文本[{credit_code}]匹配纯数字{box_num}(非单位相关)") return box_num return None def get_db_connect(): """获取数据库连接""" conn = None cursor = None try: required_configs = ['host', 'user', 'password', 'database'] for cfg in required_configs: if cfg not in MYSQL_CONFIG: raise ValueError(f"MYSQL_CONFIG缺失必要配置:{cfg}") conn = pymysql.connect(**MYSQL_CONFIG) cursor = conn.cursor() return conn, cursor except Exception as e: logger.error(f"数据库连接失败:{str(e)}") if cursor: cursor.close() if conn: conn.close() raise def get_today_date(): return datetime.now().strftime('%Y-%m-%d') def update_availability_by_credit_code(): """读取credit_code并更新availability字段""" conn = None cursor = None try: conn, cursor = get_db_connect() """改日期""" scrape_date = get_today_date() query_sql = "SELECT id, credit_code FROM pdd_drug_middle WHERE scrape_date = %s and availability != 1" cursor.execute(query_sql,(scrape_date)) records = cursor.fetchall() # 获取所有记录 if not records: logger.info("没查询到结果") return total_count = len(records) success_count = 0 fail_count = 0 for record in records: record_id = record['id'] credit_code = record['credit_code'] # 提取盒数 box_num = extract_box_number(credit_code) # 提取成功(有数字)则availability=1,否则=0 # 根据box_num是否存在动态生成SQL availability = 1 if box_num else 0 if box_num is not None and box_num >= 1: update_sql = """ UPDATE pdd_drug_middle SET availability = %s, # 修改availability, number字段 number = %s WHERE id = %s # 用id找对应的记录 """ execute_params = (availability, box_num, record_id) success_count += 1 logger.debug(f"ID:{record_id} 提取成功,盒数:{box_num},文本:{credit_code}") else: # 无值时只更新availability update_sql = """ UPDATE pdd_drug_middle SET availability = %s WHERE id = %s """ execute_params = (availability, record_id) fail_count += 1 logger.debug(f"ID:{record_id} 提取失败,文本:{credit_code}") # 在cursor.execute前添加日志 logger.debug(f"准备更新ID:{record_id},box_num={box_num}") cursor.execute(update_sql, execute_params) conn.commit() # 5. 计算成功率(避免除零错误) if total_count > 0: success_rate = (success_count / total_count) * 100 else: success_rate = 0.0 # 6. 输出详细统计结果 logger.info(f"===== 处理结果统计 =====") logger.info(f"总处理记录数:{total_count} 条") logger.info(f"提取成功数:{success_count} 条(availability=1)") logger.info(f"提取失败数:{fail_count} 条(availability=0)") logger.info(f"提取成功率:{success_rate:.2f}%") # 保留2位小数,更直观 except Exception as e: # 出错时回滚事务 if conn: conn.rollback() logger.error(f"处理失败:{str(e)}", exc_info=True) raise finally: # 确保游标和连接最终关闭 if cursor: cursor.close() if conn: conn.close() if __name__ == '__main__': update_availability_by_credit_code()