| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- from .config import *
- import pymysql
- import re
- from .logger_config import logger
- from datetime import datetime
- # 1. 中文数字/量词转阿拉伯数字的映射
- CHN_NUM_MAP = {
- # 基础数字
- '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
- '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
- # 十位数字
- '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14,
- '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19,
- '二十': 20, '三十': 30, '四十': 40, '五十': 50,
- '六十': 60, '七十': 70, '八十': 80, '九十': 90,
- # 量词(单=1)
- '单': 1, "两": 2,
- }
- # 检测+号(最高优先级,出现则跳过处理)
- PLUS_SIGN_PATTERN = re.compile(r'\+', re.IGNORECASE)
- # 检测“本品”关键词(+号场景的例外)
- BEN_PIN_PATTERN = re.compile(r'本品', re.IGNORECASE)
- # 2. 匹配“拍几/发几”(最高优先级,只要出现就强制设1)
- PAI_FA_PATTERN = re.compile(r'(?=.*(拍|选)几)(?=.*发几)', re.IGNORECASE)
- # 匹配“需要多少盒下多少件”相关文本
- NEED_BOX_PATTERN = re.compile(r'需要多少盒下多少件|要多少盒下多少件|需多少盒下多少件', re.IGNORECASE)
- # 正则表达式:匹配“盒”前面的数字(支持 1盒、1 盒、1*盒、1×盒 等格式) #忽略大小写
- BOX_NUM_PATTERN = re.compile(
- r'(单|[零一二三四五六七八九十百]+|[- ]*\d+)\s*[×*]?\s*(盒|瓶)',
- re.IGNORECASE
- )
- # 需排除的非盒数单位(可根据业务扩展)
- EXCLUDE_UNITS = {'%', 'g', 'mg', 'ml', 'kg', 'l', 'μl', '片', '粒', '袋'}
- # DEFAULT_BOX_PATTERN = re.compile(r'标准装|无规格', re.IGNORECASE)
- def chn_num_to_arab(chn_num):
- """将中文数字转为阿拉伯数字(支持一到九十九)"""
- if not chn_num:
- return None
- # 先处理直接映射的情况(如“十”“二十”)
- if chn_num in CHN_NUM_MAP:
- return CHN_NUM_MAP[chn_num]
- # 处理特殊情况(如“一十一”→“十一”)
- chn_num = chn_num.replace('一十一', '十一').replace('一十二', '十二')
- return CHN_NUM_MAP.get(chn_num, None)
- def is_pure_box_number(num_str, credit_code, match_pos):
- """
- 判断数字是否为纯盒数(非单位相关数字)
- :param num_str: 匹配到的数字字符串
- :param credit_code: 原始文本
- :param match_pos: 数字在文本中的起始位置
- :return: True=纯盒数,False=带单位的数字
- """
- num_len = len(num_str)
- end_pos = match_pos + num_len
- max_pos = len(credit_code)
- # 检查数字前1个字符是否是排除单位
- if match_pos > 0:
- prev_char = credit_code[match_pos-1].lower()
- if prev_char in EXCLUDE_UNITS:
- return False
- # 检查数字后1-2个字符是否是排除单位(兼容mg/ml等双字符单位)
- for i in range(1, 3):
- if end_pos + i <= max_pos:
- next_chars = credit_code[end_pos:end_pos+i].lower()
- if next_chars in EXCLUDE_UNITS:
- return False
- # 检查数字后紧跟的单个字符
- if end_pos < max_pos:
- next_char = credit_code[end_pos].lower()
- if next_char in EXCLUDE_UNITS:
- return False
- # 检查数字前后是否有空格分隔的单位(如“10 g”)
- # 截取数字前后5个字符的上下文
- context_start = max(0, match_pos - 5)
- context_end = min(max_pos, end_pos + 5)
- context = credit_code[context_start:context_end].lower()
- for unit in EXCLUDE_UNITS:
- if f" {unit}" in context or f"{unit} " in context:
- return False
- return True
- def extract_box_number(credit_code):
- """
- 提取数字规则(优先级从高到低):
- 1. 含+号但不含本品 → 返回SKIP(跳过)
- 2. 含+号且含本品 / 不含+号 → 按正常规则提取数值:
- 3. 含“拍几/发几” → 强制返回1
- 4. 数字/单+盒/瓶 → 返回对应数字
- 5. 标准装/无规格 → 返回1
- 6. 其他 → 返回None
- """
- # 盒数超过1000时返回None,触发“跳过更新”逻辑
- MAX_BOX_NUM = 1000
- #标准装和无规格的,你不用去更新number和availability
- #不好处理的直接跳过
- # 先判断是否为有效字符串
- if not isinstance(credit_code, str) or credit_code.strip() == '':
- return None
- # 第一步:处理+号场景
- # 如果search()找到 + 号(返回匹配对象)→ 匹配对象 is not None → 结果是True(真);
- # 如果search()没找到 + 号(返回None)→ None is not None → 结果是False(假)。
- # has_plus = PLUS_SIGN_PATTERN.search(credit_code) is not None
- # has_ben_pin = BEN_PIN_PATTERN.search(credit_code) is not None
- # if has_plus and not has_ben_pin:
- # return None # 含+号无本品 → 跳过
- # (最高优先级):匹配“拍几发几”或“需要多少盒下多少件”,强制返回1
- if PAI_FA_PATTERN.search(credit_code) or NEED_BOX_PATTERN.search(credit_code):
- logger.debug(f"文本[{credit_code}]匹配'拍几发几'或'需要多少盒下多少件',强制返回盒数1")
- return 1
- # 用正则匹配数字
- num_match = BOX_NUM_PATTERN.search(credit_code)
- if num_match:
- # 第一步:匹配数字+盒/瓶/支(原有逻辑)
- num_part = num_match.group(1).strip()
- if num_part.isdigit():
- # 阿拉伯数字直接转
- box_num = int(num_part)
- else:
- # 中文数字转阿拉伯数字
- box_num = chn_num_to_arab(num_part)
- if box_num and box_num <= MAX_BOX_NUM:
- return box_num
- # 仅匹配 1-20 的单独数字
- num_only_pattern = re.compile(r'\b(1\d|20|[1-9])\b', re.IGNORECASE)
- num_matches = list(num_only_pattern.finditer(credit_code)) # 找到所有1~20的数字
- for match in num_matches:
- num_str = match.group()
- match_pos = match.start()
- # 判断是否为纯盒数(非单位相关)
- if is_pure_box_number(num_str, credit_code, match_pos):
- box_num = int(num_str)
- if 1 <= box_num <= MAX_BOX_NUM:
- logger.debug(f"文本[{credit_code}]匹配纯数字{box_num}(非单位相关)")
- return box_num
- return None
- def get_db_connect():
- """获取数据库连接"""
- conn = None
- cursor = None
- try:
- required_configs = ['host', 'user', 'password', 'database']
- for cfg in required_configs:
- if cfg not in MYSQL_CONFIG:
- raise ValueError(f"MYSQL_CONFIG缺失必要配置:{cfg}")
- conn = pymysql.connect(**MYSQL_CONFIG)
- cursor = conn.cursor()
- return conn, cursor
- except Exception as e:
- logger.error(f"数据库连接失败:{str(e)}")
- if cursor:
- cursor.close()
- if conn:
- conn.close()
- raise
- def get_today_date():
- return datetime.now().strftime('%Y-%m-%d')
- def update_availability_by_credit_code():
- """读取credit_code并更新availability字段"""
- conn = None
- cursor = None
- try:
- conn, cursor = get_db_connect()
- """改日期"""
- scrape_date = get_today_date()
- query_sql = "SELECT id, credit_code FROM pdd_drug_middle WHERE scrape_date = %s and availability != 1"
- cursor.execute(query_sql,(scrape_date))
- records = cursor.fetchall() # 获取所有记录
- if not records:
- logger.info("没查询到结果")
- return
- total_count = len(records)
- success_count = 0
- fail_count = 0
- for record in records:
- record_id = record['id']
- credit_code = record['credit_code']
- # 提取盒数
- box_num = extract_box_number(credit_code)
- # 提取成功(有数字)则availability=1,否则=0
- # 根据box_num是否存在动态生成SQL
- availability = 1 if box_num else 0
- if box_num is not None and box_num >= 1:
- update_sql = """
- UPDATE pdd_drug_middle
- SET availability = %s, # 修改availability, number字段
- number = %s
- WHERE id = %s # 用id找对应的记录
- """
- execute_params = (availability, box_num, record_id)
- success_count += 1
- logger.debug(f"ID:{record_id} 提取成功,盒数:{box_num},文本:{credit_code}")
- else:
- # 无值时只更新availability
- update_sql = """
- UPDATE pdd_drug_middle
- SET availability = %s
- WHERE id = %s
- """
- execute_params = (availability, record_id)
- fail_count += 1
- logger.debug(f"ID:{record_id} 提取失败,文本:{credit_code}")
- # 在cursor.execute前添加日志
- logger.debug(f"准备更新ID:{record_id},box_num={box_num}")
- cursor.execute(update_sql, execute_params)
- conn.commit()
- # 5. 计算成功率(避免除零错误)
- if total_count > 0:
- success_rate = (success_count / total_count) * 100
- else:
- success_rate = 0.0
- # 6. 输出详细统计结果
- logger.info(f"===== 处理结果统计 =====")
- logger.info(f"总处理记录数:{total_count} 条")
- logger.info(f"提取成功数:{success_count} 条(availability=1)")
- logger.info(f"提取失败数:{fail_count} 条(availability=0)")
- logger.info(f"提取成功率:{success_rate:.2f}%") # 保留2位小数,更直观
- except Exception as e:
- # 出错时回滚事务
- if conn:
- conn.rollback()
- logger.error(f"处理失败:{str(e)}", exc_info=True)
- raise
- finally:
- # 确保游标和连接最终关闭
- if cursor:
- cursor.close()
- if conn:
- conn.close()
- if __name__ == '__main__':
- update_availability_by_credit_code()
|