main.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. from .config import *
  2. import pymysql
  3. import re
  4. from .logger_config import logger
  5. from datetime import datetime
  6. # 1. 中文数字/量词转阿拉伯数字的映射
  7. CHN_NUM_MAP = {
  8. # 基础数字
  9. '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
  10. '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
  11. # 十位数字
  12. '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14,
  13. '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19,
  14. '二十': 20, '三十': 30, '四十': 40, '五十': 50,
  15. '六十': 60, '七十': 70, '八十': 80, '九十': 90,
  16. # 量词(单=1)
  17. '单': 1, "两": 2,
  18. }
  19. # 检测+号(最高优先级,出现则跳过处理)
  20. PLUS_SIGN_PATTERN = re.compile(r'\+', re.IGNORECASE)
  21. # 检测“本品”关键词(+号场景的例外)
  22. BEN_PIN_PATTERN = re.compile(r'本品', re.IGNORECASE)
  23. # 2. 匹配“拍几/发几”(最高优先级,只要出现就强制设1)
  24. PAI_FA_PATTERN = re.compile(r'(?=.*(拍|选)几)(?=.*发几)', re.IGNORECASE)
  25. # 匹配“需要多少盒下多少件”相关文本
  26. NEED_BOX_PATTERN = re.compile(r'需要多少盒下多少件|要多少盒下多少件|需多少盒下多少件', re.IGNORECASE)
  27. # 正则表达式:匹配“盒”前面的数字(支持 1盒、1 盒、1*盒、1×盒 等格式) #忽略大小写
  28. BOX_NUM_PATTERN = re.compile(
  29. r'(单|[零一二三四五六七八九十百]+|[- ]*\d+)\s*[×*]?\s*(盒|瓶)',
  30. re.IGNORECASE
  31. )
  32. # 需排除的非盒数单位(可根据业务扩展)
  33. EXCLUDE_UNITS = {'%', 'g', 'mg', 'ml', 'kg', 'l', 'μl', '片', '粒', '袋'}
  34. # DEFAULT_BOX_PATTERN = re.compile(r'标准装|无规格', re.IGNORECASE)
  35. def chn_num_to_arab(chn_num):
  36. """将中文数字转为阿拉伯数字(支持一到九十九)"""
  37. if not chn_num:
  38. return None
  39. # 先处理直接映射的情况(如“十”“二十”)
  40. if chn_num in CHN_NUM_MAP:
  41. return CHN_NUM_MAP[chn_num]
  42. # 处理特殊情况(如“一十一”→“十一”)
  43. chn_num = chn_num.replace('一十一', '十一').replace('一十二', '十二')
  44. return CHN_NUM_MAP.get(chn_num, None)
  45. def is_pure_box_number(num_str, credit_code, match_pos):
  46. """
  47. 判断数字是否为纯盒数(非单位相关数字)
  48. :param num_str: 匹配到的数字字符串
  49. :param credit_code: 原始文本
  50. :param match_pos: 数字在文本中的起始位置
  51. :return: True=纯盒数,False=带单位的数字
  52. """
  53. num_len = len(num_str)
  54. end_pos = match_pos + num_len
  55. max_pos = len(credit_code)
  56. # 检查数字前1个字符是否是排除单位
  57. if match_pos > 0:
  58. prev_char = credit_code[match_pos-1].lower()
  59. if prev_char in EXCLUDE_UNITS:
  60. return False
  61. # 检查数字后1-2个字符是否是排除单位(兼容mg/ml等双字符单位)
  62. for i in range(1, 3):
  63. if end_pos + i <= max_pos:
  64. next_chars = credit_code[end_pos:end_pos+i].lower()
  65. if next_chars in EXCLUDE_UNITS:
  66. return False
  67. # 检查数字后紧跟的单个字符
  68. if end_pos < max_pos:
  69. next_char = credit_code[end_pos].lower()
  70. if next_char in EXCLUDE_UNITS:
  71. return False
  72. # 检查数字前后是否有空格分隔的单位(如“10 g”)
  73. # 截取数字前后5个字符的上下文
  74. context_start = max(0, match_pos - 5)
  75. context_end = min(max_pos, end_pos + 5)
  76. context = credit_code[context_start:context_end].lower()
  77. for unit in EXCLUDE_UNITS:
  78. if f" {unit}" in context or f"{unit} " in context:
  79. return False
  80. return True
  81. def extract_box_number(credit_code):
  82. """
  83. 提取数字规则(优先级从高到低):
  84. 1. 含+号但不含本品 → 返回SKIP(跳过)
  85. 2. 含+号且含本品 / 不含+号 → 按正常规则提取数值:
  86. 3. 含“拍几/发几” → 强制返回1
  87. 4. 数字/单+盒/瓶 → 返回对应数字
  88. 5. 标准装/无规格 → 返回1
  89. 6. 其他 → 返回None
  90. """
  91. # 盒数超过1000时返回None,触发“跳过更新”逻辑
  92. MAX_BOX_NUM = 1000
  93. #标准装和无规格的,你不用去更新number和availability
  94. #不好处理的直接跳过
  95. # 先判断是否为有效字符串
  96. if not isinstance(credit_code, str) or credit_code.strip() == '':
  97. return None
  98. # 第一步:处理+号场景
  99. # 如果search()找到 + 号(返回匹配对象)→ 匹配对象 is not None → 结果是True(真);
  100. # 如果search()没找到 + 号(返回None)→ None is not None → 结果是False(假)。
  101. # has_plus = PLUS_SIGN_PATTERN.search(credit_code) is not None
  102. # has_ben_pin = BEN_PIN_PATTERN.search(credit_code) is not None
  103. # if has_plus and not has_ben_pin:
  104. # return None # 含+号无本品 → 跳过
  105. # (最高优先级):匹配“拍几发几”或“需要多少盒下多少件”,强制返回1
  106. if PAI_FA_PATTERN.search(credit_code) or NEED_BOX_PATTERN.search(credit_code):
  107. logger.debug(f"文本[{credit_code}]匹配'拍几发几'或'需要多少盒下多少件',强制返回盒数1")
  108. return 1
  109. # 用正则匹配数字
  110. num_match = BOX_NUM_PATTERN.search(credit_code)
  111. if num_match:
  112. # 第一步:匹配数字+盒/瓶/支(原有逻辑)
  113. num_part = num_match.group(1).strip()
  114. if num_part.isdigit():
  115. # 阿拉伯数字直接转
  116. box_num = int(num_part)
  117. else:
  118. # 中文数字转阿拉伯数字
  119. box_num = chn_num_to_arab(num_part)
  120. if box_num and box_num <= MAX_BOX_NUM:
  121. return box_num
  122. # 仅匹配 1-20 的单独数字
  123. num_only_pattern = re.compile(r'\b(1\d|20|[1-9])\b', re.IGNORECASE)
  124. num_matches = list(num_only_pattern.finditer(credit_code)) # 找到所有1~20的数字
  125. for match in num_matches:
  126. num_str = match.group()
  127. match_pos = match.start()
  128. # 判断是否为纯盒数(非单位相关)
  129. if is_pure_box_number(num_str, credit_code, match_pos):
  130. box_num = int(num_str)
  131. if 1 <= box_num <= MAX_BOX_NUM:
  132. logger.debug(f"文本[{credit_code}]匹配纯数字{box_num}(非单位相关)")
  133. return box_num
  134. return None
  135. def get_db_connect():
  136. """获取数据库连接"""
  137. conn = None
  138. cursor = None
  139. try:
  140. required_configs = ['host', 'user', 'password', 'database']
  141. for cfg in required_configs:
  142. if cfg not in MYSQL_CONFIG:
  143. raise ValueError(f"MYSQL_CONFIG缺失必要配置:{cfg}")
  144. conn = pymysql.connect(**MYSQL_CONFIG)
  145. cursor = conn.cursor()
  146. return conn, cursor
  147. except Exception as e:
  148. logger.error(f"数据库连接失败:{str(e)}")
  149. if cursor:
  150. cursor.close()
  151. if conn:
  152. conn.close()
  153. raise
  154. def get_today_date():
  155. return datetime.now().strftime('%Y-%m-%d')
  156. def update_availability_by_credit_code():
  157. """读取credit_code并更新availability字段"""
  158. conn = None
  159. cursor = None
  160. try:
  161. conn, cursor = get_db_connect()
  162. """改日期"""
  163. scrape_date = get_today_date()
  164. query_sql = "SELECT id, credit_code FROM pdd_drug_middle WHERE scrape_date = %s and availability != 1"
  165. cursor.execute(query_sql,(scrape_date))
  166. records = cursor.fetchall() # 获取所有记录
  167. if not records:
  168. logger.info("没查询到结果")
  169. return
  170. total_count = len(records)
  171. success_count = 0
  172. fail_count = 0
  173. for record in records:
  174. record_id = record['id']
  175. credit_code = record['credit_code']
  176. # 提取盒数
  177. box_num = extract_box_number(credit_code)
  178. # 提取成功(有数字)则availability=1,否则=0
  179. # 根据box_num是否存在动态生成SQL
  180. availability = 1 if box_num else 0
  181. if box_num is not None and box_num >= 1:
  182. update_sql = """
  183. UPDATE pdd_drug_middle
  184. SET availability = %s, # 修改availability, number字段
  185. number = %s
  186. WHERE id = %s # 用id找对应的记录
  187. """
  188. execute_params = (availability, box_num, record_id)
  189. success_count += 1
  190. logger.debug(f"ID:{record_id} 提取成功,盒数:{box_num},文本:{credit_code}")
  191. else:
  192. # 无值时只更新availability
  193. update_sql = """
  194. UPDATE pdd_drug_middle
  195. SET availability = %s
  196. WHERE id = %s
  197. """
  198. execute_params = (availability, record_id)
  199. fail_count += 1
  200. logger.debug(f"ID:{record_id} 提取失败,文本:{credit_code}")
  201. # 在cursor.execute前添加日志
  202. logger.debug(f"准备更新ID:{record_id},box_num={box_num}")
  203. cursor.execute(update_sql, execute_params)
  204. conn.commit()
  205. # 5. 计算成功率(避免除零错误)
  206. if total_count > 0:
  207. success_rate = (success_count / total_count) * 100
  208. else:
  209. success_rate = 0.0
  210. # 6. 输出详细统计结果
  211. logger.info(f"===== 处理结果统计 =====")
  212. logger.info(f"总处理记录数:{total_count} 条")
  213. logger.info(f"提取成功数:{success_count} 条(availability=1)")
  214. logger.info(f"提取失败数:{fail_count} 条(availability=0)")
  215. logger.info(f"提取成功率:{success_rate:.2f}%") # 保留2位小数,更直观
  216. except Exception as e:
  217. # 出错时回滚事务
  218. if conn:
  219. conn.rollback()
  220. logger.error(f"处理失败:{str(e)}", exc_info=True)
  221. raise
  222. finally:
  223. # 确保游标和连接最终关闭
  224. if cursor:
  225. cursor.close()
  226. if conn:
  227. conn.close()
  228. if __name__ == '__main__':
  229. update_availability_by_credit_code()