| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import re
# Packaging units recognised after a quantity (regex alternation).
_UNIT_PATTERN = '瓶|盒|支|只|个|袋|包|粒|片|贴|罐|桶|条|板|枚|颗|管|套|丸|锭|听'

# Chinese numeral tables used by _chinese_to_int.
_CN_DIGITS = {'一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
              '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
_CN_SMALL_UNITS = {'十': 10, '百': 100, '千': 1000}

# "pick N, ship M" promotion wording.
_PROMO_RE = re.compile(
    r'选\d+发\d+|选[一二两三四五六七八九十]+发[一二两三四五六七八九十]+'
)
# Packaging spec such as "*15袋/盒", "*24/粒/盒" or "*100/片/瓶装".
# \S*? keeps the fragment from running across whitespace, so a separate
# quantity later in the text ("... 10盒") is never swallowed by the spec.
_SPEC_RE = re.compile(r'\*\s*\d+\s*\S*?\s*/\s*(盒|瓶)(?:装)?')
# Explicit quantity: Arabic or Chinese number directly followed by a unit.
_EXPLICIT_RE = re.compile(
    r'([一二两三四五六七八九十百千万]+|\d+)\s*/?\s*(' + _UNIT_PATTERN + r')'
)
# Volume followed by "/unit", e.g. "100ml/盒".
_ML_PER_UNIT_RE = re.compile(
    r'[\d\.]+\s*ml\s*/\s*(' + _UNIT_PATTERN + r')', re.I
)
# Bare volume, e.g. "120ml".
_ML_RE = re.compile(r'[\d\.]+\s*ml', re.I)


def _chinese_to_int(s: str) -> int:
    """Convert a decimal-digit string or a Chinese numeral to an int.

    Handles the characters accepted by the explicit-quantity pattern:
    一..九, 两, 十, 百, 千, 万 (e.g. '两' -> 2, '十二' -> 12, '三十五' -> 35,
    '一百二十' -> 120). Characters outside those tables are ignored.
    """
    if s.isdecimal():
        return int(s)
    total = 0    # value already closed off by a 万
    section = 0  # value accumulated below the current 万
    digit = 0    # pending digit(s) waiting for a multiplier
    for ch in s:
        if ch in _CN_DIGITS:
            digit = digit * 10 + _CN_DIGITS[ch]
        elif ch in _CN_SMALL_UNITS:
            # A bare multiplier ("十") counts as one of that unit.
            section += (digit or 1) * _CN_SMALL_UNITS[ch]
            digit = 0
        elif ch == '万':
            total += ((section + digit) or 1) * 10000
            section = 0
            digit = 0
    return total + section + digit


def extract_quantity_and_unit(text: str):
    """Extract the purchased quantity and its unit from a drug-selection text.

    Rules are applied in this order; the first that matches wins:

    1. Promotion wording "选N发M" -> (1, '盒').
    2. An explicit number (Arabic or Chinese) followed by a unit, e.g.
       "10盒" or "三粒". The pack count inside a packaging spec such as
       "*15袋/盒" is NOT treated as an explicit quantity.
    3. A packaging spec "*N.../盒" or "*N.../瓶" (optionally followed by
       "装") with no explicit quantity -> (1, '盒') or (1, '瓶').
    4. A volume followed by "/unit", e.g. "100ml/盒" -> (1, unit).
    5. A bare volume, e.g. "120ml" -> (1, '瓶').

    Args:
        text: Raw selection / specification text.

    Returns:
        A ``(quantity, unit)`` tuple, or ``(None, None)`` when no rule matches.
    """
    # 1. Promotion wording always counts as a single box.
    if _PROMO_RE.search(text):
        return (1, '盒')

    # Locate packaging specs first and blank them out, so that "15袋" inside
    # "0.5g*15袋/盒" cannot be mistaken for an explicit quantity of 15 bags.
    spec_match = _SPEC_RE.search(text)
    searchable = _SPEC_RE.sub(' ', text) if spec_match else text

    # 2. Explicit number + unit outside of any packaging spec.
    explicit_match = _EXPLICIT_RE.search(searchable)
    if explicit_match:
        num_str, unit = explicit_match.groups()
        return (_chinese_to_int(num_str), unit)

    # 3. Only a packaging spec is present: one box / bottle.
    if spec_match:
        return (1, spec_match.group(1))

    # 4. Volume per unit, e.g. "100ml/盒".
    ml_unit_match = _ML_PER_UNIT_RE.search(text)
    if ml_unit_match:
        return (1, ml_unit_match.group(1))

    # 5. Bare volume implies a single bottle.
    if _ML_RE.search(text):
        return (1, '瓶')

    return (None, None)
# Sample inputs for a quick manual check. The trailing comment on each entry
# states the intended result for that input.
test_cases = [
    "0.5g*15袋/盒",         # spec only, no explicit quantity -> intended 1盒
    "0.33g*24/粒/盒",       # intended 1盒
    "0.5g*100/片/瓶",       # intended 1瓶
    "10mg*20/粒/瓶",        # intended 1瓶
    "已选: 2盒",            # intended 2盒
    "已选择: 25ml/瓶",      # intended 1瓶
    "0.5g*15袋/盒装",       # intended 1盒
    "0.5g*15袋",            # no "/盒" or "/瓶" suffix -> intended 15袋
    "0.5g*15袋/盒 10盒",    # explicit "10盒" present -> intended 10盒
    "10盒 0.5g*15袋/盒",    # intended 10盒
    "一支",                 # intended 1支
    "两粒",                 # intended 2粒
]

# Print each input (left-aligned, padded to 30 columns) next to the extracted
# quantity and unit; a missing unit is shown as an empty string.
for ex in test_cases:
    q, u = extract_quantity_and_unit(ex)
    print("{:30s} -> {}{}".format(ex, q, u or ''))
|