1 개월 전 · edaaeceb4a
--- a/config.py
+++ b/config.py
@@ -97,7 +97,7 @@ def fuzzy_match_product_url_in_db_mysql(product_url):
 
				 
			
 
				 # ==================== 从数据库提取任务 ====================
			
 
				 def get_search_keywords_from_db(platform: int = 9):
			
 
				-    """读取待执行任务（status=1），返回 [(task_id, brand, keyword, company_id), ...]。"""
			
 
				+    """读取待执行任务（status=1），返回 [(task_id, brand, name, specs, company_id), ...]。"""
			
 
				     keywords = []
			
 
				     conn = None
			
 
				     cursor = None
			
@@ -121,12 +121,11 @@ def get_search_keywords_from_db(platform: int = 9):
 
				             task_id = row[0]
			
 
				             brand = (row[1] or '').strip()
			
 
				             name = (row[2] or '').strip()
			
 
				+            specs = (row[3] or '').strip()
			
 
				             company_id = row[4] if row[4] is not None else 0
			
 
				 
			
 
				-            parts = [p for p in [brand, name] if p]
			
 
				-            if parts:
			
 
				-                keyword = ''.join(parts)
			
 
				-                keywords.append((task_id, brand, keyword, company_id))
			
 
				+
			
 
				+            keywords.append((task_id, brand, name, specs, company_id))
			
 
				 
			
 
				         logger.debug(f"读取待执行任务完成，platform={platform}，数量={len(keywords)}")
			
 
				     except Exception as e:
			
--- a/main.py
+++ b/main.py
@@ -1478,7 +1478,7 @@ def extract_province_city(address):
 
				 
			
 
				 
			
 
				 #采集数据核心
			
 
				-def collect_data(store_page, brand, keyword, company_id):
			
 
				+def collect_data(store_page, brand, name, keyword, spec, company_id):
			
 
				     """
			
 
				     1) 先获取当前页商品个数（count）
			
 
				     2) 按循环次数采集；每循环15次滚动一次 slow_scroll_1200px
			
@@ -1579,6 +1579,8 @@ def collect_data(store_page, brand, keyword, company_id):
 
				                     logger.info(f"✅ 提取到data-product-id：{product_id}")  # 输出：5678955
			
 
				                 else:
			
 
				                     logger.warning("⚠️ 未找到商品ID，使用默认空字符串")
			
 
				+
			
 
				+
			
 
				                 #3、 提取商品标题（处理空值）
			
 
				                 product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
			
 
				                 if product_locator.count() > 0:
			
@@ -1588,6 +1590,12 @@ def collect_data(store_page, brand, keyword, company_id):
 
				                     logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到，使用默认值：{title}")
			
 
				 
			
 
				 
			
 
				+                # 过滤无关商品：仅当标题同时不包含品牌、名称、规格三者时才跳过；注意空字符串恒被视为“包含”，任一字段为空时此过滤不会生效。
			
 
				+                if brand not in title and name not in title and spec not in title:
			
 
				+                    logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含品牌「{brand}」、名称「{name}」、规格「{spec}」，跳过本次循环")
			
 
				+                    continue
			
 
				+
			
 
				+
			
 
				                 #关键词不在标题中，跳过当前商品
			
 
				                 # core_keyword = re.sub(r'^999[\s\(\)（）、·]*', '', keyword)
			
 
				                 # if core_keyword not in title:
			
@@ -2309,7 +2317,7 @@ def main():
 
				             #     # [12,'金力舒阿莫西林克拉维酸钾片']
			
 
				             # ]
			
 
				             tasks = get_search_keywords_from_db()
			
 
				-            # tasks = [1, '金活','金活依马打正红花油',3],
			
 
				+            # tasks = [1, '金活','依马打正红花油','25ml',3],
			
 
				             if not tasks:
			
 
				                 logger.error("未获取到任何任务，程序退出")
			
 
				                 return
			
@@ -2318,8 +2326,10 @@ def main():
 
				             nums = 0
			
 
				 
			
 
				 
			
 
				-            # 2. 批量搜索+采集+保存
			
 
				-            for task_id,brand, keyword, company_id in tasks:
			
 
				+            # 2. 批量搜索+采集+保存；keyword 由 brand + name 拼接，例如「金活依马打正红花油」
			
 
				+            for task_id, brand, name, spec, company_id in tasks:
			
 
				+                keyword = brand + name
			
 
				+
			
 
				                 logger.info(f"\n=====================================")
			
 
				                 logger.info(f"开始处理任务 {task_id}，公司ID：{company_id}，关键词：{keyword}")
			
 
				                 logger.info(f"=====================================")
			
@@ -2366,7 +2376,7 @@ def main():
 
				                 store_page.wait_for_load_state('networkidle')
			
 
				 
			
 
				                 # 采集数据
			
 
				-                data_list = collect_data(store_page, brand, keyword, company_id)
			
 
				+                data_list = collect_data(store_page, brand, name, keyword, spec, company_id)
			
 
				                 real_count = len(data_list)
			
 
				                 success = True
			
 
				                 logger.info(f"关键词「{keyword}」采集完成，共 {real_count} 条数据")