소스 검색

添加筛选非想要的品牌、名称、品规

feelsocode 1 주 전
부모
커밋
edaaeceb4a
2개의 파일 변경, 19개 추가, 10개 삭제
  1. 4 5
      config.py
  2. 15 5
      main.py

+ 4 - 5
config.py

@@ -97,7 +97,7 @@ def fuzzy_match_product_url_in_db_mysql(product_url):
 
 # ==================== 从数据库提取任务 ====================
 def get_search_keywords_from_db(platform: int = 9):
-    """读取待执行任务(status=1),返回 [(task_id, brand, keyword, company_id), ...]。"""
+    """读取待执行任务(status=1),返回 [(task_id, brand, name, specs, company_id), ...]。"""
     keywords = []
     conn = None
     cursor = None
@@ -121,12 +121,11 @@ def get_search_keywords_from_db(platform: int = 9):
             task_id = row[0]
             brand = (row[1] or '').strip()
             name = (row[2] or '').strip()
+            specs = (row[3] or '').strip()
             company_id = row[4] if row[4] is not None else 0
 
-            parts = [p for p in [brand, name] if p]
-            if parts:
-                keyword = ''.join(parts)
-                keywords.append((task_id, brand, keyword, company_id))
+
+            keywords.append((task_id, brand, name, specs, company_id))
 
         logger.debug(f"读取待执行任务完成,platform={platform},数量={len(keywords)}")
     except Exception as e:

+ 15 - 5
main.py

@@ -1478,7 +1478,7 @@ def extract_province_city(address):
 
 
 #采集数据核心
-def collect_data(store_page, brand, keyword, company_id):
+def collect_data(store_page, brand, name, keyword, spec, company_id):
     """
     1) 先获取当前页商品个数(count)
     2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
@@ -1579,6 +1579,8 @@ def collect_data(store_page, brand, keyword, company_id):
                     logger.info(f"✅ 提取到data-product-id:{product_id}")  # 输出:5678955
                 else:
                     logger.warning("⚠️ 未找到商品ID,使用默认空字符串")
+
+
                 #3、 提取商品标题(处理空值)
                 product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
                 if product_locator.count() > 0:
@@ -1588,6 +1590,12 @@ def collect_data(store_page, brand, keyword, company_id):
                     logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
 
 
+                #筛选非想要的品牌、名称、品规等等。
+                if brand not in title and name not in title and spec not in title:
+                    logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含品牌「{brand}」、名称「{name}」、规格「{spec}」,跳过本次循环")
+                    continue
+
+
                 #关键词不在标题中,跳过当前商品
                 # core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
                 # if core_keyword not in title:
@@ -2309,7 +2317,7 @@ def main():
             #     # [12,'金力舒阿莫西林克拉维酸钾片']
             # ]
             tasks = get_search_keywords_from_db()
-            # tasks = [1, '金活','金活依马打正红花油',3],
+            # tasks = [1, '金活','依马打正红花油','25ml',3],
             if not tasks:
                 logger.error("未获取到任何任务,程序退出")
                 return
@@ -2318,8 +2326,10 @@ def main():
             nums = 0
 
 
-            # 2. 批量搜索+采集+保存
-            for task_id,brand, keyword, company_id in tasks:
+            # 2. 批量搜索+采集+保存   keywords是金活依马打正红花油
+            for task_id, brand, name, spec, company_id in tasks:
+                keyword = brand + name
+
                 logger.info(f"\n=====================================")
                 logger.info(f"开始处理任务 {task_id},公司ID:{company_id},关键词:{keyword}")
                 logger.info(f"=====================================")
@@ -2366,7 +2376,7 @@ def main():
                 store_page.wait_for_load_state('networkidle')
 
                 # 采集数据
-                data_list = collect_data(store_page, brand, keyword, company_id)
+                data_list = collect_data(store_page, brand, name, keyword, spec, company_id)
                 real_count = len(data_list)
                 success = True
                 logger.info(f"关键词「{keyword}」采集完成,共 {real_count} 条数据")