|
|
@@ -1478,7 +1478,7 @@ def extract_province_city(address):
|
|
|
|
|
|
|
|
|
#采集数据核心
|
|
|
-def collect_data(store_page, brand, keyword, company_id):
|
|
|
+def collect_data(store_page, brand, name, keyword, spec, company_id):
|
|
|
"""
|
|
|
1) 先获取当前页商品个数(count)
|
|
|
2) 按循环次数采集;每循环15次滚动一次 slow_scroll_1200px
|
|
|
@@ -1579,6 +1579,8 @@ def collect_data(store_page, brand, keyword, company_id):
|
|
|
logger.info(f"✅ 提取到data-product-id:{product_id}") # 输出:5678955
|
|
|
else:
|
|
|
logger.warning("⚠️ 未找到商品ID,使用默认空字符串")
|
|
|
+
|
|
|
+
|
|
|
#3、 提取商品标题(处理空值)
|
|
|
product_locator = item.locator(PRODUCT_TITLE_SELECTOR)
|
|
|
if product_locator.count() > 0:
|
|
|
@@ -1588,6 +1590,12 @@ def collect_data(store_page, brand, keyword, company_id):
|
|
|
logger.warning(f" 「{keyword}」第{collected_count}个商品 - 列表页标题元素未找到,使用默认值:{title}")
|
|
|
|
|
|
|
|
|
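+ # Skip items whose title contains none of brand / name / spec. NOTE(review): the test below joins the checks with "and", so a single match keeps the item — confirm "or" was not intended.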
|
|
|
+ if brand not in title and name not in title and spec not in title:
|
|
|
+ logger.warning(f" 「{keyword}」第{collected_count}个商品 - 标题「{title}」不包含品牌「{brand}」、名称「{name}」、规格「{spec}」,跳过本次循环")
|
|
|
+ continue
|
|
|
+
|
|
|
+
|
|
|
#关键词不在标题中,跳过当前商品
|
|
|
# core_keyword = re.sub(r'^999[\s\(\)()、·]*', '', keyword)
|
|
|
# if core_keyword not in title:
|
|
|
@@ -2309,7 +2317,7 @@ def main():
|
|
|
# # [12,'金力舒阿莫西林克拉维酸钾片']
|
|
|
# ]
|
|
|
tasks = get_search_keywords_from_db()
|
|
|
- # tasks = [1, '金活','金活依马打正红花油',3],
|
|
|
+ # tasks = [1, '金活','依马打正红花油','25ml',3],
|
|
|
if not tasks:
|
|
|
logger.error("未获取到任何任务,程序退出")
|
|
|
return
|
|
|
@@ -2318,8 +2326,10 @@ def main():
|
|
|
nums = 0
|
|
|
|
|
|
|
|
|
- # 2. 批量搜索+采集+保存
|
|
|
- for task_id,brand, keyword, company_id in tasks:
|
|
|
+ # 2. Batch search + collect + save; the search keyword is brand + name, e.g. 金活依马打正红花油
|
|
|
+ for task_id, brand, name, spec, company_id in tasks:
|
|
|
+ keyword = brand + name
|
|
|
+
|
|
|
logger.info(f"\n=====================================")
|
|
|
logger.info(f"开始处理任务 {task_id},公司ID:{company_id},关键词:{keyword}")
|
|
|
logger.info(f"=====================================")
|
|
|
@@ -2366,7 +2376,7 @@ def main():
|
|
|
store_page.wait_for_load_state('networkidle')
|
|
|
|
|
|
# 采集数据
|
|
|
- data_list = collect_data(store_page, brand, keyword, company_id)
|
|
|
+ data_list = collect_data(store_page, brand, name, keyword, spec, company_id)
|
|
|
real_count = len(data_list)
|
|
|
success = True
|
|
|
logger.info(f"关键词「{keyword}」采集完成,共 {real_count} 条数据")
|