|
@@ -9,9 +9,10 @@ from urllib.parse import quote
|
|
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
from DrissionPage import ChromiumPage, ChromiumOptions
|
|
|
import json
|
|
import json
|
|
|
from commons.Logger import get_spider_logger
|
|
from commons.Logger import get_spider_logger
|
|
|
-from commons.conn_mysql import MySQLPoolOnline
|
|
|
|
|
|
|
+from commons.conn_mysql import MySQLPoolOnline
|
|
|
from pipelines.drug_pipelines import DrugPipeline
|
|
from pipelines.drug_pipelines import DrugPipeline
|
|
|
from commons.feishu_webhook import send_text
|
|
from commons.feishu_webhook import send_text
|
|
|
|
|
+from spiders.jd.jd_captcha import handle_jd_slider_captcha
|
|
|
|
|
|
|
|
logger = get_spider_logger("jd")
|
|
logger = get_spider_logger("jd")
|
|
|
|
|
|
|
@@ -40,10 +41,14 @@ class JdCrawlerV2:
|
|
|
self.db = MySQLPoolOnline()
|
|
self.db = MySQLPoolOnline()
|
|
|
self.ip = None
|
|
self.ip = None
|
|
|
self.account_name = None
|
|
self.account_name = None
|
|
|
|
|
+ self.login_username = None
|
|
|
|
|
+ self.login_password = None
|
|
|
self.platform = 2
|
|
self.platform = 2
|
|
|
self.pipeline = DrugPipeline("jd")
|
|
self.pipeline = DrugPipeline("jd")
|
|
|
self.task_dict = drug_dict or {}
|
|
self.task_dict = drug_dict or {}
|
|
|
|
|
|
|
|
|
|
+ self.start_page = 1
|
|
|
|
|
+ self.end_page = 1
|
|
|
if self.task_dict:
|
|
if self.task_dict:
|
|
|
self.get_product_data()
|
|
self.get_product_data()
|
|
|
self.success = True
|
|
self.success = True
|
|
@@ -64,6 +69,16 @@ class JdCrawlerV2:
|
|
|
self.account_id = self.task_dict.get("collect_equipment_account_id", "")
|
|
self.account_id = self.task_dict.get("collect_equipment_account_id", "")
|
|
|
self.collect_region_id = self.task_dict.get("collect_region_id", "")
|
|
self.collect_region_id = self.task_dict.get("collect_region_id", "")
|
|
|
self.collect_round = self.task_dict.get("collect_round", 1)
|
|
self.collect_round = self.task_dict.get("collect_round", 1)
|
|
|
|
|
+ self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
|
|
|
|
|
+ self.end_page = self._parse_page(self.task_dict.get("end_page"), 15)
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
def _parse_page(value, default=1):
    """Coerce a task-supplied page number to a positive integer.

    Args:
        value: Raw page value, e.g. an int, a numeric string or None.
        default: Value returned when ``value`` cannot be used.

    Returns:
        ``int(value)`` when the conversion succeeds and the result is >= 1,
        otherwise ``default``.
    """
    try:
        page = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); treat it like any other
        # unusable value instead of letting it escape to the caller.
        return default
    return page if page >= 1 else default
|
|
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def _get_free_port():
|
|
def _get_free_port():
|
|
@@ -92,7 +107,15 @@ class JdCrawlerV2:
|
|
|
co.set_argument(f"--proxy-server={proxy}")
|
|
co.set_argument(f"--proxy-server={proxy}")
|
|
|
logger.info("启动浏览器: account=%s, debug_port=%s", self.account_name, debug_port)
|
|
logger.info("启动浏览器: account=%s, debug_port=%s", self.account_name, debug_port)
|
|
|
self.driver = ChromiumPage(co)
|
|
self.driver = ChromiumPage(co)
|
|
|
|
|
+ self._listen_started = False
|
|
|
|
|
+
|
|
|
|
|
def _start_listen(self):
    """Start listening for search-API responses if not already listening.

    Listening is only started after login has finished, so that it does not
    interfere with the login page or the captcha drag. Does nothing when the
    listener was already started or when there is no browser.
    """
    if self._listen_started:
        return
    if not self.driver:
        return

    self.driver.listen.start("api?appid=search-pc-java")
    self._listen_started = True
    logger.info("已启动搜索接口监听")
|
|
|
|
|
|
|
|
def register_signal_handler(self):
|
|
def register_signal_handler(self):
|
|
|
def handler(signum, frame):
|
|
def handler(signum, frame):
|
|
@@ -184,118 +207,125 @@ class JdCrawlerV2:
|
|
|
return fp.get("estimatedPrice", "") or ""
|
|
return fp.get("estimatedPrice", "") or ""
|
|
|
return ""
|
|
return ""
|
|
|
|
|
|
|
|
def parse(self, ware_list):
    """Filter one batch of search results and store the matching items.

    An item is kept only when its title plus ``color`` text contains
    ``self.product``, contains ``self.brand`` (when a brand is set), does not
    contain ``+[`` and contains every comma-separated entry of
    ``self.product_keyword``. Each kept item is turned into a record and
    passed to ``self.pipeline.storge_data``.

    ``self.is_no_prodcut`` is incremented for every product/brand miss and
    reset to 0 as soon as an item passes the ``+[`` check.

    Args:
        ware_list: Iterable of item dicts (the ``wareList`` of a search
            response).
    """
    for w in ware_list:
        # ``or ""`` guards against explicit nulls in the payload; a None here
        # would raise TypeError and abort the rest of the batch.
        title = re.sub(r"<[^>]*>", "", w.get("wareName") or "").strip()
        color = w.get("color") or ""
        full_title = title + " " + color

        logger.info(full_title)

        if self.product not in full_title:
            self.is_no_prodcut += 1
            continue
        # An unset brand means "no brand filter"; ``None in str`` would raise.
        if self.brand and self.brand not in full_title:
            self.is_no_prodcut += 1
            continue

        # NOTE(review): when the spec text matches, product_name keeps the
        # title without the colour text; otherwise title + colour is stored.
        # Confirm this asymmetry is intended.
        if self.product_desc and self.product_desc in full_title:
            crawl_product_desc = self.product_desc
        else:
            crawl_product_desc = ""
            title = full_title

        # Titles containing "+[" are skipped without touching the miss counter.
        if "+[" in title:
            continue

        self.is_no_prodcut = 0
        if self.product_keyword:
            required_words = (kw.strip() for kw in self.product_keyword.split(","))
            if not all(word in title for word in required_words):
                continue

        logger.info("商品名:%s", title)
        sku_id = w.get("skuId", "")
        sales = w.get("totalSales", "")
        shop_id = w.get("shopId", "")
        shop_name = w.get("shopName", "")

        # Pack count: first "<digits>盒" or "<digits>瓶" in the text, else 1.
        heshu_m = re.search(r"(\d+)(盒|瓶)", full_title)
        heshu_count = int(heshu_m.group(1)) if heshu_m else 1

        # Prefer the estimated price; fall back to the listed JD price.
        final_price = self._estimated_price(w)
        jd_price = w.get("jdPrice", "")
        low_price = final_price if final_price else jd_price

        try:
            price = Decimal(str(low_price)).quantize(Decimal("0.00"))
        except (InvalidOperation, ValueError):
            price = Decimal("0.00")

        item_url = f"https://item.jd.com/{sku_id}.html"
        mall_url = f"https://mall.jd.com/index-{shop_id}.html?from=pc"

        # Fields are aligned with yaofangwang_crawl; the key order must match
        # commons.sql_data.RETRIEVE_SCRAPE_INSERT_COLUMNS.
        now_ts = time.strftime("%Y-%m-%d %H:%M:%S")
        product = {
            "platform": self.platform,
            "item_id": sku_id,
            "enterprise_id": self.company_id,
            "product_name": title,
            "spec": crawl_product_desc,
            "one_price": "",
            "detail_url": item_url,
            "shop_name": shop_name,
            "anonymous_store_name": "",
            "shop_url": mall_url,
            "city_name": "",
            "city_id": "",
            "province_name": "",
            "province_id": "",
            "shipment_city_name": "",
            "shipment_city_id": "",
            "shipment_province_name": "",
            "shipment_province_id": "",
            "area_info": "",
            "factory_name": "",
            "scrape_date": time.strftime("%Y-%m-%d"),
            "price": price,
            "sales": sales,
            "stock_count": "",
            "snapshot_url": "",
            "approval_num": "",
            "produced_time": "",
            "deadline": "",
            "update_time": now_ts,
            "insert_time": now_ts,
            "number": heshu_count,
            "product_brand": self.brand or "",
            "collect_task_id": self.collect_task_id,
            "search_name": self.product,
            "company_name": "",
            "collect_config_info": json.dumps(
                {
                    "sampling_cycle": self.sampling_cycle,
                    "sampling_start_time": self.sampling_start_time,
                    "sampling_end_time": self.sampling_end_time,
                }
            ),
            "account_id": self.account_id,
            "collect_region_id": self.collect_region_id,
            "collect_round": self.collect_round,
            "is_sold_out": 0,
        }

        # A failed write is logged and must not stop the remaining items.
        try:
            self.pipeline.storge_data(product)
            logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
        except Exception as e:
            logger.exception("写入数据库失败: %s", e)
|
|
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def _response_has_ware_list(data):
|
|
def _response_has_ware_list(data):
|
|
@@ -312,7 +342,7 @@ class JdCrawlerV2:
|
|
|
if not self._response_has_ware_list(data):
|
|
if not self._response_has_ware_list(data):
|
|
|
continue
|
|
continue
|
|
|
ware_list = data["data"]["wareList"]
|
|
ware_list = data["data"]["wareList"]
|
|
|
- self.parse(data)
|
|
|
|
|
|
|
+ self.parse(ware_list)
|
|
|
n += len(ware_list)
|
|
n += len(ware_list)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.warning("解析监听响应失败: %s", e)
|
|
logger.warning("解析监听响应失败: %s", e)
|
|
@@ -328,7 +358,7 @@ class JdCrawlerV2:
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.debug("清空监听缓冲失败: %s", e)
|
|
logger.debug("清空监听缓冲失败: %s", e)
|
|
|
|
|
|
|
|
- def collect_full_page_items(self, max_steps=20):
|
|
|
|
|
|
|
+ def collect_full_page_items(self, max_steps=10):
|
|
|
"""单次循环:边滑动边收数据,到底 / 看见「下一页」即停。"""
|
|
"""单次循环:边滑动边收数据,到底 / 看见「下一页」即停。"""
|
|
|
n = self.fetch_items_once(timeout=FETCH_TIMEOUT_FIRST)
|
|
n = self.fetch_items_once(timeout=FETCH_TIMEOUT_FIRST)
|
|
|
|
|
|
|
@@ -367,7 +397,7 @@ class JdCrawlerV2:
|
|
|
if random.random() < 0.15:
|
|
if random.random() < 0.15:
|
|
|
self.driver.run_js(f"window.scrollBy(0, -{random.randint(60, 140)})")
|
|
self.driver.run_js(f"window.scrollBy(0, -{random.randint(60, 140)})")
|
|
|
|
|
|
|
|
- self.sleep(3, 5)
|
|
|
|
|
|
|
+ self.sleep(0.5, 1.5)
|
|
|
|
|
|
|
|
if step % 3 == 2:
|
|
if step % 3 == 2:
|
|
|
n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
|
|
n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
|
|
@@ -380,37 +410,142 @@ class JdCrawlerV2:
|
|
|
return n, next_btn
|
|
return n, next_btn
|
|
|
|
|
|
|
|
def get_account(self):
    """Load the collection account assigned to this task.

    Queries ``retrieve_collect_equipment_account`` for the row whose ``id``
    equals ``self.account_id`` and whose ``status`` is 0, then copies its
    ``ip``, ``username``, ``phone`` and ``password`` values onto the instance.

    Returns:
        True when a matching row was found, False otherwise.
    """
    sql_account = """
        SELECT *
        FROM `retrieve_collect_equipment_account`
        WHERE `id` = %s
        and `status` = 0
    """
    account_list = self.db.select_data(sql_account, self.account_id)
    if not account_list:
        return False

    account_dict = account_list[0]
    # The row holds the login password: never dump it to stdout or the log.
    self.ip = account_dict.get("ip")
    self.account_name = account_dict.get("username")
    self.login_username = account_dict.get("phone", "")
    self.login_password = account_dict.get("password", "")
    logger.info("获取到账号: %s, ip: %s", self.account_name, self.ip)
    return True
|
|
|
|
|
|
|
|
def disable_account(self):
    """Set the current account's ``status`` to 1.

    ``get_account`` only selects rows whose ``status`` is 0, so this takes
    the account out of rotation.

    NOTE(review): the row is matched on the `name` column using
    ``self.account_name``, which ``get_account`` fills from the row's
    ``username`` value; confirm the table really has a `name` column.
    """
    disabled_status = 1
    statement = """ UPDATE `retrieve_collect_equipment_account` SET `status`= %s WHERE `name` = %s; """
    self.db.execute(statement, (disabled_status, self.account_name))
|
|
|
|
|
+
|
|
|
|
|
def _build_search_keyword(self):
    """Compose the search keyword from brand, product name and spec.

    Empty or missing parts are left out; the remaining ones are joined with
    single spaces. If the joined text is empty, ``self.product`` is returned
    unchanged.
    """
    pieces = []
    for candidate in (self.brand, self.product, self.product_desc):
        if candidate:
            pieces.append(candidate)

    keyword = " ".join(pieces).strip()
    if keyword:
        return keyword
    return self.product
|
|
|
|
|
+
|
|
|
|
|
def _is_logged_out(self):
    """Return True when the page shows a ``link-login`` element.

    The element is looked up with a 2-second timeout.
    """
    login_link = self.driver.ele("xpath=//*[@class='link-login']", timeout=2)
    return True if login_link else False
|
|
|
|
|
+
|
|
|
|
|
def perform_jd_login(self):
    """Log in to JD with username and password in the existing browser.

    Opens the passport login page, types ``self.login_username`` and
    ``self.login_password``, clicks the submit button and then runs the
    slider-captcha handler.

    Returns:
        True when every form element was found and the captcha handler
        reported success, False otherwise. Failures are reported through the
        spider logger rather than stdout.
    """
    username = self.login_username
    password = self.login_password
    login_url = "https://passport.jd.com/new/login.aspx"
    self.driver.get(login_url)

    input_name = self.driver.ele("xpath=//input[@id='loginname']", timeout=15)
    if not input_name:
        logger.warning("未找到用户名输入框")
        return False

    input_name.input(username)
    # Randomised pauses between the form steps.
    time.sleep(random.uniform(1.5, 2.5))

    input_pass = self.driver.ele("xpath://input[@name='nloginpwd']", timeout=5)
    if not input_pass:
        logger.warning("未找到密码输入框")
        return False

    input_pass.input(password)
    time.sleep(random.uniform(1.5, 2.5))

    login_btn = self.driver.ele("xpath://a[@id='loginsubmit']", timeout=5)
    if not login_btn:
        logger.warning("未找到登录按钮")
        return False
    login_btn.click()

    # Wait after submitting before handing over to the captcha handler.
    time.sleep(random.uniform(3, 5))

    if not handle_jd_slider_captcha(self.driver):
        logger.warning("滑块验证码未通过")
        return False

    return True
|
|
|
|
|
+
|
|
|
|
|
def _ensure_logged_in(self):
    """Run the automatic login flow (password + slider) when logged out.

    Returns:
        True when the session is already logged in, or when the login flow
        succeeded and the page no longer looks logged out; False otherwise.
    """
    if not self._is_logged_out():
        return True

    logger.info("检测到未登录,开始自动登录: %s", self.account_name)
    # The second check only runs when the login flow itself reported success.
    if not self.perform_jd_login() or self._is_logged_out():
        logger.error("自动登录失败: %s", self.account_name)
        return False

    logger.info("自动登录成功: %s", self.account_name)
    return True
|
|
|
|
|
+
|
|
|
|
|
def _check_page_blocked(self):
    """Report whether the current page shows the "too frequent" search block.

    When the block message is present in the page HTML, an error is logged
    and ``self.success`` is set to False.

    Returns:
        True when the block message is present, False otherwise.
    """
    page_source = self.driver.html or ""
    blocked = "抱歉由于访问频繁导致无法搜索" in page_source
    if blocked:
        logger.error("账号无法搜索(访问频繁)")
        self.success = False
    return blocked
|
|
|
|
|
+
|
|
|
|
|
def _jump_to_page(self, target_page):
    """Jump to a result page through the pagination "go to page" input.

    The listen buffer is cleared before typing the page number and again
    after the jump, so responses left over from before the jump are dropped.

    Args:
        target_page: Page number to type into the input box.

    Returns:
        True when the input box was found and the jump was issued, False
        when the input box is missing.
    """
    page_box = self.driver.ele(
        "xpath=//div[contains(@class,'_pagination_toPageNum_')]//input[@type='text']",
        timeout=3,
    )
    if page_box:
        self.clear_listen_buffer()
        page_box.input(str(target_page))
        self.sleep(1, 2)
        # Press and release Enter to submit the page number.
        self.driver.actions.key_down("enter").key_up("enter")
        self.sleep(3, 5)
        self.clear_listen_buffer()
        logger.info("已跳转到第 %s 页", target_page)
        return True

    logger.warning("未找到跳页输入框,无法跳转到第 %s 页", target_page)
    return False
|
|
|
|
|
+
|
|
|
|
|
def _go_next_page(self, next_btn):
    """Click the "next page" button, clearing the listen buffer around it.

    Args:
        next_btn: The page element to click.

    Returns:
        True when the click succeeded, False when it failed.
    """
    self.clear_listen_buffer()
    clicked = self._human_click(next_btn)
    if clicked:
        self.sleep(2, 4)
        # Clear the buffer again once the wait after the click is over.
        self.clear_listen_buffer()
        return True

    logger.warning("点击下一页失败")
    return False
|
|
|
|
|
|
|
|
def crawl(self):
|
|
def crawl(self):
|
|
|
total = 0
|
|
total = 0
|
|
|
- keyword = self.product
|
|
|
|
|
- if self.brand:
|
|
|
|
|
- keyword = self.brand + "" + self.product
|
|
|
|
|
- if self.product_desc:
|
|
|
|
|
- keyword = keyword + " " + self.product_desc
|
|
|
|
|
|
|
+ keyword = self._build_search_keyword()
|
|
|
|
|
+
|
|
|
self.driver.get("https://www.jd.com/", timeout=15)
|
|
self.driver.get("https://www.jd.com/", timeout=15)
|
|
|
time.sleep(15)
|
|
time.sleep(15)
|
|
|
- # 判端是否登录
|
|
|
|
|
- link_login = self.driver.ele("xpath=//*[@class='link-login']")
|
|
|
|
|
- if link_login:
|
|
|
|
|
- self.disable_account()
|
|
|
|
|
- send_text(f"京东:{self.account_name}账号非登录状态")
|
|
|
|
|
- self.is_success = False
|
|
|
|
|
- logger.error(f"{self.account_name}账号非登录状态")
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if self._is_logged_out():
|
|
|
|
|
+ if not self.login_password or not self.login_username:
|
|
|
|
|
+ return
|
|
|
|
|
+ if not self._ensure_logged_in():
|
|
|
|
|
+ self.disable_account()
|
|
|
|
|
+ send_text(f"京东:{self.account_name}账号登录失败")
|
|
|
|
|
+ self.success = False
|
|
|
|
|
+ return
|
|
|
|
|
+ self.driver.get("https://www.jd.com/", timeout=15)
|
|
|
|
|
+ self.sleep(3, 5)
|
|
|
|
|
|
|
|
kw = quote(str(keyword or ""), safe="")
|
|
kw = quote(str(keyword or ""), safe="")
|
|
|
self.driver.get(
|
|
self.driver.get(
|
|
@@ -418,47 +553,79 @@ class JdCrawlerV2:
|
|
|
)
|
|
)
|
|
|
self.sleep(5, 8)
|
|
self.sleep(5, 8)
|
|
|
|
|
|
|
|
- for page in range(1, 11):
|
|
|
|
|
- if "抱歉由于访问频繁导致无法搜索" in self.driver.html:
|
|
|
|
|
- logger.error("账号无法搜索")
|
|
|
|
|
|
|
+ if self._check_page_blocked():
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
|
|
+ if not handle_jd_slider_captcha(self.driver, pause_listen=False):
|
|
|
|
|
+ logger.warning("进入搜索页后滑块验证码处理失败")
|
|
|
|
|
+ self.success = False
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
|
|
+ self._start_listen()
|
|
|
|
|
+
|
|
|
|
|
+ if self.start_page > 1:
|
|
|
|
|
+ if not self._jump_to_page(self.start_page):
|
|
|
|
|
+ logger.warning("跳页失败,将从第 1 页开始采集")
|
|
|
|
|
+ self.start_page = 1
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ "采集页码范围: %s ~ %s(共 %s 页)",
|
|
|
|
|
+ self.start_page,
|
|
|
|
|
+ self.end_page,
|
|
|
|
|
+ self.end_page - self.start_page + 1,
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ for page_no in range(self.start_page, self.end_page + 1):
|
|
|
|
|
+ if self._is_logged_out():
|
|
|
|
|
+ if not self._ensure_logged_in():
|
|
|
|
|
+ self.success = False
|
|
|
|
|
+ break
|
|
|
|
|
+ self.driver.get(
|
|
|
|
|
+ f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}",
|
|
|
|
|
+ timeout=15,
|
|
|
|
|
+ )
|
|
|
|
|
+ self.sleep(3, 5)
|
|
|
|
|
+ if page_no > 1:
|
|
|
|
|
+ self._jump_to_page(page_no)
|
|
|
|
|
+
|
|
|
|
|
+ if not handle_jd_slider_captcha(self.driver, pause_listen=True):
|
|
|
|
|
+ logger.warning("滑块验证码处理失败,停止采集")
|
|
|
self.success = False
|
|
self.success = False
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
- if "cfe.m.jd.com/privatedomain" in self.driver.url:
|
|
|
|
|
- self.disable_account()
|
|
|
|
|
- logger.error("账号出现验证码,暂时禁用")
|
|
|
|
|
- self.success = False
|
|
|
|
|
|
|
+ if self._check_page_blocked():
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
- logger.info(f"===== 第 {page} 页 =====")
|
|
|
|
|
- time.sleep(random.uniform(3, 5))
|
|
|
|
|
|
|
+ logger.info("===== 正在爬取第 %s 页 =====", page_no)
|
|
|
|
|
+ search_ele = self.driver.ele("xpath=//div[@id='search-condition']", timeout=10)
|
|
|
|
|
+ if not search_ele:
|
|
|
|
|
+ logger.warning("未找到搜索结果区域,停止采集")
|
|
|
|
|
+ break
|
|
|
|
|
|
|
|
- page_n, next_btn = self.collect_full_page_items()
|
|
|
|
|
- self.sleep(3, 5)
|
|
|
|
|
- logger.info(f"本页监听商品条数(含可能重复): {page_n}")
|
|
|
|
|
|
|
+ page_n, _ = self.collect_full_page_items()
|
|
|
|
|
+ logger.info("本页监听商品条数(含可能重复): %s", page_n)
|
|
|
total += page_n
|
|
total += page_n
|
|
|
- logger.info(f"累计监听条数: {total}")
|
|
|
|
|
|
|
+ logger.info("累计监听条数: %s", total)
|
|
|
|
|
|
|
|
- if not next_btn:
|
|
|
|
|
- next_btn = self.driver.ele("text=下一页")
|
|
|
|
|
|
|
+ if self.is_no_prodcut > 20:
|
|
|
|
|
+ logger.info("连续无匹配商品过多,停止采集")
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ if page_no >= self.end_page:
|
|
|
|
|
+ break
|
|
|
|
|
|
|
|
|
|
+ next_btn = self.driver.ele("text=下一页", timeout=2)
|
|
|
if not next_btn:
|
|
if not next_btn:
|
|
|
logger.info("没有下一页(未找到)")
|
|
logger.info("没有下一页(未找到)")
|
|
|
break
|
|
break
|
|
|
-
|
|
|
|
|
cls_str = next_btn.attr("class") or ""
|
|
cls_str = next_btn.attr("class") or ""
|
|
|
if "disabled" in cls_str:
|
|
if "disabled" in cls_str:
|
|
|
logger.info("没有下一页(已禁用)")
|
|
logger.info("没有下一页(已禁用)")
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
- self.clear_listen_buffer(
|
|
|
|
|
- rounds=LISTEN_CLEAR_ROUNDS, timeout=LISTEN_CLEAR_TIMEOUT
|
|
|
|
|
- )
|
|
|
|
|
- if self.is_no_prodcut > 20:
|
|
|
|
|
|
|
+ if not self._go_next_page(next_btn):
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
- self._human_click(next_btn)
|
|
|
|
|
-
|
|
|
|
|
def run(self):
|
|
def run(self):
|
|
|
# 检测账号
|
|
# 检测账号
|
|
|
if not self.get_account():
|
|
if not self.get_account():
|
|
@@ -467,9 +634,9 @@ class JdCrawlerV2:
|
|
|
return self.pipeline.crawl_count, self.success
|
|
return self.pipeline.crawl_count, self.success
|
|
|
logger.info("获取到账号:%s,代理ip:%s", self.account_name, self.ip)
|
|
logger.info("获取到账号:%s,代理ip:%s", self.account_name, self.ip)
|
|
|
|
|
|
|
|
- # 每次选取账号,立马账号使用时间
|
|
|
|
|
- update_sql = f""" UPDATE `accounts_platform` SET `status`= %s, `cookie_timestamp`= %s WHERE `name` = %s; """
|
|
|
|
|
- self.db.execute(update_sql, (1, int(time.time()), self.account_name))
|
|
|
|
|
|
|
+ # # # 每次选取账号,立马账号使用时间
|
|
|
|
|
+ update_sql = f""" UPDATE `retrieve_collect_equipment_account` SET `status`= %s, `update_time`= %s WHERE `name` = %s; """
|
|
|
|
|
+ self.db.execute(update_sql, (0, int(time.time()), self.account_name))
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
self.init_browser()
|
|
self.init_browser()
|