1 hete · d235607eb3
--- a/spiders/jd/jd_auto_crawl.py
+++ b/spiders/jd/jd_auto_crawl.py
@@ -223,24 +223,6 @@ class JdCrawlerV2:
 
				         else:
			
 
				             return 1
			
 
				 
			
 
				-    def _take_snapshot(self, upload_key, ele):
			
 
				-        """在指定标签页截图并上传。"""
			
 
				-        time.sleep(1)
			
 
				-        try:
			
 
				-            jpg_bytes = ele.get_screenshot(as_bytes="jpg")
			
 
				-            if not jpg_bytes:
			
 
				-                logger.warning("截图为空 upload_key=%s", upload_key)
			
 
				-                return ""
			
 
				-            img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
			
 
				-        except Exception:
			
 
				-            logger.exception("截图或 OSS 上传失败 upload_key=%s", upload_key)
			
 
				-            return ""
			
 
				-        if not img_url:
			
 
				-            logger.warning("OSS 未返回有效地址 upload_key=%s", upload_key)
			
 
				-            return ""
			
 
				-        logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
			
 
				-        time.sleep(random.uniform(1, 2))
			
 
				-        return img_url
			
 
				 
			
 
				     def get_heshu(self,full_title):
			
 
				         last_box = None
			
@@ -312,7 +294,7 @@ class JdCrawlerV2:
 
				             ele_xpath = "//div[@id='main_search_conter']//div[contains(@class,'_goodsContainer_')]/div[@data-sku=" + "'" + sku_id + "'" + "]"
			
 
				             ele_screen = self.driver.ele("xpath="+ele_xpath)
			
 
				             upload_key = hashlib.md5(item_url.encode("utf-8")).hexdigest()
			
 
				-            snap_url = self._take_snapshot(upload_key,ele_screen)
			
 
				+            snap_url = ""
			
 
				 
			
 
				             try:
			
 
				                 price = Decimal(str(low_price)).quantize(Decimal("0.00"))
			
--- a/spiders/jd/jd_auto_crawl_snap.py
+++ b/spiders/jd/jd_auto_crawl_snap.py
@@ -0,0 +1,718 @@
 
				+import random
			
 
				+import re
			
 
				+import signal
			
 
				+import socket
			
 
				+import sys
			
 
				+import time
			
 
				+from decimal import Decimal, InvalidOperation
			
 
				+from urllib.parse import quote
			
 
				+from DrissionPage import ChromiumPage, ChromiumOptions
			
 
				+import json
			
 
				+import hashlib
			
 
				+from commons.Logger import get_spider_logger
			
 
				+from commons.conn_mysql import MySQLPoolOnline
			
 
				+from pipelines.drug_pipelines import DrugPipeline
			
 
				+from commons.feishu_webhook import send_text
			
 
				+from spiders.jd.jd_captcha import handle_jd_slider_captcha
			
 
				+from oss_upload.oss_upload import AliyunOSSUploader
			
 
				+
			
 
				+logger = get_spider_logger("jd")
			
 
				+
			
 
				+chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
			
 
				+
			
 
				+FETCH_TIMEOUT_FIRST = 5
			
 
				+FETCH_TIMEOUT_SCROLL = 6
			
 
				+LISTEN_CLEAR_ROUNDS = 3
			
 
				+LISTEN_CLEAR_TIMEOUT = 0.45
			
 
				+
			
 
				+# 「下一页」是否在视口内（条件略宽）
			
 
				+_JS_NEXT_BTN_IN_VIEWPORT = """
			
 
				+var el = arguments[0];
			
 
				+if (!el) return false;
			
 
				+var r = el.getBoundingClientRect();
			
 
				+var h = window.innerHeight || document.documentElement.clientHeight || 800;
			
 
				+var w = window.innerWidth || document.documentElement.clientWidth || 1200;
			
 
				+return r.bottom > 80 && r.top < h - 40 && r.right > 0 && r.left < w;
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+class JdCrawlerV2:
			
 
				+    def __init__(self, drug_dict=None):
			
 
				+        self.driver = None
			
 
				+        self.register_signal_handler()
			
 
				+        self.db = MySQLPoolOnline()
			
 
				+        self.ip = None
			
 
				+        self.account_name = None
			
 
				+        self.login_username = None
			
 
				+        self.login_password = None
			
 
				+        self.platform = 2
			
 
				+        self.pipeline = DrugPipeline("jd")
			
 
				+        self.task_dict = drug_dict or {}
			
 
				+        self.ossuploader = AliyunOSSUploader()
			
 
				+        self.start_page = 1
			
 
				+        self.end_page = 1
			
 
				+        if self.task_dict:
			
 
				+            self.get_product_data()
			
 
				+        self.success = True
			
 
				+        self.is_no_prodcut = 0
			
 
				+
			
 
				+    def get_product_data(self):
			
 
				+        self.task_id = self.task_dict["id"]
			
 
				+        self.company_id = self.task_dict["company_id"]
			
 
				+        self.product = self.task_dict["product_name"]
			
 
				+        self.product_desc = self.task_dict.get("product_specs", "")
			
 
				+        self.brand = self.task_dict.get("product_brand", "")
			
 
				+        self.product_keyword = self.task_dict.get("product_keyword", "")
			
 
				+        self.collect_task_id = self.task_dict.get("collect_task_id", "")
			
 
				+        self.sampling_cycle = self.task_dict.get("sampling_cycle", "")
			
 
				+        self.sampling_start_time = self.task_dict.get("sampling_start_time", "")
			
 
				+        self.sampling_end_time = self.task_dict.get("sampling_end_time", "")
			
 
				+        self.collect_equipment_id = self.task_dict.get("collect_equipment_id", "")
			
 
				+        self.account_id = self.task_dict.get("collect_equipment_account_id", "")
			
 
				+        self.collect_region_id = self.task_dict.get("collect_region_id", "")
			
 
				+        self.collect_round = self.task_dict.get("collect_round", 1)
			
 
				+        self.start_page = self._parse_page(self.task_dict.get("start_page"), 1)
			
 
				+        self.end_page = self._parse_page(self.task_dict.get("end_page"), 20)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _parse_page(value, default=1):
			
 
				+        try:
			
 
				+            page = int(value)
			
 
				+            return page if page >= 1 else default
			
 
				+        except (TypeError, ValueError):
			
 
				+            return default
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _get_free_port():
			
 
				+        """获取一个当前可用的本地端口，供 Chrome 调试使用。"""
			
 
				+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
			
 
				+            s.bind(("127.0.0.1", 0))
			
 
				+            return s.getsockname()[1]
			
 
				+
			
 
				+    def init_browser(self):
			
 
				+        co = ChromiumOptions().set_browser_path(chrome_path)
			
 
				+        debug_port = self._get_free_port()
			
 
				+        co.set_user_data_path(f"./spiders/jd/{self.account_name}")
			
 
				+
			
 
				+        co.set_local_port(debug_port)
			
 
				+        co.set_argument(f"--remote-debugging-port={debug_port}")
			
 
				+        co.set_argument("--remote-debugging-address=127.0.0.1")
			
 
				+        # co.set_argument("--disable-blink-features=AutomationControlled")
			
 
				+        co.set_argument("--disable-dev-shm-usage")
			
 
				+        co.set_argument("--no-first-run")  # 避免首次运行弹窗
			
 
				+        co.set_argument("--no-default-browser-check")  # 避免默认浏览器检查
			
 
				+
			
 
				+        if self.ip:
			
 
				+            proxy = self.ip.strip()
			
 
				+            if not proxy.startswith(("http://", "https://")):
			
 
				+                proxy = f"http://{proxy}"
			
 
				+            co.set_argument(f"--proxy-server={proxy}")
			
 
				+        logger.info("启动浏览器: account=%s, debug_port=%s", self.account_name, debug_port)
			
 
				+        self.driver = ChromiumPage(co)
			
 
				+        self._listen_started = False
			
 
				+
			
 
				+    def _start_listen(self):
			
 
				+        """登录完成后再开监听，避免干扰登录页/验证码拖动。"""
			
 
				+        if self._listen_started or not self.driver:
			
 
				+            return
			
 
				+        self.driver.listen.start("api?appid=search-pc-java")
			
 
				+        self._listen_started = True
			
 
				+        logger.info("已启动搜索接口监听")
			
 
				+
			
 
				+    def register_signal_handler(self):
			
 
				+        def handler(signum, frame):
			
 
				+            print("\n⚠️ 程序退出")
			
 
				+            if self.driver:
			
 
				+                self.driver.quit()
			
 
				+            sys.exit(0)
			
 
				+
			
 
				+        signal.signal(signal.SIGINT, handler)
			
 
				+        if hasattr(signal, "SIGTERM"):
			
 
				+            signal.signal(signal.SIGTERM, handler)
			
 
				+
			
 
				+    def sleep(self, a, b):
			
 
				+        time.sleep(random.uniform(a, b))
			
 
				+
			
 
				+    def _scroll_page_down(self, delta=900):
			
 
				+        self.driver.run_js(f"window.scrollBy(0, {int(delta)});")
			
 
				+        time.sleep(random.uniform(0.3, 0.6))
			
 
				+
			
 
				+    def _scroll_next_into_view(self, el):
			
 
				+        if not el:
			
 
				+            return
			
 
				+        try:
			
 
				+            self.driver.run_js(
			
 
				+                "arguments[0].scrollIntoView({block:'center',behavior:'instant'});",
			
 
				+                el,
			
 
				+            )
			
 
				+            self.sleep(1, 2)
			
 
				+        except Exception as e:
			
 
				+            logger.warning("滚动到下一页按钮失败: %s", e)
			
 
				+            try:
			
 
				+                el.scroll.to_see()
			
 
				+            except Exception:
			
 
				+                pass
			
 
				+
			
 
				+    def _get_scroll_info(self):
			
 
				+        return self.driver.run_js("""
			
 
				+            return {
			
 
				+                scrollY: window.scrollY || window.pageYOffset || 0,
			
 
				+                docH: Math.max(document.body.scrollHeight,
			
 
				+                               document.documentElement.scrollHeight,
			
 
				+                               document.body.offsetHeight),
			
 
				+                viewH: window.innerHeight || document.documentElement.clientHeight || 800
			
 
				+            };
			
 
				+        """)
			
 
				+
			
 
				+    def _find_next_btn(self, timeout=0.3):
			
 
				+        try:
			
 
				+            return self.driver.ele("text=下一页", timeout=timeout)
			
 
				+        except Exception:
			
 
				+            return None
			
 
				+
			
 
				+    def _is_next_btn_visible(self, btn):
			
 
				+        if not btn:
			
 
				+            return False
			
 
				+        try:
			
 
				+            return bool(self.driver.run_js(_JS_NEXT_BTN_IN_VIEWPORT, btn))
			
 
				+        except Exception:
			
 
				+            return False
			
 
				+
			
 
				+    def _human_click(self, element):
			
 
				+        """在目标节点上触发 click，避免 move_to + 无目标 actions.click() 因布局位移点到商品链接触发详情页。"""
			
 
				+        if not element:
			
 
				+            return False
			
 
				+        try:
			
 
				+            self.sleep(0.8, 2.0)
			
 
				+            try:
			
 
				+                self.driver.run_js(
			
 
				+                    "arguments[0].scrollIntoView({block:'center',behavior:'instant'});",
			
 
				+                    element,
			
 
				+                )
			
 
				+            except Exception:
			
 
				+                pass
			
 
				+            self.sleep(0.2, 0.6)
			
 
				+            self.driver.run_js("arguments[0].click();", element)
			
 
				+            return True
			
 
				+        except Exception as e:
			
 
				+            logger.warning("点击失败: %s", e)
			
 
				+            try:
			
 
				+                element.click()
			
 
				+                return True
			
 
				+            except Exception:
			
 
				+                return False
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _estimated_price(json_data):
			
 
				+        fp = json_data.get("finalPrice")
			
 
				+        if isinstance(fp, dict):
			
 
				+            return fp.get("estimatedPrice", "") or ""
			
 
				+        return ""
			
 
				+
			
 
				+    def get_heshu(self, full_title):
			
 
				+        last_box = None
			
 
				+        last_bottle = None
			
 
				+        for match in re.finditer(r"(\d+)(盒|瓶)", full_title):
			
 
				+            if match.group(2) == '盒':
			
 
				+                last_box = match
			
 
				+            else:  # 瓶
			
 
				+                last_bottle = match
			
 
				+        if last_box:
			
 
				+            return int(last_box.group(1))
			
 
				+        elif last_bottle:
			
 
				+            return int(last_bottle.group(1))
			
 
				+        else:
			
 
				+            return 1
			
 
				+
			
 
				+    def _take_snapshot(self, upload_key, image_ele, max_retries=3):
			
 
				+        """在指定标签页截图并上传。"""
			
 
				+        for attempt in range(1, max_retries + 1):
			
 
				+            time.sleep(1)
			
 
				+            try:
			
 
				+                jpg_bytes = image_ele.get_screenshot(as_bytes="jpg")
			
 
				+                if not jpg_bytes:
			
 
				+                    logger.warning(
			
 
				+                        "截图为空 upload_key=%s attempt=%s/%s",
			
 
				+                        upload_key, attempt, max_retries, )
			
 
				+                    continue
			
 
				+                img_url = self.ossuploader.upload_from_bytes(jpg_bytes, str(upload_key))
			
 
				+            except Exception:
			
 
				+                logger.exception(
			
 
				+                    "截图或 OSS 上传失败 upload_key=%s attempt=%s/%s",
			
 
				+                    upload_key, attempt, max_retries,
			
 
				+                )
			
 
				+                continue
			
 
				+            if not img_url:
			
 
				+                logger.warning(
			
 
				+                    "OSS 未返回有效地址 upload_key=%s attempt=%s/%s",
			
 
				+                    upload_key, attempt, max_retries,
			
 
				+                )
			
 
				+                continue
			
 
				+            logger.info("截图上传完成 upload_key=%s url=%s", upload_key, img_url)
			
 
				+            time.sleep(random.uniform(1, 2))
			
 
				+            return img_url
			
 
				+        logger.warning("截图失败，已达最大重试次数 upload_key=%s", upload_key)
			
 
				+        return ""
			
 
				+
			
 
				+    def get_heshu(self, full_title):
			
 
				+        last_box = None
			
 
				+        last_bottle = None
			
 
				+        for match in re.finditer(r"(\d+)(盒|瓶)", full_title):
			
 
				+            if match.group(2) == '盒':
			
 
				+                last_box = match
			
 
				+            else:  # 瓶
			
 
				+                last_bottle = match
			
 
				+        if last_box:
			
 
				+            return int(last_box.group(1))
			
 
				+        elif last_bottle:
			
 
				+            return int(last_bottle.group(1))
			
 
				+        else:
			
 
				+            return 1
			
 
				+
			
 
				+    def parse(self, ware_list):
			
 
				+
			
 
				+        for w in ware_list:
			
 
				+            title = w.get("wareName", "")
			
 
				+
			
 
				+            title = re.sub(r"<[^>]*>", "", title).strip()
			
 
				+            color = w.get("color", "")
			
 
				+            full_title = title + " " + color
			
 
				+
			
 
				+            logger.info(full_title)
			
 
				+
			
 
				+            if self.product not in full_title:
			
 
				+                self.is_no_prodcut += 1
			
 
				+                continue
			
 
				+            if self.brand not in full_title:
			
 
				+                self.is_no_prodcut += 1
			
 
				+                continue
			
 
				+            if self.product_desc:
			
 
				+                if self.product_desc in full_title:
			
 
				+                    crawl_product_desc = self.product_desc
			
 
				+                else:
			
 
				+                    crawl_product_desc = ""
			
 
				+                    title = full_title
			
 
				+            else:
			
 
				+                crawl_product_desc = ""
			
 
				+                title = full_title
			
 
				+
			
 
				+            if "+[" in title:
			
 
				+                continue
			
 
				+
			
 
				+            self.is_no_prodcut = 0
			
 
				+            status = 1
			
 
				+            if self.product_keyword:
			
 
				+                search_keyword_list = self.product_keyword.split(",")
			
 
				+                for search_keyword in search_keyword_list:
			
 
				+                    if search_keyword.strip() not in title:
			
 
				+                        status = 0
			
 
				+            if status == 0:
			
 
				+                continue
			
 
				+
			
 
				+            logger.info(f"商品名：{title}")
			
 
				+            sku_id = w.get("skuId", "")
			
 
				+            sales = w.get("totalSales", "")
			
 
				+            shop_id = w.get("shopId", "")
			
 
				+            shop_name = w.get("shopName", "")
			
 
				+            heshu_count = self.get_heshu(full_title)
			
 
				+            final_price = self._estimated_price(w)
			
 
				+            jd_price = w.get("jdPrice", "")
			
 
				+            item_url = f"https://item.jd.com/{sku_id}.html"
			
 
				+            low_price = final_price if final_price else jd_price
			
 
				+
			
 
				+            # 获取列表页快照
			
 
				+            ele_xpath = "//div[@id='main_search_conter']//div[contains(@class,'_goodsContainer_')]/div[@data-sku=" + "'" + sku_id + "'" + "]"
			
 
				+            ele_screen = self.driver.ele("xpath=" + ele_xpath)
			
 
				+            upload_key = hashlib.md5(item_url.encode("utf-8")).hexdigest()
			
 
				+            snap_url = self._take_snapshot(upload_key, ele_screen)
			
 
				+
			
 
				+            try:
			
 
				+                price = Decimal(str(low_price)).quantize(Decimal("0.00"))
			
 
				+            except (InvalidOperation, ValueError):
			
 
				+                price = Decimal("0.00")
			
 
				+
			
 
				+            item_url = f"https://item.jd.com/{sku_id}.html"
			
 
				+            mall_url = f"https://mall.jd.com/index-{shop_id}.html?from=pc"
			
 
				+
			
 
				+            # 字段与 yaofangwang_crawl 对齐；键顺序须与 commons.sql_data.RETRIEVE_SCRAPE_INSERT_COLUMNS 一致
			
 
				+            now_ts = time.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+            product = {
			
 
				+                "platform": self.platform,
			
 
				+                "item_id": sku_id,
			
 
				+                "enterprise_id": self.company_id,
			
 
				+                "product_name": title,
			
 
				+                "spec": crawl_product_desc,
			
 
				+                "one_price": "",
			
 
				+                "detail_url": item_url,
			
 
				+                "shop_name": shop_name,
			
 
				+                "anonymous_store_name": "",
			
 
				+                "shop_url": mall_url,
			
 
				+                "city_name": "",
			
 
				+                "city_id": "",
			
 
				+                "province_name": "",
			
 
				+                "province_id": "",
			
 
				+                "shipment_city_name": "",
			
 
				+                "shipment_city_id": "",
			
 
				+                "shipment_province_name": "",
			
 
				+                "shipment_province_id": "",
			
 
				+                "area_info": "",
			
 
				+                "factory_name": "",
			
 
				+                "scrape_date": time.strftime("%Y-%m-%d"),
			
 
				+                "price": price,
			
 
				+                "sales": sales,
			
 
				+                "stock_count": "",
			
 
				+                "snapshot_url": snap_url,
			
 
				+                "approval_num": "",
			
 
				+                "produced_time": "",
			
 
				+                "deadline": "",
			
 
				+                "update_time": now_ts,
			
 
				+                "insert_time": now_ts,
			
 
				+                "number": heshu_count,
			
 
				+                "product_brand": self.brand or "",
			
 
				+                "collect_task_id": self.collect_task_id,
			
 
				+                "search_name": self.product,
			
 
				+                "company_name": "",
			
 
				+                "collect_config_info": json.dumps(
			
 
				+                    {
			
 
				+                        "sampling_cycle": self.sampling_cycle,
			
 
				+                        "sampling_start_time": self.sampling_start_time,
			
 
				+                        "sampling_end_time": self.sampling_end_time,
			
 
				+                    }
			
 
				+                ),
			
 
				+                "account_id": self.account_id,
			
 
				+                "collect_region_id": self.collect_region_id,
			
 
				+                "collect_round": self.collect_round,
			
 
				+                "is_sold_out": 0
			
 
				+
			
 
				+            }
			
 
				+
			
 
				+            try:
			
 
				+                self.pipeline.storge_data(product)
			
 
				+                logger.info("%s", json.dumps(product, ensure_ascii=False, default=str))
			
 
				+            except Exception as e:
			
 
				+                logger.exception("写入数据库失败: %s", e)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _response_has_ware_list(data):
			
 
				+        if not isinstance(data, dict):
			
 
				+            return False
			
 
				+        wl = data.get("data", {}).get("wareList")
			
 
				+        return bool(wl)
			
 
				+
			
 
				+    def fetch_items_once(self, timeout=FETCH_TIMEOUT_FIRST):
			
 
				+        n = 0
			
 
				+        for resp in self.driver.listen.steps(timeout=timeout):
			
 
				+            try:
			
 
				+                data = resp.response.body
			
 
				+                if not self._response_has_ware_list(data):
			
 
				+                    continue
			
 
				+                ware_list = data["data"]["wareList"]
			
 
				+                self.parse(ware_list)
			
 
				+                n += len(ware_list)
			
 
				+            except Exception as e:
			
 
				+                logger.warning("解析监听响应失败: %s", e)
			
 
				+        return n
			
 
				+
			
 
				+    def clear_listen_buffer(self, rounds=LISTEN_CLEAR_ROUNDS, timeout=LISTEN_CLEAR_TIMEOUT):
			
 
				+        try:
			
 
				+            for _ in range(rounds):
			
 
				+                resps = list(self.driver.listen.steps(timeout=timeout))
			
 
				+                if not resps:
			
 
				+                    break
			
 
				+            logger.debug("监听缓冲已清空")
			
 
				+        except Exception as e:
			
 
				+            logger.debug("清空监听缓冲失败: %s", e)
			
 
				+
			
 
				+    def collect_full_page_items(self, max_steps=10):
			
 
				+        """单次循环：边滑动边收数据，到底 / 看见「下一页」即停。"""
			
 
				+        n = self.fetch_items_once(timeout=FETCH_TIMEOUT_FIRST)
			
 
				+
			
 
				+        stagnant = 0
			
 
				+        last_scroll_y = None
			
 
				+
			
 
				+        for step in range(max_steps):
			
 
				+            next_btn = self._find_next_btn(timeout=0.3)
			
 
				+            if self._is_next_btn_visible(next_btn):
			
 
				+                n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
			
 
				+                return n, next_btn
			
 
				+
			
 
				+            info = self._get_scroll_info()
			
 
				+            scroll_y = info["scrollY"]
			
 
				+            doc_h = info["docH"]
			
 
				+            view_h = info["viewH"]
			
 
				+
			
 
				+            at_bottom = (scroll_y + view_h >= doc_h - 20)
			
 
				+            if last_scroll_y is not None and abs(scroll_y - last_scroll_y) < 8:
			
 
				+                stagnant += 1
			
 
				+            else:
			
 
				+                stagnant = 0
			
 
				+            last_scroll_y = scroll_y
			
 
				+
			
 
				+            if at_bottom and stagnant >= 2:
			
 
				+                n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
			
 
				+                next_btn = self._find_next_btn(timeout=2)
			
 
				+                if next_btn:
			
 
				+                    self._scroll_next_into_view(next_btn)
			
 
				+                    return n, next_btn
			
 
				+                logger.info("已到页面底部且未发现下一页，停止滑动")
			
 
				+                return n, None
			
 
				+
			
 
				+            self._scroll_page_down(random.randint(400, 800))
			
 
				+
			
 
				+            if random.random() < 0.15:
			
 
				+                self.driver.run_js(f"window.scrollBy(0, -{random.randint(60, 140)})")
			
 
				+
			
 
				+            self.sleep(0.5, 1.5)
			
 
				+
			
 
				+            if step % 3 == 2:
			
 
				+                n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
			
 
				+
			
 
				+        n += self.fetch_items_once(timeout=FETCH_TIMEOUT_SCROLL)
			
 
				+        next_btn = self._find_next_btn(timeout=3)
			
 
				+        if next_btn and not self._is_next_btn_visible(next_btn):
			
 
				+            self._scroll_next_into_view(next_btn)
			
 
				+
			
 
				+        return n, next_btn
			
 
				+
			
 
				+    def get_account(self):
			
 
				+        sql_account = """
			
 
				+                      SELECT *
			
 
				+                      FROM `retrieve_collect_equipment_account`
			
 
				+                      WHERE `id` = %s
			
 
				+                        and `status` = 0
			
 
				+                      """
			
 
				+        account_list = self.db.select_data(sql_account, self.account_id)
			
 
				+        if not account_list:
			
 
				+            return False
			
 
				+
			
 
				+        account_dict = account_list[0]
			
 
				+        print(account_dict)
			
 
				+        self.ip = account_dict.get("ip")
			
 
				+        self.account_name = account_dict.get("username")
			
 
				+        self.login_username = account_dict.get("phone", "")
			
 
				+        self.login_password = account_dict.get("password", "")
			
 
				+        logger.info("获取到账号: %s, ip: %s", self.account_name, self.ip)
			
 
				+        return True
			
 
				+
			
 
				+    def disable_account(self):
			
 
				+        update_sql = f""" UPDATE `retrieve_collect_equipment_account` SET `status`= %s WHERE `name` = %s; """
			
 
				+        self.db.execute(update_sql, (1, self.account_name))
			
 
				+
			
 
				+    def _build_search_keyword(self):
			
 
				+        parts = [p for p in (self.brand, self.product, self.product_desc) if p]
			
 
				+        return " ".join(parts).strip() or self.product
			
 
				+
			
 
				+    def _is_logged_out(self):
			
 
				+        return bool(self.driver.ele("xpath=//*[@class='link-login']", timeout=2))
			
 
				+
			
 
				+    def perform_jd_login(self):
			
 
				+        """
			
 
				+        使用已有浏览器实例执行京东账号密码登录（含滑块验证码）。
			
 
				+        成功返回 True，失败返回 False。
			
 
				+        """
			
 
				+        username = self.login_username
			
 
				+        password = self.login_password
			
 
				+        login_url = "https://passport.jd.com/new/login.aspx"
			
 
				+        self.driver.get(login_url)
			
 
				+        input_name = self.driver.ele("xpath=//input[@id='loginname']", timeout=15)
			
 
				+        if not input_name:
			
 
				+            print("未找到用户名输入框")
			
 
				+            return False
			
 
				+
			
 
				+        input_name.input(username)
			
 
				+        time.sleep(random.uniform(1.5, 2.5))
			
 
				+
			
 
				+        input_pass = self.driver.ele("xpath://input[@name='nloginpwd']", timeout=5)
			
 
				+        if not input_pass:
			
 
				+            print("未找到密码输入框")
			
 
				+            return False
			
 
				+
			
 
				+        input_pass.input(password)
			
 
				+        time.sleep(random.uniform(1.5, 2.5))
			
 
				+
			
 
				+        login_btn = self.driver.ele("xpath://a[@id='loginsubmit']", timeout=5)
			
 
				+        if not login_btn:
			
 
				+            print("未找到登录按钮")
			
 
				+            return False
			
 
				+        login_btn.click()
			
 
				+
			
 
				+        time.sleep(random.uniform(3, 5))
			
 
				+
			
 
				+        if not handle_jd_slider_captcha(self.driver):
			
 
				+            print("滑块验证码未通过")
			
 
				+            return False
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    def _ensure_logged_in(self):
			
 
				+        """未登录时自动走登录流程（账号密码 + 滑块）。"""
			
 
				+        if not self._is_logged_out():
			
 
				+            return True
			
 
				+
			
 
				+        logger.info("检测到未登录，开始自动登录: %s", self.account_name)
			
 
				+        ok = self.perform_jd_login()
			
 
				+        if ok and not self._is_logged_out():
			
 
				+            logger.info("自动登录成功: %s", self.account_name)
			
 
				+            return True
			
 
				+
			
 
				+        logger.error("自动登录失败: %s", self.account_name)
			
 
				+        return False
			
 
				+
			
 
				+    def _check_page_blocked(self):
			
 
				+        html = self.driver.html or ""
			
 
				+        if "抱歉由于访问频繁导致无法搜索" in html:
			
 
				+            logger.error("账号无法搜索（访问频繁）")
			
 
				+            self.success = False
			
 
				+            return True
			
 
				+        return False
			
 
				+
			
 
				+    def _jump_to_page(self, target_page):
			
 
				+        """跳转到指定页码，并清空跳转前的监听残留。"""
			
 
				+        to_page_input = self.driver.ele(
			
 
				+            "xpath=//div[contains(@class,'_pagination_toPageNum_')]//input[@type='text']",
			
 
				+            timeout=3,
			
 
				+        )
			
 
				+        if not to_page_input:
			
 
				+            logger.warning("未找到跳页输入框，无法跳转到第 %s 页", target_page)
			
 
				+            return False
			
 
				+
			
 
				+        self.clear_listen_buffer()
			
 
				+        to_page_input.input(str(target_page))
			
 
				+        self.sleep(1, 2)
			
 
				+        self.driver.actions.key_down("enter").key_up("enter")
			
 
				+        self.sleep(3, 5)
			
 
				+        self.clear_listen_buffer()
			
 
				+        logger.info("已跳转到第 %s 页", target_page)
			
 
				+        return True
			
 
				+
			
 
				+    def _go_next_page(self, next_btn):
			
 
				+        self.clear_listen_buffer()
			
 
				+        if not self._human_click(next_btn):
			
 
				+            logger.warning("点击下一页失败")
			
 
				+            return False
			
 
				+        self.sleep(2, 4)
			
 
				+        return True
			
 
				+
			
 
				+    def crawl(self):
			
 
				+        total = 0
			
 
				+        keyword = self._build_search_keyword()
			
 
				+
			
 
				+        self.driver.get("https://www.jd.com/", timeout=15)
			
 
				+        time.sleep(15)
			
 
				+
			
 
				+        if self._is_logged_out():
			
 
				+            if not self.login_password or not self.login_username:
			
 
				+                return
			
 
				+            if not self._ensure_logged_in():
			
 
				+                self.disable_account()
			
 
				+                send_text(f"京东：{self.account_name}账号登录失败")
			
 
				+                self.success = False
			
 
				+                return
			
 
				+            self.driver.get("https://www.jd.com/", timeout=15)
			
 
				+            self.sleep(3, 5)
			
 
				+
			
 
				+        kw = quote(str(keyword or ""), safe="")
			
 
				+        self._search_kw = kw
			
 
				+        # 必须先监听再打开搜索页，否则首屏 wareList（前约 30 条）在监听开启前就返回了
			
 
				+        self._start_listen()
			
 
				+        self.driver.get(
			
 
				+            f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}", timeout=15
			
 
				+        )
			
 
				+        self.sleep(5, 8)
			
 
				+
			
 
				+        if self._check_page_blocked():
			
 
				+            return
			
 
				+
			
 
				+        if not handle_jd_slider_captcha(self.driver, pause_listen=False):
			
 
				+            logger.warning("进入搜索页后滑块验证码处理失败")
			
 
				+            self.success = False
			
 
				+            return
			
 
				+
			
 
				+        if self.start_page > 1:
			
 
				+            if not self._jump_to_page(self.start_page):
			
 
				+                logger.warning("跳页失败，将从第 1 页开始采集")
			
 
				+                self.start_page = 1
			
 
				+
			
 
				+        logger.info(
			
 
				+            "采集页码范围: %s ~ %s（共 %s 页）",
			
 
				+            self.start_page,
			
 
				+            self.end_page,
			
 
				+            self.end_page - self.start_page + 1,
			
 
				+        )
			
 
				+
			
 
				+        for page_no in range(self.start_page, self.end_page + 1):
			
 
				+            if self._is_logged_out():
			
 
				+                if not self._ensure_logged_in():
			
 
				+                    self.success = False
			
 
				+                    break
			
 
				+                self.driver.get(
			
 
				+                    f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}",
			
 
				+                    timeout=15,
			
 
				+                )
			
 
				+                self.sleep(3, 5)
			
 
				+                if page_no > 1:
			
 
				+                    self._jump_to_page(page_no)
			
 
				+
			
 
				+            if not handle_jd_slider_captcha(self.driver, pause_listen=True):
			
 
				+                logger.warning("滑块验证码处理失败，停止采集")
			
 
				+                self.success = False
			
 
				+                break
			
 
				+
			
 
				+            if self._check_page_blocked():
			
 
				+                break
			
 
				+
			
 
				+            logger.info("===== 正在爬取第 %s 页 =====", page_no)
			
 
				+            search_ele = self.driver.ele("xpath=//div[@id='search-condition']", timeout=10)
			
 
				+            if not search_ele:
			
 
				+                logger.warning("未找到搜索结果区域，停止采集")
			
 
				+                break
			
 
				+
			
 
				+            page_n, _ = self.collect_full_page_items()
			
 
				+            logger.info("本页监听商品条数（含可能重复）: %s", page_n)
			
 
				+            total += page_n
			
 
				+            logger.info("累计监听条数: %s", total)
			
 
				+
			
 
				+            if self.is_no_prodcut > 20:
			
 
				+                logger.info("连续无匹配商品过多，停止采集")
			
 
				+                break
			
 
				+
			
 
				+            if page_no >= self.end_page:
			
 
				+                break
			
 
				+
			
 
				+            next_btn = self.driver.ele("text=下一页", timeout=2)
			
 
				+            if not next_btn:
			
 
				+                logger.info("没有下一页（未找到）")
			
 
				+                break
			
 
				+            cls_str = next_btn.attr("class") or ""
			
 
				+            if "disabled" in cls_str:
			
 
				+                logger.info("没有下一页（已禁用）")
			
 
				+                break
			
 
				+
			
 
				+            if not self._go_next_page(next_btn):
			
 
				+                break
			
 
				+
			
 
				+    def run(self):
			
 
				+        # 检测账号
			
 
				+        if not self.get_account():
			
 
				+            logger.info("==================当前无账号可用==================")
			
 
				+            self.success = False
			
 
				+            return self.pipeline.crawl_count, self.success
			
 
				+        logger.info("获取到账号:%s,代理ip:%s", self.account_name, self.ip)
			
 
				+
			
 
				+        # # # 每次选取账号，立马账号使用时间
			
 
				+        update_sql = f""" UPDATE `retrieve_collect_equipment_account` SET `status`= %s, `update_time`= %s WHERE `username` = %s; """
			
 
				+        self.db.execute(update_sql, (0, int(time.time()), self.account_name))
			
 
				+
			
 
				+        try:
			
 
				+            self.init_browser()
			
 
				+            self.crawl()
			
 
				+        except Exception as e:
			
 
				+            self.success = False
			
 
				+            logger.exception("爬取异常: %s", e)
			
 
				+            self.sleep(3, 5)
			
 
				+
			
 
				+        finally:
			
 
				+            if self.driver:
			
 
				+                self.driver.quit()
			
 
				+                self.driver = None
			
 
				+        return self.pipeline.crawl_count, self.success