Răsfoiți Sursa

壹药城增加过滤

zhuoyuncheng 1 săptămână în urmă
părinte
comite
926dd3a54f

+ 8 - 4
spiders/yaoex/yaoex_crawl.py

@@ -164,7 +164,11 @@ class YaoexCrawler:
         resp = self._post_with_retry(list_url, payload)
         resp = self._post_with_retry(list_url, payload)
         data = resp.json()
         data = resp.json()
 
 
-        return data.get("data", {}).get("shopProducts", []) or []
+        recall_status = data.get("data", {}).get("recallStatus",0)
+        if int(recall_status)==1:
+            return data.get("data", {}).get("shopProducts", []) or []
+        else:
+            return []
 
 
     def fetch_detail(self, spu_code, seller_code):
     def fetch_detail(self, spu_code, seller_code):
         payload = self._detail_payload(spu_code, seller_code)
         payload = self._detail_payload(spu_code, seller_code)
@@ -315,9 +319,9 @@ class YaoexCrawler:
                 if self.product not in product_name:
                 if self.product not in product_name:
                     self.is_not_product += 1
                     self.is_not_product += 1
                     continue
                     continue
-                # if self.brand not in product_name:
-                #     self.is_not_product += 1
-                #     continue
+                if self.brand not in product_name:
+                    self.is_not_product += 1
+                    continue
                 self.is_not_product = 0
                 self.is_not_product = 0
                 product = self.parse_product(item)
                 product = self.parse_product(item)
 
 

+ 19 - 13
spiders/yaoex/yaoex_snapshot_crawl.py

@@ -25,7 +25,6 @@ chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
 # 项目根目录 → spiders/yaoex(与从哪执行脚本无关)
 # 项目根目录 → spiders/yaoex(与从哪执行脚本无关)
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 YAOEX_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoex"
 YAOEX_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoex"
-BROWSER_PROFILE_SUBDIR = "chrome_profile"
 SLIDER_OFFSET_FIX = 10
 SLIDER_OFFSET_FIX = 10
 DETAIL_GET_TIMEOUT = 15
 DETAIL_GET_TIMEOUT = 15
 DETAIL_URL_WAIT = 10
 DETAIL_URL_WAIT = 10
@@ -136,22 +135,20 @@ class YaoexSnapshotCrawl:
             return s.getsockname()[1]
             return s.getsockname()[1]
 
 
     def _resolve_browser_profile_dir(self):
     def _resolve_browser_profile_dir(self):
-        """
-        浏览器数据固定落在 <项目根>/spiders/yaoex/ 下。
-        优先 chrome_profile/<账号>;若旧版直接在 yaoex/<账号> 已有登录态则继续沿用。
-        """
-        preferred = YAOEX_SPIDER_DIR / BROWSER_PROFILE_SUBDIR / self.account_name
-        legacy_flat = YAOEX_SPIDER_DIR / self.account_name
+        """浏览器数据目录: <项目根>/spiders/yaoex/<账号>"""
+        profile_dir = YAOEX_SPIDER_DIR / self.account_name
+        # 仅兼容历史误路径,新建不再使用 chrome_profile
         legacy_nested = YAOEX_SPIDER_DIR / "spiders" / "yaoex" / self.account_name
         legacy_nested = YAOEX_SPIDER_DIR / "spiders" / "yaoex" / self.account_name
+        legacy_chrome_profile = YAOEX_SPIDER_DIR / "chrome_profile" / self.account_name
 
 
-        for candidate in (preferred, legacy_flat, legacy_nested):
+        for candidate in (profile_dir, legacy_nested, legacy_chrome_profile):
             if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
             if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
                 logger.info("使用已有浏览器配置目录: %s", candidate)
                 logger.info("使用已有浏览器配置目录: %s", candidate)
                 return candidate
                 return candidate
 
 
-        preferred.parent.mkdir(parents=True, exist_ok=True)
-        logger.info("新建浏览器配置目录: %s", preferred)
-        return preferred
+        profile_dir.mkdir(parents=True, exist_ok=True)
+        logger.info("新建浏览器配置目录: %s", profile_dir)
+        return profile_dir
 
 
     def init_browser(self):
     def init_browser(self):
         co = ChromiumOptions().set_browser_path(chrome_path)
         co = ChromiumOptions().set_browser_path(chrome_path)
@@ -384,7 +381,12 @@ class YaoexSnapshotCrawl:
     def fetch_list_page(self, keyword, page):
     def fetch_list_page(self, keyword, page):
         list_url = "https://gateway-b2b.fangkuaiyi.com/home/search/homeSearchList"
         list_url = "https://gateway-b2b.fangkuaiyi.com/home/search/homeSearchList"
         resp = self._post_with_retry(list_url, self._list_payload(keyword, page))
         resp = self._post_with_retry(list_url, self._list_payload(keyword, page))
-        return resp.json().get("data", {}).get("shopProducts", []) or []
+        data = resp.json()
+        recall_status = data.get("data", {}).get("recallStatus", 0)
+        if int(recall_status) == 1:
+            return data.get("data", {}).get("shopProducts", []) or []
+        else:
+            return []
 
 
     def fetch_shop(self, seller_code):
     def fetch_shop(self, seller_code):
         detail_url = "https://gateway-b2b.fangkuaiyi.com/ycapp/shop/enterpriseQualification"
         detail_url = "https://gateway-b2b.fangkuaiyi.com/ycapp/shop/enterpriseQualification"
@@ -457,7 +459,11 @@ class YaoexSnapshotCrawl:
                 time.sleep(0.5)
                 time.sleep(0.5)
                 self.driver.refresh()
                 self.driver.refresh()
                 time.sleep(2)
                 time.sleep(2)
-                return True
+                ele = self.driver.ele("xpath=//div[@class='yaoex-product-detail__product-detail']")
+                if ele:
+                    return True
+                else:
+                    continue
             except Exception as e:
             except Exception as e:
                 logger.warning(
                 logger.warning(
                     "跳转详情异常 spu=%s seller=%s attempt=%s: %s",
                     "跳转详情异常 spu=%s seller=%s attempt=%s: %s",

+ 19 - 13
spiders/yaoshibang/ysb_snapshot_crawl.py

@@ -38,7 +38,6 @@ DETAIL_APPROVAL_XPATH = (
 chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
 chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 YSB_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoshibang"
 YSB_SPIDER_DIR = PROJECT_ROOT / "spiders" / "yaoshibang"
-BROWSER_PROFILE_SUBDIR = "chrome_profile"
 
 
 headers = {
 headers = {
     "Accept": "*/*",
     "Accept": "*/*",
@@ -168,22 +167,20 @@ class YaoShiBangSnapshot:
             return s.getsockname()[1]
             return s.getsockname()[1]
 
 
     def _resolve_browser_profile_dir(self):
     def _resolve_browser_profile_dir(self):
-        """
-        浏览器数据固定落在 <项目根>/spiders/yaoshibang/ 下。
-        优先 chrome_profile/<账号>;若旧版目录已有登录态则继续沿用。
-        """
-        preferred = YSB_SPIDER_DIR / BROWSER_PROFILE_SUBDIR / self.account_name
-        legacy_flat = YSB_SPIDER_DIR / self.account_name
+        """浏览器数据目录: <项目根>/spiders/yaoshibang/<账号>"""
+        profile_dir = YSB_SPIDER_DIR / self.account_name
+        # 仅兼容历史误路径,新建不再使用 chrome_profile
         legacy_nested = YSB_SPIDER_DIR / "spiders" / "yaoshibang" / self.account_name
         legacy_nested = YSB_SPIDER_DIR / "spiders" / "yaoshibang" / self.account_name
+        legacy_chrome_profile = YSB_SPIDER_DIR / "chrome_profile" / self.account_name
 
 
-        for candidate in (preferred, legacy_flat, legacy_nested):
+        for candidate in (profile_dir, legacy_nested, legacy_chrome_profile):
             if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
             if (candidate / "Default").is_dir() or (candidate / "Local State").is_file():
                 logger.info("使用已有浏览器配置目录: %s", candidate)
                 logger.info("使用已有浏览器配置目录: %s", candidate)
                 return candidate
                 return candidate
 
 
-        preferred.parent.mkdir(parents=True, exist_ok=True)
-        logger.info("新建浏览器配置目录: %s", preferred)
-        return preferred
+        profile_dir.mkdir(parents=True, exist_ok=True)
+        logger.info("新建浏览器配置目录: %s", profile_dir)
+        return profile_dir
 
 
     def init_browser(self):
     def init_browser(self):
         co = ChromiumOptions().set_browser_path(chrome_path)
         co = ChromiumOptions().set_browser_path(chrome_path)
@@ -365,7 +362,10 @@ class YaoShiBangSnapshot:
                     self.driver.refresh()
                     self.driver.refresh()
 
 
                 time.sleep(1.5)
                 time.sleep(1.5)
-                if str(item_id) in self.driver.url:
+                ele = self.driver.ele("xpath=//div[@class='drug-pic-viewer']")
+                if not ele:
+                    continue
+                else:
                     return True
                     return True
 
 
             except Exception as e:
             except Exception as e:
@@ -644,6 +644,7 @@ class YaoShiBangSnapshot:
         if not self._is_logged_in():
         if not self._is_logged_in():
             if not self.login():
             if not self.login():
                 return False
                 return False
+        time.sleep(3)
         cookies_list = self.driver.cookies()
         cookies_list = self.driver.cookies()
         cookies_dict = {c['name']: c['value'] for c in cookies_list}
         cookies_dict = {c['name']: c['value'] for c in cookies_list}
         self.token = cookies_dict.get("Token") or cookies_dict.get("token")
         self.token = cookies_dict.get("Token") or cookies_dict.get("token")
@@ -732,7 +733,12 @@ class YaoShiBangSnapshot:
                 if self.is_product_count >= 20:
                 if self.is_product_count >= 20:
                     return False
                     return False
 
 
-                self._goto_detail_page(item_id, detail_url)
+                if not self._goto_detail_page(item_id, detail_url):
+                    logger.warning(
+                        "详情页跳转失败,跳过 item_id=%s url=%s",
+                        item_id, detail_url,
+                    )
+                    continue
 
 
                 product = self.parse_detail(product)
                 product = self.parse_detail(product)
                 upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()
                 upload_key = hashlib.md5(detail_url.encode("utf-8")).hexdigest()