Explorar el Código

淘宝规格修改,京东爬取滑动优化

zhuoyuncheng hace 2 días
padre
commit
babedd0813
Se han modificado 2 ficheros con 23 adiciones y 4 borrados
  1. 17 3
      spiders/jd/jd_auto_crawl.py
  2. 6 1
      spiders/taobao/taobao_crawl.py

+ 17 - 3
spiders/jd/jd_auto_crawl.py

@@ -222,6 +222,21 @@ class JdCrawlerV2:
         else:
             return 1
 
+    def get_heshu(self,full_title):  # Return the count in front of the last "盒" (box) in full_title, else the last "瓶" (bottle), else 1.
+        last_box = None  # most recent "<digits>盒" match seen so far
+        last_bottle = None  # most recent "<digits>瓶" match seen so far
+        for match in re.finditer(r"(\d+)(盒|瓶)", full_title):  # NOTE(review): raises TypeError if full_title is not a str (e.g. None) - confirm callers
+            if match.group(2) == '盒':
+                last_box = match
+            else:  # bottle ("瓶"), the only other unit the pattern can capture
+                last_bottle = match
+        if last_box:  # a box count wins over any bottle count, regardless of position in the title
+            return int(last_box.group(1))
+        elif last_bottle:
+            return int(last_bottle.group(1))
+        else:
+            return 1  # no "<digits>盒" or "<digits>瓶" found in the title
+
     def parse(self, ware_list):
 
         for w in ware_list:
@@ -537,7 +552,6 @@ class JdCrawlerV2:
             logger.warning("点击下一页失败")
             return False
         self.sleep(2, 4)
-        self.clear_listen_buffer()
         return True
 
     def crawl(self):
@@ -559,6 +573,8 @@ class JdCrawlerV2:
             self.sleep(3, 5)
 
         kw = quote(str(keyword or ""), safe="")
+        self._search_kw = kw
+        # 必须先监听再打开搜索页,否则首屏 wareList(前约 30 条)在监听开启前就返回了
         self._start_listen()
         self.driver.get(
             f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}", timeout=15
@@ -573,8 +589,6 @@ class JdCrawlerV2:
             self.success = False
             return
 
-        self._start_listen()
-
         if self.start_page > 1:
             if not self._jump_to_page(self.start_page):
                 logger.warning("跳页失败,将从第 1 页开始采集")

+ 6 - 1
spiders/taobao/taobao_crawl.py

@@ -18,7 +18,7 @@ MTOP_APP_KEY = "12574478"
 MTOP_APP_ID = "34385"
 SEARCH_MAX_PAGE = 20
 REQUEST_RETRY_COUNT = 3
-COOKIE_MAX_AGE_SEC = 1800
+COOKIE_MAX_AGE_SEC = 3600
 
 headers = {
     "accept": "*/*",
@@ -333,6 +333,11 @@ class TaobaoCrawl:
                     raw.get("shopInfo", {}).get("url", "")
                 )
 
+                structured_list = raw.get("structuredUSPInfo",{})
+                for structured in structured_list:
+                    if structured.get("propertyName","") == "规格":
+                        crawl_product_desc = structured.get("propertyValueName","")
+
                 pic_path = raw.get("pic_path", "")
                 raw_price = item_price_show
                 if raw_price in (None, ""):