|
|
@@ -222,6 +222,21 @@ class JdCrawlerV2:
|
|
|
else:
|
|
|
return 1
|
|
|
|
|
|
def get_heshu(self, full_title):
    """Return the package count (boxes or bottles) stated in a product title.

    The title is scanned for every ``<digits>盒`` (box) and ``<digits>瓶``
    (bottle) occurrence. The last box count takes priority; if the title has
    no box count, the last bottle count is used.

    Args:
        full_title: Product title text. ``None``, an empty string, or a
            non-string value is treated as "no count found".

    Returns:
        The matched count as an ``int``, or 1 when nothing matches.
    """
    # Guard against missing / non-text titles: re.finditer would raise
    # TypeError on them, while the natural answer is the default of 1.
    if not full_title or not isinstance(full_title, str):
        return 1

    # Later matches overwrite earlier ones, so each unit keeps its LAST count.
    last_count_by_unit = {}
    for match in re.finditer(r"(\d+)(盒|瓶)", full_title):
        last_count_by_unit[match.group(2)] = int(match.group(1))

    # Box counts outrank bottle counts regardless of their order in the title.
    for unit in ("盒", "瓶"):
        if unit in last_count_by_unit:
            return last_count_by_unit[unit]
    return 1
|
|
|
+
|
|
|
def parse(self, ware_list):
|
|
|
|
|
|
for w in ware_list:
|
|
|
@@ -537,7 +552,6 @@ class JdCrawlerV2:
|
|
|
logger.warning("点击下一页失败")
|
|
|
return False
|
|
|
self.sleep(2, 4)
|
|
|
- self.clear_listen_buffer()
|
|
|
return True
|
|
|
|
|
|
def crawl(self):
|
|
|
@@ -559,6 +573,8 @@ class JdCrawlerV2:
|
|
|
self.sleep(3, 5)
|
|
|
|
|
|
kw = quote(str(keyword or ""), safe="")
|
|
|
+ self._search_kw = kw
|
|
|
+ # 必须先监听再打开搜索页,否则首屏 wareList(前约 30 条)在监听开启前就返回了
|
|
|
self._start_listen()
|
|
|
self.driver.get(
|
|
|
f"https://search.jd.com/Search?keyword={kw}&enc=utf-8&wq={kw}", timeout=15
|
|
|
@@ -573,8 +589,6 @@ class JdCrawlerV2:
|
|
|
self.success = False
|
|
|
return
|
|
|
|
|
|
- self._start_listen()
|
|
|
-
|
|
|
if self.start_page > 1:
|
|
|
if not self._jump_to_page(self.start_page):
|
|
|
logger.warning("跳页失败,将从第 1 页开始采集")
|