# Runnable version.
# Fetches Taobao search results (site: https://s.taobao.com/).
# Runs a keyword search (the keyword is the "q" field built in Taobao.getSign).
# Requests are intercepted by the login page, so a logged-in cookie is required.
- import csv
- import time
- import requests
- from pprint import pprint
- import hashlib
- import json
- import re
- import random
- import os
# mtop H5 gateway endpoint; the script sends every page request to this URL.
url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
class Taobao(object):
    """Session state and request signing for the Taobao mtop search endpoint.

    An instance holds the cookie header, the proxy list and the sign token
    (``em``) that the calling script needs to issue signed requests.
    """

    # NOTE(review): a logged-in session cookie is committed to source here.
    # It is kept as the default so existing callers behave as before, but it
    # should be supplied via the ``cookies`` argument and removed from the
    # repository.
    DEFAULT_COOKIES = "xlly_s=1; thw=cn; _samesite_flag_=true; cookie2=151fad5f9ffa4608c32ca72c662db8be; t=14298717e952f76a2f7396109d4c6cc1; mt=ci=0_0; cna=k1dMIAlFSjQCAXQHYietAHSq; wk_cookie2=1d7bced8e668cc2efdd4140f95f073b0; cancelledSubSites=empty; _tb_token_=e160ae359e5b8; 3PcFlag=1740999324684; unb=1861565883; lgc=h1uanglei; cookie17=UondFeoLTo8Txg%3D%3D; dnk=h1uanglei; tracknick=h1uanglei; _l_g_=Ug%3D%3D; sg=i3f; _nk_=h1uanglei; cookie1=BdM1QJq9vlgKbKGCUjTOwZyt1axrmsRipD2ngmSbYXg%3D; sgcookie=E100%2BvyUzpriPnESSk8a3%2F3zgj4HedGT2vBIfSwWFDbJUp6Dp2nJwFCcBYiGyvb0o59P7JTivBnaO%2FnN4KE1KU3CIC9RUy11OWyaieGfD9bQJ44RIFtPLD42T9TyerwB6Ejw; havana_lgc2_0=eyJoaWQiOjE4NjE1NjU4ODMsInNnIjoiMjUxOWQ0Nzg2ZjAyZTk4N2ZhNTMxMTNiMTIxZjI0ZjUiLCJzaXRlIjowLCJ0b2tlbiI6IjFQM1c1OWpGOTRDVHAzWkNUaTNJSGZRIn0; _hvn_lgc_=0; havana_lgc_exp=1772103365431; cookie3_bak=151fad5f9ffa4608c32ca72c662db8be; cookie3_bak_exp=1741258565431; wk_unb=UondFeoLTo8Txg%3D%3D; sn=; uc3=nk2=C3x7SEGhHLJn&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dD2E8Y%2FMtFiwzuXBQ%3D&id2=UondFeoLTo8Txg%3D%3D; csg=be6ba657; env_bak=FM%2Bgnk3pmw0lBUQ8h3n3gsJh%2FCB5N5xvUHV%2FRZ7KU%2FkT; skt=19181ed02977a367; existShop=MTc0MDk5OTM2NQ%3D%3D; uc4=id4=0%40UOE3G%2BGXiDhqK35olO7ifdvHqrWA&nk4=0%40CTuUbbUuJY4etRCqmXjXAAmLGlY%3D; _cc_=U%2BGCWk%2F7og%3D%3D; sdkSilent=1741085781835; uc1=cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&pas=0&cookie21=URm48syIYB3rzvI4Dim4&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoYaiuNF0i8sLw%3D%3D&existShop=false; mtop_partitioned_detect=1; _m_h5_tk=941a87691759f9a8fd7dc2b5d7b5618f_1741081550471; _m_h5_tk_enc=7f659ca5a2421e2b64cba4c13382ec96; x5sectag=274983; x5sec=7b2274223a313734313037353536392c22733b32223a2262313135313833343865373438303932222c22617365727665723b33223a22307c434f586f6d72344745506e39334a7744476730784f4459784e5459314f44677a4f7a457a4967567a593256755a544343747532332f502f2f2f2f3842227d; tfstk=gEArBffjibhrlhTwcCfU3DCnAgCRQ6osaBsC-eYhPgjlVebhTnxIygTn2HRFmepSy9i88zd2bWNS2YL3T61nfcGs1U3R96m_tztyBuQC5823AW4cww6-cqZj1ULR9zrn5Kc_YR39uT2hxHfciNbdt7X3ZqWc5wf3Z9VunojAmMfhxTcmiw_LtMXhxq8cDwfhxHXoutYJ-Hx8giWo8TX2sCHo8TIMriP3_PQhEVL8d5VMgaWytUjVlZOVzTSNHu0kxQxX-QORHbF59E9wYpxmg7-e2UAh3BiboO5cmOvOEPPNhKxJgtJjso7vnU9Hw3zUtadW19SO32VddsA1IdxzJVLcZwxdBBiTwgYHWCp1_XPca9jrfkQ0WOApUk2FEZQVfqu2LZQ-C8v1iVwLptkAuGgtWJedEZQVfqu4pJB26ZSsWVC..; isg=BHl5A6NGOq5zV-Y5Ysx_zV-viOVThm04KaayZZuvZqAfIp206tAFCL80pCbUmgVw"

    # Application key that is mixed into the sign (the "eS" term of the formula).
    APP_KEY = '12574478'

    # Search keyword sent in the "q" field; this default is already percent-encoded.
    DEFAULT_QUERY = "%E9%9D%9E%E5%A4%84%E6%96%B9%E8%8D%AF"

    def __init__(self, cookies=None, proxies=None):
        """Create a client.

        Args:
            cookies: Raw ``Cookie`` header value. Defaults to the cookie
                string bundled with the script.
            proxies: Iterable of proxy URLs; an empty string is kept as an
                entry exactly like the original list. Defaults to ``['']``.

        Raises:
            ValueError: If ``cookies`` contains no ``_m_h5_tk`` entry, because
                the sign token is taken from it.
        """
        self.headers = {}
        self.cookies = self.DEFAULT_COOKIES if cookies is None else cookies
        # The sign token is the part of the _m_h5_tk cookie before the first
        # underscore; deriving it keeps token and cookie from drifting apart.
        self.em = self._token_from_cookies(self.cookies)
        self.proxies = [''] if proxies is None else list(proxies)

    @staticmethod
    def _token_from_cookies(cookies):
        """Return the token prefix of the ``_m_h5_tk`` cookie in *cookies*."""
        # "_m_h5_tk=" cannot match "_m_h5_tk_enc=" because of the "=".
        found = re.search(r'(?:^|;\s*)_m_h5_tk=([^_;]+)', cookies)
        if found is None:
            raise ValueError("cookie string has no _m_h5_tk entry; cannot derive the sign token")
        return found.group(1)

    def getSign(self, eC, page, q=DEFAULT_QUERY):
        """Build the request payload for one result page and sign it.

        The sign is ``md5(token + "&" + eC + "&" + appKey + "&" + data)``,
        mirroring the site's ``eE(em.token + "&" + eC + "&" + eS + "&" +
        ep.data)`` expression. According to the original author's note, the
        sign differs for every request and a request without a valid one is
        answered with ``FAIL_SYS_ILLEGAL_ACCESS``.

        Args:
            eC: Timestamp string; the caller sends the same value as ``t``.
            page: Page number placed in the payload's ``page`` field.
            q: Search keyword placed in the payload's ``q`` field, sent as-is.

        Returns:
            A ``(sign, data)`` tuple: the hex MD5 digest and the compact JSON
            string that was signed (to be sent as the ``data`` parameter).
        """
        sign_param = {
            "device": "HMA-AL00",
            "isBeta": "false",
            "grayHair": "false",
            "from": "nt_history",
            "brand": "HUAWEI",
            "info": "wifi",
            "index": "4",
            "rainbow": "",
            "schemaType": "auction",
            "elderHome": "false",
            "isEnterSrpSearch": "true",
            "newSearch": "false",
            "network": "wifi",
            "subtype": "",
            "hasPreposeFilter": "false",
            "prepositionVersion": "v2",
            "client_os": "Android",
            "gpsEnabled": "false",
            "searchDoorFrom": "srp",
            "debug_rerankNewOpenCard": "false",
            "homePageVersion": "v7",
            "searchElderHomeOpen": "false",
            "search_action": "initiative",
            "sugg": "_4_1",
            "sversion": "13.6",
            "style": "list",
            "ttid": "600000@taobao_pc_10.7.0",
            "needTabs": "true",
            "areaCode": "CN",
            "vm": "nw",
            "countryNum": "156",
            "m": "pc",
            "page": page,
            "n": 48,
            "q": q,
            "qSource": "url",
            "pageSource": "",
            "tab": "all",
            "pageSize": 48,
            "totalPage": 100,
            "totalResults": 4800,
            "sourceS": "0",
            "sort": "_coefp",
            "bcoffset": "",
            "ntoffset": "",
            "filterTag": "",
            "service": "",
            "prop": "",
            "loc": "",
            "start_price": "",
            "end_price": "",
            "startPrice": "",
            "endPrice": "",
            "itemIds": "",
            "p4pIds": "",
            "p4pS": "",
            "categoryp": "",
            "ha3Kvpairs": "",
            "myCNA": "k1dMIAlFSjQCAXQHYietAHSq"
        }
        # Compact separators give the same text the old
        # ``json.dumps(...).replace(" ", "")`` produced for these values, but
        # without deleting spaces that belong to a value (e.g. a keyword).
        compact = (',', ':')
        data = {
            "appId": "34385",
            "params": json.dumps(sign_param, separators=compact),
        }
        n_data = json.dumps(data, separators=compact)

        # The exact string that is sent as ``data`` must be the one signed.
        to_sign = self.em + "&" + eC + "&" + self.APP_KEY + "&" + n_data
        sign = hashlib.md5(to_sign.encode("utf-8")).hexdigest()
        return sign, n_data
# Desktop Chrome user agents; one is chosen at random for every request.
USER_AGENT_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.125 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.119 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
]
LOG_DIR = './python/log'
FALLBACK_LOG = './python/log/taobao.log'
CSV_PATH = './python/log/taobao.csv'
CSV_HEADER = ['标题', '图片链接', '价格', '地区', '销量', '店铺']
# Seconds to wait for the server before giving up on one attempt.
REQUEST_TIMEOUT = 15


def parse_jsonp(text):
    """Return the JSON value wrapped in an ``mtopjsonpN(...)`` response.

    Only the callback wrapper is removed; parentheses inside the payload are
    left alone.

    Raises:
        ValueError: If *text* has no ``mtopjsonpN(...)`` wrapper or the
            wrapped text is not valid JSON.
    """
    # Greedy ".*" followed by "\)" captures up to the LAST closing parenthesis.
    wrapped = re.search(r'mtopjsonp\d+\((.*)\)', text, re.S)
    if wrapped is None:
        raise ValueError("response is not an mtopjsonp callback")
    return json.loads(wrapped.group(1))


def item_to_record(item):
    """Map one ``itemsArray`` entry to the fields written to the CSV.

    The returned dict's value order matches the CSV header columns.
    """
    return {
        # Strip the keyword-highlight markup from the title.
        'title': item['title'].replace('<span class=H>', '').replace('</span>', ''),
        'img': item['pic_path'],
        'price': item['price'],
        'procity': item['procity'],
        'realSales': item['realSales'],
        'shopName': item['nick'],
    }


def fetch_page(tb, page):
    """Request one result page and return the response text.

    Every entry of ``tb.proxies`` is tried once, in random order.

    Returns:
        The response body decoded as UTF-8, or ``None`` when every attempt
        failed (or the proxy list is empty).
    """
    date_time = str(int(time.time() * 1000))
    sign, payload = tb.getSign(eC=date_time, page=page)
    print(sign)
    params = {
        'jsv': '2.7.4',
        'appKey': '12574478',
        't': date_time,
        'sign': sign,
        'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
        'v': '2.0',
        'timeout': '10000',
        'type': 'jsonp',
        'dataType': 'jsonp',
        'callback': 'mtopjsonp' + str(random.randint(1, 30)),
        'data': payload,
    }
    headers = {
        "User-Agent": random.choice(USER_AGENT_POOL),
        "Referer": "https://s.taobao.com/",
        "cookie": tb.cookies,
        "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "application/json; charset=utf-8",
    }
    for proxy in random.sample(tb.proxies, len(tb.proxies)):
        try:
            resp = requests.get(
                url,
                params=params,
                headers=headers,
                proxies={'http': proxy, 'https': proxy},
                timeout=REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            # Covers proxy errors as well as timeouts and connection resets.
            print(f"使用代理 {proxy} 失败: {e}")
            continue
        resp.encoding = 'utf-8'
        return resp.text
    print("所有代理都尝试过,仍然无法连接。")
    return None


def main():
    """Scrape the result pages and write every item to one CSV file."""
    os.makedirs(LOG_DIR, exist_ok=True)
    tb = Taobao()
    # Open the CSV once: reopening it in write mode for each page would
    # discard the rows of all earlier pages.
    with open(CSV_PATH, mode="w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        # NOTE(review): pages 1..99 are requested although the payload
        # advertises totalPage=100 — confirm whether page 100 is wanted.
        for page in range(1, 100):
            # Random pause between pages to keep the request rate low.
            time.sleep(random.randint(10, 20))
            html = fetch_page(tb, page)
            if html is None:
                continue
            print(html)
            if html:
                print('请求成功')
            elif os.path.exists(FALLBACK_LOG):
                # Empty body: fall back to a previously saved response.
                with open(FALLBACK_LOG, 'r', encoding='utf-8') as log:
                    html = log.read()
            else:
                print('采集数据失败,继续')
                continue
            try:
                items = parse_jsonp(html)['data']['itemsArray']
                records = [item_to_record(item) for item in items]
            except (ValueError, KeyError, TypeError, AttributeError) as e:
                # Typically a login/anti-bot answer without item data;
                # back off before trying the next page.
                print(e)
                print('采集数据失败,继续')
                time.sleep(60 * 2)
                continue
            for record in records:
                writer.writerow(record.values())
                print(record)
            # Keep finished pages on disk even if a later page aborts the run.
            f.flush()


if __name__ == '__main__':
    main()
-
-
|