# get_taobao_print.py


# Runnable version.
# Fetches Taobao search data: https://s.taobao.com/
# Searching for keyboard-related data is automatically intercepted by the login page (so a cookie is required).
  4. import csv
  5. import time
  6. import requests
  7. from pprint import pprint
  8. import hashlib
  9. import json
  10. import re
  11. import random
  12. import os
  13. url = "https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
  14. class Taobao(object):
  15. def __init__(self):
  16. self.headers = {
  17. }
  18. self.cookies ="xlly_s=1; thw=cn; _samesite_flag_=true; cookie2=151fad5f9ffa4608c32ca72c662db8be; t=14298717e952f76a2f7396109d4c6cc1; mt=ci=0_0; cna=k1dMIAlFSjQCAXQHYietAHSq; wk_cookie2=1d7bced8e668cc2efdd4140f95f073b0; cancelledSubSites=empty; _tb_token_=e160ae359e5b8; 3PcFlag=1740999324684; unb=1861565883; lgc=h1uanglei; cookie17=UondFeoLTo8Txg%3D%3D; dnk=h1uanglei; tracknick=h1uanglei; _l_g_=Ug%3D%3D; sg=i3f; _nk_=h1uanglei; cookie1=BdM1QJq9vlgKbKGCUjTOwZyt1axrmsRipD2ngmSbYXg%3D; sgcookie=E100%2BvyUzpriPnESSk8a3%2F3zgj4HedGT2vBIfSwWFDbJUp6Dp2nJwFCcBYiGyvb0o59P7JTivBnaO%2FnN4KE1KU3CIC9RUy11OWyaieGfD9bQJ44RIFtPLD42T9TyerwB6Ejw; havana_lgc2_0=eyJoaWQiOjE4NjE1NjU4ODMsInNnIjoiMjUxOWQ0Nzg2ZjAyZTk4N2ZhNTMxMTNiMTIxZjI0ZjUiLCJzaXRlIjowLCJ0b2tlbiI6IjFQM1c1OWpGOTRDVHAzWkNUaTNJSGZRIn0; _hvn_lgc_=0; havana_lgc_exp=1772103365431; cookie3_bak=151fad5f9ffa4608c32ca72c662db8be; cookie3_bak_exp=1741258565431; wk_unb=UondFeoLTo8Txg%3D%3D; sn=; uc3=nk2=C3x7SEGhHLJn&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&vt3=F8dD2E8Y%2FMtFiwzuXBQ%3D&id2=UondFeoLTo8Txg%3D%3D; csg=be6ba657; env_bak=FM%2Bgnk3pmw0lBUQ8h3n3gsJh%2FCB5N5xvUHV%2FRZ7KU%2FkT; skt=19181ed02977a367; existShop=MTc0MDk5OTM2NQ%3D%3D; uc4=id4=0%40UOE3G%2BGXiDhqK35olO7ifdvHqrWA&nk4=0%40CTuUbbUuJY4etRCqmXjXAAmLGlY%3D; _cc_=U%2BGCWk%2F7og%3D%3D; sdkSilent=1741085781835; uc1=cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&pas=0&cookie21=URm48syIYB3rzvI4Dim4&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoYaiuNF0i8sLw%3D%3D&existShop=false; mtop_partitioned_detect=1; _m_h5_tk=941a87691759f9a8fd7dc2b5d7b5618f_1741081550471; _m_h5_tk_enc=7f659ca5a2421e2b64cba4c13382ec96; x5sectag=274983; x5sec=7b2274223a313734313037353536392c22733b32223a2262313135313833343865373438303932222c22617365727665723b33223a22307c434f586f6d72344745506e39334a7744476730784f4459784e5459314f44677a4f7a457a4967567a593256755a544343747532332f502f2f2f2f3842227d; 
tfstk=gEArBffjibhrlhTwcCfU3DCnAgCRQ6osaBsC-eYhPgjlVebhTnxIygTn2HRFmepSy9i88zd2bWNS2YL3T61nfcGs1U3R96m_tztyBuQC5823AW4cww6-cqZj1ULR9zrn5Kc_YR39uT2hxHfciNbdt7X3ZqWc5wf3Z9VunojAmMfhxTcmiw_LtMXhxq8cDwfhxHXoutYJ-Hx8giWo8TX2sCHo8TIMriP3_PQhEVL8d5VMgaWytUjVlZOVzTSNHu0kxQxX-QORHbF59E9wYpxmg7-e2UAh3BiboO5cmOvOEPPNhKxJgtJjso7vnU9Hw3zUtadW19SO32VddsA1IdxzJVLcZwxdBBiTwgYHWCp1_XPca9jrfkQ0WOApUk2FEZQVfqu2LZQ-C8v1iVwLptkAuGgtWJedEZQVfqu4pJB26ZSsWVC..; isg=BHl5A6NGOq5zV-Y5Ysx_zV-viOVThm04KaayZZuvZqAfIp206tAFCL80pCbUmgVw"
  19. self.em = "941a87691759f9a8fd7dc2b5d7b5618f"
  20. self.proxies = ['',
  21. # 'http://183.164.243.157:8089',
  22. ]
  23. """
  24. mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]
  25. sign参数每次请求都会变化,导致请求不到数据(参数sign逆向)
  26. """
  27. # eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
  28. def getSign(self, eC, page):
  29. # em.token
  30. em = self.em
  31. eS = '12574478'
  32. signParam = {
  33. "device": "HMA-AL00",
  34. "isBeta": "false",
  35. "grayHair": "false",
  36. "from": "nt_history",
  37. "brand": "HUAWEI",
  38. "info": "wifi",
  39. "index": "4",
  40. "rainbow": "",
  41. "schemaType": "auction",
  42. "elderHome": "false",
  43. "isEnterSrpSearch": "true",
  44. "newSearch": "false",
  45. "network": "wifi",
  46. "subtype": "",
  47. "hasPreposeFilter": "false",
  48. "prepositionVersion": "v2",
  49. "client_os": "Android",
  50. "gpsEnabled": "false",
  51. "searchDoorFrom": "srp",
  52. "debug_rerankNewOpenCard": "false",
  53. "homePageVersion": "v7",
  54. "searchElderHomeOpen": "false",
  55. "search_action": "initiative",
  56. "sugg": "_4_1",
  57. "sversion": "13.6",
  58. "style": "list",
  59. "ttid": "600000@taobao_pc_10.7.0",
  60. "needTabs": "true",
  61. "areaCode": "CN",
  62. "vm": "nw",
  63. "countryNum": "156",
  64. "m": "pc",
  65. "page": page,
  66. "n": 48,
  67. "q": "%E9%9D%9E%E5%A4%84%E6%96%B9%E8%8D%AF",
  68. "qSource": "url",
  69. "pageSource": "",
  70. "tab": "all",
  71. "pageSize": 48,
  72. "totalPage": 100,
  73. "totalResults": 4800,
  74. "sourceS": "0",
  75. "sort": "_coefp",
  76. "bcoffset": "",
  77. "ntoffset": "",
  78. "filterTag": "",
  79. "service": "",
  80. "prop": "",
  81. "loc": "",
  82. "start_price": "",
  83. "end_price": "",
  84. "startPrice": "",
  85. "endPrice": "",
  86. "itemIds": "",
  87. "p4pIds": "",
  88. "p4pS": "",
  89. "categoryp": "",
  90. "ha3Kvpairs": "",
  91. "myCNA": "k1dMIAlFSjQCAXQHYietAHSq"
  92. }
  93. n = json.dumps(signParam)
  94. # print(json.dumps(json.dumps(signParam)))
  95. data = {
  96. "appId": "34385",
  97. "params": n
  98. }
  99. # print(data)
  100. n_data = json.dumps(data).replace(" ", "")
  101. str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")
  102. # print(str)
  103. MD5 = hashlib.md5()
  104. MD5.update(str.encode("utf-8"))
  105. sign = MD5.hexdigest()
  106. return sign,n_data
  107. if __name__ == '__main__':
  108. tb = Taobao()
  109. for i in range(1,100):
  110. time.sleep(random.randint(10, 20))
  111. date_time = str(int(time.time() * 1000))
  112. sign, n = tb.getSign(eC=date_time, page = i)
  113. cookie = tb.cookies
  114. print(sign)
  115. params = {
  116. 'jsv': '2.7.4',
  117. 'appKey': '12574478',
  118. 't': date_time,
  119. 'sign': sign,
  120. 'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
  121. 'v': '2.0',
  122. 'timeout': '10000',
  123. 'type': 'jsonp',
  124. 'dataType': 'jsonp',
  125. 'callback': 'mtopjsonp' + str(random.randint(1, 30)),
  126. 'data': n
  127. }
  128. # 修改请求头,添加 Accept-Encoding 和 Content-Type
  129. user_agent_pool = [
  130. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
  131. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.125 Safari/537.36",
  132. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.119 Safari/537.36",
  133. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
  134. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  135. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
  136. ]
  137. headers = {
  138. "User-Agent": user_agent_pool[random.randint(0, len(user_agent_pool) - 1)],
  139. "Referer": "https://s.taobao.com/",
  140. "cookie": cookie,
  141. "Accept-Encoding": "gzip, deflate, br",
  142. "Content-Type": "application/json; charset=utf-8"
  143. }
  144. # 增加重试机制
  145. max_retries = len(tb.proxies)
  146. for _ in range(max_retries):
  147. proxy = tb.proxies[random.randint(0, len(tb.proxies) - 1)]
  148. proxies = {
  149. 'http': proxy,
  150. 'https': proxy
  151. }
  152. try:
  153. resp = requests.get(url, params=params, headers=headers, proxies=proxies)
  154. break
  155. except requests.exceptions.ProxyError as e:
  156. print(f"使用代理 {proxy} 失败: {e}")
  157. else:
  158. print("所有代理都尝试过,仍然无法连接。")
  159. continue
  160. # print(resp.text)
  161. resp.encoding = 'utf-8'
  162. html = resp.text
  163. print(html)
  164. if html:
  165. print('请求成功')
  166. else:
  167. # 读取文件
  168. with open('./python/log/taobao.log', 'r', encoding='utf-8') as f:
  169. html = f.read()
  170. # 确保目录存在
  171. log_dir = './python/log'
  172. if not os.path.exists(log_dir):
  173. os.makedirs(log_dir)
  174. # 采集数据
  175. info = re.findall(r'mtopjsonp\d+\((.*)', html)[0].replace(')', '')
  176. # 写入文件
  177. jsonData = json.loads(info)
  178. # 循环获取数据
  179. with open('./python/log/taobao.csv',mode="w+",newline='',encoding="utf-8") as f:
  180. writer = csv.writer(f)
  181. try:
  182. # 写入表头
  183. head = ['标题','图片链接','价格','地区','销量','店铺']
  184. writer.writerow(head)
  185. for item in jsonData['data']['itemsArray']:
  186. dit = {
  187. 'title': item['title'].replace('<span class=H>', '').replace('</span>',''),
  188. 'img': item['pic_path'],
  189. 'price': item['price'],
  190. 'procity': item['procity'],
  191. 'realSales': item['realSales'],
  192. 'shopName': item['nick'],
  193. }
  194. writer.writerow(dit.values())
  195. print(dit)
  196. except Exception as e:
  197. time.sleep(60*2)
  198. print(e)
  199. print('采集数据失败,继续')
  200. continue