You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
83 lines
3.9 KiB
83 lines
3.9 KiB
import json
import re
from urllib.parse import quote

import requests
|
|
|
|
"""
|
|
第一部分:数据爬取与清洗处理
|
|
"""
|
|
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request carries a browser-like ``user-agent`` and a logged-in Taobao
    session ``cookie``. Both values were copied from a browser:

    * cookie: e.g. in Firefox, log in to Taobao, then F12 -> Network -> cookie
    * user-agent: F12 -> Network -> Headers -> user-agent

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``""`` when the request fails (connection
        problem, timeout after 30 seconds, or an HTTP error status).
    """
    # SECURITY NOTE(review): a personal session cookie is hard-coded here.
    # Presumably it expires and has to be refreshed by hand; consider loading
    # it from configuration instead of committing it to source.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.40',
        'cookie': 'cna=HI7zF8FaWRQCAXWIYnv6TUFc; miid=2003369626578581895; thw=cn; sgcookie=E100oHYUvXW%2Bt%2BGNjycE%2B7SYOvSVQNkpqpPv5qTsMrPuHKx0G%2Febzrl4pcZCj7QxBTNJ%2Fz5BMMj1Hkt4f0kC8CzJfRs%2BCOSQlS7SPa1FBKv0dkozqFgzLQw%2FhtfdvClGtx7v; uc3=vt3=F8dCvChz3DJvLwR8GvQ%3D&id2=UUphw2eZR95J2%2FRH9A%3D%3D&lg2=UtASsssmOIJ0bQ%3D%3D&nk2=F5RFh6bQZIuYwo0%3D; csg=5541983a; lgc=tb078230136; uc4=nk4=0%40FY4O7o%2BSkzlkgeRJ%2B29qbe6EFW27lg%3D%3D&id4=0%40U2grGNhgbPZsqW1N3kuwN%2FfY%2FpXkVSol; tracknick=tb078230136; _cc_=VFC%2FuZ9ajQ%3D%3D; sg=677; enc=GczM2HPZC6WdhQOWPRrh%2FbfIFDpGXw9H9TW2mx04lgVPU6bOdgltYbzqwxAva8KYEU%2FI1DI8mdE6tZHyRIi4jh5zapRUoWNw9e%2B%2FhDI4k6Q%3D; mt=ci=-1_0; _m_h5_tk=1bbca207acea1b18b0ced12571b0e556_1651944694145; _m_h5_tk_enc=fa9427ab38a6a5911bb885db8d445d89; xlly_s=1; cookie2=19eea3ce29b69873e24e73b75bc8652d; uc1=cookie14=UoexMyps4ZHTyw%3D%3D; t=5974611b34125053c6d2ed23e73a56b7; _tb_token_=e4ebee5e3e13e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; JSESSIONID=54EF74CA88077E5E6FD7C29316EF90E9; l=eBEgzNmHLu0l4hfEBOfwourza77tjLAxiuPzaNbMiOCPOdCp5oBAW64yjOL9CnhVh6upR3SLKwwvBeYBqS24n5U62j-lasMmn; tfstk=cBRhBgV-ZpWQIzXMhX1INAwQjNkOZInFYQRp_2cvrlMeepONiM2aey1jSGrej81..; isg=BH5-hO02kF-Z5sRWibJTSs2W2pTAv0I5BXBsIiiH7UG8yx6lkEzZSaApRpcHQzpR',
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return ""
    # Let requests guess the charset from the body rather than trusting the
    # declared one.
    r.encoding = r.apparent_encoding
    return r.text
|
|
|
|
def parsePage(ilt, html):
    """Extract product records from a search-result page and append them to ``ilt``.

    The page text is scanned for ``"view_price":"..."``, ``"raw_title":"..."``,
    ``"view_sales":"..."`` and ``"item_loc":"..."`` fragments. The n-th match of
    each kind is combined into one record; scanning stops at the shortest of
    the four match lists.

    Args:
        ilt: List that receives one ``[price, number, deliver, title]`` entry
            per product. ``price``, ``deliver`` and ``title`` are the decoded
            string values; ``number`` is the ASCII digits found in the sales
            text, concatenated (e.g. ``"1000+人付款"`` -> ``"1000"``).
        html: Page text to scan. Empty or non-string input adds nothing.

    Returns:
        None. Results are delivered by mutating ``ilt``.
    """
    if not isinstance(html, str) or not html:
        return

    # Regular expressions pick the interesting key/value fragments out of the
    # data embedded in the page.
    prices = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)       # price
    titles = re.findall(r'\"raw_title\"\:\".*?\"', html)            # title
    sales = re.findall(r'\"view_sales\"\:\"[\d]*.{0,4}\"', html)    # sales volume
    places = re.findall(r'\"item_loc\"\:\".*?\s?.*?\"', html)       # ship-from location

    for price_kv, title_kv, sales_kv, place_kv in zip(prices, titles, sales, places):
        try:
            # Split on the first ':' only, so values that themselves contain a
            # colon stay intact. json.loads decodes the quoted literal without
            # executing scraped text (the previous code used eval);
            # strict=False tolerates raw control characters as eval did.
            price = json.loads(price_kv.split(':', 1)[1], strict=False)
            title = json.loads(title_kv.split(':', 1)[1], strict=False)
            deliver = json.loads(place_kv.split(':', 1)[1], strict=False)
        except ValueError:
            # A malformed fragment only costs this one record, not the page.
            continue

        # Keep just the ASCII digits of the sales text.
        sale = sales_kv.split(':', 1)[1]
        number = "".join(ch for ch in sale if "0" <= ch <= "9")

        ilt.append([price, number, deliver, title])
|
|
|
|
def printGoodsList(ilt):
    """Print the collected product rows as a tab-separated table.

    A header line is written first, followed by one line per entry of
    ``ilt``, numbered from 1.

    Args:
        ilt: Sequence of rows; elements 0-3 of each row are printed in the
            price, sales, ship-from location and product-name columns.
    """
    row_format = "{:4}\t{:8}\t{:16}\t{:20}\t{:30}"
    header = row_format.format("序号", "价格", "销量", "发货地", "商品名称")
    print(header)
    for index, row in enumerate(ilt, start=1):
        line = row_format.format(index, row[0], row[1], row[2], row[3])
        print(line)
|
|
|
|
# Module-level accumulator: main() passes this list to parsePage() to be
# filled and then to printGoodsList() to be displayed.
infoList = []
|
|
def main():
    """Read a search term and a page count from stdin, crawl, and print results.

    Two lines are read from standard input: the goods to search for, then the
    number of result pages to fetch. Each page is downloaded with
    ``getHTMLText``, parsed into the module-level ``infoList`` by
    ``parsePage``, and the accumulated rows are printed once at the end with
    ``printGoodsList``.

    Raises:
        ValueError: If the second input line is not an integer.
    """
    # The search term.
    goods = input()
    # input() returns a string; range() needs an int.
    depth = int(input())
    # Percent-encode the term so characters such as '&', '#' or '+' cannot
    # corrupt the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    for page in range(depth):
        # The 's' parameter advances by 44 per page (presumably the number of
        # items on one result page — confirm).
        url = start_url + '&s=' + str(44 * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a page that fails is skipped and the crawl goes on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            continue
    printGoodsList(infoList)
|
|
|
|
# Announce the start of the crawl (the message means "start crawling").
print("开始爬取")
|
|
# Run the crawler. The call is unguarded, so it also executes when this
# module is imported, not only when it is run as a script.
main()
|