You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

83 lines
3.9 KiB

import json
import re

import requests
"""
第一部分:数据爬取与清洗处理
"""
# Fetch the page content.
def getHTMLText(url):
    """Request ``url`` and return the response body as text.

    The request is sent with a hard-coded browser ``user-agent`` and a
    logged-in Taobao ``cookie``. To obtain fresh values (Firefox used as the
    example): log in to Taobao, press F12 -> Network -> cookie for the cookie,
    and F12 -> Network -> Headers -> user-agent for the user agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    # NOTE(review): the cookie below is a hard-coded login session; it is a
    # credential and will expire. Consider loading it from configuration.
    h = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.40',
        'cookie':'cna=HI7zF8FaWRQCAXWIYnv6TUFc; miid=2003369626578581895; thw=cn; sgcookie=E100oHYUvXW%2Bt%2BGNjycE%2B7SYOvSVQNkpqpPv5qTsMrPuHKx0G%2Febzrl4pcZCj7QxBTNJ%2Fz5BMMj1Hkt4f0kC8CzJfRs%2BCOSQlS7SPa1FBKv0dkozqFgzLQw%2FhtfdvClGtx7v; uc3=vt3=F8dCvChz3DJvLwR8GvQ%3D&id2=UUphw2eZR95J2%2FRH9A%3D%3D&lg2=UtASsssmOIJ0bQ%3D%3D&nk2=F5RFh6bQZIuYwo0%3D; csg=5541983a; lgc=tb078230136; uc4=nk4=0%40FY4O7o%2BSkzlkgeRJ%2B29qbe6EFW27lg%3D%3D&id4=0%40U2grGNhgbPZsqW1N3kuwN%2FfY%2FpXkVSol; tracknick=tb078230136; _cc_=VFC%2FuZ9ajQ%3D%3D; sg=677; enc=GczM2HPZC6WdhQOWPRrh%2FbfIFDpGXw9H9TW2mx04lgVPU6bOdgltYbzqwxAva8KYEU%2FI1DI8mdE6tZHyRIi4jh5zapRUoWNw9e%2B%2FhDI4k6Q%3D; mt=ci=-1_0; _m_h5_tk=1bbca207acea1b18b0ced12571b0e556_1651944694145; _m_h5_tk_enc=fa9427ab38a6a5911bb885db8d445d89; xlly_s=1; cookie2=19eea3ce29b69873e24e73b75bc8652d; uc1=cookie14=UoexMyps4ZHTyw%3D%3D; t=5974611b34125053c6d2ed23e73a56b7; _tb_token_=e4ebee5e3e13e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _samesite_flag_=true; JSESSIONID=54EF74CA88077E5E6FD7C29316EF90E9; l=eBEgzNmHLu0l4hfEBOfwourza77tjLAxiuPzaNbMiOCPOdCp5oBAW64yjOL9CnhVh6upR3SLKwwvBeYBqS24n5U62j-lasMmn; tfstk=cBRhBgV-ZpWQIzXMhX1INAwQjNkOZInFYQRp_2cvrlMeepONiM2aey1jSGrej81..; isg=BH5-hO02kF-Z5sRWibJTSs2W2pTAv0I5BXBsIiiH7UG8yx6lkEzZSaApRpcHQzpR'
    }
    try:
        r = requests.get(url, timeout=30, headers=h)
        # Turn 4xx/5xx answers into an exception so they also yield "".
        r.raise_for_status()
        # Use the encoding detected from the body rather than the header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best effort: callers treat an empty string as "nothing fetched".
        return ""
# Parse the page data and extract the wanted fields.
def _decode_json_string(raw):
    """Decode the text found between the quotes of a JSON string value."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        # Not a valid JSON string body (e.g. a raw control character);
        # keep the text as it appeared in the page.
        return raw


def parsePage(ilt, html):
    """Extract goods records from a search-result page and append them to ``ilt``.

    Each of the ``view_price``, ``raw_title``, ``view_sales`` and ``item_loc``
    fields is collected with its own regular expression; the n-th match of
    every field is combined into one record. If the fields occur a different
    number of times, records are produced only up to the shortest list.

    Args:
        ilt: List that receives one ``[price, sales, location, title]`` row
            per item. ``sales`` holds only the ASCII digits of the sales text.
        html: Page source to scan. Empty input adds nothing.

    Returns:
        None. ``ilt`` is modified in place.
    """
    if not html:
        return

    # Capture groups return the value itself, so values that contain ':'
    # are no longer cut short by splitting on the colon.
    # Price
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Title
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    # Sales volume
    sales = re.findall(r'"view_sales":"(\d*.{0,4})"', html)
    # Shipping location
    locations = re.findall(r'"item_loc":"(.*?\s?.*?)"', html)

    for price, title, sale, location in zip(prices, titles, sales, locations):
        # Keep only the digits of the sales text.
        number = re.sub(r'[^0-9]', '', sale)
        # Scraped text is decoded as JSON instead of being passed to eval().
        ilt.append([
            price,
            number,
            _decode_json_string(location),
            _decode_json_string(title),
        ])
# Print the collected goods information.
def printGoodsList(ilt):
    """Print the goods rows as a tab-separated table with a header line.

    Args:
        ilt: Iterable of rows whose first four elements are shown, in order,
            under the price, sales, shipping-location and title columns.
            Rows are numbered from 1 in the first column.
    """
    row_format = "{:4}\t{:8}\t{:16}\t{:20}\t{:30}"
    # Header: index, price, sales, shipping location, product name.
    print(row_format.format("序号", "价格", "销量", "发货地", "商品名称"))
    for index, goods in enumerate(ilt, start=1):
        print(row_format.format(index, goods[0], goods[1], goods[2], goods[3]))
# Module-level accumulator: main() passes it to parsePage() to be filled and
# then to printGoodsList() to be printed.
infoList = []
def main():
    """Read a search keyword and a page count, crawl the pages, print the results.

    Two lines are read from standard input: the goods keyword to search for
    and the number of result pages to fetch. Every fetched page is parsed
    into the module-level ``infoList``, which is printed at the end. If the
    page count is not an integer, a message is printed and nothing is crawled.
    """
    # The search keyword.
    goods = input()
    try:
        # input() returns a string; range() below needs an integer.
        depth = int(input())
    except ValueError:
        # Message reads: "the number of pages to crawl must be an integer".
        print("爬取页数必须是整数")
        return
    start_url = 'https://s.taobao.com/search?q=' + goods
    for page in range(depth):
        try:
            # The 's' offset grows by 44 per page (presumably the number of
            # results on one page).
            url = start_url + '&s=' + str(44 * page)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a page that fails is skipped, the rest still run.
            continue
    printGoodsList(infoList)
# Script entry: print the start banner ("start crawling"), then run the crawler.
# NOTE(review): these statements also run when the module is imported;
# consider an `if __name__ == "__main__":` guard.
print("开始爬取")
main()