|
|
import requests
|
|
|
from pylab import mpl
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
from datetime import datetime, timedelta, date
|
|
|
|
|
|
# Configure matplotlib for Chinese text rendering:
# use the SimHei font so CJK labels are not shown as empty boxes...
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# ...and render the minus sign as ASCII '-' (SimHei lacks the Unicode minus glyph).
mpl.rcParams["axes.unicode_minus"] = False
|
|
|
|
|
|
|
|
|
class DownloadBaiDuIndex(object):
    """Downloader for Baidu Index (index.baidu.com) keyword statistics.

    The API obfuscates the returned series with a per-request substitution
    key ("ptbk"); :meth:`decrypt` reverses that substitution.
    """

    def __init__(self, cookie):
        """Store the session cookie and build browser-like request headers.

        Args:
            cookie: Raw ``Cookie`` header value of a logged-in Baidu session.
        """
        self.cookie = cookie
        # Headers copied from a real browser session.  The Cipher-Text value
        # is an opaque token captured from the web client's own requests.
        self.headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://index.baidu.com/v2/main/index.html",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": self.cookie,
            "Host": "index.baidu.com",
            "X-Requested-With": "XMLHttpRequest",
            "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
        }

    def decrypt(self, ptbk, index_data):
        """Decode an obfuscated data string.

        ``ptbk`` is a key whose first half lists cipher characters and whose
        second half lists the corresponding plain characters; every character
        of ``index_data`` is substituted through that table.

        Returns:
            The decoded string (a comma-separated list of numbers).
        """
        half = len(ptbk) // 2
        table = dict(zip(ptbk[:half], ptbk[half:]))
        return "".join(table[ch] for ch in index_data)

    def get_index_data_json(self, keys, start=None, end=None):
        """Fetch and decrypt the index series for each keyword.

        Args:
            keys: List of keyword strings to query.
            start: Start date string ``YYYY-MM-DD`` (passed straight to the API).
            end: End date string ``YYYY-MM-DD``.

        Returns:
            Dict with ``startDate``/``endDate`` plus, per keyword, a dict of
            decoded integer series for the ``all``/``pc``/``wise`` channels
            (total / desktop / mobile).
        """
        words = [[{"name": key, "wordType": 1}] for key in keys]
        # The API expects a JSON-ish literal with double quotes and no spaces.
        words = str(words).replace(" ", "").replace("'", "\"")
        url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}'
        res = requests.get(url, headers=self.headers)
        data = res.json()['data']
        uniqid = data['uniqid']
        # A second request is needed to obtain the per-request decryption key.
        url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
        res = requests.get(url, headers=self.headers)
        # Throttle requests to reduce the chance of being rate-limited.
        time.sleep(3)
        ptbk = res.json()['data']
        result = {}
        result["startDate"] = start
        result["endDate"] = end
        for userIndexe in data['userIndexes']:
            name = userIndexe['word'][0]['name']
            tmp = {}
            # Decode each channel: all = total, pc = desktop, wise = mobile.
            for channel in ("all", "pc", "wise"):
                encrypted = userIndexe[channel]['data']
                tmp[channel] = [
                    int(e) for e in self.decrypt(ptbk, encrypted).split(",")
                ]
            result[name] = tmp
        return result

    def GetIndex(self, keys, start=None, end=None):
        """Return a DataFrame of the first keyword's index, date-indexed.

        Args:
            keys: List of keywords; only ``keys[0]`` is put in the frame.
            start: Optional start date string; defaults to 8 days ago.
            end: Optional end date string; defaults to 2 days ago.

        Returns:
            DataFrame with ``all``/``pc``/``wise`` columns, or an empty frame
            with the same columns if the download fails.
        """
        today = date.today()
        if start is None:
            start = str(today - timedelta(days=8))
        if end is None:
            end = str(today - timedelta(days=2))
        try:
            raw_data = self.get_index_data_json(keys=keys,
                                                start=start,
                                                end=end)
            raw_data = pd.DataFrame(raw_data[keys[0]])
            raw_data.index = pd.date_range(start=start, end=end)
        except Exception as e:
            # Best-effort: report the failure and fall back to an empty frame
            # instead of crashing the caller.
            print(e)
            raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
        # NOTE: the original returned from a `finally:` block, which silently
        # swallowed BaseExceptions such as KeyboardInterrupt; returning here
        # preserves normal behavior without that trap.
        return raw_data
|
|
|
|
|
|
|
|
|
def get_baidu_index():
    """Download the Baidu search index for '流感' (flu) since 2012 and save it as CSV.

    Builds a weekly date axis locally, aligns it with the returned series,
    drops the last (possibly incomplete) point and writes the result to
    ``./test/data/baidu_index.csv``.
    """
    from pathlib import Path

    # Session cookie captured from a logged-in index.baidu.com browser session.
    cookie = (
        'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; '
        'PSTM=1697213335; '
        'BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH'
    )

    # Create a downloader instance bound to the session cookie.
    downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
    key = '流感'

    # Query everything from 2012-01-01 up to today.
    today = str(date.today())
    data = downloadbaiduindex.get_index_data_json(keys=[key],
                                                  start='2012-01-01',
                                                  end=today)
    liugan_data = data['流感']['all']

    # Build a weekly date axis (the series is assumed to be one value per
    # week — TODO confirm against the API granularity) and truncate it to
    # the length of the downloaded data.
    start_date = date(2012, 1, 1)
    end_date = datetime.now().date() + timedelta(days=7)
    date_list = []
    current_date = start_date
    while current_date <= end_date:
        date_list.append(current_date)
        current_date += timedelta(weeks=1)  # advance one week at a time
    date_list = date_list[:len(liugan_data)]

    df = pd.DataFrame({'date': date_list, 'liugan_index': liugan_data})
    # Drop the last row (the most recent, possibly incomplete period).
    df = df.drop(df.index[-1])
    print(df)

    # Ensure the output directory exists, then save the data
    # (the original crashed with FileNotFoundError if ./test/data was missing).
    Path('./test/data').mkdir(parents=True, exist_ok=True)
    df.to_csv('./test/data/baidu_index.csv', encoding='utf-8')
    print('成功爬取百度流感指数并储存在baidu_index.csv')
|
|
|
|
|
|
|
|
|
# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    get_baidu_index()
|