You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Influenza_fund_linkage_system/spiders/百度流感指数爬取.py

146 lines
6.9 KiB

import requests
from pylab import mpl
import pandas as pd
import time
from datetime import datetime, timedelta, date
# Configure matplotlib for Chinese output: SimHei renders CJK glyphs, and
# axes.unicode_minus=False keeps the minus sign displayable under a CJK font.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
class DownloadBaiDuIndex(object):
    """Downloader for Baidu Index (index.baidu.com) search-volume series.

    The API returns digit strings obfuscated with a per-request substitution
    key ("ptbk") that must be fetched separately; :meth:`decrypt` maps the
    obfuscated characters back to plain digits/commas.
    """

    def __init__(self, cookie):
        """Store the login cookie and build browser-like request headers.

        :param cookie: a logged-in index.baidu.com session cookie string;
            the API rejects anonymous requests.
        """
        self.cookie = cookie
        # Headers mimic a real Chrome session.  "Cipher-Text" is an opaque
        # anti-scraping token captured from a browser request; it may expire.
        self.headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://index.baidu.com/v2/main/index.html",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": self.cookie,
            "Host": "index.baidu.com",
            "X-Requested-With": "XMLHttpRequest",
            "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
        }

    def decrypt(self, ptbk, index_data):
        """Decode an obfuscated index string.

        ``ptbk`` is a substitution key: its first half lists cipher
        characters and its second half the corresponding plain characters
        (digits and commas).  Each character of ``index_data`` is mapped
        through that table.
        """
        half = len(ptbk) // 2
        mapping = dict(zip(ptbk[:half], ptbk[half:]))
        return "".join(mapping[ch] for ch in index_data)

    def get_index_data_json(self, keys, start=None, end=None):
        """Fetch and decrypt raw index data for ``keys`` in [start, end].

        :param keys: list of keyword strings to query.
        :param start: start date string "YYYY-MM-DD" (passed through as-is).
        :param end: end date string "YYYY-MM-DD" (passed through as-is).
        :return: dict with "startDate"/"endDate" plus, per keyword, a dict of
            comma-split integer series for the "all", "pc" and "wise"
            (mobile) channels.
        """
        # Build the JSON-ish word payload the API expects (double quotes,
        # no spaces).
        words = [[{"name": key, "wordType": 1}] for key in keys]
        words = str(words).replace(" ", "").replace("'", "\"")
        # Original URL repeated "area=0" twice; one copy is sufficient.
        url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&startDate={start}&endDate={end}'
        res = requests.get(url, headers=self.headers)
        data = res.json()['data']
        uniqid = data['uniqid']
        # The decryption key (ptbk) is tied to this response's uniqid and
        # must be fetched with the same session headers.
        url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
        res = requests.get(url, headers=self.headers)
        time.sleep(3)  # crude rate limit to avoid anti-scraping blocks
        ptbk = res.json()['data']
        result = {"startDate": start, "endDate": end}
        for user_index in data['userIndexes']:
            name = user_index['word'][0]['name']
            # Same decrypt-and-parse for each of the three channels:
            # "all" = total, "pc" = desktop, "wise" = mobile.
            channels = {}
            for channel in ("all", "pc", "wise"):
                encrypted = user_index[channel]['data']
                channels[channel] = [
                    int(value)
                    for value in self.decrypt(ptbk, encrypted).split(",")
                ]
            result[name] = channels
        return result

    def GetIndex(self, keys, start=None, end=None):
        """Return a date-indexed DataFrame (all/pc/wise) for ``keys[0]``.

        Defaults to the window [today-8 days, today-2 days].  On any failure
        the error is printed and an empty DataFrame with the same columns is
        returned (best-effort behaviour kept from the original).
        """
        today = date.today()
        if start is None:
            start = str(today - timedelta(days=8))
        if end is None:
            end = str(today - timedelta(days=2))
        try:
            raw_data = self.get_index_data_json(keys=keys, start=start, end=end)
            raw_data = pd.DataFrame(raw_data[keys[0]])
            raw_data.index = pd.date_range(start=start, end=end)
        except Exception as e:
            print(e)
            raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
        # Return OUTSIDE the try block: the original "finally: return"
        # silently swallowed any exception raised during handling.
        return raw_data
def get_baidu_index(key='流感', start='2012-01-01',
                    output_path='./test/data/baidu_index.csv'):
    """Crawl the weekly Baidu search index for ``key`` and save it to CSV.

    Backward compatible with the original zero-argument script: by default it
    crawls the keyword "流感" (influenza) from 2012-01-01 and writes
    ./test/data/baidu_index.csv.

    :param key: keyword to query on index.baidu.com.
    :param start: ISO start date "YYYY-MM-DD" for the query window.
    :param output_path: CSV destination; parent directories are created.
    """
    import os  # local import keeps this fix self-contained

    # Hard-coded session cookie captured from a logged-in browser; it will
    # expire and must then be replaced.
    cookie = 'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; PSTM=1697213335; BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH'
    downloader = DownloadBaiDuIndex(cookie=cookie)
    today = str(date.today())
    data = downloader.get_index_data_json(keys=[key], start=start, end=today)
    index_values = data[key]['all']
    # For multi-year ranges Baidu returns one point per week; rebuild the
    # matching weekly date axis.  The start date is derived from the query
    # parameter (the original duplicated it as date(2012, 1, 1)).
    start_date = date.fromisoformat(start)
    end_date = datetime.now().date() + timedelta(days=7)
    date_list = []
    current = start_date
    while current <= end_date:
        date_list.append(current)
        current += timedelta(weeks=1)
    # Trim the axis to the number of returned points, then drop the last
    # (possibly incomplete) week.
    date_list = date_list[:len(index_values)]
    df = pd.DataFrame({'date': date_list, 'liugan_index': index_values})
    df = df.drop(df.index[-1])
    print(df)
    # Create the output directory first — the original crashed with
    # FileNotFoundError when ./test/data/ did not exist.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    df.to_csv(output_path, encoding='utf-8')
    print('成功爬取百度流感指数并储存在baidu_index.csv')
# Run the crawler only when executed as a script, not when imported
# (e.g. by the spider framework or by tests).
if __name__ == "__main__":
    get_baidu_index()