From 442b502724589d91fb1de2bcb8cac2439cb93cd7 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Tue, 8 Oct 2024 12:05:42 +0800
Subject: [PATCH] feat: add flu weekly report crawler scripts and a user agent pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added the flu weekly report crawler scripts and a user agent pool for scraping flu weekly report data from the specified websites. 流感周报爬取2.py crawls the data and saves it to a CSV file, and user_agents_pool.py provides the pool of user agents to use.
---
spiders/user_agents_pool.py | 41 +++++++
spiders/北京传染病数据爬取.py | 164 +++++++++++++++++++++++++
spiders/天天基金数据爬取.py | 131 ++++++++++++++++++++
spiders/流感周报爬取.py | 110 +++++++++++++++++
spiders/流感周报爬取2.py | 75 +++++++++++
spiders/百度流感指数爬取.py | 145 ++++++++++++++++++++++
6 files changed, 666 insertions(+)
create mode 100644 spiders/user_agents_pool.py
create mode 100644 spiders/北京传染病数据爬取.py
create mode 100644 spiders/天天基金数据爬取.py
create mode 100644 spiders/流感周报爬取.py
create mode 100644 spiders/流感周报爬取2.py
create mode 100644 spiders/百度流感指数爬取.py
diff --git a/spiders/user_agents_pool.py b/spiders/user_agents_pool.py
new file mode 100644
index 0000000..c7677ce
--- /dev/null
+++ b/spiders/user_agents_pool.py
@@ -0,0 +1,41 @@
+# Pool of user agents to rotate through when building request headers
+agent_list = [
+    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
+    "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",
+    "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
+    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
+    "Mozilla/5.0 (Linux; Android 11; moto g power (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+]
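+
+# Usage (assumed): import agent_list and pick one per request, for example
+# random.choice(agent_list), when building request headers.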
diff --git a/spiders/北京传染病数据爬取.py b/spiders/北京传染病数据爬取.py
new file mode 100644
index 0000000..44f0041
--- /dev/null
+++ b/spiders/北京传染病数据爬取.py
@@ -0,0 +1,164 @@
+import requests
+import random
+import pandas as pd
+import re
+from pylab import mpl
+from datetime import datetime, timedelta, date
+from multiprocessing.pool import ThreadPool
+
+mpl.rcParams["font.sans-serif"] = ["SimHei"]
+mpl.rcParams["axes.unicode_minus"] = False
+
+
+class GetBeijingGanranShuju(object):
+
+ def __init__(self):
+        ua_list = [
+            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
+            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+        ]
+        # Pick one user agent at random for this scraper instance
+        user_Agent = random.choice(ua_list)
+        self.headers = {
+            "User-Agent": user_Agent,
+        }
+
+ def get_Link_2023(self, url):
+ import time
+ response = requests.get(url=url, headers=self.headers)
+ time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+        # Assumed link pattern: hrefs of report entries whose anchor text mentions 2023年
+        link_2023 = re.findall(r'<a href="(.*?)"[^>]*>[^<]*2023年', html)
+ for i in link_2023:
+ url_head = "https://www.bjcdc.org/"
+ i = url_head + i
+ link_list_2023.append(i)
+ return link_list_2023
+
+ def get_Link_2024(self, url):
+ import time
+ response = requests.get(url=url, headers=self.headers)
+ time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+        # Assumed link pattern: hrefs of report entries whose anchor text mentions 2024年
+        link_2024 = re.findall(r'<a href="(.*?)"[^>]*>[^<]*2024年', html)
+ for i in link_2024:
+ url_head = "https://www.bjcdc.org/"
+ i = url_head + i
+ link_list_2024.append(i)
+ return link_list_2024
+
+ def get_content_2023(self, link):
+ number = ''
+ response = requests.get(url=link, headers=self.headers)
+ import time
+ time.sleep(random.uniform(1, 3))
+ html = response.content.decode("utf-8")
+ number_list = re.findall(r'(\d+)例', html, re.DOTALL)
+ if number_list != []:
+ number = number_list[0]
+ time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
+ if time_list != []:
+ time = time_list[0]
+ time1 = re.match(r'\d+月\d+日?', time).group()
+ month_number = re.match(r'\d{1,2}', time1).group()
+ day_number = re.findall(r'月(\d{1,2})', time1)[0]
+            time = '2023-' + month_number.zfill(2) + '-' + day_number.zfill(2)
+ if number.isdigit():
+ data.append([time, number])
+
+ def get_content_2024(self, link):
+ number = ''
+ response = requests.get(url=link, headers=self.headers)
+ html = response.content.decode("utf-8")
+ if '周' in html:
+ return None
+ else:
+ number_list = re.findall(r'(\d+)例', html, re.DOTALL)
+ if number_list != []:
+ number = number_list[0]
+ time_list = re.findall(r'(\d+年\d+月)', html)
+ if time_list != []:
+ time = time_list[0]
+ if number.isdigit():
+ month_data.append([time, number])
+
+
+# Create an instance of the Beijing infectious disease scraper class
+get_beijing_ganran_shuju = GetBeijingGanranShuju()
+data, link_list_2023, link_list_2024 = [], [], []
+url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
+url_list2 = [
+ f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml'
+ for i in range(2, 5)
+]
+url_list = url_1 + url_list2
+# 2023
+for url in url_list:
+ get_beijing_ganran_shuju.get_Link_2023(url)
+
+# Fetch the 2023 report pages concurrently with a thread pool
+pool = ThreadPool(100)
+courses_list = pool.map(get_beijing_ganran_shuju.get_content_2023,
+ reversed(link_list_2023))
+pool.close()
+pool.join()
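+# Note: get_content_2023 appends rows to the module-level `data` list itself,
+# so the pool's return value (courses_list) is not used.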
+# (sorted by date later, after the DataFrame is built)
+# print(data)
+
+# 2024
+month_data = []
+for url in url_list:
+ get_beijing_ganran_shuju.get_Link_2024(url)
+# print(link_list_2024)
+for x in reversed(link_list_2024):
+ get_beijing_ganran_shuju.get_content_2024(x)
+# print(month_data)
+# print(data)
+# print(type(data))
+df = pd.DataFrame(data, columns=['日期', '感染数量'])
+df = df[df['日期'] != '2023-12-26']
+df['日期'] = pd.to_datetime(df['日期'])
+df_week = df.sort_values(by='日期')
+# print(df_week)
+today = date.today()
+# Convert the 2024 monthly data into weekly data
+# Start date and today's date
+start_date = datetime(2024, 1, 2)
+end_date = datetime.now()
+
+# Generate the list of weekly dates
+dates = []
+while start_date <= end_date:
+ dates.append(start_date)
+ start_date += timedelta(days=7)
+
+# Monthly infection data list
+infection_data = month_data
+
+# Convert the infection data into a dict keyed by year-month (YYYY-MM)
+infections = {
+ datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4)
+ for month, total in infection_data
+}
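+# Each weekly row receives a quarter of its month's total, as a rough weekly
+# approximation of the monthly figure.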
+
+# Build the list of [date, infection count] rows
+date_infections = []
+for date in dates:
+    # Convert the date to YYYY-MM to match the dict keys
+ month_key = date.strftime("%Y-%m")
+ if month_key in infections:
+ date_infections.append([date, infections[month_key]])
+
+# Build the DataFrame
+month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])
+
+# Merge the weekly and monthly data
+df = pd.concat([df_week, month_df])
+# Rename the columns and print the DataFrame
+df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
+print(df)
+
+df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
+print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')
diff --git a/spiders/天天基金数据爬取.py b/spiders/天天基金数据爬取.py
new file mode 100644
index 0000000..78b6d7a
--- /dev/null
+++ b/spiders/天天基金数据爬取.py
@@ -0,0 +1,131 @@
+import requests
+import re
+from multiprocessing.pool import ThreadPool
+import pandas as pd
+
+
+def get_jijin_data(*args):
+ """
+    Fetch one page of a fund's historical net asset value (NAV) data.
+    :param args: a (fundCode, pageIndex) tuple, as passed by ThreadPool.map
+    :return: list of [date, unit NAV] pairs
+ """
+ cookies = {
+ 'qgqp_b_id': '5c08ebc12f489b4f5ba9e76c2539ce0b',
+ 'emshistory':
+ '%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D',
+ 'HAList':
+ 'ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F',
+ 'mtp': '1',
+ 'ct':
+ 'Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8',
+ 'ut':
+ 'FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK',
+ 'pi':
+ '6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D',
+ 'uidal':
+ '6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02',
+ 'sid': '',
+ 'vtpst': '|',
+ 'websitepoptg_api_time': '1715218615434',
+ 'st_si': '46368340182479',
+ 'EmFundFavorVersion': '1686749115372',
+ 'EmFundFavorVersion2': '1686749115372',
+ 'st_asi': 'delete',
+ 'EMFUND0': 'null',
+ 'st_pvi': '35290886003252',
+ 'st_sp': '2023-12-17%2018%3A51%3A34',
+ 'st_inirUrl': 'https%3A%2F%2Fcn.bing.com%2F',
+ 'st_sn': '27',
+ 'st_psi': '20240509100744555-112200305283-5067673963',
+ }
+
+ headers = {
+ 'Accept':
+ '*/*',
+ 'Accept-Language':
+ 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+ 'Connection':
+ 'keep-alive',
+ # 'Cookie': 'qgqp_b_id=5c08ebc12f489b4f5ba9e76c2539ce0b; emshistory=%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D; HAList=ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F; mtp=1; ct=Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8; ut=FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK; pi=6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D; uidal=6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02; sid=; vtpst=|; websitepoptg_api_time=1715218615434; st_si=46368340182479; EmFundFavorVersion=1686749115372; EmFundFavorVersion2=1686749115372; st_asi=delete; EMFUND0=null; EMFUND1=05-09%2009%3A49%3A02@%23%24%u534E%u590F%u6210%u957F%u6DF7%u5408@%23%24000001; EMFUND2=05-09%2009%3A53%3A36@%23%24%u5BCC%u56FD%u7CBE%u51C6%u533B%u7597%u6DF7%u5408A@%23%24005176; EMFUND3=05-09%2009%3A54%3A07@%23%24%u94F6%u6CB3%u533B%u836F%u6DF7%u5408A@%23%24011335; EMFUND4=05-09%2009%3A54%3A13@%23%24%u4E1C%u65B9%u7EA2%u533B%u7597%u5347%u7EA7%u80A1%u7968%u53D1%u8D77A@%23%24015052; EMFUND5=05-09%2009%3A57%3A40@%23%24%u5B9D%u76C8%u73B0%u4EE3%u670D%u52A1%u4E1A%u6DF7%u5408A@%23%24009223; EMFUND6=05-09%2009%3A57%3A51@%23%24%u4E1C%u65B9%u7EA2%u533B%u7597%u5347%u7EA7%u80A1%u7968%u53D1%u8D77C@%23%24015053; EMFUND7=05-09%2009%3A58%3A04@%23%24%u5E7F%u53D1%u521B%u65B0%u533B%u7597%u4E24%u5E74%u6301%u6709%u6DF7%u5408A@%23%24010731; EMFUND8=05-09%2009%3A58%3A56@%23%24%u5BCC%u56FD%u751F%u7269%u533B%u836F%u79D1%u6280%u6DF7%u5408A@%23%24006218; EMFUND9=05-09 09:59:24@#$%u534E%u5546%u533B%u836F%u533B%u7597%u884C%u4E1A%u80A1%u7968@%23%24008107; st_pvi=35290886003252; st_sp=2023-12-17%2018%3A51%3A34; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=27; st_psi=20240509100744555-112200305283-5067673963',
+ 'Referer':
+ 'https://fundf10.eastmoney.com/',
+ 'User-Agent':
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
+ }
+
+ params = {
+ 'callback': 'jQuery183019015669101010957_1715220464680',
+ 'fundCode': args[0][0],
+ 'pageIndex': args[0][1],
+ 'pageSize': '20',
+ 'startDate': '',
+ 'endDate': '',
+ '_': '1715220492762',
+ }
+
+ response = requests.get('https://api.fund.eastmoney.com/f10/lsjz',
+ params=params,
+ cookies=cookies,
+ headers=headers)
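+    # The endpoint returns JSONP wrapped in the jQuery callback requested above,
+    # so pull out the FSRQ (date) / DWJZ (unit NAV) pairs with a regex instead
+    # of response.json().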
+ pattern = r'"FSRQ":"(.*?)","DWJZ":"(.*?)"'
+ text = response.text
+ data_page = re.findall(pattern, text)
+ data_list = []
+ for data in data_page:
+ data_list.append(list(data))
+ return data_list
+
+
+def get_hx_data():
+ """
+    Fetch the historical NAV data of the 华商医药医疗行业股票 fund (code 008107).
+    :return: list of [date, unit NAV] pairs
+ """
+ fundcode = '008107'
+ page_list = range(1, 29)
+ hx_data = []
+ args_list = [(fundcode, i) for i in page_list]
+    # Fetch all pages concurrently with a thread pool
+ pool = ThreadPool(100)
+ data_list = pool.map(get_jijin_data, args_list)
+ pool.close()
+ pool.join()
+ for data in data_list:
+ hx_data += data
+ print(hx_data)
+    # Saving to CSV is handled by save_data_to_csv below
+ return hx_data
+
+
+def get_gf_data():
+ """
+    Fetch the historical NAV data of the 广发创新医疗两年持有混合 fund (code 010731).
+    :return: list of [date, unit NAV] pairs
+ """
+ fundcode = '010731'
+ page_list = range(1, 29)
+ gf_data = []
+ args_list = [(fundcode, i) for i in page_list]
+    # Fetch all pages concurrently with a thread pool
+ pool = ThreadPool(100)
+ data_list = pool.map(get_jijin_data, args_list)
+ pool.close()
+ pool.join()
+ for data in data_list:
+ gf_data += data
+ print(gf_data)
+ return gf_data
+
+
+def save_data_to_csv(data, filename):
+ df = pd.DataFrame(data, columns=['date', filename])
+ df['date'] = pd.to_datetime(df['date'])
+ df = df.sort_values(by='date')
+ df.to_csv(f'{filename}.csv', encoding="utf_8")
+ print(f'成功爬取流感基金数据并保存在{filename}.csv中')
+
+
+save_data_to_csv(get_hx_data(), 'hx_jijin_data')
+save_data_to_csv(get_gf_data(), 'gf_jijin_data')
diff --git a/spiders/流感周报爬取.py b/spiders/流感周报爬取.py
new file mode 100644
index 0000000..35b15e4
--- /dev/null
+++ b/spiders/流感周报爬取.py
@@ -0,0 +1,110 @@
+import requests
+import random
+import pandas as pd
+from lxml import etree
+import time
+import re
+from datetime import datetime
+from tqdm import *
+from multiprocessing.pool import ThreadPool
+from user_agents_pool import *
+
+url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
+url_list2 = [
+ f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm'
+ for i in range(1, 4)
+]
+url_list = url_1 + url_list2
+
+user_Agent = random.choice(agent_list)
+headers = {
+ "User-Agent": user_Agent,
+}
+
+
+def get_Link(url):
+ link_list = []
+ response = requests.get(url=url, headers=headers)
+ time.sleep(2)
+ html = response.content.decode("utf-8")
+ tree = etree.HTML(html)
+ li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
+ # print(len(li_list))
+ for table in li_list:
+ link = table.xpath("./span[1]/a/@href")[0]
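+        # The hrefs are relative; strip their dots, prepend the lgzb section
+        # URL, and restore the ".htm" extension removed by the dot-stripping.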
+ link = link.replace('.', '')
+ url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
+ link = url_head + link
+ link = link.replace('htm', '.htm')
+ link_list.append(link)
+ return link_list
+
+
+def get_content(link):
+ response = requests.get(url=link, headers=headers)
+ time.sleep(2)
+ html = response.content.decode("utf-8")
+ # print(html)
+ tree = etree.HTML(html)
+ date = tree.xpath(
+ '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()'
+ )[1]
+ # print(time)
+ year = tree.xpath(
+ '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()'
+ )[0]
+ # print(year)
+ date = year + date
+ date = date.replace(')', '')
+ date_format = '%Y年%m月%d日'
+ target_date = datetime.strptime(date, date_format)
+ # print(target_date)
+ start_time = '2023年2月18日'
+ start_date = datetime.strptime(start_time, date_format)
+ if target_date > start_date:
+        # Assumed handling: extract the outbreak count (the number before "起")
+        # and return it together with the report date.
+        specific_number = re.search(r'(?<=font-size: 10pt;">)(\d+)(?=起)', html)
+        if specific_number is None:
+            specific_number = re.search(r'(\d+)(?=起)', html)
+        number = specific_number.group(1) if specific_number else None
+        return [target_date.strftime('%Y-%m-%d'), number]
diff --git a/spiders/流感周报爬取2.py b/spiders/流感周报爬取2.py
new file mode 100644
--- /dev/null
+++ b/spiders/流感周报爬取2.py
@@ -0,0 +1,75 @@
+import random
+import re
+import time
+from datetime import datetime
+
+import requests
+from lxml import etree
+from multiprocessing.pool import ThreadPool
+
+from user_agents_pool import agent_list
+
+# Shared session and headers (user agent assumed to come from the shared pool)
+SESSION = requests.Session()
+HEADERS = {"User-Agent": random.choice(agent_list)}
+# Only weekly reports published after this date are collected
+START_DATE = datetime(2023, 2, 18)
+BASE_URL = 'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index'
+url_list = [
+    f'{BASE_URL}_{i}.htm' if i > 0 else f'{BASE_URL}.htm' for i in range(4)
+]
+
+
+def get_links(url):
+ try:
+ response = SESSION.get(url, headers=HEADERS)
+ time.sleep(0.3)
+ print(response.status_code)
+ tree = etree.HTML(response.content.decode("utf-8"))
+ links = tree.xpath('//li/span[1]/a/@href')
+        abs_links = [
+            "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" +
+            href.replace('.', '').replace('htm', '.htm') for href in links
+        ]
+        print(abs_links)
+        return abs_links
+ except Exception as e:
+ print(f"Error fetching links from {url}: {e}")
+ return []
+
+
+def get_content(link):
+ try:
+ response = SESSION.get(link, headers=HEADERS)
+ time.sleep(0.3)
+ html = response.content.decode("utf-8")
+ tree = etree.HTML(html)
+ date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1]
+ year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0]
+ date = datetime.strptime(year + date_text.replace(')', ''),
+ '%Y年%m月%d日')
+ if date > START_DATE:
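+            # Keep only reports newer than the cut-off date; the case count is
+            # the number immediately preceding "起" in the page text.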
+ number = re.search(r'(\d+)(?=起)', html)
+ return [
+ date.strftime('%Y-%m-%d'),
+ number.group(0) if number else 0
+ ]
+ except Exception as e:
+ print(f"Error fetching content from {link}: {e}")
+ return None
+
+
+def get_liuganzhoubao():
+ links = []
+ for url in url_list:
+ links += get_links(url)
+ print(links)
+ with ThreadPool(10) as pool:
+ data_list = pool.map(get_content, links)
+ return [data for data in data_list if data]
+
+
+if __name__ == "__main__":
+ data = get_liuganzhoubao()
+ print(data)
diff --git a/spiders/百度流感指数爬取.py b/spiders/百度流感指数爬取.py
new file mode 100644
index 0000000..57588b9
--- /dev/null
+++ b/spiders/百度流感指数爬取.py
@@ -0,0 +1,145 @@
+import requests
+from pylab import mpl
+import pandas as pd
+import time
+from datetime import datetime, timedelta, date
+
+mpl.rcParams["font.sans-serif"] = ["SimHei"]
+mpl.rcParams["axes.unicode_minus"] = False
+
+
+class DownloadBaiDuIndex(object):
+    # A class for downloading Baidu Index data
+ def __init__(self, cookie):
+ self.cookie = cookie
+        # Build the request headers
+ self.headers = {
+ "Connection":
+ "keep-alive",
+ "Accept":
+ "application/json, text/plain, */*",
+ "User-Agent":
+ "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+ "Sec-Fetch-Site":
+ "same-origin",
+ "Sec-Fetch-Mode":
+ "cors",
+ "Sec-Fetch-Dest":
+ "empty",
+ "Referer":
+ "https://index.baidu.com/v2/main/index.html",
+ "Accept-Language":
+ "zh-CN,zh;q=0.9",
+ 'Cookie':
+ self.cookie,
+ "Host":
+ "index.baidu.com",
+ "X-Requested-With":
+ "XMLHttpRequest",
+ "Cipher-Text":
+ "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
+ }
+
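+    # The "ptbk" key returned by the Interface/ptbk endpoint is a substitution
+    # table: its first half holds the cipher characters and its second half the
+    # corresponding plain characters, so zipping the halves decodes the data.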
+ def decrypt(self, ptbk, index_data):
+ n = len(ptbk) // 2
+ a = dict(zip(ptbk[:n], ptbk[n:]))
+ return "".join([a[s] for s in index_data])
+
+ def get_index_data_json(self, keys, start=None, end=None):
+ words = [[{"name": key, "wordType": 1}] for key in keys]
+ words = str(words).replace(" ", "").replace("'", "\"")
+ url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}'
+ res = requests.get(url, headers=self.headers)
+ html = res.content.decode("UTF-8")
+ data = res.json()['data']
+ uniqid = data['uniqid']
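+        # The index series is obfuscated; a second request to Interface/ptbk
+        # with this uniqid returns the key needed to decode it.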
+ url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
+ # print(url)
+ res = requests.get(url, headers=self.headers)
+ html2 = res.content.decode("UTF-8")
+ time.sleep(3)
+ ptbk = res.json()['data']
+ result = {}
+ result["startDate"] = start
+ result["endDate"] = end
+ for userIndexe in data['userIndexes']:
+ name = userIndexe['word'][0]['name']
+ tmp = {}
+ index_all = userIndexe['all']['data']
+ index_all_data = [
+ int(e) for e in self.decrypt(ptbk, index_all).split(",")
+ ]
+ tmp["all"] = index_all_data
+ index_pc = userIndexe['pc']['data']
+ index_pc_data = [
+ int(e) for e in self.decrypt(ptbk, index_pc).split(",")
+ ]
+ tmp["pc"] = index_pc_data
+ index_wise = userIndexe['wise']['data']
+ index_wise_data = [
+ int(e) for e in self.decrypt(ptbk, index_wise).split(",")
+ ]
+ tmp["wise"] = index_wise_data
+ result[name] = tmp
+ return result
+
+ def GetIndex(self, keys, start=None, end=None):
+ today = date.today()
+ if start is None:
+ start = str(today - timedelta(days=8))
+ if end is None:
+ end = str(today - timedelta(days=2))
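+        # Default window: from 8 days ago to 2 days ago, since the most recent
+        # days are presumably not yet published by the index API.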
+ try:
+ raw_data = self.get_index_data_json(keys=keys,
+ start=start,
+ end=end)
+ raw_data = pd.DataFrame(raw_data[keys[0]])
+ raw_data.index = pd.date_range(start=start, end=end)
+ except Exception as e:
+ print(e)
+ raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
+            # columns: total, PC, and mobile ("wise") search indices
+ finally:
+ return raw_data
+
+
+def get_baidu_index():
+ cookie = 'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; \
+ PSTM=1697213335; \
+ BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH'
+
+    # Create a downloader instance
+ downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
+ # key = input('请输入关键词')
+ key = '流感'
+    # Get today's date
+
+ # from datetime import date
+ today = str(date.today())
+ data = downloadbaiduindex.get_index_data_json(keys=[key],
+ start='2012-01-01',
+ end=today)
+ liugan_data = (data['流感']['all'])
+
+    # Set the start and end dates
+ start_date = date(2012, 1, 1)
+ end_date = datetime.now().date() + timedelta(days=7)
+
+    # Build a list of dates at one-week intervals
+ date_list = []
+ current_date = start_date
+ while current_date <= end_date:
+ date_list.append(current_date)
+        current_date += timedelta(weeks=1)  # step forward one week
+ date_list = date_list[:len(liugan_data)]
+
+ df = pd.DataFrame({'date': date_list, 'liugan_index': liugan_data})
+ df = df.drop(df.index[-1])
+ print(df)
+    # Save the data to CSV
+ df.to_csv('./test/data/baidu_index.csv', encoding='utf-8')
+ print('成功爬取百度流感指数并储存在baidu_index.csv')
+
+
+# Run the scraper
+get_baidu_index()