From 442b502724589d91fb1de2bcb8cac2439cb93cd7 Mon Sep 17 00:00:00 2001
From: Yao <1928814540@qq.com>
Date: Tue, 8 Oct 2024 12:05:42 +0800
Subject: [PATCH] feat: add influenza weekly report crawler and user-agent pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add crawler scripts for the influenza weekly report along with a user-agent
pool, used to scrape weekly influenza report data from the target sites.
流感周报爬取2.py fetches the data and saves it to a CSV file, and
user_agents_pool.py provides the pool of user agents to draw from.
---
 spiders/user_agents_pool.py    |  18 +++
 spiders/北京传染病数据爬取.py  | 164 +++++++++++++++++++++++++
 spiders/天天基金数据爬取.py    | 131 ++++++++++++++++++++
 spiders/流感周报爬取.py        | 110 +++++++++++++++++
 spiders/流感周报爬取2.py       |  75 +++++++++++
 spiders/百度流感指数爬取.py    | 145 ++++++++++++++++++++++
 6 files changed, 643 insertions(+)
 create mode 100644 spiders/user_agents_pool.py
 create mode 100644 spiders/北京传染病数据爬取.py
 create mode 100644 spiders/天天基金数据爬取.py
 create mode 100644 spiders/流感周报爬取.py
 create mode 100644 spiders/流感周报爬取2.py
 create mode 100644 spiders/百度流感指数爬取.py

diff --git a/spiders/user_agents_pool.py b/spiders/user_agents_pool.py
new file mode 100644
index 0000000..c7677ce
--- /dev/null
+++ b/spiders/user_agents_pool.py
@@ -0,0 +1,18 @@
+# Pool of user agents available for use
+agent_list = [
+    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
+    "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36",
+    "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
+    "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263",
+    "Mozilla/5.0 (Linux; Android 11; moto g power (2022)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+    "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
+]
diff --git a/spiders/北京传染病数据爬取.py b/spiders/北京传染病数据爬取.py
new file mode 100644
index 0000000..44f0041
--- /dev/null
+++ b/spiders/北京传染病数据爬取.py @@ -0,0 +1,164 @@ +import requests +import random +import pandas as pd +import re +from pylab import mpl +from datetime import datetime, timedelta, date +from multiprocessing.pool import ThreadPool + +mpl.rcParams["font.sans-serif"] = ["SimHei"] +mpl.rcParams["axes.unicode_minus"] = False + + +class GetBeijingGanranShuju(object): + + def __init__(self): + ua_list = [ + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0" + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0", + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0' + ] + user_Agent = random.choice(ua_list) + self.headers = { + "User-Agent": random.choice(ua_list), + } + + def get_Link_2023(self, url): + import time + response = requests.get(url=url, headers=self.headers) + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + link_2023 = re.findall('', html) + for i in link_2023: + url_head = "https://www.bjcdc.org/" + i = url_head + i + link_list_2023.append(i) + return link_list_2023 + + def get_Link_2024(self, url): + import time + response = requests.get(url=url, headers=self.headers) + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + link_2024 = re.findall('', html) + for i in link_2024: + url_head = "https://www.bjcdc.org/" + i = url_head + i + link_list_2024.append(i) + return link_list_2024 + + def get_content_2023(self, link): + number = '' + response = requests.get(url=link, headers=self.headers) + import time + time.sleep(random.uniform(1, 3)) + html = response.content.decode("utf-8") + number_list = re.findall(r'(\d+)例', html, re.DOTALL) + if number_list != []: + number = number_list[0] + time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html) + if time_list != []: + time = time_list[0] + time1 = re.match(r'\d+月\d+日?', time).group() + month_number = re.match(r'\d{1,2}', time1).group() + day_number = re.findall(r'月(\d{1,2})', time1)[0] + time = '2023-' + str("%02s" % month_number) + '-' + str( + "%02s" % day_number) + time = time.replace(' ', '0') + if number.isdigit(): + data.append([time, number]) + + def get_content_2024(self, link): + number = '' + response = requests.get(url=link, headers=self.headers) + html = response.content.decode("utf-8") + if '周' in html: + return None + else: + number_list = re.findall(r'(\d+)例', html, re.DOTALL) + if number_list != []: + number = number_list[0] + time_list = re.findall(r'(\d+年\d+月)', html) + if time_list != []: + time = time_list[0] + if number.isdigit(): + month_data.append([time, number]) + + +# 创建获取 获取北京传染病数据 类的实例 +get_beijing_ganran_shuju = GetBeijingGanranShuju() +data, link_list_2023, link_list_2024 = [], [], [] +url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml'] +url_list2 = [ + f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml' + for i in range(2, 5) +] +url_list = url_1 + url_list2 +# 2023 +for url in url_list: + get_beijing_ganran_shuju.get_Link_2023(url) + +# 使用多进程处理每个块 +pool = ThreadPool(100) +courses_list = pool.map(get_beijing_ganran_shuju.get_content_2023, + reversed(link_list_2023)) +pool.close() +pool.join() +# 排序 +# print(data) + +# 2024 +month_data = [] +for url in url_list: + get_beijing_ganran_shuju.get_Link_2024(url) +# print(link_list_2024) +for x in reversed(link_list_2024): + get_beijing_ganran_shuju.get_content_2024(x) +# print(month_data) +# print(data) +# 
print(type(data)) +df = pd.DataFrame(data, columns=['日期', '感染数量']) +df = df[df['日期'] != '2023-12-26'] +df['日期'] = pd.to_datetime(df['日期']) +df_week = df.sort_values(by='日期') +# print(df_week) +today = date.today() +# 将月份数据转为周数据 +# 起始日期和今天的日期 +start_date = datetime(2024, 1, 2) +end_date = datetime.now() + +# 生成日期列表 +dates = [] +while start_date <= end_date: + dates.append(start_date) + start_date += timedelta(days=7) + +# 感染数据列表 +infection_data = month_data + +# 将感染数据转换为字典,键为年月(YYYY-MM格式) +infections = { + datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4) + for month, total in infection_data +} + +# 创建日期和感染数量列表 +date_infections = [] +for date in dates: + # 转换日期为YYYY-MM格式以匹配字典键 + month_key = date.strftime("%Y-%m") + if month_key in infections: + date_infections.append([date, infections[month_key]]) + +# 创建DataFrame +month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量']) + +# 合并周数据和月数据 +df = pd.concat([df_week, month_df]) +# 打印DataFrame +df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'}) +print(df) + +df.to_csv('beijin_zhoubao.csv', encoding="utf_8") +print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中') diff --git a/spiders/天天基金数据爬取.py b/spiders/天天基金数据爬取.py new file mode 100644 index 0000000..78b6d7a --- /dev/null +++ b/spiders/天天基金数据爬取.py @@ -0,0 +1,131 @@ +import requests +import re +from multiprocessing.pool import ThreadPool +import pandas as pd + + +def get_jijin_data(*args): + """ + 获取某个基金某页的历史净值数据 + :param fundCode: + :param page: + :return: list + """ + cookies = { + 'qgqp_b_id': '5c08ebc12f489b4f5ba9e76c2539ce0b', + 'emshistory': + '%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D', + 'HAList': + 'ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F', + 'mtp': '1', + 'ct': + 'Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8', + 'ut': + 'FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK', + 'pi': + '6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D', + 'uidal': + '6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02', + 'sid': '', + 'vtpst': '|', + 'websitepoptg_api_time': '1715218615434', + 'st_si': '46368340182479', + 'EmFundFavorVersion': '1686749115372', + 'EmFundFavorVersion2': '1686749115372', + 'st_asi': 'delete', + 'EMFUND0': 'null', + 'st_pvi': '35290886003252', + 'st_sp': '2023-12-17%2018%3A51%3A34', + 'st_inirUrl': 'https%3A%2F%2Fcn.bing.com%2F', + 'st_sn': 
'27', + 'st_psi': '20240509100744555-112200305283-5067673963', + } + + headers = { + 'Accept': + '*/*', + 'Accept-Language': + 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', + 'Connection': + 'keep-alive', + # 'Cookie': 'qgqp_b_id=5c08ebc12f489b4f5ba9e76c2539ce0b; emshistory=%5B%2200005%22%2C%2200002%22%2C%2200002%E3%80%81%22%2C%2200001%22%5D; HAList=ty-0-300411-%u91D1%u76FE%u80A1%u4EFD%2Cty-0-399366-%u80FD%u6E90%u91D1%u5C5E%2Cty-116-00002-%u4E2D%u7535%u63A7%u80A1%2Cty-116-03119-GX%u4E9A%u6D32%u534A%u5BFC%u4F53%2Cty-116-00007-%u667A%u5BCC%u8D44%u6E90%u6295%u8D44%2Cty-116-00001-%u957F%u548C%2Cty-116-00016-%u65B0%u9E3F%u57FA%u5730%u4EA7%2Cty-0-301075-%u591A%u745E%u533B%u836F%2Cty-90-BK1042-%u533B%u836F%u5546%u4E1A%2Cty-1-601607-%u4E0A%u6D77%u533B%u836F; mtp=1; ct=Rc8QhLQwVpXSsLuf4UOMLbPMtE9gFAEkMTisAatrxh1rv-WFWG9EC-2zw_WFCJnVfsaViwejVO4ziLTZig1GUptw6NORwx36yfzDu9g9zstYkLdwIWvQ-9QqGL-F5C1GCS7xhUtoBrFAibnr_-HA078LL8tr7yWiGM9V3ZmooC8; ut=FobyicMgeV54OLFNgnrRk4fT26HSX01NG2N55VZbVzZlqOMDJ-67DsHyCMk6G-yTMaqRhIAFuiYbVkK6Y-sYY8ghkJ3v9gyvUZyHWYpJnreP78yw4o-H8FNcTvUXmOj4KLsGaYuV1TAHltcdN0WDTy-YCOJ8OlzrX-MQbQc_CBvXfUYn10iBhXwvJY94XBkg4eOCJpu6Dok3ot9Xsr8flPIDz6f3KxJcIgnXZ7QpZKDMIvavpSunuMiR8Q5ezUD2y-JiBEgNkeoH_36wg0elojOfd5k61gTK; pi=6293426663250936%3Bm6293426663250936%3B%E4%BA%89%E5%88%86%E5%A4%BA%E7%A7%92%E7%9A%84%E9%A3%8E%E8%BE%B02%3B4qqIkcy3NvmegD2EnE%2BsOg2O1jjgPTjDxX3du3GmlWaCk8fr0sJ%2FmubqRXtUqqRoZWsMMmMvcfSg1wNNX8p93XE3fanPRZvbcs7bYEjCeUqg5RMcJtmbM9jEifMzwRAAmCipwh9KbqrYLdkLenTwJYqOaG9qmaZ2qDmn2Pa66eitUxhH2q0aU0kerTnJCi2qJnM8Y0Oc%3Bz%2Bzk7gxq8gdHwxSGucOoQSvBZ44Uaf7Um0f7bFnTUgwLnxWm2OMnlrG9SZX6ezbrsEoqVVrOk%2FVRGekqxUH%2BufKtmb89UVNnA0x62lxu6z84Y8dT0sXAWUELHmWZf8cnumRIL8kPvuAcHSXq5P6pTC3OaxbBeQ%3D%3D; uidal=6293426663250936%e4%ba%89%e5%88%86%e5%a4%ba%e7%a7%92%e7%9a%84%e9%a3%8e%e8%be%b02; sid=; vtpst=|; websitepoptg_api_time=1715218615434; st_si=46368340182479; EmFundFavorVersion=1686749115372; EmFundFavorVersion2=1686749115372; st_asi=delete; EMFUND0=null; EMFUND1=05-09%2009%3A49%3A02@%23%24%u534E%u590F%u6210%u957F%u6DF7%u5408@%23%24000001; EMFUND2=05-09%2009%3A53%3A36@%23%24%u5BCC%u56FD%u7CBE%u51C6%u533B%u7597%u6DF7%u5408A@%23%24005176; EMFUND3=05-09%2009%3A54%3A07@%23%24%u94F6%u6CB3%u533B%u836F%u6DF7%u5408A@%23%24011335; EMFUND4=05-09%2009%3A54%3A13@%23%24%u4E1C%u65B9%u7EA2%u533B%u7597%u5347%u7EA7%u80A1%u7968%u53D1%u8D77A@%23%24015052; EMFUND5=05-09%2009%3A57%3A40@%23%24%u5B9D%u76C8%u73B0%u4EE3%u670D%u52A1%u4E1A%u6DF7%u5408A@%23%24009223; EMFUND6=05-09%2009%3A57%3A51@%23%24%u4E1C%u65B9%u7EA2%u533B%u7597%u5347%u7EA7%u80A1%u7968%u53D1%u8D77C@%23%24015053; EMFUND7=05-09%2009%3A58%3A04@%23%24%u5E7F%u53D1%u521B%u65B0%u533B%u7597%u4E24%u5E74%u6301%u6709%u6DF7%u5408A@%23%24010731; EMFUND8=05-09%2009%3A58%3A56@%23%24%u5BCC%u56FD%u751F%u7269%u533B%u836F%u79D1%u6280%u6DF7%u5408A@%23%24006218; EMFUND9=05-09 09:59:24@#$%u534E%u5546%u533B%u836F%u533B%u7597%u884C%u4E1A%u80A1%u7968@%23%24008107; st_pvi=35290886003252; st_sp=2023-12-17%2018%3A51%3A34; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=27; st_psi=20240509100744555-112200305283-5067673963', + 'Referer': + 'https://fundf10.eastmoney.com/', + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0', + } + + params = { + 'callback': 'jQuery183019015669101010957_1715220464680', + 'fundCode': args[0][0], + 'pageIndex': args[0][1], + 'pageSize': '20', + 'startDate': '', + 'endDate': '', + '_': '1715220492762', + } + + response = 
requests.get('https://api.fund.eastmoney.com/f10/lsjz', + params=params, + cookies=cookies, + headers=headers) + pattern = r'"FSRQ":"(.*?)","DWJZ":"(.*?)"' + text = response.text + data_page = re.findall(pattern, text) + data_list = [] + for data in data_page: + data_list.append(list(data)) + return data_list + + +def get_hx_data(): + """ + 获取华商医药医疗行业股票基金历史净值数据 + :return: list of hx_data + """ + fundcode = '008107' + page_list = range(1, 29) + hx_data = [] + args_list = [(fundcode, i) for i in page_list] + # 使用多进程处理 + pool = ThreadPool(100) + data_list = pool.map(get_jijin_data, args_list) + pool.close() + pool.join() + for data in data_list: + hx_data += data + print(hx_data) + # 数据储存 + return hx_data + + +def get_gf_data(): + """ + 获取广发创新医疗两年持有混合基金历史净值数据 + :return: list of hx_data + """ + fundcode = '010731' + page_list = range(1, 29) + gf_data = [] + args_list = [(fundcode, i) for i in page_list] + # 使用多进程处理 + pool = ThreadPool(100) + data_list = pool.map(get_jijin_data, args_list) + pool.close() + pool.join() + for data in data_list: + gf_data += data + print(gf_data) + return gf_data + + +def save_data_to_csv(data, filename): + df = pd.DataFrame(data, columns=['date', filename]) + df['date'] = pd.to_datetime(df['date']) + df = df.sort_values(by='date') + df.to_csv(f'{filename}.csv', encoding="utf_8") + print(f'成功爬取流感基金数据并保存在{filename}.csv中') + + +save_data_to_csv(get_hx_data(), 'hx_jijin_data') +save_data_to_csv(get_gf_data(), 'gf_jijin_data') diff --git a/spiders/流感周报爬取.py b/spiders/流感周报爬取.py new file mode 100644 index 0000000..35b15e4 --- /dev/null +++ b/spiders/流感周报爬取.py @@ -0,0 +1,110 @@ +import requests +import random +import pandas as pd +from lxml import etree +import time +import re +from datetime import datetime +from tqdm import * +from multiprocessing.pool import ThreadPool +from user_agents_pool import * + +url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm'] +url_list2 = [ + f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' + for i in range(1, 4) +] +url_list = url_1 + url_list2 + +user_Agent = random.choice(agent_list) +headers = { + "User-Agent": user_Agent, +} + + +def get_Link(url): + link_list = [] + response = requests.get(url=url, headers=headers) + time.sleep(2) + html = response.content.decode("utf-8") + tree = etree.HTML(html) + li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li') + # print(len(li_list)) + for table in li_list: + link = table.xpath("./span[1]/a/@href")[0] + link = link.replace('.', '') + url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" + link = url_head + link + link = link.replace('htm', '.htm') + link_list.append(link) + return link_list + + +def get_content(link): + response = requests.get(url=link, headers=headers) + time.sleep(2) + html = response.content.decode("utf-8") + # print(html) + tree = etree.HTML(html) + date = tree.xpath( + '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()' + )[1] + # print(time) + year = tree.xpath( + '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()' + )[0] + # print(year) + date = year + date + date = date.replace(')', '') + date_format = '%Y年%m月%d日' + target_date = datetime.strptime(date, date_format) + # print(target_date) + start_time = '2023年2月18日' + start_date = datetime.strptime(start_time, date_format) + if target_date > start_date: + specific_number = re.search( + r'(.?<=font-size: 10pt;\">|)(\d+)(?=起|起)', + html) + number = specific_number.group(2) if specific_number else None + if number == None: + pattern = r'(\d+) 0 
else f'{BASE_URL}.htm' for i in range(4) +] + + +def get_links(url): + try: + response = SESSION.get(url, headers=HEADERS) + time.sleep(0.3) + print(response.status_code) + tree = etree.HTML(response.content.decode("utf-8")) + links = tree.xpath('//li/span[1]/a/@href') + print([ + "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" + + url.replace('.', '').replace('htm', '.htm') for url in links + ]) + return [ + "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" + + url.replace('.', '').replace('htm', '.htm') for url in links + ] + except Exception as e: + print(f"Error fetching links from {url}: {e}") + return [] + + +def get_content(link): + try: + response = SESSION.get(link, headers=HEADERS) + time.sleep(0.3) + html = response.content.decode("utf-8") + tree = etree.HTML(html) + date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1] + year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0] + date = datetime.strptime(year + date_text.replace(')', ''), + '%Y年%m月%d日') + if date > START_DATE: + number = re.search(r'(\d+)(?=起)', html) + return [ + date.strftime('%Y-%m-%d'), + number.group(0) if number else 0 + ] + except Exception as e: + print(f"Error fetching content from {link}: {e}") + return None + + +def get_liuganzhoubao(): + links = [] + for url in url_list: + links += get_links(url) + print(links) + with ThreadPool(10) as pool: + data_list = pool.map(get_content, links) + return [data for data in data_list if data] + + +if __name__ == "__main__": + data = get_liuganzhoubao() + print(data) diff --git a/spiders/百度流感指数爬取.py b/spiders/百度流感指数爬取.py new file mode 100644 index 0000000..57588b9 --- /dev/null +++ b/spiders/百度流感指数爬取.py @@ -0,0 +1,145 @@ +import requests +from pylab import mpl +import pandas as pd +import time +from datetime import datetime, timedelta, date + +mpl.rcParams["font.sans-serif"] = ["SimHei"] +mpl.rcParams["axes.unicode_minus"] = False + + +class DownloadBaiDuIndex(object): + # 创建一个类来下载百度指数 + def __init__(self, cookie): + self.cookie = cookie + # 配置请求头 + self.headers = { + "Connection": + "keep-alive", + "Accept": + "application/json, text/plain, */*", + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", + "Sec-Fetch-Site": + "same-origin", + "Sec-Fetch-Mode": + "cors", + "Sec-Fetch-Dest": + "empty", + "Referer": + "https://index.baidu.com/v2/main/index.html", + "Accept-Language": + "zh-CN,zh;q=0.9", + 'Cookie': + self.cookie, + "Host": + "index.baidu.com", + "X-Requested-With": + "XMLHttpRequest", + "Cipher-Text": + "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==", + } + + def decrypt(self, ptbk, index_data): + n = len(ptbk) // 2 + a = dict(zip(ptbk[:n], ptbk[n:])) + return "".join([a[s] for s in index_data]) + + def get_index_data_json(self, keys, start=None, end=None): + words = [[{"name": key, "wordType": 1}] for key in keys] + words = str(words).replace(" ", "").replace("'", "\"") + url = f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area=0&startDate={start}&endDate={end}' + res = requests.get(url, headers=self.headers) + html = res.content.decode("UTF-8") + data = 
res.json()['data'] + uniqid = data['uniqid'] + url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}' + # print(url) + res = requests.get(url, headers=self.headers) + html2 = res.content.decode("UTF-8") + time.sleep(3) + ptbk = res.json()['data'] + result = {} + result["startDate"] = start + result["endDate"] = end + for userIndexe in data['userIndexes']: + name = userIndexe['word'][0]['name'] + tmp = {} + index_all = userIndexe['all']['data'] + index_all_data = [ + int(e) for e in self.decrypt(ptbk, index_all).split(",") + ] + tmp["all"] = index_all_data + index_pc = userIndexe['pc']['data'] + index_pc_data = [ + int(e) for e in self.decrypt(ptbk, index_pc).split(",") + ] + tmp["pc"] = index_pc_data + index_wise = userIndexe['wise']['data'] + index_wise_data = [ + int(e) for e in self.decrypt(ptbk, index_wise).split(",") + ] + tmp["wise"] = index_wise_data + result[name] = tmp + return result + + def GetIndex(self, keys, start=None, end=None): + today = date.today() + if start is None: + start = str(today - timedelta(days=8)) + if end is None: + end = str(today - timedelta(days=2)) + try: + raw_data = self.get_index_data_json(keys=keys, + start=start, + end=end) + raw_data = pd.DataFrame(raw_data[keys[0]]) + raw_data.index = pd.date_range(start=start, end=end) + except Exception as e: + print(e) + raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []}) + # 分别表示总计,PC端,移动端 + finally: + return raw_data + + +def get_baidu_index(): + cookie = 'BIDUPSID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC; \ + PSTM=1697213335; \ + BAIDUID=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; BAIDUID_BFESS=84B8FDC3134DE2D8E0E6B86E2BFCC3DC:SL=0:NR=10:FG=1; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1701483117; BDUSS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04514997999zSyIXXcI1QTeZqm4c8hyxlWksvkordeK7x1ZPceY2CR3NLufUujm7MOZ3p6TYUaUvd3Qjet3M3JcQfM5hy8%2FuP9HNu4dCG7B6RoS3S4L25PQZlnh3joEA0cArzaShqjtNyIlDOFD7nF4m%2FHL%2FxUXMnks0IYh6ZyO0xZ1iCY3pJruPDK3dBKJPJ%2BTsLIUPckisDLv5o4FBynumqVmNrIcRJauvv%2BcQtioTBjGMshtfwaZjDT2WCz713NtlH6uxabBdf8gRHMu6r8uSWjXKPG3dAflk5ycDG%2F1BoioLYK697k%3D91877884685963653296273632513192; __cas__rn__=451499799; __cas__st__212=b5f51a7b5b20cb36d3ced6764c8b0e567b436d1a2aa46e1f861833387e9d43267ac11419a4d630081274b162; __cas__id__212=51862268; CPTK_212=1671659797; CPID_212=51862268; bdindexid=473uetvtav5o3d1jfb3m9s3d34; RT="z=1&dm=baidu.com&si=0751b751-3767-4525-9566-4b5f1cd26e3a&ss=lpnhlcxe&sl=8&tt=fr3&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf"; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1701490081; ab_sr=1.0.1_MjQ2ODNmNmI4NzI5MzFhZDAxYzIzZDQzYmMyZDAwOTZiYWE5NDY4OGQxMDNkYzA0NGM4OGU1ZDk5YjZmYjdkMTkyNTYxMDJiZmVlMjllNGU1MWQ1YjgwYTAzZGQxMWFkYzEyMDQ3ZjYxMThkNWI1NTg1ZTliOWVmYTQ1M2E3NjhmMDUzNTllNjU3YzYwNDlhOTU0ODRhMzJlZDAwMWY5Yg==; BDUSS_BFESS=RUU3ZtM0RwcU9VeW0zV0ltMGhWZXNvd3hoMXc3YmtoZmxOOXktTDNFM3JMNUpsRUFBQUFBJCQAAAAAAQAAAAEAAADwtxh-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOuiamXromplSH' + + # 初始化一个实例 + downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie) + # key = input('请输入关键词') + key = '流感' + # 获取当天时间 + + # from datetime import date + today = str(date.today()) + data = downloadbaiduindex.get_index_data_json(keys=[key], + start='2012-01-01', + end=today) + liugan_data = (data['流感']['all']) + + # 设定起始日期和终止日期 + start_date = date(2012, 1, 1) + 
end_date = datetime.now().date() + timedelta(days=7)
+
+    # Build a list of dates at one-week intervals
+    date_list = []
+    current_date = start_date
+    while current_date <= end_date:
+        date_list.append(current_date)
+        current_date += timedelta(weeks=1)  # advance one week at a time
+    date_list = date_list[:len(liugan_data)]
+
+    df = pd.DataFrame({'date': date_list, 'liugan_index': liugan_data})
+    df = df.drop(df.index[-1])
+    print(df)
+    # Save the data
+    df.to_csv('./test/data/baidu_index.csv', encoding='utf-8')
+    print('成功爬取百度流感指数并储存在baidu_index.csv')
+
+
+# Run the scraper
+get_baidu_index()
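
Note on the Baidu index decoding used in 百度流感指数爬取.py: get_index_data_json first fetches the encoded search-index series together with a uniqid, then calls http://index.baidu.com/Interface/ptbk?uniqid=... to obtain a key whose first half maps character-for-character onto its second half; decrypt builds that mapping and translates the encoded payload back into comma-separated counts. A minimal sketch of the same substitution decode, with a made-up key and payload (the real ones come from the API responses):

    # Sketch of the ptbk substitution decode; the key and payload below are invented for illustration.
    def decrypt(ptbk, index_data):
        n = len(ptbk) // 2
        mapping = dict(zip(ptbk[:n], ptbk[n:]))  # first half of the key -> second half
        return "".join(mapping[ch] for ch in index_data)

    ptbk = "abc,123,"                 # hypothetical key: 'a'->'1', 'b'->'2', 'c'->'3', ','->','
    print(decrypt(ptbk, "ab,ca"))     # prints "12,31"; the script then splits on "," and casts to int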
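
Note on the month-to-week conversion in 北京传染病数据爬取.py: the 2023 bulletins report case counts per week, while the 2024 bulletins are monthly, so each monthly total is divided by 4 and assigned to every weekly date (starting 2024-01-02) that falls inside that month before being concatenated with the 2023 weekly rows. A rough sketch of that conversion, using invented monthly totals:

    # Spread monthly totals over weekly dates, as the Beijing script does; the totals are made up.
    from datetime import datetime, timedelta

    month_data = [["2024年1月", "8000"], ["2024年2月", "4000"]]   # hypothetical [month, total] pairs
    infections = {datetime.strptime(m, "%Y年%m月").strftime("%Y-%m"): int(t) // 4
                  for m, t in month_data}

    week, rows = datetime(2024, 1, 2), []
    while week <= datetime(2024, 2, 27):
        key = week.strftime("%Y-%m")
        if key in infections:
            rows.append([week.date(), infections[key]])
        week += timedelta(days=7)
    print(rows)  # January weeks get 2000 per week, February weeks get 1000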