"""Scrape Beijing CDC infectious-disease bulletins and save them as a CSV.

Crawls the https://www.bjcdc.org epidemic-bulletin index pages, extracts the
2023 weekly case counts and the 2024 monthly case counts, converts the monthly
figures into approximate weekly data points (monthly total / 4), and writes
the combined time series to ``beijin_zhoubao.csv``.
"""

import random
import re
import time
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from pylab import mpl

# Use a Chinese-capable font and render the minus sign correctly.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False


class GetBeijingGanranShuju(object):
    """Fetch Beijing CDC bulletin pages and parse infection case counts.

    Parsed results are appended to the module-level lists ``data`` (2023
    weekly records), ``month_data`` (2024 monthly records),
    ``link_list_2023`` and ``link_list_2024`` (bulletin URLs).
    """

    def __init__(self):
        # BUG FIX: the original list was missing a comma after the first
        # entry, so Python's implicit string concatenation fused the first
        # two user agents into one malformed string.
        ua_list = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        ]
        # Pick one random User-Agent per scraper instance.
        self.headers = {
            "User-Agent": random.choice(ua_list),
        }

    def get_Link_2023(self, url):
        """Collect absolute URLs of 2023 bulletins into ``link_list_2023``.

        :param url: an index page of the epidemic-bulletin section.
        :return: the (shared, module-level) ``link_list_2023`` list.
        """
        response = requests.get(url=url, headers=self.headers)
        time.sleep(random.uniform(1, 3))  # polite random crawl delay
        html = response.content.decode("utf-8")
        # NOTE(review): the original regex literal was lost (empty string,
        # which matches at every position and yields garbage). This
        # reconstruction captures relative .shtml bulletin links whose path
        # contains "2023" — TODO: confirm against the live page markup.
        link_2023 = re.findall(r'href="\.(/[^"]*2023[^"]*\.shtml)"', html)
        for path in link_2023:
            link_list_2023.append("https://www.bjcdc.org/" + path)
        return link_list_2023

    def get_Link_2024(self, url):
        """Collect absolute URLs of 2024 bulletins into ``link_list_2024``.

        :param url: an index page of the epidemic-bulletin section.
        :return: the (shared, module-level) ``link_list_2024`` list.
        """
        response = requests.get(url=url, headers=self.headers)
        time.sleep(random.uniform(1, 3))  # polite random crawl delay
        html = response.content.decode("utf-8")
        # NOTE(review): original pattern lost — see get_Link_2023; this
        # variant selects paths containing "2024". TODO: confirm.
        link_2024 = re.findall(r'href="\.(/[^"]*2024[^"]*\.shtml)"', html)
        for path in link_2024:
            link_list_2024.append("https://www.bjcdc.org/" + path)
        return link_list_2024

    def get_content_2023(self, link):
        """Parse one 2023 weekly bulletin and append [date, count] to ``data``.

        The bulletin text contains a case count like "123例" and a date span
        like "1月2日至2023年1月8日"; the span's start date becomes the record
        date in YYYY-MM-DD form.
        """
        number = ''
        response = requests.get(url=link, headers=self.headers)
        time.sleep(random.uniform(1, 3))  # polite random crawl delay
        html = response.content.decode("utf-8")
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if time_list:
            span = time_list[0]
            # Start of the reporting week, e.g. "1月2日".
            start = re.match(r'\d+月\d+日?', span).group()
            month_number = re.match(r'\d{1,2}', start).group()
            day_number = re.findall(r'月(\d{1,2})', start)[0]
            # BUG FIX: the original used '%02s' (space padding) plus a
            # replace(' ', '0') hack; zfill(2) zero-pads directly.
            report_date = '2023-' + month_number.zfill(2) + '-' + day_number.zfill(2)
            if number.isdigit():
                data.append([report_date, number])

    def get_content_2024(self, link):
        """Parse one 2024 monthly bulletin and append to ``month_data``.

        Pages containing '周' are weekly reports and are skipped; monthly
        pages yield a ["YYYY年MM月", count] record.
        """
        number = ''
        response = requests.get(url=link, headers=self.headers)
        html = response.content.decode("utf-8")
        if '周' in html:
            return None  # weekly report page — not wanted here
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        time_list = re.findall(r'(\d+年\d+月)', html)
        if time_list:
            month = time_list[0]
            if number.isdigit():
                month_data.append([month, number])


# Create the scraper instance and the shared result containers.
get_beijing_ganran_shuju = GetBeijingGanranShuju()
data, link_list_2023, link_list_2024 = [], [], []

# Index pages: the first page plus paginated index_2..index_4 pages.
url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
url_list2 = [
    f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml'
    for i in range(2, 5)
]
url_list = url_1 + url_list2

# 2023: gather bulletin links, then fetch the pages concurrently.
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2023(url)
pool = ThreadPool(100)
pool.map(get_beijing_ganran_shuju.get_content_2023, reversed(link_list_2023))
pool.close()
pool.join()

# 2024: gather monthly-bulletin links and fetch them sequentially.
month_data = []
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2024(url)
for link in reversed(link_list_2024):
    get_beijing_ganran_shuju.get_content_2024(link)

# Build the 2023 weekly frame, dropping a known-bad record.
df = pd.DataFrame(data, columns=['日期', '感染数量'])
df = df[df['日期'] != '2023-12-26']
df['日期'] = pd.to_datetime(df['日期'])
df_week = df.sort_values(by='日期')

# Convert the 2024 monthly totals into weekly data points:
# one point every 7 days from 2024-01-02, valued at monthly total / 4.
start_date = datetime(2024, 1, 2)
end_date = datetime.now()
dates = []
while start_date <= end_date:
    dates.append(start_date)
    start_date += timedelta(days=7)

# Map "YYYY-MM" -> approximate weekly count.
infections = {
    datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4)
    for month, total in month_data
}

date_infections = []
for week_start in dates:  # renamed: the original loop var shadowed datetime.date
    month_key = week_start.strftime("%Y-%m")
    if month_key in infections:
        date_infections.append([week_start, infections[month_key]])

month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])

# Combine the weekly (2023) and month-derived (2024) series and export.
df = pd.concat([df_week, month_df])
df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
print(df)
df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')