|
|
|
|
import random
import re
import time
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests

from pylab import mpl
|
|
|
|
|
|
|
|
|
|
# Matplotlib setup: use SimHei so CJK axis labels render, and fall back to an
# ASCII hyphen for the minus sign (SimHei lacks the U+2212 glyph).
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GetBeijingGanranShuju(object):
    """Scraper for Beijing CDC (bjcdc.org) infectious-disease bulletins.

    ``get_Link_2023`` / ``get_Link_2024`` collect article URLs from an index
    page into the module-level lists ``link_list_2023`` / ``link_list_2024``;
    ``get_content_2023`` / ``get_content_2024`` parse one article and append
    the extracted [date, case-count] row into the module-level ``data`` /
    ``month_data`` lists (all defined at script level below this class).
    """

    # Base address used to absolutize the relative links found on index pages.
    URL_HEAD = "https://www.bjcdc.org/"

    def __init__(self):
        # Pick a random desktop User-Agent so requests look less bot-like.
        # NOTE: the original list was missing a comma after the Firefox entry,
        # which silently concatenated the first two strings into one bogus UA.
        ua_list = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        ]
        self.headers = {
            "User-Agent": random.choice(ua_list),
        }

    def _fetch_html(self, url, polite_sleep=True):
        """GET *url* and return the response body decoded as UTF-8.

        A 30 s timeout keeps a dead server from hanging the scraper forever,
        and an optional random 1-3 s pause keeps the request rate polite.
        """
        response = requests.get(url=url, headers=self.headers, timeout=30)
        if polite_sleep:
            time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    def _collect_links(self, url, year, target_list):
        """Append every article link mentioning *year* found on the index page
        at *url* — absolutized against URL_HEAD — into *target_list*; return it."""
        html = self._fetch_html(url)
        pattern = '<a href="[.]*?(/.*?%s.*?)">' % year
        for path in re.findall(pattern, html):
            target_list.append(self.URL_HEAD + path)
        return target_list

    def get_Link_2023(self, url):
        """Collect 2023 bulletin links from *url* into link_list_2023."""
        return self._collect_links(url, "2023", link_list_2023)

    def get_Link_2024(self, url):
        """Collect 2024 bulletin links from *url* into link_list_2024."""
        return self._collect_links(url, "2024", link_list_2024)

    def get_content_2023(self, link):
        """Parse one 2023 weekly bulletin; append ['2023-MM-DD', count] to `data`."""
        html = self._fetch_html(link)
        # First "<n>例" occurrence is taken as the weekly case count.
        number = ''
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        # The reporting period reads "M月D日至2023年M月D日"; keep the start date.
        time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if time_list:
            period = time_list[0]
            start_part = re.match(r'\d+月\d+日?', period).group()
            month_number = re.match(r'\d{1,2}', start_part).group()
            day_number = re.findall(r'月(\d{1,2})', start_part)[0]
            # zero-pad to "2023-MM-DD" (replaces the old "%02s" + replace(' ', '0') hack)
            report_date = '2023-' + month_number.zfill(2) + '-' + day_number.zfill(2)
            if number.isdigit():
                data.append([report_date, number])

    def get_content_2024(self, link):
        """Parse one 2024 monthly bulletin; append ['YYYY年MM月', count] to month_data.

        Pages containing "周" are weekly bulletins and are skipped — only the
        monthly summaries are kept for 2024.
        """
        html = self._fetch_html(link, polite_sleep=False)
        if '周' in html:
            return None
        number = ''
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        time_list = re.findall(r'(\d+年\d+月)', html)
        if time_list:
            month_label = time_list[0]
            if number.isdigit():
                month_data.append([month_label, number])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared scraper instance plus the module-level accumulators its methods
# append into.
get_beijing_ganran_shuju = GetBeijingGanranShuju()

data = []            # 2023 weekly rows: ['YYYY-MM-DD', count]
link_list_2023 = []  # absolute URLs of 2023 bulletin articles
link_list_2024 = []  # absolute URLs of 2024 bulletin articles

# Index page 1 has no numeric suffix; pages 2-4 are index_{n}.shtml.
url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
url_list2 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_%d.shtml' % i
             for i in range(2, 5)]
url_list = url_1 + url_list2
|
|
|
|
|
# 2023: walk every index page and collect the 2023 article links.
# get_Link_2023 appends absolute URLs into the module-level link_list_2023.
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2023(url)

# Fetch and parse the 2023 articles concurrently.  (The original comment said
# "multiprocessing", but ThreadPool uses threads — fine here, the work is
# I/O-bound.)  Each worker appends a [date, count] row into the module-level
# `data` list as a side effect; get_content_2023 returns None, so
# courses_list is a list of Nones and is never read afterwards.
pool = ThreadPool(100)
courses_list = pool.map(get_beijing_ganran_shuju.get_content_2023,
                        reversed(link_list_2023))
pool.close()
pool.join()
# sort
# print(data)
|
|
|
|
|
|
|
|
|
|
# 2024: same pattern — collect the 2024 bulletin links, then parse each
# article sequentially (no thread pool for this pass, unlike 2023).
month_data = []  # monthly rows of ['YYYY年MM月', count], filled by get_content_2024
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2024(url)
# print(link_list_2024)
for x in reversed(link_list_2024):
    get_beijing_ganran_shuju.get_content_2024(x)
# print(month_data)
# print(data)
# print(type(data))
|
|
|
|
|
# Build the 2023 weekly DataFrame from the scraped [date, count] rows.
df = pd.DataFrame(data, columns=['日期', '感染数量'])
# Drop this single report date.  NOTE(review): reason undocumented —
# presumably a duplicate or erroneous bulletin; confirm before relying on it.
df = df[df['日期'] != '2023-12-26']
df['日期'] = pd.to_datetime(df['日期'])
# Chronological order for the later concat with the 2024 rows.
df_week = df.sort_values(by='日期')
# print(df_week)
# (removed: `today = date.today()` — it was never used anywhere below)
|
|
|
|
|
# Spread the 2024 monthly data onto weekly ticks: one date every 7 days,
# from 2024-01-02 (inclusive) up to the current moment.
start_date = datetime(2024, 1, 2)
end_date = datetime.now()

# Number of 7-day steps that fit in the span, start date included.
total_weeks = (end_date - start_date).days // 7 + 1
dates = [start_date + timedelta(days=7 * k) for k in range(total_weeks)]
|
|
|
|
|
|
|
|
|
|
# Monthly scraped rows.
infection_data = month_data

# Turn each monthly total into an estimated weekly count, keyed by "YYYY-MM":
# the "YYYY年MM月" label is reparsed into an ISO-style key and the total is
# divided by 4 (approximate weeks per month).
infections = {}
for month_label, monthly_total in infection_data:
    month_key = datetime.strptime(month_label, "%Y年%m月").strftime("%Y-%m")
    infections[month_key] = int(int(monthly_total) / 4)
|
|
|
|
|
|
|
|
|
|
# Pair each weekly tick with its month's estimated weekly infection count.
# (Fix: the loop variable was named `date`, shadowing the `date` class
# imported from datetime at the top of the file.)
date_infections = []
for week_start in dates:
    # Key format must match the `infections` dict keys ("YYYY-MM").
    month_key = week_start.strftime("%Y-%m")
    if month_key in infections:
        date_infections.append([week_start, infections[month_key]])
|
|
|
|
|
|
|
|
|
|
# DataFrame of the 2024 monthly-derived weekly estimates.
month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])

# Stack the 2023 weekly rows on top of the 2024 rows and switch to English
# column names for the CSV output.
df = pd.concat([df_week, month_df]).rename(
    columns={'日期': 'date', '感染数量': 'beijing_number'})
print(df)

df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')
|