You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Influenza_fund_linkage_system/spiders/北京传染病数据爬取.py

165 lines
5.4 KiB

import requests
import random
import pandas as pd
import re
from pylab import mpl
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool
# Matplotlib text settings: use the SimHei font for sans-serif text and turn
# off the Unicode minus glyph (presumably so Chinese labels and negative tick
# values render with that font -- confirm against the plots that use it).
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
class GetBeijingGanranShuju(object):
    """Scraper for the infectious-disease bulletin pages on www.bjcdc.org.

    The public methods do not keep results on the instance. They append to
    module-level lists that the calling script must define before use:

    * ``link_list_2023`` / ``link_list_2024`` -- bulletin URLs per year
    * ``data`` -- ``[date_string, case_count]`` rows from 2023 bulletins
    * ``month_data`` -- ``[month_string, case_count]`` rows from 2024 bulletins
    """

    # Browser User-Agent strings; one is chosen at random per instance.
    USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    )

    # Prefix glued in front of every path captured from an index page.
    BASE_URL = "https://www.bjcdc.org/"

    # Text that marks a 2024 page to be skipped.
    # NOTE(review): the marker checked by the previous revision was an empty
    # string, which is contained in every page, so no 2024 page was ever
    # parsed.  The intended marker is unknown; an empty value here disables
    # the filter.  Restore the real marker if some pages must be skipped.
    SKIP_MARKER_2024 = ''

    def __init__(self):
        """Pick a random User-Agent and store it in ``self.headers``."""
        self.headers = {
            "User-Agent": random.choice(self.USER_AGENTS),
        }

    def _fetch_html(self, url, pause=True):
        """Download *url* and return its body decoded as UTF-8.

        Args:
            url: Page address to request with ``self.headers``.
            pause: When true, sleep 1-3 seconds (random) after the request.

        Returns:
            The decoded page text.
        """
        # Imported here because the module has no file-level ``import time``.
        import time

        response = requests.get(url=url, headers=self.headers)
        if pause:
            time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    @classmethod
    def _extract_links(cls, html, year):
        """Return absolute URLs for every anchor whose href mentions *year*.

        Leading dots of the href are dropped and the remainder (starting at
        the first ``/``) is appended to ``BASE_URL`` unchanged.
        """
        paths = re.findall(f'<a href="[.]*?(/.*?{year}.*?)">', html)
        return [cls.BASE_URL + path for path in paths]

    @staticmethod
    def _parse_weekly_2023(html):
        """Extract ``(start_date, case_count)`` from a 2023 bulletin page.

        The case count is the first ``<digits>例`` on the page.  The date is
        the start of the first ``M月D日至2023年M月D日`` range, formatted as
        ``2023-MM-DD``.  The start date carries no year in that pattern, so
        it is always stamped 2023, even for a range that begins in the
        previous December.

        Returns:
            A ``(date_string, count_string)`` tuple, or ``None`` when either
            the count or the date range is missing from the page.
        """
        counts = re.findall(r'(\d+)例', html, re.DOTALL)
        periods = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if not counts or not periods:
            return None
        # The range always starts with "<month>月<day>日", so this matches.
        start = re.match(r'(\d+)月(\d+)日', periods[0])
        month, day = int(start.group(1)), int(start.group(2))
        return f"2023-{month:02d}-{day:02d}", counts[0]

    @staticmethod
    def _parse_monthly_2024(html, skip_marker=''):
        """Extract ``(month, case_count)`` from a 2024 bulletin page.

        Args:
            html: Decoded page text.
            skip_marker: If non-empty and present in *html*, the page is
                ignored.

        Returns:
            A ``(month_string, count_string)`` tuple such as
            ``("2024年3月", "456")``, or ``None`` when the page is skipped or
            lacks a count or a ``Y年M月`` month.
        """
        if skip_marker and skip_marker in html:
            return None
        counts = re.findall(r'(\d+)例', html, re.DOTALL)
        months = re.findall(r'(\d+年\d+月)', html)
        if not counts or not months:
            return None
        return months[0], counts[0]

    def get_Link_2023(self, url):
        """Collect 2023 bulletin links from index page *url*.

        Appends the links to the module-level ``link_list_2023`` and returns
        that list.
        """
        link_list_2023.extend(self._extract_links(self._fetch_html(url), 2023))
        return link_list_2023

    def get_Link_2024(self, url):
        """Collect 2024 bulletin links from index page *url*.

        Appends the links to the module-level ``link_list_2024`` and returns
        that list.
        """
        link_list_2024.extend(self._extract_links(self._fetch_html(url), 2024))
        return link_list_2024

    def get_content_2023(self, link):
        """Fetch one 2023 bulletin and append its row to module-level ``data``.

        Nothing is appended when the page lacks a case count or a date range.
        """
        record = self._parse_weekly_2023(self._fetch_html(link))
        if record is not None:
            data.append(list(record))

    def get_content_2024(self, link):
        """Fetch one 2024 bulletin and append its row to ``month_data``.

        Nothing is appended when the page is skipped (see
        ``SKIP_MARKER_2024``) or lacks a case count or a month.
        """
        # No post-request pause here, matching the previous behaviour.
        html = self._fetch_html(link, pause=False)
        record = self._parse_monthly_2024(html, self.SKIP_MARKER_2024)
        if record is not None:
            month_data.append(list(record))
# Create the scraper instance used for every request below.
get_beijing_ganran_shuju = GetBeijingGanranShuju()

# Module-level accumulators; the scraper methods append to these by name.
data, link_list_2023, link_list_2024 = [], [], []
month_data = []

# Index pages of the bulletin list: the first page plus pages 2-4.
url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
url_list2 = [
    f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml'
    for i in range(2, 5)
]
url_list = url_1 + url_list2

# ---- 2023 bulletins ----
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2023(url)

# Fetch the bulletin pages concurrently with a pool of threads (not
# processes).  map() blocks until every page is handled, so leaving the
# ``with`` block cannot cut work short.
with ThreadPool(100) as pool:
    pool.map(get_beijing_ganran_shuju.get_content_2023,
             reversed(link_list_2023))

# ---- 2024 bulletins ----
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2024(url)
for link in reversed(link_list_2024):
    get_beijing_ganran_shuju.get_content_2024(link)

# ---- 2023 rows -> DataFrame sorted by date ----
df = pd.DataFrame(data, columns=['日期', '感染数量'])
# Drop the row stamped 2023-12-26.  Presumably it is the report whose period
# starts in the previous December and was labelled with year 2023 by the
# parser -- confirm.  ``.copy()`` keeps the assignments below from writing
# into a slice of the unfiltered frame.
df = df[df['日期'] != '2023-12-26'].copy()
df['日期'] = pd.to_datetime(df['日期'])
# The scraped counts are digit strings; store them as integers so the column
# has the same type as the 2024 rows built below.
df['感染数量'] = df['感染数量'].astype(int)
df_week = df.sort_values(by='日期')

# ---- 2024 monthly totals -> one row per week ----
# Weekly dates from 2024-01-02 up to now, seven days apart.
start_date = datetime(2024, 1, 2)
end_date = datetime.now()
dates = []
week_start = start_date
while week_start <= end_date:
    dates.append(week_start)
    week_start += timedelta(days=7)

# Map "YYYY-MM" -> a quarter of that month's total.
# NOTE: this is an approximation; a month that contains five of the weekly
# dates above contributes five quarter-shares.
infections = {
    datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(total) // 4
    for month, total in month_data
}

# Pair every weekly date with its month's share; weeks in months without
# scraped data are left out.
date_infections = []
for week in dates:
    month_key = week.strftime("%Y-%m")
    if month_key in infections:
        date_infections.append([week, infections[month_key]])

month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])

# ---- combine 2023 and 2024 rows, then export ----
df = pd.concat([df_week, month_df])
df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
print(df)
df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')