You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Influenza_fund_linkage_system/spiders/北京传染病数据爬取.py

165 lines
5.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import random
import re
import time
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool

import pandas as pd
import requests
from pylab import mpl
# Configure matplotlib: use SimHei so CJK labels render, and keep the
# ASCII minus sign (the default Unicode minus is missing from SimHei).
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
class GetBeijingGanranShuju(object):
    """Scraper for Beijing CDC (bjcdc.org) infectious-disease bulletins.

    Link collection and content parsing append into the module-level lists
    ``link_list_2023`` / ``link_list_2024`` / ``data`` / ``month_data``
    defined by the driver script below this class.
    """

    def __init__(self):
        # Pool of desktop-browser User-Agents; one is chosen at random per
        # instance and sent with every request.
        # BUG FIX: the first two entries were missing a separating comma, so
        # implicit string concatenation merged them into one malformed UA.
        ua_list = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        ]
        self.headers = {
            "User-Agent": random.choice(ua_list),
        }

    def _collect_links(self, url, year, out_list):
        """Fetch *url*, extract bulletin paths containing *year*, prefix the
        site root, and append the absolute URLs to *out_list* (returned)."""
        response = requests.get(url=url, headers=self.headers)
        time.sleep(random.uniform(1, 3))  # polite crawl delay
        html = response.content.decode("utf-8")
        pattern = '<a href="[.]*?(/.*?%s.*?)">' % year
        for path in re.findall(pattern, html):
            out_list.append("https://www.bjcdc.org/" + path)
        return out_list

    def get_Link_2023(self, url):
        """Collect 2023 bulletin links into module-level ``link_list_2023``."""
        return self._collect_links(url, "2023", link_list_2023)

    def get_Link_2024(self, url):
        """Collect 2024 bulletin links into module-level ``link_list_2024``."""
        return self._collect_links(url, "2024", link_list_2024)

    def get_content_2023(self, link):
        """Parse one 2023 weekly bulletin.

        Extracts the first case count ('N例') and the week-start date from the
        'M月D日至2023年M月D日' span, normalises it to 'YYYY-MM-DD', and appends
        ``[date, count]`` to the module-level ``data`` list.
        """
        number = ''
        response = requests.get(url=link, headers=self.headers)
        time.sleep(random.uniform(1, 3))  # polite crawl delay
        html = response.content.decode("utf-8")
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        time_list = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if time_list:
            span = time_list[0]
            # Leading 'M月D日' part of the span is the week-start date.
            start = re.match(r'\d+月\d+日?', span).group()
            month_number = re.match(r'\d{1,2}', start).group()
            day_number = re.findall(r'月(\d{1,2})', start)[0]
            # zfill(2) replaces the old "%02s" + replace(' ', '0') padding hack.
            week_start = '2023-' + month_number.zfill(2) + '-' + day_number.zfill(2)
            if number.isdigit():
                data.append([week_start, number])

    def get_content_2024(self, link):
        """Parse one 2024 monthly bulletin.

        Extracts the first case count ('N例') and the 'YYYY年MM月' label and
        appends ``[label, count]`` to the module-level ``month_data`` list.

        BUG FIX: the original started with ``if '' in html: return None`` —
        the empty string is a substring of every string, so the method always
        returned before parsing anything. The guard (presumably a lost
        marker-string check — TODO confirm the intended marker) was removed.
        """
        number = ''
        response = requests.get(url=link, headers=self.headers)
        html = response.content.decode("utf-8")
        number_list = re.findall(r'(\d+)例', html, re.DOTALL)
        if number_list:
            number = number_list[0]
        time_list = re.findall(r'(\d+年\d+月)', html)
        if time_list:
            label = time_list[0]
            if number.isdigit():
                month_data.append([label, number])
# Instantiate the Beijing infectious-disease scraper.
get_beijing_ganran_shuju = GetBeijingGanranShuju()
# Shared accumulators that the scraper's methods append into.
data, link_list_2023, link_list_2024 = [], [], []
# Bulletin index pages: the front page plus numbered pages 2-4.
url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
url_list2 = [
    'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_%d.shtml' % page
    for page in range(2, 5)
]
url_list = url_1 + url_list2
# --- 2023 weekly bulletins ---
# Gather every 2023 bulletin link from each index page.
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2023(url)
# Fetch the bulletin pages concurrently; each worker appends into ``data``.
pool = ThreadPool(100)
courses_list = pool.map(
    get_beijing_ganran_shuju.get_content_2023,
    reversed(link_list_2023),
)
pool.close()
pool.join()
# --- 2024 monthly bulletins ---
month_data = []
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2024(url)
# Fetch sequentially, oldest bulletin first; results go into ``month_data``.
for link in reversed(link_list_2024):
    get_beijing_ganran_shuju.get_content_2024(link)
# --- Build the weekly DataFrame from the 2023 data ---
df = pd.DataFrame(data, columns=['日期', '感染数量'])
# Drop one known-bad bulletin row — presumably a duplicate or mis-parsed
# entry; TODO confirm why 2023-12-26 is excluded.
df = df[df['日期'] != '2023-12-26']
df['日期'] = pd.to_datetime(df['日期'])
df_week = df.sort_values(by='日期')
today = date.today()  # kept for compatibility; not used below
# --- Convert the 2024 monthly totals to approximate weekly rows ---
# Week-start dates from 2024-01-02 through today, 7 days apart.
start_date = datetime(2024, 1, 2)
end_date = datetime.now()
dates = []
while start_date <= end_date:
    dates.append(start_date)
    start_date += timedelta(days=7)
infection_data = month_data
# Map 'YYYY-MM' -> approximate weekly count (monthly total divided by 4).
infections = {
    datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(int(total) / 4)
    for month, total in infection_data
}
# Assign each generated week the approximate count of its month.
# BUG FIX: the loop variable was named ``date``, shadowing the imported
# ``datetime.date`` class; renamed to ``week_start``.
date_infections = []
for week_start in dates:
    month_key = week_start.strftime("%Y-%m")
    if month_key in infections:
        date_infections.append([week_start, infections[month_key]])
month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])
# --- Merge weekly (2023) and monthly-derived (2024) data and export ---
df = pd.concat([df_week, month_df])
df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
print(df)
df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')