# Influenza_fund_linkage_system/spiders/北京传染病数据爬取.py

import requests
import random
import pandas as pd
import re
from pylab import mpl
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool

# Matplotlib text setup: use the SimHei font for sans-serif text and turn off
# the Unicode minus glyph on axes.
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})


class GetBeijingGanranShuju(object):
    """Scraper for the Beijing CDC (www.bjcdc.org) bulletin pages.

    The public methods do not hand parsed records back to the caller.
    Instead they append to module-level lists, which must exist before the
    methods are called:

    * ``link_list_2023`` / ``link_list_2024`` -- article URLs,
    * ``data`` -- ``[date_string, count_string]`` rows for 2023,
    * ``month_data`` -- ``[month_string, count_string]`` rows for 2024.
    """

    # Site root. The hrefs captured by the link pattern already start with
    # "/", so the root must not end with one (avoids "//" in the final URL).
    URL_HEAD = "https://www.bjcdc.org"

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Pick one User-Agent at random and store it in ``self.headers``."""
        ua_list = [
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
        ]
        self.headers = {
            "User-Agent": random.choice(ua_list),
        }

    def _fetch_html(self, url, pause=True):
        """Download *url* and return the body decoded as UTF-8.

        When *pause* is true, sleep 1-3 seconds after the request so that
        consecutive requests are spaced out.
        """
        import time

        response = requests.get(url=url,
                                headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        if pause:
            time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    def _collect_links(self, url, year, sink):
        """Append every article URL on page *url* mentioning *year* to *sink*.

        Returns *sink* (the same list object that was passed in).
        """
        html = self._fetch_html(url)
        # Leading dots of the href are skipped; the captured part starts at
        # the first "/" and must contain the year.
        pattern = '<a href="[.]*?(/.*?' + str(year) + '.*?)">'
        for path in re.findall(pattern, html):
            sink.append(self.URL_HEAD + path)
        return sink

    @staticmethod
    def _parse_weekly_report(html):
        """Return ``['2023-MM-DD', count]`` for a weekly bulletin, or None.

        The date is built from the month and day that open the first
        "M月D日至2023年M月D日" range found in *html*; the count is the first
        number followed by "例". Both must be present.

        NOTE(review): the year is always written as 2023, even if the range
        starts in the previous year.
        """
        count = re.search(r'(\d+)例', html)
        period = re.search(r'(\d+)月(\d+)日至2023年\d+月\d+日', html)
        if count is None or period is None:
            return None
        month, day = period.group(1), period.group(2)
        return ['2023-' + month.zfill(2) + '-' + day.zfill(2), count.group(1)]

    @staticmethod
    def _parse_monthly_report(html):
        """Return ``['YYYY年M月', count]`` for a monthly bulletin, or None.

        Pages containing the character "周" are skipped. Otherwise the first
        "N年N月" text and the first number followed by "例" are used; both
        must be present.
        """
        if '周' in html:
            return None
        count = re.search(r'(\d+)例', html)
        month = re.search(r'(\d+年\d+月)', html)
        if count is None or month is None:
            return None
        return [month.group(1), count.group(1)]

    def get_Link_2023(self, url):
        """Add the 2023 article links found on *url* to ``link_list_2023``.

        Returns the module-level ``link_list_2023`` list.
        """
        return self._collect_links(url, 2023, link_list_2023)

    def get_Link_2024(self, url):
        """Add the 2024 article links found on *url* to ``link_list_2024``.

        Returns the module-level ``link_list_2024`` list.
        """
        return self._collect_links(url, 2024, link_list_2024)

    def get_content_2023(self, link):
        """Fetch a 2023 bulletin and append its record to ``data``.

        Nothing is appended when the page lacks either the date range or
        the case count. Always returns None.
        """
        record = self._parse_weekly_report(self._fetch_html(link))
        if record is not None:
            data.append(record)

    def get_content_2024(self, link):
        """Fetch a 2024 bulletin and append its record to ``month_data``.

        Nothing is appended for pages containing "周" or pages that lack
        either the month or the case count. Always returns None.
        """
        # No pause here: this method is called sequentially by the script
        # and never slept between requests.
        record = self._parse_monthly_report(self._fetch_html(link, pause=False))
        if record is not None:
            month_data.append(record)

# Create the scraper and the module-level accumulators its methods append to.
get_beijing_ganran_shuju = GetBeijingGanranShuju()
data, link_list_2023, link_list_2024 = [], [], []
month_data = []

# Bulletin index pages: the first page plus the numbered pages 2-4.
url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
url_list2 = [
    f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml'
    for i in range(2, 5)
]
url_list = url_1 + url_list2

# 2023: gather the article links from every index page.
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2023(url)

# Fetch the 2023 articles concurrently with a thread pool. The workers append
# to `data`; the return value of map() is not needed.
pool = ThreadPool(100)
try:
    pool.map(get_beijing_ganran_shuju.get_content_2023,
             reversed(link_list_2023))
finally:
    # Always release the worker threads, even if a download failed.
    pool.close()
    pool.join()

# 2024: gather the links, then fetch the articles one by one.
for url in url_list:
    get_beijing_ganran_shuju.get_Link_2024(url)
for x in reversed(link_list_2024):
    get_beijing_ganran_shuju.get_content_2024(x)

# Weekly (2023) records -> DataFrame sorted by date.
df = pd.DataFrame(data, columns=['日期', '感染数量'])
# Hard-coded exclusion of the row dated 2023-12-26.
# NOTE(review): presumably a bulletin whose period starts in the previous
# year and therefore gets a wrong 2023 date -- confirm.
df = df[df['日期'] != '2023-12-26'].copy()
df['日期'] = pd.to_datetime(df['日期'])
# The scraped counts are digit strings; make them integers so the column has
# the same type as the monthly estimates it is concatenated with below.
df['感染数量'] = df['感染数量'].astype(int)
df_week = df.sort_values(by='日期')

# Convert the monthly (2024) data into weekly rows.
# One date every 7 days from 2024-01-02 up to now.
start_date = datetime(2024, 1, 2)
end_date = datetime.now()

dates = []
week_cursor = start_date
while week_cursor <= end_date:
    dates.append(week_cursor)
    week_cursor += timedelta(days=7)

# Monthly records collected by the scraper.
infection_data = month_data

# Map "YYYY-MM" -> estimated weekly count (monthly total divided by 4).
infections = {}
for month, total in infection_data:
    try:
        month_key = datetime.strptime(month, "%Y年%m月").strftime("%Y-%m")
    except ValueError:
        # The scraped text did not look like a real year/month; skip it
        # instead of aborting the whole run.
        print(f'skip unparsable month record: {month!r}')
        continue
    infections[month_key] = int(total) // 4

# One [date, count] row for every generated date whose month has data.
date_infections = [
    [week_date, infections[week_date.strftime("%Y-%m")]]
    for week_date in dates
    if week_date.strftime("%Y-%m") in infections
]

month_df = pd.DataFrame(date_infections, columns=['日期', '感染数量'])

# Combine the weekly 2023 rows with the estimated weekly 2024 rows.
df = pd.concat([df_week, month_df])
df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
print(df)

df.to_csv('beijin_zhoubao.csv', encoding="utf_8")
print('成功爬取北京传染病数据并保存在beijin_zhoubao.csv中')