import asyncio
import os
import random
import re
import time
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool

import django
import matplotlib.pyplot as plt
import pandas as pd
import requests
from django.db import IntegrityError
from lxml import etree
from pylab import mpl

# Django has to be configured before any model module is imported, otherwise
# running this module outside an already-initialised Django process fails
# while importing the models.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
django.setup()

from .models import BeijingWeekData  # noqa: E402
# user_agents_pool.py must live in this package and define agent_list.
from .user_agents_pool import agent_list  # noqa: E402

mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False

# Seconds to wait for the remote server before giving up on a request.
REQUEST_TIMEOUT = 30


class GetBeijingGanranShuju(object):
    """Scrape infection counts from the bjcdc.org bulletin pages.

    Attributes:
        headers: HTTP headers sent with every request (random User-Agent).
        data: ``[date_string, count_string]`` rows collected by
            ``get_content_2023``; dates are formatted ``2023-MM-DD``.
        month_data: ``[month_string, count_string]`` rows collected by
            ``get_content_2024``; months look like ``2024年1月``.
        link_list_2023 / link_list_2024: absolute bulletin URLs gathered by
            the ``get_Link_*`` methods.
    """

    def __init__(self):
        self.headers = {
            "User-Agent": random.choice(agent_list),
        }
        self.data = []
        # Initialised here so get_content_2024 works without the caller
        # having to attach the attribute first.
        self.month_data = []
        self.link_list_2023 = []
        self.link_list_2024 = []

    def _fetch_html(self, url):
        """Download *url*, pause 1-3 s to throttle, and return the UTF-8 text.

        Raises:
            requests.exceptions.RequestException: on connection problems or
                when the server does not answer within REQUEST_TIMEOUT.
        """
        response = requests.get(url=url, headers=self.headers, timeout=REQUEST_TIMEOUT)
        time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    def _collect_links(self, url, pattern, bucket):
        """Append every *pattern* match found on *url* to *bucket* as an absolute URL."""
        url_head = "https://www.bjcdc.org/"
        html = self._fetch_html(url)
        bucket.extend(url_head + href for href in re.findall(pattern, html))
        return bucket

    def get_Link_2023(self, url):
        """Collect bulletin links from the index page *url* into ``link_list_2023``.

        Returns:
            The accumulated ``link_list_2023`` list.
        """
        # NOTE(review): the link pattern is an empty string, which matches at
        # every position of the page and therefore only yields bare base URLs.
        # It looks like the real expression was lost; restore it before
        # relying on these links.
        return self._collect_links(url, '', self.link_list_2023)

    def get_Link_2024(self, url):
        """Collect bulletin links from the index page *url* into ``link_list_2024``.

        Returns:
            The accumulated ``link_list_2024`` list.
        """
        # NOTE(review): same empty pattern as get_Link_2023 - see the note there.
        return self._collect_links(url, '', self.link_list_2024)

    def get_content_2023(self, link):
        """Extract one weekly row from the bulletin at *link* into ``self.data``.

        The row is ``[start_date, count]`` where ``start_date`` is the first
        day of the reported span formatted ``2023-MM-DD`` and ``count`` is the
        first ``<digits>例`` figure on the page (kept as a string). Nothing is
        appended unless both a count and a date span are present.
        """
        html = self._fetch_html(link)
        counts = re.findall(r'(\d+)例', html)
        spans = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if not counts or not spans:
            return
        # Only the first one or two digits of the month/day runs are used.
        month, day = re.match(r'(\d{1,2})\d*月(\d{1,2})', spans[0]).groups()
        self.data.append([f'2023-{int(month):02d}-{int(day):02d}', counts[0]])

    def get_content_2024(self, link):
        """Extract one monthly row from the bulletin at *link* into ``self.month_data``.

        Pages containing the character ``周`` are skipped (presumably weekly
        bulletins - confirm against the site). For the remaining pages the
        row is ``[month, count]`` using the first ``<digits>年<digits>月`` and
        the first ``<digits>例`` figure; nothing is appended unless both exist.
        """
        html = self._fetch_html(link)
        if '周' in html:
            return
        counts = re.findall(r'(\d+)例', html)
        months = re.findall(r'(\d+年\d+月)', html)
        if counts and months:
            self.month_data.append([months[0], counts[0]])


def _spread_monthly_to_weeks(month_data, start, end):
    """Return ``[date, count]`` rows, one per 7-day step from *start* to *end*.

    Each monthly total is divided by four (integer division) and that value
    is used for every step that falls in the month; steps in months without
    data are omitted. If a month appears more than once the last entry wins.
    """
    per_week = {
        datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(total) // 4
        for month, total in month_data
    }
    rows = []
    current = start
    while current <= end:
        month_key = current.strftime("%Y-%m")
        if month_key in per_week:
            rows.append([current, per_week[month_key]])
        current += timedelta(days=7)
    return rows


def get_beijing_zhoubao():
    """Scrape the Beijing bulletins and store the results in ``BeijingWeekData``.

    Weekly 2023 figures are scraped concurrently, monthly 2024 figures are
    scraped sequentially and spread over weekly dates starting 2024-01-02,
    and every resulting row is inserted with ``get_or_create`` keyed on the
    date, so existing records are left untouched.
    """
    scraper = GetBeijingGanranShuju()

    url_1 = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
    url_list2 = [f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml' for i in range(2, 5)]
    url_list = url_1 + url_list2

    # 2023: gather the links, then fetch the bulletins with a thread pool
    # (the work is network-bound).
    for url in url_list:
        scraper.get_Link_2023(url)
    pool = ThreadPool(100)
    try:
        pool.map(scraper.get_content_2023, reversed(scraper.link_list_2023))
    finally:
        # Always release the worker threads, even if a fetch raised.
        pool.close()
        pool.join()

    # 2024: monthly bulletins, fetched one at a time.
    for url in url_list:
        scraper.get_Link_2024(url)
    for link in reversed(scraper.link_list_2024):
        scraper.get_content_2024(link)

    df_week = pd.DataFrame(scraper.data, columns=['日期', '感染数量'])
    # The 2023-12-26 row is dropped (reason not visible here - TODO confirm).
    # .copy() keeps the column assignment below off a filtered view.
    df_week = df_week[df_week['日期'] != '2023-12-26'].copy()
    df_week['日期'] = pd.to_datetime(df_week['日期'])
    df_week = df_week.sort_values(by='日期')

    month_rows = _spread_monthly_to_weeks(scraper.month_data, datetime(2024, 1, 2), datetime.now())
    month_df = pd.DataFrame(month_rows, columns=['日期', '感染数量'])

    df = pd.concat([df_week, month_df])
    df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
    print(df)

    for record_date, infections in df.values.tolist():
        _, created = BeijingWeekData.objects.get_or_create(
            date=record_date,
            defaults={'infection_number': infections},
        )
        if created:
            print(f"Added new record for date {record_date} with infections {infections}")
        else:
            print(f"Record for date {record_date} already exists.")
    print('成功载入数据库')