You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
149 lines
5.4 KiB
149 lines
5.4 KiB
import asyncio
|
|
import os
|
|
import random
|
|
import re
|
|
import time
|
|
from datetime import datetime, timedelta, date
|
|
from multiprocessing.pool import ThreadPool
|
|
|
|
import django
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
import requests
|
|
from django.db import IntegrityError
|
|
from lxml import etree
|
|
from pylab import mpl
|
|
|
|
from .models import BeijingWeekData
|
|
from .user_agents_pool import agent_list # 确保 user_agents_pool.py 文件在当前目录,并包含 agent_list
|
|
|
|
# Point Django at the project settings (only if the variable is not already
# set) and initialise the app registry.
# NOTE(review): the model import earlier in this file executes before
# django.setup(); that only works when Django is already configured by the
# importing process — confirm this module is never imported standalone.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
django.setup()

# Matplotlib: use the SimHei font for sans-serif text and disable the Unicode
# minus glyph (presumably so CJK labels and negative ticks render correctly —
# confirm the font is installed on the target machine).
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
|
|
|
|
class GetBeijingGanranShuju(object):
    """Scrape infection bulletins from the Beijing CDC site (www.bjcdc.org).

    Attributes:
        headers: HTTP headers sent with every request; the User-Agent is
            picked at random from ``agent_list`` once per instance.
        data: ``[date_string, count_string]`` rows collected by
            ``get_content_2023`` (date formatted ``2023-MM-DD``).
        month_data: ``[month_label, count_string]`` rows collected by
            ``get_content_2024`` (label as found in the page, e.g.
            ``2024年1月``).
        link_list_2023 / link_list_2024: bulletin URLs accumulated by
            ``get_Link_2023`` / ``get_Link_2024``.
    """

    # Prefix glued in front of every captured href path.  The captured path
    # already starts with "/", so the resulting URL contains "//" after the
    # host; this is kept as-is to preserve the URLs the scraper has always
    # requested.
    URL_HEAD = "https://www.bjcdc.org/"

    def __init__(self):
        user_agent = random.choice(agent_list)
        self.headers = {
            "User-Agent": user_agent,
        }
        self.data = []
        # Initialised here so get_content_2024 works even when the caller
        # does not inject the attribute beforehand.
        self.month_data = []
        self.link_list_2023 = []
        self.link_list_2024 = []

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _fetch_html(self, url):
        """GET *url*, pause 1-3 seconds, and return the body decoded as UTF-8."""
        response = requests.get(url=url, headers=self.headers)
        # Random delay after each request to throttle the crawl.
        time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    @classmethod
    def _extract_links(cls, html, year):
        """Return absolute URLs for every ``<a href>`` whose path contains *year*.

        Leading dots of a relative href (``./`` or ``../``) are dropped by the
        pattern; the remaining path is appended to ``URL_HEAD``.
        """
        pattern = f'<a href="[.]*?(/.*?{year}.*?)">'
        return [cls.URL_HEAD + path for path in re.findall(pattern, html)]

    @staticmethod
    def _first_case_count(html):
        """Return the first number directly followed by '例', or '' if none."""
        counts = re.findall(r'(\d+)例', html)
        return counts[0] if counts else ''

    @classmethod
    def _parse_weekly_report(cls, html):
        """Parse a 2023 weekly bulletin.

        Returns:
            ``['2023-MM-DD', count_string]`` where the date is the first day
            of the reported range, or ``None`` when either the date range or
            the case count is missing from *html*.
        """
        number = cls._first_case_count(html)
        ranges = re.findall(r'(\d+月\d+日至2023年\d+月\d+日)', html)
        if not ranges or not number.isdigit():
            return None
        # Only the start of the range ("M月D日") is used; the year is fixed
        # to 2023 regardless of what precedes the match in the page.
        start = re.match(r'(\d{1,2})\d*月(\d{1,2})', ranges[0])
        month_number, day_number = start.group(1), start.group(2)
        report_date = f'2023-{int(month_number):02d}-{int(day_number):02d}'
        return [report_date, number]

    @classmethod
    def _parse_monthly_report(cls, html):
        """Parse a 2024 monthly bulletin.

        Pages containing the character '周' (week) are ignored.

        Returns:
            ``[month_label, count_string]`` with the first ``YYYY年M月`` label
            found in *html*, or ``None`` when the page is skipped or lacks a
            label or case count.
        """
        if '周' in html:
            return None
        number = cls._first_case_count(html)
        months = re.findall(r'(\d+年\d+月)', html)
        if not months or not number.isdigit():
            return None
        return [months[0], number]

    # ------------------------------------------------------------------
    # Public API (names and return values unchanged)
    # ------------------------------------------------------------------
    def get_Link_2023(self, url):
        """Collect 2023 bulletin links from the index page *url*.

        Returns the accumulated ``link_list_2023`` (the same list object on
        every call).
        """
        self.link_list_2023.extend(self._extract_links(self._fetch_html(url), 2023))
        return self.link_list_2023

    def get_Link_2024(self, url):
        """Collect 2024 bulletin links from the index page *url*.

        Returns the accumulated ``link_list_2024`` (the same list object on
        every call).
        """
        self.link_list_2024.extend(self._extract_links(self._fetch_html(url), 2024))
        return self.link_list_2024

    def get_content_2023(self, link):
        """Fetch one 2023 weekly bulletin and append its row to ``self.data``.

        Nothing is appended when the page has no recognisable date range or
        case count.
        """
        record = self._parse_weekly_report(self._fetch_html(link))
        if record is not None:
            self.data.append(record)

    def get_content_2024(self, link):
        """Fetch one 2024 bulletin and append its row to ``self.month_data``.

        Nothing is appended for pages mentioning '周' or lacking a month label
        or case count.
        """
        record = self._parse_monthly_report(self._fetch_html(link))
        if record is not None:
            self.month_data.append(record)
|
|
|
|
def _weekly_rows_from_monthly(month_data, start, end):
    """Spread monthly totals evenly over weekly dates.

    Args:
        month_data: iterable of ``(label, total)`` pairs where *label* looks
            like ``2024年1月`` and *total* is an int or digit string.
        start: first weekly date (``datetime``).
        end: last date to consider, inclusive (``datetime``).

    Returns:
        A list of ``[date, total // 4]`` rows, one for every date
        ``start, start + 7 days, ...`` up to *end* whose month has an entry
        in *month_data*.  Each month is treated as four weeks.
    """
    per_week = {
        datetime.strptime(label, "%Y年%m月").strftime("%Y-%m"): int(total) // 4
        for label, total in month_data
    }
    rows = []
    current = start
    while current <= end:
        month_key = current.strftime("%Y-%m")
        if month_key in per_week:
            rows.append([current, per_week[month_key]])
        current += timedelta(days=7)
    return rows


def get_beijing_zhoubao():
    """Scrape Beijing CDC bulletins and store them as ``BeijingWeekData`` rows.

    Steps:
      1. Crawl the bulletin index pages and collect 2023 and 2024 links.
      2. Download 2023 weekly bulletins concurrently and 2024 bulletins
         sequentially.
      3. Build one DataFrame: weekly 2023 rows plus 2024 monthly totals
         spread over weekly dates starting 2024-01-02.
      4. Insert every row whose date is not yet in the database
         (``get_or_create``); existing rows are left untouched.

    Progress is reported via ``print``.  Returns ``None``.
    """
    # Instance of the Beijing infectious-disease scraper.
    scraper = GetBeijingGanranShuju()
    index_urls = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml'] + [
        f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml' for i in range(2, 5)
    ]

    # --- 2023: weekly bulletins -------------------------------------
    for url in index_urls:
        scraper.get_Link_2023(url)

    weekly_links = list(reversed(scraper.link_list_2023))
    if weekly_links:
        # Thread pool (not processes): the work is network-bound.  The
        # context manager releases the workers even if a download raises.
        with ThreadPool(min(100, len(weekly_links))) as pool:
            pool.map(scraper.get_content_2023, weekly_links)

    # --- 2024: monthly bulletins ------------------------------------
    scraper.month_data = []
    for url in index_urls:
        scraper.get_Link_2024(url)
    for link in reversed(scraper.link_list_2024):
        scraper.get_content_2024(link)

    # --- Assemble the weekly frame ----------------------------------
    weekly_df = pd.DataFrame(scraper.data, columns=['日期', '感染数量'])
    # The row dated 2023-12-26 is dropped (presumably a mis-dated bulletin —
    # confirm).  .copy() avoids assigning into a view of the unfiltered frame.
    weekly_df = weekly_df[weekly_df['日期'] != '2023-12-26'].copy()
    weekly_df['日期'] = pd.to_datetime(weekly_df['日期'])
    # Scraped counts are digit strings; store integers like the monthly rows.
    weekly_df['感染数量'] = weekly_df['感染数量'].astype(int)
    weekly_df = weekly_df.sort_values(by='日期')

    # --- Spread 2024 monthly totals over weekly dates ---------------
    month_rows = _weekly_rows_from_monthly(
        scraper.month_data, datetime(2024, 1, 2), datetime.now()
    )
    month_df = pd.DataFrame(month_rows, columns=['日期', '感染数量'])

    df = pd.concat([weekly_df, month_df])
    df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
    print(df)

    # --- Persist ----------------------------------------------------
    for data in df.values.tolist():
        obj, created = BeijingWeekData.objects.get_or_create(
            date=data[0], defaults={'infection_number': data[1]}
        )
        if created:
            print(f"Added new record for date {data[0]} with infections {data[1]}")
        else:
            print(f"Record for date {data[0]} already exists.")
    print('成功载入数据库')
|
|
|