You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Influenza_fund_linkage_system/app_test/beijing_zhoubao_spider.py

149 lines
5.4 KiB

5 months ago
import asyncio
import os
import random
import re
import time
from datetime import datetime, timedelta, date
from multiprocessing.pool import ThreadPool
import django
import matplotlib.pyplot as plt
import pandas as pd
import requests
from django.db import IntegrityError
from lxml import etree
from pylab import mpl
# Project-local imports: the Django model that stores the results and the
# pool of User-Agent strings used for outgoing requests.
# NOTE(review): `.models` is imported before `django.setup()` below runs; when
# this module is executed outside an already-configured Django process that
# ordering may fail with AppRegistryNotReady -- confirm how it is launched.
from .models import BeijingWeekData
from .user_agents_pool import agent_list # Make sure user_agents_pool.py is in the current directory and defines agent_list
# Point Django at the project settings (only if the variable is not already
# set in the environment) and initialise the app registry.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
django.setup()
# Matplotlib configuration: use the SimHei font for sans-serif text
# (presumably so Chinese labels render) and draw the minus sign as an ASCII
# hyphen instead of the Unicode minus glyph.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
class GetBeijingGanranShuju(object):
    """Scrape infection-count bulletins from the bjcdc.org listing pages.

    Usage pattern (as driven by the caller in this module): call
    ``get_Link_2023`` / ``get_Link_2024`` on each listing page to collect
    bulletin URLs, then call ``get_content_2023`` / ``get_content_2024`` on
    each collected URL to extract ``[date, count]`` records.

    Instance attributes:
        headers: request headers carrying a randomly chosen User-Agent.
        data: ``['2023-MM-DD', count_str]`` records from 2023 bulletins.
        month_data: ``['YYYY年M月', count_str]`` records from 2024 bulletins.
        link_list_2023 / link_list_2024: collected bulletin URLs.
    """

    # Prefix prepended to every captured href path.
    URL_HEAD = "https://www.bjcdc.org/"
    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the calling thread forever.
    REQUEST_TIMEOUT = 30
    # Pages containing this text are ignored by get_content_2024.  The
    # original guard compared against the empty string, which is contained in
    # every page, so no monthly record was ever stored; the real marker text
    # appears to have been lost.  Empty means "skip nothing".
    MONTHLY_SKIP_MARKER = ''

    def __init__(self):
        self.headers = {
            "User-Agent": random.choice(agent_list),
        }
        self.data = []
        # Initialised here so get_content_2024 works without the caller
        # having to attach the attribute first.
        self.month_data = []
        self.link_list_2023 = []
        self.link_list_2024 = []

    def _fetch_html(self, url):
        """Download *url*, pause 1-3 seconds, and return the body as UTF-8 text."""
        response = requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Random pause after every request to space out traffic.
        time.sleep(random.uniform(1, 3))
        return response.content.decode("utf-8")

    def _collect_links(self, url, year, bucket):
        """Append to *bucket* every anchor href on *url* whose path contains *year*."""
        html = self._fetch_html(url)
        for path in re.findall(f'<a href="[.]*?(/.*?{year}.*?)">', html):
            bucket.append(self.URL_HEAD + path)
        return bucket

    def get_Link_2023(self, url):
        """Collect 2023 bulletin links from listing page *url*.

        Returns the accumulated ``link_list_2023`` (links from earlier calls
        are kept).
        """
        return self._collect_links(url, 2023, self.link_list_2023)

    def get_Link_2024(self, url):
        """Collect 2024 bulletin links from listing page *url*.

        Returns the accumulated ``link_list_2024`` (links from earlier calls
        are kept).
        """
        return self._collect_links(url, 2024, self.link_list_2024)

    @staticmethod
    def parse_weekly_report(html):
        """Extract ``['2023-MM-DD', count]`` from a 2023 weekly bulletin page.

        The count is the first run of digits followed by ``例``; the date is
        the start of the first ``M月D日至2023年M月D日`` period found.  Returns
        ``None`` when either piece is missing.

        NOTE: the start date is always labelled with year 2023, even when the
        matched period began in the previous year.
        """
        count_match = re.search(r'(\d+)例', html)
        period_match = re.search(r'(\d+)月(\d+)日至2023年\d+月\d+日', html)
        if count_match is None or period_match is None:
            return None
        month, day = period_match.groups()
        return [f'2023-{int(month):02d}-{int(day):02d}', count_match.group(1)]

    @staticmethod
    def parse_monthly_report(html, skip_marker=''):
        """Extract ``['YYYY年M月', count]`` from a 2024 monthly bulletin page.

        The count is the first run of digits followed by ``例``; the month is
        the first ``<digits>年<digits>月`` found.  Returns ``None`` when either
        piece is missing, or when a non-empty *skip_marker* occurs in *html*.
        """
        if skip_marker and skip_marker in html:
            return None
        count_match = re.search(r'(\d+)例', html)
        month_match = re.search(r'(\d+年\d+月)', html)
        if count_match is None or month_match is None:
            return None
        return [month_match.group(1), count_match.group(1)]

    def get_content_2023(self, link):
        """Fetch one 2023 bulletin and append its record to ``self.data``."""
        record = self.parse_weekly_report(self._fetch_html(link))
        if record is not None:
            self.data.append(record)

    def get_content_2024(self, link):
        """Fetch one 2024 bulletin and append its record to ``self.month_data``."""
        record = self.parse_monthly_report(
            self._fetch_html(link), self.MONTHLY_SKIP_MARKER
        )
        if record is not None:
            self.month_data.append(record)
def _estimate_weekly_from_monthly(month_data, first_week, last_day):
    """Spread monthly totals over weekly dates.

    *month_data* is an iterable of ``('YYYY年M月', total)`` pairs.  Every 7th
    day from *first_week* up to and including *last_day* receives a quarter
    (rounded down) of the total of the month it falls in; dates whose month
    has no total are skipped.  Returns a list of ``[datetime, int]`` rows.
    """
    per_week = {
        datetime.strptime(month, "%Y年%m月").strftime("%Y-%m"): int(total) // 4
        for month, total in month_data
    }
    rows = []
    week_start = first_week
    while week_start <= last_day:
        month_key = week_start.strftime("%Y-%m")
        if month_key in per_week:
            rows.append([week_start, per_week[month_key]])
        week_start += timedelta(days=7)
    return rows


def get_beijing_zhoubao():
    """Scrape the bulletins, build a weekly series, and store it in the database.

    Steps:
      1. Collect 2023 bulletin links from the listing pages and parse them
         concurrently into ``[date, count]`` records.
      2. Collect 2024 bulletin links and parse them sequentially into
         monthly totals.
      3. Turn the monthly totals into weekly estimates (total // 4 for every
         7th day starting 2024-01-02).
      4. Insert one ``BeijingWeekData`` row per date that is not stored yet.

    Prints the combined DataFrame and one line per processed row.
    """
    # Scraper instance that accumulates links and parsed records.
    scraper = GetBeijingGanranShuju()
    index_urls = ['https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index.shtml']
    index_urls += [
        f'https://www.bjcdc.org/cdcmodule/jkdt/yqbb/index_{i}.shtml'
        for i in range(2, 5)
    ]

    # 2023: gather links, then fetch the bulletins on a thread pool.
    for url in index_urls:
        scraper.get_Link_2023(url)
    links_2023 = list(reversed(scraper.link_list_2023))
    if links_2023:
        # Never start more threads than there are pages (upper bound 100, as
        # before); the context manager releases the pool even on error.
        with ThreadPool(min(100, len(links_2023))) as pool:
            pool.map(scraper.get_content_2023, links_2023)

    # 2024: monthly bulletins, fetched one by one.
    scraper.month_data = []
    for url in index_urls:
        scraper.get_Link_2024(url)
    for link in reversed(scraper.link_list_2024):
        scraper.get_content_2024(link)

    weekly = pd.DataFrame(scraper.data, columns=['日期', '感染数量'])
    # Drop the '2023-12-26' record; .copy() so the assignments below modify
    # an independent frame rather than a slice of the original.
    weekly = weekly[weekly['日期'] != '2023-12-26'].copy()
    weekly['日期'] = pd.to_datetime(weekly['日期'])
    # Scraped counts are digit strings; store them as integers so the column
    # has the same type as the monthly estimates appended below.
    weekly['感染数量'] = weekly['感染数量'].astype(int)
    weekly = weekly.sort_values(by='日期')

    estimated_rows = _estimate_weekly_from_monthly(
        scraper.month_data, datetime(2024, 1, 2), datetime.now()
    )
    monthly = pd.DataFrame(estimated_rows, columns=['日期', '感染数量'])

    df = pd.concat([weekly, monthly])
    df = df.rename(columns={'日期': 'date', '感染数量': 'beijing_number'})
    print(df)

    for data in df.values.tolist():
        # Existing dates are left untouched; only missing dates are inserted.
        obj, created = BeijingWeekData.objects.get_or_create(
            date=data[0], defaults={'infection_number': data[1]}
        )
        if created:
            print(f"Added new record for date {data[0]} with infections {data[1]}")
        else:
            print(f"Record for date {data[0]} already exists.")
    print('成功载入数据库')