# Influenza_fund_linkage_system/app_test/liugan_zhoubao_spider.py

import os
import random
import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool

import django
import requests
from django.db import IntegrityError
from lxml import etree
from tqdm import *

# Configure Django before importing any models so the app registry is ready.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings')
django.setup()
# Now Django's models and other components can be used safely.
from app_test.models import LiuganWeekData

from .user_agents_pool import *
# Index pages of the CNIC weekly influenza report section: the first page plus
# the paginated index_1.htm ... index_3.htm pages.
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1, 4)]
url_list = url_1 + url_list2

# Pick a random User-Agent from the pool for all requests.
user_Agent = random.choice(agent_list)
headers = {
    "User-Agent": user_Agent,
}
def get_Link(url):
    link_list = []
    response = requests.get(url=url, headers=headers)
    time.sleep(1)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
    # print(len(li_list))
    for table in li_list:
        link = table.xpath("./span[1]/a/@href")[0]
        # Strip the dots from the relative href, prepend the section root,
        # then restore the '.htm' extension to get an absolute URL.
        link = link.replace('.', '')
        url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
        link = url_head + link
        link = link.replace('htm', '.htm')
        link_list.append(link)
    return link_list
def get_content(link):
    response = requests.get(url=link, headers=headers)
    time.sleep(1.5)
    html = response.content.decode("utf-8")
    # print(html)
    tree = etree.HTML(html)
    # The publication date is split across two spans: the day/month text and the year.
    date = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()')[1]
    # print(time)
    year = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()')[0]
    # print(year)
    date = year + date
    # Drop any stray whitespace picked up from the page markup.
    date = re.sub(r'\s+', '', date)
    date_format = '%Y年%m月%d日'
    target_date = datetime.strptime(date, date_format)
    # print(target_date)
    start_time = '2023年2月18日'
    start_date = datetime.strptime(start_time, date_format)
    if target_date > start_date:
        # The outbreak count is the number that appears right before '起';
        # group(2) captures the digits.
        specific_number = re.search(r'(font-size: 10pt;">|<span lang="EN-US">)(\d+)(?=</span>起|起)', html)
        number = specific_number.group(2) if specific_number else None
        if number is None:
            # Fall back to a stricter pattern if the first search finds nothing.
            pattern = r'<span lang="EN-US" style="font-size: 10pt;">(\d+)</span><span style="font-size: 10pt'
            number_list = re.findall(pattern, html)
            if number_list:
                number = number_list[0]
            else:
                number = 0
        # print(html)
        return [date, number]
    else:
        return None
def get_liuganzhoubao():
    link_list_all = []
    for url in url_list:
        link_list_all += get_Link(url)
    link_list_all = list(reversed(link_list_all))
    data_all = []
    # Fetch the report pages concurrently with a thread pool.
    pool = ThreadPool(30)
    data_list = pool.map(get_content, link_list_all)
    pool.close()
    pool.join()
    for data in data_list:
        if data:
            data_all.append(data)
    # print(data_all)

    def convert_date_format(date_str):
        # Strip the '年', '月', '日' characters, e.g. '2023年2月19日' -> '2023-2-19'.
        date_str = date_str.replace('年', '-').replace('月', '-').replace('日', '')
        # Parse the string into a date object.
        date_obj = datetime.strptime(date_str, '%Y-%m-%d')
        # Reformat with zero padding, e.g. '2023-02-19'.
        new_date_str = date_obj.strftime('%Y-%m-%d')
        return new_date_str

    # Apply the conversion to every scraped record.
    converted_data = [[convert_date_format(item[0]), item[1]] for item in data_all]
    print(converted_data)
    # Load the data into the LiuganWeekData table.
    for data in converted_data:
        # get_or_create avoids inserting duplicate rows for the same date.
        obj, created = LiuganWeekData.objects.get_or_create(date=data[0], defaults={'infection_number': data[1]})
        if created:
            print(f"Added new record for date {data[0]} with infections {data[1]}")
        else:
            print(f"Record for date {data[0]} already exists.")
    print('Data loaded into the database successfully')
# Call the function manually if needed:
# get_liuganzhoubao()
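
# A minimal, optional entry point for running the spider by hand (a sketch, not
# part of the original module). Because the file uses a relative import of
# user_agents_pool, run it as a module, e.g. `python -m app_test.liugan_zhoubao_spider`,
# from a location where the DJANGO_SETTINGS_MODULE path above resolves.
if __name__ == '__main__':
    get_liuganzhoubao()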