You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Influenza_fund_linkage_system/app_test/liugan_zhoubao_spider.py

126 lines
4.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import datetime
import os
import random
import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool
import django
import requests
from app_test.models import LiuganWeekData
from django.db import IntegrityError
from lxml import etree
from tqdm import *
from .user_agents_pool import *
# Point Django at the project settings (only if the environment has not
# already selected a settings module) and initialise the framework.
# NOTE(review): app_test.models is imported above, before django.setup() runs;
# when this module is executed standalone that ordering may need to be
# swapped -- confirm.
os.environ.setdefault(
    'DJANGO_SETTINGS_MODULE',
    'liugan_yuce.liugan_yuce.settings',
)
django.setup()

# Listing pages to crawl: the first index page followed by index_1..index_3.
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [
    f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm'
    for i in range(1, 4)
]
url_list = [*url_1, *url_list2]

# A single User-Agent is drawn once at import time and reused by every request.
user_Agent = random.choice(agent_list)
headers = {"User-Agent": user_Agent}
def get_Link(url):
    """Collect the absolute report URLs listed on one index page.

    Args:
        url: Address of a listing page (e.g. an entry of ``url_list``).

    Returns:
        list[str]: Absolute URLs of the linked pages, in page order. List
        items that contain no link are skipped.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request exceeds the timeout).
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    # A timeout prevents one stalled connection from blocking the crawl forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    time.sleep(1)  # pause after each request, presumably to go easy on the server
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')

    link_list = []
    for item in li_list:
        hrefs = item.xpath("./span[1]/a/@href")
        if not hrefs:
            # Entry without an anchor: nothing to follow.
            continue
        # Resolve the (usually './...'-style) href against the listing page.
        # The previous approach stripped every '.' and re-inserted one before
        # 'htm', which corrupts hrefs containing extra dots or '../'.
        link_list.append(urljoin(url, hrefs[0]))
    return link_list
def get_content(link):
    """Fetch one report page and extract its date and the number before '起'.

    Args:
        link: Absolute URL of a report page.

    Returns:
        ``[date, number]`` for pages dated after 2023-02-18, where ``date`` is
        a string shaped like ``'2023年2月19'`` and ``number`` is the matched
        digit string, or the integer ``0`` when no number is found.
        ``None`` for pages dated on or before 2023-02-18, or when no valid
        date can be read from the page.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request exceeds the timeout).
    """
    response = requests.get(url=link, headers=headers, timeout=30)
    time.sleep(1.5)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)

    date_texts = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()')
    year_texts = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()')
    if len(date_texts) < 2 or not year_texts:
        # Unexpected layout: skip this page instead of aborting the whole run.
        print(f"Skipping {link}: publication date not found")
        return None

    # Pull out the three digit groups rather than relying on the exact
    # year/month/day marker characters, which vary between pages.
    raw_date = year_texts[0] + date_texts[1]
    parts = re.search(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})', raw_date)
    if parts is None:
        print(f"Skipping {link}: unrecognised date {raw_date!r}")
        return None
    year, month, day = parts.groups()
    try:
        target_date = datetime(int(year), int(month), int(day))
    except ValueError:
        print(f"Skipping {link}: invalid date {raw_date!r}")
        return None

    # Only pages published after this cut-off are kept.
    start_date = datetime(2023, 2, 18)
    if target_date <= start_date:
        return None

    # Same shape the callers already receive: year/month markers, no day marker.
    date = f'{year}年{month}月{day}'

    # Digits that directly precede '起' (optionally with a closing </span>
    # in between). A non-capturing group replaces the malformed '(.?<=' prefix,
    # which prevented the 'font-size' alternative from ever matching.
    specific_number = re.search(
        r'(?:font-size: 10pt;">|<span lang="EN-US">)(\d+)(?=</span>起|起)',
        html,
    )
    number = specific_number.group(1) if specific_number else None
    if number is None:
        # Fallback markup: the digits sit in their own styled span.
        pattern = r'<span lang="EN-US" style="font-size: 10pt;">(\d+)</span><span style="font-size: 10pt'
        number_list = re.findall(pattern, html)
        number = number_list[0] if number_list else 0
    return [date, number]
def _convert_date_format(date_str):
    """Normalise a scraped date such as '2023年2月19' to '2023-02-19'.

    Raises:
        ValueError: If no year/month/day digit groups are found, or they do
            not form a valid calendar date.
    """
    parts = re.search(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})', date_str)
    if parts is None:
        raise ValueError(f"Unrecognised date string: {date_str!r}")
    year, month, day = (int(p) for p in parts.groups())
    return datetime(year, month, day).strftime('%Y-%m-%d')


def get_liuganzhoubao():
    """Crawl the listing pages, scrape every report and store the results.

    Report links are gathered from each page in ``url_list``, the pages are
    fetched concurrently with ``get_content``, dates are normalised to
    ``YYYY-MM-DD`` and each (date, number) pair is inserted into
    ``LiuganWeekData`` unless a row for that date already exists.

    Returns:
        None. Progress is reported via ``print``.
    """
    link_list_all = []
    for url in url_list:
        link_list_all += get_Link(url)
    # Process the links in the reverse of listing order.
    link_list_all.reverse()

    # Fetch pages concurrently using a thread pool (threads, not processes).
    with ThreadPool(30) as pool:
        data_list = pool.map(get_content, link_list_all)

    # get_content returns None for pages that are skipped.
    data_all = [data for data in data_list if data]

    # Normalise the date of every scraped record.
    converted_data = [
        [_convert_date_format(item[0]), item[1]] for item in data_all
    ]
    print(converted_data)

    # Load into the LiuganWeekData table; get_or_create avoids duplicates.
    for data in converted_data:
        obj, created = LiuganWeekData.objects.get_or_create(
            date=data[0], defaults={'infection_number': data[1]})
        if created:
            print(f"Added new record for date {data[0]} with infections {data[1]}")
        else:
            print(f"Record for date {data[0]} already exists.")
    print('成功载入数据库')
# Uncomment the call below to run the scraper:
# get_liuganzhoubao()