import datetime import os import random import re import time from datetime import datetime from multiprocessing.pool import ThreadPool import django import requests from app_test.models import LiuganWeekData from django.db import IntegrityError from lxml import etree from tqdm import * from .user_agents_pool import * os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'liugan_yuce.liugan_yuce.settings') django.setup() # 现在你可以安全地使用 Django 的模型和其他组件了 url_1=['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm'] url_list2=[f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1,4)] url_list=url_1+url_list2 user_Agent = random.choice(agent_list) headers = { "User-Agent": user_Agent, } def get_Link(url): link_list = [] response = requests.get(url=url, headers=headers) time.sleep(1) html = response.content.decode("utf-8") tree = etree.HTML(html) li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li') # print(len(li_list)) for table in li_list: link = table.xpath("./span[1]/a/@href")[0] link = link.replace('.','') url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" link = url_head + link link = link.replace('htm','.htm') link_list.append(link) return link_list def get_content(link): response = requests.get(url=link, headers=headers) time.sleep(1.5) html = response.content.decode("utf-8") # print(html) tree = etree.HTML(html) date = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()')[1] # print(time) year = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()')[0] # print(year) date = year+date date = date.replace('）','') date_format = '%Y年%m月%d日' target_date = datetime.strptime(date, date_format) # print(target_date) start_time = '2023年2月18日' start_date = datetime.strptime(start_time, date_format) if target_date > start_date: specific_number = re.search(r'(.?<=font-size: 10pt;\">|)(\d+)(?=起|起)', html) number = specific_number.group(2) if specific_number else None if number == None: pattern = r'(\d+)