import requests import random import pandas as pd from lxml import etree import time import re from datetime import datetime from tqdm import * from multiprocessing.pool import ThreadPool from user_agents_pool import * url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm'] url_list2 = [ f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1, 4) ] url_list = url_1 + url_list2 user_Agent = random.choice(agent_list) headers = { "User-Agent": user_Agent, } def get_Link(url): link_list = [] response = requests.get(url=url, headers=headers) time.sleep(2) html = response.content.decode("utf-8") tree = etree.HTML(html) li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li') # print(len(li_list)) for table in li_list: link = table.xpath("./span[1]/a/@href")[0] link = link.replace('.', '') url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" link = url_head + link link = link.replace('htm', '.htm') link_list.append(link) return link_list def get_content(link): response = requests.get(url=link, headers=headers) time.sleep(2) html = response.content.decode("utf-8") # print(html) tree = etree.HTML(html) date = tree.xpath( '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()' )[1] # print(time) year = tree.xpath( '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()' )[0] # print(year) date = year + date date = date.replace(')', '') date_format = '%Y年%m月%d日' target_date = datetime.strptime(date, date_format) # print(target_date) start_time = '2023年2月18日' start_date = datetime.strptime(start_time, date_format) if target_date > start_date: specific_number = re.search( r'(.?<=font-size: 10pt;\">|)(\d+)(?=起|起)', html) number = specific_number.group(2) if specific_number else None if number == None: pattern = r'(\d+)