import requests
import random
import pandas as pd
from lxml import etree
import time
import re
from datetime import datetime
from tqdm import *
from multiprocessing.pool import ThreadPool
from user_agents_pool import *
# URLs of the landing index page plus the numbered pages index_1.htm
# through index_3.htm, combined into a single list.
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [
    f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm' for i in range(1, 4)
]
url_list = [*url_1, *url_list2]

# A single User-Agent is drawn once, when the module is loaded, and reused
# for every request that passes `headers`.
# NOTE(review): agent_list is presumably supplied by the user_agents_pool
# star import -- confirm.
user_Agent = random.choice(agent_list)
headers = {"User-Agent": user_Agent}
def get_Link(url, timeout=30):
    """Collect the article URLs listed on one index page.

    Downloads ``url`` with the module-level ``headers``, walks the ``<li>``
    entries of the listing, takes the first ``href`` found under each
    entry's first ``<span>``, and resolves it against ``url``.

    Args:
        url: Address of the index page to scan.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        A list of absolute URLs, in page order. Entries that contain no
        link are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Imported locally so this function is self-contained with respect to
    # the file's import block.
    from urllib.parse import urljoin

    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Pause after every request (presumably to throttle load on the server).
    time.sleep(2)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)

    link_list = []
    for item in tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li'):
        hrefs = item.xpath("./span[1]/a/@href")
        if not hrefs:
            # An entry without an anchor has nothing to follow.
            continue
        # urljoin handles "./x.htm", "../x.htm" and absolute hrefs alike,
        # unlike stripping every "." and re-inserting one before "htm".
        link_list.append(urljoin(url, hrefs[0]))
    return link_list
def get_content(link):
response = requests.get(url=link, headers=headers)
time.sleep(2)
html = response.content.decode("utf-8")
# print(html)
tree = etree.HTML(html)
date = tree.xpath(
'/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()'
)[1]
# print(time)
year = tree.xpath(
'/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()'
)[0]
# print(year)
date = year + date
date = date.replace(')', '')
date_format = '%Y年%m月%d日'
target_date = datetime.strptime(date, date_format)
# print(target_date)
start_time = '2023年2月18日'
start_date = datetime.strptime(start_time, date_format)
if target_date > start_date:
specific_number = re.search(
r'(.?<=font-size: 10pt;\">|)(\d+)(?=起|起)',
html)
number = specific_number.group(2) if specific_number else None
if number == None:
pattern = r'(\d+)