"""Scrape the influenza weekly report index pages and extract per-report data.

The script walks the paginated index under ``BASE_URL``, collects the article
links found on each page, downloads every article concurrently, and returns a
``[date, number]`` pair for each report dated after ``START_DATE``.
"""

import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool

import requests
from lxml import etree

# Initialise constants and configuration.
START_DATE = datetime.strptime('2023年2月18日', '%Y年%m月%d日')
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0"
}
SESSION = requests.Session()

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block its worker thread indefinitely.
REQUEST_TIMEOUT = 10

# Pause after every request so the server is not hit in a tight loop.
REQUEST_DELAY = 0.3

# Build the URL list: the first index page has no numeric suffix, the
# following ones are ``index_1.htm``, ``index_2.htm``, ...
BASE_URL = 'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index'
url_list = [
    f'{BASE_URL}_{i}.htm' if i > 0 else f'{BASE_URL}.htm'
    for i in range(4)
]

# Prefix prepended to the (rewritten) hrefs found on the index pages.
ARTICLE_BASE_URL = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"


def _to_article_url(href):
    """Turn an href taken from an index page into an absolute article URL.

    Every ``.`` is removed from *href* and a ``.`` is then re-inserted in
    front of each ``htm``; the result is appended to ``ARTICLE_BASE_URL``.
    NOTE(review): this presumably targets hrefs of the form
    ``./<dir>/<name>.htm`` -- confirm against the live pages.
    """
    return ARTICLE_BASE_URL + href.replace('.', '').replace('htm', '.htm')


def get_links(url):
    """Return the absolute article URLs listed on one index page.

    Args:
        url: URL of an index page.

    Returns:
        A list of article URLs. Any failure (network error, timeout,
        undecodable or unparsable page) is printed and yields ``[]``.
    """
    try:
        response = SESSION.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        time.sleep(REQUEST_DELAY)
        print(response.status_code)
        tree = etree.HTML(response.content.decode("utf-8"))
        hrefs = tree.xpath('//li/span[1]/a/@href')
        # Build the list once and reuse it for both the log line and the
        # return value.
        article_urls = [_to_article_url(href) for href in hrefs]
        print(article_urls)
        return article_urls
    except Exception as e:
        # Best-effort scraping: report the failure and carry on with the
        # remaining pages.
        print(f"Error fetching links from {url}: {e}")
        return []


def get_content(link):
    """Download one article and extract its date and the number before '起'.

    Args:
        link: Absolute URL of an article page.

    Returns:
        ``[date, number]`` where ``date`` is a ``'%Y-%m-%d'`` string and
        ``number`` is the first run of digits immediately followed by ``起``
        in the page HTML (as a string), or the integer ``0`` when no such
        run exists. Returns ``None`` when the article is not dated after
        ``START_DATE`` or when fetching/parsing fails (the error is printed).
    """
    try:
        response = SESSION.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        time.sleep(REQUEST_DELAY)
        html = response.content.decode("utf-8")
        tree = etree.HTML(html)
        # The year and the month/day part are read from separate text nodes
        # inside the first <p> of the content div and concatenated.
        date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1]
        year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0]
        date = datetime.strptime(
            year + date_text.replace(')', ''), '%Y年%m月%d日'
        )
        if date > START_DATE:
            number = re.search(r'(\d+)(?=起)', html)
            return [
                date.strftime('%Y-%m-%d'),
                number.group(0) if number else 0,
            ]
    except Exception as e:
        # Pages whose layout does not match (IndexError / ValueError) are
        # reported and skipped rather than aborting the whole run.
        print(f"Error fetching content from {link}: {e}")
    return None


def get_liuganzhoubao():
    """Collect ``[date, number]`` rows for all reports after ``START_DATE``.

    Index pages are fetched sequentially; the articles they link to are
    fetched through a pool of 10 threads. Articles that yield no data are
    dropped from the result.
    """
    links = []
    for url in url_list:
        links += get_links(url)
    print(links)
    with ThreadPool(10) as pool:
        data_list = pool.map(get_content, links)
    return [data for data in data_list if data]


if __name__ == "__main__":
    data = get_liuganzhoubao()
    print(data)