Influenza_fund_linkage_system/spiders/流感周报爬取2.py

import requests
from lxml import etree
from datetime import datetime
import re
from multiprocessing.pool import ThreadPool

# 初始化常量和配置
START_DATE = datetime.strptime('2023年2月18日', '%Y年%m月%d日')
HEADERS = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0"
}
SESSION = requests.Session()

# 构建URL列表
BASE_URL = 'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index'
url_list = [
    f'{BASE_URL}_{i}.htm' if i > 0 else f'{BASE_URL}.htm' for i in range(4)
]


def get_links(url):
    try:
        response = SESSION.get(url, headers=HEADERS)
        time.sleep(0.3)
        print(response.status_code)
        tree = etree.HTML(response.content.decode("utf-8"))
        links = tree.xpath('//li/span[1]/a/@href')
        print([
            "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" +
            url.replace('.', '').replace('htm', '.htm') for url in links
        ])
        return [
            "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" +
            url.replace('.', '').replace('htm', '.htm') for url in links
        ]
    except Exception as e:
        print(f"Error fetching links from {url}: {e}")
        return []


def get_content(link):
    try:
        response = SESSION.get(link, headers=HEADERS)
        time.sleep(0.3)
        html = response.content.decode("utf-8")
        tree = etree.HTML(html)
        date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1]
        year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0]
        date = datetime.strptime(year + date_text.replace('）', ''),
                                 '%Y年%m月%d日')
        if date > START_DATE:
            number = re.search(r'(\d+)(?=起)', html)
            return [
                date.strftime('%Y-%m-%d'),
                number.group(0) if number else 0
            ]
    except Exception as e:
        print(f"Error fetching content from {link}: {e}")
        return None


def get_liuganzhoubao():
    links = []
    for url in url_list:
        links += get_links(url)
    print(links)
    with ThreadPool(10) as pool:
        data_list = pool.map(get_content, links)
    return [data for data in data_list if data]


if __name__ == "__main__":
    data = get_liuganzhoubao()
    print(data)