Influenza_fund_linkage_system/spiders/流感周报爬取.py
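"""Scrape the weekly influenza reports published by the Chinese National
Influenza Center (China CDC) at https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/.

For every report dated after 2023-02-18, the script extracts the publication
date and the number that precedes "起" (reported outbreaks), then writes the
results to liugan_zhoubao.csv.
"""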


import requests
import random
import pandas as pd
from lxml import etree
import time
import re
from datetime import datetime
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from user_agents_pool import agent_list
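# NOTE: user_agents_pool is a local helper module that is not shown here; it is
# assumed to expose agent_list, a plain list of User-Agent strings, roughly:
#
#     # user_agents_pool.py (illustrative sketch, not the actual module)
#     agent_list = [
#         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ...",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ...",
#     ]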
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [
    f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm'
    for i in range(1, 4)
]
url_list = url_1 + url_list2
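# One User-Agent is picked at random from agent_list and reused for every request.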
user_Agent = random.choice(agent_list)
headers = {
    "User-Agent": user_Agent,
}
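# Collect the absolute URLs of individual weekly reports from one index page.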
def get_Link(url):
    link_list = []
    response = requests.get(url=url, headers=headers)
    time.sleep(2)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
    for table in li_list:
        # The hrefs on the index page are relative (e.g. './xxx.htm');
        # turn them into absolute URLs under the lgzb section.
        link = table.xpath("./span[1]/a/@href")[0]
        url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
        link = url_head + link.lstrip('.')
        link_list.append(link)
    return link_list
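# Parse a single weekly-report page: return [date, number] for reports published
# after 2023-02-18, where number is the figure that precedes "起"; otherwise None.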
def get_content(link):
    response = requests.get(url=link, headers=headers)
    time.sleep(2)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    # The publication date is split across two spans: the year sits in a nested
    # span, the month and day in the outer span's text.
    date = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()'
    )[1]
    year = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()'
    )[0]
    date = year + date
    # Remove any whitespace or invisible characters embedded in the page text.
    date = re.sub(r'\s+', '', date)
    date_format = '%Y年%m月%d日'
    target_date = datetime.strptime(date, date_format)
    # Only keep reports published after 2023-02-18.
    start_time = '2023年2月18日'
    start_date = datetime.strptime(start_time, date_format)
    if target_date > start_date:
        # Find the number that precedes "起" (reported outbreaks) in the HTML.
        specific_number = re.search(
            r'(?:font-size: 10pt;">|<span lang="EN-US">)(\d+)(?=</span>起|起)',
            html)
        number = specific_number.group(1) if specific_number else None
        if number is None:
            # Fall back to a stricter pattern when the first search misses.
            pattern = r'<span lang="EN-US" style="font-size: 10pt;">(\d+)</span><span style="font-size: 10pt'
            number_list = re.findall(pattern, html)
            number = number_list[0] if number_list else 0
        return [date, number]
    else:
        return None
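# Gather report links from all index pages, fetch them concurrently, and save
# the (date, infection_number) pairs to liugan_zhoubao.csv.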
def get_liuganzhoubao():
    link_list_all = []
    for url in url_list:
        link_list_all += get_Link(url)
    # Process the links in reverse order.
    link_list_all = list(reversed(link_list_all))
    data_all = []
    # Fetch the report pages concurrently with a thread pool of 30 workers.
    pool = ThreadPool(30)
    data_list = pool.map(get_content, link_list_all)
    pool.close()
    pool.join()
    for data in data_list:
        if data:
            data_all.append(data)
    print(data_all)
    df = pd.DataFrame(data_all, columns=['date', 'infection_number'])
    # Convert the date column to datetime, then format it as 'YYYY-MM-DD'.
    df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')
    df['date'] = df['date'].dt.strftime('%Y-%m-%d')
    print(df)
    df.to_csv('liugan_zhoubao.csv', encoding='utf-8')
    print('Influenza weekly report data saved to liugan_zhoubao.csv')
if __name__ == '__main__':
    get_liuganzhoubao()