|
|
import random
import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import etree
from tqdm import *

from user_agents_pool import *
|
|
|
|
|
|
# Index pages of the CNIC weekly influenza report listing: the first page
# is a bare index.htm, the following pages are index_1.htm .. index_3.htm.
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [
    f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{page}.htm'
    for page in range(1, 4)
]
url_list = url_1 + url_list2

# Pick one User-Agent for the whole session from the shared pool
# (agent_list comes from user_agents_pool).
user_Agent = random.choice(agent_list)
headers = {"User-Agent": user_Agent}
|
|
|
|
|
|
|
|
|
def get_Link(url):
    """Collect the detail-page URLs listed on one weekly-report index page.

    Parameters
    ----------
    url : str
        URL of an index page under /cnic/zyzx/lgzb/.

    Returns
    -------
    list[str]
        Absolute URLs of the report pages linked from that index page.
    """
    link_list = []
    response = requests.get(url=url, headers=headers)
    time.sleep(2)  # throttle so we don't hammer the server
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
    for item in li_list:
        # hrefs are relative (e.g. './202402/t20240221_123456.htm').
        # Resolve them against the index page with urljoin instead of the
        # original replace('.', '') / replace('htm', '.htm') round-trip,
        # which corrupted any path containing an extra dot or 'htm'
        # anywhere but at the end.
        href = item.xpath("./span[1]/a/@href")[0]
        link_list.append(urljoin(url, href))
    return link_list
|
|
|
|
|
|
|
|
|
def get_content(link):
    """Scrape the date and outbreak count from one weekly report page.

    Parameters
    ----------
    link : str
        Absolute URL of a report page.

    Returns
    -------
    list | None
        ``[date, number]`` for reports dated after 2023-02-18, where
        *date* is a string like '2023年3月2日' and *number* is the
        extracted digit string (or 0 when no count was found).
        Returns None for reports on or before the cutoff date.
    """
    response = requests.get(url=link, headers=headers)
    time.sleep(2)  # throttle between requests
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    # The heading splits the date across two spans: the '月/日' part is
    # in the outer span's text, the year in a nested span.
    date = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()'
    )[1]
    year = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()'
    )[0]
    date = (year + date).replace(')', '')
    date_format = '%Y年%m月%d日'
    target_date = datetime.strptime(date, date_format)
    # Pages on or before 2023-02-18 use a different layout; skip them.
    start_date = datetime.strptime('2023年2月18日', date_format)
    if target_date <= start_date:
        return None
    # Primary pattern: the count immediately precedes '起' ("outbreaks"),
    # after either a styled span or an EN-US span.  The original pattern
    # began with '(.?<=' -- a typo for a lookbehind that left the first
    # alternative unable to match any real HTML (and Python's lookbehind
    # must be fixed-width anyway); a non-capturing group expresses the
    # intent correctly, so the count is now group(1).
    specific_number = re.search(
        r'(?:font-size: 10pt;">|<span lang="EN-US">)(\d+)(?=</span>起|起)',
        html)
    number = specific_number.group(1) if specific_number else None
    if number is None:
        # Fallback: count held inside the 10pt EN-US span pair.
        pattern = r'<span lang="EN-US" style="font-size: 10pt;">(\d+)</span><span style="font-size: 10pt'
        number_list = re.findall(pattern, html)
        number = number_list[0] if number_list else 0
    return [date, number]
|
|
|
|
|
|
|
|
|
def get_liuganzhoubao():
    """Download every weekly influenza report and save it to CSV.

    Walks all index pages for detail-page links, fetches each report
    concurrently, and writes the (date, count) rows to
    liugan_zhoubao.csv.
    """
    # Gather detail-page links from every index page, then flip the
    # order so the oldest report comes first.
    link_list_all = []
    for page_url in url_list:
        link_list_all.extend(get_Link(page_url))
    link_list_all = link_list_all[::-1]

    # Fetch report pages concurrently — this is network-bound work,
    # so a thread pool is used.
    pool = ThreadPool(30)
    data_list = pool.map(get_content, link_list_all)
    pool.close()
    pool.join()

    # get_content returns None for reports outside the date range.
    data_all = [row for row in data_list if row]
    print(data_all)

    df = pd.DataFrame(data_all, columns=['date', 'infection_number'])
    # Normalise the Chinese date strings to ISO-style 'YYYY-MM-DD'.
    df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')
    df['date'] = df['date'].dt.strftime('%Y-%m-%d')
    print(df)
    df.to_csv('liugan_zhoubao.csv', encoding='utf-8')
    print('流感周报数据已经储存在liugan_zhoubao.csv中')
|
|
|
|
|
|
|
|
|
# Run the scraper only when executed as a script, so importing this
# module (e.g. for reuse of get_Link/get_content) does not trigger a
# full crawl.
if __name__ == "__main__":
    get_liuganzhoubao()
|