Influenza_fund_linkage_system/spiders/流感周报爬取.py
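"""Scrape the weekly influenza reports published by the Chinese National
Influenza Center (China CDC) at https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/.

For every report dated after 2023-02-18, the script extracts the publication
date and the number that precedes "起" (reported outbreaks), then writes the
results to liugan_zhoubao.csv.
"""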


import requests
import random
import pandas as pd
from lxml import etree
import time
import re
from datetime import datetime
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from user_agents_pool import agent_list
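# NOTE: user_agents_pool is a local helper module that is not shown here; it is
# assumed to expose agent_list, a plain list of User-Agent strings, roughly:
#
#     # user_agents_pool.py (illustrative sketch, not the actual module)
#     agent_list = [
#         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ...",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ...",
#     ]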
url_1 = ['https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index.htm']
url_list2 = [
    f'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index_{i}.htm'
    for i in range(1, 4)
]
url_list = url_1 + url_list2
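# One User-Agent is picked at random from agent_list and reused for every request.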
user_Agent = random.choice(agent_list)
headers = {
    "User-Agent": user_Agent,
}
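# Collect the absolute URLs of individual weekly reports from one index page.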
def get_Link(url):
    link_list = []
    response = requests.get(url=url, headers=headers)
    time.sleep(2)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    li_list = tree.xpath('/html/body/div[2]/div/div[1]/div/div[2]/ul/li')
    for table in li_list:
        # The hrefs on the index page are relative (e.g. './xxx.htm');
        # turn them into absolute URLs under the lgzb section.
        link = table.xpath("./span[1]/a/@href")[0]
        url_head = "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb"
        link = url_head + link.lstrip('.')
        link_list.append(link)
    return link_list
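# Parse a single weekly-report page: return [date, number] for reports published
# after 2023-02-18, where number is the figure that precedes "起"; otherwise None.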
def get_content(link):
    response = requests.get(url=link, headers=headers)
    time.sleep(2)
    html = response.content.decode("utf-8")
    tree = etree.HTML(html)
    # The publication date is split across two spans: the year sits in a nested
    # span, the month and day in the outer span's text.
    date = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/text()'
    )[1]
    year = tree.xpath(
        '/html/body/div[2]/div/div[1]/div/div[2]/div/div/div/p[1]/span/span/text()'
    )[0]
    date = year + date
    # Remove any whitespace or invisible characters embedded in the page text.
    date = re.sub(r'\s+', '', date)
    date_format = '%Y年%m月%d日'
    target_date = datetime.strptime(date, date_format)
    # Only keep reports published after 2023-02-18.
    start_time = '2023年2月18日'
    start_date = datetime.strptime(start_time, date_format)
    if target_date > start_date:
        # Find the number that precedes "起" (reported outbreaks) in the HTML.
        specific_number = re.search(
            r'(?:font-size: 10pt;">|<span lang="EN-US">)(\d+)(?=</span>起|起)',
            html)
        number = specific_number.group(1) if specific_number else None
        if number is None:
            # Fall back to a stricter pattern when the first search misses.
            pattern = r'<span lang="EN-US" style="font-size: 10pt;">(\d+)</span><span style="font-size: 10pt'
            number_list = re.findall(pattern, html)
            number = number_list[0] if number_list else 0
        return [date, number]
    else:
        return None
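# Gather report links from all index pages, fetch them concurrently, and save
# the (date, infection_number) pairs to liugan_zhoubao.csv.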
def get_liuganzhoubao():
    link_list_all = []
    for url in url_list:
        link_list_all += get_Link(url)
    # Process the links in reverse order.
    link_list_all = list(reversed(link_list_all))
    data_all = []
    # Fetch the report pages concurrently with a thread pool of 30 workers.
    pool = ThreadPool(30)
    data_list = pool.map(get_content, link_list_all)
    pool.close()
    pool.join()
    for data in data_list:
        if data:
            data_all.append(data)
    print(data_all)
    df = pd.DataFrame(data_all, columns=['date', 'infection_number'])
    # Convert the date column to datetime, then format it as 'YYYY-MM-DD'.
    df['date'] = pd.to_datetime(df['date'], format='%Y年%m月%d日')
    df['date'] = df['date'].dt.strftime('%Y-%m-%d')
    print(df)
    df.to_csv('liugan_zhoubao.csv', encoding='utf-8')
    print('Influenza weekly report data saved to liugan_zhoubao.csv')
if __name__ == '__main__':
    get_liuganzhoubao()