You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Influenza_fund_linkage_system/spiders/流感周报爬取2.py

76 lines
2.3 KiB

import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool
from urllib.parse import urljoin

import requests
from lxml import etree
# Constants and configuration.
# Cut-off date used when filtering reports. The format string must include the
# trailing '日' so that strptime consumes the whole literal; without it
# strptime raises "unconverted data remains" as soon as the module is loaded.
START_DATE = datetime.strptime('2023年2月18日', '%Y年%m月%d日')
# HTTP headers passed along with each request made by this module.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) "
        "Gecko/20100101 Firefox/87.0"
    ),
}
# Single requests session shared by every fetch in this module.
SESSION = requests.Session()
# Build the list of index-page URLs: the first page is "index.htm", the
# following ones are "index_1.htm" through "index_3.htm".
BASE_URL = 'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index'
url_list = [f'{BASE_URL}.htm'] + [
    f'{BASE_URL}_{page}.htm' for page in range(1, 4)
]
def get_links(url):
    """Fetch one index page and return the absolute URLs of the links it lists.

    Args:
        url: Address of the index page to download.

    Returns:
        A list of absolute URLs, one per ``href`` matched by the XPath
        ``//li/span[1]/a/@href``, in document order. An empty list is
        returned when the request or the parsing fails; the error is printed
        instead of raised so one bad page does not abort the whole crawl.
    """
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = SESSION.get(url, headers=HEADERS, timeout=10)
        time.sleep(0.3)  # brief pause after each request
        print(response.status_code)
        tree = etree.HTML(response.content.decode("utf-8"))
        hrefs = tree.xpath('//li/span[1]/a/@href')
        # Resolve each href against the page it came from. For hrefs of the
        # form "./dir/file.htm" this yields the same URL as stripping the dots
        # and re-inserting ".htm", but it does not mangle names that contain
        # additional dots or the substring "htm".
        links = [urljoin(url, href) for href in hrefs]
        print(links)
        return links
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"Error fetching links from {url}: {e}")
        return []
def get_content(link):
    """Fetch one report page and extract its date and a count figure.

    Args:
        link: Absolute URL of the report page.

    Returns:
        ``[date, count]`` for reports dated strictly after ``START_DATE``,
        where ``date`` is a ``'YYYY-MM-DD'`` string and ``count`` is the first
        run of digits in the page HTML that is immediately followed by ``起``.
        ``count`` is a ``str`` when such a number is found and the integer
        ``0`` otherwise (kept as-is for existing callers).
        ``None`` is returned for reports dated on or before ``START_DATE`` and
        when fetching or parsing fails (the error is printed, not raised).
    """
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = SESSION.get(link, headers=HEADERS, timeout=10)
        time.sleep(0.3)  # brief pause after each request
        html = response.content.decode("utf-8")
        tree = etree.HTML(html)
        date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1]
        year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0]
        # Drop every whitespace character (including non-breaking spaces) and
        # an optional trailing '日' so that both "M月D" and "M月D日" parse.
        # NOTE(review): the exact page text is not visible here -- confirm
        # against a live report page.
        raw_date = ''.join((year + date_text).split()).rstrip('日')
        date = datetime.strptime(raw_date, '%Y年%m月%d')
        if date <= START_DATE:
            return None
        number = re.search(r'(\d+)(?=起)', html)
        return [
            date.strftime('%Y-%m-%d'),
            number.group(0) if number else 0,
        ]
    except Exception as e:
        # Deliberate best-effort: report the failure and skip this page.
        print(f"Error fetching content from {link}: {e}")
        return None
def get_liuganzhoubao():
    """Collect report links from every index page and fetch them in parallel.

    Returns:
        The truthy results of ``get_content`` for all collected links, in the
        same order as the links were gathered.
    """
    links = []
    for page_url in url_list:
        links.extend(get_links(page_url))
    print(links)

    # Fetch the report pages on a pool of 10 worker threads; map() blocks
    # until every page has been processed.
    with ThreadPool(10) as pool:
        results = pool.map(get_content, links)

    # Drop pages that were skipped or failed (get_content returned None).
    return list(filter(None, results))
if __name__ == "__main__":
    # Script entry point: run the crawl and print the collected rows.
    print(get_liuganzhoubao())