|
|
import re
import time
from datetime import datetime
from multiprocessing.pool import ThreadPool

import requests
from lxml import etree
|
|
|
|
|
|
# Constants and shared configuration.

# Only reports published strictly after this date are kept.
START_DATE = datetime.strptime('2023年2月18日', '%Y年%m月%d日')

# Desktop Firefox User-Agent so the CDC site serves the normal HTML pages.
HEADERS = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0"
}

# One shared session reuses the underlying TCP connection across requests.
SESSION = requests.Session()

# Index pages: the first page is 'index.htm', the rest are 'index_1.htm'..'index_3.htm'.
BASE_URL = 'https://ivdc.chinacdc.cn/cnic/zyzx/lgzb/index'
url_list = [f'{BASE_URL}.htm'] + [f'{BASE_URL}_{i}.htm' for i in range(1, 4)]
|
|
|
|
|
|
|
|
|
def get_links(url):
    """Fetch one index page and return the absolute URLs of its report articles.

    Args:
        url: URL of an index page (one entry of ``url_list``).

    Returns:
        A list of absolute article URLs; an empty list when the fetch or
        parse fails (errors are printed, never raised).
    """
    try:
        # timeout prevents the scraper from hanging forever on a stalled server.
        response = SESSION.get(url, headers=HEADERS, timeout=10)
        time.sleep(0.3)  # small delay to be polite to the server
        print(response.status_code)
        tree = etree.HTML(response.content.decode("utf-8"))
        # Each list item's first <span> holds the link to one weekly report.
        links = tree.xpath('//li/span[1]/a/@href')
        # hrefs are relative, e.g. './202302/t20230218_263839.htm'; stripping
        # all dots and re-inserting one before 'htm' yields the absolute path.
        # (Original code built this list twice — once for print, once for
        # return; compute it once instead.)
        absolute = [
            "https://ivdc.chinacdc.cn/cnic/zyzx/lgzb" +
            href.replace('.', '').replace('htm', '.htm') for href in links
        ]
        print(absolute)
        return absolute
    except Exception as e:
        # Deliberate best-effort: one bad index page must not abort the run.
        print(f"Error fetching links from {url}: {e}")
        return []
|
|
|
|
|
|
|
|
|
def get_content(link):
    """Fetch one report page and extract its date and case count.

    Args:
        link: absolute URL of a weekly-report article.

    Returns:
        ``[date_str, count]`` where ``date_str`` is 'YYYY-MM-DD' and
        ``count`` is the number of outbreaks as a string (or int ``0``
        when no number is found), but only for reports newer than
        ``START_DATE``. Returns ``None`` for older reports or on any error.
    """
    try:
        # timeout prevents the scraper from hanging forever on a stalled server.
        response = SESSION.get(link, headers=HEADERS, timeout=10)
        time.sleep(0.3)  # small delay to be polite to the server
        html = response.content.decode("utf-8")
        tree = etree.HTML(html)
        # The publication date is split across two spans: the year lives in a
        # nested span, the '月/日' part in the outer span's second text node.
        date_text = tree.xpath('//div[@class="content"]//p[1]/span/text()')[1]
        year = tree.xpath('//div[@class="content"]//p[1]/span/span/text()')[0]
        date = datetime.strptime(year + date_text.replace(')', ''),
                                 '%Y年%m月%d日')
        if date > START_DATE:
            # First number immediately preceding '起' (= 'outbreaks').
            number = re.search(r'(\d+)(?=起)', html)
            # NOTE(review): fallback 0 is an int while the match is a str —
            # kept for backward compatibility with existing consumers.
            return [
                date.strftime('%Y-%m-%d'),
                number.group(0) if number else 0
            ]
    except Exception as e:
        # Deliberate best-effort: a malformed page yields None, not a crash.
        print(f"Error fetching content from {link}: {e}")
        return None
|
|
|
|
|
|
|
|
|
def get_liuganzhoubao():
    """Scrape all index pages and return the extracted report data.

    Returns:
        A list of ``[date_str, count]`` pairs for every report newer than
        ``START_DATE`` that was fetched and parsed successfully.
    """
    # Gather article URLs from every index page.
    links = [link for page in url_list for link in get_links(page)]
    print(links)
    # Fetch the article pages concurrently; the work is I/O-bound.
    with ThreadPool(10) as pool:
        results = pool.map(get_content, links)
    # get_content returns None for old or failed pages — drop those.
    return [entry for entry in results if entry]
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the scraper and dump the collected [date, count] pairs.
    print(get_liuganzhoubao())
|