You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

97 lines
3.7 KiB

import json
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Request headers: a mobile Safari UA so the site serves its mobile layout.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
}
# Fetch the politics front page. A timeout keeps the script from hanging
# indefinitely on a stalled connection (requests has no default timeout).
url = "http://www.xinhuanet.com/politicspro/"
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# Parse the page with the stdlib html.parser backend.
soup = BeautifulSoup(html, 'html.parser')
# Extract the title, link, and body paragraphs of every politics headline.
articles = []
select_items = soup.select('.column-center-item')
for item in select_items:
    title_divs = item.select('.tit')  # every element with class 'tit'
    for div in title_divs:
        a_tag = div.find('a')  # first anchor inside the div
        if not a_tag:
            continue
        title = a_tag.get_text(strip=True)
        # Hrefs may be site-relative; resolve them against the page URL so
        # the follow-up request always gets an absolute link.
        link = urljoin(url, a_tag.get('href'))
        # Fetch the linked article page; skip this article on network errors
        # instead of aborting the whole scrape on one dead link.
        try:
            sub_response = requests.get(link, headers=headers, timeout=10)
        except requests.RequestException:
            continue
        sub_soup = BeautifulSoup(sub_response.text, 'html.parser')
        # The article body lives in <span id="detailContent">.
        article_content = sub_soup.find('span', id='detailContent')
        paragraphs = []
        if article_content:
            paragraphs = [p.get_text(strip=True)
                          for p in article_content.find_all('p')]
        if not paragraphs:
            paragraphs.append("没有段落内容。")
        articles.append({'category': '时政热点', 'title': title,
                         'link': link, 'paragraphs': paragraphs})
# Collect the remaining headline lists (title + link only, no body fetch).
other_content = []
other_items = soup.select('.xhwp_list')  # every element with class 'xhwp_list'
for ul in other_items:
    for li in ul.find_all('li'):  # each list entry inside the <ul>
        a_tag = li.find('a')
        if a_tag:
            content = a_tag.get_text(strip=True)
            # Resolve possibly-relative hrefs against the page URL so the
            # stored link is always absolute.
            link = urljoin(url, a_tag.get('href'))
            other_content.append({'content': content, 'link': link})
# Timestamp the output filenames so repeated runs never overwrite each other.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
_base_name = f"xinhuanet_articles_{timestamp}"
output_txt = _base_name + ".txt"    # human-readable report
output_json = _base_name + ".json"  # structured dump
# Render the scraped data into a human-readable text report.
with open(output_txt, 'w', encoding='utf-8') as report:
    buf = ["===== 时政热点:标题、链接和段落内容 =====\n\n"]
    for idx, art in enumerate(articles, start=1):
        buf.append(f"{idx}. 标题: {art['title']}\n 链接: {art['link']}\n")
        buf.append(" 段落文本内容:\n")
        buf.extend(f" {pno}. {para}\n"
                   for pno, para in enumerate(art['paragraphs'], start=1))
        buf.append("\n")
    buf.append("\n===== 其他内容和链接 =====\n\n")
    buf.extend(f"{idx}. {entry['content']} - {entry['link']}\n"
               for idx, entry in enumerate(other_content, start=1))
    # One batched write instead of many small write() calls.
    report.writelines(buf)
print(f"数据已写入文本文件 {output_txt}")
# Persist the same data as structured JSON (keys preserved for consumers).
data = {"timestamp": timestamp,
        "articles": articles,
        "other_content": other_content}
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open(output_json, 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False, indent=4)
print(f"数据已写入JSON文件 {output_json}")