from bs4 import BeautifulSoup
import requests
from datetime import datetime
import json

# Set the request headers (mobile User-Agent)
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
}

# Send the request and fetch the page content
url = "http://www.xinhuanet.com/politicspro/"
response = requests.get(url, headers=headers)
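# Added hardening (not in the original script): fail fast on HTTP errors and let
# requests guess the charset before the body is decoded below, in case the page
# does not declare UTF-8.
response.raise_for_status()
response.encoding = response.apparent_encoding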
html = response.text

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Extract the title, link, and paragraph content of each article
articles = []

select_items = soup.select('.column-center-item')
for item in select_items:
    title_divs = item.select('.tit')  # select all elements with class 'tit'
    for div in title_divs:
        a_tag = div.find('a')  # find the first 'a' tag in each div
        if a_tag:
            title = a_tag.get_text(strip=True)  # get the tag text and strip whitespace
            link = a_tag.get('href')  # get the link address

            # Request the linked article page
            sub_response = requests.get(link, headers=headers)
            sub_html = sub_response.text
            sub_soup = BeautifulSoup(sub_html, 'html.parser')

            # Extract the article body
            article_content = sub_soup.find('span', id='detailContent')

            # Collect the text of all <p> tags
            paragraphs = []
            if article_content:
                p_tags = article_content.find_all('p')
                for p in p_tags:
                    paragraphs.append(p.get_text(strip=True))

            if not paragraphs:
                paragraphs.append("没有段落内容。")  # placeholder meaning "no paragraph content"

            articles.append({'category': '时政热点', 'title': title, 'link': link, 'paragraphs': paragraphs})
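
# Note (added): the hrefs scraped above are requested as-is, which assumes they are
# absolute URLs. If the site serves relative paths instead, normalize them before the
# sub-request, e.g. `from urllib.parse import urljoin` and `link = urljoin(url, link)`.
# A short `time.sleep()` between sub-requests would also go easier on the server.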

# Extract other content
other_content = []
other_items = soup.select('.xhwp_list')  # select all elements with class 'xhwp_list'
for ul in other_items:
    lis = ul.find_all('li')  # find all 'li' elements in each 'ul'
    for li in lis:
        a_tag = li.find('a')  # find the 'a' tag in the 'li'
        if a_tag:
            content = a_tag.get_text(strip=True)  # get the 'a' tag text and strip whitespace
            link = a_tag.get('href')  # get the link address from the 'a' tag
            other_content.append({'content': content, 'link': link})

# Format the output file names with a timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # current time, formatted
output_txt = f"xinhuanet_articles_{timestamp}.txt"  # text output file name with timestamp
output_json = f"xinhuanet_articles_{timestamp}.json"  # JSON output file name with timestamp

# Write the text file
with open(output_txt, 'w', encoding='utf-8') as file:
    file.write("===== 时政热点:标题、链接和段落内容 =====\n\n")
    for i, article in enumerate(articles, start=1):
        file.write(f"{i}. 标题: {article['title']}\n 链接: {article['link']}\n")
        file.write(" 段落文本内容:\n")
        for j, p in enumerate(article['paragraphs'], start=1):
            file.write(f" {j}. {p}\n")
        file.write("\n")

    file.write("\n===== 其他内容和链接 =====\n\n")
    for i, item in enumerate(other_content, start=1):
        file.write(f"{i}. {item['content']} - {item['link']}\n")

print(f"数据已写入文本文件 {output_txt}")

# Assemble the data for the JSON file
data = {
    "timestamp": timestamp,
    "articles": articles,
    "other_content": other_content
}

# Write the JSON file
with open(output_json, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"数据已写入JSON文件 {output_json}")
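
# Added example (optional): reload the JSON file to confirm it round-trips. This uses
# only the json module already imported above and can be removed without side effects.
with open(output_json, 'r', encoding='utf-8') as f:
    reloaded = json.load(f)
print(f"Reloaded {len(reloaded['articles'])} articles from {output_json}")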