parent
1c05093112
commit
fbdfa83826
@ -0,0 +1,65 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
# Request headers sent with the page fetch. The User-Agent is a mobile
# Safari (iPhone, iOS 16.6) string; it is split across adjacent literals
# purely for readability and concatenates to a single value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/16.6 Mobile/15E148 Safari/604.1"
    ),
}
|
||||
|
||||
# Fetch the page content.
url = "http://www.xinhuanet.com/politicspro/"
# Without a timeout requests waits forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of scraping an error page and writing
# empty or misleading output files.
response.raise_for_status()
# When the Content-Type header carries no charset, requests falls back to
# ISO-8859-1 for text responses, which turns Chinese text into mojibake.
# In that case use the encoding detected from the body instead.
if "charset" not in response.headers.get("Content-Type", "").lower():
    response.encoding = response.apparent_encoding
html = response.text

# Parse the page with BeautifulSoup using the stdlib HTML parser.
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Extract titles: for every element with class 'tit', take the text of its
# first <a> descendant (whitespace-stripped); elements without a link are
# skipped.
divs_with_titles = soup.select('.tit')
titles = [
    link.get_text(strip=True)
    for link in (container.find('a') for container in divs_with_titles)
    if link
]
|
||||
|
||||
# Extract the remaining content: the whitespace-stripped text of every <li>
# found under each element with class 'xhwp_list', in document order.
divs_with_other = soup.select('.xhwp_list')
other_content = [
    item.get_text(strip=True)
    for container in divs_with_other
    for item in container.find_all('li')
]
|
||||
|
||||
# Output naming: both output files share one timestamp taken from the
# current local time, formatted as YYYY-MM-DD_HH-MM-SS.
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
output_file_txt = "xinhuanet_data_{}.txt".format(timestamp)    # text report
output_file_json = "xinhuanet_data_{}.json".format(timestamp)  # JSON dump
|
||||
|
||||
# Write the text report: a titles section followed by an other-content
# section, each entry numbered from 1. The lines are assembled first and
# written in a single call.
report_lines = ["=== 标题 ===\n\n"]
report_lines.extend(
    f"{number}. {heading}\n" for number, heading in enumerate(titles, start=1)
)
report_lines.append("\n=== 其他内容 ===\n\n")
report_lines.extend(
    f"{number}. {entry}\n" for number, entry in enumerate(other_content, start=1)
)

with open(output_file_txt, 'w', encoding='utf-8') as txt_file:
    txt_file.writelines(report_lines)

print(f"数据已写入文本文件 {output_file_txt}")
|
||||
|
||||
# Assemble the payload for the JSON file: the run timestamp plus both
# scraped lists.
data = dict(
    timestamp=timestamp,
    titles=titles,
    other_content=other_content,
)

# Write the JSON file. ensure_ascii=False keeps non-ASCII text readable in
# the output instead of emitting \uXXXX escapes.
serialized = json.dumps(data, ensure_ascii=False, indent=4)
with open(output_file_json, 'w', encoding='utf-8') as json_file:
    json_file.write(serialized)

print(f"数据已写入JSON文件 {output_file_json}")
|
Loading…
Reference in new issue