from bs4 import BeautifulSoup
import requests
from datetime import datetime
import json

# Set the request headers (mobile User-Agent)
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
}

# Send the request and fetch the page content
url = "http://www.xinhuanet.com/politicspro/"
response = requests.get(url, headers=headers)
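# Added hardening (not in the original script): fail fast on HTTP errors and let
# requests guess the charset before the body is decoded below, in case the page
# does not declare UTF-8.
response.raise_for_status()
response.encoding = response.apparent_encoding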
html = response.text

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Extract the title, link, and paragraph content of each article
articles = []

select_items = soup.select('.column-center-item')
for item in select_items:
    title_divs = item.select('.tit')  # select all elements with class 'tit'
    for div in title_divs:
        a_tag = div.find('a')  # find the first 'a' tag in each div
        if a_tag:
            title = a_tag.get_text(strip=True)  # get the tag text and strip whitespace
            link = a_tag.get('href')  # get the link address

            # Request the linked article page
            sub_response = requests.get(link, headers=headers)
            sub_html = sub_response.text
            sub_soup = BeautifulSoup(sub_html, 'html.parser')

            # Extract the article body
            article_content = sub_soup.find('span', id='detailContent')

            # Collect the text of all <p> tags
            paragraphs = []
            if article_content:
                p_tags = article_content.find_all('p')
                for p in p_tags:
                    paragraphs.append(p.get_text(strip=True))

            if not paragraphs:
                paragraphs.append("没有段落内容。")  # placeholder meaning "no paragraph content"

            articles.append({'category': '时政热点', 'title': title, 'link': link, 'paragraphs': paragraphs})
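
# Note (added): the hrefs scraped above are requested as-is, which assumes they are
# absolute URLs. If the site serves relative paths instead, normalize them before the
# sub-request, e.g. `from urllib.parse import urljoin` and `link = urljoin(url, link)`.
# A short `time.sleep()` between sub-requests would also go easier on the server.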

# Extract other content
other_content = []
other_items = soup.select('.xhwp_list')  # select all elements with class 'xhwp_list'
for ul in other_items:
    lis = ul.find_all('li')  # find all 'li' elements in each 'ul'
    for li in lis:
        a_tag = li.find('a')  # find the 'a' tag in the 'li'
        if a_tag:
            content = a_tag.get_text(strip=True)  # get the 'a' tag text and strip whitespace
            link = a_tag.get('href')  # get the link address from the 'a' tag
            other_content.append({'content': content, 'link': link})

# Format the output file names with a timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # current time, formatted
output_txt = f"xinhuanet_articles_{timestamp}.txt"  # text output file name with timestamp
output_json = f"xinhuanet_articles_{timestamp}.json"  # JSON output file name with timestamp

# Write the text file
with open(output_txt, 'w', encoding='utf-8') as file:
    file.write("===== 时政热点:标题、链接和段落内容 =====\n\n")
    for i, article in enumerate(articles, start=1):
        file.write(f"{i}. 标题: {article['title']}\n 链接: {article['link']}\n")
        file.write(" 段落文本内容:\n")
        for j, p in enumerate(article['paragraphs'], start=1):
            file.write(f" {j}. {p}\n")
        file.write("\n")

    file.write("\n===== 其他内容和链接 =====\n\n")
    for i, item in enumerate(other_content, start=1):
        file.write(f"{i}. {item['content']} - {item['link']}\n")

print(f"数据已写入文本文件 {output_txt}")

# Assemble the data for the JSON file
data = {
    "timestamp": timestamp,
    "articles": articles,
    "other_content": other_content
}

# Write the JSON file
with open(output_json, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"数据已写入JSON文件 {output_json}")
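
# Added example (optional): reload the JSON file to confirm it round-trips. This uses
# only the json module already imported above and can be removed without side effects.
with open(output_json, 'r', encoding='utf-8') as f:
    reloaded = json.load(f)
print(f"Reloaded {len(reloaded['articles'])} articles from {output_json}")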