from bs4 import BeautifulSoup
import requests
from datetime import datetime
from urllib.parse import urljoin
import json

# Set the request headers (mobile User-Agent)
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
}

# Send the request and fetch the page content
url = "http://www.xinhuanet.com/politicspro/"
response = requests.get(url, headers=headers)
html = response.text

# Parse the page with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Extract titles, links, and paragraph content
articles = []
select_items = soup.select('.column-center-item')
for item in select_items:
    title_divs = item.select('.tit')  # select every element with class 'tit'
    for div in title_divs:
        a_tag = div.find('a')  # find the first 'a' tag inside each div
        if a_tag:
            title = a_tag.get_text(strip=True)  # tag text, stripped of whitespace
            # hrefs may be relative, so resolve them against the page URL
            link = urljoin(url, a_tag.get('href'))

            # Fetch the linked article page
            sub_response = requests.get(link, headers=headers)
            sub_html = sub_response.text
            sub_soup = BeautifulSoup(sub_html, 'html.parser')

            # Extract the paragraph content
            article_content = sub_soup.find('span', id='detailContent')

            # Collect the text of every <p> tag
            paragraphs = []
            if article_content:
                p_tags = article_content.find_all('p')
                for p in p_tags:
                    paragraphs.append(p.get_text(strip=True))
            if not paragraphs:
                paragraphs.append("No paragraph content.")

            articles.append({
                'category': '时政热点',  # site section: politics headlines
                'title': title,
                'link': link,
                'paragraphs': paragraphs
            })

# Extract the other listings
other_content = []
other_items = soup.select('.xhwp_list')  # select every element with class 'xhwp_list'
for ul in other_items:
    lis = ul.find_all('li')  # all 'li' elements inside each 'ul'
    for li in lis:
        a_tag = li.find('a')  # the 'a' tag inside the 'li'
        if a_tag:
            content = a_tag.get_text(strip=True)  # tag text, stripped of whitespace
            link = urljoin(url, a_tag.get('href'))  # resolve relative hrefs here too
            other_content.append({'content': content, 'link': link})

# Formatted output and timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # current time, formatted
output_txt = f"xinhuanet_articles_{timestamp}.txt"    # text filename with timestamp
output_json = f"xinhuanet_articles_{timestamp}.json"  # JSON filename with timestamp

# Write the text file
with open(output_txt, 'w', encoding='utf-8') as file:
    file.write("===== Politics headlines: titles, links, and paragraphs =====\n\n")
    for i, article in enumerate(articles, start=1):
        file.write(f"{i}. Title: {article['title']}\n   Link: {article['link']}\n")
        file.write("   Paragraph text:\n")
        for j, p in enumerate(article['paragraphs'], start=1):
            file.write(f"      {j}. {p}\n")
        file.write("\n")
    file.write("\n===== Other content and links =====\n\n")
    for i, item in enumerate(other_content, start=1):
        file.write(f"{i}. {item['content']} - {item['link']}\n")

print(f"Data written to text file {output_txt}")

# Assemble the data for the JSON file
data = {
    "timestamp": timestamp,
    "articles": articles,
    "other_content": other_content
}

# Write the JSON file
with open(output_json, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

print(f"Data written to JSON file {output_json}")
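
# --- Optional hardening (editor's sketch, not part of the original script) ---
# The requests above run with no timeout, status check, or encoding fix, so one
# slow or failing article page aborts the whole scrape. Below is a minimal fetch
# helper, assuming the same `headers` dict defined above; the 10-second timeout
# and the apparent_encoding fallback are assumptions, not values from the original.

def fetch_html(target_url):
    """Fetch a page defensively; return '' instead of raising on failure."""
    try:
        resp = requests.get(target_url, headers=headers, timeout=10)
        resp.raise_for_status()  # treat HTTP 4xx/5xx responses as failures
        resp.encoding = resp.apparent_encoding  # guard against mis-declared charsets
        return resp.text
    except requests.RequestException as exc:
        print(f"Request failed for {target_url}: {exc}")
        return ""

# Usage sketch: replace each bare requests.get(...).text call above with
# fetch_html(...), e.g. sub_html = fetch_html(link), and skip the article
# (continue) whenever '' comes back.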