diff --git a/getNet.py b/getNet.py
new file mode 100644
index 0000000..8b40bea
--- /dev/null
+++ b/getNet.py
@@ -0,0 +1,66 @@
+from bs4 import BeautifulSoup
+import requests
+from datetime import datetime
+import json
+
+# Request headers (mobile User-Agent)
+headers = {
+    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
+}
+
+# Fetch the page content
+url = "http://www.xinhuanet.com/politicspro/"
+response = requests.get(url, headers=headers, timeout=10)
+response.encoding = response.apparent_encoding  # guard against mis-detected response encoding
+html = response.text
+
+# Parse the page with BeautifulSoup
+soup = BeautifulSoup(html, 'html.parser')
+
+# Extract titles
+titles = []
+divs_with_titles = soup.select('.tit')  # select all elements with class 'tit'
+for div in divs_with_titles:
+    a_tag = div.find('a')  # find the first 'a' tag in each element
+    if a_tag:
+        title = a_tag.get_text(strip=True)  # get the tag text, stripping whitespace
+        titles.append(title)
+
+# Extract other content
+other_content = []
+divs_with_other = soup.select('.xhwp_list')  # select all elements with class 'xhwp_list'
+for ul in divs_with_other:
+    lis = ul.find_all('li')  # find all 'li' elements in each list
+    for li in lis:
+        content = li.get_text(strip=True)  # get the element text, stripping whitespace
+        other_content.append(content)
+
+# Build timestamped output file names
+timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")  # current time, formatted
+output_file_txt = f"xinhuanet_data_{timestamp}.txt"   # text output file name with timestamp
+output_file_json = f"xinhuanet_data_{timestamp}.json"  # JSON output file name with timestamp
+
+# Write the text file
+with open(output_file_txt, 'w', encoding='utf-8') as file:
+    file.write("=== Titles ===\n\n")
+    for i, title in enumerate(titles, start=1):
+        file.write(f"{i}. {title}\n")
+
+    file.write("\n=== Other content ===\n\n")
+    for i, content in enumerate(other_content, start=1):
+        file.write(f"{i}. {content}\n")
+
+print(f"Data written to text file {output_file_txt}")
+
+# Prepare data for the JSON file
+data = {
+    "timestamp": timestamp,
+    "titles": titles,
+    "other_content": other_content
+}
+
+# Write the JSON file
+with open(output_file_json, 'w', encoding='utf-8') as json_file:
+    json.dump(data, json_file, ensure_ascii=False, indent=4)
+
+print(f"Data written to JSON file {output_file_json}")