You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

97 lines
3.7 KiB

import json
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Request headers: a mobile Safari UA so the site serves its mobile layout.
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1"
}
# Fetch the politics front page. A timeout keeps the script from hanging
# indefinitely on a stalled connection (requests has no default timeout).
url = "http://www.xinhuanet.com/politicspro/"
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# Parse the page with the stdlib html.parser backend.
soup = BeautifulSoup(html, 'html.parser')
# Extract the title, link, and body paragraphs of every politics headline.
articles = []
select_items = soup.select('.column-center-item')
for item in select_items:
    title_divs = item.select('.tit')  # every element with class 'tit'
    for div in title_divs:
        a_tag = div.find('a')  # first anchor inside the div
        if not a_tag:
            continue
        title = a_tag.get_text(strip=True)
        # Hrefs may be site-relative; resolve them against the page URL so
        # the follow-up request always gets an absolute link.
        link = urljoin(url, a_tag.get('href'))
        # Fetch the linked article page; skip this article on network errors
        # instead of aborting the whole scrape on one dead link.
        try:
            sub_response = requests.get(link, headers=headers, timeout=10)
        except requests.RequestException:
            continue
        sub_soup = BeautifulSoup(sub_response.text, 'html.parser')
        # The article body lives in <span id="detailContent">.
        article_content = sub_soup.find('span', id='detailContent')
        paragraphs = []
        if article_content:
            paragraphs = [p.get_text(strip=True)
                          for p in article_content.find_all('p')]
        if not paragraphs:
            paragraphs.append("没有段落内容。")
        articles.append({'category': '时政热点', 'title': title,
                         'link': link, 'paragraphs': paragraphs})
# Collect the remaining headline lists (title + link only, no body fetch).
other_content = []
other_items = soup.select('.xhwp_list')  # every element with class 'xhwp_list'
for ul in other_items:
    for li in ul.find_all('li'):  # each list entry inside the <ul>
        a_tag = li.find('a')
        if a_tag:
            content = a_tag.get_text(strip=True)
            # Resolve possibly-relative hrefs against the page URL so the
            # stored link is always absolute.
            link = urljoin(url, a_tag.get('href'))
            other_content.append({'content': content, 'link': link})
# Timestamp the output filenames so repeated runs never overwrite each other.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
_base_name = f"xinhuanet_articles_{timestamp}"
output_txt = _base_name + ".txt"    # human-readable report
output_json = _base_name + ".json"  # structured dump
# Render the scraped data into a human-readable text report.
with open(output_txt, 'w', encoding='utf-8') as report:
    buf = ["===== 时政热点:标题、链接和段落内容 =====\n\n"]
    for idx, art in enumerate(articles, start=1):
        buf.append(f"{idx}. 标题: {art['title']}\n 链接: {art['link']}\n")
        buf.append(" 段落文本内容:\n")
        buf.extend(f" {pno}. {para}\n"
                   for pno, para in enumerate(art['paragraphs'], start=1))
        buf.append("\n")
    buf.append("\n===== 其他内容和链接 =====\n\n")
    buf.extend(f"{idx}. {entry['content']} - {entry['link']}\n"
               for idx, entry in enumerate(other_content, start=1))
    # One batched write instead of many small write() calls.
    report.writelines(buf)
print(f"数据已写入文本文件 {output_txt}")
# Persist the same data as structured JSON (keys preserved for consumers).
data = {"timestamp": timestamp,
        "articles": articles,
        "other_content": other_content}
# ensure_ascii=False keeps the Chinese text readable in the output file.
with open(output_json, 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False, indent=4)
print(f"数据已写入JSON文件 {output_json}")