"""Scrape commentary about AI at the 2024 Paris Olympics from a fixed list of
news pages and save the extracted snippets to ``world_comment.csv``.

(Removed: repository web-UI residue accidentally pasted above the imports —
it was not valid Python.)
"""
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Desktop Edge browser UA string — presumably set so the target news sites
# do not reject the default `python-requests` user agent; verify if needed.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
def get_world_comments():
    """Scrape AI-at-the-Olympics commentary snippets from five news pages.

    For each URL a small rule table says which HTML tags to harvest, an
    optional text transform/filter, and an optional cap on the number of
    matches.  Every harvested snippet is printed and collected into a row
    of ``{'url', 'content', 'tag'}``, then the rows are written to
    ``world_comment.csv`` (UTF-8, no index).

    Returns:
        None.  Side effects: network requests, stdout printing, and the
        CSV file written to the working directory.

    NOTE (original author): no proxy pool is configured — enable a proxy
    when reproducing, otherwise some foreign URLs are unreachable.
    """
    # Per-site extraction rules: list of (tag_name, transform, limit).
    #   transform: callable(text) -> text-or-None; None means "keep as-is".
    #              Returning None/empty drops the match.
    #   limit:     max number of matches to keep, or None for all.
    site_rules = {
        "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/": [
            ('strong', None, 5),   # original kept only the first 5 <strong> tags
            ('h1', None, None),
        ],
        "https://www.news.ufl.edu/2024/07/ai-olympics/": [
            ('h1', None, None),
            ('h2', None, None),
        ],
        "https://www.thepaper.cn/newsDetail_forward_28287864": [
            # Only paragraphs starting with "技术" ("technology") are relevant.
            ('p', lambda t: t if t.startswith("技术") else None, None),
        ],
        "https://www.wicongress.org.cn/2024/zh/article/6268": [
            ('strong', None, None),
        ],
        "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/": [
            # Skip whitespace-only headings.  Fix vs. original: the original
            # printed the stripped text but stored the unstripped text; we now
            # store the stripped text consistently.
            ('h2', lambda t: t.strip() or None, None),
        ],
    }
    headers = {'User-Agent': USER_AGENT}
    data = []
    for url, rules in site_rules.items():
        # SECURITY: verify=False disables TLS certificate checking.  Kept from
        # the original (needed when tunnelling through an intercepting proxy),
        # but prefer verify=True outside that setup.
        response = requests.get(url, headers=headers, verify=False)
        if response.status_code != 200:
            print(f"无法访问 {url}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        for tag_name, transform, limit in rules:
            kept = 0
            for tag in soup.find_all(tag_name):
                content = tag.get_text()
                if transform is not None:
                    content = transform(content)
                    if not content:
                        continue  # filtered out by the per-site rule
                print(content)
                data.append({
                    'url': url,
                    'content': content,
                    'tag': tag_name,
                })
                kept += 1
                if limit is not None and kept >= limit:
                    break
    df = pd.DataFrame(data)
    df.to_csv('world_comment.csv', index=False, encoding='utf-8')
# Script entry point: run the scrape only when executed directly, not on import.
if __name__ == '__main__':
    get_world_comments()