import pandas as pd
import requests
from bs4 import BeautifulSoup

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
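
# verify=False in the requests.get call below skips TLS certificate checks,
# which makes urllib3 emit an InsecureRequestWarning on every request. An
# optional sketch to silence it (assumes the standalone urllib3 package that
# requests depends on):
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)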


def get_world_comments():
    # No proxy pool is set up, so turn on a proxy when reproducing this;
    # otherwise some of the overseas URLs below cannot be accessed.
    urls = [
        "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/",
        "https://www.news.ufl.edu/2024/07/ai-olympics/",
        "https://www.thepaper.cn/newsDetail_forward_28287864",
        "https://www.wicongress.org.cn/2024/zh/article/6268",
        "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/",
    ]
    headers = {
        'User-Agent': USER_AGENT,
    }
    # One row per extracted tag: source URL, text content, and tag name.
    data = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers, timeout=30, verify=False)
        except requests.RequestException as exc:
            # Without a proxy some of these sites are unreachable; skip the
            # URL and keep whatever has been collected so far.
            print(f"Failed to access {url}: {exc}")
            continue
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            if url == "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/":
                # Take only the first five <strong> passages, then every <h1> headline.
                strong_tags = soup.find_all('strong')
                count = 0
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })
                    count += 1
                    if count == 5:
                        break
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })
            if url == "https://www.news.ufl.edu/2024/07/ai-olympics/":
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h2',
                    })
            if url == "https://www.thepaper.cn/newsDetail_forward_28287864":
                # Keep only paragraphs whose text starts with "技术" ("technology").
                p_tags = soup.find_all('p')
                for tag in p_tags:
                    content = tag.get_text()
                    if content.startswith("技术"):
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'p',
                        })
            if url == "https://www.wicongress.org.cn/2024/zh/article/6268":
                strong_tags = soup.find_all('strong')
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })
            if url == "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/":
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    # Skip headings that are blank after stripping, and store
                    # the stripped text (not the raw tag text).
                    content = tag.get_text().strip()
                    if content:
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'h2',
                        })
        else:
            print(f"Failed to access {url}: HTTP {response.status_code}")

    df = pd.DataFrame(data)
    df.to_csv('world_comment.csv', index=False, encoding='utf-8')
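
# The per-site blocks above all repeat the same find_all / print / append
# pattern. A hypothetical helper like this (a sketch only, not wired into the
# function above) could collapse them; unlike the originals it also strips and
# skips blank text, and `limit` mirrors the five-item cap on the NYT page:
def collect_tags(soup, url, tag_name, rows, limit=None):
    """Record the text of every matching tag as a row dict in `rows`."""
    for i, tag in enumerate(soup.find_all(tag_name)):
        text = tag.get_text().strip()
        if not text:
            continue
        print(text)
        rows.append({'url': url, 'content': text, 'tag': tag_name})
        if limit is not None and i + 1 >= limit:
            break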


if __name__ == '__main__':
    get_world_comments()
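
# A quick way to sanity-check the output afterwards (a minimal sketch; reads
# back the file written by get_world_comments above):
#   df = pd.read_csv('world_comment.csv', encoding='utf-8')
#   print(df['tag'].value_counts())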