import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

# Requests below are made with verify=False, so silence the InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'


def get_world_comments():
    # No proxy pool is configured; when reproducing, enable a proxy yourself,
    # otherwise some of the overseas URLs may be unreachable.
    urls = [
        "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/",
        "https://www.news.ufl.edu/2024/07/ai-olympics/",
        "https://www.thepaper.cn/newsDetail_forward_28287864",
        "https://www.wicongress.org.cn/2024/zh/article/6268",
        "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/",
    ]
    headers = {
        'User-Agent': USER_AGENT,
    }
    data = []
    for url in urls:
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            if url == "https://www.nytimes.com/athletic/5646415/2024/07/25/2024-paris-olympics-ai-omega/":
                # Keep only the first five <strong> passages, plus the headline.
                strong_tags = soup.find_all('strong')
                count = 0
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })
                    count += 1
                    if count == 5:
                        break
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })

            if url == "https://www.news.ufl.edu/2024/07/ai-olympics/":
                # Collect the headline and all section subheadings.
                h1_tags = soup.find_all('h1')
                for tag in h1_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h1',
                    })
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'h2',
                    })

            if url == "https://www.thepaper.cn/newsDetail_forward_28287864":
                # Keep only paragraphs starting with "技术" ("technology"), which mark
                # the technology commentary in this Chinese-language article.
                p_tags = soup.find_all('p')
                for tag in p_tags:
                    content = tag.get_text()
                    if content.startswith("技术"):
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'p',
                        })

            if url == "https://www.wicongress.org.cn/2024/zh/article/6268":
                strong_tags = soup.find_all('strong')
                for tag in strong_tags:
                    print(tag.get_text())
                    data.append({
                        'url': url,
                        'content': tag.get_text(),
                        'tag': 'strong',
                    })

            if url == "https://cn.technode.com/post/2024-08-06/paris-2024-esg-ai-digital-new-tech/":
                # Skip <h2> elements that are empty after stripping whitespace.
                h2_tags = soup.find_all('h2')
                for tag in h2_tags:
                    content = tag.get_text().strip()
                    if content:
                        print(content)
                        data.append({
                            'url': url,
                            'content': content,
                            'tag': 'h2',
                        })
        else:
            print(f"Failed to access {url}")

    df = pd.DataFrame(data)
    df.to_csv('world_comment.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    get_world_comments()