import os
import sys
from datetime import datetime

import requests
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from google.cloud import language_v1

nltk.download('punkt')
nltk.download('stopwords')

# Path to the Google Cloud service-account key file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"D:\PYcharm\pycharm projects\Life\blibliCrawler\paris-olympic-436002-8fa93022f012.json"


# Step 1: Fetch news data via the Google Custom Search API
def fetch_news(api_key, cse_id, query, num_results=50):
    news_data = []
    for start in range(1, num_results, 10):
        url = 'https://www.googleapis.com/customsearch/v1'
        params = {
            'key': api_key,
            'cx': cse_id,
            'q': query,
            'start': start,
            'lr': 'lang_en',  # search English content; use 'lang_zh-CN' for Chinese
            'sort': 'date'    # sort by date
        }
        response = requests.get(url, params=params)
        data = response.json()
        if 'items' not in data:
            print("No more results returned.")
            break
        news_data.extend(data['items'])
    return pd.DataFrame(news_data)


# Step 2: Text preprocessing
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and non-alphabetic tokens
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)


# Step 3: Sentiment analysis
def analyze_sentiment(text_content):
    client = language_v1.LanguageServiceClient()
    document = language_v1.Document(content=text_content,
                                    type_=language_v1.Document.Type.PLAIN_TEXT)
    sentiment = client.analyze_sentiment(request={'document': document}).document_sentiment
    return sentiment.score


# Step 4: Visualization
def plot_sentiment_over_time(df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df.resample('D')['sentiment'].mean().plot()
    plt.title('Sentiment score over time')
    plt.xlabel('Date')
    plt.ylabel('Average sentiment score')
    plt.show()


# Main entry point
if __name__ == "__main__":
    # Replace with your own API key and search engine ID
    api_key = 'YOUR_GOOGLE_CUSTOM_SEARCH_API_KEY'
    cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'
    query = 'Paris Olympics'  # or '巴黎奥运会' for Chinese content

    # Fetch the news data
    df = fetch_news(api_key, cse_id, query)
    print(f"Retrieved {len(df)} articles.")

    # Build the text field from title and snippet
    if 'snippet' in df.columns and 'title' in df.columns:
        df['text'] = df['title'] + ' ' + df['snippet']
    else:
        sys.exit("Required text fields are missing.")

    # Extract a publication date where available, falling back to the current time
    if 'pagemap' in df.columns:
        dates = []
        for item in df['pagemap']:
            if (isinstance(item, dict) and item.get('metatags')
                    and 'og:updated_time' in item['metatags'][0]):
                dates.append(item['metatags'][0]['og:updated_time'])
            else:
                dates.append(datetime.now().isoformat())
        df['date'] = dates
    else:
        df['date'] = datetime.now().isoformat()

    # Preprocess the text
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Sentiment analysis
    df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)

    # Visualization
    plot_sentiment_over_time(df)