You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

100 lines
3.4 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import pandas as pd
import nltk
from google.cloud import language_v1
import matplotlib.pyplot as plt
import os
from datetime import datetime, timedelta
# Ensure the NLTK tokenizer model and stopword corpus are available locally.
nltk.download('punkt')
nltk.download('stopwords')
# Point the Google Cloud client libraries at the service-account key file.
# Raw string avoids invalid escape sequences ("\P", "\p") in the Windows path;
# the resulting value is byte-identical to the original ("\\b" collapsed to "\b").
# NOTE(review): a hard-coded absolute credential path should come from config/env.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"D:\PYcharm\pycharm projects\Life\blibliCrawler\paris-olympic-436002-8fa93022f012.json"
# Step 1: fetch news data via the Google Custom Search API
def fetch_news(api_key, cse_id, query, num_results=50):
    """Fetch news results from the Google Custom Search API.

    Args:
        api_key: Google Custom Search API key.
        cse_id: Custom Search Engine ID.
        query: Search query string.
        num_results: Maximum number of results to request; the API pages
            10 items at a time.

    Returns:
        pandas.DataFrame with one row per search-result item (empty if the
        first page already has no 'items').
    """
    news_data = []
    # The API returns at most 10 items per request; page via the 'start' offset.
    for start in range(1, num_results, 10):
        url = 'https://www.googleapis.com/customsearch/v1'
        params = {
            'key': api_key,   # BUG FIX: was a hard-coded (leaked) key; use the caller's
            'cx': cse_id,     # BUG FIX: was a hard-coded ID; use the caller's
            'q': query,
            'start': start,
            'lr': 'lang_en',  # English results; use 'lang_zh-CN' for Chinese
            'sort': 'date',   # sort by date
        }
        # Timeout prevents the crawler hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        data = response.json()
        if 'items' not in data:
            print("未获取到更多数据。")
            break
        news_data.extend(data['items'])
    return pd.DataFrame(news_data)
# Step 2: data preprocessing
def preprocess_text(text):
    """Lowercase, tokenize, and drop stopwords/non-alphabetic tokens.

    Args:
        text: Raw input string.

    Returns:
        Space-joined string of cleaned tokens.
    """
    # Normalize case before tokenizing.
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # PERF FIX: build the stopword set once — the original re-loaded the whole
    # corpus list (and did an O(n) membership scan) for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)
# Step 3: sentiment analysis
def analyze_sentiment(text_content):
    """Score *text_content* with the Google Cloud Natural Language API.

    Args:
        text_content: Plain-text string to analyze.

    Returns:
        The document-level sentiment score (a float; per the API docs roughly
        -1.0 = negative to 1.0 = positive — confirm against current API).
    """
    nl_client = language_v1.LanguageServiceClient()
    doc = language_v1.Document(
        content=text_content,
        type_=language_v1.Document.Type.PLAIN_TEXT,
    )
    result = nl_client.analyze_sentiment(request={'document': doc})
    return result.document_sentiment.score
# Step 4: visualization
def plot_sentiment_over_time(df):
    """Plot the daily mean sentiment score over time and show the figure.

    Args:
        df: DataFrame with 'date' (parseable by pandas) and 'sentiment' columns.

    Note:
        Mutates *df* in place: 'date' is parsed to datetime and becomes the index.
    """
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    # Resample to calendar days and average the per-article scores.
    daily_mean = df.resample('D')['sentiment'].mean()
    daily_mean.plot()
    plt.title('情感得分随时间的变化')
    plt.xlabel('日期')
    plt.ylabel('平均情感得分')
    plt.show()
# Main entry point
if __name__ == "__main__":
    # Replace with your own API key and search engine ID.
    api_key = 'YOUR_GOOGLE_CUSTOM_SEARCH_API_KEY'
    cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'
    query = 'Paris Olympics'  # or '巴黎奥运会' for Chinese-language content

    # Fetch the raw news items.
    df = fetch_news(api_key, cse_id, query)
    print(f"共获取到 {len(df)} 篇文章。")

    # Combine title and snippet into a single text field for analysis.
    if 'snippet' in df.columns and 'title' in df.columns:
        df['text'] = df['title'] + ' ' + df['snippet']
    else:
        print("缺少必要的文本字段。")
        # BUG FIX: exit() is the interactive-shell helper; raise SystemExit with
        # a nonzero code so callers/shells see the failure.
        raise SystemExit(1)

    # Derive a per-article date, falling back to "now" when unavailable.
    if 'pagemap' in df.columns:
        dates = []
        for item in df['pagemap']:
            # BUG FIX: guard against NaN pagemap cells (float → TypeError on
            # `'metatags' in item`) and empty metatags lists (IndexError on [0]).
            if (isinstance(item, dict)
                    and item.get('metatags')
                    and 'og:updated_time' in item['metatags'][0]):
                dates.append(item['metatags'][0]['og:updated_time'])
            else:
                dates.append(datetime.now().isoformat())
        df['date'] = dates
    else:
        df['date'] = datetime.now().isoformat()

    # Clean the text, score sentiment per article, then plot the daily trend.
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)
    plot_sentiment_over_time(df)