|
|
import requests
|
|
|
import pandas as pd
|
|
|
import nltk
|
|
|
from google.cloud import language_v1
|
|
|
import matplotlib.pyplot as plt
|
|
|
import os
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
|
# Download the NLTK resources required by preprocess_text
# (the 'punkt' tokenizer and the English stopword list).
nltk.download('punkt')
nltk.download('stopwords')


# Path to the Google Cloud service-account key file.
# FIX: use a raw string — the original relied on "\P" and "\p" being *invalid*
# escape sequences that Python happens to keep literally (deprecation-warned
# since 3.6); the raw-string path below evaluates to the exact same value.
# SECURITY NOTE(review): hard-coding a service-account key path (and shipping
# the key alongside the code) is risky; prefer setting
# GOOGLE_APPLICATION_CREDENTIALS in the runtime environment instead.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"D:\PYcharm\pycharm projects\Life\blibliCrawler\paris-olympic-436002-8fa93022f012.json"
|
|
|
|
|
|
# 第一步:使用Google Custom Search API获取新闻数据
|
|
|
# Step 1: fetch news data via the Google Custom Search API.
def fetch_news(api_key, cse_id, query, num_results=50):
    """Fetch news search results from the Google Custom Search API.

    Args:
        api_key: Google API key, sent as the 'key' query parameter.
        cse_id: Custom Search Engine ID, sent as the 'cx' query parameter.
        query: Search query string.
        num_results: Upper bound on results to request; the API pages 10 at a
            time, so requests are issued for start = 1, 11, 21, ...

    Returns:
        pandas.DataFrame with one row per raw search-result item (the API's
        'items' entries). Empty DataFrame if nothing was returned.
    """
    news_data = []
    # Loop-invariant endpoint hoisted out of the paging loop.
    url = 'https://www.googleapis.com/customsearch/v1'
    for start in range(1, num_results, 10):
        params = {
            # BUG FIX: the original ignored both arguments and sent
            # hard-coded 'key'/'cx' values, so callers' credentials
            # (e.g. the ones set in __main__) were never used.
            'key': api_key,
            'cx': cse_id,
            'q': query,
            'start': start,
            'lr': 'lang_en',  # restrict to English; use 'lang_zh-CN' for Chinese
            'sort': 'date',   # sort results by date
        }
        response = requests.get(url, params=params)
        data = response.json()
        # The API omits 'items' when there are no (more) results.
        if 'items' not in data:
            print("未获取到更多数据。")
            break
        news_data.extend(data['items'])
    return pd.DataFrame(news_data)
|
|
|
|
|
|
# 第二步:数据预处理
|
|
|
# Step 2: data preprocessing.
def preprocess_text(text):
    """Lowercase, tokenize, and drop stopwords / non-alphabetic tokens.

    Args:
        text: Raw input string (e.g. title + snippet of an article).

    Returns:
        The cleaned text: remaining tokens joined by single spaces.
    """
    # Normalize case before tokenizing.
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # PERF FIX: build the stopword set once per call. The original evaluated
    # nltk.corpus.stopwords.words('english') — which constructs a fresh list —
    # for every single token, making the filter quadratic in practice.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Keep only purely alphabetic tokens that are not stopwords.
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)
|
|
|
|
|
|
# 第三步:情感分析
|
|
|
# Step 3: sentiment analysis.
def analyze_sentiment(text_content):
    """Score the sentiment of *text_content* with the Google Cloud NL API.

    Args:
        text_content: Plain text to analyze.

    Returns:
        The document-level sentiment score (float reported by the API).
    """
    nl_client = language_v1.LanguageServiceClient()
    doc = language_v1.Document(
        content=text_content,
        type_=language_v1.Document.Type.PLAIN_TEXT,
    )
    result = nl_client.analyze_sentiment(request={'document': doc})
    return result.document_sentiment.score
|
|
|
|
|
|
# 第四步:可视化
|
|
|
# Step 4: visualization.
def plot_sentiment_over_time(df):
    """Plot the daily mean sentiment score over time.

    Args:
        df: DataFrame with a 'date' column (datetime-parseable values) and a
            numeric 'sentiment' column.

    Side effects:
        Opens a matplotlib window via plt.show(); does NOT modify *df*.
    """
    # BUG FIX: work on a copy — the original overwrote the caller's 'date'
    # column and called set_index(inplace=True), mutating the argument.
    daily = df.copy()
    daily['date'] = pd.to_datetime(daily['date'])
    daily.set_index('date', inplace=True)
    # Resample to calendar days and plot the mean sentiment per day.
    daily.resample('D')['sentiment'].mean().plot()
    plt.title('情感得分随时间的变化')
    plt.xlabel('日期')
    plt.ylabel('平均情感得分')
    plt.show()
|
|
|
|
|
|
# 主函数
|
|
|
# Main entry point: fetch -> clean -> score -> plot.
if __name__ == "__main__":
    # Replace with your own API key and search engine ID.
    api_key = 'YOUR_GOOGLE_CUSTOM_SEARCH_API_KEY'
    cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'
    query = 'Paris Olympics'  # or '巴黎奥运会' for Chinese content

    # Fetch the raw news data.
    df = fetch_news(api_key, cse_id, query)
    print(f"共获取到 {len(df)} 篇文章。")

    # Build the analysis text from title + snippet.
    if 'snippet' in df.columns and 'title' in df.columns:
        df['text'] = df['title'] + ' ' + df['snippet']
    else:
        print("缺少必要的文本字段。")
        # FIX: exit() is a site-module convenience not guaranteed in scripts;
        # raise SystemExit with a non-zero code to signal failure.
        raise SystemExit(1)

    # Derive a publication date per article, falling back to "now"
    # when the page metadata does not provide one.
    if 'pagemap' in df.columns:
        dates = []
        for item in df['pagemap']:
            # ROBUSTNESS FIX: rows without a pagemap surface as float NaN in
            # pandas, and `'metatags' in NaN` raises TypeError; an empty
            # 'metatags' list would also have raised IndexError on [0].
            meta = item.get('metatags') if isinstance(item, dict) else None
            if meta and 'og:updated_time' in meta[0]:
                dates.append(meta[0]['og:updated_time'])
            else:
                dates.append(datetime.now().isoformat())
        df['date'] = dates
    else:
        df['date'] = datetime.now().isoformat()

    # Preprocess the text.
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Sentiment analysis (one API call per article).
    df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)

    # Visualize sentiment over time.
    plot_sentiment_over_time(df)
|