|
|
|
@ -1,99 +1,109 @@
|
|
|
|
|
import requests
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import nltk
|
|
|
|
|
from google.cloud import language_v1
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
import os
|
|
|
|
|
import numpy as np
|
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
from snownlp import SnowNLP
|
|
|
|
|
from matplotlib.font_manager import FontProperties
|
|
|
|
|
|
|
|
|
|
def plot_sentiment_distribution(danmaku_list):
    """Plot a histogram of SnowNLP sentiment scores for a list of danmaku.

    Args:
        danmaku_list: Iterable of danmaku (bullet-comment) strings. Each
            entry is scored with ``SnowNLP(text).sentiments``.

    Returns:
        None. The histogram is displayed with ``plt.show()``.
    """
    # Score every danmaku individually.
    sentiments = [SnowNLP(dmk).sentiments for dmk in danmaku_list]

    # Draw the histogram of the scores.
    plt.figure(figsize=(10, 6))
    plt.hist(sentiments, bins=20, color='lightgreen', edgecolor='black')
    plt.xlabel('Sentiment Score')
    # The previous label was an unresolved translation alternative
    # ('Bullet Screen Count" or "Danmaku Count'); use the term the other
    # labels in this file already use.
    plt.ylabel('Danmaku Count')
    plt.title('Danmaku Sentiment Score Distribution Chart')
    plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def plot_top_danmakus(danmaku_frequency):
    """Draw a bar chart of danmaku words against their frequencies.

    Args:
        danmaku_frequency: Mapping of word -> occurrence count. Bars are
            drawn in the mapping's iteration order.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Split the mapping into parallel lists for the bar chart.
    words = list(danmaku_frequency.keys())
    frequencies = list(danmaku_frequency.values())

    # Previously the two lists were built and then discarded; draw them.
    plt.figure(figsize=(10, 6))
    plt.bar(words, frequencies, color='skyblue')
    plt.xlabel('Danmaku Words')
    plt.ylabel('Frequency of Appearance')
    plt.title('High-Frequency Danmaku Words Statistics')
    plt.xticks(rotation=45)  # tilt labels so neighbouring words do not overlap
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
# Download the NLTK resources used by preprocess_text (tokenizer models and
# the English stop-word list).
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; nltk.download reports failure via its return value, so requesting
# it is harmless where it is not needed.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Path to the Google Cloud service-account key file.
# Written as a raw string: the previous literal relied on invalid escape
# sequences (\P, \p, \L) being passed through, which Python now warns about.
# The resulting path value is unchanged.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"D:\PYcharm\pycharm projects\Life\blibliCrawler\paris-olympic-436002-8fa93022f012.json"
|
|
|
|
|
|
|
|
|
|
# 第一步:使用Google Custom Search API获取新闻数据
|
|
|
|
|
def fetch_news(api_key, cse_id, query, num_results=50):
    """Fetch search results from the Google Custom Search JSON API.

    Results are requested page by page (at most 10 per request) until
    ``num_results`` have been asked for or a response contains no ``items``.

    Args:
        api_key: API key, sent as the ``key`` request parameter.
        cse_id: Search engine ID, sent as the ``cx`` request parameter.
        query: Search query string.
        num_results: Maximum number of results to request in total.

    Returns:
        pandas.DataFrame with one row per returned item (empty if the first
        response has no ``items``).
    """
    url = 'https://www.googleapis.com/customsearch/v1'
    page_size = 10
    news_data = []

    # `start` is 1-based. The upper bound is inclusive so that
    # num_results=1 (or 11, 21, ...) still requests its final page.
    for start in range(1, num_results + 1, page_size):
        params = {
            # Use the caller's credentials; they were previously ignored in
            # favour of values hard-coded in this function.
            'key': api_key,
            'cx': cse_id,
            'q': query,
            'start': start,
            # Ask only for what is still needed on the last page.
            'num': min(page_size, num_results - start + 1),
            'lr': 'lang_en',  # English results; use 'lang_zh-CN' for Chinese
            'sort': 'date',  # sort by date
        }
        # A timeout keeps an unresponsive endpoint from hanging the script.
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        if 'items' not in data:
            print("未获取到更多数据。")
            break
        news_data.extend(data['items'])

    return pd.DataFrame(news_data)
|
|
|
|
|
|
|
|
|
|
# 第二步:数据预处理
|
|
|
|
|
def preprocess_text(text):
    """Lower-case, tokenize and filter a piece of English text.

    Args:
        text: The raw text to clean.

    Returns:
        A single string of the remaining tokens joined by spaces. Tokens
        that are not purely alphabetic, or that appear in NLTK's English
        stop-word list, are dropped.
    """
    # Load the stop words once into a set; the list was previously re-read
    # from the corpus for every token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Lower-case first so the comparison with the stop words matches.
    tokens = nltk.word_tokenize(text.lower())

    kept = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(kept)
|
|
|
|
|
|
|
|
|
|
# 第三步:情感分析
|
|
|
|
|
# Shared Natural Language client, created on first use.
_language_client = None


def _get_language_client():
    """Return the shared LanguageServiceClient, creating it on first use."""
    global _language_client
    if _language_client is None:
        _language_client = language_v1.LanguageServiceClient()
    return _language_client


def analyze_sentiment(text_content):
    """Return the document-level sentiment score for a piece of text.

    Args:
        text_content: Plain text to send to the Google Cloud Natural
            Language API.

    Returns:
        The ``score`` attribute of the response's ``document_sentiment``.
    """
    # Reuse one client across calls; building a new client for every text
    # (e.g. once per row under DataFrame.apply) is needless setup work.
    client = _get_language_client()
    document = language_v1.Document(
        content=text_content,
        type_=language_v1.Document.Type.PLAIN_TEXT,
    )
    response = client.analyze_sentiment(request={'document': document})
    return response.document_sentiment.score
|
|
|
|
|
|
|
|
|
|
# 第四步:可视化
|
|
|
|
|
def plot_sentiment_over_time(df):
    """Plot the daily mean sentiment score as a time series.

    Args:
        df: DataFrame with a ``date`` column (anything ``pd.to_datetime``
            accepts) and a numeric ``sentiment`` column. The frame is not
            modified.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # Work on a derived series instead of rewriting the caller's frame
    # (the date column and index used to be replaced in place).
    # utc=True normalises naive and offset-aware timestamps to one timezone
    # so the result is a proper DatetimeIndex that resample() accepts.
    daily_mean = (
        df.assign(date=pd.to_datetime(df['date'], utc=True))
        .set_index('date')['sentiment']
        .resample('D')
        .mean()
    )

    daily_mean.plot()
    plt.title('情感得分随时间的变化')
    plt.xlabel('日期')
    plt.ylabel('平均情感得分')
    # The word-frequency bar-chart statements that used to follow here
    # referenced `words` and `frequencies`, which are not defined in this
    # function, so they are not part of the time-series plot.
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
# 主函数
|
|
|
|
|
def extract_article_date(pagemap, fallback=None):
    """Return the ``og:updated_time`` metatag of one search result.

    Args:
        pagemap: The ``pagemap`` value of a result item. Anything that is
            not a dict (e.g. NaN for a missing value) yields the fallback.
        fallback: Value returned when no timestamp is found. Defaults to
            the current local time in ISO format.

    Returns:
        The ``og:updated_time`` string of the first metatag entry, or
        ``fallback``.
    """
    if fallback is None:
        fallback = datetime.now().isoformat()
    if not isinstance(pagemap, dict):
        return fallback
    # Treat a missing or empty metatag list the same way: no timestamp.
    metatags = pagemap.get('metatags') or [{}]
    first = metatags[0]
    if isinstance(first, dict) and 'og:updated_time' in first:
        return first['og:updated_time']
    return fallback


if __name__ == "__main__":
    # Replace with your own API key and search engine ID.
    api_key = 'YOUR_GOOGLE_CUSTOM_SEARCH_API_KEY'
    cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'
    query = 'Paris Olympics'  # or '巴黎奥运会' for Chinese-language content

    # Fetch the news data.
    df = fetch_news(api_key, cse_id, query)
    print(f"共获取到 {len(df)} 篇文章。")

    # Build the text field from title + snippet.
    if 'snippet' not in df.columns or 'title' not in df.columns:
        print("缺少必要的文本字段。")
        # SystemExit is always available; exit() is an interactive helper
        # installed by the site module. Non-zero status signals the failure.
        raise SystemExit(1)
    # fillna keeps a missing title/snippet from turning the text into NaN,
    # which preprocess_text cannot lower-case.
    df['text'] = df['title'].fillna('') + ' ' + df['snippet'].fillna('')

    # Resolve a date per article, falling back to one shared "now" value.
    now_iso = datetime.now().isoformat()
    if 'pagemap' in df.columns:
        df['date'] = [extract_article_date(item, now_iso) for item in df['pagemap']]
    else:
        df['date'] = now_iso

    # Pre-process the text.
    df['cleaned_text'] = df['text'].apply(preprocess_text)

    # Sentiment analysis.
    df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)

    # Visualisation.
    plot_sentiment_over_time(df)
|
|
|
|
|
|
|
|
|
|
def create_manual_data(start='2023-07-26'):
    """Build a DataFrame of hand-picked daily sentiment scores.

    Args:
        start: First date of the series. One row per consecutive day is
            generated for each score, so the default covers 2023-07-26
            through 2023-08-11.

    Returns:
        pandas.DataFrame with a ``date`` column (daily timestamps) and a
        ``sentiment`` column (the manual scores below).
    """
    # Manually chosen scores: after the first day they climb to a peak on
    # the tenth day (Aug 4 with the default start) and then decline.
    sentiment_values = [
        0.5,   # Jul 26
        0.35,  # Jul 27
        0.4,   # Jul 28
        0.45,  # Jul 29
        0.5,   # Jul 30
        0.55,  # Jul 31
        0.6,   # Aug 1
        0.7,   # Aug 2
        0.8,   # Aug 3
        0.9,   # Aug 4 (peak)
        0.8,   # Aug 5
        0.7,   # Aug 6
        0.6,   # Aug 7
        0.5,   # Aug 8
        0.4,   # Aug 9
        0.35,  # Aug 10
        0.3,   # Aug 11
    ]

    # Derive the number of days from the scores so the two columns can
    # never get out of step.
    date_list = pd.date_range(start=start, periods=len(sentiment_values), freq='D')

    return pd.DataFrame({'date': date_list, 'sentiment': sentiment_values})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def plot_sentiment_combined(sentiment_df, font_path=r'C:\Windows\Fonts\msyh.ttc'):
    """Plot sentiment scores over time as bars with an overlaid line.

    Args:
        sentiment_df: DataFrame with a datetime ``date`` column and a
            numeric ``sentiment`` column.
        font_path: Font file used for all text (defaults to Microsoft
            YaHei). If the file does not exist, matplotlib's default font
            is used instead.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    import os  # local import keeps this block self-contained

    # Fall back to the default font rather than failing at render time on
    # machines that do not have the font file.
    if font_path and os.path.exists(font_path):
        font = FontProperties(fname=font_path, size=12)
    else:
        font = FontProperties(size=12)

    dates = sentiment_df['date']
    scores = sentiment_df['sentiment']

    plt.figure(figsize=(12, 6))

    # Bars and line share the same data and the same y-axis.
    plt.bar(dates, scores, color='skyblue', label='情感得分(柱状图)')
    plt.plot(dates, scores, color='red', marker='o', label='情感得分(折线图)')

    # Title and axis labels.
    plt.title('情感得分随时间的变化', fontproperties=font)
    plt.xlabel('日期', fontproperties=font)
    plt.ylabel('情感得分', fontproperties=font)

    # One tick per date, shown as month-day.
    plt.xticks(dates, dates.dt.strftime('%m-%d'), rotation=45, fontproperties=font)
    plt.yticks(fontproperties=font)

    # Value label slightly above each data point.
    for x, y in zip(dates, scores):
        plt.text(x, y + 0.02, f'{y:.2f}', ha='center', fontproperties=font)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(prop=font)

    # Adjust the layout so the rotated labels are not clipped.
    plt.tight_layout()
    plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Usage example: build the manual data set and draw the combined chart.
sentiment_df = create_manual_data()
plot_sentiment_combined(sentiment_df)
|
|
|
|
|