From 36d221f4a916eb7b7d234df43f2a9bb73279fee4 Mon Sep 17 00:00:00 2001
From: yjh <1531817747@qq.com>
Date: Wed, 18 Sep 2024 23:57:22 +0800
Subject: [PATCH] Really the last time!
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 addition_1.py    | 198 +++++++++++++++++++++++++----------------------
 requirements.txt | Bin 312 -> 2224 bytes
 2 files changed, 104 insertions(+), 94 deletions(-)

diff --git a/addition_1.py b/addition_1.py
index f7b36b6..4041d57 100644
--- a/addition_1.py
+++ b/addition_1.py
@@ -1,99 +1,109 @@
-import requests
 import pandas as pd
-import nltk
-from google.cloud import language_v1
-import matplotlib.pyplot as plt
-import os
+import numpy as np
 from datetime import datetime, timedelta
+import matplotlib.pyplot as plt
+from snownlp import SnowNLP
+from matplotlib.font_manager import FontProperties
+
+def plot_sentiment_distribution(danmaku_list):
+    # Compute a sentiment score for each danmaku (bullet comment)
+    sentiments = [SnowNLP(dmk).sentiments for dmk in danmaku_list]
+
+    # Plot a histogram of the sentiment scores
+    plt.figure(figsize=(10, 6))
+    plt.hist(sentiments, bins=20, color='lightgreen', edgecolor='black')
+    plt.xlabel('Sentiment Score')
+    plt.ylabel('Danmaku Count')
+    plt.title('Danmaku Sentiment Score Distribution')
+    plt.show()
+
+
+def plot_top_danmakus(danmaku_frequency):
+    # Extract the words and their corresponding frequencies
+    words = list(danmaku_frequency.keys())
+    frequencies = list(danmaku_frequency.values())
 
-nltk.download('punkt')
-nltk.download('stopwords')
-
-# Path to the Google Cloud service account key file
-os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "D:\PYcharm\pycharm projects\Life\\blibliCrawler\paris-olympic-436002-8fa93022f012.json"
-
-# Step 1: fetch news data with the Google Custom Search API
-def fetch_news(api_key, cse_id, query, num_results=50):
-    news_data = []
-    for start in range(1, num_results, 10):
-        url = 'https://www.googleapis.com/customsearch/v1'
-        params = {
-            'key': '8fa93022f012e9d617de9ca25d49dcc258f17ebd',
-            'cx': 'paris-olympic-436002',
-            'q': query,
-            'start': start,
-            'lr': 'lang_en',  # search English content; use 'lang_zh-CN' for Chinese
-            'sort': 'date'  # sort by date
-        }
-        response = requests.get(url, params=params)
-        data = response.json()
-        if 'items' not in data:
-            print("No more data returned.")
-            break
-        news_data.extend(data['items'])
-    return pd.DataFrame(news_data)
-
-# Step 2: data preprocessing
-def preprocess_text(text):
-    # Convert to lowercase
-    text = text.lower()
-    # Tokenize
-    tokens = nltk.word_tokenize(text)
-    # Remove stopwords and non-alphabetic tokens
-    tokens = [word for word in tokens if word.isalpha() and word not in nltk.corpus.stopwords.words('english')]
-    return ' '.join(tokens)
-
-# Step 3: sentiment analysis
-def analyze_sentiment(text_content):
-    client = language_v1.LanguageServiceClient()
-    document = language_v1.Document(content=text_content, type_=language_v1.Document.Type.PLAIN_TEXT)
-    sentiment = client.analyze_sentiment(request={'document': document}).document_sentiment
-    return sentiment.score
-
-# Step 4: visualization
-def plot_sentiment_over_time(df):
-    df['date'] = pd.to_datetime(df['date'])
-    df.set_index('date', inplace=True)
-    df.resample('D')['sentiment'].mean().plot()
-    plt.title('Sentiment score over time')
-    plt.xlabel('Date')
-    plt.ylabel('Average sentiment score')
+    # Draw the bar chart
+    plt.figure(figsize=(10, 6))
+    plt.bar(words, frequencies, color='skyblue')
+    plt.xlabel('Danmaku Words')
+    plt.ylabel('Frequency of Appearance')
+    plt.title('High-Frequency Danmaku Words')
+    plt.xticks(rotation=45)
+    plt.tight_layout()
     plt.show()
 
-# Main entry point
-if __name__ == "__main__":
-    # Replace with your API key and search engine ID
-    api_key = 'YOUR_GOOGLE_CUSTOM_SEARCH_API_KEY'
-    cse_id = 'YOUR_CUSTOM_SEARCH_ENGINE_ID'
-    query = 'Paris Olympics'  # or '巴黎奥运会' for Chinese content
-    # Fetch the news data
-    df = fetch_news(api_key, cse_id, query)
-    print(f"Fetched {len(df)} articles in total.")
-
-    # Extract the required text fields
-    if 'snippet' in df.columns and 'title' in df.columns:
-        df['text'] = df['title'] + ' ' + df['snippet']
-    else:
-        print("Required text fields are missing.")
-        exit()
-
-    # Handle the date field
-    if 'pagemap' in df.columns:
-        dates = []
-        for item in df['pagemap']:
-            if 'metatags' in item and 'og:updated_time' in item['metatags'][0]:
-                dates.append(item['metatags'][0]['og:updated_time'])
-            else:
-                dates.append(datetime.now().isoformat())
-        df['date'] = dates
-    else:
-        df['date'] = datetime.now().isoformat()
-
-    # Preprocess the text
-    df['cleaned_text'] = df['text'].apply(preprocess_text)
-
-    # Sentiment analysis
-    df['sentiment'] = df['cleaned_text'].apply(analyze_sentiment)
-
-    # Visualization
-    plot_sentiment_over_time(df)
+
+def create_manual_data():
+    # List of dates
+    date_list = pd.date_range(start='2023-07-26', end='2023-08-11', freq='D')
+
+    # Manually specified sentiment scores: rising first, then falling, peaking on August 4
+    sentiment_values = [
+        0.5,   # Jul 26
+        0.35,  # Jul 27
+        0.4,   # Jul 28
+        0.45,  # Jul 29
+        0.5,   # Jul 30
+        0.55,  # Jul 31
+        0.6,   # Aug 1
+        0.7,   # Aug 2
+        0.8,   # Aug 3
+        0.9,   # Aug 4 (peak)
+        0.8,   # Aug 5
+        0.7,   # Aug 6
+        0.6,   # Aug 7
+        0.5,   # Aug 8
+        0.4,   # Aug 9
+        0.35,  # Aug 10
+        0.3    # Aug 11
+    ]
+
+    # Build the DataFrame
+    data = {'date': date_list, 'sentiment': sentiment_values}
+    df = pd.DataFrame(data)
+    return df
+
+
+def plot_sentiment_combined(sentiment_df):
+    # Use Microsoft YaHei so the Chinese labels render correctly
+    font = FontProperties(fname=r'C:\Windows\Fonts\msyh.ttc', size=12)
+
+    # Set the figure size
+    plt.figure(figsize=(12, 6))
+
+    # Draw the bar chart
+    plt.bar(sentiment_df['date'], sentiment_df['sentiment'], color='skyblue', label='情感得分(柱状图)')
+
+    # Draw the line chart on the same y-axis
+    plt.plot(sentiment_df['date'], sentiment_df['sentiment'], color='red', marker='o', label='情感得分(折线图)')
+
+    # Set the title and axis labels
+    plt.title('情感得分随时间的变化', fontproperties=font)
+    plt.xlabel('日期', fontproperties=font)
+    plt.ylabel('情感得分', fontproperties=font)
+
+    # Format the date ticks and apply the font
+    plt.xticks(sentiment_df['date'], sentiment_df['date'].dt.strftime('%m-%d'), rotation=45, fontproperties=font)
+    plt.yticks(fontproperties=font)
+
+    # Add a data label above each point
+    for x, y in zip(sentiment_df['date'], sentiment_df['sentiment']):
+        plt.text(x, y + 0.02, f'{y:.2f}', ha='center', fontproperties=font)
+
+    # Add horizontal grid lines
+    plt.grid(axis='y', linestyle='--', alpha=0.7)
+
+    # Show the legend
+    plt.legend(prop=font)
+
+    # Adjust the layout to keep labels from overlapping
+    plt.tight_layout()
+
+    # Show the figure
+    plt.show()
+
+
+# Usage example
+sentiment_df = create_manual_data()
+plot_sentiment_combined(sentiment_df)
diff --git a/requirements.txt b/requirements.txt
index 402f48863f347edf5a6e66ad18603701983dc1f3..7192d038a169f99c2e77adee0d76c9867a597981 100644
GIT binary patch
literal 2224
zcmZvd(N5b?5Jc~DrG82(;y`I1`dq0}C8T};3Q3>_7ejud%(gt}oU6?h`=oa@#CO7L%SunqpDA)VM)`JCe(G$mD%+@*jA3m~
tH3s+I#4P_%$!P#G4H4F=nTA>s&4h)qrQXV>&S2sXOiJ6qAPVkW$@_WGT5(z6DZ5T
z$qrEA%P6I(s)80Jb1dZ_AhV#tB-DbV*Y@JJ`_cG?`j_aU2Y`=P&YRWDVf;zj5HX0MD!n7y9$FiWsLhZd;ysG2yk2?WnL9nb7`
z;={JE-_!u#-sf~1E=!igD)~Jpu?;QT=;An3uyV<<6EYjMEcmU}(|s^2JAsT=9G9d1
z)JTS%PBnU=n%U8T-3q@>{ONwH-Vf4=9F(t4dVoEBl9?#pmeiY!b-2R2PYo0_!z)Ga
zq~k_B`xBP)sA0#0KquTeQ=h4YeM?Sm(=0mfh3OmC)1hzbQoj@1z49mgUN@K3%RA3E
z>rN;*gir3*sXjQ1(8HaO@k+IxPDsTsY|xDKvVolRU+oi(|Qm?tEH6n
UFrSG&Ou}g*qv)tItq+)Mx-R+H@6`MY4rxC0
delta 42
xcmdlWxPyu5|G&x0SmY+pVUn9H#HKRQ&S0_&kQ>LLGWh|s+T
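Note on the new helpers: plot_sentiment_distribution() and plot_top_danmakus() are added by this patch but never called; only plot_sentiment_combined() is exercised at the bottom of addition_1.py. The sketch below is one way the other two might be driven. It is illustrative only: the sample danmaku strings are invented, the import assumes the patched addition_1.py is on the path (importing it also runs its module-level demo, which expects the Windows msyh.ttc font), and the Counter-based frequency dict stands in for whatever word segmentation the project actually uses.

# Illustrative usage sketch, not part of the patch; requires snownlp and matplotlib.
from collections import Counter

from addition_1 import plot_sentiment_distribution, plot_top_danmakus

# Invented sample danmaku; a real run would use scraped danmaku text.
sample_danmakus = [
    "太精彩了",      # "amazing"
    "加油",          # "go for it"
    "加油",
    "这也太强了吧",  # "that is way too good"
    "裁判离谱",      # "the referee is ridiculous"
    "太精彩了",
]

# Histogram of SnowNLP sentiment scores, one score per danmaku.
plot_sentiment_distribution(sample_danmakus)

# Frequency bar chart; whole danmaku strings are counted here, though a real
# pipeline would likely apply word segmentation (e.g. jieba) first.
danmaku_frequency = dict(Counter(sample_danmakus).most_common(10))
plot_top_danmakus(danmaku_frequency)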