"""Analyse Bilibili danmaku about AI at the 2024 Paris Olympics.

Pipeline: search Bilibili for related videos, collect their danmaku
(bullet comments), count the ones that mention an AI keyword, export the
eight most frequent to Excel, render a word cloud from them and print a
short conclusion.
"""

import collections
import re
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud

# 2.1 Data acquisition
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    ),
}
search_url = (
    'https://search.bilibili.com/all'
    '?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
)
# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as results.
response = requests.get(search_url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

# Keep at most 300 title anchors that actually carry an href. urljoin turns
# protocol-relative ("//host/path") and relative hrefs into absolute URLs
# and leaves already-absolute ones untouched.
video_links = [
    urljoin(search_url, link['href'])
    for link in soup.find_all('a', class_='title', href=True, limit=300)
]


def get_danmaku(url):
    """Return the danmaku (bullet comments) of the video at *url*.

    Placeholder: the retrieval logic is not implemented yet, so an empty
    list is always returned.
    """
    # TODO: implement the actual danmaku retrieval for ``url``.
    danmaku_list = []
    return danmaku_list


all_danmaku = []
for link in video_links:
    all_danmaku.extend(get_danmaku(link))

# 2.2 Statistics
ai_keywords = ["AI", "人工智能", "AI 技术"]
# Count each occurrence of a danmaku once if it mentions ANY keyword.
# Incrementing per matching keyword would double count, because a danmaku
# containing "AI 技术" always contains "AI" as well.
danmaku_count = collections.Counter(
    danmaku
    for danmaku in all_danmaku
    if any(keyword in danmaku for keyword in ai_keywords)
)

data = {
    '弹幕内容': list(danmaku_count.keys()),
    '数量': list(danmaku_count.values()),
}
df = pd.DataFrame(data)
sorted_df = df.sort_values(by='数量', ascending=False)
top_8_df = sorted_df.head(8)

excel_filename = 'ai_danmaku_result.xlsx'
top_8_df.to_excel(excel_filename, index=False)

# 2.3 Visualisation
text = " ".join(top_8_df['弹幕内容'])

# WordCloud.generate() raises ValueError when given no words, so the word
# cloud is only drawn when at least one matching danmaku was found.
if text.strip():
    background_image = np.array(Image.open('background.jpg'))
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        mask=background_image,
        font_path='your_font.ttf',
    ).generate(text)
    # Recolour the words using the colours of the background image.
    image_colors = ImageColorGenerator(background_image)
    plt.figure(figsize=(10, 5))
    plt.imshow(
        wordcloud.recolor(color_func=image_colors),
        interpolation='bilinear',
    )
    plt.axis('off')
    # Save before show(): the figure may be cleared once the window closes.
    plt.savefig('wordcloud.png')
    plt.show()

# 2.4 Conclusion
# Derive the mainstream views from the statistics and the word cloud.
if not top_8_df.empty:
    common_comments = top_8_df['弹幕内容'].tolist()
    print(f"当前 B 站用户对于 2024 巴黎奥运会应用 AI 技术的主流看法可能包括:{common_comments}等方面的关注和讨论。")
else:
    print("未发现明显的主流看法。")