You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

68 lines
2.4 KiB

import collections
import re
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud
# 2.1 Data acquisition
# Browser-like User-Agent header sent with the search request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
# Bilibili search page; the keyword is the URL-encoded form of "2024巴黎奥运会".
search_url = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(search_url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page as results.
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

# Collect video URLs from the first 300 anchors carrying the "title" class.
video_links = []
for link in soup.find_all('a', class_='title')[:300]:
    href = link.get('href')
    if not href:
        # An anchor without a target cannot be followed; skip it.
        continue
    # urljoin resolves protocol-relative ("//host/path"), absolute and
    # relative hrefs against the search URL, so an already-absolute link is
    # not corrupted by a second "https:" prefix.
    video_links.append(urljoin(search_url, href))
def get_danmaku(url):
    """Return the danmaku (bullet comments) for the video at *url*.

    Placeholder: the retrieval logic has not been written yet, so *url* is
    not used and every call returns a new empty list.
    """
    # Replace this with real retrieval logic as the situation requires.
    return []
# Flatten the danmaku of every collected video into one list, preserving
# the order of the videos and of the comments within each video.
all_danmaku = [
    comment
    for video_url in video_links
    for comment in get_danmaku(video_url)
]
# 2.2 Statistics
ai_keywords = ["AI", "人工智能", "AI 技术"]
# Count how often each AI-related danmaku text occurs. A danmaku is counted
# once per occurrence: testing the keywords with any() avoids inflating the
# count when one danmaku matches several keywords (for example, any text
# containing "AI 技术" also contains "AI").
danmaku_count = collections.Counter(
    danmaku
    for danmaku in all_danmaku
    if any(keyword in danmaku for keyword in ai_keywords)
)
data = {'弹幕内容': list(danmaku_count.keys()), '数量': list(danmaku_count.values())}
df = pd.DataFrame(data)
# A stable sort makes the order of equally frequent danmaku deterministic,
# so the top 8 does not vary between runs on the same data.
sorted_df = df.sort_values(by='数量', ascending=False, kind='mergesort')
top_8_df = sorted_df.head(8)
# Export the eight most frequent AI-related danmaku to an Excel workbook.
excel_filename = 'ai_danmaku_result.xlsx'
top_8_df.to_excel(excel_filename, index=False)
# 2.3 Visualization
text = " ".join(top_8_df['弹幕内容'])
if text.strip():
    # Close the image file as soon as its pixels are copied into the array.
    with Image.open('background.jpg') as background_file:
        background_image = np.array(background_file)
    # NOTE(review): per the wordcloud documentation, width/height are ignored
    # when a mask is supplied; they are kept here to match the original call.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        mask=background_image,
        font_path='your_font.ttf',
    ).generate(text)
    # Colour the words from the background image's own pixels.
    image_colors = ImageColorGenerator(background_image)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis('off')
    # Save before show(): showing may leave the current figure empty.
    plt.savefig('wordcloud.png')
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words, which
    # would abort the script before the conclusion section can report that
    # nothing was found. Skip the drawing instead.
    print("没有匹配的弹幕,跳过词云图生成。")
# 2.4 Conclusions
# Summarise the mainstream views from the statistics and the word cloud.
if top_8_df is None or len(top_8_df) == 0:
    # Nothing matched the AI keywords, so there is no view to report.
    print("未发现明显的主流看法。")
else:
    common_comments = top_8_df['弹幕内容'].tolist()
    print(f"当前 B 站用户对于 2024 巴黎奥运会应用 AI 技术的主流看法可能包括:{common_comments}等方面的关注和讨论。")