# NOTE: the following lines are residue from the code-hosting web page, not part of the script.
# You cannot select more than 25 topics.
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 68 lines, 2.4 KiB
import collections
import re
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import ImageColorGenerator, WordCloud

# 2.1 Data acquisition
# Send a desktop-Chrome User-Agent with the search request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
}
# Bilibili search page for the URL-encoded keyword "2024巴黎奥运会".
search_url = 'https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'
# Bound the wait so a stalled connection cannot hang the script, and fail
# loudly on an HTTP error instead of parsing an error page.
response = requests.get(search_url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

# Collect the targets of at most 300 <a class="title"> anchors.
video_links = []
for link in soup.find_all('a', class_='title')[:300]:
    href = link.get('href')
    if not href:
        # Anchor without a usable target; nothing to fetch.
        continue
    # urljoin resolves protocol-relative ("//host/path"), absolute and
    # relative hrefs against the search URL, so "//..." still becomes
    # "https://..." while already-absolute links are left intact.
    video_links.append(urljoin(search_url, href))


def get_danmaku(url: str) -> list:
    """Return the danmaku (bullet comments) for the video at *url*.

    This is a placeholder: no retrieval logic is implemented yet, so *url*
    is not used and the result is always a new, empty list.

    Args:
        url: Address of the video page.

    Returns:
        A list of danmaku; currently always empty.
    """
    danmaku_list = []
    # TODO: implement the actual danmaku retrieval for `url`.
    return danmaku_list


# Gather the danmaku of every collected video into one flat list.
all_danmaku = []
for link in video_links:
    danmaku = get_danmaku(link)
    all_danmaku.extend(danmaku)

# 2.2 Data statistics
# NOTE: any text containing "AI 技术" also contains "AI", so the third
# keyword never matches anything the first one does not.
ai_keywords = ["AI", "人工智能", "AI 技术"]
danmaku_count = collections.defaultdict(int)
for danmaku in all_danmaku:
    # Count each danmaku occurrence once, no matter how many keywords it
    # contains; counting per keyword would inflate overlapping matches.
    if any(keyword in danmaku for keyword in ai_keywords):
        danmaku_count[danmaku] += 1

# One row per distinct matching danmaku, with its occurrence count.
data = {'弹幕内容': list(danmaku_count.keys()), '数量': list(danmaku_count.values())}
df = pd.DataFrame(data)
sorted_df = df.sort_values(by='数量', ascending=False)
top_8_df = sorted_df.head(8)

# Export the eight most frequent matching danmaku to a spreadsheet.
excel_filename = 'ai_danmaku_result.xlsx'
top_8_df.to_excel(excel_filename, index=False)

# 2.3 Data visualization
text = " ".join(top_8_df['弹幕内容'])

if text.strip():
    # Close the image file as soon as its pixels are copied into the array.
    with Image.open('background.jpg') as mask_image:
        background_image = np.array(mask_image)

    # NOTE: when a mask is supplied, WordCloud takes the canvas size from the
    # mask, so width/height only matter if the mask is removed.
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        mask=background_image,
        font_path='your_font.ttf',
    ).generate(text)
    # Colour each word from the corresponding region of the background image.
    image_colors = ImageColorGenerator(background_image)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
    plt.axis('off')
    # Save before show(): the figure may be cleared once the window closes.
    plt.savefig('wordcloud.png')
    plt.show()
else:
    # WordCloud.generate() raises ValueError when there are no words to draw,
    # so skip the plot instead of crashing when nothing matched.
    print("没有可用于生成词云的弹幕,已跳过词云绘制。")

# 2.4 Conclusions
# Summarise the mainstream views suggested by the statistics and the word cloud.
if top_8_df is not None and not top_8_df.empty:
    common_comments = top_8_df['弹幕内容'].tolist()
    print(f"当前 B 站用户对于 2024 巴黎奥运会应用 AI 技术的主流看法可能包括:{common_comments}等方面的关注和讨论。")
else:
    # No danmaku matched any AI keyword, so there is nothing to summarise.
    print("未发现明显的主流看法。")