You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
86 lines
3.4 KiB
86 lines
3.4 KiB
import bilibili_spider
|
|
import matplotlib.pyplot as plt
|
|
from wordcloud import WordCloud
|
|
import jieba
|
|
import pandas
|
|
from openpyxl import Workbook
|
|
|
|
def contains_keywords(text, keywords):
    """Return True if any jieba token of ``text`` equals one of ``keywords``.

    The text is segmented with ``jieba.cut`` and every resulting token is
    compared for exact, case-sensitive equality with the keywords. A keyword
    therefore only matches when the tokenizer emits it as a whole token.

    Args:
        text: The string to segment and search.
        keywords: Iterable of hashable keyword strings to look for.

    Returns:
        True if at least one token is equal to a keyword, otherwise False.
    """
    # Build the lookup set once so each token costs one hash lookup instead of
    # a scan over the whole keyword list; reuse the argument if it is a set.
    if isinstance(keywords, (set, frozenset)):
        keyword_set = keywords
    else:
        keyword_set = set(keywords)
    # Iterate the tokenizer output directly (no intermediate list); any()
    # stops at the first matching token.
    return any(word in keyword_set for word in jieba.cut(text))
|
|
|
|
# Keywords used to decide whether a danmaku is AI-related. Matching is done
# by exact token equality, which is why both "ai" and "AI" are listed.
ai_keywords = [
    "机器学习", "深度学习", "自然语言处理", "计算机视觉", "图像识别",
    "语音识别", "强化学习", "生成对抗网络", "智能推荐系统", "数据挖掘",
    "模式识别", "智能机器人", "自动驾驶", "预测分析", "数据清洗",
    "异常检测", "知识图谱", "人工智能伦理", "智能合约", "虚拟助手",
    "语义分析", "图像生成", "文本生成", "情感分析", "决策支持系统",
    "人脸识别", "智能搜索", "自然语言生成", "人工神经网络", "模型优化",
    "智能监控", "医疗影像分析", "自动化", "智能制造", "虚拟现实",
    "增强现实", "智能家居", "边缘计算", "云计算", "数据隐私",
    "算法公平性", "知识推理", "智能交通", "聊天机器人", "自动化客服",
    "智能推荐引擎", "生物识别", "机器人过程自动化", "多模态学习", "量子计算",
    "自适应系统", "算法优化", "智能数据分析", "虚拟角色", "环境感知",
    "ai", "AI", "人工智能",
]
|
|
|
|
def list_to_dict(list):
    """Count how many times each element occurs in ``list``.

    Args:
        list: Iterable of hashable items. The name shadows the builtin but is
            kept so that existing keyword-argument callers keep working.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in order of first appearance.
    """
    from collections import Counter

    # Pass an iterator rather than the object itself: Counter would treat a
    # mapping argument as ready-made counts instead of counting its keys.
    return dict(Counter(iter(list)))
|
|
|
|
|
|
def main(query='2024巴黎奥运会', number=300):
    """Collect danmaku for a search query and report the AI-related ones.

    Fetches danmaku through ``bilibili_spider.get_danmu``, keeps those that
    contain an entry of ``ai_keywords``, prints the eight most frequent of
    them, writes both frequency tables to ``output.xlsx`` and saves a word
    cloud of the AI-related danmaku to ``wordcloud.png``.

    Args:
        query: Search term forwarded to ``bilibili_spider.get_danmu``.
        number: Forwarded as ``number`` to ``bilibili_spider.get_danmu``
            (presumably how many videos to scan; confirm against the spider).
    """

    def ranked_counts(danmus):
        # (text, count) pairs, most frequent first; the sort is stable, so
        # ties keep their order of first appearance.
        return sorted(list_to_dict(danmus).items(), key=lambda item: item[1], reverse=True)

    # Fetch the danmaku list.
    danmu_list = bilibili_spider.get_danmu(query=query, number=number, display_progress=True)

    # Keep only the danmaku that contain an AI keyword.
    ai_danmu_list = [danmu for danmu in danmu_list if contains_keywords(danmu, ai_keywords)]
    ai_danmu_ranked = ranked_counts(ai_danmu_list)

    # Print the eight most frequent AI-related danmaku.
    for danmu, count in ai_danmu_ranked[:8]:
        print(f"{danmu} : 出现{count}次数")

    danmu_ranked = ranked_counts(danmu_list)

    # Start from a fresh workbook: ExcelWriter's append mode needs an existing
    # file, and this also discards sheets left over from a previous run.
    Workbook().save('output.xlsx')
    # Write both sheets through one writer so the workbook is loaded and
    # saved only once.
    with pandas.ExcelWriter('output.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        pandas.DataFrame(danmu_ranked).to_excel(writer, sheet_name='所有弹幕', index=False)
        pandas.DataFrame(ai_danmu_ranked).to_excel(writer, sheet_name='ai弹幕', index=False)

    # WordCloud cannot render an empty text, so skip the image when nothing
    # matched instead of failing after the Excel file has been written.
    if not ai_danmu_list:
        print("未找到包含AI关键词的弹幕，跳过词云生成")
        return

    # Raw string: the backslashes are path separators, not escape sequences.
    # NOTE(review): this font path only exists on Windows.
    font_path = r"C:\Windows\Fonts\SimHei.ttf"
    wordcloud = WordCloud(font_path=font_path, width=800, height=400, background_color='white').generate(' '.join(ai_danmu_list))

    figure = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    # Save as a PNG file, then release the figure.
    plt.savefig('wordcloud.png', format='png')
    plt.close(figure)
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|