import json
import random
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from openpyxl import Workbook
from wordcloud import WordCloud
# Configure matplotlib with CJK-capable fonts so Chinese labels/titles render
# instead of showing missing-glyph boxes. Tried in order; first installed wins.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
|
|
|
|
|
|
|
|
|
|
class BilibiliDanmakuSpider:
    """Search Bilibili videos by keyword and scrape their danmaku (bullet comments).

    Usage: ``spider.run()`` returns the accumulated list of danmaku strings.
    All network failures are logged and skipped so a single bad video/page
    cannot abort the whole crawl.
    """

    # Seconds before a hung HTTP connection is abandoned (bug fix: the
    # original requests.get calls had no timeout and could block forever).
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like headers; the Referer is expected by Bilibili endpoints.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # Every danmaku text collected across all crawled videos.
        self.danmaku_list = []

    def get_video_ids(self, keyword, page_count=36):
        """Collect video aids for *keyword* from the search API.

        Each result page holds ~10 videos, so the default 36 pages yield at
        most ~360 ids.  Returns a deduplicated list capped at 360 entries;
        first-seen order is preserved (dict.fromkeys instead of set, which
        would return a nondeterministic ordering).
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                data = response.json()

                if data["code"] == 0 and data["data"]["result"]:
                    for item in data["data"]["result"]:
                        video_ids.append(item["aid"])

                # Random delay between pages to avoid anti-scraping blocks.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"获取第{page}页视频ID失败: {e}")

        return list(dict.fromkeys(video_ids))[:360]  # dedupe, cap at 360

    def get_danmakus(self, aid):
        """Fetch all danmakus of one video into ``self.danmaku_list``.

        Returns True on success, False on any failure (failure is logged,
        not raised, so the caller's loop keeps going).
        """
        try:
            # Step 1: resolve the video's cid, which keys the danmaku endpoint.
            url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            cid = response.json()["data"]["cid"]

            # Step 2: download the raw danmaku XML for that cid.
            danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(danmaku_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.encoding = "utf-8"

            # Each danmaku is the text of a <d ...>text</d> element.
            danmakus = re.findall(r'<d.*?>(.*?)</d>', response.text)
            self.danmaku_list.extend(danmakus)

            print(f"成功获取视频{aid}的{len(danmakus)}条弹幕")
            time.sleep(random.uniform(0.5, 1.5))
            return True
        except Exception as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False

    def run(self, keywords=("大语言模型", "大模型", "LLM")):
        """Crawl danmakus for every keyword and return the combined list.

        *keywords* may be any iterable of search strings.  The default is a
        tuple rather than a list to avoid the shared mutable-default pitfall.
        """
        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))

        # Deduplicate across keywords and cap the total at 360 videos.
        unique_video_ids = list(dict.fromkeys(all_video_ids))[:360]
        print(f"共获取{len(unique_video_ids)}个视频ID,开始爬取弹幕...")

        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)

        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list
|
|
|
|
|
|
|
|
|
|
class DanmakuAnalyzer:
    """Statistics, word-cloud and opinion analysis over a list of danmakus."""

    def __init__(self, danmakus):
        # Normalize: strip whitespace and drop empty entries up front.
        self.danmakus = [d.strip() for d in danmakus if d.strip()]
        self.stopwords = self.load_stopwords()

    def load_stopwords(self):
        """Load stopwords from stopwords.txt, falling back to a built-in set.

        Bug fix: the original used a bare ``except:``, which would also
        swallow KeyboardInterrupt/SystemExit; only file errors are expected.
        """
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                return {line.strip() for line in f}
        except OSError:
            # Default minimal Chinese stopword set.
            return set(["的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"])

    def count_danmakus(self):
        """Count danmaku frequencies.

        Returns ``(all_counter, ai_counter)``: a Counter over every danmaku,
        and one over the subset containing at least one AI-related keyword.
        """
        # Keywords that mark a danmaku as relevant to AI technology/usage.
        ai_related_words = ["AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据", "应用", "成本", "效率", "伦理", "隐私", "安全", "未来", "发展", "编程", "创作", "教育"]
        ai_danmakus = [d for d in self.danmakus if any(word in d for word in ai_related_words)]

        all_counter = Counter(self.danmakus)
        ai_counter = Counter(ai_danmakus)
        return all_counter, ai_counter

    def save_to_excel(self, ai_counter, filename="弹幕统计.xlsx"):
        """Write the AI-related counts to *filename*; returns the DataFrame."""
        df = pd.DataFrame(ai_counter.most_common(), columns=["弹幕内容", "出现次数"])
        df.to_excel(filename, index=False)
        # Bug fix: the f-string had no placeholder, so the destination file
        # name was never actually printed.
        print(f"统计结果已保存到{filename}")
        return df

    def generate_wordcloud(self, filename="弹幕词云.png"):
        """Segment all danmakus with jieba and render a word cloud to *filename*."""
        all_text = " ".join(self.danmakus)
        words = jieba.cut(all_text)
        # Drop stopwords and single-character tokens (mostly noise).
        words = [word for word in words if word not in self.stopwords and len(word) > 1]
        text = " ".join(words)

        wc = WordCloud(
            font_path="simhei.ttf",  # NOTE(review): path must point at a local CJK font file
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated bigram artifacts
        ).generate(text)

        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(filename, dpi=300)
        plt.show()
        # Bug fix: placeholderless f-string — print the real file name.
        print(f"词云图已保存到{filename}")

    def get_main_views(self, ai_counter):
        """Bucket AI-related danmakus into broad opinion categories.

        Each distinct danmaku goes to the FIRST category whose keyword list
        matches — cost, then application, then negative, then positive —
        falling through to "其他观点"; counts are weighted by occurrence.
        (Same first-match semantics as the original four copy-pasted loops,
        collapsed into one rule table.)
        """
        cost_related = ["成本", "价格", "费用", "免费", "付费"]
        application_related = ["应用", "使用", "场景", "领域", "行业", "教育", "医疗", "工作", "创作", "编程"]
        negative_related = ["风险", "危险", "伦理", "隐私", "安全", "失业", "替代", "问题"]
        positive_related = ["方便", "高效", "厉害", "强大", "有用", "帮助", "进步"]

        # (keyword list, category) pairs in matching-priority order.
        rules = [
            (cost_related, "应用成本"),
            (application_related, "应用领域"),
            (negative_related, "不利影响"),
            (positive_related, "积极影响"),
        ]

        categories = {
            "应用成本": 0,
            "应用领域": 0,
            "不利影响": 0,
            "积极影响": 0,
            "其他观点": 0
        }

        for danmaku, count in ai_counter.items():
            for words, category in rules:
                if any(word in danmaku for word in words):
                    categories[category] += count
                    break
            else:
                # No rule matched: miscellaneous opinion.
                categories["其他观点"] += count

        return categories
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Crawl danmakus for the default keyword set.
    spider = BilibiliDanmakuSpider()
    danmakus = spider.run()

    # Count all danmakus and the AI-related subset.
    analyzer = DanmakuAnalyzer(danmakus)
    all_counter, ai_counter = analyzer.count_danmakus()

    # Print the top-8 AI-related danmakus by frequency.
    print("\nAI技术应用相关弹幕数量排名前8:")
    top8 = ai_counter.most_common(8)
    for i, (danmaku, count) in enumerate(top8, 1):
        print(f"{i}. {danmaku}: {count}次")

    # Persist the counts to an Excel workbook.
    df = analyzer.save_to_excel(ai_counter)

    # Render the word cloud image.
    analyzer.generate_wordcloud()

    # Categorize and report the main viewpoints.
    main_views = analyzer.get_main_views(ai_counter)
    print("\nB站用户对大语言模型技术的主流看法统计:")
    for view, count in main_views.items():
        print(f"{view}: {count}条相关弹幕")

    # Bar chart of the viewpoint distribution, with counts labeled above bars.
    plt.figure(figsize=(10, 6))
    plt.bar(main_views.keys(), main_views.values(), color=['#4CAF50', '#2196F3', '#f44336', '#FFC107', '#9E9E9E'])
    plt.title('用户对大语言模型的主要关注点分布')
    plt.ylabel('弹幕数量')
    plt.xticks(rotation=30)
    for i, v in enumerate(main_views.values()):
        plt.text(i, v + 5, str(v), ha='center')
    plt.tight_layout()
    plt.savefig('用户观点分布.png', dpi=300)
    plt.show()
|