ADD file via upload

main
fzu102301136 4 months ago
parent 0cd18e36a6
commit ac2d3f53ff

@ -0,0 +1,242 @@
import requests
import re
import json
import pandas as pd
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import time
import random
from openpyxl import Workbook
# Configure Matplotlib with CJK-capable fonts (tried in order) so the Chinese
# titles/labels in the word cloud and bar chart render instead of showing boxes.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
class BilibiliDanmakuSpider:
    """Crawl danmaku (bullet-comment) text for Bilibili videos found via keyword search.

    Workflow: query the web search API for video aids, resolve each aid to its
    cid, then download and parse the XML danmaku feed for that cid.
    """

    def __init__(self):
        # Browser-like headers; Bilibili rejects requests without UA/Referer.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": "https://www.bilibili.com/"
        }
        # Accumulates every danmaku string fetched across all videos.
        self.danmaku_list = []

    def get_video_ids(self, keyword, page_count=36):
        """Search `keyword` and return up to 360 unique video aids.

        Each result page is fetched independently; a failure on one page is
        logged and skipped so a single bad response does not abort the crawl.
        """
        video_ids = []
        for page in range(1, page_count + 1):
            try:
                url = f"https://api.bilibili.com/x/web-interface/search/type?keyword={keyword}&search_type=video&page={page}"
                # timeout prevents the crawl from hanging on a stalled socket
                response = requests.get(url, headers=self.headers, timeout=10)
                data = response.json()
                if data["code"] == 0 and data["data"]["result"]:
                    for item in data["data"]["result"]:
                        video_ids.append(item["aid"])
                # Random delay to avoid triggering anti-crawler throttling.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"获取第{page}页视频ID失败: {e}")
        # dict.fromkeys dedupes while preserving discovery order; the original
        # list(set(...))[:360] kept an arbitrary, non-deterministic subset.
        return list(dict.fromkeys(video_ids))[:360]

    def get_danmakus(self, aid):
        """Fetch all danmakus of one video `aid` into self.danmaku_list.

        Returns True on success, False on any failure (logged, not raised).
        """
        try:
            # Resolve aid -> cid (the id of the video's comment stream).
            url = f"https://api.bilibili.com/x/web-interface/view?aid={aid}"
            response = requests.get(url, headers=self.headers, timeout=10)
            cid = response.json()["data"]["cid"]
            # The danmaku feed is a plain XML document keyed by cid.
            danmaku_url = f"https://comment.bilibili.com/{cid}.xml"
            response = requests.get(danmaku_url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            # Each <d ...>text</d> element carries one danmaku's text.
            danmakus = re.findall(r'<d.*?>(.*?)</d>', response.text)
            self.danmaku_list.extend(danmakus)
            print(f"成功获取视频{aid}{len(danmakus)}条弹幕")
            time.sleep(random.uniform(0.5, 1.5))
            return True
        except Exception as e:
            print(f"获取视频{aid}弹幕失败: {e}")
            return False

    def run(self, keywords=None):
        """Crawl danmakus for every keyword and return the combined list.

        `keywords` defaults to LLM-related search terms. A None sentinel
        replaces the original mutable default list, which would have been
        shared across calls.
        """
        if keywords is None:
            keywords = ["大语言模型", "大模型", "LLM"]
        all_video_ids = []
        for keyword in keywords:
            print(f"搜索关键词: {keyword}")
            all_video_ids.extend(self.get_video_ids(keyword))
        # Dedupe across keywords deterministically, capped at 360 videos.
        unique_video_ids = list(dict.fromkeys(all_video_ids))[:360]
        print(f"共获取{len(unique_video_ids)}个视频ID开始爬取弹幕...")
        for idx, aid in enumerate(unique_video_ids, 1):
            print(f"正在爬取第{idx}/{len(unique_video_ids)}个视频")
            self.get_danmakus(aid)
        print(f"爬取完成,共获取{len(self.danmaku_list)}条弹幕")
        return self.danmaku_list
class DanmakuAnalyzer:
    """Count, categorise and visualise a list of danmaku strings."""

    def __init__(self, danmakus):
        # Drop empty / whitespace-only entries up front.
        self.danmakus = [d.strip() for d in danmakus if d.strip()]
        self.stopwords = self.load_stopwords()

    def load_stopwords(self):
        """Load stopwords from stopwords.txt, falling back to a built-in set.

        Catches OSError only (file missing/unreadable) instead of the original
        bare `except:`, which would also have swallowed KeyboardInterrupt etc.
        """
        try:
            with open("stopwords.txt", "r", encoding="utf-8") as f:
                return {line.strip() for line in f}
        except OSError:
            # NOTE(review): the uploaded original's literal set was mangled to
            # mostly empty strings; this restores a plausible default of
            # common Chinese function words (keeping the survivors 一个/没有/自己).
            return {"的", "了", "是", "我", "你", "他", "她", "它", "这", "那",
                    "在", "有", "和", "就", "不", "人", "都", "也", "吗", "啊",
                    "吧", "很", "会", "要", "说", "到", "什么", "一个", "没有",
                    "自己", ""}

    def count_danmakus(self):
        """Return (Counter of all danmakus, Counter of AI-related danmakus)."""
        # Keep only danmakus mentioning at least one AI/LLM-related keyword.
        ai_related_words = ["AI", "人工智能", "大模型", "语言模型", "LLM", "训练", "算法", "数据", "应用", "成本", "效率", "伦理", "隐私", "安全", "未来", "发展", "编程", "创作", "教育"]
        ai_danmakus = [d for d in self.danmakus if any(word in d for word in ai_related_words)]
        all_counter = Counter(self.danmakus)
        ai_counter = Counter(ai_danmakus)
        return all_counter, ai_counter

    def save_to_excel(self, ai_counter, filename="弹幕统计.xlsx"):
        """Write the AI-related danmaku frequency table to an Excel file."""
        df = pd.DataFrame(ai_counter.most_common(), columns=["弹幕内容", "出现次数"])
        df.to_excel(filename, index=False)
        # Fixed: the f-string had lost its {filename} placeholder.
        print(f"统计结果已保存到{filename}")
        return df

    def generate_wordcloud(self, filename="弹幕词云.png"):
        """Segment all danmakus with jieba and render/save a word cloud."""
        all_text = " ".join(self.danmakus)
        # Drop stopwords and single-character tokens, which are mostly noise.
        words = [word for word in jieba.cut(all_text)
                 if word not in self.stopwords and len(word) > 1]
        text = " ".join(words)
        wc = WordCloud(
            font_path="simhei.ttf",  # path to a Chinese font; adjust per machine
            background_color="white",
            width=1200,
            height=800,
            max_words=200,
            collocations=False  # avoid duplicated two-word pairs in the cloud
        ).generate(text)
        plt.figure(figsize=(12, 8))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(filename, dpi=300)
        plt.show()
        # Fixed: the f-string had lost its {filename} placeholder.
        print(f"词云图已保存到{filename}")

    def get_main_views(self, ai_counter):
        """Bucket AI-related danmakus into opinion categories by keyword.

        Categories are checked in a fixed priority order; a danmaku counts
        toward the first category whose keyword list matches, otherwise
        toward "其他观点". Same semantics as the original four copy-pasted
        loops, expressed as one ordered table.
        """
        category_keywords = [
            ("应用成本", ["成本", "价格", "费用", "免费", "付费"]),
            ("应用领域", ["应用", "使用", "场景", "领域", "行业", "教育", "医疗", "工作", "创作", "编程"]),
            ("不利影响", ["风险", "危险", "伦理", "隐私", "安全", "失业", "替代", "问题"]),
            ("积极影响", ["方便", "高效", "厉害", "强大", "有用", "帮助", "进步"]),
        ]
        categories = {name: 0 for name, _ in category_keywords}
        categories["其他观点"] = 0
        for danmaku, count in ai_counter.items():
            for name, words in category_keywords:
                if any(word in danmaku for word in words):
                    categories[name] += count
                    break
            else:
                # No category matched: fall through to the catch-all bucket.
                categories["其他观点"] += count
        return categories
if __name__ == "__main__":
    # Step 1: crawl danmakus from Bilibili search results.
    crawler = BilibiliDanmakuSpider()
    collected = crawler.run()

    # Step 2: build frequency counters (overall and AI-related).
    analyzer = DanmakuAnalyzer(collected)
    all_counter, ai_counter = analyzer.count_danmakus()

    # Step 3: report the eight most frequent AI-related danmakus.
    print("\nAI技术应用相关弹幕数量排名前8:")
    for rank, (text, freq) in enumerate(ai_counter.most_common(8), 1):
        print(f"{rank}. {text}: {freq}")

    # Step 4: persist the frequency table and render the word cloud.
    df = analyzer.save_to_excel(ai_counter)
    analyzer.generate_wordcloud()

    # Step 5: categorise opinions and print the distribution.
    main_views = analyzer.get_main_views(ai_counter)
    print("\nB站用户对大语言模型技术的主流看法统计:")
    for view, count in main_views.items():
        print(f"{view}: {count}条相关弹幕")

    # Step 6: bar chart of the opinion distribution, annotated with counts.
    plt.figure(figsize=(10, 6))
    plt.bar(main_views.keys(), main_views.values(),
            color=['#4CAF50', '#2196F3', '#f44336', '#FFC107', '#9E9E9E'])
    plt.title('用户对大语言模型的主要关注点分布')
    plt.ylabel('弹幕数量')
    plt.xticks(rotation=30)
    for pos, val in enumerate(main_views.values()):
        plt.text(pos, val + 5, str(val), ha='center')
    plt.tight_layout()
    plt.savefig('用户观点分布.png', dpi=300)
    plt.show()
Loading…
Cancel
Save