|
|
import requests
|
|
|
import json
|
|
|
import re
|
|
|
import openpyxl
|
|
|
from collections import Counter
|
|
|
import matplotlib.pyplot as plt
|
|
|
from wordcloud import WordCloud, STOPWORDS
|
|
|
from PIL import Image
|
|
|
import numpy as np
|
|
|
import logging
|
|
|
import jieba
|
|
|
import time
|
|
|
import random
|
|
|
import os
|
|
|
import platform
|
|
|
import matplotlib.font_manager as fm
|
|
|
|
|
|
# Set the working directory to the script's own directory so all relative
# file output (Excel sheet, word-cloud image) lands next to the script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

print(f"工作目录已设置为: {os.getcwd()}")

# Accumulates every danmaku (bullet-comment) string scraped across all videos.
danmu_list = []

# Bilibili search API: 50 video results per page for the URL-encoded keyword
# "大语言模型" (large language model); page number is appended later.
url = "https://api.bilibili.com/x/web-interface/wbi/search/type?page_size=50&keyword=%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B&search_type=video"

# Browser-like request headers; the Referer is sent with every API call.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15",
    "Referer": "https://www.bilibili.com/"
}

# NOTE(review): these cookies embed personal login credentials (SESSDATA,
# bili_jct, DedeUserID). They expire and should not live in source control —
# consider loading them from an environment variable or ignored config file.
cookies = {
    "buvid4": "E1155D85-CACE-0757-3AB5-FAA55B9BA4CE85795-022082117-PbDzNyYRQFSIZY4dZRE2fg%3D%3D",
    "theme-tip-show": "SHOWED",
    "browser_resolution": "1329-262",
    "home_feed_column": "4",
    "b_lsid": "62213F55_19A436DA31C",
    "CURRENT_FNVAL": "4048",
    "sid": "em2kxc9a",
    "bp_t_offset_114343848": "1130356626901958656",
    "bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjIyNzkwNDAsImlhdCI6MTc2MjAxOTc4MCwicGx0IjotMX0.Yez-DzUP_yfD_omjBoKCpT_PZEZRunGBW_Em9KagqWk",
    "bili_ticket_expires": "1762278980",
    "enable_web_push": "DISABLE",
    "theme-avatar-tip-show": "SHOWED",
    "DedeUserID": "114343848",
    "DedeUserID__ckMd5": "a1ad3ab2cc29d68f",
    "SESSDATA": "1dc9a0f1%2C1771906511%2C3c84d%2A81CjC9x3-vzqR5SfpsKCUQ-HpeBVibiRaFbXisrTSWYmTkF0hJrYVG3iu7_ZcclIW6ZVQSVjVLU0tIekMyNXNVZVVUNGlYQ2pkaEJnTC02eEdlWFl4NjNGa3U5a1YtdnNmNWVlRU9VR21uTnpwY0VlVlQtcVZXN2pTbUtDaHBuVnRoUWlyWkU4M3l3IIEC",
    "bili_jct": "1b4c0293f347e832f38b30180cc959eb",
    "_uuid": "4E75EE8A-3B34-4C2D-5591-C5E6FE654BF858202infoc",
    "b_nut": "1756354457",
    "buvid3": "46F5C470-8FAD-264F-4587-E064C939733057781infoc",
    "enable_feed_channel": "DISABLE",
    "header_theme_version": "CLOSE",
    "CURRENT_QUALITY": "80",
    "buvid_fp": "206bf26677599ed3a4a4915740f56e57",
    "rpdid": "0zbfAGEhWy|OtunE0Mn|182|3w1OW2y6",
    "LIVE_BUVID": "AUTO5916605750922671"
}
|
|
|
|
|
|
def get_absolute_path(filename):
    """Return the absolute path of *filename* inside the script's directory."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, filename)
|
|
|
|
|
|
def get_font_path():
    """Locate a CJK-capable font file for the current operating system.

    Probes a per-platform list of well-known font paths and returns the
    first one that exists on disk. Returns ``None`` when nothing is found,
    in which case the word cloud falls back to its default font and Chinese
    glyphs may render as boxes.

    The original implementation repeated the same probe loop three times
    (macOS / Windows / Linux); the candidates are now data-driven so one
    loop serves all platforms with identical output and ordering.
    """
    print("正在查找可用字体...")

    # Candidate font files per platform, ordered by preference.
    candidates = {
        "Darwin": [
            "/System/Library/Fonts/PingFang.ttc",         # PingFang
            "/System/Library/Fonts/STHeiti Medium.ttc",   # STHeiti (some macOS versions)
            "/Library/Fonts/Arial Unicode.ttf",           # Arial Unicode fallback
        ],
        "Windows": [
            "C:/Windows/Fonts/msyh.ttc",      # Microsoft YaHei
            "C:/Windows/Fonts/simhei.ttf",    # SimHei
            "C:/Windows/Fonts/simsun.ttc",    # SimSun
            "C:/Windows/Fonts/arialuni.ttf",  # Arial Unicode fallback
        ],
        "Linux": [
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",   # WenQuanYi Micro Hei
            "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",     # WenQuanYi Zen Hei
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # fallback (may lack CJK)
        ],
    }

    # Unknown platforms get an empty list and fall through to the warning.
    for font_path in candidates.get(platform.system(), []):
        if os.path.exists(font_path):
            print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
            return font_path

    print("⚠ 未找到特定中文字体,使用系统默认字体(中文可能显示为方框)")
    return None
|
|
|
|
|
|
def load_mask_image():
    """Load an optional word-cloud shape (mask) image from the script directory.

    Tries a handful of conventional file names and returns the first image
    that loads successfully as a numpy array, or ``None`` when no usable
    shape image is present (the word cloud then stays rectangular).
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # Conventional mask file names, probed in order.
    candidate_names = [
        "mask.png", "mask.jpg", "shape.png", "cloud.png",
        "heart.png", "star.png", "circle.png", "wordcloud_mask.png",
    ]

    for name in candidate_names:
        path = os.path.join(base_dir, name)
        if not os.path.exists(path):
            continue
        try:
            image_array = np.array(Image.open(path))
        except Exception as e:
            # Unreadable/corrupt image: report and keep probing.
            print(f"加载形状图片 {name} 失败: {e}")
            continue
        print(f"✓ 找到形状图片: {name}")
        return image_array

    print("⚠ 未找到形状图片,将使用矩形词云")
    return None
|
|
|
|
|
|
# Task 1: fetch danmaku (bullet comments) for a single video.
def get_danmu(bvid, headers):
    """Download all danmaku strings for the video identified by *bvid*.

    Resolves the video's cid via the web-interface API, then fetches the
    comment XML and extracts the text of every ``<d>`` element. Returns an
    empty list on any failure (API error, missing cid, network exception).
    """
    try:
        view_api = "https://api.bilibili.com/x/web-interface/view?bvid=" + bvid
        view_resp = requests.get(view_api, headers=headers, timeout=10)
        view_data = json.loads(view_resp.text)

        # Non-zero code means the API rejected the request.
        if view_data.get('code') != 0:
            print(f"获取cid失败: {view_data.get('message')}")
            return []

        # Deleted/region-locked videos come back without a cid.
        if 'data' not in view_data or 'cid' not in view_data['data']:
            print(f"视频 {bvid} 无法获取cid,可能已删除或无法访问")
            return []

        cid = view_data['data']['cid']

        xml_url = "https://comment.bilibili.com/" + str(cid) + ".xml"
        xml_resp = requests.get(xml_url, headers=headers, timeout=10)
        xml_resp.encoding = 'utf-8'  # the XML endpoint is UTF-8
        comments = re.findall('<d p=".*?">(.*?)</d>', xml_resp.text)

        print(f"视频 {bvid} 获取到 {len(comments)} 条弹幕")
        return comments

    except Exception as e:
        # Best-effort scraper: report and move on to the next video.
        print(f"获取弹幕时出错: {e}")
        return []
|
|
|
|
|
|
# Crawl six search-result pages and collect danmaku from every listed video.
print("开始搜索'大语言模型'相关视频...")

# One Session shared by all page requests: reuses the TCP connection via
# keep-alive. (The original created a brand-new session on every iteration,
# defeating connection pooling.)
sess = requests.Session()

for i in range(6):
    try:
        req = sess.get(url + "&page=" + str(i+1), headers=headers, cookies=cookies, timeout=10)
        res = json.loads(req.text)

        print(f"第{i+1}页请求状态码: {req.status_code}")

        # Non-zero code: API-level rejection (rate limit, bad cookie, ...).
        if res.get('code') != 0:
            print(f"请求失败,状态码: {res.get('code')}")
            continue

        if 'data' not in res or 'result' not in res['data']:
            print(f"第{i+1}页未找到视频数据")
            continue

        videos = res['data']['result']
        print(f"第{i+1}页找到 {len(videos)} 个视频")

        video_count = 0
        for item in videos:
            # Search results mix types; only real videos carry a bvid.
            if item.get('type') == 'video' and 'bvid' in item:
                danmu_list.extend(get_danmu(item['bvid'], headers))
                video_count += 1
                time.sleep(1)  # throttle per-video requests

        print(f"第{i+1}页处理了 {video_count} 个视频")

    except Exception as e:
        print(f"处理第{i+1}页时出错: {e}")

    time.sleep(2)  # pause between result pages to stay polite
|
|
|
|
|
|
# Fall back to built-in sample data when scraping yielded nothing, so the
# statistics and word-cloud stages still have input to work with.
if not danmu_list:
    print("未获取到弹幕数据,使用示例数据")
    sample_danmu = [
        "大语言模型", "AI", "人工智能", "深度学习", "机器学习",
        "Transformer", "gpt", "神经网络", "deepseek", "预训练",
        "计算机", "ChatGPT", "LLM", "语言模型", "生成式AI",
        "大模型", "深度学习", "人工智能", "机器学习", "自然语言处理"
    ]
    danmu_list = sample_danmu * 10

print(f"总共获取到 {len(danmu_list)} 条弹幕")
|
|
|
|
|
|
# Task 2: count danmaku frequencies and write the top 20 to an Excel sheet.
print("开始统计弹幕...")
danmu_count = Counter(danmu_list)
top_20_count = danmu_count.most_common(20)

# Build the workbook: header row, then one (text, count) pair per row.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet['A1'] = '弹幕'
sheet['B1'] = '数量'

for row, (danmu, count) in enumerate(top_20_count, start=2):
    sheet.cell(row=row, column=1, value=danmu)
    sheet.cell(row=row, column=2, value=count)

# Save next to the script so the output is easy to find.
excel_path = get_absolute_path('danmu_statistics.xlsx')
workbook.save(excel_path)
print(f"✓ Excel表已保存到: {excel_path}")

# Echo the ranking to the console as well.
print("数量排名前20的弹幕:")
for danmu, count in top_20_count:
    print(f"{danmu}: {count}")
|
|
|
|
|
|
# Task 3: generate the word-cloud image.
print("开始生成词云图...")
# Silence jieba's INFO-level dictionary-loading messages.
jieba.setLogLevel(logging.WARNING)
|
|
|
|
|
|
# Filter danmaku: keep only entries containing real words.
def filter_danmu(danmu):
    """Return *danmu* if it contains at least one CJK character or ASCII
    letter, else ``None``.

    Used as a truthy predicate to drop danmaku made up purely of digits,
    punctuation, or emoji.
    """
    # One combined character class replaces the original's two separate
    # regex scans (CJK Unified Ideographs range + ASCII letters).
    if re.search(r'[\u4e00-\u9fffa-zA-Z]', danmu):
        return danmu
    return None
|
|
|
|
|
|
# Keep only danmaku that contain real words (CJK or Latin letters).
filtered_danmu = list(filter(filter_danmu, danmu_list))
print(f"弹幕已过滤,剩余 {len(filtered_danmu)} 条")

# Flatten the surviving danmaku into one space-separated string for jieba.
text = ' '.join(filtered_danmu)
|
|
|
|
|
|
# 对文本进行中文分词
|
|
|
# Tokenize the combined text with jieba so the word cloud can size
# individual Chinese words rather than whole danmaku.
try:
    tokens = jieba.lcut(text)
    wc_text = ' '.join(tokens)
    print("✓ 中文分词完成")

    # Show the first 20 tokens for debugging.
    print("分词示例:", tokens[:20])
except Exception as e:
    # Segmentation failure: fall back to the unsegmented text.
    print(f"分词时出错: {e}")
    wc_text = text
|
|
|
|
|
|
# 设置停用词 - 修复版:只过滤真正的无意义词
|
|
|
stopwords = set(STOPWORDS)
|
|
|
# 只保留最核心的停用词,避免过滤掉有意义的词
|
|
|
basic_stopwords = [
|
|
|
# 语气词
|
|
|
"啊", "呀", "呢", "吗", "吧", "啦", "哇", "哦", "唉", "嗯",
|
|
|
# 无意义的单个字
|
|
|
"都", "就", "也", "还", "又", "很", "太", "真", "更", "最",
|
|
|
"这", "那", "哪", "啥", "怎", "么",
|
|
|
# 基本虚词
|
|
|
"的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
|
|
|
"我们", "你们", "他们", "这个", "那个", "这些", "那些",
|
|
|
"就是", "可以", "什么", "怎么", "为什么", "因为", "所以",
|
|
|
"但是", "然后", "如果", "虽然", "不过", "其实", "当然"
|
|
|
]
|
|
|
stopwords.update(basic_stopwords)
|
|
|
|
|
|
print(f"停用词数量: {len(stopwords)}")
|
|
|
|
|
|
# 创建词云对象
|
|
|
# Build and render the word cloud. Any failure is caught at the end and
# reported with a full traceback instead of crashing the script.
try:
    # Pick a CJK-capable font; None means the wordcloud default font
    # (Chinese glyphs may then render as boxes).
    font_path = get_font_path()

    # Optional shape mask; None yields a plain rectangular cloud.
    mask_image = load_mask_image()

    # Word-cloud configuration, passed as keyword arguments below.
    wordcloud_config = {
        'width': 1200,
        'height': 800,
        'background_color': 'white',
        'stopwords': stopwords,
        'mask': mask_image,
        'max_words': 100,  # cap word count to highlight important terms
        'colormap': 'viridis',
        'relative_scaling': 0.3,  # weight of raw frequency vs. rank when sizing
        'min_font_size': 8,
        'max_font_size': 120,
        'random_state': 42,  # deterministic layout across runs
        'collocations': False,  # avoid duplicated two-word phrases
    }

    # Only pass font_path when a real font file was found.
    if font_path:
        wordcloud_config['font_path'] = font_path
        print(f"ℹ 使用字体: {font_path}")
    else:
        print("ℹ 使用系统默认字体")

    wc = WordCloud(**wordcloud_config)

    # Sanity check the input text before generating.
    print(f"生成词云的文本长度: {len(wc_text)}")

    # Pad sparse scrape results with sample vocabulary so the cloud still
    # looks reasonable when few real danmaku were collected.
    if len(filtered_danmu) < 50:
        print("弹幕数据较少,混合示例数据")
        example_words = ["大语言模型", "AI", "人工智能", "深度学习", "机器学习",
                         "Transformer", "GPT", "神经网络", "预训练", "ChatGPT",
                         "LLM", "语言模型", "生成式AI", "大模型", "自然语言处理",
                         "计算机", "算法", "数据", "训练", "推理"]
        extra_text = ' '.join(example_words * 3)
        wc_text = wc_text + " " + extra_text

    wc = wc.generate(wc_text)

    # Render with matplotlib for on-screen display.
    plt.figure(figsize=(14, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('大语言模型相关视频弹幕词云图', fontsize=18, pad=20)
    plt.tight_layout()

    # Save the raw cloud image (without the matplotlib title/frame)
    # next to the script.
    wc_image_path = get_absolute_path("wc_image.png")
    wc.to_file(wc_image_path)
    print(f"✓ 词云图已保存为: {wc_image_path}")

    # Blocks until the window is closed when a GUI backend is active.
    plt.show()

    # Report the top-weighted words actually placed in the cloud
    # (wc.words_ maps word -> normalized frequency in [0, 1]).
    word_freq = wc.words_
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
    print("词云中的主要词汇:")
    for word, freq in top_words:
        print(f" {word}: {freq:.3f}")

except Exception as e:
    print(f"生成词云图时出错: {e}")
    import traceback
    traceback.print_exc()
|
|
|
|
|
|
# Final sanity check: verify both output artifacts were actually written.
print("\n=== 生成文件检查 ===")
files_to_check = ['danmu_statistics.xlsx', 'wc_image.png']

all_files_exist = True
for filename in files_to_check:
    file_path = get_absolute_path(filename)
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        # Bug fix: these messages previously printed the literal text
        # "(unknown)" instead of the file name being checked.
        print(f"✓ {filename}: 存在 ({file_size} 字节)")
    else:
        print(f"✗ {filename}: 不存在")
        all_files_exist = False

if all_files_exist:
    print("🎉 所有文件生成成功!")
else:
    print("⚠ 部分文件生成失败")

print("程序执行完毕!")