You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

374 lines
13 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import json
import re
import openpyxl
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
import logging
import jieba
import time
import random
import os
import platform
import matplotlib.font_manager as fm
# Set the working directory to this script's own directory so every relative
# output path (Excel sheet, word-cloud image) lands next to the code.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print(f"工作目录已设置为: {os.getcwd()}")
# Accumulator for all danmaku (bullet comments) collected across videos.
danmu_list = []
# Bilibili search API. The keyword is the URL-encoded form of "大语言模型"
# ("large language model"); 50 results per page, videos only.
url = "https://api.bilibili.com/x/web-interface/wbi/search/type?page_size=50&keyword=%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B&search_type=video"
# Browser-like headers; Referer is required by Bilibili's API gateway.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15",
    "Referer": "https://www.bilibili.com/"
}
# NOTE(review): these are live, account-specific session cookies (SESSDATA,
# bili_jct, DedeUserID are login credentials). They should not be committed
# to source control — load them from the environment or a config file, and
# rotate the exposed session.
cookies = {
    "buvid4": "E1155D85-CACE-0757-3AB5-FAA55B9BA4CE85795-022082117-PbDzNyYRQFSIZY4dZRE2fg%3D%3D",
    "theme-tip-show": "SHOWED",
    "browser_resolution": "1329-262",
    "home_feed_column": "4",
    "b_lsid": "62213F55_19A436DA31C",
    "CURRENT_FNVAL": "4048",
    "sid": "em2kxc9a",
    "bp_t_offset_114343848": "1130356626901958656",
    "bili_ticket": "eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3NjIyNzkwNDAsImlhdCI6MTc2MjAxOTc4MCwicGx0IjotMX0.Yez-DzUP_yfD_omjBoKCpT_PZEZRunGBW_Em9KagqWk",
    "bili_ticket_expires": "1762278980",
    "enable_web_push": "DISABLE",
    "theme-avatar-tip-show": "SHOWED",
    "DedeUserID": "114343848",
    "DedeUserID__ckMd5": "a1ad3ab2cc29d68f",
    "SESSDATA": "1dc9a0f1%2C1771906511%2C3c84d%2A81CjC9x3-vzqR5SfpsKCUQ-HpeBVibiRaFbXisrTSWYmTkF0hJrYVG3iu7_ZcclIW6ZVQSVjVLU0tIekMyNXNVZVVUNGlYQ2pkaEJnTC02eEdlWFl4NjNGa3U5a1YtdnNmNWVlRU9VR21uTnpwY0VlVlQtcVZXN2pTbUtDaHBuVnRoUWlyWkU4M3l3IIEC",
    "bili_jct": "1b4c0293f347e832f38b30180cc959eb",
    "_uuid": "4E75EE8A-3B34-4C2D-5591-C5E6FE654BF858202infoc",
    "b_nut": "1756354457",
    "buvid3": "46F5C470-8FAD-264F-4587-E064C939733057781infoc",
    "enable_feed_channel": "DISABLE",
    "header_theme_version": "CLOSE",
    "CURRENT_QUALITY": "80",
    "buvid_fp": "206bf26677599ed3a4a4915740f56e57",
    "rpdid": "0zbfAGEhWy|OtunE0Mn|182|3w1OW2y6",
    "LIVE_BUVID": "AUTO5916605750922671"
}
def get_absolute_path(filename):
    """Return the absolute path of *filename* inside this script's directory."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, filename)
def get_font_path():
    """Locate a CJK-capable font file for the current operating system.

    Probes a fixed list of well-known font paths for the detected platform
    and returns the first one that exists on disk. Returns None when no
    candidate is found (the word cloud then uses its default font and
    Chinese glyphs may render as boxes).
    """
    print("正在查找可用字体...")
    candidates = {
        "Darwin": (
            "/System/Library/Fonts/PingFang.ttc",        # PingFang
            "/System/Library/Fonts/STHeiti Medium.ttc",  # STHeiti (some macOS versions)
            "/Library/Fonts/Arial Unicode.ttf",          # Arial Unicode fallback
        ),
        "Windows": (
            "C:/Windows/Fonts/msyh.ttc",     # Microsoft YaHei
            "C:/Windows/Fonts/simhei.ttf",   # SimHei
            "C:/Windows/Fonts/simsun.ttc",   # SimSun
            "C:/Windows/Fonts/arialuni.ttf", # Arial Unicode fallback
        ),
        "Linux": (
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",   # WenQuanYi Micro Hei
            "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",     # WenQuanYi Zen Hei
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",  # fallback (may lack CJK)
        ),
    }
    for font_path in candidates.get(platform.system(), ()):
        if os.path.exists(font_path):
            print(f"✓ 找到系统字体: {os.path.basename(font_path)}")
            return font_path
    print("⚠ 未找到特定中文字体,使用系统默认字体(中文可能显示为方框)")
    return None
def load_mask_image():
    """Try to load a shape mask for the word cloud from the script directory.

    Scans a fixed set of common mask file names next to this script and
    returns the first image that opens successfully, converted to a numpy
    array. Returns None when nothing usable is found (rectangular cloud).
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidate_names = (
        "mask.png", "mask.jpg", "shape.png", "cloud.png",
        "heart.png", "star.png", "circle.png", "wordcloud_mask.png",
    )
    for name in candidate_names:
        path = os.path.join(script_dir, name)
        if not os.path.exists(path):
            continue
        try:
            image = np.array(Image.open(path))
        except Exception as e:
            # Corrupt/unreadable file: report and keep trying the rest.
            print(f"加载形状图片 {name} 失败: {e}")
            continue
        print(f"✓ 找到形状图片: {name}")
        return image
    print("⚠ 未找到形状图片,将使用矩形词云")
    return None
# Task 1: fetch danmaku (bullet comments) for one video.
def get_danmu(bvid, headers):
    """Fetch all danmaku text for the video identified by *bvid*.

    First resolves the video's internal ``cid`` via the web-interface view
    API, then downloads the XML danmaku feed keyed by that cid and extracts
    the comment bodies with a regex.

    Returns a (possibly empty) list of danmaku strings; every failure is
    logged and yields an empty list so the caller's loop keeps going.
    """
    try:
        # Let requests build/encode the query string instead of string
        # concatenation, and parse JSON via the Response API.
        view_resp = requests.get(
            "https://api.bilibili.com/x/web-interface/view",
            params={"bvid": bvid},
            headers=headers,
            timeout=10,
        )
        cid_res = view_resp.json()
        if cid_res.get('code') != 0:
            print(f"获取cid失败: {cid_res.get('message')}")
            return []
        # None-safe extraction: 'data' may be missing or lack 'cid'.
        cid = (cid_res.get('data') or {}).get('cid')
        if cid is None:
            print(f"视频 {bvid} 无法获取cid可能已删除或无法访问")
            return []
        danmu_resp = requests.get(
            f"https://comment.bilibili.com/{cid}.xml",
            headers=headers,
            timeout=10,
        )
        danmu_resp.encoding = 'utf-8'  # feed is UTF-8 XML
        danmus = re.findall(r'<d p=".*?">(.*?)</d>', danmu_resp.text)
        print(f"视频 {bvid} 获取到 {len(danmus)} 条弹幕")
        return danmus
    except Exception as e:
        # Network / JSON errors are non-fatal: skip this video.
        print(f"获取弹幕时出错: {e}")
        return []
# Fetch video data: walk the first 6 pages of search results and collect
# danmaku from every video hit.
print("开始搜索'大语言模型'相关视频...")
for i in range(6):
    try:
        sess = requests.session()
        req = sess.get(url + "&page=" + str(i+1), headers=headers, cookies=cookies, timeout=10)
        res = json.loads(req.text)
        print(f"{i+1}页请求状态码: {req.status_code}")
        if res.get('code') != 0:
            print(f"请求失败,状态码: {res.get('code')}")
            continue
        if 'data' not in res or 'result' not in res['data']:
            print(f"{i+1}页未找到视频数据")
            continue
        videos = res['data']['result']
        print(f"{i+1}页找到 {len(videos)} 个视频")
        video_count = 0
        for item in videos:
            # Search results mix content types; keep only genuine videos.
            if item.get('type') == 'video' and 'bvid' in item:
                danmu_list.extend(get_danmu(item['bvid'], headers))
                video_count += 1
                time.sleep(1)  # throttle per-video requests to avoid rate-limiting
        print(f"{i+1}页处理了 {video_count} 个视频")
    except Exception as e:
        print(f"处理第{i+1}页时出错: {e}")
    # Per-page delay between search requests.
    # NOTE(review): source indentation was lost; this sleep could also have
    # been inside the except-branch as a backoff — confirm against the original.
    time.sleep(2)
# Fall back to sample data when scraping produced nothing (e.g. offline or
# blocked), so the statistics and word-cloud stages still have input.
if not danmu_list:
    print("未获取到弹幕数据,使用示例数据")
    danmu_list = [
        "大语言模型", "AI", "人工智能", "深度学习", "机器学习",
        "Transformer", "gpt", "神经网络", "deepseek", "预训练",
        "计算机", "ChatGPT", "LLM", "语言模型", "生成式AI",
        "大模型", "深度学习", "人工智能", "机器学习", "自然语言处理"
    ] * 10
print(f"总共获取到 {len(danmu_list)} 条弹幕")
# Task 2: count danmaku frequencies and persist the top 20 to an Excel sheet.
print("开始统计弹幕...")
danmu_count = Counter(danmu_list)
top_20_count = danmu_count.most_common(20)
# Build the workbook row-by-row: header first, then one row per entry.
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.append(['弹幕', '数量'])
for danmu, count in top_20_count:
    sheet.append([danmu, count])
# Save next to the script so the output is easy to find.
excel_path = get_absolute_path('danmu_statistics.xlsx')
workbook.save(excel_path)
print(f"✓ Excel表已保存到: {excel_path}")
# Echo the ranking to the console as well.
print("数量排名前20的弹幕")
for danmu, count in top_20_count:
    print(f"{danmu}: {count}")
# Task 3: generate the word cloud.
print("开始生成词云图...")
# Silence jieba's startup logging (dictionary-loading messages).
jieba.setLogLevel(logging.WARNING)
# Danmaku filter: drop entries that are pure punctuation/digits/emoji.
def filter_danmu(danmu):
    """Return *danmu* if it contains a CJK character or a Latin letter, else None."""
    if re.search(r'[\u4e00-\u9fffa-zA-Z]', danmu):
        return danmu
    return None
# Keep only danmaku that passed the content filter.
filtered_danmu = [d for d in danmu_list if filter_danmu(d)]
print(f"弹幕已过滤,剩余 {len(filtered_danmu)}")
# Join everything into one string for segmentation.
text = ' '.join(filtered_danmu)
# Tokenize with jieba so WordCloud can split Chinese text into words;
# on failure fall back to the raw, unsegmented text.
try:
    wc_list = jieba.lcut(text)
    wc_text = ' '.join(wc_list)
    print("✓ 中文分词完成")
    # Print the first 20 tokens for debugging.
    print("分词示例:", wc_list[:20])
except Exception as e:
    print(f"分词时出错: {e}")
    wc_text = text
# Stop words — filter only genuinely meaningless tokens so that topical
# vocabulary (model names, technical terms) survives into the cloud.
stopwords = set(STOPWORDS)
# NOTE(review): in this copy the single-character entries were corrupted
# (they appeared as empty strings, which are no-ops in a stopword set).
# Restored with the conventional Chinese tokens the category comments
# describe — confirm against the original source if available.
basic_stopwords = [
    # interjections / filler sounds
    "啊", "哦", "嗯", "呀", "哈", "嘿", "哇", "呵", "唉", "哎",
    # meaningless single characters (particles etc.)
    "的", "了", "着", "呢", "吧", "吗", "嘛", "啦", "呗", "咯",
    "哟", "喔", "噢", "咦", "诶", "嗷",
    # basic function words
    "是", "在", "有", "和", "就", "都", "也", "还", "又", "很",
    "我们", "你们", "他们", "这个", "那个", "这些", "那些",
    "就是", "可以", "什么", "怎么", "为什么", "因为", "所以",
    "但是", "然后", "如果", "虽然", "不过", "其实", "当然"
]
stopwords.update(basic_stopwords)
print(f"停用词数量: {len(stopwords)}")
# Build and render the word cloud; any failure is caught and reported with
# a full traceback so the final file check still runs.
try:
    # Pick a CJK-capable font and an optional shape mask.
    font_path = get_font_path()
    mask_image = load_mask_image()
    # WordCloud constructor arguments (see wordcloud.WordCloud docs).
    wordcloud_config = {
        'width': 1200,
        'height': 800,
        'background_color': 'white',
        'stopwords': stopwords,
        'mask': mask_image,
        'max_words': 100,         # fewer words keeps important terms prominent
        'colormap': 'viridis',
        'relative_scaling': 0.3,  # balance word frequency vs. rank when sizing
        'min_font_size': 8,
        'max_font_size': 120,
        'random_state': 42,       # deterministic layout across runs
        'collocations': False,    # avoid showing repeated bigrams
    }
    # Only pass font_path when we actually found one; otherwise WordCloud
    # uses its bundled default (CJK glyphs may render as boxes).
    if font_path:
        wordcloud_config['font_path'] = font_path
        print(f" 使用字体: {font_path}")
    else:
        print(" 使用系统默认字体")
    wc = WordCloud(**wordcloud_config)
    # Sanity-check the input text before generation.
    print(f"生成词云的文本长度: {len(wc_text)}")
    # Pad with sample vocabulary when the scrape produced too little text.
    if len(filtered_danmu) < 50:
        print("弹幕数据较少,混合示例数据")
        example_words = ["大语言模型", "AI", "人工智能", "深度学习", "机器学习",
                         "Transformer", "GPT", "神经网络", "预训练", "ChatGPT",
                         "LLM", "语言模型", "生成式AI", "大模型", "自然语言处理",
                         "计算机", "算法", "数据", "训练", "推理"]
        extra_text = ' '.join(example_words * 3)
        wc_text = wc_text + " " + extra_text
    wc = wc.generate(wc_text)
    # Render via matplotlib.
    # NOTE(review): plt.title draws with matplotlib's own font, not
    # font_path — the Chinese title may still show boxes; verify locally.
    plt.figure(figsize=(14, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('大语言模型相关视频弹幕词云图', fontsize=18, pad=20)
    plt.tight_layout()
    # Save the cloud image next to the script.
    wc_image_path = get_absolute_path("wc_image.png")
    wc.to_file(wc_image_path)
    print(f"✓ 词云图已保存为: {wc_image_path}")
    # Show the figure (blocks until the window is closed in GUI backends).
    plt.show()
    # Report the highest-weighted words actually placed in the cloud.
    word_freq = wc.words_
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]
    print("词云中的主要词汇:")
    for word, freq in top_words:
        print(f" {word}: {freq:.3f}")
except Exception as e:
    print(f"生成词云图时出错: {e}")
    import traceback
    traceback.print_exc()
# Final sanity check: verify both output artifacts exist on disk.
print("\n=== 生成文件检查 ===")
files_to_check = ['danmu_statistics.xlsx', 'wc_image.png']
all_files_exist = True
for filename in files_to_check:
    file_path = get_absolute_path(filename)
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        # BUG FIX: the message printed the literal "(unknown)" instead of
        # interpolating the file being checked; report the actual name.
        print(f"{filename}: 存在 ({file_size} 字节)")
    else:
        print(f"{filename}: 不存在")
        all_files_exist = False
if all_files_exist:
    print("🎉 所有文件生成成功!")
else:
    print("⚠ 部分文件生成失败")
print("程序执行完毕!")