|
|
import collections # 用于词频统计
|
|
|
import json # 用于处理JSON数据
|
|
|
import requests # 用于发送HTTP请求
|
|
|
import re # 正则表达式模块,用于解析弹幕
|
|
|
import time # 用于时间相关操作
|
|
|
import openpyxl # 用于处理Excel文件
|
|
|
import pandas as pd # 用于数据处理
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed # 用于并发操作
|
|
|
import cProfile
|
|
|
|
|
|
profile = cProfile.Profile()
|
|
|
profile.enable()
|
|
|
|
|
|
|
|
|
# 定义开始和结束日期,用于生成日期范围
|
|
|
startdate = '20240710'
|
|
|
enddate = '20240910'
|
|
|
date = [x for x in pd.date_range(startdate, enddate).strftime('%Y-%m-%d')] # 生成日期列表
|
|
|
|
|
|
# 定义Excel文件名,用于保存弹幕数据
|
|
|
file_xlsx = '我的全部弹幕.xlsx'
|
|
|
|
|
|
# 创建Excel工作簿和工作表,并添加标题行
|
|
|
total_workbook = openpyxl.Workbook()
|
|
|
total_sheet = total_workbook.active
|
|
|
total_sheet.append(['弹幕'])
|
|
|
|
|
|
# 定义B站弹幕API的基础URL,{number}是占位符,用于填充视频的cid号
|
|
|
tempApi = 'https://api.bilibili.com/x/v1/dm/list.so?oid={number}'
|
|
|
|
|
|
# 定义请求头,包含cookie和user-agent,用于伪装请求
|
|
|
headers = {
|
|
|
'cookie':"buvid3=D65868DE-AFD5-34A4-1714-A1C0F783C5DC27124infoc; b_nut=1725930527; _uuid=FF569C27-D2C6-10814-36A8-48AA8141364924857infoc; CURRENT_FNVAL=4048; buvid_fp=2ba89565eab107e1e14c7982fc1ef9ea; buvid4=FAB9A58B-B8F5-8DAF-2AC4-4E874D3D1F0E28371-024091001-a%2FA7nVxQVETBwJOeuHlVsQ%3D%3D; rpdid=|(u))kkYu|lu0J'u~klmJ|lkm; SESSDATA=e8f35e7e%2C1741482645%2Cb3572%2A91CjC7hBYEVq-d38AwweerB9sclbgqT78LR6aribbsaBRVlJ0BoUjCMidR-nm82eDlo70SVlVibjl1UnQ0Y0NzSFFCb21DRGNNSXp4YnRSbFdzMXo0NjR4QkM0TlBKejUweW1TbDJkT0g3Z2Z6bTdmQVJzdmpvVHZmR1JWOEhtbnFGZmpuQUt6WXZnIIEC; bili_jct=7d37b038ea7714a0c41ec3d26603737b; DedeUserID=1917958039; DedeUserID__ckMd5=eaa26b970b7e3104; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjYxOTIwNTMsImlhdCI6MTcyNTkzMjc5MywicGx0IjotMX0.82V6_w7kGoSvzDy9rT-9DpsL7U_BrB24GefbBM0Vvb8; bili_ticket_expires=1726191993; header_theme_version=CLOSE; enable_web_push=DISABLE; home_feed_column=5; browser_resolution=1536-730; b_lsid=953CBCA8_191E441EE95; bp_t_offset_1917958039=976131738646347776; sid=hl295qcj",
|
|
|
'user-agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
|
|
|
}
|
|
|
|
|
|
# 定义函数,获取搜索结果中的bvid(视频的唯一标识符)
|
|
|
def get_bvid(page_number, number):
|
|
|
# 构造搜索API的URL,page_number是页码,number是该页中的视频编号
|
|
|
url = f'https://api.bilibili.com/x/web-interface/search/type?page={page_number}&page_size=50&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&search_type=video'
|
|
|
response = requests.get(url=url, headers=headers) # 发送请求
|
|
|
try:
|
|
|
# 解析返回的JSON数据,提取视频的bvid
|
|
|
json_data = json.loads(response.text)
|
|
|
print(json_data)
|
|
|
bvid = json_data['data']['result'][number]['bvid']
|
|
|
print(f"获取到bvid: {bvid}")
|
|
|
return bvid # 返回bvid
|
|
|
except (KeyError, IndexError, json.JSONDecodeError, requests.RequestException) as e:
|
|
|
print(f"获取bvid时出错: {e}")
|
|
|
# 捕获错误并返回None,防止程序崩溃
|
|
|
return None
|
|
|
|
|
|
# 定义函数,根据bvid获取视频的cid(弹幕对应的唯一标识符)
|
|
|
def get_cid(bvid):
|
|
|
try:
|
|
|
# 通过bvid构造获取cid的API请求URL
|
|
|
url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp'
|
|
|
response = requests.get(url, headers=headers) # 发送请求
|
|
|
if response.status_code != 200:
|
|
|
# 如果请求状态码不是200,返回None
|
|
|
return None
|
|
|
# 解析返回的JSON数据,提取cid
|
|
|
json_dict = json.loads(response.text)
|
|
|
return json_dict['data'][0]['cid'] # 返回cid
|
|
|
except (KeyError, IndexError, json.JSONDecodeError, requests.RequestException):
|
|
|
return None # 捕获错误并返回None
|
|
|
|
|
|
# 定义函数,获取并保存某个视频的弹幕
|
|
|
def fetch_and_save_bulletchat(cid):
|
|
|
# 用cid替换API中的占位符
|
|
|
url = tempApi.replace("{number}", str(cid))
|
|
|
try:
|
|
|
# 发送请求获取弹幕数据
|
|
|
response = requests.get(url, headers=headers)
|
|
|
response.encoding = response.apparent_encoding # 设置编码
|
|
|
# 使用正则表达式解析弹幕内容
|
|
|
data = re.findall('<d p=".*?">(.*?)</d>', response.text)
|
|
|
if data:
|
|
|
return data # 返回弹幕列表
|
|
|
except requests.RequestException:
|
|
|
return [] # 如果请求失败,返回空列表
|
|
|
|
|
|
# 定义函数,批量获取bvid和cid,并创建并发任务
|
|
|
def put_api():
|
|
|
tasks = []
|
|
|
# 使用ThreadPoolExecutor创建线程池,用于并发请求
|
|
|
with ThreadPoolExecutor(max_workers=10) as executor:
|
|
|
# 控制页码范围(1到5页),每页50个视频
|
|
|
for i in range(1, 7):
|
|
|
for j in range(50):
|
|
|
bvid = get_bvid(i, j) # 获取bvid
|
|
|
if bvid:
|
|
|
cid = get_cid(bvid) # 获取cid
|
|
|
if cid:
|
|
|
# 提交弹幕抓取任务到线程池
|
|
|
tasks.append(executor.submit(fetch_and_save_bulletchat, cid))
|
|
|
return tasks # 返回任务列表
|
|
|
|
|
|
# 定义函数,处理并发任务,收集所有弹幕数据
|
|
|
def get_data(tasks):
|
|
|
all_bulletchats = []
|
|
|
# 遍历所有完成的任务,获取结果
|
|
|
for task in as_completed(tasks):
|
|
|
bulletchat_data = task.result()
|
|
|
if bulletchat_data:
|
|
|
all_bulletchats.extend(bulletchat_data) # 将弹幕数据加入总列表
|
|
|
return all_bulletchats # 返回所有弹幕数据
|
|
|
|
|
|
# 定义函数,将弹幕数据保存到文件和Excel中
|
|
|
def save_to_file(bulletchats):
|
|
|
# 打开文本文件,将弹幕逐行写入
|
|
|
with open('我的全部弹幕.txt', 'a', encoding='utf-8') as file_txt:
|
|
|
for index in bulletchats:
|
|
|
file_txt.write(index + '\n')
|
|
|
total_sheet.append([index]) # 将弹幕写入Excel表格
|
|
|
total_workbook.save(file_xlsx) # 保存Excel文件
|
|
|
|
|
|
# 定义函数,计算弹幕频次,并保存到Excel
|
|
|
def calculate_frequency():
|
|
|
try:
|
|
|
# 读取弹幕Excel文件
|
|
|
fd = pd.read_excel(file_xlsx)
|
|
|
lines = fd['弹幕']
|
|
|
# 将所有弹幕拼接成一个字符串
|
|
|
text = ' '.join(lines.astype(str))
|
|
|
words = text.split() # 将弹幕分割为单词
|
|
|
word_counts = collections.Counter(words) # 统计单词频次
|
|
|
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True) # 按频次排序
|
|
|
|
|
|
# 创建新的Excel工作簿用于保存频次统计结果
|
|
|
workbook = openpyxl.Workbook()
|
|
|
sheet = workbook.active
|
|
|
sheet.append(['弹幕', '频次']) # 添加标题行
|
|
|
|
|
|
# 将排序后的词频结果写入Excel
|
|
|
for word, count in sorted_word_counts:
|
|
|
sheet.append([word, count])
|
|
|
|
|
|
workbook.save('我的统计弹幕出现次数.xlsx') # 保存频次统计的Excel文件
|
|
|
except Exception as e:
|
|
|
print(f"计算频次时出错: {e}")
|
|
|
|
|
|
# 主函数,负责执行整个流程
|
|
|
def main():
|
|
|
tasks = put_api() # 获取bvid和cid并创建并发任务
|
|
|
bulletchats = get_data(tasks) # 获取所有弹幕数据
|
|
|
save_to_file(bulletchats) # 保存弹幕数据到文件和Excel
|
|
|
calculate_frequency() # 计算弹幕频次
|
|
|
print("Finished") # 输出流程结束信息
|
|
|
|
|
|
|
|
|
# 如果此脚本被直接运行,则调用main函数
|
|
|
if __name__ == '__main__':
|
|
|
main()
|
|
|
|
|
|
profile.disable()
|
|
|
# Save the profiling data to a file
|
|
|
profile.dump_stats('./output.prof')
|
|
|
|