|
|
import asyncio
import configparser
import itertools
import os
import re
import sys
import time
from collections import Counter
from multiprocessing import Pool
from random import random

import requests
from bilibili_api import video
from fake_useragent import UserAgent
from openpyxl import Workbook
|
|
|
|
|
|
|
|
|
# Module-level parameters shared by the crawler functions.
uA = UserAgent()
BVs = []  # stores BV ids
counts = Counter()  # danmaku text -> number of occurrences
header = {  # request headers copied from a browser session
    'Accept': 'application/json, text/plain, */*',  # accepted response types: JSON, plain text, anything else
    'Accept-Encoding': 'gzip, deflate, br',  # supported content encodings
    'Accept-Language': 'zh-CN,zh;q=0.9',  # preferred languages, Simplified Chinese first
    'Origin': 'https://search.bilibili.com',  # origin of the request
    'Referer': 'https://search.bilibili.com/all?vt=17031316&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A',  # page the request claims to come from
    'Sec-Ch-Ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Microsoft Edge";v="128"',  # client browser brands
    'Sec-Ch-Ua-Mobile': '?0',  # ?0 means "not a mobile device"
    'Sec-Ch-Ua-Platform': '"Windows"',  # client operating system
    'Sec-Fetch-Dest': 'empty',  # request destination type
    'Sec-Fetch-Mode': 'cors',  # request mode: cross-origin resource sharing
    'Sec-Fetch-Site': 'same-site',  # relationship between request origin and target
    'User-Agent': uA.random,  # random user-agent string from fake_useragent
}

# Minimal headers used for the danmaku request.
# The key must be spelled 'User-Agent'; 'UserAgent' is not an HTTP header
# name, so the random user agent would never be presented as the client's
# User-Agent.
header1 = {
    'User-Agent': uA.random,
}
|
|
|
|
|
|
|
|
|
def get_Response(html_Url, h=header, timeout=10):
    """Send a GET request to ``html_Url`` and return the response.

    Args:
        html_Url: URL to request.
        h: Request headers; defaults to the module-level ``header`` dict.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        The ``requests.Response`` with its encoding set to UTF-8, or
        ``None`` when the request fails or the server answers with an
        error status (the error is printed).
    """
    try:
        response = requests.get(html_Url, headers=h, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求网页发生错误,错误信息:{e}")
        time.sleep(1)  # short pause before the caller continues
        return None

    response.encoding = 'utf-8'
    return response
|
|
|
|
|
|
def get_Target_Video(arg):
    """Fetch one page of Bilibili video search results and return its BV ids.

    Args:
        arg: A ``(page, target, step, Cookie)`` tuple, packed into a single
            argument so the function can be mapped over a ``Pool``:
            ``page`` is the 1-based result page, ``target`` the search
            keyword, ``step`` the number of videos requested per page and
            ``Cookie`` the Cookie header value to send.

    Returns:
        A list of every ``bvid`` value found in the response text, or an
        empty list when the request fails or returns a non-OK status.
    """
    page, target, step, Cookie = arg

    # Build the headers for this call instead of writing the Cookie into
    # the shared module-level dict.
    request_Headers = {**header, 'Cookie': Cookie}

    # Query parameters for the search API, copied from a browser session.
    # NOTE(review): 'w_rid', 'wts' and 'qv_id' are fixed captured values;
    # presumably they are request-signing fields that can go stale - confirm.
    params = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page,
        'page_size': step,  # number of videos per page
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': '1',
        'single_column': '0',
        'keyword': target,  # the search text
        'qv_id': '00d1q7iUbrvfsPdFZv2zXXtDqfcWuzER',
        'ad_resource': '5654',
        'source_tag': '3',
        'gaia_vtoken': '',
        'category_id': '',
        'search_type': 'video',
        'dynamic_offset': (step + 1) * (page - 1),
        'web_location': '1430654',
        'w_rid': 'f879864ca2210cc6a911552ceacadd6e',
        'wts': '1693922979',
    }

    api_Url = "https://api.bilibili.com/x/web-interface/wbi/search/type"  # search API
    print(f"开始爬取第{page}页视频")

    try:
        response = requests.get(
            api_Url, params=params, headers=request_Headers, timeout=10
        )
    except requests.exceptions.RequestException as e:
        # A network failure on one page must not abort the whole pool.map.
        print(f"请求网页发生错误,错误信息:{e}")
        return []

    if not response.ok:
        print(f"请求网页失败,状态码:{response.status_code},可能是Cookie有误")
        return []

    print(f"已爬取第{page}页视频")
    return re.findall(r'"bvid":"([^"]+)"', response.text)
|
|
|
|
|
|
|
|
|
def get_Cid(bvid):
    """Request the page list of the video ``bvid`` and return its first cid.

    Example video address: https://www.bilibili.com/video/BV1PK4y1b7dt?t=1

    Args:
        bvid: BV id of the video.

    Returns:
        The ``cid`` of the first entry in the response's ``data`` list, or
        ``None`` when the request fails or the response does not have the
        expected shape (the error is printed).
    """
    url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Decode the JSON body into a Python dict.
        res = response.json()
        return res["data"][0]['cid']
    except requests.exceptions.RequestException as e:
        print(f"请求cid发生错误, BV号: {bvid}, 错误信息: {e}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # Invalid JSON, or a payload without a usable "data" list.
        print(f"解析cid失败, BV号: {bvid}, 错误信息: {e!r}")
    return None
|
|
|
|
|
|
|
|
|
def get_Danmu(BV):
    """Download the danmaku of one video and count each distinct text.

    The BV id is resolved to a cid with ``get_Cid``; the danmaku XML for
    that cid is then fetched and the text of every ``<d>`` element is
    counted.

    Args:
        BV: BV id of the video.

    Returns:
        A ``Counter`` mapping danmaku text to its number of occurrences,
        or ``None`` when the cid or the danmaku content cannot be fetched.
    """
    # get_Cid is a plain synchronous function, so it is called directly;
    # handing its return value to an event loop raises TypeError.
    try:
        cid = get_Cid(BV)
    except (
        requests.exceptions.RequestException,
        AttributeError,
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as e:
        print(f"未能解析到 cid: {BV}, 错误信息: {e!r}")
        return None

    if not cid:
        print(f"未能解析到 cid: {BV}")
        return None

    print(f"正在解析cid:{cid}的弹幕")
    # NOTE(review): the ".xml" suffix ends up inside the oid query value;
    # confirm the endpoint accepts it.
    danmu_Url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}.xml"
    content = get_Response(danmu_Url, header1)
    if content is None:
        print(f"获取弹幕内容失败, BV号: {BV}")
        return None

    # Every danmaku is the text content of a <d ...> element.
    matches = re.findall(r'<d[^>]*>([^<]+)</d>', content.text)
    return Counter(matches)
|
|
|
|
|
|
|
|
|
|
|
|
def _write_Counts(worksheet, pairs):
    """Write ``(text, count)`` pairs to columns 1 and 2, one pair per row from row 1."""
    for row, (item, count) in enumerate(pairs, start=1):
        worksheet.cell(row=row, column=1, value=item)
        worksheet.cell(row=row, column=2, value=count)


def main():
    """Crawl search results, count their danmaku and export the counts to Excel.

    Reads the module-level settings ``target``, ``total_Num``, ``step``,
    ``Cookie`` and ``max_Processing_Num`` and accumulates results in the
    module-level ``counts`` Counter.

    Steps:
        1. Fetch ``total_Num // step`` search pages in a process pool and
           collect the BV ids.
        2. Fetch and count the danmaku of every video in a second pool.
        3. Write the 8 most common danmaku to the sheet "Top 8" and all
           danmaku to the sheet "所有弹幕" of ``output/Top8_danmu.xlsx``.
    """
    # Step 1: collect the BV ids of the search results with a process pool.
    print("Step 1: 利用线程池找到前综合排序前300个视频")
    args = [
        (page, target, step, Cookie)
        for page in range(1, (total_Num // step) + 1)
    ]
    # The context manager releases the workers even if a task raises.
    with Pool(10) as pool:
        pages = pool.map(get_Target_Video, args)  # one list of BV ids per page
    # Flatten the pages and drop repeated ids (order kept) so that no
    # video's danmaku is counted twice.
    bv_Ids = list(dict.fromkeys(itertools.chain.from_iterable(pages)))

    # Step 2: fetch and count the danmaku of every video.
    print("Step 2: 利用线程池对每一个视频的弹幕进行爬取并分析频率")
    # Too many workers trigger HTTP 403/412, so this pool is kept small.
    with Pool(max_Processing_Num) as pool:
        all_counts = pool.map(get_Danmu, bv_Ids)
    for count in all_counts:
        if count:  # get_Danmu returns None for videos that failed
            counts.update(count)
    print("分析完成,正在导入Excel....")

    # Step 3: export the 8 most common danmaku, plus the full list.
    print("Step 3: 取出现次数前8的弹幕,将他导入Excel")
    workbook = Workbook()
    worksheet1 = workbook.active
    worksheet1.title = "Top 8"
    _write_Counts(worksheet1, counts.most_common(8))

    # The second sheet keeps every danmaku for reference.
    worksheet2 = workbook.create_sheet(title="所有弹幕")
    _write_Counts(worksheet2, counts.most_common())

    output_Path = 'output/Top8_danmu.xlsx'
    # Saving fails when the target directory does not exist yet.
    os.makedirs(os.path.dirname(output_Path), exist_ok=True)
    workbook.save(output_Path)
    print("导入完成!")
|
|
|
|
|
|
|
|
|
import configparser
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: load the settings from config/config.ini (every
    # option has a built-in fallback), publish them as module-level names
    # read by main(), then run the crawl.
    config = configparser.ConfigParser()

    # Read the configuration file explicitly as UTF-8.
    try:
        with open('config/config.ini', encoding='utf-8') as f:
            config.read_file(f)
    except FileNotFoundError:
        # Every option below has a fallback, so a missing file is not fatal.
        print("未找到配置文件 config/config.ini,将使用默认配置")
    except UnicodeDecodeError as e:
        print(f"读取配置文件时发生编码错误: {e}")
        sys.exit(1)

    config_Section_Name = "CR_DEFAULT"

    # SECURITY NOTE(review): this fallback embeds what looks like a real
    # account session (SESSDATA / bili_jct) in source code; it should be
    # supplied through config/config.ini instead of being committed.
    default_Cookie = 'buvid4=99B52D59-84E1-D6E5-3334-00A5ABF5485951328-023051720-QsQqaACvYRbeH92Bxo5wCA%3D%3D; enable_web_push=DISABLE; header_theme_version=CLOSE; _uuid=FE44153E-EED7-BA1F-B241-10F7389A82B2B15963infoc; buvid3=46549500-1464-AD65-9582-A44F9C4A512231965infoc; b_nut=1720278731; DedeUserID=357687474; DedeUserID__ckMd5=611d95a70851412a; buvid_fp_plain=undefined; PVID=1; CURRENT_QUALITY=64; CURRENT_FNVAL=4048; rpdid=|(JlklRl)~Yu0Ju~kYuJkk)); SESSDATA=58accf91%2C1741958959%2Ce9f13%2A92CjCG0FTU3yXRKTjvmpd9wl7RXlqJ6qUlwhLziQUN_oRfPs8uPa7egHmLWm8l6qhWVxUSVmcxM0ZYbUM3LUhiYkwxckVKNFA4aDNoWFQ0cWR3clROMnZvR1YwRXBTcDN4WWM4aWZrT2RRUlBkZ2haMVJzbGx0N0JZT1VLNWpmZUJTajJHeHE0TnBRIIEC; bili_jct=29dfdcaafd6930788ec4b9d06acc07cd; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY2NjYzMjUsImlhdCI6MTcyNjQwNzA2NSwicGx0IjotMX0.xjzf0c6AFp5o6yLJ6ahcJvyGwJZHK-NDBZsmlX0F6As; bili_ticket_expires=1726666265; fingerprint=556cd93cd4317211728f459c230e65c0; bsource=search_bing; home_feed_column=5; browser_resolution=1482-787; bp_t_offset_357687474=977795428588191744; sid=6jfn4n6l; buvid_fp=556cd93cd4317211728f459c230e65c0; b_lsid=D64E4486_191FB90E96C'

    # Read the options.
    target = config.get(config_Section_Name, 'target', fallback="2024巴黎奥运会")
    total_Num = config.getint(config_Section_Name, 'total_Num', fallback=300)
    step = config.getint(config_Section_Name, 'step', fallback=30)
    max_Processing_Num = config.getint(config_Section_Name, 'max_Processing_Num', fallback=2)
    try:
        Cookie = config.get(config_Section_Name, 'Cookie', fallback=default_Cookie)
    except configparser.InterpolationError:
        # Cookie values usually contain URL-escapes such as "%3D", which
        # configparser's default interpolation rejects; read it verbatim.
        Cookie = config.get(config_Section_Name, 'Cookie', raw=True,
                            fallback=default_Cookie)

    # Add the Cookie to the existing module-level header dict. Rebinding
    # ``header`` to an empty dict here would throw away the browser headers
    # (User-Agent, Referer, ...) that get_Target_Video sends.
    header['Cookie'] = Cookie

    main()
|
|
|
|