|
|
"""
|
|
|
从B站获取视频的弹幕并保存到文件中
|
|
|
"""
|
|
|
|
|
|
import re
|
|
|
import json
|
|
|
import requests
|
|
|
from common_headers import HEADERS
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
def load_bv_numbers(file_path):
    """Read BV numbers from a text file, one per line.

    Leading and trailing whitespace is stripped from every line. Lines that
    are empty after stripping (blank separator lines, a trailing newline at
    the end of the file) are skipped so they never turn into empty BV numbers.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of the non-blank, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file object directly; no need to materialise every
        # line with readlines() first.
        stripped_lines = (line.strip() for line in f)
        return [bv for bv in stripped_lines if bv]
|
|
|
|
|
|
def _fetch_first_cid(bv):
    """Return the CID of the first page of video ``bv``, or None on failure.

    A network error, an HTTP error status, a body that is not valid JSON, or
    a payload without a ``data[0]['cid']`` entry is reported with ``print``
    and mapped to None, so one bad BV number cannot abort the whole batch.
    """
    try:
        response = requests.get(
            f'https://api.bilibili.com/x/player/pagelist?bvid={bv}&jsonp=jsonp',
            headers=HEADERS,
            timeout=10,
        )
        response.raise_for_status()
        # 'data' may be missing or None when the API rejects the BV number,
        # hence KeyError / TypeError / IndexError in the handler below.
        return response.json()['data'][0]['cid']
    except (requests.RequestException, ValueError, KeyError,
            IndexError, TypeError) as exc:
        print(f'Skipping {bv!r}: could not fetch CID ({exc!r})')
        return None


def fetch_video_cids(bv_list):
    """Look up the CID of the first page of each video concurrently.

    Args:
        bv_list: Iterable of BV numbers.

    Returns:
        A list of CIDs in the same relative order as ``bv_list``. BV numbers
        whose lookup failed are skipped (a warning is printed for each), so
        the result may be shorter than the input.
    """
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so ordering is kept
        # even though the requests run in parallel.
        cids = list(executor.map(_fetch_first_cid, bv_list))
    return [cid for cid in cids if cid is not None]
|
|
|
|
|
|
def _fetch_danmu_texts(cid):
    """Download the danmu document for ``cid`` and return its comment texts.

    Returns an empty list, after printing a warning, when the request fails
    or the server answers with an HTTP error status.
    """
    # Function-scope import keeps this block self-contained.
    from html import unescape

    try:
        response = requests.get(
            f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}',
            headers=HEADERS,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Skipping cid {cid}: could not fetch danmu ({exc!r})')
        return []
    response.encoding = response.apparent_encoding
    raw_texts = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
    # The body is XML-style markup (the <d ...> elements matched above), in
    # which characters such as & and < inside the text are entity-escaped;
    # decode them so the file holds the comments as they were typed.
    return [unescape(text) for text in raw_texts]


def fetch_and_save_danmu(cid_list, danmu_file):
    """Download the danmu of every CID and append them to ``danmu_file``.

    Downloads run concurrently. Results are written in the order of
    ``cid_list``, one comment per line, as soon as each becomes available,
    so earlier results are already on disk if a later step fails. CIDs whose
    download fails are skipped with a printed warning.

    Args:
        cid_list: CIDs to download danmu for.
        danmu_file: Path of the UTF-8 text file to append to.

    Returns:
        None.
    """
    if not cid_list:
        # Nothing to download; leave the output file untouched.
        return
    with ThreadPoolExecutor() as executor:
        # Open the output once instead of once per response. Opening it
        # before any request is submitted also surfaces a bad path early.
        with open(danmu_file, mode='a', encoding='utf-8') as f:
            for texts in executor.map(_fetch_danmu_texts, cid_list):
                f.writelines(f'{text}\n' for text in texts)
|
|
|
|
|
|
def main(bv_file_path='/output/bv_numbers.txt',
         danmu_output_file='/output/danmu.txt'):
    """Run the whole crawl: BV numbers -> CIDs -> danmu text file.

    Args:
        bv_file_path: Text file that lists the BV numbers to process.
            Defaults to the path that was previously hard-coded.
        danmu_output_file: Text file the danmu are saved to. Defaults to the
            path that was previously hard-coded.
    """
    bv_numbers = load_bv_numbers(bv_file_path)
    cids = fetch_video_cids(bv_numbers)
    fetch_and_save_danmu(cids, danmu_output_file)

    print("弹幕数据爬取完成")
|
|
|
|
|
|
# Start the crawl only when this file is executed as a script; importing it
# as a module must not trigger any network or file activity.
if __name__ == "__main__":
    main()
|