import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import jieba
import time


# Fetch the bvids on one search-result page (run in worker threads).
def fetch_page(url, headers):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    data = response.json()
    videos = []
    if 'data' in data and 'result' in data['data']:
        for item in data['data']['result']:
            if item['result_type'] == 'video':
                for video in item['data']:
                    videos.append(video['bvid'])
    return videos


# Collect up to max_results bvids for the given keyword.
def get_video_ids(keyword, max_results):
    # Fill in your own Cookie.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'Referer': 'https://www.bilibili.com/',
        'Origin': 'https://www.bilibili.com',
        'Cookie': ''
    }
    videos = []
    os.makedirs('videos_ids', exist_ok=True)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        # Each search page holds about 30 results; round up so that
        # max_results < 30 still fetches at least one page.
        for page in range(1, (max_results + 29) // 30 + 1):
            url = f'https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}'
            futures.append(executor.submit(fetch_page, url, headers))
        # Append the bvids to a file as each page completes.
        for future in as_completed(futures):
            result = future.result()
            videos.extend(result)
            with open(f'videos_ids/{keyword}.txt', 'a', encoding='utf-8') as file:
                file.write('\n'.join(result) + '\n')
    print('bvids fetched')
    return videos[:max_results]


# Fetch the danmaku (bullet comments) of a video.
def get_danmaku(bvid):
    # Fill in your own Cookie.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'Cookie': ''
    }
    with requests.Session() as session:
        session.headers.update(headers)  # apply the headers to every request in the session
        # Look up the cid of the video's first part.
        cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}'
        cid_response = session.get(cid_url)
        cid_data = cid_response.json()
        cid = cid_data['data'][0]['cid']
        print('Extracted video', cid_data['data'][0]['part'], 'cid:', cid)
        # Download and parse the danmaku XML; lxml picks up the encoding
        # from the XML declaration, so the raw bytes are passed through.
        danmaku_url = f'https://comment.bilibili.com/{cid}.xml'
        danmaku_response = session.get(danmaku_url)
        soup = BeautifulSoup(danmaku_response.content, 'lxml-xml')
        danmakus = [d.text for d in soup.find_all('d')]
    return danmakus


# Load the filter keywords for a topic from keywords.json.
def load_keywords(keyword):
    with open('keywords.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data[keyword]


# Keep only the danmakus that contain at least one keyword.
def filter_danmakus(danmakus, keywords):
    return [dm for dm in danmakus if any(kw in dm for kw in keywords)]


# Load stopwords, one per line.
def load_stopwords(path):
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}


# Tokenize with jieba and drop the stopwords.
def remove_stopwords(text, stopwords):
    words = jieba.cut(text)
    return ' '.join(word for word in words if word not in stopwords)
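
# Usage sketch (not part of the original script): one way to wire the helpers
# above into a pipeline. The topic string, the stopwords path 'stopwords.txt',
# and max_results=60 are illustrative assumptions; it also assumes
# keywords.json maps each topic to a list of filter words, as load_keywords
# implies.
if __name__ == '__main__':
    topic = '原神'  # hypothetical search keyword
    bvids = get_video_ids(topic, max_results=60)
    keywords = load_keywords(topic)
    stopwords = load_stopwords('stopwords.txt')  # hypothetical stopword file

    for bvid in bvids:
        danmakus = get_danmaku(bvid)
        for dm in filter_danmakus(danmakus, keywords):
            print(remove_stopwords(dm, stopwords))
        time.sleep(1)  # throttle requests to the API (uses the `time` import)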