import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import jieba
import time


# Fetch one search-result page and collect the bvids on it (run from worker threads)
def fetch_page(url, headers):
    response = requests.get(url, headers=headers)
    data = response.json()
    videos = []
    if 'data' in data and 'result' in data['data']:
        for item in data['data']['result']:
            if item['result_type'] == 'video':
                for video in item['data']:
                    videos.append(video['bvid'])
    return videos


# Collect all bvids returned for the given keyword
def get_video_ids(keyword, max_results):
    # Fill in your own Cookie
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'Referer': 'https://www.bilibili.com/',
        'Origin': 'https://www.bilibili.com',
        'Cookie': ''
    }
    videos = []
    os.makedirs('videos_ids', exist_ok=True)  # make sure the output directory exists
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for page in range(1, max_results // 30 + 1):  # pagination assumes ~30 results per page
            url = f'https://api.bilibili.com/x/web-interface/search/all/v2?keyword={keyword}&page={page}'
            futures.append(executor.submit(fetch_page, url, headers))
        # Append the bvids to a file as each page completes
        for future in as_completed(futures):
            result = future.result()  # fetch the result once instead of calling result() twice
            videos.extend(result)
            with open(f'videos_ids/{keyword}.txt', 'a', encoding='utf-8') as file:
                file.write('\n'.join(result) + '\n')
    print('bvids fetched')
    return videos[:max_results]


# Fetch the danmaku (bullet comments) for one video
def get_danmaku(bvid):
    # Fill in your own Cookie
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'Cookie': ''
    }
    # Look up the video's cid first
    with requests.Session() as session:
        session.headers.update(headers)  # apply the headers to every request in the session
        cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}'
        cid_response = session.get(cid_url)
        cid_data = cid_response.json()
        cid = cid_data['data'][0]['cid']
        print('Fetched video', cid_data['data'][0]['part'], 'cid:', cid)
        danmaku_url = f'https://comment.bilibili.com/{cid}.xml'
        danmaku_response = session.get(danmaku_url)
        danmaku_response.encoding = 'utf-8'  # set the encoding
        soup = BeautifulSoup(danmaku_response.content, 'lxml-xml')  # parse the XML
        danmakus = [d.text for d in soup.find_all('d')]
        return danmakus


# Load the filter keywords for a topic
def load_keywords(keyword):
    with open('keywords.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data[keyword]


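# A minimal sketch of the keywords.json layout this loader expects, inferred
# from the data[keyword] lookup above; the key and values shown here are
# assumptions, not part of the original repo:
# {
#     "<search keyword>": ["filter word 1", "filter word 2"]
# }

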
# Keep only the danmakus that contain at least one of the keywords
def filter_danmakus(danmakus, keywords):
    filtered = [dm for dm in danmakus if any(kw in dm for kw in keywords)]
    return filtered


# Load the stopword list (one word per line)
def load_stopwords(path):
    with open(path, 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    return stopwords


# Tokenize with jieba and drop stopwords
def remove_stopwords(text, stopwords):
    words = jieba.cut(text)
    filtered_words = ' '.join(word for word in words if word not in stopwords)
    return filtered_words
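

# A minimal usage sketch (not in the original file) showing how the functions
# above chain together. The keyword, the result cap, and the stopword file
# path are assumptions; substitute your own values and Cookie first.
if __name__ == '__main__':
    keyword = '原神'                                 # assumed search keyword
    bvids = get_video_ids(keyword, 90)               # 90 results -> pages 1-3
    filter_words = load_keywords(keyword)            # filter words from keywords.json
    stopwords = load_stopwords('stopwords.txt')      # assumed stopword file path
    for bvid in bvids:
        danmakus = get_danmaku(bvid)
        for dm in filter_danmakus(danmakus, filter_words):
            print(remove_stopwords(dm, stopwords))
        time.sleep(1)  # throttle requests between videos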