parent
18da276714
commit
413b7721e7
@@ -1,2 +1,98 @@
# 1.1
import re
from multiprocessing.dummy import Pool

import requests
from tqdm import tqdm

# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search keyword ("2024 Paris Olympics")
DANMU_KEYWORD = "AI"  # keyword used to filter the danmu (bullet comments)
PAGENUM = 10  # number of search result pages to crawl
WORKERS = 6  # number of worker threads in the thread pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real Bilibili cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}


def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one search-results page."""
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
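
# Note on get_search_results_html: results are paged through the page query
# parameter, and order=click appears to request click-ranked ordering; an empty
# string is returned on failure so one bad page does not stop the whole crawl.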


def get_bvs(html: str) -> list:
    """Extract BV ids (bvid) from the search-results HTML."""
    return re.findall(r'bvid:"([^"]+)"', html)
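
# Note on get_bvs: the search page embeds its results as inline JSON, so each
# video id shows up in the raw HTML as a fragment like bvid:"BV1xx411c7XX"
# (illustrative value, not a real id), which the regex above captures.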


def get_info(vid: str) -> dict:
    """Fetch video metadata such as the title and danmaku count."""
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        if 'data' in data:
            info = {
                "title": data["data"]["View"]["title"],
                "total_danmaku": data["data"]["View"]["stat"]["danmaku"],
                "video_count": data["data"]["View"]["videos"],
                "cid": [dic["cid"] for dic in data["data"]["View"]["pages"]],
            }
            return info
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    return {}  # fall back to an empty dict when the request or lookup fails
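
# Note on get_info: the detail endpoint returns JSON whose data.View object
# carries the title, the stat.danmaku count, and a pages list with one cid per
# video part; those cid values are what get_danmu queries later.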


def get_danmu(info: dict) -> list:
    """Fetch a video's danmu and keep only those containing DANMU_KEYWORD."""
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url)
            response.encoding = "utf-8"
            data = re.findall('<d p="(.*?)">(.*?)</d>', response.text)
            dms = [d[1] for d in data if DANMU_KEYWORD in d[1]]  # keep only danmu containing the keyword
            all_dms += dms
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")

    print(f"Fetched {len(all_dms)} danmu!")
    return all_dms
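
# Note on get_danmu: the list.so endpoint returns XML in which every comment is
# wrapped as <d p="...attributes...">comment text</d>; the regex above captures
# both groups, and only the text part (d[1]) is kept after keyword filtering.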


def save_danmu(bv: str):
    """Append a video's filtered danmu to a text file."""
    info = get_info(bv)
    danmu = get_danmu(info)
    with open(f"./{KEYWORD}弹幕.txt", "a", encoding="utf-8") as fout:
        for dm in danmu:
            fout.write(dm + "\n")


def main():
    """Main entry point: crawl the search results, then the danmu of each video."""
    pool = Pool(WORKERS)  # multiprocessing.dummy.Pool is a thread pool, not a process pool
    htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))
    bvs = []

    for html in htmls:
        bvs.extend(get_bvs(html))

    # Limit to the first 300 videos
    bvs = bvs[:300]

    # Crawl the danmu
    for bv in tqdm(bvs, desc="Crawling danmu"):
        save_danmu(bv)


if __name__ == "__main__":
    main()
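

# Usage sketch (hypothetical file name; assumes requests and tqdm are installed
# and a real cookie has been pasted into HEADERS):
#   python bilibili_danmu.py
# Filtered danmu are appended to ./{KEYWORD}弹幕.txt in the working directory.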