"""Scrape Bilibili search results for a keyword and save matching danmaku.

The script fetches ``PAGENUM`` search-result pages for ``KEYWORD``, extracts
the BV ids found in them, looks up each video's cids, downloads the danmaku
XML for every cid, and appends the danmaku that contain ``DANMU_KEYWORD`` to
a text file named after ``KEYWORD``.
"""
import re
from multiprocessing.dummy import Pool

import requests
from tqdm import tqdm

# Configuration constants
KEYWORD = "2024 巴黎奥运会"
DANMU_KEYWORD = "AI"  # only danmaku containing this substring are kept
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # number of worker threads in the pool
MAX_VIDEOS = 300  # upper bound on the number of videos to crawl
REQUEST_TIMEOUT = 10  # seconds; prevents a stalled connection from hanging the run

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real cookie
    'origin': 'https://www.bilibili.com',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}

# One danmaku element in the XML reply looks like <d p="attributes">text</d>;
# group 1 is the attribute string, group 2 is the danmaku text.
_DANMU_PATTERN = re.compile(r'<d p="(.*?)">(.*?)</d>')


def get_search_results_html(page: int) -> str:
    """Return the HTML of one search-result page, or "" if the request fails.

    Args:
        page: 1-based index of the search-result page.
    """
    url = "https://search.bilibili.com/all"
    # Passing the query through ``params`` lets requests percent-encode the
    # keyword (it contains a space and non-ASCII characters).
    params = {"keyword": KEYWORD, "order": "click", "page": page}
    try:
        response = requests.get(
            url, params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""


def get_bvs(html: str) -> list:
    """Return every BV id that appears as ``bvid:"..."`` in *html*."""
    return re.findall(r'bvid:"([^"]+)"', html)


def get_info(vid: str) -> dict:
    """Return title, danmaku count, part count and cids for one video.

    Args:
        vid: BV id of the video.

    Returns:
        A dict with the keys "标题", "总弹幕数", "视频数量" and "cid" (a list
        with one cid per page of the video), or an empty dict when the
        request fails or the reply does not have the expected structure.
    """
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        view = response.json()["data"]["View"]
        return {
            "标题": view["title"],
            "总弹幕数": view["stat"]["danmaku"],
            "视频数量": view["videos"],
            "cid": [page["cid"] for page in view["pages"]],
        }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # Non-JSON body, missing keys, or "data" being null: skip this video
        # instead of aborting the whole crawl.
        print(f"Error parsing info for vid {vid}: {e}")
    return {}


def get_danmu(info: dict) -> list:
    """Return the danmaku texts containing ``DANMU_KEYWORD`` for one video.

    Args:
        info: Dict as returned by ``get_info``; only its "cid" list is read.
            A missing "cid" key yields an empty result.
    """
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
            continue
        # Force UTF-8 before decoding the body into text.
        response.encoding = "utf-8"
        all_dms.extend(
            text
            for _attrs, text in _DANMU_PATTERN.findall(response.text)
            if DANMU_KEYWORD in text
        )

    print(f"获取弹幕{len(all_dms)}条!")
    return all_dms


def save_danmu(bv: str):
    """Fetch the filtered danmaku of video *bv* and append them to the output file.

    The file is ``./{KEYWORD}弹幕.txt``, opened in append mode with UTF-8
    encoding; one danmaku is written per line.
    """
    info = get_info(bv)
    danmu = get_danmu(info)
    with open(f"./{KEYWORD}弹幕.txt", "a", encoding="utf-8") as fout:
        fout.writelines(f"{dm}\n" for dm in danmu)


def main():
    """Entry point: collect BV ids from the search pages, then crawl their danmaku."""
    # Search pages are fetched concurrently; the context manager releases the
    # worker threads once map() has returned.
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))

    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))

    # Drop repeated ids (order preserved) so no video is crawled and saved
    # twice, then cap the number of videos.
    bvs = list(dict.fromkeys(bvs))[:MAX_VIDEOS]

    # Crawl the danmaku video by video, with a progress bar.
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        save_danmu(bv)


if __name__ == "__main__":
    main()