You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pzrxqba79 413b7721e7
Update README.md
2 months ago
README.md Update README.md 2 months ago

README.md

import re
from multiprocessing.dummy import Pool

import requests
from tqdm import tqdm

# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search keyword; also used in the output file name
DANMU_KEYWORD = "AI"  # only danmaku containing this substring are kept
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # number of worker threads in the pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}

def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one Bilibili search-result page for ``KEYWORD``.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The response body as text, or an empty string if the request fails
        (the error is printed, not raised).
    """
    url = "https://search.bilibili.com/all"
    # Let requests build and percent-encode the query string; KEYWORD
    # contains a space and non-ASCII characters.
    params = {"keyword": KEYWORD, "order": "click", "page": page}
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the worker thread (and pool.map) indefinitely.
        response = requests.get(url, params=params, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""
    return response.text

def get_bvs(html: str) -> list:
    """Extract the BV ids embedded in a search-result page.

    Args:
        html: Page source to scan for ``bvid:"..."`` occurrences.

    Returns:
        The distinct BV id strings in order of first appearance; an empty
        list when the page contains none (e.g. for an empty string).
    """
    # dict.fromkeys drops repeats while keeping first-seen order, so the
    # same video is not crawled (and its danmaku written) more than once.
    return list(dict.fromkeys(re.findall(r'bvid:"([^"]+)"', html)))

def get_info(vid: str) -> dict:
    """Fetch basic metadata for a video, such as its title and danmaku count.

    Args:
        vid: BV id of the video, sent as the ``bvid`` query parameter.

    Returns:
        A dict with the keys "标题" (title), "总弹幕数" (total danmaku
        count), "视频数量" (number of videos) and "cid" (list of the cid of
        every entry in ``pages``). An empty dict is returned when the request
        fails or the response does not have the expected structure; the
        problem is printed, not raised.
    """
    url = "https://api.bilibili.com/x/web-interface/view/detail"
    try:
        # Same headers as the search request, for consistency; the timeout
        # keeps one stalled connection from hanging the whole crawl.
        response = requests.get(
            url, params={"bvid": vid}, headers=HEADERS, timeout=10
        )
        response.raise_for_status()
        view = response.json()["data"]["View"]
        return {
            "标题": view["title"],
            "总弹幕数": view["stat"]["danmaku"],
            "视频数量": view["videos"],
            "cid": [page["cid"] for page in view["pages"]],
        }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # Missing/None fields or a non-JSON body: skip this video instead of
        # letting the exception abort the remaining downloads.
        print(f"Unexpected response for vid {vid}: {e!r}")
    return {}

def get_danmu(info: dict) -> list:
    """Download the danmaku of a video and keep those matching the filter.

    Args:
        info: Metadata dict as returned by ``get_info``; only its "cid" list
            is read. A missing "cid" key means no requests are made.

    Returns:
        The texts of all danmaku containing ``DANMU_KEYWORD``, collected
        across every cid. Cids whose request fails are skipped after the
        error is printed.
    """
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            # Same headers as the search request, for consistency; the
            # timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")
            continue
        response.encoding = "utf-8"
        # Each danmaku is an XML element <d p="attributes">text</d>. Two
        # groups are captured so that d[1] below is the text; the bare
        # '(.*?)' pattern yielded empty strings and d[1] raised IndexError.
        data = re.findall(r'<d p="(.*?)">(.*?)</d>', response.text)
        all_dms += [d[1] for d in data if DANMU_KEYWORD in d[1]]

    print(f"获取弹幕{len(all_dms)}条!")
    return all_dms

def save_danmu(bv: str):
    """Fetch the filtered danmaku of one video and append them to a text file.

    Args:
        bv: BV id of the video to process.

    The output file is named after ``KEYWORD`` and opened in append mode, so
    successive calls accumulate into the same file, one danmaku per line.
    """
    danmu = get_danmu(get_info(bv))
    with open(f"./{KEYWORD}弹幕.txt", "a", encoding="utf-8") as fout:
        fout.writelines(dm + "\n" for dm in danmu)

def main():
    """Entry point: crawl the search pages, then save each video's danmaku.

    Search-result pages 1..PAGENUM are fetched concurrently with a thread
    pool; the BV ids found in them are then processed one by one so that
    writes to the output file never interleave.
    """
    # The context manager shuts the pool down once the pages are fetched;
    # previously the worker threads were never released.
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))

    bvs = [bv for html in htmls for bv in get_bvs(html)]

    # Drop ids repeated across pages (first-seen order preserved) so that no
    # video is crawled twice, then limit to the first 300 videos.
    bvs = list(dict.fromkeys(bvs))[:300]

    # Download the danmaku
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        save_danmu(bv)

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()