You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pzrxqba79 413b7721e7
Update README.md
11 months ago
README.md Update README.md 11 months ago

README.md

import re
from multiprocessing.dummy import Pool

import requests
from tqdm import tqdm

# Configuration constants
KEYWORD = "2024 巴黎奥运会"  # search keyword; also used in the output file name
DANMU_KEYWORD = "AI"  # only danmu containing this substring are kept
PAGENUM = 10  # number of search-result pages to crawl
WORKERS = 6  # number of worker threads in the pool

# HTTP request headers
HEADERS = {
    "cookie": "your_cookie_here",  # replace with a real cookie
    "origin": "https://www.bilibili.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "referer": "https://t.bilibili.com/?spm_id_from=333.337.0.0",
}

def get_search_results_html(page: int) -> str:
    """Fetch the HTML of one search-results page for ``KEYWORD``.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The response body as text, or an empty string if the request
        failed (the error is printed, not raised).
    """
    url = f"https://search.bilibili.com/all?keyword={KEYWORD}&order=click&page={page}"
    try:
        # A timeout keeps a stalled connection from blocking a pool worker
        # forever; a timeout error is a RequestException and is handled below.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching page {page}: {e}")
        return ""

def get_bvs(html: str) -> list:
    """Extract video BV ids from search-page HTML.

    Args:
        html: Page source to scan; may be empty.

    Returns:
        Every value found in a ``bvid:"..."`` fragment, in document order.
        Duplicates are kept. Returns an empty list when nothing matches.
    """
    return re.findall(r'bvid:"([^"]+)"', html)

def get_info(vid: str) -> dict:
    """Fetch a video's title, danmu count, part count and part cids.

    Args:
        vid: BV id of the video, sent as the ``bvid`` query parameter.

    Returns:
        A dict with the keys ``"标题"``, ``"总弹幕数"``, ``"视频数量"`` and
        ``"cid"`` (a list with one cid per entry of the response's
        ``pages``). Returns an empty dict if the request fails or the
        response does not have the expected structure.
    """
    url = f"https://api.bilibili.com/x/web-interface/view/detail?bvid={vid}"
    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        view = response.json()["data"]["View"]
        return {
            "标题": view["title"],
            "总弹幕数": view["stat"]["danmaku"],
            "视频数量": view["videos"],
            "cid": [page["cid"] for page in view["pages"]],
        }
    except requests.RequestException as e:
        print(f"Error fetching info for vid {vid}: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # Missing keys, a null "data" field, or a non-JSON body: skip this
        # video instead of aborting the caller's loop.
        print(f"Unexpected response for vid {vid}: {e!r}")
    return {}

def get_danmu(info: dict) -> list:
    """Download the danmu of every part of a video and filter them.

    Args:
        info: Video info as returned by ``get_info``; only the ``"cid"``
            list is read. A dict without that key yields no danmu.

    Returns:
        The text of every danmu that contains ``DANMU_KEYWORD``, across all
        cids. Parts whose request fails are skipped after printing the error.
    """
    all_dms = []
    for cid in info.get("cid", []):
        url = f"https://api.bilibili.com/x/v1/dm/list.so?oid={cid}"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            response.encoding = "utf-8"
            # The pattern had lost its tags and was left as '(.*?)', which
            # only matches empty strings and made d[1] raise IndexError.
            # Restored to capture (attributes, text) of <d p="...">text</d>.
            # NOTE(review): element format is assumed from the two-group
            # d[1] indexing; confirm against a live response.
            data = re.findall(r'<d p="(.*?)">(.*?)</d>', response.text)
            # Keep only the danmu whose text contains the keyword.
            all_dms += [d[1] for d in data if DANMU_KEYWORD in d[1]]
        except requests.RequestException as e:
            print(f"Error fetching danmu for cid {cid}: {e}")

    print(f"获取弹幕{len(all_dms)}条!")
    return all_dms

def save_danmu(bv: str):
    """Fetch the filtered danmu of one video and append them to a text file.

    The output file is ``./{KEYWORD}弹幕.txt``. It is opened in append mode,
    so danmu from successive videos accumulate, one per line.

    Args:
        bv: BV id of the video to process.
    """
    info = get_info(bv)
    danmu = get_danmu(info)
    with open(f"./{KEYWORD}弹幕.txt", "a", encoding="utf-8") as fout:
        fout.writelines(f"{dm}\n" for dm in danmu)

def main():
    """Crawl the search pages, collect BV ids, and save each video's danmu.

    Search pages 1..PAGENUM are fetched concurrently with a thread pool of
    WORKERS threads. Danmu are then downloaded one video at a time.
    """
    # The context manager shuts the pool down once the pages are fetched;
    # previously the pool was never closed.
    with Pool(WORKERS) as pool:
        htmls = pool.map(get_search_results_html, range(1, PAGENUM + 1))

    bvs = []
    for html in htmls:
        bvs.extend(get_bvs(html))

    # Limit to the first 300 videos.
    bvs = bvs[:300]

    # Crawl the danmu.
    for bv in tqdm(bvs, desc="正在爬取弹幕"):
        save_danmu(bv)

# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()