# K_class/code/bvid.py

"""
从B站搜索结果中提取视频的BV号，并将其保存到文件中
"""

import re
import requests
from common_headers import HEADERS  # 假设你有一个公共的header文件

def get_source(page_num, keyword='2024巴黎奥运会'):
    """Fetch one page of Bilibili video-search results as raw text.

    The query is passed through ``params=`` so that ``requests`` performs the
    URL encoding; this keeps keywords containing spaces, ``&`` or non-ASCII
    characters from corrupting the query string.

    Args:
        page_num: Page number sent as the ``page`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.
            Defaults to the term that was previously hard-coded.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or when the 10-second timeout expires.
    """
    search_url = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    # Same keys, values and order as the original hand-built query string.
    # Empty-string values are kept so they are still sent as ``key=``.
    query = {
        '__refresh__': 'true',
        '_extra': '',
        'context': '',
        'page': page_num,
        'page_size': 42,
        'from_source': '',
        'from_spmid': '333.337',
        'platform': 'pc',
        'highlight': 1,
        'single_column': 0,
        'keyword': keyword,
        'qv_id': 'zaOudcC1LJI0GehR81nuNQEKktKQ2aP1',
        'ad_resource': 5654,
        'source_tag': 3,
        'gaia_vtoken': '',
        'category_id': '',
        'search_type': 'video',
    }
    # NOTE(review): the HTTP status is not checked here; an error response
    # body is returned to the caller as-is.
    response = requests.get(
        url=search_url, params=query, headers=HEADERS, timeout=10
    )
    return response.text

# Compiled once at import time. The capture group excludes double quotes so a
# match can never run past the end of the bvid value into neighbouring fields.
_BVID_PATTERN = re.compile(r'"bvid":"([^"]+)","title":"')


def extract_bv(source_html):
    """Extract BV IDs from the raw search-response text.

    Only ``"bvid"`` values that are immediately followed by a ``"title"``
    key are returned, in the order they appear. Empty values are skipped.

    Args:
        source_html: Response text to scan (a string).

    Returns:
        A list of BV ID strings; empty when nothing matches.
    """
    return _BVID_PATTERN.findall(source_html)

def save_bv_to_file(bv_list, output_path='E:/Crawler/output/bv_numbers.txt'):
    """Append BV IDs to a UTF-8 text file, one per line.

    Args:
        bv_list: Iterable of BV ID strings.
        output_path: Destination file. Defaults to the location that was
            previously hard-coded. The file is opened in append mode, so
            repeated calls accumulate lines.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import os  # local import so this function does not rely on file-level imports

    # Create the target directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'a', encoding='utf-8') as f:
        f.writelines(bv + '\n' for bv in bv_list)

def main():
    """Collect BV IDs page by page and append each page's IDs to the file.

    Walks result pages 1 through 8 and stops early once at least 300 IDs
    have been gathered in total, then prints a completion message.
    """
    total_collected = 0
    page = 1
    while page <= 8:
        page_bvs = extract_bv(get_source(page))
        save_bv_to_file(page_bvs)
        total_collected += len(page_bvs)
        # Stop as soon as the running total reaches the 300-ID target.
        if total_collected >= 300:
            break
        page += 1
    print("BV号收集完成")

# Run the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()