You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

55 lines
1.7 KiB

"""
Extract the BV IDs of videos from Bilibili search results and save them to a file.
"""
import re
import requests
from common_headers import HEADERS
from concurrent.futures import ThreadPoolExecutor
def get_source(page_num, keyword='2024巴黎奥运会'):
    """Fetch one page of Bilibili video search results and return the raw body.

    Args:
        page_num: Page index substituted into the ``page`` query parameter.
        keyword: Search term substituted into the ``keyword`` query
            parameter. Defaults to the query this script was written for.
            It is percent-encoded so characters such as ``&`` or ``#``
            cannot corrupt the rest of the query string.

    Returns:
        The response body as text (``response.text``). The HTTP status code
        is not inspected here, so an error response is returned as-is.

    Raises:
        requests.RequestException: Propagated from ``requests.get``, e.g. on
            connection failure or when the 10 second timeout expires.
    """
    # Local import keeps this function self-contained; the module header
    # does not import urllib.
    from urllib.parse import quote

    get_url = (
        'https://api.bilibili.com/x/web-interface'
        '/wbi/search/type?__refresh__=true&_extra=&'
        f'context=&page={page_num}&page_size=42&from_source=&from_spmid=333.337&'
        f'platform=pc&highlight=1&single_column=0&keyword={quote(keyword, safe="")}'
        '&qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1&ad_resource=5654'
        '&source_tag=3&gaia_vtoken=&category_id=&search_type=video'
    )
    response = requests.get(url=get_url, headers=HEADERS, timeout=10)
    return response.text
# Matches a non-empty "bvid" value that is immediately followed by a "title"
# key. The capture is limited to non-quote characters so it can never run
# past the end of the bvid string into neighbouring fields.
_BVID_PATTERN = re.compile(r'"bvid":"([^"]+)","title":"')


def extract_bv(source_html):
    """Extract BV IDs from the raw text of a search response.

    Args:
        source_html: Response body text as returned by ``get_source``.

    Returns:
        A list of BV ID strings in order of appearance (duplicates kept).
        Only ``"bvid"`` values directly followed by a ``"title"`` key are
        returned; empty values are skipped. Returns ``[]`` if none match.
    """
    return _BVID_PATTERN.findall(source_html)
def save_bv_to_file(bv_list, output_path='/output/bv_numbers.txt'):
    """Append BV IDs to a text file, one per line.

    Args:
        bv_list: Iterable of BV ID strings to write.
        output_path: Destination file. Defaults to the path this script
            originally hard-coded. The file is opened in append mode, so
            existing content is preserved and repeated calls accumulate.

    Raises:
        OSError: If the file cannot be opened (for example, the parent
            directory does not exist) or written.
    """
    with open(output_path, 'a', encoding='utf-8') as f:
        # One buffered call instead of a write per item.
        f.writelines(bv + '\n' for bv in bv_list)
def process_pages(page_range):
    """Fetch several search result pages concurrently.

    Each page number in ``page_range`` is handed to ``get_source`` on a
    thread pool with the default worker count.

    Returns:
        A list of response bodies, in the same order as ``page_range``.
    """
    with ThreadPoolExecutor() as pool:
        # map() yields results in input order; collect them all before the
        # pool shuts down.
        page_bodies = [body for body in pool.map(get_source, page_range)]
    return page_bodies
def main():
    """Fetch pages 1-8 of the search results and save the BV IDs found.

    All pages are downloaded up front; they are then processed in order,
    and processing stops once at least 300 IDs have been written.
    """
    collected = 0
    for page_body in process_pages(range(1, 9)):
        page_bvs = extract_bv(page_body)
        save_bv_to_file(page_bvs)
        collected = collected + len(page_bvs)
        # Stop once the running total reaches the target of 300 IDs.
        if not collected < 300:
            break
    print("BV号收集完成")
# Run the collector only when executed as a script, not when imported.
if __name__ == "__main__":
    main()