You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
|
"""
|
|
|
|
|
从B站搜索结果中提取视频的BV号,并将其保存到文件中
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
import requests
|
|
|
|
|
from common_headers import HEADERS
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
|
|
|
def get_source(page_num):
    """Fetch one page of Bilibili video-search results and return the raw text.

    Args:
        page_num: Page index interpolated into the ``page`` query parameter.

    Returns:
        The response body as text (``response.text``).

    The search keyword and the other query parameters are fixed; only the
    page number varies. The request is sent with the shared ``HEADERS`` and
    a 10-second timeout.
    """
    endpoint = 'https://api.bilibili.com/x/web-interface/wbi/search/type'
    # Fragments are kept in the exact order the API URL was originally
    # written in, so the joined query string is unchanged.
    query_parts = [
        '__refresh__=true',
        '_extra=',
        'context=',
        f'page={page_num}',
        'page_size=42',
        'from_source=',
        'from_spmid=333.337',
        'platform=pc',
        'highlight=1',
        'single_column=0',
        'keyword=2024巴黎奥运会',
        'qv_id=zaOudcC1LJI0GehR81nuNQEKktKQ2aP1',
        'ad_resource=5654',
        'source_tag=3',
        'gaia_vtoken=',
        'category_id=',
        'search_type=video',
    ]
    search_url = endpoint + '?' + '&'.join(query_parts)
    reply = requests.get(url=search_url, headers=HEADERS, timeout=10)
    return reply.text
|
|
|
|
|
|
|
|
|
|
def extract_bv(source_html):
    """Extract BV IDs from the raw text of a search response.

    An ID is captured only when a ``"bvid":"<id>"`` field is immediately
    followed by a ``"title"`` field, which is the shape this scraper looks
    for in the response text.

    Args:
        source_html: Response text to scan.

    Returns:
        A list of the captured ID strings, in order of appearance. Empty
        IDs are skipped.
    """
    # ``[^"]+`` keeps the capture inside a single quoted value. A lazy
    # ``.*?`` would run past the closing quote of a bvid that is not
    # followed by "title" and swallow text up to the next entry that is.
    return re.findall(r'"bvid":"([^"]+)","title":"', source_html)
|
|
|
|
|
|
|
|
|
|
def save_bv_to_file(bv_list, output_path='/output/bv_numbers.txt'):
    """Append BV IDs to a text file, one ID per line.

    Args:
        bv_list: Iterable of BV ID strings to write.
        output_path: File to append to. Defaults to the path this script
            has always written to, so existing callers are unaffected.

    The file is opened in append mode with UTF-8 encoding, so repeated
    calls accumulate lines instead of overwriting earlier ones. The file is
    opened (and therefore created) even when ``bv_list`` is empty.
    """
    with open(output_path, 'a', encoding='utf-8') as out_file:
        out_file.writelines(bv + '\n' for bv in bv_list)
|
|
|
|
|
|
|
|
|
|
def process_pages(page_range):
    """Fetch several result pages concurrently.

    Args:
        page_range: Iterable of page numbers; each one is passed to
            ``get_source``.

    Returns:
        A list with one ``get_source`` result per page number, in the same
        order as ``page_range``.
    """
    with ThreadPoolExecutor() as pool:
        # list() drains the map iterator, so every fetch has completed (or
        # raised) before the pool is shut down.
        return list(pool.map(get_source, page_range))
|
|
|
|
|
|
|
|
|
|
def main():
    """Fetch pages 1-8, then extract and save BV IDs page by page.

    All pages are downloaded first. Their IDs are then written one page at
    a time, and writing stops after the first page that brings the running
    total to 300 or more.
    """
    page_texts = process_pages(range(1, 9))

    saved_total = 0
    for page_text in page_texts:
        found = extract_bv(page_text)
        save_bv_to_file(found)
        saved_total += len(found)
        # The whole page is saved before this check, so the final count
        # may exceed 300.
        if saved_total >= 300:
            break
    print("BV号收集完成")
|
|
|
|
|
|
|
|
|
|
# Run the collector only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
|