import json  # json.loads decodes a JSON string into native Python data types
import os
import pprint
import re  # regular expressions, used for text matching
import requests
import xlwt  # used to write the Excel (.xls) output
# from bs4 import BeautifulSoup  # HTML parsing (not needed here)
from concurrent.futures import ThreadPoolExecutor

# API endpoint for Bilibili's "入站必看" (must-watch classics) popular list
search_url = 'https://api.bilibili.com/x/web-interface/popular/precious?page_size=100&page=1'

head = {  # browser-like request headers sent to the server
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36",
    'referer': 'https://www.bilibili.com/'  # referer header, needed to get past hotlink protection
}

response = requests.get(search_url, headers=head)  # fetch the list data
# pprint.pprint(response.json())  # pretty-print the raw JSON response
result_list = response.json()['data']['list']  # the video entries we need
# print(result_list)

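# For orientation, each entry of result_list is expected to look roughly like the sketch
# below. Only the keys this script actually reads are shown; the values are made-up
# placeholders, and the real API response carries many more fields.
# {
#     'bvid': 'BV1xx411c7mD',   # placeholder id; used later to build the video page URL
#     'title': '...',
#     'achievement': '...',
#     'desc': '...',
#     'owner': {'name': '...'},
#     'stat': {'view': 0, 'danmaku': 0, 'reply': 0,
#              'favorite': 0, 'coin': 0, 'share': 0},
# }
# (Optionally, response.raise_for_status() could be called before .json() above to fail
# loudly on HTTP errors.)
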
def download_mp3mp4(url):  # download the audio and video streams of one video, then merge them
    # index = result_list[0]
    res = requests.get(url, headers=head).text

    # video title taken from the <title> tag, then stripped of characters illegal in file names
    title = re.findall('<title data-vue-meta="true">(.*?)_哔哩哔哩_bilibili</title>', res)[0]
    title = re.sub(r'[\/:*?"<>|]', '', title)
    # the play-info JSON embedded in the page
    html_data = re.findall('<script>window.__playinfo__=(.*?)</script>', res)[0]
    print(title)

    # parse the JSON string into a dict
    json_data = json.loads(html_data)
    # pprint.pprint(json_data)  # pretty-print for inspection

    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = json_data['data']['dash']['video'][0]['baseUrl']
    # print(audio_url)
    # print(video_url)
    audio_content = requests.get(audio_url, headers=head).content
    video_content = requests.get(video_url, headers=head).content

    print('downloading audio...')
    with open(title + '.mp3', 'wb') as fp:
        fp.write(audio_content)
    print('downloading video...')
    with open(title + '.mp4', 'wb') as fp:
        fp.write(video_content)

    # merge the two streams with ffmpeg into "new<title>.mp4"
    n_mp4_n = 'new' + title.split('\\')[-1]
    n_mp4_f = title.replace(title.split('\\')[-1], n_mp4_n)
    com = f'D:\\pycharm\\ffmpeg\\ffmpeg-2022-04-18-git-d5687236ab-essentials_build\\bin\\ffmpeg.exe ' \
          f'-i "{title + ".mp3"}" -i "{title + ".mp4"}" ' \
          f'-acodec copy -vcodec copy "{n_mp4_f + ".mp4"}"'
    print(com)
    os.system(com)
    print('merge finished')
    os.remove(title + '.mp3')
    os.remove(title + '.mp4')

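# A minimal single-video smoke test of download_mp3mp4, assuming the ffmpeg path above
# matches your machine (adjust it otherwise). The BV id below is just a placeholder;
# substitute any bvid taken from result_list before uncommenting.
# download_mp3mp4('https://www.bilibili.com/video/BV1xx411c7mD')
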
# 2. extract the wanted fields from the API data
def getData(result_list):
    datalist = []  # one row (list of fields) per video
    for lis in result_list:
        data = []
        # print(lis)
        owner = lis['owner']['name']        # uploader (UP主) name
        data.append(owner)
        title = lis['title']                # video title
        data.append(title)
        achievement = lis['achievement']    # video achievement blurb
        data.append(achievement)
        desc = lis['desc']                  # video description
        desc = re.sub(r'\s+', '', desc)     # drop whitespace and newlines
        desc = re.sub('/', ' ', desc)
        # print(desc)
        data.append(desc)
        view = lis['stat']['view']          # play count
        data.append(view)
        danmaku = lis['stat']['danmaku']    # danmaku (bullet comment) count
        data.append(danmaku)
        reply = lis['stat']['reply']        # comment count
        data.append(reply)
        favorite = lis['stat']['favorite']  # favourite count
        data.append(favorite)
        coin = lis['stat']['coin']          # coin count
        data.append(coin)
        share = lis['stat']['share']        # share count
        data.append(share)
        datalist.append(data)
    # print(datalist)
    return datalist

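# Each row produced by getData lines up positionally with the header tuple in saveDATA
# below; a row is a plain list, e.g. (values illustrative, not real data):
# ['some uploader', 'some title', 'some achievement', 'some description',
#  1000000, 50000, 3000, 200000, 150000, 8000]
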
# 3. save the data to an Excel (.xls) file
def saveDATA(datalist, savepath):
    print('excel saving.....')
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)    # create the workbook
    worksheet = workbook.add_sheet('入站必看', cell_overwrite_ok=True)  # worksheet; cell_overwrite_ok=True lets cells be rewritten
    col = ('UP主昵称', '视频标题', '视频成就', '视频简介', '播放量', '弹幕数', '评论数', '收藏数', '投币数', '分享数')
    for i in range(10):
        worksheet.write(0, i, col[i])  # header row at (0, i)
    for i in range(len(datalist)):
        # print("第%d条" % (i + 1))  # progress output, for testing
        data = datalist[i]
        for j in range(10):
            worksheet.write(i + 1, j, data[j])  # one row per video
    workbook.save(savepath)
    print('excel saved')

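# Optional sanity check (not called anywhere): a small sketch that reads the saved .xls
# back and prints what was written. It assumes the xlrd package is installed, which the
# rest of this script does not require.
def check_saved_xls(savepath):
    import xlrd  # xlrd still reads legacy .xls workbooks
    book = xlrd.open_workbook(savepath)
    sheet = book.sheet_by_index(0)
    print(sheet.nrows, 'rows (including the header)')
    print(sheet.row_values(0))  # the header row
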
if __name__ == '__main__':
    # 1. scrape the list page https://www.bilibili.com/v/popular/history
    datalist = getData(result_list)
    # print(datalist)
    savepath = '入B站必看.xls'
    # save the data
    saveDATA(datalist, savepath)

    # download and merge every video in the list using a small thread pool
    with ThreadPoolExecutor(3) as t:
        for index in result_list:
            t.submit(download_mp3mp4, f'https://www.bilibili.com/video/{index["bvid"]}')
    print('all downloads finished')