import requests
from bs4 import BeautifulSoup
import re
import time
import json
import subprocess
import os
import pprint
from urllib import parse
# Desktop-Chrome User-Agent used by the image scraper and the NetEase downloader.
headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# bilibili video page; also reused as the Referer for bilibili stream requests.
url = 'https://www.bilibili.com/video/BV1Fy4y1D7XS?t=798'
# Firefox User-Agent plus Referer, required by bilibili's media CDN.
headers_2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Referer': url}
def main():
    """Show the interactive menu and dispatch to the chosen crawler."""
    print("1.爬取图片")
    print("2.下载五月天的歌曲")
    print("3.爬取酷狗音乐歌曲")
    print("4.爬取视频")
    fun = input("请输入你想要的功能:")
    if fun == '1':
        BeautifulPicture().get_pic()
    elif fun == '2':
        get_music1()
    elif fun == '3':
        get_music2()
    elif fun == '4':
        get_video()
#####################################################################
def get_video():
    """Download a hard-coded bilibili video's audio/video streams and merge them.

    Fetches the page, extracts the stream URLs, saves '<title>.mp3' and
    '<title>.mp4', then muxes them with ffmpeg.
    """
    html_data = send_request('https://www.bilibili.com/video/BV1w44y1r79C?t=29').text
    video_data = get_video_data(html_data)
    save_data(video_data[0], video_data[1], video_data[2])
    # BUG FIX: use the title returned by get_video_data() instead of relying
    # on the global 'title' it sets as a side effect.
    name = video_data[0]
    print(name)
    # Raw f-strings: the original plain strings contained invalid escape
    # sequences such as '\p' in the Windows path.
    video_path = rf'D:\python\B爬虫\web\{name}.mp4'
    audio_path = rf'D:\python\B爬虫\web\{name}.mp3'
    video_add_mp4(video_path, audio_path)
def send_request(url):
    """GET *url* with the bilibili headers (headers_2) and return the Response."""
    return requests.get(url=url, headers=headers_2)
def get_video_data(html_data):
    """Extract the title and audio/video stream URLs from a bilibili page.

    Returns [title, audio_url, video_url].  Also sets the module-level
    'title' global (kept for backward compatibility with get_video()).

    NOTE(review): the original regex literals were garbled in this copy
    (one was an unterminated string, one was empty); the patterns below were
    reconstructed from typical bilibili page markup — confirm against a
    live page.
    """
    global title
    # The page <title> looks like "VideoName_哔哩哔哩_bilibili"; keep the
    # part before the first underscore.
    res = r'<title[^>]*>(.*?)</title>'
    title = re.findall(res, html_data)[0]
    title = title.split('_')[0]
    # The stream manifest is embedded as: <script>window.__playinfo__={...}</script>
    json_data = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', html_data)[0]
    print(json_data)
    json_data = json.loads(json_data)
    audio_url = json_data['data']['dash']['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)
    video_url = json_data['data']['dash']['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)
    video_data = [title, audio_url, video_url]
    return video_data
def save_data(file_name, audio_url, video_url):
    """Fetch both streams and write '<file_name>.mp3' and '<file_name>.mp4'."""
    print('正在请求音频数据')
    audio_bytes = send_request(audio_url).content
    print('正在请求视频数据')
    video_bytes = send_request(video_url).content
    with open(file_name + '.mp3', mode='wb') as audio_file:
        audio_file.write(audio_bytes)
    print('正在保存音频数据')
    with open(file_name + '.mp4', mode='wb') as video_file:
        video_file.write(video_bytes)
    print('正在保存视频数据')
def video_add_mp4(file_name, mp4_file):
    """Mux the video stream (*file_name*) with the audio stream (*mp4_file*).

    Writes the merged result next to *file_name* as '<stem>-new.mp4' using a
    local ffmpeg install.
    """
    print("视频开始合成")
    # BUG FIX: the original split('.')[0] truncated at the FIRST dot anywhere
    # in the path; os.path.splitext strips only the extension.
    outfile_name = os.path.splitext(file_name)[0] + '-new.mp4'
    # List-argument form (no shell) avoids quoting/injection problems with
    # unusual characters in the video title.
    cmd = ["D:\\ffmpeg\\bin\\ffmpeg", "-i", mp4_file, "-i", file_name,
           "-acodec", "copy", "-vcodec", "copy", outfile_name]
    subprocess.call(cmd)
    print("视频合成结束")
#####################################################################
#####################################################################
def get_music2():
    """Search KuGou for a song name and download the result the user picks."""
    wanted = input('请输入你想下载的歌曲:')
    downloader = KuGou(wanted)
    pprint.pprint(downloader.get_song_list())
    index = int(input('请输入你想下载歌曲的序号:'))
    downloader.save_song(index)
class KuGou(object):
    """Minimal KuGou music client: search by name, list results, download one."""

    def __init__(self, name):
        self.Name = name  # song name to search for
        # Cached raw search response; see get_song_info().
        self._search_info = None

    def get_reponse(self, url):
        """GET *url* with browser-like headers and return the Response.

        (Method name keeps the original 'reponse' typo so callers don't break.)
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'cookie': '',
            'referer': 'https://www.kugou.com/yy/html/search.html'
        }  # carries user-agent, cookie and referer
        time.sleep(2)  # crude rate limiting
        response = requests.get(url=url, headers=headers)
        return response

    def get_song_info(self):
        """Return the decoded search-API response for self.Name.

        BUG FIX: the response is now cached, so get_song_list(),
        get_song_hash() and get_song_album() no longer each trigger a fresh
        network request (plus two sleeps) for the same query.
        """
        if self._search_info is None:
            song_name = parse.quote(self.Name)
            url = 'https://songsearch.kugou.com/song_search_v2?callback=jQuery112406923427025623534_1585454373397&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1585454373399'.format(
                song_name)
            print('----get song info:', url)
            time.sleep(2)
            self._search_info = self.get_reponse(url=url).content.decode('utf-8')
        return self._search_info

    def get_song_list(self):
        """Return a display list like ['0、name', '1、name', ...]."""
        data_info = self.get_song_info()
        file_name = re.findall('"FileName":"(.*?)"', data_info)
        song_name_list = []
        for index, songs in enumerate(file_name):
            # Strip the <em> highlight marker the API wraps around the keyword.
            song_name = songs.replace('{}<\\/em>'.format(self.Name), '{}'.format(self.Name))
            song_name_list.append(str(index) + '、' + song_name)
        return song_name_list

    def get_song_hash(self):
        """Return the FileHash of every search result, in result order."""
        data_info = self.get_song_info()
        song_hash = re.findall('"FileHash":"(.*?)"', data_info)
        return song_hash

    def get_song_album(self):
        """Return the AlbumID of every search result, in result order."""
        data_info = self.get_song_info()
        song_album = re.findall('"AlbumID":"(.*?)"', data_info)
        return song_album

    def save_song(self, choice):
        """Download search result number *choice* and save '<audio_name>.mp3'."""
        song_hash = self.get_song_hash()
        album_id = self.get_song_album()
        url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191017845313983799804_1591241136863&hash={0}&album_id={1}&dfid=0j14jN41N6PP0q6mOr1iALP1&mid=be4d7c2fb6112a816b8dece9812cdfc8&platid=4&_=1591241136865'.format(
            song_hash[choice], album_id[choice])
        response = self.get_reponse(url=url).content.decode('unicode_escape')
        pattern = re.compile('"play_url":"(.*?)"')
        song_info = re.findall(pattern, response)[0]
        audio_name = re.findall('"audio_name":"(.*?)"', response)[0]
        song_url = song_info.replace('\\/', '/')
        data = self.get_reponse(url=song_url).content
        with open('{}.mp3'.format(audio_name), 'wb') as f:
            f.write(data)
        print('你已经下载:%s' % audio_name)
#####################################################################
#####################################################################
def get_music1():
    """Crawl the NetEase artist page for Mayday (artist id 13193) and download songs."""
    ID = '13193'
    url = 'https://music.163.com/artist?id=' + ID
    # BUG FIX: headers_1 was passed positionally, which requests.get() binds
    # to its 'params' argument (query string) instead of the request headers.
    html = requests.get(url, headers=headers_1).text
    get_id(html)
def get_id(html):
    """Parse the artist page: create the artist folder, download each listed song.

    NOTE(review): the original regex literals were garbled in this copy (one
    was an unterminated raw string); the patterns below were reconstructed
    from typical NetEase artist-page markup — verify against a live page.
    """
    # Song anchors look like <a href="/song?id=123">SongName</a>;
    # group 1 is the numeric id, group 2 the song name.
    findlink = re.compile(r'<a href="/song\?id=(\d+)">(.*?)</a>')
    # Artist name heading, e.g. <h2 id="artist-name" ...>Name</h2>.
    findname = re.compile(r'<h2 id="artist-name"[^>]*>(.*?)</h2>')
    singername = re.findall(findname, html)[0]
    creat(singername)
    ll = re.findall(findlink, html)
    for i in ll:
        savemusic(i[1], i[0])  # (name, id)
        time.sleep(0.5)
def creat(singername):
    """Make sure a folder named after the artist exists, then cd into it."""
    if not os.path.exists(singername):
        os.mkdir(singername)
    os.chdir(singername)
def savemusic(name, id):
    """Download one NetEase song via the outer-url redirect and save '<name>.m4a'."""
    url = 'http://music.163.com/song/media/outer/url?id=' + id + '.mp3'
    with open(name + '.m4a', 'wb') as f:
        print('歌曲《', name, '》 下载中***************')
        f.write(requests.get(url=url, headers=headers_1).content)
        # BUG FIX: removed the redundant f.close() — the 'with' block
        # already closes the file on exit.
    print("《", name, "》下载完成")
    print('')
#####################################################################
#####################################################################
class BeautifulPicture():
    """Scrape the nipic.com front page and save every <img> tag's src as a JPEG."""

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        self.web_url = 'http://www.nipic.com/'
        # Raw string: avoids accidental escape sequences in the Windows path
        # (same runtime value as the original literal).
        self.folder_path = r'D:\BeautifulPicture'

    def get_pic(self):
        """Fetch the front page and download every image found on it."""
        print('开始网页get请求')
        r = self.request(self.web_url)
        print('开始获取所有a标签')
        all_a = BeautifulSoup(r.text, 'lxml').find_all('img')
        print('开始创建文件夹')
        self.mkdir(self.folder_path)
        print('开始切换文件夹')
        os.chdir(self.folder_path)
        # enumerate() replaces the hand-rolled i/x counter of the original.
        for i, a in enumerate(all_a):
            img_url = a['src']
            print('a标签的src内容是:', img_url)
            self.save_img(img_url, '图片' + str(i))

    def save_img(self, url, name):
        """Download *url* and save it as '<name>.jpg' in the current folder."""
        print('开始请求图片地址,过程会有点长...')
        img = self.request(url)
        file_name = name + '.jpg'
        print('开始保存图片')
        # BUG FIX: use a context manager and 'wb' — the original opened in
        # append mode ('ab') without 'with', which corrupted images on re-runs
        # and leaked the handle on error.
        with open(file_name, 'wb') as f:
            f.write(img.content)
        print(file_name, '图片保存成功!')

    def request(self, url):
        """GET *url* with this scraper's headers and return the Response."""
        r = requests.get(url, headers=self.headers)
        print(url)
        return r

    def mkdir(self, path):
        """Create *path* (including parents) if it does not already exist."""
        path = path.strip()
        if not os.path.exists(path):
            print('创建名字叫做', path, '的文件夹')
            os.makedirs(path)
            print('创建成功!')
        else:
            print(path, '文件夹已经存在了,不再创建')
#####################################################################
# Script entry point: run the interactive menu only when executed directly.
if __name__ == "__main__":
    main()