|
|
|
|
import requests
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
import re
|
|
|
|
|
import time
|
|
|
|
|
import json
|
|
|
|
|
import subprocess
|
|
|
|
|
import os
|
|
|
|
|
import pprint
|
|
|
|
|
from urllib import parse
|
|
|
|
|
# Browser-like headers used by get_music1() and savemusic().
headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}

# Bilibili page address; it is only used below as the Referer value.
url = 'https://www.bilibili.com/video/BV1Fy4y1D7XS?t=798'

# Headers used by send_request(): a Firefox User-Agent plus the Referer above.
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Referer': url,
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Print the feature menu, read one choice and run the matching scraper.

    Choices '1' to '4' start the picture, NetEase music, KuGou music and
    video features respectively. Any other input does nothing.
    """
    menu_lines = (
        "1.爬取图片",
        "2.下载五月天的歌曲",
        "3.爬取酷狗音乐歌曲",
        "4.爬取视频",
    )
    for menu_line in menu_lines:
        print(menu_line)

    selection = input("请输入你想要的功能:")
    if selection == '1':
        BeautifulPicture().get_pic()
        return
    if selection == '2':
        get_music1()
        return
    if selection == '3':
        get_music2()
        return
    if selection == '4':
        get_video()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
def get_video():
    """Download one fixed Bilibili video and merge its audio and video files.

    The page is fetched with send_request(), the title and stream URLs are
    extracted with get_video_data(), both streams are written by save_data()
    and the two resulting files are handed to video_add_mp4().
    """
    html_data = send_request('https://www.bilibili.com/video/BV1w44y1r79C?t=29').text

    # Take the title from the returned data instead of relying on the
    # module-level global that get_video_data() sets as a side effect.
    name, audio_url, video_url = get_video_data(html_data)
    save_data(name, audio_url, video_url)
    print(name)

    # save_data() writes "<name>.mp4" and "<name>.mp3" relative to the current
    # working directory, so refer to those same files here rather than to a
    # hard-coded absolute directory that may not be where they were written.
    video_path = name + '.mp4'
    audio_path = name + '.mp3'
    video_add_mp4(video_path, audio_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def send_request(url, timeout=30):
    """Send a GET request with the module-level ``headers_2``.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the program
            indefinitely. ``None`` disables the timeout.

    Returns:
        The ``requests.Response`` object returned by ``requests.get``.
    """
    return requests.get(url=url, headers=headers_2, timeout=timeout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_video_data(html_data):
    """Extract the title and first audio/video stream URLs from a video page.

    Args:
        html_data: HTML text of the page.

    Returns:
        list: ``[title, audio_url, video_url]``. ``title`` is the content of
        the ``<title data-vue-meta="true">`` element up to the first
        underscore. The URLs are the first ``backupUrl`` entry of the first
        audio and first video item under ``data.dash`` in the
        ``window.__playinfo__`` JSON.

    Raises:
        IndexError: If the title element or the ``window.__playinfo__``
            script is not present, or a stream list is empty.
        KeyError: If the playinfo JSON lacks one of the expected keys.
        json.JSONDecodeError: If the playinfo payload is not valid JSON.
    """
    # The title is also published as a module-level global because code in
    # this module has read it from there.
    global title

    title_match = re.search(r'<title data-vue-meta="true">(.*?)</title>', html_data)
    if title_match is None:
        raise IndexError('video title element not found in page')
    title = title_match.group(1).split('_')[0]

    playinfo_match = re.search(r'<script>window\.__playinfo__=(.*?)</script>', html_data)
    if playinfo_match is None:
        raise IndexError('window.__playinfo__ script not found in page')
    json_data = playinfo_match.group(1)
    print(json_data)
    json_data = json.loads(json_data)

    dash = json_data['data']['dash']
    audio_url = dash['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)
    video_url = dash['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)

    return [title, audio_url, video_url]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_data(file_name, audio_url, video_url):
    """Download the audio and video streams and write them to disk.

    Both downloads go through send_request() and complete before anything is
    written. The audio bytes go to ``file_name + '.mp3'`` and the video bytes
    to ``file_name + '.mp4'``; each "saving" message is printed after the
    corresponding file has been written.

    Args:
        file_name: Output path without extension.
        audio_url: Address of the audio stream.
        video_url: Address of the video stream.
    """
    print('正在请求音频数据')
    audio_bytes = send_request(audio_url).content
    print('正在请求视频数据')
    video_bytes = send_request(video_url).content

    audio_path = file_name + '.mp3'
    with open(audio_path, mode='wb') as audio_file:
        audio_file.write(audio_bytes)
    print('正在保存音频数据')

    video_path = file_name + '.mp4'
    with open(video_path, mode='wb') as video_file:
        video_file.write(video_bytes)
    print('正在保存视频数据')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def video_add_mp4(file_name, mp4_file):
    """Merge two media files with ffmpeg, copying both streams unchanged.

    The output is written next to ``file_name`` as ``<stem>-new.mp4``, where
    the stem is ``file_name`` without its final extension.

    Args:
        file_name: Second ffmpeg input; also determines the output name.
        mp4_file: First ffmpeg input. Despite the parameter name, get_video()
            passes the ``.mp3`` audio file here.
    """
    print("视频开始合成")

    # splitext() removes only the last extension; splitting on the first '.'
    # would truncate any path or title that contains another dot.
    outfile_name = os.path.splitext(file_name)[0] + '-new.mp4'

    # An argument list (no shell) keeps file names taken from web page titles
    # from being interpreted by the shell, whatever characters they contain.
    cmd = [
        'D:\\ffmpeg\\bin\\ffmpeg',
        '-i', mp4_file,
        '-i', file_name,
        '-acodec', 'copy',
        '-vcodec', 'copy',
        outfile_name,
    ]
    try:
        return_code = subprocess.call(cmd)
    except OSError as err:
        # Raised when the ffmpeg executable cannot be started at all.
        print('视频合成失败:', err)
        return
    if return_code != 0:
        print('视频合成失败, ffmpeg 返回码:', return_code)
        return

    print("视频合成结束")
|
|
|
|
|
#####################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
def get_music2():
    """Interactively search KuGou for a song and download the chosen hit.

    Prompts for a keyword, pretty-prints the numbered result list, then
    prompts for an index (converted with ``int``) and saves that song.
    """
    keyword = input('请输入你想下载的歌曲:')
    kugou = KuGou(keyword)
    pprint.pprint(kugou.get_song_list())

    selected_index = int(input('请输入你想下载歌曲的序号:'))
    kugou.save_song(selected_index)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KuGou(object):
    """Search KuGou for a keyword and download one of the matching songs."""

    def __init__(self, name):
        """Store the search keyword.

        Args:
            name: Keyword sent to the KuGou song search.
        """
        self.Name = name
        # (keyword, response text) of the most recent search, so the list,
        # hash and album lookups share one request instead of three.
        self._search_cache = None

    # Fetch page content
    def get_reponse(self, url):
        """GET ``url`` with KuGou-specific headers after a 2 second pause.

        Args:
            url: Address to request.

        Returns:
            The ``requests.Response`` returned by ``requests.get``.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'cookie': '',
            'referer': 'https://www.kugou.com/yy/html/search.html'
        }  # carries the three fields user-agent, cookie and referer
        time.sleep(2)  # added just for fun
        response = requests.get(url=url, headers=headers)
        return response

    # Build the search URL and fetch the result.
    def get_song_info(self):
        """Return the raw search response text for ``self.Name``.

        The text is cached per keyword, so repeated calls for the same
        keyword do not repeat the request or the sleeps.

        Returns:
            str: Response body decoded as UTF-8.
        """
        # getattr() tolerates instances created without running __init__.
        cached = getattr(self, '_search_cache', None)
        if cached is not None and cached[0] == self.Name:
            return cached[1]

        song_name = parse.quote(self.Name)
        url = 'https://songsearch.kugou.com/song_search_v2?callback=jQuery112406923427025623534_1585454373397&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1585454373399'.format(
            song_name)
        print('----get song info:', url)
        time.sleep(2)
        response = self.get_reponse(url=url).content.decode('utf-8')
        self._search_cache = (self.Name, response)
        return response

    # Get the list of songs
    def get_song_list(self):
        """Return the search hits as ``"<index>、<file name>"`` strings.

        The ``<em>...<\\/em>`` highlight around the keyword is removed from
        each ``FileName`` value found in the search response.
        """
        data_info = self.get_song_info()
        file_name = re.findall('"FileName":"(.*?)"', data_info)
        highlighted = '<em>{}<\\/em>'.format(self.Name)
        plain = '{}'.format(self.Name)
        return [
            str(index) + '、' + songs.replace(highlighted, plain)
            for index, songs in enumerate(file_name)
        ]

    def get_song_hash(self):
        """Return every ``FileHash`` value found in the search response."""
        data_info = self.get_song_info()
        song_hash = re.findall('"FileHash":"(.*?)"', data_info)
        return song_hash

    def get_song_album(self):
        """Return every ``AlbumID`` value found in the search response."""
        data_info = self.get_song_info()
        song_album = re.findall('"AlbumID":"(.*?)"', data_info)
        return song_album

    def save_song(self, choice):
        """Download search hit number ``choice`` to ``<audio_name>.mp3``.

        Args:
            choice: Index into the hash and album lists of the search result.

        Raises:
            IndexError: If ``choice`` is out of range, or the play-data
                response contains no ``play_url`` or ``audio_name`` field.
        """
        song_hash = self.get_song_hash()
        album_id = self.get_song_album()
        url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191017845313983799804_1591241136863&hash={0}&album_id={1}&dfid=0j14jN41N6PP0q6mOr1iALP1&mid=be4d7c2fb6112a816b8dece9812cdfc8&platid=4&_=1591241136865'.format(
            song_hash[choice], album_id[choice])
        response = self.get_reponse(url=url).content.decode('unicode_escape')

        pattern = re.compile('"play_url":"(.*?)"')
        song_info = re.findall(pattern, response)[0]
        audio_name = re.findall('"audio_name":"(.*?)"', response)[0]

        # The URL arrives with JSON-escaped slashes ("\/"); undo the escaping.
        song_url = song_info.replace('\\/', '/')
        data = self.get_reponse(url=song_url).content
        with open('{}.mp3'.format(audio_name), 'wb') as f:
            f.write(data)
        print('你已经下载:%s' % audio_name)
|
|
|
|
|
#####################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
def get_music1():
    """Fetch the NetEase artist page for a fixed artist id and download its songs.

    The page HTML is handed to get_id(), which extracts the song links and
    saves each song. The menu in main() labels this feature as downloading
    Mayday (五月天) songs; presumably artist id 13193 is that artist.
    """
    ID = '13193'
    artist_url = 'https://music.163.com/artist?id=' + ID

    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is ``params``, which would put the User-Agent into the
    # query string instead of sending it as a request header.
    html = requests.get(artist_url, headers=headers_1).text
    get_id(html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_id(html):
    """Parse an artist page and download every song linked on it.

    The artist name is taken from the ``<h2 id="artist-name" ...>`` element
    and passed to creat(). Each ``/song?id=...`` link then yields an
    (id, title) pair that is passed to savemusic(), with a 0.5 second pause
    after every song.

    Args:
        html: HTML text of the artist page.
    """
    name_pattern = re.compile(r'<h2 id="artist-name" data-rid=\d* class="sname f-thide sname-max" title=".*?">(.*?)</h2>')
    link_pattern = re.compile(r'<a href="/song\?id=(\d*)">(.*?)</a></li><li>')

    singername = name_pattern.findall(html)[0]
    creat(singername)

    for song_id, song_title in link_pattern.findall(html):
        savemusic(song_title, song_id)
        time.sleep(0.5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def creat(singername):
    """Ensure a directory named ``singername`` exists and change into it.

    Args:
        singername: Directory name, relative to the current working directory
            unless it is an absolute path.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # exist_ok=True replaces the separate exists()/mkdir() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(singername, exist_ok=True)
    os.chdir(singername)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def savemusic(name, id):
    """Download one NetEase song and save it as ``<name>.m4a``.

    The file is written to the current working directory. Characters that
    Windows does not allow in file names are replaced by ``_`` in the file
    name only; the printed messages show the original title.

    Args:
        name: Song title, used for the messages and the file name.
        id: Song id as a string; it is concatenated into the download URL.
    """
    url = 'http://music.163.com/song/media/outer/url?id=' + id + '.mp3'
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)

    print('歌曲《', name, '》 下载中***************')
    # Download before opening the file so a failed request does not leave an
    # empty file behind.
    song_bytes = requests.get(url=url, headers=headers_1).content
    with open(safe_name + '.m4a', 'wb') as f:
        f.write(song_bytes)

    print("《", name, "》下载完成")
    print('')
|
|
|
|
|
#####################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#####################################################################
|
|
|
|
|
class BeautifulPicture():
    """Download every ``<img>`` found on one web page into a local folder."""

    def __init__(self):
        """Set the request headers, the page to scrape and the target folder."""
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        self.web_url = 'http://www.nipic.com/'
        # Raw string: "\B" is not a valid escape sequence in a normal literal.
        self.folder_path = r'D:\BeautifulPicture'

    def get_pic(self):
        """Fetch the page, create and enter the folder, then save each image.

        Images are named ``图片<index>.jpg`` where the index is the position
        of the ``<img>`` tag on the page. Tags without a ``src`` attribute
        are skipped; relative and protocol-relative ``src`` values are
        resolved against ``self.web_url`` before being requested.
        """
        print('开始网页get请求')
        r = self.request(self.web_url)
        print('开始获取所有a标签')
        all_a = BeautifulSoup(r.text, 'lxml').find_all('img')
        print('开始创建文件夹')
        self.mkdir(self.folder_path)
        print('开始切换文件夹')
        os.chdir(self.folder_path)

        for index, tag in enumerate(all_a):
            src = tag.get('src')
            if not src:
                # Nothing to download for an <img> without a src attribute.
                continue
            print('a标签的src内容是:', src)
            img_url = parse.urljoin(self.web_url, src)
            self.save_img(img_url, '图片' + str(index))

    def save_img(self, url, name):
        """Download ``url`` and write it to ``<name>.jpg`` in the current folder.

        Args:
            url: Address of the image.
            name: File name without extension.
        """
        print('开始请求图片地址,过程会有点长...')
        img = self.request(url)
        file_name = name + '.jpg'
        print('开始保存图片')
        # 'wb' rather than 'ab': appending would glue a second download onto
        # a file left over from an earlier run and corrupt the image.
        with open(file_name, 'wb') as f:
            f.write(img.content)
        print(file_name, '图片保存成功!')

    def request(self, url):
        """GET ``url`` with ``self.headers``, print the address and return the response."""
        r = requests.get(url, headers=self.headers)
        print(url)
        return r

    def mkdir(self, path):
        """Create ``path`` (with parents) unless it already exists.

        Args:
            path: Directory path; surrounding whitespace is stripped first.
        """
        path = path.strip()
        if os.path.exists(path):
            print(path, '文件夹已经存在了,不再创建')
            return
        print('创建名字叫做', path, '的文件夹')
        os.makedirs(path)
        print('创建成功!')
|
|
|
|
|
#####################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Start the interactive menu only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|