# NOTE: removed hosting-site page chrome ("topics must start with a letter...",
# "250 lines", "9.1 KiB") that was accidentally captured along with the source.
import requests
from bs4 import BeautifulSoup
import re
import time
import json
import subprocess
import os
import pprint
from urllib import parse
# Desktop-Chrome User-Agent used for the NetEase-music requests (get_music1 / savemusic).
headers_1 = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# Bilibili video page URL; reused as the Referer value in headers_2 below.
url = 'https://www.bilibili.com/video/BV1Fy4y1D7XS?t=798'
# Firefox User-Agent plus Referer, sent with every bilibili request (see send_request).
headers_2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0', 'Referer': url}
def main():
    """Show the feature menu and dispatch to the selected downloader."""
    print("1.爬取图片")
    print("2.下载五月天的歌曲")
    print("3.爬取酷狗音乐歌曲")
    print("4.爬取视频")
    fun = input("请输入你想要的功能:")
    if fun == '1':
        # Picture scraping lives on the BeautifulPicture class.
        BeautifulPicture().get_pic()
    elif fun == '2':
        get_music1()
    elif fun == '3':
        get_music2()
    elif fun == '4':
        get_video()
    # Any other input falls through silently, matching the original menu.
#####################################################################
def get_video():
    """Download one bilibili video: fetch the page, parse the stream URLs,
    save the audio and video tracks, then mux them with ffmpeg.

    Fix: use the title returned by get_video_data() instead of reading the
    module-level ``title`` global, and use raw strings for the Windows paths
    (the old f-strings contained invalid "\\p"/"\\B" escape sequences).
    """
    html_data = send_request('https://www.bilibili.com/video/BV1w44y1r79C?t=29').text
    title, audio_url, video_url = get_video_data(html_data)
    save_data(title, audio_url, video_url)
    print(title)
    # NOTE(review): save_data writes into the current directory, while these
    # paths point at D:\python\B爬虫\web — confirm the script is run from there.
    video_path = rf'D:\python\B爬虫\web\{title}.mp4'
    audio_path = rf'D:\python\B爬虫\web\{title}.mp3'
    video_add_mp4(video_path, audio_path)
def send_request(url):
    """GET *url* with the bilibili headers (UA + Referer) and return the response."""
    return requests.get(url=url, headers=headers_2)
def get_video_data(html_data):
    """Parse a bilibili video page.

    Extracts the page title (with the "_..." site suffix stripped) and the
    first backup audio/video stream URLs from the embedded
    ``window.__playinfo__`` JSON.

    :param html_data: full HTML text of the video page.
    :returns: ``[title, audio_url, video_url]``.
    :raises IndexError/KeyError: when the page layout does not match.

    Fix: regex patterns are raw strings — the old ``'...\\.'``/non-raw forms
    relied on invalid escape sequences.
    """
    global title  # kept: get_video historically reads the module-level title
    title = re.findall(r'<title data-vue-meta="true">(.*?)</title>', html_data)[0]
    title = title.split('_')[0]  # drop everything after the first underscore
    raw_json = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', html_data)[0]
    print(raw_json)
    playinfo = json.loads(raw_json)
    audio_url = playinfo['data']['dash']['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)
    video_url = playinfo['data']['dash']['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)
    return [title, audio_url, video_url]
def save_data(file_name, audio_url, video_url):
    """Download both streams and write <file_name>.mp3 / <file_name>.mp4
    into the current working directory."""
    print('正在请求音频数据')
    audio_bytes = send_request(audio_url).content
    print('正在请求视频数据')
    video_bytes = send_request(video_url).content
    with open(file_name + '.mp3', mode='wb') as audio_file:
        audio_file.write(audio_bytes)
    print('正在保存音频数据')
    with open(file_name + '.mp4', mode='wb') as video_file:
        video_file.write(video_bytes)
    print('正在保存视频数据')
def video_add_mp4(file_name, mp4_file):
    """Mux *mp4_file* (the audio track) into *file_name* (the video track),
    writing ``<file_name without extension>-new.mp4``.

    Streams are copied (``-acodec/-vcodec copy``) so nothing is re-encoded.

    Fixes: ``os.path.splitext`` instead of ``split('.')[0]`` (which truncated
    titles containing dots), and an argument list with ``shell=False`` so
    paths with spaces/Chinese characters need no manual quoting and no shell
    is involved.
    """
    print("视频开始合成")
    outfile_name = os.path.splitext(file_name)[0] + '-new.mp4'
    cmd = [r'D:\ffmpeg\bin\ffmpeg',
           '-i', mp4_file,
           '-i', file_name,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outfile_name]
    subprocess.call(cmd)
    print("视频合成结束")
#####################################################################
#####################################################################
def get_music2():
    """Interactively search KuGou for a song and download the chosen result."""
    keyword = input('请输入你想下载的歌曲:')
    downloader = KuGou(keyword)
    pprint.pprint(downloader.get_song_list())
    index = int(input('请输入你想下载歌曲的序号:'))
    downloader.save_song(index)
class KuGou(object):
    """Search kugou.com for a song name and download a chosen result.

    Public method names are unchanged (including the historical
    ``get_reponse`` typo) so existing callers keep working.

    Fix: the raw search response is fetched once and cached — previously
    ``save_song`` re-ran the search (with its 4s of sleeps) for the hash
    list and again for the album list.
    """

    def __init__(self, name):
        # Song name to search for.
        self.Name = name
        # Cached raw search-API response text (filled lazily by get_song_info).
        self._search_cache = None

    def get_reponse(self, url):
        """GET *url* with browser-like headers and return the raw response."""
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'cookie': '',
            'referer': 'https://www.kugou.com/yy/html/search.html'
        }
        time.sleep(2)  # deliberate throttle kept from the original
        return requests.get(url=url, headers=headers)

    def get_song_info(self):
        """Return the raw (JSONP) search-result text, fetching it only once."""
        if self._search_cache is None:
            song_name = parse.quote(self.Name)
            url = 'https://songsearch.kugou.com/song_search_v2?callback=jQuery112406923427025623534_1585454373397&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1585454373399'.format(
                song_name)
            print('----get song info:', url)
            time.sleep(2)
            self._search_cache = self.get_reponse(url=url).content.decode('utf-8')
        return self._search_cache

    def get_song_list(self):
        """Return display strings '<index><song name>' for every search hit."""
        data_info = self.get_song_info()
        file_name = re.findall('"FileName":"(.*?)"', data_info)
        song_name_list = []
        for index, songs in enumerate(file_name):
            # Strip the <em> highlight markup the API wraps around the keyword.
            song_name = songs.replace('<em>{}<\\/em>'.format(self.Name), self.Name)
            song_name_list.append(str(index) + song_name)
        return song_name_list

    def get_song_hash(self):
        """Return the FileHash of every search hit (parallel to get_song_list)."""
        return re.findall('"FileHash":"(.*?)"', self.get_song_info())

    def get_song_album(self):
        """Return the AlbumID of every search hit (parallel to get_song_list)."""
        return re.findall('"AlbumID":"(.*?)"', self.get_song_info())

    def save_song(self, choice):
        """Download search hit number *choice* and write it as <audio_name>.mp3."""
        song_hash = self.get_song_hash()
        album_id = self.get_song_album()
        url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191017845313983799804_1591241136863&hash={0}&album_id={1}&dfid=0j14jN41N6PP0q6mOr1iALP1&mid=be4d7c2fb6112a816b8dece9812cdfc8&platid=4&_=1591241136865'.format(
            song_hash[choice], album_id[choice])
        response = self.get_reponse(url=url).content.decode('unicode_escape')
        song_info = re.findall('"play_url":"(.*?)"', response)[0]
        audio_name = re.findall('"audio_name":"(.*?)"', response)[0]
        # '\\/' spelled explicitly: the old '\/' was an invalid escape sequence.
        song_url = song_info.replace('\\/', '/')
        data = self.get_reponse(url=song_url).content
        with open('{}.mp3'.format(audio_name), 'wb') as f:
            f.write(data)
        print('你已经下载:%s' % audio_name)
#####################################################################
#####################################################################
def get_music1():
    """Download every song listed on the NetEase artist page for artist id 13193 (五月天)."""
    artist_id = '13193'
    page_html = requests.get('https://music.163.com/artist?id=' + artist_id, headers_1).text
    get_id(page_html)
def get_id(html):
    """Parse the artist page: create the singer's folder, then download each listed song."""
    link_pattern = re.compile(r'<a href="/song\?id=(\d*)">(.*?)</a></li><li>')
    name_pattern = re.compile(r'<h2 id="artist-name" data-rid=\d* class="sname f-thide sname-max" title=".*?">(.*?)</h2>')
    singername = re.findall(name_pattern, html)[0]
    creat(singername)
    # Each match is (song_id, song_name); throttle a little between downloads.
    for song_id, song_name in re.findall(link_pattern, html):
        savemusic(song_name, song_id)
        time.sleep(0.5)
def creat(singername):
    """Create a folder named after the singer (if missing) and cd into it.

    Fix: ``os.makedirs(..., exist_ok=True)`` replaces the exists/mkdir pair,
    and the chdir always runs so downloads land in the folder even when it
    already existed.
    """
    os.makedirs(singername, exist_ok=True)
    os.chdir(singername)
def savemusic(name, id):
    """Download one NetEase song via the outer-url redirect and save it as <name>.m4a.

    NOTE(review): the URL requests ``.mp3`` but the file is saved with an
    ``.m4a`` extension — preserved from the original; confirm which is intended.

    Fixes: removed the redundant ``f.close()`` inside the ``with`` block and
    restored the missing ``《`` in the completion message.
    """
    url = 'http://music.163.com/song/media/outer/url?id=' + id + '.mp3'
    with open(name + '.m4a', 'wb') as f:
        print('歌曲《', name, '》 下载中***************')
        f.write(requests.get(url=url, headers=headers_1).content)
    print("《", name, "》下载完成")
    print('')
#####################################################################
#####################################################################
class BeautifulPicture():
    """Scrape nipic.com's front page and save every <img> tag's src to disk."""

    def __init__(self):
        # Browser-like UA so the site serves the regular page.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        self.web_url = 'http://www.nipic.com/'
        # Raw string: the plain literal contained a "\B" backslash sequence.
        self.folder_path = r'D:\BeautifulPicture'

    def get_pic(self):
        """Fetch the front page, collect all <img> tags and save each image."""
        print('开始网页get请求')
        r = self.request(self.web_url)
        print('开始获取所有a标签')
        # NOTE(review): 'lxml' requires the third-party lxml package; bs4's
        # built-in 'html.parser' would drop that dependency — confirm before switching.
        all_a = BeautifulSoup(r.text, 'lxml').find_all('img')
        print('开始创建文件夹')
        self.mkdir(self.folder_path)
        print('开始切换文件夹')
        os.chdir(self.folder_path)
        # enumerate replaces the hand-rolled i/x counter pair.
        for i, a in enumerate(all_a):
            img_url = a['src']
            print('a标签的src内容是', img_url)
            self.save_img(img_url, '图片' + str(i))

    def save_img(self, url, name):
        """Download *url* and write it as <name>.jpg in the current folder."""
        print('开始请求图片地址,过程会有点长...')
        img = self.request(url)
        file_name = name + '.jpg'
        print('开始保存图片')
        # Fix: 'wb' instead of the old 'ab' — append mode made every re-run
        # concatenate new bytes onto the old file, corrupting the image.
        with open(file_name, 'wb') as f:
            f.write(img.content)
        print(file_name, '图片保存成功!')

    def request(self, url):
        """GET *url* with this scraper's headers and return the response."""
        r = requests.get(url, headers=self.headers)
        print(url)
        return r

    def mkdir(self, path):
        """Create *path* (including parents) if it does not already exist."""
        path = path.strip()
        if not os.path.exists(path):
            print('创建名字叫做', path, '的文件夹')
            os.makedirs(path)
            print('创建成功!')
        else:
            print(path, '文件夹已经存在了,不再创建')
#####################################################################
# Run the interactive menu only when executed as a script, not on import.
if __name__ == "__main__":
    main()