"""Small interactive scraping toolbox.

Menu-driven features:
    1. scrape images from nipic.com          (BeautifulPicture)
    2. download a NetEase artist's songs     (get_music1)
    3. search & download from KuGou music    (get_music2)
    4. download & merge a Bilibili video     (get_video)

NOTE(review): this file was recovered from a whitespace-mangled paste in
which the HTML tag text inside every regular expression was stripped.  The
patterns below are reconstructed from the sites' known page markup — each
one is marked with a TODO and must be verified against a live page.
"""
import json
import os
import pprint
import re
import subprocess
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# Referer header is required by Bilibili's CDN for the media requests below.
url = 'https://www.bilibili.com/video/BV1Fy4y1D7XS?t=798'
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Referer': url,
}


def main():
    """Show the feature menu and dispatch on the user's choice."""
    print("1.爬取图片")
    print("2.下载五月天的歌曲")
    print("3.爬取酷狗音乐歌曲")
    print("4.爬取视频")
    fun = input("请输入你想要的功能:")
    if fun == '1':
        beauty = BeautifulPicture()
        beauty.get_pic()
    elif fun == '2':
        get_music1()
    elif fun == '3':
        get_music2()
    elif fun == '4':
        get_video()


#####################################################################
def get_video():
    """Download a Bilibili video's separate audio/video streams and merge them.

    BUGFIX: use the title returned by get_video_data() instead of reading the
    module-global ``title`` it sets as a side effect.
    """
    html_data = send_request('https://www.bilibili.com/video/BV1w44y1r79C?t=29').text
    video_data = get_video_data(html_data)
    save_data(video_data[0], video_data[1], video_data[2])
    name = video_data[0]
    print(name)
    # NOTE(review): save_data() writes into the current working directory,
    # but the merge step looks in this fixed folder — confirm they agree.
    a = os.path.join(r'D:\python\B爬虫\web', name + '.mp4')
    b = os.path.join(r'D:\python\B爬虫\web', name + '.mp3')
    video_add_mp4(a, b)


def send_request(url):
    """GET *url* with the Bilibili headers and return the Response object."""
    response = requests.get(url=url, headers=headers_2)
    return response


def get_video_data(html_data):
    """Parse a Bilibili video page; return [title, audio_url, video_url].

    BUGFIX: both regex patterns lost their HTML tag text in the paste and
    matched the empty string.  Reconstructed from Bilibili's page markup —
    TODO confirm against a live page.
    """
    global title  # kept for backward compatibility with any external reader
    res = '<title data-vue-meta="true">(.*?)</title>'
    title = re.findall(res, html_data)[0]
    title = title.split('_')[0]  # drop the trailing "_哔哩哔哩_bilibili" suffix
    # The playable stream URLs live in an inline JSON blob on the page.
    json_data = re.findall('<script>window.__playinfo__=(.*?)</script>', html_data)[0]
    print(json_data)
    json_data = json.loads(json_data)
    audio_url = json_data['data']['dash']['audio'][0]['backupUrl'][0]
    print('解析到的音频地址:', audio_url)
    video_url = json_data['data']['dash']['video'][0]['backupUrl'][0]
    print('解析到的视频地址:', video_url)
    video_data = [title, audio_url, video_url]
    return video_data


def save_data(file_name, audio_url, video_url):
    """Fetch both streams and write <file_name>.mp3 / <file_name>.mp4."""
    print('正在请求音频数据')
    audio_data = send_request(audio_url).content
    print('正在请求视频数据')
    video_data = send_request(video_url).content
    with open(file_name + '.mp3', mode='wb') as f:
        f.write(audio_data)
        print('正在保存音频数据')
    with open(file_name + '.mp4', mode='wb') as f:
        f.write(video_data)
        print('正在保存视频数据')


def video_add_mp4(file_name, mp4_file):
    """Merge the downloaded audio and video tracks with ffmpeg (stream copy).

    BUGFIX: pass an argument list instead of a shell string, so titles
    containing quotes/spaces cannot break (or inject into) the command line;
    use os.path.splitext so dots in the path don't truncate the output name.
    """
    print("视频开始合成")
    outfile_name = os.path.splitext(file_name)[0] + '-new.mp4'
    cmd = [r'D:\ffmpeg\bin\ffmpeg',
           '-i', mp4_file, '-i', file_name,
           '-acodec', 'copy', '-vcodec', 'copy',
           outfile_name]
    subprocess.call(cmd)
    print("视频合成结束")
#####################################################################


#####################################################################
def get_music2():
    """Search KuGou for a song name and download the result the user picks."""
    name = input('请输入你想下载的歌曲:')
    kugou = KuGou(name)
    song_list = kugou.get_song_list()
    pprint.pprint(song_list)
    choice = int(input('请输入你想下载歌曲的序号:'))
    kugou.save_song(choice)


class KuGou(object):
    """Minimal KuGou music client: search, list results, download one song."""

    def __init__(self, name):
        self.Name = name  # search keyword entered by the user

    def get_reponse(self, url):
        """GET *url* with the user-agent/cookie/referer headers KuGou expects."""
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'cookie': '',
            'referer': 'https://www.kugou.com/yy/html/search.html'
        }
        time.sleep(2)  # crude politeness delay between requests
        response = requests.get(url=url, headers=headers)
        return response

    def get_song_info(self):
        """Run the search API for self.Name and return the raw JSONP text."""
        song_name = parse.quote(self.Name)
        url = 'https://songsearch.kugou.com/song_search_v2?callback=jQuery112406923427025623534_1585454373397&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1585454373399'.format(
            song_name)
        print('----get song info:', url)
        time.sleep(2)
        response = self.get_reponse(url=url).content.decode('utf-8')
        return response

    def get_song_list(self):
        """Return numbered display names for every search hit.

        BUGFIX: the opening ``<em>`` tag was lost in the paste; the API wraps
        the keyword as ``<em>keyword<\\/em>`` inside FileName — TODO confirm.
        """
        data_info = self.get_song_info()
        file_name = re.findall('"FileName":"(.*?)"', data_info)
        song_name_list = []
        for index, songs in enumerate(file_name):
            song_name = songs.replace('<em>{}<\\/em>'.format(self.Name), '{}'.format(self.Name))
            song_name_list.append(str(index) + '、' + song_name)
        return song_name_list

    def get_song_hash(self):
        """Return the FileHash of every search hit (parallel to the list)."""
        data_info = self.get_song_info()
        song_hash = re.findall('"FileHash":"(.*?)"', data_info)
        return song_hash

    def get_song_album(self):
        """Return the AlbumID of every search hit (parallel to the list)."""
        data_info = self.get_song_info()
        song_album = re.findall('"AlbumID":"(.*?)"', data_info)
        return song_album

    def save_song(self, choice):
        """Resolve the play URL for hit *choice* and save it as <name>.mp3."""
        song_hash = self.get_song_hash()
        album_id = self.get_song_album()
        url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191017845313983799804_1591241136863&hash={0}&album_id={1}&dfid=0j14jN41N6PP0q6mOr1iALP1&mid=be4d7c2fb6112a816b8dece9812cdfc8&platid=4&_=1591241136865'.format(
            song_hash[choice], album_id[choice])
        response = self.get_reponse(url=url).content.decode('unicode_escape')
        pattern = re.compile('"play_url":"(.*?)"')
        song_info = re.findall(pattern, response)[0]
        audio_name = re.findall('"audio_name":"(.*?)"', response)[0]
        song_url = song_info.replace(r'\/', '/')  # un-escape JSON slashes
        data = self.get_reponse(url=song_url).content
        with open('{}.mp3'.format(audio_name), 'wb') as f:
            f.write(data)
        print('你已经下载:%s' % audio_name)
#####################################################################


#####################################################################
def get_music1():
    """Download every song from a fixed NetEase artist page (id 13193, 五月天)."""
    ID = '13193'
    url = 'https://music.163.com/artist?id=' + ID
    html = requests.get(url, headers_1).text
    get_id(html)


def get_id(html):
    """Scrape song ids/names plus the artist name, then download each song.

    BUGFIX: both regexes lost their HTML tag text in the paste and could not
    match anything useful.  Reconstructed from NetEase's artist-page markup —
    TODO confirm against a live page.
    """
    # group 1 = song id, group 2 = song name (consumed below as i[0], i[1])
    findlink = re.compile(r'<li><a href="/song\?id=(\d+)">(.*?)</a></li>')
    # artist name sits inside <h2 id="artist-name">, possibly on its own line
    findname = re.compile(r'<h2 id="artist-name"[^>]*>\s*(.*?)\s*</h2>', re.S)
    singername = re.findall(findname, html)[0]
    creat(singername)
    ll = re.findall(findlink, html)
    for i in ll:
        savemusic(i[1], i[0])
        time.sleep(0.5)  # politeness delay between downloads


def creat(singername):
    """Create (if missing) a folder named after the artist and cd into it."""
    if not os.path.exists(singername):
        os.mkdir(singername)
    os.chdir(singername)


def savemusic(name, id):
    """Download one song via NetEase's outer-url redirect endpoint.

    BUGFIX: removed the redundant f.close() that sat inside the with-block.
    """
    url = 'http://music.163.com/song/media/outer/url?id=' + id + '.mp3'
    with open(name + '.m4a', 'wb') as f:
        print('歌曲《', name, '》 下载中***************')
        f.write(requests.get(url=url, headers=headers_1).content)
    print("《", name, "》下载完成")
    print('')
#####################################################################


#####################################################################
class BeautifulPicture():
    """Scrape every <img> on nipic.com's front page into a local folder."""

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        self.web_url = 'http://www.nipic.com/'
        self.folder_path = r'D:\BeautifulPicture'

    def get_pic(self):
        """Fetch the front page, collect <img> tags, save each image.

        BUGFIX: skip <img> tags that have no src attribute instead of
        raising KeyError.
        """
        print('开始网页get请求')
        r = self.request(self.web_url)
        print('开始获取所有a标签')
        all_a = BeautifulSoup(r.text, 'lxml').find_all('img')
        print('开始创建文件夹')
        self.mkdir(self.folder_path)
        print('开始切换文件夹')
        os.chdir(self.folder_path)
        for i, a in enumerate(all_a):
            img_url = a.get('src')
            if not img_url:
                continue
            print('a标签的src内容是:', img_url)
            img_name = '图片' + str(i)
            self.save_img(img_url, img_name)

    def save_img(self, url, name):
        """Download one image and save it as <name>.jpg.

        BUGFIX: open in 'wb' (the original 'ab' appended to an existing file
        on re-runs, corrupting the JPEG) and use a context manager.
        """
        print('开始请求图片地址,过程会有点长...')
        img = self.request(url)
        file_name = name + '.jpg'
        print('开始保存图片')
        with open(file_name, 'wb') as f:
            f.write(img.content)
        print(file_name, '图片保存成功!')

    def request(self, url):
        """GET *url* with this scraper's headers and return the Response."""
        r = requests.get(url, headers=self.headers)
        print(url)
        return r

    def mkdir(self, path):
        """Create *path* (including parents) unless it already exists."""
        path = path.strip()
        isExists = os.path.exists(path)
        if not isExists:
            print('创建名字叫做', path, '的文件夹')
            os.makedirs(path)
            print('创建成功!')
        else:
            print(path, '文件夹已经存在了,不再创建')
#####################################################################


if __name__ == "__main__":
    main()