# pythonpachong/main.py

import requests
from bs4 import BeautifulSoup
import re
import time
import json
import subprocess
import os
import pprint
from urllib import parse
# Request headers carrying a desktop Chrome User-Agent.
headers_1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}

# Bilibili page address that is sent as the Referer header in headers_2.
url = 'https://www.bilibili.com/video/BV1Fy4y1D7XS?t=798'

# Request headers carrying a Firefox User-Agent plus the Referer above.
headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Referer': url,
}


def main():
    """Print the menu, read one choice from stdin and run the matching task.

    Any input other than '1', '2', '3' or '4' ends the function without
    doing anything.
    """
    menu_lines = (
        "1.爬取图片",
        "2.下载五月天的歌曲",
        "3.爬取酷狗音乐歌曲",
        "4.爬取视频",
    )
    for menu_line in menu_lines:
        print(menu_line)

    selection = input("请输入你想要的功能：")

    # Lambdas defer every name lookup until the option is actually chosen.
    actions = {
        '1': lambda: BeautifulPicture().get_pic(),
        '2': lambda: get_music1(),
        '3': lambda: get_music2(),
        '4': lambda: get_video(),
    }
    chosen = actions.get(selection)
    if chosen is not None:
        chosen()


#####################################################################
def get_video(video_page_url='https://www.bilibili.com/video/BV1w44y1r79C?t=29'):
    """Download one video page's audio and video tracks and merge them.

    The page is fetched with send_request(), parsed with get_video_data(),
    both tracks are stored with save_data() and finally handed to
    video_add_mp4().

    Args:
        video_page_url: Address of the video page to download. Defaults to
            the page this script was originally written for.
    """
    html_data = send_request(video_page_url).text
    # Use the returned values directly instead of relying on the module-level
    # `title` that get_video_data() sets as a side effect.
    video_title, audio_url, video_url = get_video_data(html_data)

    # The title comes from a web page: replace the characters that Windows
    # forbids in file names so that opening the output files cannot fail.
    name = re.sub(r'[\\/:*?"<>|]', '_', video_title).strip() or 'video'

    save_data(name, audio_url, video_url)
    print(video_title)

    # save_data() writes "<name>.mp4" / "<name>.mp3" relative to the current
    # working directory, so the merge step must look for them there too.
    video_file = name + '.mp4'
    audio_file = name + '.mp3'
    video_add_mp4(video_file, audio_file)


def send_request(url, timeout=30):
    """Send a GET request with the module-level ``headers_2``.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would
            block the script forever.

    Returns:
        The ``requests.Response`` object for the request.
    """
    return requests.get(url=url, headers=headers_2, timeout=timeout)


def get_video_data(html_data):
    """Extract the title and the first audio/video stream URLs from a page.

    Also stores the extracted title in the module-level ``title`` variable.

    Args:
        html_data: HTML source of the video page.

    Returns:
        A list ``[title, audio_url, video_url]``.

    Raises:
        ValueError: If the page contains no ``<title>`` element or no
            embedded ``window.__playinfo__`` JSON.
    """
    # Kept so that code reading the module-level `title` keeps working.
    global title

    # Prefer the exact tag form, then fall back to any <title ...> element.
    title_match = (
        re.search(r'<title data-vue-meta="true">(.*?)</title>', html_data)
        or re.search(r'<title[^>]*>(.*?)</title>', html_data)
    )
    if title_match is None:
        raise ValueError('no <title> element found in the page')
    # Only the text before the first underscore is used as the title.
    title = title_match.group(1).split('_')[0]

    playinfo_match = re.search(
        r'<script>window\.__playinfo__=(.*?)</script>', html_data)
    if playinfo_match is None:
        raise ValueError('no window.__playinfo__ data found in the page')
    dash = json.loads(playinfo_match.group(1))['data']['dash']

    audio_url = dash['audio'][0]['backupUrl'][0]
    print('解析到的音频地址：', audio_url)
    video_url = dash['video'][0]['backupUrl'][0]
    print('解析到的视频地址：', video_url)
    return [title, audio_url, video_url]


def save_data(file_name, audio_url, video_url):
    """Download both tracks and write them as <file_name>.mp3 / .mp4.

    Both downloads finish before anything is written; the audio file is
    written first, then the video file.

    Args:
        file_name: Output path without extension.
        audio_url: Address of the audio track.
        video_url: Address of the video track.
    """
    print('正在请求音频数据')
    audio_bytes = send_request(audio_url).content
    print('正在请求视频数据')
    video_bytes = send_request(video_url).content

    outputs = (
        ('.mp3', audio_bytes, '正在保存音频数据'),
        ('.mp4', video_bytes, '正在保存视频数据'),
    )
    for suffix, payload, message in outputs:
        with open(file_name + suffix, mode='wb') as out:
            out.write(payload)
            print(message)


def video_add_mp4(file_name, mp4_file, ffmpeg_path='D:\\ffmpeg\\bin\\ffmpeg'):
    """Merge two media files into "<file_name without extension>-new.mp4".

    ffmpeg is run with ``-acodec copy -vcodec copy``, i.e. the streams are
    copied rather than re-encoded.

    Args:
        file_name: Path of the input whose name (minus its extension) is
            used to build the output name. Passed to ffmpeg second.
        mp4_file: Path of the other input. Passed to ffmpeg first.
        ffmpeg_path: ffmpeg executable to run.

    Returns:
        The output path when ffmpeg exits with status 0, otherwise ``None``.
    """
    print("视频开始合成")
    # splitext only removes the final extension, so dots inside the title or
    # inside directory names no longer truncate the output name.
    stem, _extension = os.path.splitext(file_name)
    outfile_name = stem + '-new.mp4'

    # An argument list (no shell) keeps quotes or other shell metacharacters
    # in the file names from being interpreted as part of the command.
    cmd = [
        ffmpeg_path,
        '-i', mp4_file,
        '-i', file_name,
        '-acodec', 'copy',
        '-vcodec', 'copy',
        outfile_name,
    ]
    try:
        return_code = subprocess.call(cmd)
    except OSError as error:
        # Raised when the ffmpeg executable cannot be started at all.
        print('无法启动 ffmpeg：', error)
        return_code = None
    print("视频合成结束")
    return outfile_name if return_code == 0 else None
#####################################################################


#####################################################################
def get_music2():
    """Search KuGou for a song typed by the user and download the chosen hit.

    Prints the numbered result list, asks for an index and passes it to
    ``KuGou.save_song``. An empty result list, a non-numeric index or an
    index outside the list is reported and ends the function.
    """
    name = input('请输入你想下载的歌曲:')
    kugou = KuGou(name)
    song_list = kugou.get_song_list()
    if not song_list:
        print('没有找到相关歌曲')
        return
    pprint.pprint(song_list)

    raw_choice = input('请输入你想下载歌曲的序号:')
    try:
        choice = int(raw_choice)
    except ValueError:
        print('序号必须是数字:', raw_choice)
        return
    # The printed list is numbered from 0, matching list indexes.
    if not 0 <= choice < len(song_list):
        print('序号超出范围:', choice)
        return
    kugou.save_song(choice)


class KuGou(object):
    """Search KuGou for a keyword and download one of the hits as an mp3.

    The search response is fetched once per instance and reused by
    get_song_list(), get_song_hash() and get_song_album(), so the index the
    user picks refers to the same result set that save_song() reads.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    # Regex for the body of a JSON string value, escaped characters included.
    _JSON_STRING = r'"((?:[^"\\]|\\.)*)"'

    def __init__(self, name):
        """Remember the search keyword *name*."""
        self.Name = name
        # Cached text of the search response; filled by get_song_info().
        self._search_text = None

    # Fetch a page.
    def get_reponse(self, url):
        """GET *url* after a 2 second pause and return the response."""
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'cookie': '',
            'referer': 'https://www.kugou.com/yy/html/search.html'
        }  # sends the three fields user-agent, cookie and referer
        time.sleep(2)  # pause before every request
        response = requests.get(url=url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response

    # Correctly spelled alias; the original name stays for existing callers.
    get_response = get_reponse

    @staticmethod
    def _strip_highlight(text):
        """Remove ``<em>`` / ``</em>`` / ``<\\/em>`` highlight tags."""
        return re.sub(r'<\\?/?em>', '', text)

    @staticmethod
    def _json_string(raw):
        """Decode the body of a JSON string literal (``\\/``, ``\\uXXXX``)."""
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            # Not valid JSON after all: only undo the escaped slashes.
            return raw.replace('\\/', '/')

    # Run the search and return the raw response text.
    def get_song_info(self, refresh=False):
        """Return the search response text for ``self.Name``.

        Args:
            refresh: When true, query the server again instead of returning
                the cached response.
        """
        cached = getattr(self, '_search_text', None)
        if cached is not None and not refresh:
            return cached
        song_name = parse.quote(self.Name)
        url = 'https://songsearch.kugou.com/song_search_v2?callback=jQuery112406923427025623534_1585454373397&keyword={}&page=1&pagesize=30&userid=-1&clientver=&platform=WebFilter&tag=em&filter=2&iscorrection=1&privilege_filter=0&_=1585454373399'.format(
            song_name)
        print('----get song info:', url)
        time.sleep(2)
        self._search_text = self.get_reponse(url=url).content.decode('utf-8')
        return self._search_text

    # Build the list of song names.
    def get_song_list(self):
        """Return the hits as strings of the form "<index>、<file name>"."""
        data_info = self.get_song_info()
        file_names = re.findall('"FileName":"(.*?)"', data_info)
        return [
            str(index) + '、' + self._strip_highlight(file_name)
            for index, file_name in enumerate(file_names)
        ]

    def get_song_hash(self):
        """Return every "FileHash" value found in the search response."""
        return re.findall('"FileHash":"(.*?)"', self.get_song_info())

    def get_song_album(self):
        """Return every "AlbumID" value found in the search response."""
        return re.findall('"AlbumID":"(.*?)"', self.get_song_info())

    def save_song(self, choice):
        """Download hit number *choice* and save it as "<audio name>.mp3".

        Args:
            choice: Zero-based index into the search results.

        Raises:
            IndexError: If *choice* is outside the search results.
        """
        song_hash = self.get_song_hash()
        album_id = self.get_song_album()
        if not 0 <= choice < min(len(song_hash), len(album_id)):
            raise IndexError('song index {} is out of range'.format(choice))

        url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery191017845313983799804_1591241136863&hash={0}&album_id={1}&dfid=0j14jN41N6PP0q6mOr1iALP1&mid=be4d7c2fb6112a816b8dece9812cdfc8&platid=4&_=1591241136865'.format(
            song_hash[choice], album_id[choice])
        response = self.get_reponse(url=url).content.decode('utf-8', errors='replace')

        play_urls = re.findall('"play_url":' + self._JSON_STRING, response)
        audio_names = re.findall('"audio_name":' + self._JSON_STRING, response)
        if not play_urls or not play_urls[0]:
            # No address in the reply means there is nothing to download.
            print('没有获取到播放地址，无法下载')
            return

        # The captured values are JSON string bodies; decoding them as JSON
        # turns "\/" into "/" and "\uXXXX" into the real characters.
        song_url = self._json_string(play_urls[0])
        audio_name = self._json_string(audio_names[0]) if audio_names else self.Name
        # Replace the characters Windows forbids in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', audio_name)

        data = self.get_reponse(url=song_url).content
        with open('{}.mp3'.format(safe_name), 'wb') as f:
            f.write(data)
        print('你已经下载:%s' % audio_name)
#####################################################################


#####################################################################
def get_music1(artist_id='13193'):
    """Fetch a music.163.com artist page and download the songs it lists.

    Args:
        artist_id: Artist id (as a string) appended to the artist page
            address. Defaults to '13193', the id this script was written for.
    """
    artist_url = 'https://music.163.com/artist?id=' + artist_id
    # The headers must be passed by keyword: the second positional argument
    # of requests.get() is `params`, which would put them in the query string
    # and leave the User-Agent header unset.
    html = requests.get(artist_url, headers=headers_1, timeout=30).text
    get_id(html)


def get_id(html):
    """Parse an artist page, enter the artist's folder and save every song.

    The first artist-name match becomes the folder name (via creat()); each
    (id, title) pair found in the page is then passed to savemusic() with a
    0.5 second pause after every download.
    """
    artist_names = re.findall(
        r'<h2 id="artist-name" data-rid=\d* class="sname f-thide sname-max" title=".*?">(.*?)</h2>',
        html,
    )
    singername = artist_names[0]
    creat(singername)

    song_links = re.findall(
        r'<a href="/song\?id=(\d*)">(.*?)</a></li><li>', html)
    for song_id, song_title in song_links:
        savemusic(song_title, song_id)
        time.sleep(0.5)


def creat(singername):
    """Create the directory *singername* if it is missing, then chdir into it."""
    if os.path.exists(singername):
        os.chdir(singername)
        return
    os.mkdir(singername)
    os.chdir(singername)


def savemusic(name, id):
    """Download one song and store it as "<name>.m4a" in the current directory.

    Args:
        name: Song title, used for the file name and the progress messages.
        id: Song id (string) inserted into the download address.
    """
    url = 'http://music.163.com/song/media/outer/url?id=' + id + '.mp3'
    print('歌曲《', name, '》 下载中***************')
    # Download before creating the file so that a failed request does not
    # leave an empty file behind.
    audio = requests.get(url=url, headers=headers_1, timeout=30).content
    with open(name + '.m4a', 'wb') as f:
        f.write(audio)
    print("《", name, "》下载完成")
    print('')
#####################################################################


#####################################################################
class BeautifulPicture():
    """Download every ``<img>`` found on one web page into a local folder."""

    def __init__(self, web_url='http://www.nipic.com/', folder_path=r'D:\BeautifulPicture'):
        """Store the request headers, the page to scan and the target folder.

        Args:
            web_url: Page whose images are downloaded.
            folder_path: Folder the images are written to.
        """
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
        self.web_url = web_url
        self.folder_path = folder_path

    def get_pic(self):
        """Fetch the page, change into the target folder and save each image.

        Images are named "图片<n>.jpg" where n is the position of the tag on
        the page. Tags without a usable address are skipped, and a failed
        download is reported without stopping the remaining ones.
        """
        print('开始网页get请求')
        r = self.request(self.web_url)
        print('开始获取所有a标签')
        all_img = BeautifulSoup(r.text, 'lxml').find_all('img')
        print('开始创建文件夹')
        self.mkdir(self.folder_path)
        print('开始切换文件夹')
        os.chdir(self.folder_path)
        for index, img in enumerate(all_img):
            # .get() because an <img> tag is not guaranteed to carry src.
            src = img.get('src')
            print('a标签的src内容是：', src)
            img_url = self._absolute_url(src)
            if img_url is None:
                continue
            try:
                self.save_img(img_url, '图片' + str(index))
            except requests.RequestException as error:
                print(img_url, '下载失败：', error)

    def _absolute_url(self, src):
        """Resolve *src* against the page address.

        Returns:
            An http(s) address, or ``None`` when *src* is empty or resolves
            to another scheme (for example an inline ``data:`` image).
        """
        if not src:
            return None
        absolute = parse.urljoin(self.web_url, src.strip())
        if not absolute.startswith(('http://', 'https://')):
            return None
        return absolute

    def save_img(self, url, name):
        """Download *url* and write it to "<name>.jpg" in the current directory."""
        print('开始请求图片地址，过程会有点长...')
        img = self.request(url)
        file_name = name + '.jpg'
        print('开始保存图片')
        # 'wb' so that a rerun replaces the image instead of appending new
        # bytes to the file left by the previous run.
        with open(file_name, 'wb') as f:
            f.write(img.content)
        print(file_name, '图片保存成功！')

    def request(self, url):
        """GET *url* with this object's headers and return the response."""
        r = requests.get(url, headers=self.headers, timeout=30)
        print(url)
        return r

    def mkdir(self, path):
        """Create the folder *path* (surrounding whitespace removed) if missing."""
        path = path.strip()
        if os.path.exists(path):
            print(path, '文件夹已经存在了，不再创建')
            return
        print('创建名字叫做', path, '的文件夹')
        os.makedirs(path)
        print('创建成功！')
#####################################################################


# Show the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()