You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

119 lines
4.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import urllib.request
import re
# Part 1: interactively download chart pages.
#
# The user is prompted for a chart id until they enter '0'. The HTML of every
# page that downloads successfully is appended to `all_data`, which the
# parsing step below consumes as one document.
all_data = ""

# Request configuration does not change between prompts, so build it once.
baseurl = 'https://y.qq.com/n/ryqq/toplist/'
# NOTE(review): the Cookie value below embeds what look like session tokens
# (qqmusic_key, wxrefresh_token, qm_keyst, ...). Hard-coding credentials in
# source is a security risk; consider loading them from the environment or an
# untracked config file instead.
header = {
    'Cookie': 'RK=+ZHsCvGVT3; ptcz=e8c720f3cadd2924fe7f9382491c380ed7a957f54ba161375f1820384574d975; pgv_pvid=4996355280; pac_uid=0_ff39434e0af16; iip=0; _qimei_uuid42=183030d10331007205af68b50c782865c316e028ff; _qimei_fingerprint=30c99314005cd8c6bd7fafe690270d8a; _qimei_q36=; _qimei_h38=19c5f4b105af68b50c78286502000004a18303; qq_domain_video_guid_verify=1d7d06fcd04b7d06; _clck=5lzhur|1|fkq|0; fqm_pvqid=e7316d13-c654-4f59-9ba5-eae187807884; ts_uid=6272104735; fqm_sessionid=4790d36e-2699-4690-a72e-6ea888c89094; pgv_info=ssid=s4222662588; ts_refer=www.baidu.com/link; _qpsvr_localtk=0.7570286215498101; psrf_qqrefresh_token=; wxuin=1152921505297524933; psrf_qqaccess_token=; qqmusic_key=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; wxrefresh_token=79__3nkjtVcDfdnK7LedBDP3--ijETIbmwGybvkRtzZTCNiDBzKFpkvFyBOU4ctvYvLAtDi1OBCHV8yeB1ecaINSfvXwFhXnyFrCgFiqgREn2E; euin=oK6kowEAoK4z7K-q7i4A7eEioz**; wxuin=1152921505297524933; wxopenid=opCFJw7y_btgERPX0yPfMcS_J1AQ; psrf_qqunionid=; wxunionid=oqFLxsrob2nxFDht3flgijhcCN5M; psrf_qqopenid=; tmeLoginType=1; qm_keyst=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; qm_keyst=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; login_type=2; ts_last=y.qq.com/n/ryqq/toplist/4',
    'Referer': 'https://y.qq.com/portal/wx_redirect.html?login_type=2&surl=https://y.qq.com/&code=081UD7000R8xXR1hnO100j2BOo2UD70C&state=STATE',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
}

# Number of chart pages whose HTML was actually appended to `all_data`.
fetched = 0
while True:
    print('内地榜-->5 香港榜-->59 台湾榜-->61 欧美榜-->3 韩国榜-->16 日本榜-->17')
    # input() always returns a str, so the exit sentinel is the string '0'
    # (comparing against the int 0 is always unequal). Surrounding whitespace
    # is dropped so that it cannot end up inside the request URL.
    urltail = input('请输入要爬取的网页编号输入0退出').strip()
    if urltail == '0':
        print("退出程序。")
        break

    url = baseurl + urltail
    req = urllib.request.Request(url, headers=header)
    try:
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            data = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    else:
        # Only successful downloads contribute data and are counted.
        all_data += data
        fetched += 1

# The spreadsheet step of this script sizes its output as (count - 1) * 20,
# i.e. it expects `count` to be one more than the number of charts that
# contributed data. Failed downloads are deliberately not counted, so that
# step never asks for rows that were never fetched.
count = fetched + 1
# Part 2: parse the accumulated HTML with BeautifulSoup and pull out the
# per-song fields. Each result list is built in document order.
from bs4 import BeautifulSoup
soup = BeautifulSoup(all_data, 'html.parser')

# MV links are collected into their own list and printed; they are not part
# of the per-song columns built below.
mv_links = soup.find_all('a', class_='songlist__icon_mv')
mv_source = ['https://y.qq.com' + link['href'] for link in mv_links]
print(mv_source)  # each URL points at the MV detail page

# Song titles: <span> tags whose class matches the regular expression.
# NOTE(review): titles containing "/" are skipped here, while the artist and
# link lists below are not filtered, so one skipped title shifts the later
# rows out of alignment -- confirm whether this filter is really intended.
title_tags = soup.find_all("span", attrs={"class": re.compile("^songlist__songname_txt$")})
song_names = [tag.get_text() for tag in title_tags if "/" not in tag.get_text()]

# Artists: intentionally NOT filtered on "/", because a "/" filter here would
# drop every song credited to two or more artists.
artist_tags = soup.find_all("div", attrs={"class": "songlist__artist"})
artists = [tag.get_text() for tag in artist_tags]

# Durations: text content of each time cell (entries containing "/" are skipped).
time_tags = soup.find_all("div", attrs={"class": "songlist__time"})
durations = [tag.get_text() for tag in time_tags if "/" not in tag.get_text()]

# Song links: the cover anchor's href is site-relative, so prefix the host.
cover_tags = soup.find_all('a', class_='songlist__cover')
song_source = ['https://y.qq.com' + tag['href'] for tag in cover_tags]
# Part 3: export the collected songs to an .xls workbook with xlwt.
import xlwt

# One workbook with a single sheet.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet 1')

# Header row (row 0).
for col, heading in enumerate(('排名', '歌曲', '歌手', '时长', '歌曲链接')):
    worksheet.write(0, col, heading)

# Upper bound on data rows: (count - 1) charts at 20 rows each.
# (Presumably every chart page yields 20 songs -- verify against the site.)
num = (count - 1) * 20

# zip() stops at the shortest list, so a failed download, a filtered-out
# entry, or a page with fewer songs than expected produces fewer rows instead
# of an IndexError. The slice keeps the original cap of `num` rows.
rows = list(zip(song_names, artists, durations, song_source))[:max(num, 0)]
for row_idx, (song_name, artist, duration, song_link) in enumerate(rows, start=1):
    worksheet.write(row_idx, 0, row_idx)    # rank (running index across all charts)
    worksheet.write(row_idx, 1, song_name)  # song title
    worksheet.write(row_idx, 2, artist)     # artist(s)
    worksheet.write(row_idx, 3, duration)   # duration
    worksheet.write(row_idx, 4, song_link)  # song page URL

workbook.save('qqmusic_last.xls')