|
|
|
@ -0,0 +1,119 @@
|
|
|
|
|
"""Scrape QQ Music toplist pages and export song data to an Excel file.

The user repeatedly picks a chart id; each chart page is fetched and its
raw HTML accumulated.  After the user quits (enters ``0``), the combined
HTML is parsed with BeautifulSoup and the rank / song / artist / duration /
link columns are written to ``qqmusic_last.xls`` via xlwt.
"""

import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
import xlwt

# Accumulates the raw HTML of every chart page fetched this session.
all_data = ""

# Number of chart pages successfully requested (each page has 20 songs).
count = 0

# Base URL of the QQ Music toplist pages; the chart id is appended.
baseurl = 'https://y.qq.com/n/ryqq/toplist/'

# Request headers: the Cookie/Referer pair is required by y.qq.com to serve
# the full page content; the User-Agent avoids the bot-block response.
header = {
    'Cookie':'RK=+ZHsCvGVT3; ptcz=e8c720f3cadd2924fe7f9382491c380ed7a957f54ba161375f1820384574d975; pgv_pvid=4996355280; pac_uid=0_ff39434e0af16; iip=0; _qimei_uuid42=183030d10331007205af68b50c782865c316e028ff; _qimei_fingerprint=30c99314005cd8c6bd7fafe690270d8a; _qimei_q36=; _qimei_h38=19c5f4b105af68b50c78286502000004a18303; qq_domain_video_guid_verify=1d7d06fcd04b7d06; _clck=5lzhur|1|fkq|0; fqm_pvqid=e7316d13-c654-4f59-9ba5-eae187807884; ts_uid=6272104735; fqm_sessionid=4790d36e-2699-4690-a72e-6ea888c89094; pgv_info=ssid=s4222662588; ts_refer=www.baidu.com/link; _qpsvr_localtk=0.7570286215498101; psrf_qqrefresh_token=; wxuin=1152921505297524933; psrf_qqaccess_token=; qqmusic_key=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; wxrefresh_token=79__3nkjtVcDfdnK7LedBDP3--ijETIbmwGybvkRtzZTCNiDBzKFpkvFyBOU4ctvYvLAtDi1OBCHV8yeB1ecaINSfvXwFhXnyFrCgFiqgREn2E; euin=oK6kowEAoK4z7K-q7i4A7eEioz**; wxuin=1152921505297524933; wxopenid=opCFJw7y_btgERPX0yPfMcS_J1AQ; psrf_qqunionid=; wxunionid=oqFLxsrob2nxFDht3flgijhcCN5M; psrf_qqopenid=; tmeLoginType=1; qm_keyst=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; qm_keyst=W_X_63B0aFMliBR_--6KlXDCq6A3Sl32nZujQQYhl3t_-jwAaW4YJ1DW8Ol07IDPVTdAa0s6Ux1Yz1RXruEc; login_type=2; ts_last=y.qq.com/n/ryqq/toplist/4',
    'Referer':'https://y.qq.com/portal/wx_redirect.html?login_type=2&surl=https://y.qq.com/&code=081UD7000R8xXR1hnO100j2BOo2UD70C&state=STATE',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}

# ---- Part 1: fetch chart pages chosen interactively by the user ----
while True:
    print('内地榜-->5 香港榜-->59 台湾榜-->61 欧美榜-->3 韩国榜-->16 日本榜-->17')
    urltail = input('请输入要爬取的网页编号(输入0退出):')

    # Check the quit sentinel BEFORE counting.  The original compared the
    # string input against the int 0 (always true), so ``count`` was bumped
    # even when the user quit, and the off-by-one was silently compensated
    # later with ``(count - 1) * 20``; both defects are fixed here.
    if urltail == '0':
        print("退出程序。")
        break

    url = baseurl + urltail
    req = urllib.request.Request(url, headers=header)
    try:
        # Context manager guarantees the HTTP connection is closed.
        with urllib.request.urlopen(req) as response:
            # Append this page's HTML to the session-wide buffer.
            all_data += response.read().decode('utf-8')
        count += 1  # only count pages that were actually fetched
    except urllib.error.URLError as e:
        # Report HTTP status and/or failure reason, whichever is present.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

# ---- Part 2: parse the accumulated HTML with BeautifulSoup ----
soup = BeautifulSoup(all_data, 'html.parser')

# MV links are collected and shown separately (clicking one opens the MV
# detail page).  The original initialized ``mv_source`` twice; once is enough.
mv_source = []
for link in soup.find_all('a', class_='songlist__icon_mv'):
    mv_source.append('https://y.qq.com' + link['href'])
print(mv_source)

song_names = []
artists = []
durations = []
song_source = []

# Song names: the regex is anchored so only the exact class name matches.
# Entries containing "/" are alternate-title rows and are skipped.
for rank in soup.find_all("span", attrs={"class": re.compile("^songlist__songname_txt$")}):
    rank_string = rank.get_text()
    if "/" not in rank_string:
        song_names.append(rank_string)

# Artist names.  Do NOT filter on "/": songs with two or more artists
# legitimately contain it and would be dropped.
for rank in soup.find_all("div", attrs={"class": "songlist__artist"}):
    artists.append(rank.get_text())

# Song durations ("/" rows are layout artifacts, skip them).
for rank in soup.find_all("div", attrs={"class": "songlist__time"}):
    rank_string = rank.get_text()
    if "/" not in rank_string:
        durations.append(rank_string)

# Song detail-page links.
for rank in soup.find_all('a', class_='songlist__cover'):
    song_source.append('https://y.qq.com' + rank['href'])

# ---- Part 3: write the parsed data to an .xls workbook with xlwt ----
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('Sheet 1')

# Header row: rank / song / artist / duration / link.
worksheet.write(0, 0, '排名')
worksheet.write(0, 1, '歌曲')
worksheet.write(0, 2, '歌手')
worksheet.write(0, 3, '时长')
worksheet.write(0, 4, '歌曲链接')

# Each fetched chart page contributes 20 songs.  Clamp to the shortest
# parsed column so a page-layout change cannot raise IndexError here.
num = min(count * 20, len(song_names), len(artists), len(durations), len(song_source))
for i in range(num):
    worksheet.write(i + 1, 0, i + 1)           # rank
    worksheet.write(i + 1, 1, song_names[i])   # song name
    worksheet.write(i + 1, 2, artists[i])      # artist(s)
    worksheet.write(i + 1, 3, durations[i])    # duration
    worksheet.write(i + 1, 4, song_source[i])  # song detail link

workbook.save('qqmusic_last.xls')
|