|
|
# @ -0,0 +1,80 @@  -- stray diff hunk header from a patch export; commented out so the module parses
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HTTP request headers sent with every fetch: spoof a desktop Chrome
# User-Agent so Douban serves the normal page instead of blocking the bot.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/55.0.2883.87 Safari/537.36')
}
|
|
|
|
|
|
|
|
# Scrape one page of the Douban music Top-250 chart.
def get_url_music(url, filename='music.csv'):
    """Fetch one Top-250 listing page and scrape every album linked from it.

    Args:
        url: URL of one Top-250 listing page (25 albums per page).
        filename: CSV file each album record is appended to.  The original
            code read a module-level global ``filename`` (NameError when
            called standalone); the default matches the value the
            ``__main__`` driver uses, so existing callers are unaffected.
    """
    html = requests.get(url, headers=headers, timeout=100)
    # HTTP status: 200 = OK, 404 = not found, 403 = access denied.
    print(html.status_code)
    soup = BeautifulSoup(html.text, 'lxml')
    # Each album entry on the listing page is an <a class="nbg"> element
    # whose href points at the album's detail page.
    aTags = soup.find_all('a', attrs={'class': 'nbg'})
    for atag in aTags:
        now_time = datetime.datetime.now()
        print(now_time)
        print(atag['href'], atag['title'])
        # Follow the link and scrape the album's detail page.
        get_music_info(filename, atag['href'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _field_or_unknown(page_text, pattern):
    # Pull the first regex capture out of the raw page HTML.  Several info
    # fields are optional on Douban album pages, so fall back to '未知'
    # ("unknown") when the pattern does not match.
    matches = re.findall(pattern, page_text, re.S)
    return matches[0].strip() if matches else '未知'


# Scrape the info section of one album detail page.
def get_music_info(filename, url):
    """Fetch one album detail page, print its metadata, and append it to CSV.

    Args:
        filename: path of the CSV file the record is appended to.
        url: URL of the album's Douban detail page.
    """
    html = requests.get(url, headers=headers, timeout=200)
    soup = BeautifulSoup(html.text, 'lxml')

    # Album title: <div id="wrapper"><h1><span>title</span></h1>.
    name = soup.find(attrs={'id': 'wrapper'}).h1.span.text
    print('专辑名称:{}'.format(name))

    # Performer: first link inside the <div id="info"> block.
    author = soup.find(attrs={'id': 'info'}).find('a').text
    print('作者:{}'.format(author))

    # Genre (optional field on the page).
    style = _field_or_unknown(html.text, '<span class="pl">流派:</span> (.*?)<br />')
    print('流派:{}'.format(style))

    # Release date.  The original code indexed findall(...)[0] without a
    # guard and raised IndexError on pages missing the field; treat it like
    # the other optional fields instead.  (Also renamed from ``time`` to
    # avoid shadowing the imported ``time`` module.)
    release_time = _field_or_unknown(html.text, '发行时间:</span> (.*?)<br />')
    print(release_time)

    # Publisher (optional field on the page).
    publishers = _field_or_unknown(html.text, '>出版者:</span> (.*?)<br />')
    print(publishers)

    # Bundle everything into one record; keys match the CSV header columns.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publishers': publishers,
    }
    print(info)
    save_csv(filename, info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Persist one scraped record to the results file.
def save_csv(filename, info):
    """Append *info* as a single row to the CSV file *filename*.

    The file is opened in append mode; the header row is expected to have
    been written already by the caller (the ``__main__`` driver does this).
    """
    columns = ['name', 'author', 'style', 'time', 'publishers']
    with open(filename, 'a', encoding='utf-8', newline='') as out:
        csv.DictWriter(out, fieldnames=columns).writerow(info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # The Top-250 chart is paginated 25 albums per page: start=0, 25, ..., 225.
    urls = [f'https://music.douban.com/top250?start={i}' for i in range(0, 250, 25)]
    print(urls)

    filename = 'music.csv'
    fieldnames = ['name', 'author', 'style', 'time', 'publishers']

    # Start a fresh CSV and write the header row once; each scraped album
    # is appended afterwards by save_csv().
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        csv.DictWriter(f, fieldnames=fieldnames).writeheader()

    for url in urls:
        get_url_music(url)
        time.sleep(1)  # pause between listing pages to go easy on the server
|