You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
3.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import datetime
# Browser-like User-Agent header so Douban serves the normal HTML pages
# instead of rejecting the script as a bot.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'),
}
# Scrape one page of the Douban music Top250 chart.
def get_url_music(url, csv_path='music.csv'):
    """Fetch one Top250 chart page and scrape every album linked on it.

    Args:
        url: URL of a single chart page (25 albums per page).
        csv_path: CSV file that album rows are appended to. Defaults to
            'music.csv' so existing callers keep their behavior; the
            original read an undeclared global ``filename``, which raised
            NameError whenever the function was used outside ``__main__``.
    """
    html = requests.get(url, headers=headers, timeout=100)
    # Print the HTTP status for quick diagnosis: 200 OK, 404 not found,
    # 403 forbidden (no permission to access the site).
    print(html.status_code)
    soup = BeautifulSoup(html.text, 'lxml')
    # Every album on the chart page is linked via an <a class="nbg"> node.
    aTags = soup.find_all('a', attrs={'class': 'nbg'})
    for atag in aTags:
        now_time = datetime.datetime.now()
        print(now_time)
        print(atag['href'], atag['title'])
        # Follow the album link and append its details to the CSV.
        get_music_info(csv_path, atag['href'])
# Scrape the details from a single album page.
def get_music_info(filename, url):
    """Scrape one Douban album page and append its details to *filename*.

    Extracts album name, performer, genre, release date and publisher.
    Optional fields that are absent from the page fall back to '未知'
    (unknown) instead of crashing.

    Args:
        filename: CSV file to append the record to (via ``save_csv``).
        url: URL of the album detail page.
    """
    html = requests.get(url, headers=headers, timeout=200)
    soup = BeautifulSoup(html.text, 'lxml')
    # Album title lives at <div id="wrapper"><h1><span>…</span></h1>.
    name = soup.find(attrs={'id': 'wrapper'}).h1.span.text
    print('专辑名称:{}'.format(name))
    # The first link inside the info block is the performer.
    author = soup.find(attrs={'id': 'info'}).find('a').text
    print('作者:{}'.format(author))
    # Genre: not every album lists one, so guard the lookup.
    style = re.findall('<span class="pl">流派:</span>&nbsp;(.*?)<br />', html.text, re.S)
    style = style[0].strip() if style else '未知'
    print('流派:{}'.format(style))
    # Release date. BUG FIX: the original indexed [0] unconditionally and
    # raised IndexError on albums without a 发行时间 field; now it falls back
    # to '未知' like the other optional fields. Also renamed the local from
    # ``time``, which shadowed the imported time module.
    release_time = re.findall('发行时间:</span>&nbsp;(.*?)<br />', html.text, re.S)
    release_time = release_time[0].strip() if release_time else '未知'
    print(release_time)
    # Publisher: equally optional.
    publishers = re.findall('>出版者:</span>&nbsp;(.*?)<br />', html.text, re.S)
    publishers = publishers[0].strip() if publishers else '未知'
    print(publishers)
    # Bundle everything into one record matching the CSV column names.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publishers': publishers
    }
    print(info)
    save_csv(filename, info)
# Persist one scraped record.
def save_csv(filename, info):
    """Append a single album record to *filename* as one CSV row.

    The file is opened in append mode, so the header written by the main
    script is left intact and records accumulate across calls.
    """
    columns = ['name', 'author', 'style', 'time', 'publishers']
    with open(filename, 'a', encoding='utf-8', newline='') as out_file:
        csv.DictWriter(out_file, fieldnames=columns).writerow(info)
if __name__ == '__main__':
    # Douban paginates the Top 250 chart 25 albums per page: start=0..225.
    urls = ['https://music.douban.com/top250?start={}'.format(i) for i in range(0, 250, 25)]
    print(urls)
    filename = 'music.csv'
    # Create the output file fresh and write the header row exactly once;
    # save_csv later appends the data rows beneath it.
    fieldnames = ['name', 'author', 'style', 'time', 'publishers']
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        csv.DictWriter(f, fieldnames=fieldnames).writeheader()
    for url in urls:
        get_url_music(url)
        time.sleep(1)  # pause between pages to avoid hammering the server