"""Scrape album metadata from the Douban music Top 250 chart into a CSV file."""
import csv
import datetime
import re
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Column order shared by the CSV header row and every appended data row.
FIELDNAMES = ['name', 'author', 'style', 'time', 'publishers']

# Placeholder stored when a field cannot be found on an album page.
UNKNOWN = '未知'

# NOTE(review): in the reviewed text each of these literals was split across a
# physical line break (not valid Python), which suggests HTML markup such as a
# line-break tag was lost in extraction. They are kept here exactly as visible,
# i.e. "label: value" terminated by a newline -- confirm against the markup of
# a live album page before relying on them.
STYLE_PATTERN = '流派: (.*?)\n'
RELEASE_TIME_PATTERN = '发行时间: (.*?)\n'
PUBLISHERS_PATTERN = '>出版者: (.*?)\n'


def _first_match(pattern, text, default=UNKNOWN):
    """Return the stripped first capture of *pattern* in *text*, or *default*."""
    # re.S lets '.' span newlines, matching how the patterns were applied before.
    match = re.search(pattern, text, re.S)
    if match is None:
        return default
    return match.group(1).strip()


# Scrape one page of the Top 250 chart.
def get_url_music(url, filename='music.csv'):
    """Fetch one chart page and scrape every album linked from it.

    Args:
        url: Address of a chart page.
        filename: CSV file that scraped rows are appended to. This used to be
            read from a module-level global that only existed when the file
            ran as a script; the default keeps the script's previous target.
    """
    html = requests.get(url, headers=headers, timeout=100)
    # Print the HTTP status code so the operator can see whether the request
    # succeeded (e.g. 200 OK, 404 not found, 403 access denied).
    print(html.status_code)
    soup = BeautifulSoup(html.text, 'lxml')
    # Album links are the <a> elements whose class attribute is "nbg".
    for atag in soup.find_all('a', attrs={'class': 'nbg'}):
        print(datetime.datetime.now())
        print(atag['href'], atag['title'])
        get_music_info(filename, atag['href'])


# Scrape the details of a single album page.
def get_music_info(filename, url):
    """Fetch an album page, extract its metadata and append it to *filename*.

    The album name and performer are read from the parsed document; style,
    release time and publisher are pulled from the raw HTML with regular
    expressions and fall back to the UNKNOWN placeholder when absent.

    Args:
        filename: CSV file the row is appended to.
        url: Address of the album page.

    Raises:
        AttributeError: If the page lacks the expected ``wrapper`` / ``info``
            elements (``find`` returns ``None`` for them).
    """
    html = requests.get(url, headers=headers, timeout=200)
    soup = BeautifulSoup(html.text, 'lxml')

    # Album name.
    name = soup.find(attrs={'id': 'wrapper'}).h1.span.text
    print('专辑名称:{}'.format(name))

    # Performer: text of the first link inside the "info" element.
    author = soup.find(attrs={'id': 'info'}).find('a').text
    print('作者:{}'.format(author))

    # Genre.
    style = _first_match(STYLE_PATTERN, html.text)
    print('流派:{}'.format(style))

    # Release time. Previously indexed [0] unconditionally, which raised
    # IndexError for albums without this field; it now falls back like the
    # other optional fields. Named release_time so the local does not shadow
    # the imported time module.
    release_time = _first_match(RELEASE_TIME_PATTERN, html.text)
    print(release_time)

    # Publisher.
    publishers = _first_match(PUBLISHERS_PATTERN, html.text)
    print(publishers)

    # Collect everything into one record keyed by the CSV column names.
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publishers': publishers,
    }
    print(info)
    save_csv(filename, info)


# Persist one scraped record.
def save_csv(filename, info):
    """Append *info* as a single row to the CSV file *filename*.

    Args:
        filename: Path of the CSV file; opened in append mode as UTF-8.
        info: Mapping with one value per entry in FIELDNAMES.
    """
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writerow(info)


def main():
    """Write the CSV header, then crawl all ten chart pages in order."""
    # start=0, 25, ..., 225: ten pages of 25 albums each.
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
    print(urls)
    filename = 'music.csv'

    # Write and close the header before crawling: rows are appended through
    # separate file handles, so the header must be on disk first.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()

    for url in urls:
        get_url_music(url, filename)
        # Pause between chart pages to avoid hammering the site.
        time.sleep(1)


if __name__ == '__main__':
    main()