import requests
from bs4 import BeautifulSoup
import csv
import re
import time
import datetime

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


# Scrape one page of the Top 250 chart and collect the album links on it
def get_url_music(url):
    html = requests.get(url, headers=headers, timeout=100)
    # Print the HTTP status code to check the request: 200 means success,
    # 404 means the page was not found, 403 means access to the site is forbidden
    print(html.status_code)
    soup = BeautifulSoup(html.text, 'lxml')
    # Each album link on the chart page is an <a> tag whose class attribute is "nbg"
    aTags = soup.find_all('a', attrs={'class': 'nbg'})
    for atag in aTags:
        now_time = datetime.datetime.now()
        print(now_time)
        print(atag['href'], atag['title'])
        # Call get_music_info with the output file name (module-level) and the album link
        get_music_info(filename, atag['href'])


# Scrape the details from a single album page
def get_music_info(filename, url):
    html = requests.get(url, headers=headers, timeout=200)
    soup = BeautifulSoup(html.text, 'lxml')
    # Album title
    name = soup.find(attrs={'id': 'wrapper'}).h1.span.text
    print('Album: {}'.format(name))
    # Performer: the first link inside the info block
    author = soup.find(attrs={'id': 'info'}).find('a').text
    print('Artist: {}'.format(author))
    # Genre (流派), taken from the raw HTML up to the following <br> tag
    style = re.findall('流派: (.*?)<br', html.text, re.S)
    if len(style) == 0:
        style = '未知'  # '未知' means "unknown"
    else:
        style = style[0].strip()
    print('Genre: {}'.format(style))
    # Release date (发行时间); renamed from "time" so the time module is not shadowed
    release_time = re.findall('发行时间: (.*?)<br', html.text, re.S)[0].strip()
    print(release_time)
    # Publisher (出版者)
    publishers = re.findall('>出版者: (.*?)<br', html.text, re.S)
    if len(publishers) == 0:
        publishers = '未知'
    else:
        publishers = publishers[0].strip()
    print(publishers)
    # Collect everything into one record and write it to the CSV file
    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publishers': publishers
    }
    print(info)
    save_csv(filename, info)


# Append one album record to the CSV file
def save_csv(filename, info):
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        fieldnames = ['name', 'author', 'style', 'time', 'publishers']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerow(info)


if __name__ == '__main__':
    # The chart is paginated 25 albums per page: start=0, 25, ..., 225
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
    print(urls)
    filename = 'music.csv'
    # Write the header row once, then append one row per album via save_csv
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        fieldnames = ['name', 'author', 'style', 'time', 'publishers']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
    for url in urls:
        get_url_music(url)
        time.sleep(1)  # pause a second between chart pages to be polite to the server
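

# ---------------------------------------------------------------------------
# Optional sketch, not part of the original script: a quick sanity check of the
# scraped output. It only assumes the 'music.csv' file and the field names
# written in the main block above; the helper name preview_csv is hypothetical
# and is never called by the scraper itself.
def preview_csv(path='music.csv', limit=5):
    # Re-read the CSV with csv.DictReader and print the first few records
    with open(path, encoding='utf-8', newline='') as f:
        for i, row in enumerate(csv.DictReader(f)):
            if i >= limit:
                break
            print(row)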