Compare commits

..

24 Commits

Binary file not shown.

@ -4,9 +4,9 @@ import matplotlib.pyplot as plt
import time
def data_visualization_of_top_10_of_ea_song_comment():
def image_comment_top_10():
"""网易云音乐欧美歌单评论 TOP10"""
df = pd.read_csv('./music_data/music_detail.csv', header=None)
df = pd.read_csv('./data/detail.csv', header=None)
print("正在生成网易云音乐欧美歌单评论 TOP10 图片...")
@ -62,17 +62,17 @@ def data_visualization_of_top_10_of_ea_song_comment():
lines.yaxis.set_ticks_position('none')
# 绘制柱状图,设置柱状图颜色
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(160/255, 102/255, 50/255))
ax.set_title('网易云音乐欧美歌单评论 TOP10', fontsize=18, fontweight='light')
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(10/255, 70/255, 100/255))
ax.set_title('**【网易云音乐欧美歌单评论 TOP10】**', fontsize=18, fontweight='light')
# 添加歌单评论数量文本
for x, y in enumerate(data.values):
plt.text(y+200, x-0.08, '%s' % y, ha='center')
plt.text(y+200, x-0.08, '%s' % y, ha='left')
# 保存图片
plt.savefig('./music_image/top_10_of_ea_song_comment.png', dpi=None)
plt.savefig('./image/comment_top_10.png', dpi=None)
# 显示图片
plt.show()
print("\n已生成网易云音乐欧美歌单评论 TOP10 图片,保存至 music_image/top_10_of_ea_song_comment.png")
print("\n已生成网易云音乐欧美歌单评论 TOP10 图片,保存至 image/comment_top_10.png")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,16 +1,9 @@
import os
#from music_index import get_data_of_music_list_index_page
#from music_detail import get_data_of_music_list_detail_page
#from top_10_song import data_visualization_of_top_10_song
# from top_10_song_up import data_visualization_of_top_10_song_up
from top_10_ea_song_playlists import data_visualization_of_top_10_ea_song_playlists
#from top_10_of_ea_song_collection import data_visualization_of_top_10_of_ea_song_collection
from top_10_of_ea_song_comment import data_visualization_of_top_10_of_ea_song_comment
#from top_10_ea_song_collection_distribution import data_visualization_of_top_10_ea_song_collection_distribution
#from top_10_ea_song_playlists_distribution import data_visualization_of_top_10_ea_song_playlists_distribution
from label_ea_song import data_visualization_of_label_ea_song
from music_wordcloud import data_visualization_of_music_wordcloud
from playlists_top_10 import image_playlists_top_10
from comment_top_10 import image_comment_top_10
from label import image_label
from lable_wordcloud import image_wordcloud
def menu():
@ -36,23 +29,23 @@ def key_down():
exit(0)
elif option == 'a' or option == 'A':
# 生成网易云音乐欧美歌单播放 TOP10 图片
data_visualization_of_top_10_ea_song_playlists()
image_playlists_top_10()
return
elif option == 'b' or option == 'B':
# 生成网易云音乐欧美歌单评论 TOP10 图片
data_visualization_of_top_10_of_ea_sBong_comment()
image_comment_top_10()
return
elif option == 'c' or option == 'C':
# 生成网易云音乐欧美歌单标签图片
data_visualization_of_label_ea_song()
image_label()
return
elif option == 'd' or option == 'D':
# 生成歌单介绍词云图片
data_visualization_of_music_wordcloud()
image_wordcloud()
return
else:

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 223 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

@ -5,9 +5,9 @@ import matplotlib.pyplot as plt
import time
def data_visualization_of_label_ea_song():
def image_label():
"""网易云音乐欧美歌单标签图"""
df = pd.read_csv('./music_data/music_detail.csv', header=None)
df = pd.read_csv('./data/detail.csv', header=None)
print("正在生成网易云音乐欧美歌单标签图片...")
@ -60,8 +60,8 @@ def data_visualization_of_label_ea_song():
income = df1['num'][:10]
# 绘图 details
colors = ['#993333', '#CC9966', '#333333', '#663366', '#003366', '#009966', '#FF6600', '#FF0033', '#009999',
'#333366']
colors = ['#EB3324', '#F08784', '#7E84F7', '#732BF5','#367E7F', '#7F82BB','#75FA8D' ,
'#3282F6', '#16417C', '#808080']#各个标签的RGB色彩
plot = squarify.plot(sizes=income, label=name, color=colors, alpha=1, value=income, edgecolor='white',
linewidth=1.5)
@ -71,10 +71,10 @@ def data_visualization_of_label_ea_song():
plt.rcParams['axes.unicode_minus'] = False
# 设置标签大小为 1
plt.rc('font', size=6)
plt.rc('font', size=8)
# 设置标题大小
plot.set_title('网易云音乐欧美歌单标签图', fontsize=13, fontweight='light')
plot.set_title('**【网易云音乐欧美歌单标签图】**', fontsize=15, fontweight='light')
# 除坐标轴
plt.axis('off')
@ -83,12 +83,12 @@ def data_visualization_of_label_ea_song():
plt.tick_params(top=False, right=False)
# 保存图片
plt.savefig('./music_image/label_ea_song.png', dpi=None)
plt.savefig('./image/label.png', dpi=None)
# 显示图片
plt.show()
print("\n已生成网易云音乐欧美歌单标签图片,保存至 music_image/label_ea_song.png")
print("\n已生成网易云音乐欧美歌单标签图片,保存至 image/label.png")

@ -6,9 +6,9 @@ import jieba
import time
def data_visualization_of_music_wordcloud():
def image_wordcloud():
"""歌单介绍词云图"""
df = pd.read_csv('./music_data/music_detail.csv', header=None)
df = pd.read_csv('./data/detail.csv', header=None)
text = ''
print("正在生成歌单介绍词云图片...")
@ -30,7 +30,7 @@ def data_visualization_of_music_wordcloud():
for line in df[2]:
text += ' '.join(jieba.cut(line, cut_all=False))
background_image = plt.imread('./music_image/background_image.jpg')
background_image = plt.imread('./image/background_image.jpg')
stopwords = set('')
stopwords.update(
@ -63,10 +63,10 @@ def data_visualization_of_music_wordcloud():
plt.axis('off')
# 保存图片
wc.to_file("./music_image/music_wordcloud.png")
wc.to_file("./image/music_wordcloud.png")
# 显示图片
plt.show()
print("\n已生成歌单介绍词云图片,保存至 music_image/music_wordcloud.png")
print("\n已生成歌单介绍词云图片,保存至 image/music_wordcloud.png")

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class MusiclistdemoItem(scrapy.Item):
    """Item container for scraped playlist data.

    No fields are declared yet; add them as ``scrapy.Field()``
    class attributes once structured items are needed, e.g.::

        title = scrapy.Field()
    """

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class MusiclistdemoSpiderMiddleware:
    """Default spider middleware generated by ``scrapy startproject``.

    Every hook here is optional: Scrapy treats a missing hook as a
    pass-through, so each method below simply forwards its input
    unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware and wire up signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returning None continues processing; raising aborts it.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over everything the spider produced for this response;
        # forward each request/item unchanged.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Invoked when the spider (or an earlier middleware's
        # process_spider_input) raises; None lets the error propagate.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the crawl's initial
        # requests, which have no associated response. Must yield
        # only requests (never items).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
class MusiclistdemoDownloaderMiddleware:
    """Default downloader middleware generated by ``scrapy startproject``.

    All hooks are optional; an omitted hook behaves as a pass-through,
    and every method below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware and subscribe to the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Runs for each outgoing request. None means "keep going";
        # returning a Response/Request short-circuits the chain, and
        # raising IgnoreRequest hands off to process_exception hooks.
        return None

    def process_response(self, request, response, spider):
        # Runs for each downloaded response; must return a Response,
        # a Request, or raise IgnoreRequest. Pass it through untouched.
        return response

    def process_exception(self, request, exception, spider):
        # Runs when the download handler (or an earlier
        # process_request) raises. None continues Scrapy's default
        # exception handling; a Response/Request would stop the chain.
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class musicListdemoPipeline(object):
    """No-op item pipeline: hands every item on to the next stage unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform or filter here; forward the item as-is.
        return item
class musicListdemoInfoPipeline(object):
    """Item pipeline that dumps every scraped item to musicListdemoInfo.txt.

    Each item is written as the repr of its dict form, one per line,
    so the crawl output can be inspected later.
    """

    def open_spider(self, spider):
        # Open once per crawl. Explicit UTF-8: the scraped text is
        # Chinese, and the platform default (e.g. cp1252 on Windows)
        # would raise UnicodeEncodeError on every write.
        self.f = open('musicListdemoInfo.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except Exception:
            # Deliberate best-effort: one unserializable item must not
            # kill the crawl. Narrowed from a bare `except:`, which also
            # swallowed KeyboardInterrupt/SystemExit.
            pass
        return item

@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for the musicListdemo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'musicListdemo'
SPIDER_MODULES = ['musicListdemo.spiders']
NEWSPIDER_MODULE = 'musicListdemo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'musicListdemo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'musicListdemo.middlewares.MusiclistdemoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'musicListdemo.middlewares.MusiclistdemoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# musicListdemoInfoPipeline dumps each scraped item to musicListdemoInfo.txt.
ITEM_PIPELINES = {
    'musicListdemo.pipelines.musicListdemoInfoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,75 @@
import scrapy
import pandas as pd
# music_detail.py
# Absolute paths of the crawl's input/output CSVs: INDEX holds the
# playlist URL paths gathered by the music_index spider; DETAIL is
# where this spider appends one row of playlist metadata per page.
# NOTE(review): hard-coded to one user's Windows desktop — consider
# making these relative or configurable.
INDEX = "C:\\Users\\aaa\\Desktop\\spiders163\\data\\url.csv"
DETAIL = "C:\\Users\\aaa\\Desktop\\spiders163\\data\\detail.csv"
#scrapy crawl music_index
class MusicIndexSpider(scrapy.Spider):
    """Crawls each playlist detail page listed in INDEX and appends
    title/tags/description/collection/play/song/comment data to DETAIL,
    one CSV row per playlist.
    """

    name = "music_detail"
    redis_key = 'music_detail'

    def start_requests(self):
        # INDEX holds one playlist URL path per row (no header line).
        df = pd.read_csv(INDEX, header=None, names=['url', 'title', 'play', 'user'])
        headers_chrome = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/120.0.0.0 Safari/537.36'
        }
        for path in df['url']:
            yield scrapy.Request('https://music.163.com' + path, headers=headers_chrome)

    def parse(self, response):
        # Playlist title; commas replaced so the CSV stays one field per column.
        title = response.css('h2::text')[0].get().replace(',', ' ')
        # Playlist tags, joined with '-' (empty string when there are none;
        # join also covers the old 0/1/many if-elif chain).
        tags = [p.get() for p in response.css('.u-tag i::text')]
        tag = '-'.join(tags)
        # Playlist description (the element is optional on the page).
        if response.css('#album-desc-more::text'):
            text = response.css('#album-desc-more::text')[0].get().replace('\n', '').replace(',', ' ')
        else:
            text = ''
        # Collection (favourites) count, stripped of its parentheses.
        collection = response.css('#content-operation i::text')[1].get().replace('(', '').replace(')', '')
        # Play count.
        play = response.css('.s-fc6::text')[0].get()
        # Number of songs in the playlist.
        songs = response.css('#playlist-track-count::text')[0].get()
        # Comment count.
        comments = response.css('#cnt_comment_count::text')[0].get()
        # Single-line progress output (was printed twice; once is enough).
        print('\r', title, tag, text, collection, play, songs, comments, end='', flush=True)
        # Append one CSV row for this playlist.
        with open(DETAIL, 'a+', encoding='utf-8-sig') as f:
            f.write(title + ',' + tag + ',' + text + ',' + collection + ',' + play + ',' + songs + ',' + comments + '\n')
        print("\n已获取歌单的信息,保存至 data/detail.csv")

@ -0,0 +1,38 @@
import scrapy
import time
import pandas as pd
# music_name.py
# Absolute paths: INDEX is the crawl input (playlist URL paths, one per
# row); NAME is where this spider appends the song names it finds.
# NOTE(review): hard-coded to one user's Windows desktop — consider
# making these relative or configurable.
INDEX = "C:\\Users\\aaa\\Desktop\\spiders163\\data\\url.csv"
NAME = "C:\\Users\\aaa\\Desktop\\spiders163\\data\\name.csv"
class MusicIndexSpider(scrapy.Spider):
    """Crawls each playlist page from INDEX (mobile layout) and appends
    every song name found to NAME, one name per row with commas stripped.
    """

    name = "music_name"

    def start_requests(self):
        # INDEX holds one playlist URL path per row (no header line).
        # (Dropped an unused `urls = []` local from the original.)
        df = pd.read_csv(INDEX, header=None, names=['url', 'title', 'play', 'user'])
        headers_iphone = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25'
        }
        for path in df['url']:
            yield scrapy.Request('https://music.163.com' + path, headers=headers_iphone)

    def parse(self, response):
        # Mobile page layout: song titles sit in `.sgchfl .f-thide.sgtl`.
        song_nodes = response.css('.sgchfl .f-thide.sgtl::text')
        # Open the output once per page instead of reopening it per song.
        with open(NAME, 'a+', encoding='utf-8-sig') as f:
            for node in song_nodes:
                song = node.get()
                print(song)
                f.write(song.replace(",", " ") + '\n')
        print("\n已获取歌单详情页的信息,保存至 data/name.csv")

@ -0,0 +1,40 @@
import scrapy
# music_url.py
# Output CSV that collects one playlist detail-page URL path per line.
# NOTE(review): hard-coded to one user's Windows desktop — consider
# making this relative or configurable.
INDEX = "C:\\Users\\aaa\\Desktop\\spiders163\\data\\url.csv"
class MusicIndexSpider(scrapy.Spider):
    """Walks the hot-playlist index pages on music.163.com and appends
    every playlist detail-page URL path to INDEX, one per line.
    """

    name = "music_index"

    def start_requests(self):
        # Hoisted out of the loop: the UA never changes per request.
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        # The index is paginated 35 playlists at a time.
        for offset in range(35, 1505, 35):
            url = f'https://music.163.com/discover/playlist/?cat=80后&order=hot&limit=35&offset={offset}'
            yield scrapy.Request(url, headers={'User-Agent': user_agent})

    def parse(self, response):
        # Each playlist link lives in a `.dec a` anchor.
        anchors = response.css('.dec a')
        # Open the output once per page instead of reopening it per link.
        # (`anchor` also avoids shadowing the builtin `id`.)
        with open(INDEX, 'a+', encoding='utf-8') as f:
            for anchor in anchors:
                href = anchor.attrib['href']
                # Single-line progress output.
                print('\r', href, end='', flush=True)
                f.write(href + '\n')
        # Fixed: message used to claim data/list.csv, but INDEX writes url.csv.
        print("\n已获取歌单索引页的信息,保存至 data/url.csv")

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = musicListdemo.settings
[deploy]
#url = http://localhost:6800/
project = musicListdemo

@ -4,9 +4,9 @@ import matplotlib.pyplot as plt
import time
def data_visualization_of_top_10_ea_song_playlists():
def image_playlists_top_10():
"""网易云音乐欧美歌单播放 TOP10"""
df = pd.read_csv('./music_data/music_detail.csv', header=None)
df = pd.read_csv('./data/detail.csv', header=None)
df['play'] = df[4]
print("正在生成网易云音乐欧美歌单播放 TOP10 图片...")
@ -60,20 +60,20 @@ def data_visualization_of_top_10_ea_song_playlists():
lines.yaxis.set_ticks_position('none')
# 绘制柱状图,设置柱状图颜色
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(136/255, 43/255, 48/255))
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(135/255, 43/255, 135/255))
# 添加标题,设置字体属性
ax.set_title('网易云音乐欧美歌单播放 TOP10', fontsize=18, fontweight='light')
ax.set_title('**【网易云音乐欧美歌单播放 TOP10】**', fontsize=18, fontweight='light')
# 添加歌单收藏数量文本
for x, y in enumerate(data.values):
num = str(int(y / 10000))
plt.text(y+1800000, x-0.08, '%s' % (num + ''), ha='center')
plt.text(y+1800000, x-0.08, '%s' % (num + ''), ha='left')
# 保存图片
plt.savefig('./music_image/top_10_ea_song_playlists.png', dpi=None)
plt.savefig('./image/playlists_top_10.png', dpi=None)
# 显示图片
plt.show()
print("\n已生成网易云音乐欧美歌单播放 TOP10 图片,保存至 music_image/top_10_ea_song_playlists.png")
print("\n已生成网易云音乐欧美歌单播放 TOP10 图片,保存至 image/playlists_top_10.png")
Loading…
Cancel
Save