You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

252 lines
15 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['盾之勇者成名录 第二季', '约会大作战 第四季', '恋爱游戏世界对路人角色很不友好', '测不准的阿波连同学', '鬼灭之刃 游郭篇', 'RPG不动产', '恋爱要在世界征服后', '夏日幻魂', '处刑少女的生存之道', '理科生坠入情网,故尝试证明。 第二季', '鬼灭之刃 无限列车篇 中配版', '擅长捉弄的高木同学 第三季', '盾之勇者成名录', '国王排名', 'JOJO的奇妙冒险 石之海', '现实主义勇者的王国再建记', '与变成了异世界美少女的好友一起冒险', '咒术回战', '上班族想被治愈', '黑色四叶草', '天才王子的赤字国家振兴术', '明日酱的水手服', '瓦尼塔斯的手记', '鬼灭之刃', '指名!', '博人传 火影忍者新时代', '通灵王', '失格纹的最强贤者~世界最强的贤者为了变得更强而转生了~', '小林家的龙女仆', 'JOJO的奇妙冒险 黄金之风', '名侦探柯南', '关于我转生变成史莱姆这档事 第二季', 'OVERLORD', '鬼灭之刃 无限列车篇', '四月是你的谎言', 'OVERLORD Ⅲ', '白领羽球部', '堀与宫村', 'OVERLORD Ⅱ', '秘密内幕 女警的反击', '女孩的钓鱼慢活', '关于我转生变成史莱姆这档事', '里亚德录大地', '不能播放', '自称贤者弟子的贤者', '半妖的夜叉姬 第二季']\n",
"[2.0, 2.0, 2.0, 2.0, 5.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 12.0, 25.0, 23.0, 12.0, 26.0, 12.0, 24.0, 2.0, 170.0, 12.0, 3.0, 20.0, 26.0, 13.0, 235.0, 45.0, 12.0, 14.0, 39.0, 1095.0, 24.0, 13.0, 7.0, 22.0, 13.0, 7.0, 13.0, 13.0, 13.0, 10.0, 29.0, 12.0, 5.0, 12.0, 18.0]\n",
"[1437.2, 984.1, 364.0, 1428.1, 9039.5, 217.4, 277.1, 174.9, 466.2, 728.8, 516.5, 4911.9, 2.7, 4.7, 1.1, 6736.0, 4728.3, 6.8, 32.8, 3.2, 5851.5, 755.5, 4324.0, 8.4, 431.6, 2.8, 4489.7, 4950.2, 3.2, 4.5, 5.5, 3.7, 4.0, 1.4, 1.6, 3.8, 265.4, 2.1, 3.7, 1350.8, 779.2, 4.2, 5511.5, 57.3, 3086.8, 979.9]\n"
]
},
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'next_sibling'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 184\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 185\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'__main__'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 186\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 179\u001b[0m \u001b[0mhtml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_html\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 获取返回值\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;31m# print(html)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 181\u001b[1;33m \u001b[0minfo\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 182\u001b[0m \u001b[0mview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 183\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36msave\u001b[1;34m(html)\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;31m# ******************************************** 收藏数\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtag\u001b[0m \u001b[1;32min\u001b[0m \u001b[0msoup\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'div'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'detail'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 64\u001b[1;33m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtag\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'span'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'data-box'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 65\u001b[0m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'\\d*(\\.)?\\d'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[0mscs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'next_sibling'"
]
}
],
"source": [
"import re\n",
"import pandas\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import font_manager\n",
"\n",
"\n",
"def get_html(url, timeout=10):\n",
"    \"\"\"Download a page and return its body as text.\n",
"\n",
"    Args:\n",
"        url: Address of the page to fetch.\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Returns:\n",
"        The decoded page text, or the sentinel string '错误' when the request\n",
"        fails (connection problem, timeout, or an HTTP error status).\n",
"    \"\"\"\n",
"    try:\n",
"        # Without a timeout a stalled server would block the notebook forever.\n",
"        response = requests.get(url, timeout=timeout)\n",
"        response.raise_for_status()  # raise on 4xx/5xx responses\n",
"        # Let requests guess the charset from the body instead of the headers.\n",
"        response.encoding = response.apparent_encoding\n",
"        return response.text\n",
"    except requests.RequestException:\n",
"        # Only network/HTTP failures map to the sentinel; KeyboardInterrupt and\n",
"        # programming errors are no longer swallowed by a bare except.\n",
"        return '错误'\n",
"\n",
"\n",
"def _to_wan(text):\n",
"    \"\"\"Convert a counter string to a float expressed in units of 万 (10,000).\n",
"\n",
"    A value marked with 亿 is multiplied by 10000, a value marked with 万 is\n",
"    returned as is, and a value without a unit is treated as a raw count and\n",
"    divided by 10000 (rounded to one decimal). Returns NaN when the text\n",
"    contains no number.\n",
"    \"\"\"\n",
"    match = re.search(r'\\d+(?:\\.\\d+)?', text)\n",
"    if match is None:\n",
"        return float('nan')\n",
"    value = float(match.group())\n",
"    if '亿' in text:\n",
"        return value * 10000\n",
"    if '万' in text:\n",
"        return value\n",
"    return float('%.1f' % (value / 10000))\n",
"\n",
"\n",
"def _node_text(node):\n",
"    \"\"\"Return the text of a BeautifulSoup node, or '' when the node is missing.\"\"\"\n",
"    return node.get_text() if node is not None else ''\n",
"\n",
"\n",
"def save(html):\n",
"    \"\"\"Parse the ranking page, export the data to Excel and return it.\n",
"\n",
"    Args:\n",
"        html: Page source to parse.\n",
"\n",
"    Returns:\n",
"        A tuple ``(names, plays, comments, favorites, scores)`` of lists.\n",
"        Plays, comments and favorites are floats in units of 万. A counter\n",
"        that is missing from the page is stored as NaN so that the three\n",
"        lists always have one entry per 'detail' block.\n",
"\n",
"    Side effects:\n",
"        Prints every list and writes 'Dongman.xlsx' to the working directory.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(html, 'html.parser')\n",
"\n",
"    # ---- titles\n",
"    name = []\n",
"    for tag in soup.find_all('div', class_='info'):\n",
"        link = tag.a\n",
"        # Keep a placeholder when the anchor is absent so positions stay stable.\n",
"        name.append(link.get_text(strip=True) if link is not None else '')\n",
"    print(name)\n",
"\n",
"    # ---- plays / comments / favorites, read in a single pass\n",
"    bfl = []  # plays\n",
"    pls = []  # comments\n",
"    scs = []  # favorites\n",
"    for tag in soup.find_all('div', class_='detail'):\n",
"        first = tag.find('span', class_='data-box')\n",
"        # The original walked .next_sibling twice/four times, i.e. it skipped\n",
"        # one text node per element. Asking for element siblings directly does\n",
"        # the same without failing when an entry has fewer counters.\n",
"        following = first.find_next_siblings(True) if first is not None else []\n",
"        texts = [_node_text(node) for node in [first] + list(following)[:2]]\n",
"        texts += [''] * (3 - len(texts))\n",
"        bfl.append(_to_wan(texts[0]))\n",
"        pls.append(_to_wan(texts[1]))\n",
"        scs.append(_to_wan(texts[2]))\n",
"    print(bfl)\n",
"    print(pls)\n",
"    print(scs)\n",
"\n",
"    # ---- overall score\n",
"    TScore = []\n",
"    for tag in soup.find_all('div', class_='pts'):\n",
"        match = re.search(r'\\d+', _node_text(tag.find('div')))\n",
"        TScore.append(int(match.group()) if match else float('nan'))\n",
"    print('综合评分', TScore)\n",
"\n",
"    # Export to an Excel sheet.\n",
"    info = {'动漫名': name, '播放量(万)': bfl, '评论数(万)': pls, '收藏数(万)': scs, '综合评分': TScore}\n",
"    dm_file = pandas.DataFrame(info)\n",
"    dm_file.to_excel('Dongman.xlsx', sheet_name='动漫数据分析')\n",
"    return name, bfl, pls, scs, TScore\n",
"\n",
"\n",
"def _bar_line_chart(names, bar_values, line_values, title, bar_label, line_label,\n",
"                    bar_color, line_color, bar_ylabel, line_ylabel, out_path, dpi,\n",
"                    title_font=None, xlabel=None):\n",
"    \"\"\"Draw bars plus a line on a secondary y-axis, save the figure and return it.\"\"\"\n",
"    fig, bar_ax = plt.subplots()\n",
"\n",
"    # Explicit integer positions keep bars and line aligned even when two\n",
"    # entries share a name (string x-values would merge them into one bar).\n",
"    positions = list(range(len(names)))\n",
"    bars = bar_ax.bar(positions, bar_values, color=bar_color)\n",
"    bar_ax.set_xticks(positions)\n",
"    bar_ax.set_xticklabels(names, rotation=90, color='green')\n",
"    bar_ax.tick_params(labelsize=6)\n",
"\n",
"    title_kwargs = {'fontproperties': title_font} if title_font is not None else {}\n",
"    bar_ax.set_title(title, **title_kwargs)\n",
"    if xlabel is not None:\n",
"        bar_ax.set_xlabel(xlabel)\n",
"    bar_ax.set_ylabel(bar_ylabel)\n",
"\n",
"    # A twin axis is required because the two series use different scales.\n",
"    line_ax = bar_ax.twinx()\n",
"    line, = line_ax.plot(list(range(len(line_values))), line_values, color=line_color)\n",
"    line_ax.set_ylabel(line_ylabel)\n",
"\n",
"    # Build the legend from the real artists instead of plotting dummy points,\n",
"    # which used to add a stray value to the secondary axis.\n",
"    line_ax.legend([bars, line], [bar_label, line_label])\n",
"\n",
"    fig.savefig(out_path, dpi=dpi, bbox_inches='tight')\n",
"    return fig\n",
"\n",
"\n",
"def view(info, save_prefix='E:', dpi=1000):\n",
"    \"\"\"Plot four bar-and-line comparison charts and save each one as a PNG.\n",
"\n",
"    Args:\n",
"        info: Sequence ``(names, plays, comments, favorites, scores)``.\n",
"        save_prefix: Text prepended to '1.png' .. '4.png' to build the output\n",
"            paths. The default reproduces the original 'E:1.png' .. 'E:4.png'.\n",
"        dpi: Resolution used when saving the figures.\n",
"\n",
"    Side effects:\n",
"        Changes the global matplotlib font settings, writes four image files\n",
"        and finally shows all figures.\n",
"    \"\"\"\n",
"    import os\n",
"\n",
"    dm_name = info[0]       # titles\n",
"    dm_play = info[1]       # plays\n",
"    dm_review = info[2]     # comments\n",
"    dm_favorite = info[3]   # favorites\n",
"    dm_com_score = info[4]  # overall scores\n",
"\n",
"    # Optional font for the first title; skipped when the file is not present\n",
"    # so that rendering does not fail on machines without it.\n",
"    font_path = './data/STHeiti Medium.ttc'\n",
"    my_font = font_manager.FontProperties(fname=font_path) if os.path.isfile(font_path) else None\n",
"\n",
"    # Font settings so that Chinese text and minus signs render on the axes.\n",
"    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
"    plt.rcParams['axes.unicode_minus'] = False\n",
"\n",
"    # One row per chart: bar data, line data, title, legend labels, colors,\n",
"    # y-axis labels, then the optional title font and x-axis label.\n",
"    charts = [\n",
"        (dm_com_score, dm_play, '综合评分和播放量数据分析', '综合评分', '播放量',\n",
"         'red', 'cyan', '综合评分', '播放量', my_font, '番剧名'),\n",
"        (dm_review, dm_favorite, '番剧评论数和收藏数分析', '评论数', '收藏数',\n",
"         'green', 'yellow', '评论数(万)', '收藏数(万)', None, None),\n",
"        (dm_com_score, dm_favorite, '综合评分和收藏数量数据分析', '综合评分', '收藏数',\n",
"         'red', 'yellow', '综合评分', '收藏数(万)', None, None),\n",
"        (dm_play, dm_review, '播放量和评论数 数据分析', '播放量', '评论数',\n",
"         'cyan', 'green', '播放量(万)', '评论数(万)', None, None),\n",
"    ]\n",
"    for number, chart in enumerate(charts, start=1):\n",
"        (bar_values, line_values, title, bar_label, line_label,\n",
"         bar_color, line_color, bar_ylabel, line_ylabel, title_font, xlabel) = chart\n",
"        _bar_line_chart(dm_name, bar_values, line_values, title, bar_label, line_label,\n",
"                        bar_color, line_color, bar_ylabel, line_ylabel,\n",
"                        '%s%d.png' % (save_prefix, number), dpi,\n",
"                        title_font=title_font, xlabel=xlabel)\n",
"\n",
"    plt.show()\n",
"\n",
"\n",
"def main():\n",
"    \"\"\"Download the bilibili bangumi ranking page, export it and draw the charts.\"\"\"\n",
"    url = 'https://www.bilibili.com/v/popular/rank/bangumi'  # ranking page\n",
"    html = get_html(url)\n",
"    if html == '错误':\n",
"        # get_html reports a failed download with this sentinel; parsing it\n",
"        # would only produce an empty spreadsheet and empty charts.\n",
"        print('网页获取失败:', url)\n",
"        return\n",
"    info = save(html)\n",
"    view(info)\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
" main()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}