import re
import pandas
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib import font_manager

# Sentinel string returned by get_html() when the page cannot be fetched.
FETCH_ERROR = '错误'

# First decimal number in a string, e.g. "12.3" in "12.3亿".
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``FETCH_ERROR`` ('错误') when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    except requests.RequestException:
        # Only network/HTTP failures are turned into the sentinel;
        # KeyboardInterrupt and programming errors still propagate.
        return FETCH_ERROR
    r.encoding = r.apparent_encoding  # decode with the detected charset
    return r.text


def _to_wan(text):
    """Convert a count such as '1.2亿', '345.6万' or '7890' to units of 万.

    The result is rounded to one decimal place so that raw counts and
    unit-scaled values share the same precision.

    Raises:
        ValueError: if *text* contains no number.
    """
    match = _NUMBER_RE.search(text)
    if match is None:
        raise ValueError(f'no number found in {text!r}')
    value = float(match.group())
    if '亿' in text:
        value *= 10000  # 1 亿 == 10000 万
    elif '万' not in text:
        value /= 10000  # plain count -> 万
    return round(value, 1)


def save(html):
    """Parse the ranking page, dump its text and export the data to Excel.

    Side effects: overwrites '1.txt' with the page's plain text, writes
    'Dongman.xlsx', and prints every extracted list.

    Args:
        html: Page markup as returned by ``get_html``.

    Returns:
        A tuple ``(name, bfl, pls, scs, TScore)``: titles, play counts,
        comment counts and favorite counts (all three in 万), and the
        overall scores.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # 'w' creates the file when missing and truncates stale content;
    # the previous 'r+' mode did neither.
    with open('1.txt', 'w', encoding='UTF-8') as f:
        f.write(soup.text)

    # Titles
    name = [str(tag.a.string) for tag in soup.find_all('div', class_='info')]
    print(name)

    # Play / comment / favorite counts, gathered in a single pass over the
    # 'detail' blocks.
    bfl = []  # play counts
    pls = []  # comment counts
    scs = []  # favorite counts
    for tag in soup.find_all('div', class_='detail'):
        play = tag.find('span', class_='data-box')
        # Each hop advances two siblings, exactly as the original navigation
        # did — presumably to skip a text node between the elements;
        # TODO confirm against the live markup.
        review = play.next_sibling.next_sibling
        favorite = review.next_sibling.next_sibling
        bfl.append(_to_wan(play.get_text()))
        pls.append(_to_wan(review.get_text()))
        scs.append(_to_wan(favorite.get_text()))
    print(bfl)
    print(pls)
    print(scs)

    # Overall score
    TScore = [int(tag.find('div').get_text())
              for tag in soup.find_all('div', class_='pts')]
    print('综合评分', TScore)

    # Export to an Excel sheet
    info = {'动漫名': name, '播放量(万)': bfl, '评论数(万)': pls,
            '收藏数(万)': scs, '综合评分': TScore}
    dm_file = pandas.DataFrame(info)
    dm_file.to_excel('Dongman.xlsx', sheet_name="动漫数据分析")

    return name, bfl, pls, scs, TScore


def view(info, out_path=r'E:1.png'):
    """Draw overall score (bars) against play count (line) and save the chart.

    Args:
        info: The tuple returned by ``save``; items 0, 1 and 4 (titles,
            play counts, overall scores) are used.
        out_path: Destination image file. Defaults to the originally
            hard-coded location.
    """
    # Font file used for the chart title.
    my_font = font_manager.FontProperties(fname='./data/STHeiti Medium.ttc')
    dm_name = info[0]       # titles
    dm_play = info[1]       # play counts
    dm_com_score = info[4]  # overall scores

    # Font settings so the axes can render Chinese text and minus signs.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Overall score as bars on the left axis.
    fig, ax1 = plt.subplots()
    ax1.bar(dm_name, dm_com_score, color='red')
    ax1.set_title('综合评分和播放量数据分析', fontproperties=my_font)
    ax1.tick_params(labelsize=6)
    ax1.set_xlabel('番剧名')
    ax1.set_ylabel('综合评分')
    # ax1 is still the current axes here, so this styles its x tick labels.
    plt.xticks(rotation=90, color='green')

    # Play count as a line on a second y axis sharing the same x axis.
    ax2 = ax1.twinx()
    ax2.plot(dm_play, color='cyan')
    ax2.set_ylabel('播放量')

    # Single-point plots that exist only to create the two legend entries.
    ax2.plot(1, label='综合评分', color="red", linewidth=5.0)
    ax2.plot(1, label='播放量', color="cyan", linewidth=1.0, linestyle="-")
    ax2.legend()

    fig.savefig(out_path, dpi=1000, bbox_inches='tight')


def main():
    """Fetch the bangumi ranking page, export the data and draw the chart."""
    url = 'https://www.bilibili.com/v/popular/rank/bangumi'
    html = get_html(url)
    if html == FETCH_ERROR:
        # Do not parse the error sentinel as if it were the page.
        print('获取网页失败:', url)
        return
    info = save(html)
    view(info)


if __name__ == '__main__':
    main()