Compare commits

...

7 Commits
master ... zy

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,91 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Scraper: fetch the 2020 ShanghaiRanking Chinese university ranking table\n",
"import requests\n",
"import bs4\n",
"import time\n",
"import random\n",
"import pandas as pd\n",
"\n",
"records = []  # one plain dict per university row\n",
"url = 'https://www.shanghairanking.cn/rankings/bcur/202011'\n",
"headers = {\n",
"    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}\n",
"\n",
"response = requests.get(url=url, headers=headers)\n",
"response.encoding = \"utf-8\"  # decode the body as UTF-8\n",
"html = response.text\n",
"\n",
"soup = bs4.BeautifulSoup(html, 'html.parser')\n",
"\n",
"for tr in soup.find('tbody').children:\n",
"    if isinstance(tr, bs4.element.Tag):\n",
"        tds = tr.find_all('td')  # equivalent to tds = tr('td')\n",
"        name = tds[1].text\n",
"        print(name)\n",
"        score = tds[4].text\n",
"        print(score)\n",
"        # Collect dicts and build the DataFrame once afterwards\n",
"        # (the old per-row pd.DataFrame + concat pattern is quadratic).\n",
"        records.append({'学校': name, '总分': score})\n",
"\n",
"second = random.randrange(3, 5)\n",
"time.sleep(second)  # polite delay before any further requests\n",
"\n",
"university2 = pd.DataFrame(records[0:10])  # keep only the top 10 rows\n",
"university2.to_excel('university.xlsx', index=False)\n",
"# Visualization\n",
"from pyecharts.charts import Bar  # bar chart\n",
"\n",
"# BUG FIX: the old code wrote './university.xlsx' but then read back the\n",
"# unrelated absolute path 'D:\\数据可视化\\university.xlsx' with xlrd (which\n",
"# dropped .xlsx support in xlrd >= 2.0). Plot straight from the DataFrame.\n",
"names = university2['学校'].tolist()  # x axis: school names\n",
"sources = university2['总分'].tolist()  # y axis: total scores\n",
"\n",
"bar = Bar()  # bar chart object\n",
"bar.add_xaxis(names)  # set the x axis\n",
"bar.add_yaxis('总分', sources)  # set the y axis and series name\n",
"bar.render('柱形图.html')  # write the chart to an html file"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,70 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'parsel'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-0a98fa3dc863>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mrequests\u001b[0m \u001b[1;31m#数据请求模块\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mparsel\u001b[0m \u001b[1;31m#数据解析模块\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0murl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'https://www.biqugee.com/book/12564/4856870.html'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m#返回相应数据\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'parsel'"
]
}
],
"source": [
"import requests  # HTTP client\n",
"import parsel  # HTML / CSS-selector parsing\n",
"\n",
"url = 'https://www.biqugee.com/book/12564/4856870.html'\n",
"response = requests.get(url)  # fetch the chapter page\n",
"# print(response.text)  # raw page, for debugging\n",
"# Parse: pull the chapter title and body text out with CSS selectors\n",
"selector = parsel.Selector(response.text)\n",
"title = selector.css('#wrapper > div.content_read > div > div.bookname > h1::text').get()\n",
"content_list = selector.css('#content::text').getall()\n",
"content = '\\n'.join(content_list)\n",
"print(title)\n",
"print(content)\n",
"# NOTE(review): mode 'a' appends on every re-run and the payload is plain prose,\n",
"# not CSV — confirm whether '.txt' and mode 'w' were intended.\n",
"with open(title + '.csv', mode='a', encoding='utf-8') as f:\n",
"    f.write(title + '\\n')  # BUG FIX: newline so the title doesn't run into line 1\n",
"    f.write(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['盾之勇者成名录 第二季', '约会大作战 第四季', '恋爱游戏世界对路人角色很不友好', '测不准的阿波连同学', '鬼灭之刃 游郭篇', 'RPG不动产', '恋爱要在世界征服后', '夏日幻魂', '处刑少女的生存之道', '理科生坠入情网,故尝试证明。 第二季', '鬼灭之刃 无限列车篇 中配版', '擅长捉弄的高木同学 第三季', '盾之勇者成名录', '国王排名', 'JOJO的奇妙冒险 石之海', '现实主义勇者的王国再建记', '与变成了异世界美少女的好友一起冒险', '咒术回战', '上班族想被治愈', '黑色四叶草', '天才王子的赤字国家振兴术', '明日酱的水手服', '瓦尼塔斯的手记', '鬼灭之刃', '指名!', '博人传 火影忍者新时代', '通灵王', '失格纹的最强贤者~世界最强的贤者为了变得更强而转生了~', '小林家的龙女仆', 'JOJO的奇妙冒险 黄金之风', '名侦探柯南', '关于我转生变成史莱姆这档事 第二季', 'OVERLORD', '鬼灭之刃 无限列车篇', '四月是你的谎言', 'OVERLORD Ⅲ', '白领羽球部', '堀与宫村', 'OVERLORD Ⅱ', '秘密内幕 女警的反击', '女孩的钓鱼慢活', '关于我转生变成史莱姆这档事', '里亚德录大地', '不能播放', '自称贤者弟子的贤者', '半妖的夜叉姬 第二季']\n",
"[2.0, 2.0, 2.0, 2.0, 5.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 12.0, 25.0, 23.0, 12.0, 26.0, 12.0, 24.0, 2.0, 170.0, 12.0, 3.0, 20.0, 26.0, 13.0, 235.0, 45.0, 12.0, 14.0, 39.0, 1095.0, 24.0, 13.0, 7.0, 22.0, 13.0, 7.0, 13.0, 13.0, 13.0, 10.0, 29.0, 12.0, 5.0, 12.0, 18.0]\n",
"[1437.2, 984.1, 364.0, 1428.1, 9039.5, 217.4, 277.1, 174.9, 466.2, 728.8, 516.5, 4911.9, 2.7, 4.7, 1.1, 6736.0, 4728.3, 6.8, 32.8, 3.2, 5851.5, 755.5, 4324.0, 8.4, 431.6, 2.8, 4489.7, 4950.2, 3.2, 4.5, 5.5, 3.7, 4.0, 1.4, 1.6, 3.8, 265.4, 2.1, 3.7, 1350.8, 779.2, 4.2, 5511.5, 57.3, 3086.8, 979.9]\n"
]
},
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'next_sibling'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 184\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 185\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m__name__\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'__main__'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 186\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 179\u001b[0m \u001b[0mhtml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_html\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# 获取返回值\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 180\u001b[0m \u001b[1;31m# print(html)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 181\u001b[1;33m \u001b[0minfo\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msave\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 182\u001b[0m \u001b[0mview\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minfo\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 183\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-2-e11bff66c2d5>\u001b[0m in \u001b[0;36msave\u001b[1;34m(html)\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;31m# ******************************************** 收藏数\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtag\u001b[0m \u001b[1;32min\u001b[0m \u001b[0msoup\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'div'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'detail'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 64\u001b[1;33m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtag\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'span'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclass_\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'data-box'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnext_sibling\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 65\u001b[0m \u001b[0msc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mre\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mr'\\d*(\\.)?\\d'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[0mscs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfloat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'next_sibling'"
]
}
],
"source": [
"import re\n",
"import pandas\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import font_manager\n",
"\n",
"\n",
"def get_html(url):\n",
"    \"\"\"GET *url* and return the decoded page text, or the sentinel '错误' on request failure.\"\"\"\n",
"    try:\n",
"        r = requests.get(url)\n",
"        r.raise_for_status()  # turn non-2xx responses into exceptions\n",
"        r.encoding = r.apparent_encoding  # sniff the real encoding from the body\n",
"        return r.text\n",
"    except requests.exceptions.RequestException:  # BUG FIX: bare 'except' also hid programming errors\n",
"        return '错误'\n",
"\n",
"\n",
"def save(html):\n",
"    \"\"\"Parse the Bilibili bangumi rank page, print the stats, save them to Excel.\n",
"\n",
"    Returns (name, bfl, pls, scs, TScore): titles, plays, comments,\n",
"    favourites (counts normalised to 万) and composite scores.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(html, 'html.parser')  # parse with the stdlib html.parser backend\n",
"\n",
"    TScore = []  # composite score\n",
"    name = []    # show title\n",
"    bfl = []     # play count (万)\n",
"    pls = []     # comment count (万)\n",
"    scs = []     # favourite count (万)\n",
"\n",
"    def _first_number(text):\n",
"        \"\"\"Leading decimal number in *text* as float, or None.\n",
"\n",
"        BUG FIX: the old regexes r'\\d(.\\d)?' and r'\\d*(\\.)?\\d' truncated\n",
"        multi-digit values (e.g. '12.34' -> 12.3, '12.5亿' -> 1).\n",
"        \"\"\"\n",
"        m = re.search(r'\\d+(\\.\\d+)?', text)\n",
"        return float(m.group()) if m else None\n",
"\n",
"    # ---------------- titles ----------------\n",
"    for tag in soup.find_all('div', class_='info'):\n",
"        name.append(str(tag.a.string))\n",
"    print(name)\n",
"\n",
"    # ---------------- per-card stats ----------------\n",
"    # BUG FIX: the old .next_sibling chains crashed with AttributeError when a\n",
"    # card lacked a stat span (see this cell's recorded traceback). Index the\n",
"    # data-box spans instead, and pad missing cards so all lists stay aligned.\n",
"    for tag in soup.find_all('div', class_='detail'):\n",
"        boxes = tag.find_all('span', class_='data-box')\n",
"        # assumes boxes are [plays, comments, favourites] — TODO confirm against the live markup\n",
"        if len(boxes) < 3:\n",
"            bfl.append(0.0)\n",
"            pls.append(0.0)\n",
"            scs.append(0.0)\n",
"            continue\n",
"\n",
"        bf_text = boxes[0].get_text()\n",
"        bf = _first_number(bf_text)\n",
"        if '亿' in bf_text:\n",
"            bf *= 10000  # normalise 亿 to 万\n",
"        bfl.append(float(bf))\n",
"\n",
"        pl_text = boxes[1].get_text()\n",
"        if '万' not in pl_text:\n",
"            pl = '%.1f' % (_first_number(pl_text) / 10000)  # bare count -> 万\n",
"        else:\n",
"            pl = _first_number(pl_text)\n",
"        pls.append(float(pl))\n",
"\n",
"        scs.append(float(_first_number(boxes[2].get_text())))\n",
"    print(bfl)\n",
"    print(pls)\n",
"    print(scs)\n",
"\n",
"    # ---------------- composite score ----------------\n",
"    for tag in soup.find_all('div', class_='pts'):\n",
"        zh = tag.find('div').get_text()\n",
"        TScore.append(int(zh))\n",
"    print('综合评分', TScore)\n",
"\n",
"    # Persist to Excel (pandas requires all columns to be the same length)\n",
"    info = {'动漫名': name, '播放量(万)': bfl, '评论数(万)': pls, '收藏数(万)': scs, '综合评分': TScore}\n",
"    dm_file = pandas.DataFrame(info)\n",
"    dm_file.to_excel('Dongman.xlsx', sheet_name=\"动漫数据分析\")\n",
"    # Return every list for the plotting stage\n",
"    return name, bfl, pls, scs, TScore\n",
"\n",
"\n",
"def view(info):\n",
"    \"\"\"Render four bar + twin-axis-line comparison charts from the stats tuple.\"\"\"\n",
"    # Chinese-capable font so chart titles can show CJK text\n",
"    my_font = font_manager.FontProperties(fname='./data/STHeiti Medium.ttc')\n",
"    dm_name = info[0]       # show titles\n",
"    dm_play = info[1]       # play counts\n",
"    dm_review = info[2]     # comment counts\n",
"    dm_favorite = info[3]   # favourite counts\n",
"    dm_com_score = info[4]  # composite scores\n",
"\n",
"    # Let matplotlib render CJK text on the axes\n",
"    plt.rcParams['font.sans-serif'] = ['SimHei']\n",
"    plt.rcParams['axes.unicode_minus'] = False\n",
"\n",
"    # -------- composite score (bars) vs play count (line) --------\n",
"    fig, ax1 = plt.subplots()\n",
"    plt.bar(dm_name, dm_com_score, color='red')\n",
"    plt.title('综合评分和播放量数据分析', fontproperties=my_font)\n",
"    ax1.tick_params(labelsize=6)\n",
"    plt.xlabel('番剧名')\n",
"    plt.ylabel('综合评分')\n",
"    plt.xticks(rotation=90, color='green')  # rotate tick labels so titles fit\n",
"\n",
"    ax2 = ax1.twinx()  # twin y-axis carries the line series\n",
"    ax2.plot(dm_play, color='cyan')\n",
"    plt.ylabel('播放量')\n",
"\n",
"    plt.plot(1, label='综合评分', color=\"red\", linewidth=5.0)  # legend proxy entries\n",
"    plt.plot(1, label='播放量', color=\"cyan\", linewidth=1.0, linestyle=\"-\")\n",
"    plt.legend()\n",
"\n",
"    # NOTE(review): r'E:1.png' is a drive-relative path (no separator after 'E:') — confirm intent\n",
"    plt.savefig(r'E:1.png', dpi=1000, bbox_inches='tight')\n",
"\n",
"    # plt.show()\n",
"\n",
"    # -------- comment count (bars) vs favourite count (line) --------\n",
"    fig, ax3 = plt.subplots()\n",
"    plt.bar(dm_name, dm_review, color='green')\n",
"    plt.title('番剧评论数和收藏数分析')\n",
"    plt.ylabel('评论数(万)')\n",
"    ax3.tick_params(labelsize=6)\n",
"    plt.xticks(rotation=90, color='green')\n",
"\n",
"    ax4 = ax3.twinx()\n",
"    ax4.plot(dm_favorite, color='yellow')\n",
"    plt.ylabel('收藏数(万)')\n",
"\n",
"    plt.plot(1, label='评论数', color=\"green\", linewidth=5.0)\n",
"    plt.plot(1, label='收藏数', color=\"yellow\", linewidth=1.0, linestyle=\"-\")\n",
"    plt.legend()\n",
"    plt.savefig(r'E:2.png', dpi=1000, bbox_inches='tight')\n",
"\n",
"    # -------- composite score (bars) vs favourite count (line) --------\n",
"    fig, ax5 = plt.subplots()\n",
"    plt.bar(dm_name, dm_com_score, color='red')\n",
"    plt.title('综合评分和收藏数量数据分析')\n",
"    plt.ylabel('综合评分')\n",
"    ax5.tick_params(labelsize=6)\n",
"    plt.xticks(rotation=90, color='green')\n",
"\n",
"    ax6 = ax5.twinx()\n",
"    ax6.plot(dm_favorite, color='yellow')\n",
"    plt.ylabel('收藏数(万)')\n",
"    plt.plot(1, label='综合评分', color=\"red\", linewidth=5.0)\n",
"    plt.plot(1, label='收藏数', color=\"yellow\", linewidth=1.0, linestyle=\"-\")\n",
"    plt.legend()\n",
"\n",
"    plt.savefig(r'E:3.png', dpi=1000, bbox_inches='tight')\n",
"\n",
"    # -------- play count (bars) vs comment count (line) --------\n",
"    fig, ax7 = plt.subplots()\n",
"    plt.bar(dm_name, dm_play, color='cyan')\n",
"    plt.title('播放量和评论数 数据分析')\n",
"    plt.ylabel('播放量(万)')\n",
"    ax7.tick_params(labelsize=6)\n",
"    plt.xticks(rotation=90, color='green')\n",
"\n",
"    ax8 = ax7.twinx()\n",
"    ax8.plot(dm_review, color='green')\n",
"    plt.ylabel('评论数(万)')\n",
"    plt.plot(1, label='播放量', color=\"cyan\", linewidth=5.0)\n",
"    plt.plot(1, label='评论数', color=\"green\", linewidth=1.0, linestyle=\"-\")\n",
"    plt.legend()\n",
"    plt.savefig(r'E:4.png', dpi=1000, bbox_inches='tight')\n",
"\n",
"    plt.show()\n",
"\n",
"\n",
"def main():\n",
"    \"\"\"Entry point: fetch the rank page, extract the stats, draw the charts.\"\"\"\n",
"    rank_url = 'https://www.bilibili.com/v/popular/rank/bangumi'\n",
"    page_html = get_html(rank_url)  # page source (or the sentinel '错误')\n",
"    # print(page_html)  # debug\n",
"    stats = save(page_html)\n",
"    view(stats)\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
"    main()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,240 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始下载第1首音乐勇气.mp3\n",
"\n",
"开始下载第2首音乐可惜不是你.mp3\n",
"\n",
"开始下载第3首音乐宁夏.mp3\n",
"\n",
"开始下载第4首音乐接受.mp3\n",
"\n",
"开始下载第5首音乐分手快乐.mp3\n",
"\n",
"开始下载第6首音乐明明很爱你.mp3\n",
"\n",
"开始下载第7首音乐爱久见人心.mp3\n",
"\n",
"开始下载第8首音乐夜夜夜夜.mp3\n",
"\n",
"开始下载第9首音乐可以的话.mp3\n",
"\n",
"开始下载第10首音乐丝路.mp3\n",
"\n",
"开始下载第11首音乐会过去的.mp3\n",
"\n",
"开始下载第12首音乐爱你不是两三天.mp3\n",
"\n",
"开始下载第13首音乐如果有一天.mp3\n",
"\n",
"开始下载第14首音乐偶阵雨.mp3\n",
"\n",
"开始下载第15首音乐无条件为你.mp3\n",
"\n",
"开始下载第16首音乐慢慢来比较快.mp3\n",
"\n",
"开始下载第17首音乐听不到.mp3\n",
"\n",
"开始下载第18首音乐燕尾蝶.mp3\n",
"\n",
"开始下载第19首音乐昨天.mp3\n",
"\n",
"开始下载第20首音乐Fly Away.mp3\n",
"\n",
"开始下载第21首音乐孤单北半球 (live).mp3\n",
"\n",
"开始下载第22首音乐小手拉大手(Live).mp3\n",
"\n",
"开始下载第23首音乐我喜欢.mp3\n",
"\n",
"开始下载第24首音乐飘洋过海来看你.mp3\n",
"\n",
"开始下载第25首音乐没有人像你.mp3\n",
"\n",
"开始下载第26首音乐不想睡.mp3\n",
"\n",
"开始下载第27首音乐情歌没有告诉你.mp3\n",
"\n",
"开始下载第28首音乐大地之歌.mp3\n",
"\n",
"开始下载第29首音乐瘦瘦的.mp3\n",
"\n",
"开始下载第30首音乐还是好朋友.mp3\n",
"\n",
"开始下载第31首音乐你会不会.mp3\n",
"\n",
"开始下载第32首音乐一路两个人.mp3\n",
"\n",
"开始下载第33首音乐一夜长大.mp3\n",
"\n",
"开始下载第34首音乐她.mp3\n",
"\n",
"开始下载第35首音乐第三者.mp3\n",
"\n",
"开始下载第36首音乐最想环游的世界.mp3\n",
"\n",
"开始下载第37首音乐美丽人生.mp3\n",
"\n",
"开始下载第38首音乐纯真.mp3\n",
"\n",
"开始下载第39首音乐我不害怕.mp3\n",
"\n",
"开始下载第40首音乐彩虹.mp3\n",
"\n",
"开始下载第41首音乐想都没想过.mp3\n",
"\n",
"开始下载第42首音乐最快乐那一年.mp3\n",
"\n",
"开始下载第43首音乐我就知道那是爱.mp3\n",
"\n",
"开始下载第44首音乐漂洋过海来看你(Live).mp3\n",
"\n",
"开始下载第45首音乐让爱转动整个宇宙.mp3\n",
"\n",
"开始下载第46首音乐我还记得.mp3\n",
"\n",
"开始下载第47首音乐有你在.mp3\n",
"\n",
"开始下载第48首音乐向左转向右转.mp3\n",
"\n",
"开始下载第49首音乐明天我要嫁给你了(Live).mp3\n",
"\n",
"开始下载第50首音乐恋着多喜欢.mp3\n",
"\n",
"50首全部歌曲已经下载完毕\n"
]
}
],
"source": [
"import os\n",
"import re\n",
"import json\n",
"import requests\n",
"from lxml import etree\n",
"\n",
"\n",
"def download_songs(url=None):\n",
"    \"\"\"Download every song linked on a NetEase playlist/artist page as .mp3 files.\"\"\"\n",
"    if url is None:\n",
"        url = 'https://music.163.com/#/playlist?id=2384642500'\n",
"\n",
"    # Strip the SPA fragment marker and force plain http for the outer-link API\n",
"    url = url.replace('/#', '').replace('https', 'http')\n",
"    # NetEase outer-link endpoint: http://music.163.com/song/media/outer/url?id=xxxx\n",
"    out_link = 'http://music.163.com/song/media/outer/url?id='\n",
"    # Request headers\n",
"    headers = {\n",
"        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',\n",
"        'Referer': 'https://music.163.com/',\n",
"        'Host': 'music.163.com'\n",
"    }\n",
"    # Fetch the page source\n",
"    page_source = requests.get(url=url, headers=headers).text\n",
"\n",
"    tree = etree.HTML(page_source)\n",
"    # Track list (present on both playlist and artist pages)\n",
"    song_list = tree.xpath('//ul[@class=\"f-hide\"]/li/a')\n",
"    # Artist pages expose the name in <h2 id=\"artist-name\">\n",
"    artist_name_tree = tree.xpath('//h2[@id=\"artist-name\"]/text()')\n",
"    artist_name = str(artist_name_tree[0]) if artist_name_tree else None\n",
"\n",
"    # Playlist pages expose the list title in an h2 with class f-ff2\n",
"    song_list_name_tree = tree.xpath('//h2[contains(@class,\"f-ff2\")]/text()')\n",
"    song_list_name = str(song_list_name_tree[0]) if song_list_name_tree else None\n",
"\n",
"    # Download into a folder named after the artist or the playlist\n",
"    # NOTE(review): if neither name is found this concatenates None — confirm both page types\n",
"    folder = './' + artist_name if artist_name else './' + song_list_name\n",
"\n",
"    if not os.path.exists(folder):\n",
"        os.mkdir(folder)\n",
"\n",
"    for i, anchor in enumerate(song_list):\n",
"        href = str(anchor.xpath('./@href')[0])\n",
"        song_id = href.split('=')[-1]\n",
"        src = out_link + song_id  # real media url for this track\n",
"        title = str(anchor.xpath('./text()')[0])  # track title\n",
"        filename = title + '.mp3'\n",
"        filepath = folder + '/' + filename\n",
"        print('开始下载第{}首音乐:{}\\n'.format(i + 1, filename))\n",
"\n",
"        try:\n",
"            # download_lyric(title, song_id)  # (optional) also fetch lyrics\n",
"            data = requests.get(src).content  # raw mp3 bytes\n",
"            with open(filepath, 'wb') as f:\n",
"                f.write(data)\n",
"        except Exception as e:\n",
"            print(e)\n",
"\n",
"    print('{}首全部歌曲已经下载完毕!'.format(len(song_list)))\n",
"\n",
"\n",
"def download_lyric(song_name, song_id):\n",
"    \"\"\"Fetch the lyrics for one song via the NetEase lyric API and print them.\"\"\"\n",
"    url = 'http://music.163.com/api/song/lyric?id={}&lv=-1&kv=-1&tv=-1'.format(song_id)\n",
"    # Request headers\n",
"    headers = {\n",
"        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',\n",
"        'Referer': 'https://music.163.com/',\n",
"        'Host': 'music.163.com'\n",
"        # 'Origin': 'https://music.163.com'\n",
"    }\n",
"    res = requests.get(url=url, headers=headers).text\n",
"    json_obj = json.loads(res)\n",
"    # BUG FIX: instrumental tracks / API errors return no 'lrc' entry and used to raise KeyError\n",
"    lyric = json_obj.get('lrc', {}).get('lyric')\n",
"    if not lyric:\n",
"        print(song_name, '(no lyric available)')\n",
"        return\n",
"    # BUG FIX: non-greedy match so each [mm:ss.xx] tag is stripped individually;\n",
"    # the greedy r'\\[.*\\]' could swallow lyric text between two tags on one line\n",
"    reg = re.compile(r'\\[.*?\\]')\n",
"    lrc_text = re.sub(reg, '', lyric).strip()\n",
"\n",
"    print(song_name, lrc_text)\n",
"\n",
"\n",
"\n",
"\n",
"if __name__ == '__main__':\n",
"    # music_list = 'https://music.163.com/#/playlist?id=2384642500'  # a playlist url\n",
"    music_list = 'https://music.163.com/#/artist?id=8325'  # artist top-tracks page\n",
"    # music_list = 'https://music.163.com/#/search/m/?order=hot&cat=全部&limit=435&offset=435&s=梁静茹'  # search results\n",
"    download_songs(music_list)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,93 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"下载完成!\n"
]
}
],
"source": [
"import requests\n",
"import re\n",
"import csv, time\n",
"\n",
"# 创建一个保存的csv文件并设置好表头\n",
"timenow = time.strftime(\"%Y-%m-%d-%H%M%S\", time.localtime())\n",
"file = '猫眼电影top100榜-%s.csv' % (timenow)\n",
"# 写入表头\n",
"with open(file, 'a+', encoding='gb18030', newline='') as f:\n",
" writer_f = csv.writer(f)\n",
" writer_f.writerow(['排名', '电影名称', '主演', '上映时间', '评分'])\n",
"\n",
"\n",
"def get_one_page(url):\n",
"    \"\"\"Scrape one page of the Maoyan top-100 board and append its rows to *file*.\"\"\"\n",
"    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'}\n",
"    response = requests.get(url, headers=headers).text\n",
"\n",
"    pattern = re.compile(\n",
"        '<dd>.*?board-index.*?>(.*?)</i>.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',\n",
"        re.S)\n",
"    items = re.findall(pattern, response)\n",
"    # print(items)\n",
"    # Open the csv once per page instead of once per movie (old code reopened it inside the loop)\n",
"    with open(file, 'a+', encoding='gb18030', newline='') as f:\n",
"        writer = csv.writer(f)\n",
"        for item in items:\n",
"            pm = item[0]            # rank\n",
"            mc = item[1]            # title\n",
"            zy = item[2].strip()    # stars\n",
"            sj = item[3]            # release date\n",
"            pf = item[4] + item[5]  # score = integer part + fraction part\n",
"            print(pm, mc, zy, sj, pf)\n",
"            writer.writerow([pm, mc, zy, sj, pf])\n",
"\n",
"\n",
"# Ten pages of ten movies each, written batch by batch\n",
"for i in range(10):\n",
"    page = i * 10\n",
"    url = 'https://maoyan.com/board/4?offset=' + str(page)\n",
"    get_one_page(url)  # BUG FIX: this call sat outside the loop, so only the final page was ever scraped\n",
"    time.sleep(1)  # throttle between page requests\n",
"print('下载完成!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading…
Cancel
Save