# pachong/小组作业.py

import requests#获取页面数据
from bs4 import BeautifulSoup#解析页面
import re#正则表达模块，匹配过程：1.依次拿出表达式和文本中的字符比较，2，如果每一个字符都能匹配，则匹配成功；一旦有匹配不成功的字符则匹配失败。
import wordcloud
import numpy as np#Python的一种开源的数值计算扩展
import pandas as pd #用于数据清洗
import matplotlib #绘图库
import matplotlib.pyplot as plt#是 Matplotlib 的子库，提供了和 MATLAB 类似的绘图 API
import matplotlib.mlab as mlab# 这个包中的函数可以添加随机变量分布的拟合曲线
import xlwt
from pyecharts.charts import Bar,Grid
from pyecharts import options as opts

def getHtmlText(url, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text, or ``None`` if the request failed (a message is
        printed in that case).
    """
    # Send a browser-like User-Agent instead of the default one from requests.
    kv = {'user-agent': 'Mozilla/5.0 '}
    try:
        html = requests.get(url=url, headers=kv, timeout=timeout)
    except requests.RequestException:
        # Catch only request failures (connection errors, timeouts, ...) so
        # that programming errors and Ctrl-C are no longer silently swallowed.
        print("获取页面数据失败")
        return None
    # Force UTF-8 so the body is not decoded with a wrongly guessed charset.
    html.encoding = 'utf-8'
    return html.text

#使用BeautifulSoup进行页面解析
def yemianjiexi(html_text):
    """Parse *html_text* with BeautifulSoup's ``html.parser`` and return the tree."""
    return BeautifulSoup(html_text, 'html.parser')

#对爬取内容进行定位
def Title():
    """Return the text of every element matching ``.info > a``.

    The page is downloaded from the module-level ``url`` on each call.

    Returns:
        A list with one title string per matched element, in document order.
    """
    soup = yemianjiexi(getHtmlText(url))
    return [anchor.text for anchor in soup.select('.info > a')]
 #获取播放量并进行存储
def getBf():
    """Return the play count of every listed video as formatted strings.

    The page is downloaded from the module-level ``url`` on each call and the
    first ``<span>`` under each ``.detail-state`` element is read.

    Returns:
        A list of strings, one per matched span: the first number found in
        the span text divided by 10 and formatted with two decimals.
    """
    soup = yemianjiexi(getHtmlText(url))
    spans = soup.select('.detail-state > span:nth-of-type(1)')
    # The pattern matches either a decimal number or a plain integer.
    # Dividing by 10 presumably converts a count shown in units of 10,000
    # into units of 100,000 -- TODO confirm against the page markup.
    return [
        '%.2f' % (float(re.findall(r'\d+\.\d+|\d+', span.text)[0]) / 10)
        for span in spans
    ]

#获取弹幕并进行存储
def danmu():
    """Return the danmaku (bullet-comment) count of every listed video.

    The page is downloaded from the module-level ``url`` on each call and the
    second ``<span>`` under each ``.detail-state`` element is read.

    Returns:
        A list of strings, one per matched span, each formatted with two
        decimals and expressed in thousands (k). A span that contains no
        number yields ``'0.00'`` so the list stays aligned with the other
        per-video lists.
    """
    spans = yemianjiexi(getHtmlText(url)).select('.detail-state > span:nth-of-type(2)')
    danmus = []
    for span in spans:
        text = span.text
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        numbers = re.findall(r'\d+\.\d+|\d+', text)
        if not numbers:
            danmus.append('0.00')
            continue
        value = float(numbers[0])
        # Decide the unit from the suffix rather than from the magnitude: the
        # previous "value < 10" test mis-scaled counts such as "12.5万" and
        # raw counts below 10.
        # NOTE(review): assumes abbreviated counts carry a '万' (10,000) or
        # '亿' (100,000,000) suffix in the span text -- confirm against the
        # live markup.
        if '亿' in text:
            thousands = value * 100000
        elif '万' in text:
            thousands = value * 10
        else:
            thousands = value / 1000
        danmus.append('%.2f' % thousands)
    return danmus

# Maximum number of ranked videos to report.
TOP_N = 15
# Raw string: the backslash is kept literally without relying on an invalid
# escape sequence; the resulting path is the same as before.
EXCEL_PATH = r'D:\爬虫/可视化.xls'
CHART_PATH = "可视化.html"


def _write_excel(rows, path):
    """Write *rows* of (title, play count, danmaku count) to an .xls file.

    Args:
        rows: Sequence of three-item rows; each is written to its own line
            below the header.
        path: Destination file passed to ``Workbook.save``.
    """
    # style_compression=0 and cell_overwrite_ok=True are kept from the
    # original workbook setup.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('b站播放量排行榜', cell_overwrite_ok=True)
    col = ('标题', '播放量/十万', '弹幕/k')
    # Row 0 holds the column names; data starts at row 1.
    for j, name in enumerate(col):
        sheet.write(0, j, name)
    for i, row in enumerate(rows, start=1):
        for j, value in enumerate(row):
            sheet.write(i, j, value)
    book.save(path)


def _render_chart(rows, path):
    """Render *rows* as a two-series bar chart and save it as HTML at *path*.

    Args:
        rows: Sequence of (title, play count, danmaku count) rows.
        path: Output file passed to ``Grid.render``.
    """
    x = [row[0] for row in rows]
    y1 = [str(row[1]) for row in rows]
    y2 = [str(row[2]) for row in rows]

    bar = Bar()
    bar.add_xaxis(x)
    bar.add_yaxis("播放量/十万", y1)
    bar.add_yaxis("弹幕数/k", y2)
    bar.set_global_opts(
        title_opts=opts.TitleOpts(title='b站热门视频排行榜'),
        tooltip_opts=opts.TooltipOpts(is_show=False),
        # Rotate the x-axis labels by 35 degrees.
        xaxis_opts=opts.AxisOpts(axislabel_opts={"rotate": 35}),
        yaxis_opts=opts.AxisOpts(splitline_opts=opts.SplitLineOpts(is_show=True)),
    )
    # The keyword was previously misspelled ("labael_opts"), so the label
    # options never reached the series.
    bar.set_series_opts(label_opts=opts.LabelOpts(position="top"))

    # Place the bar chart inside a grid to control its margins.
    grid = Grid()
    grid.add(
        bar,
        grid_opts=opts.GridOpts(
            pos_top="5%", pos_bottom="55%", pos_left='35%', pos_right='5%',
        ),
    )
    grid.render(path)


if __name__ == '__main__':
    # Title(), getBf() and danmu() read this module-level name.
    url = 'https://www.bilibili.com/v/popular/rank/all'

    # Each helper downloads the page itself, so call each exactly once rather
    # than once per row. zip() stops at the shortest list, so a page with
    # fewer than TOP_N entries no longer raises IndexError.
    datas = [list(row) for row in zip(Title(), getBf(), danmu())][:TOP_N]

    print("{:^30}\t\t{:^40}\t\t{:^50}".format('标题', '播放量/十万', '弹幕数/k'))
    for row in datas:
        print("{:^30}\t\t{:^40}\t\t{:^50}".format(row[0], row[1], row[2]))

    # Everything below depends on ``datas`` and therefore runs only when the
    # file is executed as a script, not when it is imported.
    _write_excel(datas, EXCEL_PATH)
    _render_chart(datas, CHART_PATH)