|
|
import requests
|
|
|
import parsel
|
|
|
import redis
|
|
|
import xlwt
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
# Create the Redis client for the local server (port 6379, database 0).
r = redis.Redis(host="localhost", port=6379, db=0)

# Names of the two Redis sets used by the crawler, defined up front:
# the set of URLs waiting to be crawled and the set of URLs already crawled.
wait_key_name, down_key_name = "waits", "downs"
|
|
|
|
|
|
# Fetch the ranking page that seeds the crawl.
# The timeout stops the script from hanging forever on a stalled connection;
# requests waits indefinitely when no timeout is given.
rank_page_url = "https://www.bilibili.com/v/popular/rank"
category = requests.get(rank_page_url, timeout=10)

# Wrap the response text in a parsel Selector so it can be queried with CSS.
category_html = parsel.Selector(category.text)

# Collect the href of every link found inside the ranking list.
category_url = category_html.css("div.rank-list-wrap li a::attr('href')").extract()

# The extracted links are incomplete, so each one is resolved against the page
# URL before being added to the pending set in Redis. urljoin gives the same
# result as prefixing "https:" for protocol-relative links ("//host/path") and
# also copes with root-relative and already-absolute links.
for half_url in category_url:
    url = urljoin(rank_page_url, half_url)
    r.sadd(wait_key_name, url)
|
|
|
|
|
|
# Print the column headings of the console report.
print("{:^30}\t\t{:^40}\t\t{:^50}".format( '标题', '播放量','弹幕数'))

# Rows of [title, view count, danmaku count] collected from detail pages.
datas = []

# Process at most 50 URLs taken from the pending set.
for _ in range(50):
    # Pop exactly ONE URL per iteration. Popping once to test for emptiness and
    # a second time to read the value throws away a URL on every pass and can
    # hand None to the decoder when the set runs dry between the two calls.
    popped = r.spop(wait_key_name)
    if not popped:
        # Pending set is empty: nothing left to crawl.
        break
    target = str(popped, encoding="utf-8")

    # The timeout keeps one stalled connection from blocking the whole crawl.
    resp = requests.get(target, timeout=10)

    # Record the requested URL in the already-crawled set.
    r.sadd(down_key_name, target)

    # Parse the response body with parsel.
    html = parsel.Selector(resp.text)

    # URLs without "video" are treated as list pages, the rest as detail pages.
    if "video" not in target:
        # List page: resolve every detail-page link against the current URL
        # and add it to the pending set.
        for detail in html.css("div.img a::attr(href)").extract():
            r.sadd(wait_key_name, urljoin(target, detail))
        continue

    # Detail page: extract the data directly.
    title = html.css("div.video-info span.tit::text").extract_first()
    view = html.css("div.video-data span.view::text").extract_first()
    danmu = html.css("div.video-data span.dm::text").extract_first()

    if title is None:
        # No title element on this page; skip it.
        continue

    # extract_first() yields None when a selector matches nothing, and None
    # cannot be formatted with an alignment spec ("{:^40}" raises TypeError),
    # so missing counts are reported as empty strings.
    view = "" if view is None else view
    danmu = "" if danmu is None else danmu

    print("{:^30}\t\t{:^40}\t\t{:^50}".format(title,view,danmu))

    # Keep the row for the spreadsheet export.
    datas.append([title, view, danmu])
|
|
|
|
|
|
# Store the crawled data in an Excel workbook.
book = xlwt.Workbook(encoding='utf-8',style_compression=0)
sheet = book.add_sheet('b站播放量排行榜',cell_overwrite_ok=True)

# Column names, written into the first row (row index 0), one per column.
col = ('标题','播放量','弹幕')
for col_idx, heading in enumerate(col):
    sheet.write(0, col_idx, heading)

# One spreadsheet row per collected record, starting right below the header.
# Only as many values as there are columns are written.
for row_idx, record in enumerate(datas, start=1):
    for col_idx, value in enumerate(record[:len(col)]):
        sheet.write(row_idx, col_idx, value)

# Raw string: a backslash followed by "可" is not a valid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The raw literal keeps
# exactly the same path.
book.save(r'D:\可视化.xls')
|
|
|
|
|
|
|
|
|
|