You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
pachong/redis对等分布式爬虫.py

68 lines
2.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import parsel
import redis
import xlwt
from urllib.parse import urljoin
# Connect to the local Redis server that stores the crawl queues.
r = redis.Redis(host='localhost', port=6379, db=0)
# Names of the Redis sets used as the "waiting to crawl" queue and the
# "already crawled" queue.
wait_key_name = "waits"
down_key_name = "downs"
# Fetch the ranking (category) page.
rank_url = "https://www.bilibili.com/v/popular/rank"
category = requests.get(rank_url)
# Wrap the response text in a Selector so it can be queried with CSS.
category_html = parsel.Selector(category.text)
# Collect the href of every link inside the ranking list.
category_url = category_html.css("div.rank-list-wrap li a::attr('href')").extract()
# The extracted hrefs are not complete URLs, so resolve each one against the
# page URL before adding it to the waiting set. urljoin copes with
# protocol-relative ("//host/path"), root-relative and already-absolute
# hrefs, whereas prefixing "https:" is only correct for the first form.
for half_url in category_url:
    url = urljoin(rank_url, half_url)
    r.sadd(wait_key_name, url)
# Print the column headings for the console output.
print("{:^30}\t\t{:^40}\t\t{:^50}".format( '标题', '播放量','弹幕数'))
# Rows scraped from detail pages, each stored as [title, view, danmu].
datas = []
for _ in range(50):
    # Pop one URL from the waiting set. SPOP removes the member it returns,
    # so it must be called exactly once per iteration: popping once to test
    # for emptiness and again to get the value throws away every other URL
    # and can hand None to the decoding step.
    raw_url = r.spop(wait_key_name)
    if not raw_url:
        # Nothing is queued at the moment; move on to the next attempt.
        continue
    target = str(raw_url, encoding="utf-8")
    resp = requests.get(target)
    # Record the requested URL in the "already crawled" set.
    r.sadd(down_key_name, target)
    # Parse the response body with parsel.
    html = parsel.Selector(resp.text)
    # URLs without "video" are treated as list pages, the rest as detail pages.
    if "video" not in target:
        # List page: extract the detail-page links and queue them, resolving
        # each href against the page it was found on.
        for detail in html.css("div.img a::attr(href)").extract():
            r.sadd(wait_key_name, urljoin(target, detail))
        continue
    # Detail page: extract the data directly.
    title = html.css("div.video-info span.tit::text").extract_first()
    view = html.css("div.video-data span.view::text").extract_first()
    danmu = html.css("div.video-data span.dm::text").extract_first()
    if title is None:
        # No title matched the selector; skip this page.
        continue
    # extract_first() yields None when a selector matches nothing, and None
    # rejects the "{:^40}" format spec, so show missing fields as empty
    # strings on the console. The stored row keeps the raw values.
    print("{:^30}\t\t{:^40}\t\t{:^50}".format(
        title,
        "" if view is None else view,
        "" if danmu is None else danmu,
    ))
    datas.append([title, view, danmu])
# Save the scraped rows to an Excel workbook.
book = xlwt.Workbook(encoding='utf-8',style_compression=0)
sheet = book.add_sheet('b站播放量排行榜',cell_overwrite_ok=True)
# Column headings for the sheet.
col = ('标题','播放量','弹幕')
# Row 0 holds the headings, one per column.
for i, heading in enumerate(col):
    sheet.write(0, i, heading)
# Data rows start at row 1, directly below the heading row; only the first
# three fields of each row are written, one per heading.
for i, data in enumerate(datas, start=1):
    for j in range(0, 3):
        sheet.write(i, j, data[j])
# Raw string: "\可" is not a recognized escape sequence, so the plain literal
# triggers an invalid-escape warning. The resulting path is unchanged.
book.save(r'D:\可视化.xls')