pachong/redis对等分布式爬虫.py

import time
from urllib.parse import urljoin

import requests
import parsel
import redis
# Connect to Redis
r = redis.Redis(host='localhost', port=6379, db=0)
# Fixed key names for the to-crawl queue and the already-crawled queue
wait_key_name = "waits"
down_key_name = "downs"
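# NOTE: both queues are Redis sets. SADD dedupes repeated URLs and SPOP hands
# each member to exactly one caller, which is what lets several identical
# crawler processes cooperate without a coordinator. host='localhost' assumes
# a single-machine demo; real peers would all point at one shared Redis host.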
# Fetch the ranking (category) page
category = requests.get("https://www.bilibili.com/v/popular/rank")
category_html = parsel.Selector(category.text)
category_url = category_html.css("div.rank-list-wrap li a::attr(href)").extract()
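# NOTE: the CSS selectors in this script match Bilibili's markup at the time
# of writing and will break silently if the page layout changes.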
# Join each (often protocol-relative) href into a full URL and push it onto
# the wait queue in Redis
for half_url in category_url:
    url = urljoin("https://www.bilibili.com", half_url)
    r.sadd(wait_key_name, url)
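# NOTE: every peer re-seeds these same category URLs on startup; because
# "waits" is a set, the duplicates are simply absorbed.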
print("{:^30}\t\t{:^40}\t\t{:^50}".format('Title', 'Views', 'Danmaku'))  # print the header row
while True:
    # Pop one URL from the wait queue; Redis SPOP is atomic, so no two peer
    # processes can receive the same URL
    target = r.spop(wait_key_name)
    if target is None:
        # Nothing to crawl right now; back off briefly instead of busy-spinning
        time.sleep(1)
        continue
    target = str(target, encoding="utf-8")
    resp = requests.get(target)
    # Record the fetched URL in the crawled queue
    r.sadd(down_key_name, target)
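    # NOTE: the crawled queue is written but never consulted, so a URL that is
    # re-added to "waits" can be fetched again; a fuller version would skip
    # URLs already crawled, e.g. with r.sismember(down_key_name, url).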
    # Parse the response body with parsel
    html = parsel.Selector(resp.text)
    # "video" in the URL distinguishes detail pages from list pages
    if "video" not in target:
        # List page: extract the detail-page URLs
        detail_url = html.css("div.img a::attr(href)").extract()
        print(detail_url)
        for detail in detail_url:
            # Join each detail-page URL and add it to the wait queue
            d = urljoin("https://www.bilibili.com", detail)
            r.sadd(wait_key_name, d)
    else:
        # Detail page: extract the data directly
        title = html.css("div.video-info span.tit::text").extract_first()
        view = html.css("div.video-data span.view::text").extract_first()
        danmu = html.css("div.video-data span.dm::text").extract_first()
        if title is not None:
            print("{:^30}\t\t{:^40}\t\t{:^50}".format(title, view, danmu))  # print one row
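
# To run this as a peer-to-peer cluster: start one shared Redis server, point
# host='localhost' above at it, and launch this script on any number of
# machines. Queue sizes can be inspected from redis-cli:
#   SCARD waits   (URLs still to crawl)
#   SCARD downs   (URLs already crawled)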