You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
pachong/redis对等分布式爬虫.py

55 lines
2.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import parsel
import redis
from urllib.parse import urljoin
# Establish a Redis connection (local instance, default port, db 0).
r = redis.Redis(host='localhost', port=6379, db=0)

# Names of the two Redis sets used as queues:
# - wait_key_name: URLs waiting to be crawled
# - down_key_name: URLs that have already been crawled
wait_key_name = "waits"
down_key_name = "downs"

# Fetch the Bilibili popular-rank category page and extract the
# protocol-relative links (//www.bilibili.com/...) from the rank list.
category = requests.get("https://www.bilibili.com/v/popular/rank")
category_html = parsel.Selector(category.text)
category_url = category_html.css("div.rank-list-wrap li a::attr('href')").extract()

# Build absolute URLs and push each one into the wait queue (a Redis set,
# so duplicates are deduplicated automatically).
for half_url in category_url:
    url = "https:" + half_url
    r.sadd(wait_key_name, url)

# Print the column headers for the output table.
print("{:^30}\t\t{:^40}\t\t{:^50}".format('标题', '播放量/十万', '弹幕数/k'))
while True:
    # Pop ONE URL from the wait queue. The original code called spop twice
    # (once for the truthiness check, once to get the value), which silently
    # discarded every other queued URL; popping once fixes that.
    popped = r.spop(wait_key_name)
    if popped is None:
        # Queue is empty right now; keep polling.
        continue
    target = str(popped, encoding="utf-8")
    resp = requests.get(target)
    # Record the URL as crawled.
    r.sadd(down_key_name, target)
    # Parse the response body with parsel.
    html = parsel.Selector(resp.text)
    # Distinguish list pages from video detail pages by the URL:
    # detail pages contain "video" in their path.
    if "video" not in target:
        # List page: extract the detail-page links.
        detail_url = html.css("div.img a::attr(href)").extract()
        print(detail_url)
        for detail in detail_url:
            # Build the absolute detail URL and enqueue it.
            d = "https:" + detail
            r.sadd(wait_key_name, d)
    else:
        # Detail page: extract the data fields directly.
        title = html.css("div.video-info span.tit::text").extract_first()
        view = html.css("div.video-data span.view::text").extract_first()
        danmu = html.css("div.video-data span.dm::text").extract_first()
        # Skip pages where the expected markup was not found.
        if title is not None:
            print("{:^30}\t\t{:^40}\t\t{:^50}".format(title, view, danmu))