"""Peer-to-peer distributed crawler using Redis as a shared URL frontier.

Seeds the wait queue with category links scraped from the Bilibili
popular-ranking page, then loops forever: pop a URL from the shared
frontier, fetch it, and either enqueue detail-page links (list pages)
or print title / view count / danmaku count (detail pages).  Multiple
peers can run this same script against one Redis instance; the Redis
sets deduplicate work between them.
"""
import requests
import parsel
import redis

from urllib.parse import urljoin

# Connect to a local Redis instance; db 0 holds both queues.
r = redis.Redis(host='localhost', port=6379, db=0)
# Names of the to-crawl (frontier) and already-crawled sets.
wait_key_name = "waits"
down_key_name = "downs"

# Fetch the ranking (category) page and extract the category links.
category = requests.get("https://www.bilibili.com/v/popular/rank")
category_html = parsel.Selector(category.text)
category_url = category_html.css("div.rank-list-wrap li a::attr('href')").extract()

# The extracted hrefs are protocol-relative ("//..."); prepend the scheme
# and seed the wait queue (a Redis set, so duplicates are dropped).
for half_url in category_url:
    r.sadd(wait_key_name, "https:" + half_url)

# Table header for the output below.
print("{:^30}\t\t{:^40}\t\t{:^50}".format('标题', '播放量/十万', '弹幕数/k'))

while True:
    # Pop exactly ONE URL per iteration.  (BUG FIX: the original called
    # spop twice — once inside the emptiness test, silently discarding
    # that URL, and once again for the actual target — losing roughly
    # half the frontier.)
    popped = r.spop(wait_key_name)
    if popped is None:
        # Queue momentarily empty; another peer may refill it, so keep polling.
        continue
    target = str(popped, encoding="utf-8")
    resp = requests.get(target)
    # Record the URL as crawled so peers can see it was handled.
    r.sadd(down_key_name, target)
    # Parse the response body.
    html = parsel.Selector(resp.text)
    # Detail-page URLs contain "video"; everything else is a list page.
    if "video" not in target:
        # List page: enqueue every detail-page link it references.
        detail_url = html.css("div.img a::attr(href)").extract()
        print(detail_url)
        for detail in detail_url:
            # Hrefs here are protocol-relative too; complete and enqueue.
            r.sadd(wait_key_name, "https:" + detail)
    else:
        # Detail page: extract and print the data fields.
        title = html.css("div.video-info span.tit::text").extract_first()
        view = html.css("div.video-data span.view::text").extract_first()
        danmu = html.css("div.video-data span.dm::text").extract_first()
        # Skip pages where the selectors found nothing (layout mismatch).
        if title is not None:
            print("{:^30}\t\t{:^40}\t\t{:^50}".format(title, view, danmu))