"""Peer-style distributed crawler for the bilibili popular ranking, using Redis.

The script seeds a Redis set with the links found on the ranking page, then
repeatedly pops one URL from that set, fetches it, and either queues the
detail links it contains (list pages) or extracts the title, view count and
danmaku count (pages whose URL contains "video"). Collected rows are printed
as they are found and finally written to an .xls workbook.
"""
import requests
import parsel
import redis
import xlwt
from urllib.parse import urljoin

# Open the Redis connection.
r = redis.Redis(host='localhost', port=6379, db=0)
# Names of the Redis sets holding URLs still to crawl and URLs already crawled.
wait_key_name = "waits"
down_key_name = "downs"

# Column layout shared by the console header and every printed data row.
row_format = "{:^30}\t\t{:^40}\t\t{:^50}"

# Fetch the ranking (category) page and collect the links it lists.
rank_url = "https://www.bilibili.com/v/popular/rank"
category = requests.get(rank_url)
category_html = parsel.Selector(category.text)
category_url = category_html.css("div.rank-list-wrap li a::attr('href')").extract()
# The extracted hrefs are incomplete, so resolve each one against the page URL
# before adding it to the wait set. urljoin yields the same result as
# prefixing "https:" for protocol-relative hrefs, and also copes with hrefs
# that are already absolute or are path-relative.
for half_url in category_url:
    url = urljoin(rank_url, half_url)
    r.sadd(wait_key_name, url)

print(row_format.format('标题', '播放量', '弹幕数'))  # console header row
datas = []
for i in range(50):
    # Pop exactly one URL per iteration. Popping once for the emptiness check
    # and again for the value would throw away every other URL and could hand
    # None to the decoder when only one URL was left.
    raw_target = r.spop(wait_key_name)
    if not raw_target:
        # Nothing queued right now; the remaining iterations simply retry.
        continue
    target = str(raw_target, encoding="utf-8")
    resp = requests.get(target)
    # Record the URL as crawled.
    r.sadd(down_key_name, target)
    # Parse the response body.
    html = parsel.Selector(resp.text)
    # URLs without "video" are treated as list pages, the rest as detail pages.
    if "video" not in target:
        # List page: queue every detail link found on it.
        detail_url = html.css("div.img a::attr(href)").extract()
        for detail in detail_url:
            d = urljoin(target, detail)
            r.sadd(wait_key_name, d)
    else:
        # Detail page: extract the data directly.
        title = html.css("div.video-info span.tit::text").extract_first()
        view = html.css("div.video-data span.view::text").extract_first()
        danmu = html.css("div.video-data span.dm::text").extract_first()
        if title is not None:
            # A missing view/danmaku value is None, which cannot take an
            # alignment format spec; print it as an empty column instead.
            print(row_format.format(title, view or '', danmu or ''))
            datas.append([title, view, danmu])  # keep the raw values for export

# Write the collected rows to a spreadsheet.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('b站播放量排行榜', cell_overwrite_ok=True)
col = ('标题', '播放量', '弹幕')  # column headers
for i, col_name in enumerate(col):
    # Row 0 holds the column names.
    sheet.write(0, i, col_name)
for row_index, data in enumerate(datas, start=1):
    for j, value in enumerate(data):
        sheet.write(row_index, j, value)
# Raw string: same path as before, without the invalid "\可" escape sequence.
book.save(r'D:\可视化.xls')