You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Spyder_python/controller/SpyderController.py

139 lines
6.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import threading
import time
import datetime
import csv
import json
import requests
from lxml import etree
from entity.BilibiliVideo import BilibiliVideo
class SpyderController:
    # Crawler controller for the Bilibili ranking list: fetches the ranking
    # API once, scrapes each video page for the fields declared in
    # entity/BilibiliVideo.py, appends every record to file_3.csv, and
    # returns the collected BilibiliVideo objects.

    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
        """
        Drive the whole crawl.

        :param videoCount: number of videos to crawl; capped at 100 because
            the ranking API returns at most 100 entries
        :param threadCount: number of concurrent crawler threads
        :param waitTime: float, per-thread stagger delay in seconds between
            thread launches (throttles request rate)
        :return: list[BilibiliVideo] with one entry per crawled video
        """
        all_data_list = []
        # Guards the shared CSV output and result list: csv.writer is not
        # thread-safe, and unsynchronized writes can interleave rows.
        write_lock = threading.Lock()

        videoCount = int(videoCount)
        threadCount = int(threadCount)
        waitTime = float(waitTime)
        if videoCount < threadCount:
            threadCount = videoCount
        if videoCount > 100:
            videoCount = 100
        if threadCount <= 0:
            # Nothing to do (and avoids division by zero below).
            return all_data_list

        json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
        }
        # Fetch the ranking JSON once; each worker thread reads its own slice.
        rank_text = requests.get(url=json_url, headers=headers).text

        def crawl_data(start_index, end_index):
            # Worker: crawl ranking entries [start_index, end_index).
            def convert_to_number(lst):
                # Convert strings like "12.3万" (wan = 10,000) to plain
                # digit strings; other strings pass through unchanged.
                result = []
                for item in lst:
                    if '万' in item:
                        number = int(float(item.replace('万', '')) * 10000)
                        result.append(str(number))
                    else:
                        result.append(item)
                return result

            # Parse once per worker instead of once per video.
            parsed_data = json.loads(rank_text)
            # 'with' guarantees the handle is closed even if a request or
            # parse step raises mid-loop.
            with open("file_3.csv", "a", encoding="GB18030", newline="") as f:
                csv_writer = csv.writer(f)
                for i in range(start_index, end_index):
                    entry = parsed_data["data"]["list"][i]
                    aid = str(entry["aid"])
                    print(aid)
                    bvId = [str(entry['bvid'])]
                    topNo = [str(i + 1)]
                    url = "https://www.bilibili.com/video/av" + aid + "/?"
                    video_text = requests.get(url=url, headers=headers).text
                    tree = etree.HTML(video_text)
                    title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
                    uploadTime = [str(entry["ctime"])]
                    viewCount = tree.xpath('//div[@class="view item"]/div/text()')
                    likeCount = tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()')
                    coinCount = tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()')
                    favoriteCount = tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()')
                    bulletCount = tree.xpath('//div[@class="dm-text"]/text()')
                    creatorFanCount = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
                    viewCount = convert_to_number(viewCount)
                    likeCount = convert_to_number(likeCount)
                    coinCount = convert_to_number(coinCount)
                    favoriteCount = convert_to_number(favoriteCount)
                    bulletCount = convert_to_number(bulletCount)
                    if not creatorFanCount:
                        # Follow button absent (e.g. already-followed layout):
                        # fall back to a placeholder count of 1.
                        creatorFanCount = [str(1)]
                    else:
                        # The span text looks like "关注 12.3万"; the second
                        # whitespace-separated token is the follower count.
                        followers_str = creatorFanCount[0].strip().split()[1]
                        followers_num = int(float(followers_str.replace('万', '')) * 10000)
                        creatorFanCount = [str(followers_num)]
                    commentCount = [str(entry['stat']['reply'])]
                    creatorId = [str(entry['owner']['mid'])]
                    creatorName = [str(entry['owner']['name'])]
                    all_data = (bvId + title + [url] + uploadTime + topNo
                                + viewCount + likeCount + coinCount
                                + favoriteCount + bulletCount + commentCount
                                + creatorId + creatorName + creatorFanCount)
                    video = BilibiliVideo(bvId[0], title[0], url,
                                          int(uploadTime[0]), int(topNo[0]),
                                          int(viewCount[0]), int(likeCount[0]),
                                          int(coinCount[0]), int(favoriteCount[0]),
                                          int(commentCount[0]), int(bulletCount[0]),
                                          creatorId[0], creatorName[0],
                                          int(creatorFanCount[0]))
                    # Serialize the CSV write and the result append so rows
                    # from different threads cannot interleave.
                    with write_lock:
                        csv_writer.writerow(all_data)
                        all_data_list.append(video)
            return all_data_list

        thread_list = []
        video_per_thread = videoCount // threadCount
        for i in range(threadCount):
            start_index = i * video_per_thread
            # The last thread also picks up the division remainder.
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            time.sleep(waitTime)  # stagger launches to throttle request rate
        for thread in thread_list:
            thread.join()
        print(all_data_list)
        return all_data_list
if __name__ == '__main__':
    # Truncate any previous run's output and write the CSV header row; the
    # crawler threads then append records in this column order.
    with open("file_3.csv", "w", encoding="GB18030", newline="") as f:
        csv.writer(f).writerow(
            ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
             "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # 6 videos across 2 threads, 0.3 s stagger