You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Spyder_python/controller/SpyderController.py

139 lines
6.8 KiB

9 months ago
import threading
import time
import datetime
import csv
import json
import requests
from lxml import etree
from entity.BilibiliVideo import BilibiliVideo
9 months ago
9 months ago
class SpyderController:
    """Bilibili video spider controller.

    Fetches the Bilibili ranking API, then scrapes each ranked video's page
    for the fields declared in BilibiliVideo. Each record is appended as a
    row to ``file_3.csv`` and collected into the returned list.
    """

    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
        """
        Run the whole crawl.

        :param videoCount: number of videos to crawl; the ranking API returns
                           at most 100 entries, so larger values are capped.
        :param threadCount: number of concurrent crawler threads.
        :param waitTime: float, delay in seconds between starting successive
                         threads, to avoid hitting the site too fast.
        :return: list[BilibiliVideo] — the crawled videos.
        """
        all_data_list = []
        videoCount = int(videoCount)
        threadCount = int(threadCount)
        waitTime = float(waitTime)

        # Never run more threads than there are videos, and never ask for
        # more entries than the ranking endpoint provides (100).
        if videoCount < threadCount:
            threadCount = videoCount
        if videoCount > 100:
            videoCount = 100
        # Nothing to do (also avoids ZeroDivisionError in the split below).
        if videoCount <= 0 or threadCount <= 0:
            return all_data_list

        json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
        }
        rank_text = requests.get(url=json_url, headers=headers).text
        # Parse the ranking payload once, instead of once per video.
        parsed_data = json.loads(rank_text)

        def convert_to_number(lst):
            # Convert count strings such as "12.3万" (万 = 10,000) into plain
            # integer strings; values without 万 pass through unchanged.
            result = []
            for item in lst:
                if '万' in item:
                    number = int(float(item.replace('万', '')) * 10000)
                    result.append(str(number))
                else:
                    result.append(item)
            return result

        def crawl_data(start_index, end_index):
            # Worker thread: crawl ranking entries [start_index, end_index),
            # writing one CSV row and one BilibiliVideo per entry.
            with open("file_3.csv", "a", encoding="GB18030", newline="") as f:
                csv_writer = csv.writer(f)
                for i in range(start_index, end_index):
                    entry = parsed_data["data"]["list"][i]
                    aid = str(entry["aid"])
                    print(aid)
                    bvId = [str(entry['bvid'])]
                    topNo = [str(i + 1)]
                    url = "https://www.bilibili.com/video/av" + aid + "/?"
                    video_text = requests.get(url=url, headers=headers).text

                    # Counts shown on the page use the "万" suffix; the JSON
                    # payload supplies the remaining fields directly.
                    tree = etree.HTML(video_text)
                    title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
                    uploadTime = [str(entry["ctime"])]
                    viewCount = convert_to_number(
                        tree.xpath('//div[@class="view item"]/div/text()'))
                    likeCount = convert_to_number(
                        tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()'))
                    coinCount = convert_to_number(
                        tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()'))
                    favoriteCount = convert_to_number(
                        tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()'))
                    bulletCount = convert_to_number(
                        tree.xpath('//div[@class="dm-text"]/text()'))

                    creatorFanCount = tree.xpath(
                        '//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
                    if not creatorFanCount:
                        # Follow button absent (e.g. already following);
                        # fall back to a placeholder of 1.
                        creatorFanCount = [str(1)]
                    else:
                        # Button text looks like "关注 12.3万": take the
                        # second token and expand the 万 suffix.
                        followers_str = creatorFanCount[0].strip().split()[1]
                        followers_num = int(float(followers_str.replace('万', '')) * 10000)
                        creatorFanCount = [str(followers_num)]

                    commentCount = [str(entry['stat']['reply'])]
                    creatorId = [str(entry['owner']['mid'])]
                    creatorName = [str(entry['owner']['name'])]
                    # NOTE(review): a hard-coded fetch of one uploader's space
                    # page used to happen here; its result was never used, so
                    # the request has been dropped.

                    all_data = (bvId + title + [url] + uploadTime + topNo
                                + viewCount + likeCount + coinCount + favoriteCount
                                + bulletCount + commentCount + creatorId
                                + creatorName + creatorFanCount)
                    csv_writer.writerow(all_data)

                    video = BilibiliVideo(
                        bvId[0], title[0], url, int(uploadTime[0]), int(topNo[0]),
                        int(viewCount[0]), int(likeCount[0]), int(coinCount[0]),
                        int(favoriteCount[0]), int(commentCount[0]), int(bulletCount[0]),
                        creatorId[0], creatorName[0], int(creatorFanCount[0]))
                    # list.append is atomic in CPython, so concurrent workers
                    # can share this accumulator safely.
                    all_data_list.append(video)

        # Split the index range evenly; the last thread takes the remainder.
        thread_list = []
        video_per_thread = videoCount // threadCount
        for i in range(threadCount):
            start_index = i * video_per_thread
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            # Stagger thread start-up so requests do not burst all at once.
            time.sleep(waitTime)
        for thread in thread_list:
            thread.join()
        print(all_data_list)
        return all_data_list
9 months ago
if __name__ == '__main__':
    # Write the CSV header first (mode "w" truncates any previous run);
    # the spider then appends one data row per video in mode "a".
    with open("file_3.csv", "w", encoding="GB18030", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(
            ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
             "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # crawl 6 videos on 2 threads