Spyder_python/controller/SpyderController.py

import threading
import time
import datetime
import csv
import json
import requests
from lxml import etree
from entity.BilibiliVideo import BilibiliVideo
class SpyderController:
    # Bilibili video crawler controller: opens the web pages, scrapes the fields
    # defined in BilibiliVideo.py, and saves them to a CSV file
    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
"""
整个爬虫的调用程序
:param videoCount: 爬取的视频数量若不限制可设置为999
:param threadCount: 爬取的线程并发数量
:param waitTime:float: 每个线程的等待时间单位秒避免爬取过快
:return: list[BilibiliVideo] 返回处理完成后的videoList
"""
all_data_list = []
videoCount = int(videoCount)
threadCount = int(threadCount)
waitTime = float(waitTime)
        if videoCount < threadCount:
            threadCount = videoCount
        if videoCount > 20:
            videoCount = 20
        url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
        }
        # f = open("file_3.csv", "a", encoding="UTF-8", newline="")
        # csv_writer = csv.writer(f)
        rank_text = requests.get(url=url, headers=headers).text
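        # The ranking API returns a single JSON document whose data.list array
        # holds the ranked videos; it is fetched once here and shared (read-only)
        # by every worker thread.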
        def crawl_data(start_index, end_index):
            # Convert values containing "万" (ten thousand) into plain digit strings
            def convert_to_number(lst):
                result = []
                for item in lst:
                    if '万' in item:
                        number = int(float(item.replace('万', '')) * 10000)
                        result.append(str(number))
                    else:
                        result.append(item)
                return result
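            # e.g. convert_to_number(["12.3万", "456"]) -> ["123000", "456"];
            # values without "万" pass through unchanged.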
            # url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
            # headers = {
            #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
            # }
            f = open("file_3.csv", "a", encoding="UTF-8", newline="")
            csv_writer = csv.writer(f)
            #
            # rank_text = requests.get(url=url, headers=headers).text
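            # Note: every thread appends to the same file_3.csv; csv.writer is
            # not synchronized, so rows from concurrent threads may interleave.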
            for i in range(start_index, end_index):
                parsed_data = json.loads(rank_text)
                aid = str(parsed_data["data"]["list"][i]["aid"])
                print(aid)
                bvId = [str(parsed_data['data']['list'][i]['bvid'])]
                topNo = [str(i + 1)]
                video_url = "https://www.bilibili.com/video/av" + aid + "/?"
                video_text = requests.get(url=video_url, headers=headers).text
                tree = etree.HTML(video_text)
                # print(video_text)
                title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
                uploadTime = [str(parsed_data["data"]["list"][i]["ctime"])]
                viewCount = tree.xpath('//div[@class="view item"]/div/text()')
                likeCount = tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()')
                coinCount = tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()')
                favoriteCount = tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()')
                bulletCount = tree.xpath('//div[@class="dm-text"]/text()')
                creatorFanCount = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
                viewCount = convert_to_number(viewCount)
                likeCount = convert_to_number(likeCount)
                coinCount = convert_to_number(coinCount)
                favoriteCount = convert_to_number(favoriteCount)
                bulletCount = convert_to_number(bulletCount)
                # match = re.search(r'\d+', text)
                # number = match.group()
                if not creatorFanCount:
                    creatorFanCount = [str(1)]
                else:
                    followers_str = creatorFanCount[0].strip().split()[1]
                    followers_num = float(followers_str.replace('万', '')) * 10000
                    # convert to an integer
                    followers_num = int(followers_num)
                    creatorFanCount = [str(followers_num)]
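                # When the follow-button XPath matches nothing, the fan count
                # defaults to "1" rather than failing the whole row.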
                commentCount = [str(parsed_data['data']['list'][i]['stat']['reply'])]
                creatorId = [str(parsed_data['data']['list'][i]['owner']['mid'])]
                creatorName = [str(parsed_data['data']['list'][i]['owner']['name'])]
                # up_url = "https://space.bilibili.com/" + creatorId[0] + "?"
                # NOTE: the uploader space URL is hardcoded; the commented line
                # above builds it from creatorId instead.
                up_url = "https://space.bilibili.com/401742377?spm_id_from=333.788.0.0"
                up_text = requests.get(url=up_url, headers=headers).text
                tree = etree.HTML(up_text)
                # print(up_text)
                all_data = bvId + title + [video_url] + uploadTime + topNo + viewCount + likeCount + coinCount + favoriteCount + commentCount + bulletCount + creatorId + creatorName + creatorFanCount
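                # Column order matches the header row written in __main__.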
                csv_writer.writerow(all_data)
                # all_data_list = all_data_list + [topNo, bvId, title, [video_url], uploadTime, viewCount, likeCount, coinCount, favoriteCount, bulletCount, commentCount, creatorId, creatorName, creatorFanCount]
                video = BilibiliVideo(bvId[0], title[0], video_url, int(uploadTime[0]), int(topNo[0]), int(viewCount[0]),
                                      int(likeCount[0]), int(coinCount[0]), int(favoriteCount[0]), int(commentCount[0]),
                                      int(bulletCount[0]), creatorId[0], creatorName[0], int(creatorFanCount[0]))
                all_data_list.append(video)
            f.close()
            # print(all_data_list)
            # The return value is discarded when crawl_data runs as a Thread
            # target; results accumulate in the shared all_data_list instead.
            return all_data_list
        thread_list = []
        video_per_thread = videoCount // threadCount
        for i in range(threadCount):
            start_index = i * video_per_thread
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
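            # e.g. videoCount=6, threadCount=2 -> ranges [0, 3) and [3, 6);
            # the last thread also absorbs any remainder from the division.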
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            time.sleep(waitTime)
        for thread in thread_list:
            thread.join()
        print(all_data_list)
        return all_data_list
if __name__ == '__main__':
    f = open("file_3.csv", "w", encoding="UTF-8", newline="")
    csv_writer = csv.writer(f)
    csv_writer.writerow(
        ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
         "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
    f.close()
    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # thread count set to 2