import threading
import time
import csv
import json

import requests
from lxml import etree

from entity.BilibiliVideo import BilibiliVideo


class SpyderController:
    # Crawler controller for Bilibili videos: opens the web pages, scrapes the
    # fields defined in entity/BilibiliVideo.py, and saves the results to a CSV file.

    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
        """
        Entry point that drives the whole crawl.

        :param videoCount: number of videos to crawl; pass 999 for "unlimited"
                           (the value is capped at 100 below, the size of the ranking list)
        :param threadCount: number of concurrent crawler threads
        :param waitTime: float, seconds to wait between thread launches,
                         so the site is not hit too quickly
        :return: list[BilibiliVideo] the populated videoList
        """
        all_data_list = []
        videoCount = int(videoCount)
        threadCount = int(threadCount)
        waitTime = float(waitTime)
        if videoCount < threadCount:
            threadCount = videoCount
        if videoCount > 100:
            videoCount = 100  # the ranking API returns at most 100 entries

        json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
        }
        csv_lock = threading.Lock()  # serializes row writes from the crawler threads

        rank_text = requests.get(url=json_url, headers=headers).text
        parsed_data = json.loads(rank_text)  # parse once; worker threads only read it
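
        # Fields consumed from the ranking response below (a minimal sketch of
        # the layout this crawler relies on, not the full API schema):
        #   parsed_data["data"]["list"][i]["aid"]            numeric video id
        #   parsed_data["data"]["list"][i]["bvid"]           BV-style video id
        #   parsed_data["data"]["list"][i]["ctime"]          upload timestamp
        #   parsed_data["data"]["list"][i]["stat"]["reply"]  comment count
        #   parsed_data["data"]["list"][i]["owner"]["mid"]   uploader id
        #   parsed_data["data"]["list"][i]["owner"]["name"]  uploader name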

        def crawl_data(start_index, end_index):
            # Convert count strings such as "12.3万" ("万" = 10,000) into plain
            # integer strings; items without "万" pass through unchanged.
            def convert_to_number(lst):
                result = []
                for item in lst:
                    if '万' in item:
                        number = int(float(item.replace('万', '')) * 10000)
                        result.append(str(number))
                    else:
                        result.append(item)
                return result
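
            # For example: convert_to_number(["12.3万", "456"]) returns ["123000", "456"].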

            # Each thread appends through its own file handle; rows are written
            # under csv_lock so they do not interleave.
            f = open("file_3.csv", "a", encoding="GB18030", newline="")
            csv_writer = csv.writer(f)

            for i in range(start_index, end_index):
                aid = str(parsed_data["data"]["list"][i]["aid"])
                print(aid)

                bvId = [str(parsed_data['data']['list'][i]['bvid'])]
                topNo = [str(i + 1)]

                url = "https://www.bilibili.com/video/av" + aid + "/?"
                video_text = requests.get(url=url, headers=headers).text
                tree = etree.HTML(video_text)

                # Scrape on-page statistics; these XPaths match the current desktop
                # video-page layout and will need updating if Bilibili changes it.
                # The title's text nodes are joined so it stays one CSV column.
                title = ["".join(tree.xpath('//div[@class="video-info-title-inner"]//text()')).strip()]
                uploadTime = [str(parsed_data["data"]["list"][i]["ctime"])]
                viewCount = tree.xpath('//div[@class="view item"]/div/text()')
                likeCount = tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()')
                coinCount = tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()')
                favoriteCount = tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()')
                bulletCount = tree.xpath('//div[@class="dm-text"]/text()')
                creatorFanCount = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')

                viewCount = convert_to_number(viewCount)
                likeCount = convert_to_number(likeCount)
                coinCount = convert_to_number(coinCount)
                favoriteCount = convert_to_number(favoriteCount)
                bulletCount = convert_to_number(bulletCount)

                # Fall back to a placeholder of 1 when the follow-button text is
                # not found on the page.
                if not creatorFanCount:
                    creatorFanCount = [str(1)]
                else:
                    # Assumes button text of the form "<label> <count>", where the
                    # count may carry a "万" (10,000) suffix.
                    followers_str = creatorFanCount[0].strip().split()[1]
                    followers_num = int(float(followers_str.replace('万', '')) * 10000)
                    creatorFanCount = [str(followers_num)]
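                # For example, button text "关注 12.3万" (the exact label is an
                # assumption about the page) yields creatorFanCount == ["123000"].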

                commentCount = [str(parsed_data['data']['list'][i]['stat']['reply'])]
                creatorId = [str(parsed_data['data']['list'][i]['owner']['mid'])]
                creatorName = [str(parsed_data['data']['list'][i]['owner']['name'])]

                # Keep the column order in sync with the header row in __main__.
                all_data = (bvId + title + [url] + uploadTime + topNo + viewCount + likeCount + coinCount
                            + favoriteCount + commentCount + bulletCount + creatorId + creatorName + creatorFanCount)
                with csv_lock:
                    csv_writer.writerow(all_data)

                video = BilibiliVideo(bvId[0], title[0], url, int(uploadTime[0]), int(topNo[0]),
                                      int(viewCount[0]), int(likeCount[0]), int(coinCount[0]),
                                      int(favoriteCount[0]), int(commentCount[0]), int(bulletCount[0]),
                                      creatorId[0], creatorName[0], int(creatorFanCount[0]))
                all_data_list.append(video)
            f.close()

        thread_list = []
        # Split the work evenly across threads; the last thread also picks up
        # the remainder of the integer division.
        video_per_thread = videoCount // threadCount
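        # For example, with videoCount=6 and threadCount=2 (the __main__ call),
        # thread 0 crawls indices [0, 3) and thread 1 crawls [3, 6).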

        for i in range(threadCount):
            start_index = i * video_per_thread
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            time.sleep(waitTime)  # stagger launches so requests stay throttled

        for thread in thread_list:
            thread.join()
        print(all_data_list)
        return all_data_list


if __name__ == '__main__':
    # Write the CSV header first; the crawler threads then append data rows.
    f = open("file_3.csv", "w", encoding="GB18030", newline="")
    csv_writer = csv.writer(f)
    csv_writer.writerow(
        ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
         "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
    f.close()

    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # crawl 6 videos with 2 threads