Revised patch: Bilibili ranking crawler, CSV service, and controller test.

Review notes (text before the first "diff --git" line is ignored by patch tools):
  * The submitted patch had its line breaks and indentation collapsed. Context
    and removed lines are reconstructed assuming 4-space indentation; the
    position of blank context lines is inferred from the hunk line counts.
    Confirm against the real pre-image before applying.
  * Hunk line counts are recomputed for the revised content. The "index" lines
    are carried over unchanged, so their post-image hashes are stale.
  * controller/SpyderController.py
      - getBilibiliVideoList now returns the crawled rows (it returned None).
      - videoCount <= 0 or threadCount <= 0 no longer raises ZeroDivisionError.
      - The ranking JSON is parsed once instead of once per video.
      - Workers only collect rows; the CSV is written once, in rank order, from
        the calling thread, through a context manager.
      - Each scraped field yields exactly one column even when an XPath matches
        zero or several nodes.
      - The fan count is scaled only when it carries a magnitude suffix, and a
        missing follow button yields "" instead of IndexError.
      - Counter conversion rounds instead of truncating and also handles "亿".
      - Dropped: the unused per-video request to a hard-coded user space page,
        the duplicated bulletCount conversion, and commented-out code.
      - HTTP requests carry a timeout; a failure on one video is logged and the
        rest of the slice continues.
  * service/CsvService.py
      - save() writes to filePath (it ignored it and wrote "file_3.csv") and no
        longer raises NotImplementedError after writing.
  * test/SpyderController_test.py
      - Commented-out code removed; the test asserts that a list is returned.

diff --git a/controller/SpyderController.py b/controller/SpyderController.py
index 012a320..a0d4314 100644
--- a/controller/SpyderController.py
+++ b/controller/SpyderController.py
@@ -1,5 +1,70 @@
+import threading
+import time
+import datetime
+import csv
+import json
+import logging
+import requests
+from lxml import etree
+
+logger = logging.getLogger(__name__)
+
+# Ranking API endpoint and the headers sent with every HTTP request.
+RANKING_URL = "https://api.bilibili.com/x/web-interface/ranking/v2?"
+REQUEST_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
+}
+# Seconds to wait before giving up on an HTTP request.
+REQUEST_TIMEOUT = 10
+# Hard cap on the number of videos crawled per call.
+MAX_VIDEO_COUNT = 20
+# Output file and its column order.
+CSV_PATH = "file_3.csv"
+CSV_HEADER = [
+    "topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount",
+    "coinCount", "favoriteCount", "bulletCount", "commentCount",
+    "creatorId", "creatorName", "creatorFanCount",
+]
+# Multipliers for the magnitude suffixes that appear in on-page counters.
+UNIT_FACTORS = {'万': 10_000, '亿': 100_000_000}
+
+
+def _convert_to_number(text):
+    """Expand a counter such as "1.2万" to "12000"; return other text unchanged."""
+    for unit, factor in UNIT_FACTORS.items():
+        if unit in text:
+            try:
+                return str(round(float(text.replace(unit, '')) * factor))
+            except ValueError:
+                # Not a plain number next to the suffix; keep the raw text.
+                return text
+    return text
+
+
+def _first_text(tree, xpath):
+    """Return the first non-blank text node matched by xpath, stripped, or ""."""
+    for node in tree.xpath(xpath):
+        if node.strip():
+            return node.strip()
+    return ""
+
+
+def _split_ranges(total, parts):
+    """Split range(total) into `parts` (>= 1) contiguous (start, end) pairs of near-equal size."""
+    size, extra = divmod(total, parts)
+    ranges = []
+    start = 0
+    for i in range(parts):
+        end = start + size + (1 if i < extra else 0)
+        ranges.append((start, end))
+        start = end
+    return ranges
+
+
 class SpyderController:
-    def getBilibiliVideoList(self,videoCount,threadCount,waitTime):
+    # Bilibili ranking crawler: fetches the ranking list, scrapes each ranked video's
+    # page, and appends one row per video to CSV_PATH.
+    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
         """
         整个爬虫的调用程序
         :param videoCount: 爬取的视频数量,若不限制可设置为999
@@ -7,3 +72,83 @@ class SpyderController:
         :param waitTime:float: 每个线程的等待时间,单位秒(避免爬取过快)
         :return: list[BilibiliVideo] 返回处理完成后的videoList
         """
+        # NOTE: returns one row per crawled video (list of str in CSV_HEADER order,
+        # sorted by rank); mapping rows to BilibiliVideo objects is left to the caller.
+        videoCount = min(videoCount, MAX_VIDEO_COUNT)
+        if videoCount <= 0:
+            return []
+
+        rank_text = requests.get(url=RANKING_URL, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT).text
+        # Parse the ranking once and share it read-only between the worker threads.
+        ranking = json.loads(rank_text)["data"]["list"]
+        videoCount = min(videoCount, len(ranking))
+        if videoCount == 0:
+            return []
+        # Never start more threads than videos, and always at least one.
+        threadCount = max(1, min(threadCount, videoCount))
+
+        # One slot per rank; every worker fills only its own slice, so no lock is needed.
+        rows = [None] * videoCount
+
+        def crawl_data(start_index, end_index):
+            for i in range(start_index, end_index):
+                try:
+                    rows[i] = self._crawl_video(i, ranking[i])
+                except Exception:
+                    # Thread boundary: one bad page must not abort the rest of the slice.
+                    logger.exception("failed to crawl video ranked %d", i + 1)
+
+        thread_list = []
+        for start_index, end_index in _split_ranges(videoCount, threadCount):
+            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
+            thread_list.append(thread)
+            thread.start()
+            # Stagger thread start-up so the requests do not all fire at once.
+            time.sleep(waitTime)
+
+        for thread in thread_list:
+            thread.join()
+
+        video_rows = [row for row in rows if row is not None]
+        # Write from a single thread, in rank order, so rows are never interleaved.
+        with open(CSV_PATH, "a", encoding="UTF-8", newline="") as f:
+            csv.writer(f).writerows(video_rows)
+        return video_rows
+
+    def _crawl_video(self, index, entry):
+        """Fetch the page of ranking entry `entry` (0-based `index`) and build its CSV row."""
+        video_url = "https://www.bilibili.com/video/av" + str(entry["aid"]) + "/?"
+        video_text = requests.get(url=video_url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT).text
+        tree = etree.HTML(video_text)
+
+        def counter(xpath):
+            return _convert_to_number(_first_text(tree, xpath))
+
+        # The fan count is the second whitespace-separated token of the follow button text.
+        follow_tokens = _first_text(
+            tree, '//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()').split()
+        creatorFanCount = _convert_to_number(follow_tokens[1]) if len(follow_tokens) > 1 else ""
+
+        return [
+            str(index + 1),
+            str(entry["bvid"]),
+            _first_text(tree, '//div[@class="video-info-title-inner"]//text()'),
+            video_url,
+            str(entry["ctime"]),
+            counter('//div[@class="view item"]/div/text()'),
+            counter('//div[@class="video-like video-toolbar-left-item"]/span/text()'),
+            counter('//div[@class="video-coin video-toolbar-left-item"]/span/text()'),
+            counter('//div[@class="video-fav video-toolbar-left-item"]/span/text()'),
+            counter('//div[@class="dm-text"]/text()'),
+            str(entry["stat"]["reply"]),
+            str(entry["owner"]["mid"]),
+            str(entry["owner"]["name"]),
+            creatorFanCount,
+        ]
+
+
+if __name__ == '__main__':
+    with open(CSV_PATH, "w", encoding="UTF-8", newline="") as f:
+        csv.writer(f).writerow(CSV_HEADER)
+    spyderController = SpyderController()
+    spyderController.getBilibiliVideoList(6, 2, 0.3)  # crawl 6 videos using 2 threads
diff --git a/service/CsvService.py b/service/CsvService.py
index 8df1c85..f8eaec3 100644
--- a/service/CsvService.py
+++ b/service/CsvService.py
@@ -1,7 +1,19 @@
 from IFileService import IFileService
 from entity.BilibiliVideo import BilibiliVideo
-
+from controller.SpyderController import SpyderController
+import csv
 
 class CsvService(IFileService):
     def save(self, filePath, videoList: list[BilibiliVideo]):
-        raise NotImplementedError
+        """Write videoList to filePath as UTF-8 CSV: one header row, then one row per video."""
+        with open(filePath, "w", encoding="UTF-8", newline="") as f:
+            csv_writer = csv.writer(f)
+            csv_writer.writerow(
+                ["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount", "coinCount", "favoriteCount",
+                 "bulletCount", "commentCount",
+                 "creatorId", "creatorName", "creatorFanCount"])
+            for video in videoList:
+                csv_writer.writerow(
+                    [video.topNo, video.bvId, video.title, video.url, video.uploadTimeText, video.viewCount, video.likeCount,
+                     video.coinCount, video.favoriteCount, video.bulletCount, video.commentCount,
+                     video.creatorId, video.creatorName, video.creatorFanCount])
diff --git a/test/SpyderController_test.py b/test/SpyderController_test.py
index 87dc99f..31f580c 100644
--- a/test/SpyderController_test.py
+++ b/test/SpyderController_test.py
@@ -2,11 +2,14 @@ from controller.SpyderController import SpyderController
 
 spyderController = SpyderController()
+import csv
 
 
 class TestSpyderController:
     def test_main(self):
         # for debug:
         print("testing SpyderController.main")
+        spyderController = SpyderController()
         lst = spyderController.getBilibiliVideoList(999, 1, 0.3)
         print(lst)
         # assert len(lst) == 999
+        assert isinstance(lst, list)