diff --git a/controller/SpyderController.py b/controller/SpyderController.py
--- a/controller/SpyderController.py
+++ b/controller/SpyderController.py
@@ -1,5 +1,17 @@
+import threading
+import time
+import datetime
+import csv
+import json
+import requests
+from lxml import etree
+
+from entity.BilibiliVideo import BilibiliVideo
+
+
 class SpyderController:
-    def getBilibiliVideoList(self,videoCount,threadCount,waitTime):
+    # Bilibili video crawler controller: fetches the ranking, scrapes the fields defined in BilibiliVideo.py and saves them to a CSV file
+    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
         """
         整个爬虫的调用程序
         :param videoCount: 爬取的视频数量,若不限制可设置为999
@@ -7,3 +19,113 @@ class SpyderController:
         :param waitTime:float: 每个线程的等待时间,单位秒(避免爬取过快)
         :return: list[BilibiliVideo] 返回处理完成后的videoList
         """
+        # Cap the request at 100 videos first so threadCount is clamped against the real total.
+        videoCount = min(int(videoCount), 100)
+        threadCount = int(threadCount)
+        waitTime = float(waitTime)
+        if videoCount <= 0 or threadCount <= 0:
+            # Nothing to crawl; this also keeps the per-thread division below safe.
+            return []
+
+        url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
+        }
+        # Download and decode the ranking once; every worker reads the same list.
+        rank_text = requests.get(url=url, headers=headers, timeout=10).text
+        rank_list = json.loads(rank_text)["data"]["list"][:videoCount]
+        if not rank_list:
+            return []
+        videoCount = len(rank_list)
+        threadCount = min(threadCount, videoCount)
+        # One slot per rank: each worker fills only its own slots, so the result stays
+        # in rank order however the threads interleave.
+        results = [None] * videoCount
+
+        def to_count(text):
+            # Turn a displayed count such as "12.3万" into the integer 123000.
+            text = text.strip()
+            if '万' in text:
+                # round() keeps a product that lands just below the integer from truncating.
+                return int(round(float(text.replace('万', '')) * 10000))
+            return int(text)
+
+        def first_text(tree, path):
+            # Return the first text node matched by path, or fail with a clear message.
+            nodes = tree.xpath(path)
+            if not nodes:
+                raise ValueError("nothing matched " + path)
+            return nodes[0]
+
+        def crawl_video(index):
+            # Build the BilibiliVideo and the CSV row for ranking entry index.
+            entry = rank_list[index]
+            video_url = "https://www.bilibili.com/video/av" + str(entry["aid"]) + "/?"
+            video_text = requests.get(url=video_url, headers=headers, timeout=10).text
+            tree = etree.HTML(video_text)
+            if tree is None:
+                raise ValueError("empty page for " + video_url)
+            fan_nodes = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
+            if fan_nodes:
+                # The count is the second whitespace-separated token of the button text.
+                creatorFanCount = to_count(fan_nodes[0].split()[1])
+            else:
+                creatorFanCount = 1
+            # Listed in the order of BilibiliVideo's arguments, which is also the order of
+            # the header row written by the __main__ block.
+            row = [
+                str(entry["bvid"]),
+                first_text(tree, '//div[@class="video-info-title-inner"]//text()'),
+                video_url,
+                int(entry["ctime"]),
+                index + 1,
+                to_count(first_text(tree, '//div[@class="view item"]/div/text()')),
+                to_count(first_text(tree, '//div[@class="video-like video-toolbar-left-item"]/span/text()')),
+                to_count(first_text(tree, '//div[@class="video-coin video-toolbar-left-item"]/span/text()')),
+                to_count(first_text(tree, '//div[@class="video-fav video-toolbar-left-item"]/span/text()')),
+                int(entry["stat"]["reply"]),
+                to_count(first_text(tree, '//div[@class="dm-text"]/text()')),
+                str(entry["owner"]["mid"]),
+                str(entry["owner"]["name"]),
+                creatorFanCount,
+            ]
+            return BilibiliVideo(*row), row
+
+        def crawl_data(start_index, end_index):
+            for index in range(start_index, end_index):
+                try:
+                    results[index] = crawl_video(index)
+                except (requests.RequestException, etree.LxmlError, LookupError, ValueError) as error:
+                    # One unreadable page must not abort the rest of this thread's range.
+                    print("skipping rank", index + 1, "-", error)
+
+        thread_list = []
+        video_per_thread = videoCount // threadCount
+        for i in range(threadCount):
+            start_index = i * video_per_thread
+            # The last thread also takes whatever the integer division left over.
+            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
+            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
+            thread_list.append(thread)
+            thread.start()
+            time.sleep(waitTime)
+
+        for thread in thread_list:
+            thread.join()
+
+        crawled = [item for item in results if item is not None]
+        # Append the rows from a single place once every worker has finished, instead of
+        # having each thread open and append to the same file.
+        with open("file_3.csv", "a", encoding="GB18030", newline="") as f:
+            csv.writer(f).writerows(row for _, row in crawled)
+        return [video for video, _ in crawled]
+
+
+if __name__ == '__main__':
+    # Start file_3.csv over with the header row; the crawl then appends the data rows.
+    with open("file_3.csv", "w", encoding="GB18030", newline="") as f:
+        csv.writer(f).writerow(
+            ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
+             "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
+    spyderController = SpyderController()
+    spyderController.getBilibiliVideoList(6, 2, 0.3)  # two threads
diff --git a/controller/UIController.py b/controller/UIController.py
index 6d0f557..e94d3cb 100644
--- a/controller/UIController.py
+++ b/controller/UIController.py
@@ -20,9 +20,9 @@ class UIController:
         print("zzr1")
         #一些属性
         ##csv文件路径
-        csv_path=".\csv_file"
+        csv_path="./csv_file"
         ##
-        excel_path=".\excel_file"
+        excel_path="./excel_file"
         # 创建主窗口
         root = tk.Tk()
         root.title("Python UI")
@@ -39,9 +39,10 @@ class UIController:
             waitTime=entry3.get()
             #创建 SpyderController对象调用其函数
             SpyderController=SC.SpyderController()
-            global scRuslt_data
-            scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime)
+            # global scRuslt_data
+            self.scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime)
             print("爬取完成")
+            # print(scRuslt_data)
 
 
 
@@ -112,7 +113,7 @@ class UIController:
         button_stop.grid(row=4, column=1,sticky=tk.W)
         button_save_to_excel = tk.Button(root, text="save to excel", command=button_save_to_exce_click)
         button_save_to_excel.grid(row=5, column=1, sticky=tk.W)
-        button_save_to_csv = tk.Button(root, text="save to excel", command=button_save_to_csv_click)
+        button_save_to_csv = tk.Button(root, text="save to csv", command=button_save_to_csv_click)
         button_save_to_csv.grid(row=6, column=1,sticky=tk.W)
         # 创建一个带展示框
         tree = ttk.Treeview(root, columns=("bvid", "title","url","upload","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount"))
diff --git a/service/CsvService.py b/service/CsvService.py
--- a/service/CsvService.py
+++ b/service/CsvService.py
@@ -1,7 +1,24 @@
 from service.IFileService import IFileService
 from entity.BilibiliVideo import BilibiliVideo
-
+from controller.SpyderController import SpyderController
+import csv
 
 class CsvService(IFileService):
     def save(self, filePath, videoList):
-        raise NotImplementedError
+        """
+        Write videoList to filePath + ".csv": a header row, then one row per video.
+        :param filePath: destination path; ".csv" is appended to it
+        :param videoList: iterable of BilibiliVideo objects to export
+        """
+        # The with-block closes the file even if a row cannot be written.
+        with open(filePath + ".csv", "w", encoding="GB18030", newline="") as f:
+            csv_writer = csv.writer(f)
+            csv_writer.writerow(
+                ["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount", "coinCount", "favoriteCount",
+                 "bulletCount", "commentCount",
+                 "creatorId", "creatorName", "creatorFanCount"])
+            for video in videoList:
+                csv_writer.writerow(
+                    [video.topNo, video.bvId, video.title, video.url, video.uploadTimeText, video.viewCount, video.likeCount,
+                     video.coinCount, video.favoriteCount, video.bulletCount, video.commentCount,
+                     video.creatorId, video.creatorName, video.creatorFanCount])
diff --git a/test/SpyderController_test.py b/test/SpyderController_test.py
--- a/test/SpyderController_test.py
+++ b/test/SpyderController_test.py
@@ -2,11 +2,14 @@ from controller.SpyderController import SpyderController
 
 spyderController = SpyderController()
 
 
 class TestSpyderController:
     def test_main(self):
         # for debug:
         print("testing SpyderController.main")
         lst = spyderController.getBilibiliVideoList(999, 1, 0.3)
         print(lst)
-        print(123)
\ No newline at end of file
+        # The crawler caps a request at 100 videos, so asking for 999 can
+        # return at most 100 results.
+        assert isinstance(lst, list)
+        assert len(lst) <= 100