diff --git a/controller/SpyderController.py b/controller/SpyderController.py index a0d4314..9fb7797 100644 --- a/controller/SpyderController.py +++ b/controller/SpyderController.py @@ -6,6 +6,8 @@ import json import requests from lxml import etree +from entity.BilibiliVideo import BilibiliVideo + class SpyderController: # Bilibili视频爬虫控制器,打开网页,爬取BilibiliVideo.py中的数据,将其下载下来,保存为csv文件 @@ -17,6 +19,10 @@ class SpyderController: :param waitTime:float: 每个线程的等待时间,单位秒(避免爬取过快) :return: list[BilibiliVideo] 返回处理完成后的videoList """ + all_data_list = [] + videoCount = int(videoCount) + threadCount = int(threadCount) + waitTime = float(waitTime) if videoCount < threadCount: threadCount = videoCount if videoCount > 20: @@ -76,11 +82,14 @@ class SpyderController: bulletCount = convert_to_number(bulletCount) # match = re.search(r'\d+', text) # number = match.group() - followers_str = creatorFanCount[0].strip().split()[1] - followers_num = float(followers_str.replace('万', '')) * 10000 - # 转化为整数 - followers_num = int(followers_num) - creatorFanCount = [str(followers_num)] + if not creatorFanCount: + creatorFanCount = [str(1)] + else : + followers_str = creatorFanCount[0].strip().split()[1] + followers_num = float(followers_str.replace('万', '')) * 10000 + # 转化为整数 + followers_num = int(followers_num) + creatorFanCount = [str(followers_num)] commentCount = [str(parsed_data['data']['list'][i]['stat']['reply'])] creatorId = [str(parsed_data['data']['list'][i]['owner']['mid'])] creatorName = [str(parsed_data['data']['list'][i]['owner']['name'])] @@ -91,13 +100,16 @@ class SpyderController: up_text = requests.get(url=up_url, headers=headers).text tree = etree.HTML(up_text) #print(up_text) + all_data = bvId + title + [url] + uploadTime + topNo + viewCount + likeCount + coinCount + favoriteCount + bulletCount + commentCount + creatorId + creatorName + creatorFanCount - all_data = topNo + bvId + title + [ - video_url] + uploadTime + viewCount + likeCount + coinCount + favoriteCount + bulletCount + commentCount + creatorId + creatorName + creatorFanCount csv_writer.writerow(all_data) - all_data_list = [] - all_data_list = all_data_list + [topNo, bvId, title, [video_url], uploadTime, viewCount, likeCount, coinCount, favoriteCount, bulletCount, commentCount, creatorId, creatorName, creatorFanCount] + + # all_data_list = all_data_list + [topNo, bvId, title, [video_url], uploadTime, viewCount, likeCount, coinCount, favoriteCount, bulletCount, commentCount, creatorId, creatorName, creatorFanCount] + video = BilibiliVideo(bvId[0], title[0], url, int(uploadTime[0]), int(topNo[0]), int(viewCount[0]), int(likeCount[0]), int(coinCount[0]), + int(favoriteCount[0]), int(commentCount[0]), int(bulletCount[0]), creatorId[0], creatorName[0], int(creatorFanCount[0])) + all_data_list.append(video) f.close() + # print(all_data_list) return all_data_list thread_list = [] @@ -113,15 +125,15 @@ class SpyderController: for thread in thread_list: thread.join() - + print(all_data_list) + return all_data_list if __name__ == '__main__': f = open("file_3.csv", "w", encoding="UTF-8", newline="") csv_writer = csv.writer(f) csv_writer.writerow( - ["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount", "coinCount", "favoriteCount", - "bulletCount", "commentCount", - "creatorId", "creatorName", "creatorFanCount"]) + ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount", + "favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"]) f.close() spyderController = SpyderController() spyderController.getBilibiliVideoList(6, 2, 0.3) # 设置线程数为2 \ No newline at end of file diff --git a/controller/UIController.py b/controller/UIController.py index 6d0f557..284ff6b 100644 --- a/controller/UIController.py +++ b/controller/UIController.py @@ -39,9 +39,10 @@ class UIController: waitTime=entry3.get() #创建 SpyderController对象调用其函数 SpyderController=SC.SpyderController() - global scRuslt_data - scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime) + # global scRuslt_data + self.scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime) print("爬取完成") + # print(scRuslt_data) diff --git a/service/CsvService.py b/service/CsvService.py index 4bcc691..5e26627 100644 --- a/service/CsvService.py +++ b/service/CsvService.py @@ -17,5 +17,5 @@ class CsvService(IFileService): video.coinCount, video.favoriteCount, video.bulletCount, video.commentCount, video.creatorId, video.creatorName, video.creatorFanCount]) f.close() - raise NotImplementedError + # raise NotImplementedError