diff --git a/.gitignore b/.gitignore index 32fde51..a5cfbd6 100644 --- a/.gitignore +++ b/.gitignore @@ -78,3 +78,8 @@ fabric.properties .idea/caches/build_file_checksums.ser /.idea +/.vs +__pycache__ +__pycache__ +/*.xlsx +/*.csv diff --git a/README.md b/README.md index dc82be9..ddb9186 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,21 @@ # Spyder_python +本项目采用多线程爬虫技术,实现了对b站热门排行榜的播放数据爬取与分析,其主要功能包括:对b站排行榜的爬取与分析,数据整理,报表显示,报表输出为csv和excel,分析报告生成,特点是多线程、用户友好、良好git管理、拥有以pytest规范构建的单元测试。 + +随着互联网技术的飞速发展和普及,网络视频平台如bilibili(简称B站)已经成为大众获取信息、娱乐休闲的重要途径。B站以其独特的弹幕文化和丰富的视频资源吸引了大量年轻用户,形成了一个极具活力的社区。在这个背景下,对B站数据的爬取和分析变得尤为重要。 + +B站的热门榜数据具有极高的研究价值和应用前景。视频的播放量、点赞量、评论数等数据可以反映视频的热度和受欢迎程度,对于视频创作者和平台运营者来说,这些数据是优化内容、提升用户体验的重要依据。 + +开发一个针对B站的爬虫项目,实现对B站数据的自动化抓取和分析,对于学术研究、商业分析还是个人兴趣探索都具有重要意义。 + + + +![](assets/2024-04-26-10-47-09-image.png) + +
良好的git分支管理
+ +![](assets/2024-04-26-10-50-50-image.png) + +
项目结构展示
+ + diff --git a/assets/2024-04-26-10-47-09-image.png b/assets/2024-04-26-10-47-09-image.png new file mode 100644 index 0000000..cc38081 Binary files /dev/null and b/assets/2024-04-26-10-47-09-image.png differ diff --git a/assets/2024-04-26-10-50-50-image.png b/assets/2024-04-26-10-50-50-image.png new file mode 100644 index 0000000..331cbcb Binary files /dev/null and b/assets/2024-04-26-10-50-50-image.png differ diff --git a/controller/SpyderController.py b/controller/SpyderController.py index 2a1d2ee..7605ec5 100644 --- a/controller/SpyderController.py +++ b/controller/SpyderController.py @@ -27,14 +27,14 @@ class SpyderController: threadCount = videoCount if videoCount > 100: videoCount = 100 - url = "https://api.bilibili.com/x/web-interface/ranking/v2?" + json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0' } #f = open("file_3.csv", "a", encoding="UTF-8", newline="") #csv_writer = csv.writer(f) - rank_text = requests.get(url=url, headers=headers).text + rank_text = requests.get(url=json_url, headers=headers).text # 将含有"万"的数据转换为数字 def crawl_data(start_index, end_index): def convert_to_number(lst): @@ -63,8 +63,8 @@ class SpyderController: bvId = [str(parsed_data['data']['list'][i]['bvid'])] topNo = [str(i+1)] - video_url = "https://www.bilibili.com/video/av" + aid + "/?" - video_text = requests.get(url=video_url, headers=headers).text + url = "https://www.bilibili.com/video/av" + aid + "/?" + video_text = requests.get(url=url, headers=headers).text tree = etree.HTML(video_text) #print(video_text) title = tree.xpath('//div[@class="video-info-title-inner"]//text()') diff --git a/controller/UIController.py b/controller/UIController.py index e94d3cb..dd7e284 100644 --- a/controller/UIController.py +++ b/controller/UIController.py @@ -35,14 +35,30 @@ class UIController: # 这里可以添加按钮点击事件的逻辑 print("start_Button clicked!") videoCount=entry1.get() - threadCount=entry2.get() - waitTime=entry3.get() + threadCount = entry2.get() + waitTime = entry3.get() + if(videoCount==''): + videoCount=999 + threadCount=10 + waitTime=0.3 + else: + if videoCount.isdigit(): + videoCount=eval(videoCount) + else: + return print("video count is not digit") + if threadCount.isdigit(): + threadCount=eval(threadCount) + else: + return print("thread count is not num ") + if waitTime.isdigit(): + waitTime=eval(waitTime) + else: + return print("waitTime is not num") #创建 SpyderController对象调用其函数 SpyderController=SC.SpyderController() - # global scRuslt_data + self.scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime) print("爬取完成") - # print(scRuslt_data) @@ -61,6 +77,8 @@ class UIController: theList.append(data.bvId) theList.append(data.title) theList.append(data.url) + theList.append(data.uploadTime) + theList.append(data.uploadTimeText) theList.append(data.topNo) theList.append(data.viewCount) theList.append(data.likeCount) @@ -116,21 +134,22 @@ class UIController: button_save_to_csv = tk.Button(root, text="save to csv", command=button_save_to_csv_click) button_save_to_csv.grid(row=6, column=1,sticky=tk.W) # 创建一个带展示框 - tree = ttk.Treeview(root, columns=("bvid", "title","url","upload","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount")) + tree = ttk.Treeview(root, columns=("bvid", "title","url",'uploadTime',"uploadTimeText","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount")) tree.heading("#1", text="bvid") tree.heading("#2", text="title") tree.heading('#3', text="url") - tree.heading('#4', text="upload") - tree.heading('#5', text="topNo") - tree.heading('#6', text="ViewCount") - tree.heading('#7', text="likeCount") - tree.heading('#8', text="coinCount") - tree.heading('#9', text="favorite") - tree.heading('#10', text="commentCount") - tree.heading('#11', text="bulletCount") - tree.heading('#12', text="creadtorId") - tree.heading('#13', text="creatorName") - tree.heading('#14', text="createFanCount") + tree.heading('#4', text="uploadtime") + tree.heading('#5', text="uploadtimeTexT") + tree.heading('#6', text="topNo") + tree.heading('#7', text="ViewCount") + tree.heading('#8', text="likeCount") + tree.heading('#9', text="coinCount") + tree.heading('#10', text="favorite") + tree.heading('#11', text="commentCount") + tree.heading('#12', text="bulletCount") + tree.heading('#13', text="creadtorId") + tree.heading('#14', text="creatorName") + tree.heading('#15', text="createFanCount") # 向展示框添加数据 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..eae95f8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +lxml==5.2.0 +pytest==8.1.1 +Requests==2.31.0