diff --git a/.gitignore b/.gitignore
index 32fde51..a5cfbd6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,3 +78,8 @@ fabric.properties
.idea/caches/build_file_checksums.ser
/.idea
+/.vs
+__pycache__
+__pycache__
+/*.xlsx
+/*.csv
diff --git a/README.md b/README.md
index dc82be9..ddb9186 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,21 @@
# Spyder_python
+本项目采用多线程爬虫技术,实现了对B站热门排行榜播放数据的爬取与分析。主要功能包括:排行榜数据的爬取与分析、数据整理、报表显示、报表导出为csv和excel、分析报告生成。项目特点:多线程、用户友好、良好的git管理,并拥有按pytest规范构建的单元测试。
+
+随着互联网技术的飞速发展和普及,网络视频平台如bilibili(简称B站)已经成为大众获取信息、娱乐休闲的重要途径。B站以其独特的弹幕文化和丰富的视频资源吸引了大量年轻用户,形成了一个极具活力的社区。在这个背景下,对B站数据的爬取和分析变得尤为重要。
+
+B站的热门榜数据具有极高的研究价值和应用前景。视频的播放量、点赞量、评论数等数据可以反映视频的热度和受欢迎程度,对于视频创作者和平台运营者来说,这些数据是优化内容、提升用户体验的重要依据。
+
+开发一个针对B站的爬虫项目,实现对B站数据的自动化抓取和分析,无论对于学术研究、商业分析还是个人兴趣探索,都具有重要意义。
+
+
+
+![](assets/2024-04-26-10-47-09-image.png)
+
+
良好的git分支管理
+
+![](assets/2024-04-26-10-50-50-image.png)
+
+项目结构展示
+
+
diff --git a/assets/2024-04-26-10-47-09-image.png b/assets/2024-04-26-10-47-09-image.png
new file mode 100644
index 0000000..cc38081
Binary files /dev/null and b/assets/2024-04-26-10-47-09-image.png differ
diff --git a/assets/2024-04-26-10-50-50-image.png b/assets/2024-04-26-10-50-50-image.png
new file mode 100644
index 0000000..331cbcb
Binary files /dev/null and b/assets/2024-04-26-10-50-50-image.png differ
diff --git a/controller/SpyderController.py b/controller/SpyderController.py
index 2a1d2ee..7605ec5 100644
--- a/controller/SpyderController.py
+++ b/controller/SpyderController.py
@@ -27,14 +27,14 @@ class SpyderController:
threadCount = videoCount
if videoCount > 100:
videoCount = 100
- url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
+ json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
}
#f = open("file_3.csv", "a", encoding="UTF-8", newline="")
#csv_writer = csv.writer(f)
- rank_text = requests.get(url=url, headers=headers).text
+ rank_text = requests.get(url=json_url, headers=headers).text
# 将含有"万"的数据转换为数字
def crawl_data(start_index, end_index):
def convert_to_number(lst):
@@ -63,8 +63,8 @@ class SpyderController:
bvId = [str(parsed_data['data']['list'][i]['bvid'])]
topNo = [str(i+1)]
- video_url = "https://www.bilibili.com/video/av" + aid + "/?"
- video_text = requests.get(url=video_url, headers=headers).text
+ url = "https://www.bilibili.com/video/av" + aid + "/?"
+ video_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(video_text)
#print(video_text)
title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
diff --git a/controller/UIController.py b/controller/UIController.py
index e94d3cb..dd7e284 100644
--- a/controller/UIController.py
+++ b/controller/UIController.py
@@ -35,14 +35,30 @@ class UIController:
# 这里可以添加按钮点击事件的逻辑
print("start_Button clicked!")
videoCount=entry1.get()
- threadCount=entry2.get()
- waitTime=entry3.get()
+ threadCount = entry2.get()
+ waitTime = entry3.get()
+ if(videoCount==''):
+ videoCount=999
+ threadCount=10
+ waitTime=0.3
+ else:
+ if videoCount.isdigit():
+ videoCount=eval(videoCount)
+ else:
+ return print("video count is not digit")
+ if threadCount.isdigit():
+ threadCount=eval(threadCount)
+ else:
+ return print("thread count is not num ")
+ if waitTime.isdigit():
+ waitTime=eval(waitTime)
+ else:
+ return print("waitTime is not num")
#创建 SpyderController对象调用其函数
SpyderController=SC.SpyderController()
- # global scRuslt_data
+
self.scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime)
print("爬取完成")
- # print(scRuslt_data)
@@ -61,6 +77,8 @@ class UIController:
theList.append(data.bvId)
theList.append(data.title)
theList.append(data.url)
+ theList.append(data.uploadTime)
+ theList.append(data.uploadTimeText)
theList.append(data.topNo)
theList.append(data.viewCount)
theList.append(data.likeCount)
@@ -116,21 +134,22 @@ class UIController:
button_save_to_csv = tk.Button(root, text="save to csv", command=button_save_to_csv_click)
button_save_to_csv.grid(row=6, column=1,sticky=tk.W)
# 创建一个带展示框
- tree = ttk.Treeview(root, columns=("bvid", "title","url","upload","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount"))
+ tree = ttk.Treeview(root, columns=("bvid", "title","url",'uploadTime',"uploadTimeText","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount"))
tree.heading("#1", text="bvid")
tree.heading("#2", text="title")
tree.heading('#3', text="url")
- tree.heading('#4', text="upload")
- tree.heading('#5', text="topNo")
- tree.heading('#6', text="ViewCount")
- tree.heading('#7', text="likeCount")
- tree.heading('#8', text="coinCount")
- tree.heading('#9', text="favorite")
- tree.heading('#10', text="commentCount")
- tree.heading('#11', text="bulletCount")
- tree.heading('#12', text="creadtorId")
- tree.heading('#13', text="creatorName")
- tree.heading('#14', text="createFanCount")
+ tree.heading('#4', text="uploadtime")
+ tree.heading('#5', text="uploadtimeTexT")
+ tree.heading('#6', text="topNo")
+ tree.heading('#7', text="ViewCount")
+ tree.heading('#8', text="likeCount")
+ tree.heading('#9', text="coinCount")
+ tree.heading('#10', text="favorite")
+ tree.heading('#11', text="commentCount")
+ tree.heading('#12', text="bulletCount")
+ tree.heading('#13', text="creadtorId")
+ tree.heading('#14', text="creatorName")
+ tree.heading('#15', text="createFanCount")
# 向展示框添加数据
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..eae95f8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+lxml==5.2.0
+pytest==8.1.1
+Requests==2.31.0