Compare commits

...

18 Commits

Author SHA1 Message Date
Timmoc fee10995eb Merge remote-tracking branch 'origin/UIController'
9 months ago
Timmoc 50e9a9cb9f 删除main函数中的临时测试数据
9 months ago
Timmoc e65e059326 Merge branch 'main' into Excel
9 months ago
Timmoc f98cf77019 Merge branch 'SpyderController'
9 months ago
lfk 7546fe1c16 1.修复了“星穹铁道生日会”系列视频无法爬取的bug
9 months ago
芦笙 92af3260ef 修复了图表不出现,以及被分析数据名不出现的bug,完美契合框架
9 months ago
芦笙 db42f2f9e7 将图表功能写成函数,并且在excelserver中调用函数实现创建图表
9 months ago
Timmoc 9e1f7826eb 修改readme,以及不知原因的list必须改为List,在部分机器上报错。猜测原因是python版本。
9 months ago
芦笙 f4ab70dd0c Merge branch 'refs/heads/main' into Excel
9 months ago
芦笙 857c92ffbe Merge remote-tracking branch 'origin/Excel' into Excel
9 months ago
Timmoc 514040969a 更新主页和git ignore
9 months ago
Timmoc 8238e2c49a Merge branch 'UIController'
9 months ago
lfk 7b1db4ed1c 修复了url相同的问题
9 months ago
芦笙 b12e23ceab Merge branch 'refs/heads/main' into Excel
9 months ago
芦笙 a0ff5dbbe4 Merge branch 'refs/heads/main' into Excel
9 months ago
Timmoc 37f4d10267 增加了ui,未能通过合并要求,打回重新测试
9 months ago
Timmoc eba4935c67 Merge branch 'main' into Excel
9 months ago
芦笙 8ff56b8929 完成了对于excel功能的书写,完成了测试用例
9 months ago

5
.gitignore vendored

@ -78,3 +78,8 @@ fabric.properties
.idea/caches/build_file_checksums.ser
/.idea
/.vs
__pycache__
__pycache__
/*.xlsx
/*.csv

@ -1,2 +1,17 @@
# Spyder_python
本项目采用多线程爬虫技术,实现了对b站热门排行榜的播放数据爬取与分析。其主要功能包括:对b站排行榜的爬取与分析、数据整理、报表显示、报表输出为csv和excel、分析报告生成。特点是多线程、用户友好、良好git管理、拥有以pytest规范构建的单元测试。
随着互联网技术的飞速发展和普及,网络视频平台如bilibili(简称B站)已经成为大众获取信息、娱乐休闲的重要途径。B站以其独特的弹幕文化和丰富的视频资源,吸引了大量年轻用户,形成了一个极具活力的社区。在这个背景下,对B站数据的爬取和分析变得尤为重要。
B站的热门榜数据具有极高的研究价值和应用前景。视频的播放量、点赞量、评论数等数据可以反映视频的热度和受欢迎程度,对于视频创作者和平台运营者来说,这些数据是优化内容、提升用户体验的重要依据。
开发一个针对B站的爬虫项目,实现对B站数据的自动化抓取和分析,无论对于学术研究、商业分析还是个人兴趣探索,都具有重要意义。
<img src="assets/2024-04-26-10-47-09-image.png" title="" alt="" data-align="center">
<center>良好的git分支管理</center>
![](assets/2024-04-26-10-50-50-image.png)
<center>项目结构展示</center>

Binary file not shown.

After

Width:  |  Height:  |  Size: 95 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

@ -27,14 +27,14 @@ class SpyderController:
threadCount = videoCount
if videoCount > 100:
videoCount = 100
url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
json_url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
}
#f = open("file_3.csv", "a", encoding="UTF-8", newline="")
#csv_writer = csv.writer(f)
rank_text = requests.get(url=url, headers=headers).text
rank_text = requests.get(url=json_url, headers=headers).text
# 将含有"万"的数据转换为数字
def crawl_data(start_index, end_index):
def convert_to_number(lst):
@ -63,33 +63,44 @@ class SpyderController:
bvId = [str(parsed_data['data']['list'][i]['bvid'])]
topNo = [str(i+1)]
video_url = "https://www.bilibili.com/video/av" + aid + "/?"
video_text = requests.get(url=video_url, headers=headers).text
url = "https://www.bilibili.com/video/av" + aid + "/?"
video_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(video_text)
#print(video_text)
title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
uploadTime = [str(parsed_data["data"]["list"][i]["ctime"])]
title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
if(len(title) != 0):
title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
viewCount = tree.xpath('//div[@class="view item"]/div/text()')
likeCount = tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()')
coinCount = tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()')
favoriteCount = tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()')
bulletCount = tree.xpath('//div[@class="dm-text"]/text()')
creatorFanCount = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
#creatorFanCount = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
viewCount = convert_to_number(viewCount)
likeCount = convert_to_number(likeCount)
coinCount = convert_to_number(coinCount)
favoriteCount = convert_to_number(favoriteCount)
bulletCount = convert_to_number(bulletCount)
# if not creatorFanCount:
# creatorFanCount = [str(1)]
# else:
# followers_str = creatorFanCount[0].strip().split()[1]
# followers_num = float(followers_str.replace('万', '')) * 10000
# # 转化为整数
# followers_num = int(followers_num)
# creatorFanCount = [str(followers_num)]
else:
title = [str(parsed_data["data"]["list"][i]["title"])]
viewCount = [str(parsed_data['data']['list'][i]['stat']['view'])]
likeCount = [str(parsed_data['data']['list'][i]['stat']['like'])]
coinCount = [str(parsed_data['data']['list'][i]['stat']['coin'])]
favoriteCount = [str(parsed_data['data']['list'][i]['stat']['share'])]
bulletCount = [str(parsed_data['data']['list'][i]['stat']['danmaku'])]
#creatorFanCount = [str(1)]
# print(creatorFanCount)
# match = re.search(r'\d+', text)
# number = match.group()
if not creatorFanCount:
creatorFanCount = [str(1)]
else :
followers_str = creatorFanCount[0].strip().split()[1]
followers_num = float(followers_str.replace('', '')) * 10000
# 转化为整数
followers_num = int(followers_num)
creatorFanCount = [str(followers_num)]
commentCount = [str(parsed_data['data']['list'][i]['stat']['reply'])]
creatorId = [str(parsed_data['data']['list'][i]['owner']['mid'])]
creatorName = [str(parsed_data['data']['list'][i]['owner']['name'])]
@ -97,8 +108,12 @@ class SpyderController:
#up_url = "https://space.bilibili.com/" + creatorId[0] + "?"
up_url = "https://space.bilibili.com/401742377?spm_id_from=333.788.0.0"
up_text = requests.get(url=up_url, headers=headers).text
tree = etree.HTML(up_text)
up_json = "https://api.bilibili.com/x/relation/stat?vmid=" + creatorId[0]
up_text = requests.get(url=up_json, headers=headers).text
up_data_json = json.loads(up_text)
creatorFanCount = [str(up_data_json['data']['follower'])]
# up_text = requests.get(url=up_url, headers=headers).text
# tree = etree.HTML(up_text)
#print(up_text)
all_data = bvId + title + [url] + uploadTime + topNo + viewCount + likeCount + coinCount + favoriteCount + bulletCount + commentCount + creatorId + creatorName + creatorFanCount

@ -1,5 +1,6 @@
# from controller.SpyderController import SpyderController
from controller.UIController import UIController
from entity.BilibiliVideo import BilibiliVideo
from test.SpyderController_test import TestSpyderController
# spyderController = SpyderController()

@ -0,0 +1,3 @@
lxml==5.2.0
pytest==8.1.1
Requests==2.31.0

@ -1,10 +1,12 @@
from typing import List
from service.IFileService import IFileService
from entity.BilibiliVideo import BilibiliVideo
from controller.SpyderController import SpyderController
import csv
class CsvService(IFileService):
def save(self, filePath, videoList: list[BilibiliVideo]):
def save(self, filePath, videoList: List[BilibiliVideo]):
f = open(filePath+".csv", "w", encoding="GB18030", newline="")
csv_writer = csv.writer(f)
csv_writer.writerow(

@ -1,5 +1,8 @@
from typing import List
from service.IFileService import IFileService
from entity.BilibiliVideo import BilibiliVideo
from tool import tttt
class ExcelService(IFileService):
@ -9,5 +12,29 @@ class ExcelService(IFileService):
"""
pass
def save(self, filePath, videoList: list[BilibiliVideo]):
raise NotImplementedError
def save(self, filePath, videoList: List[BilibiliVideo]):
filePath += ".xlsx"
tttt.write_to_excel(videoList,filePath)
tttt.calculate_ratio_and_update(filePath, 'Sheet')
texts = [
"approve",
"money",
"practical",
"Stunning",
"interaction"
]
tttt.write_english_texts(filePath, 'Sheet', texts)
tttt.create_bar_chart(filePath,'Sheet')
print("Data analysis written to the Excel file.")

@ -1,11 +1,12 @@
from abc import abstractmethod, ABCMeta
from typing import List
from entity.BilibiliVideo import BilibiliVideo
class IFileService(metaclass=ABCMeta):
@abstractmethod
def save(self,filePath,videoList:list[BilibiliVideo]):
def save(self,filePath,videoList:List[BilibiliVideo]):
"""
保存到文件高级参数在init里面写
:param filePath: 文件保存路径

@ -1,3 +1,91 @@
from entity.BilibiliVideo import BilibiliVideo
from tool import tttt
class TestExcel:
    """Smoke tests that write sample BilibiliVideo data to 'bilibili_videos.xlsx'.

    None of the tests assert on the produced workbook; they only exercise the
    write / update code paths.
    """

    def test_Excel(self):
        """Write two sample videos with a locally defined openpyxl writer."""
        import openpyxl

        def write_to_excel(videos, filename):
            """Write one header row plus one row per video, then save to filename."""
            # Create a new Excel workbook
            wb = openpyxl.Workbook()
            ws = wb.active
            ws.title = "Bilibili Videos"
            # Write the header row
            headers = ["bvId", "title", "url", "uploadTime", "uploadTimeText", "topNo",
                       "viewCount", "likeCount", "coinCount", "favoriteCount",
                       "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"]
            for col_num, header in enumerate(headers, 1):
                ws.cell(row=1, column=col_num, value=header)
            # Write the video rows (data starts at row 2, below the header)
            for row_num, video in enumerate(videos, 2):
                ws.cell(row=row_num, column=1, value=video.bvId)
                ws.cell(row=row_num, column=2, value=video.title)
                ws.cell(row=row_num, column=3, value=video.url)
                ws.cell(row=row_num, column=4, value=video.uploadTime)
                ws.cell(row=row_num, column=5, value=video.uploadTimeText)
                ws.cell(row=row_num, column=6, value=video.topNo)
                ws.cell(row=row_num, column=7, value=video.viewCount)
                ws.cell(row=row_num, column=8, value=video.likeCount)
                ws.cell(row=row_num, column=9, value=video.coinCount)
                ws.cell(row=row_num, column=10, value=video.favoriteCount)
                ws.cell(row=row_num, column=11, value=video.commentCount)
                ws.cell(row=row_num, column=12, value=video.bulletCount)
                ws.cell(row=row_num, column=13, value=video.creatorId)
                ws.cell(row=row_num, column=14, value=video.creatorName)
                ws.cell(row=row_num, column=15, value=video.creatorFanCount)
            # Save the Excel file
            wb.save(filename)

        # Example usage
        video1 = BilibiliVideo("bv123456", "视频标题1", "http://video1.com", 1620000000, 1, 1000, 500, 200, 50, 100,
                               300, "creator123", "up主1", 10000)
        video2 = BilibiliVideo("bv789012", "视频标题2", "http://video2.com", 1621000000, 2, 2000, 1000, 400, 60, 200,
                               400, "creator456", "up主2", 20000)
        videos = [video1, video2]
        write_to_excel(videos, "bilibili_videos.xlsx")
        pass

    def test_WTExcel(self):
        """Write ten sample videos through tttt.write_to_excel."""
        # Test data
        test_cases = [
            BilibiliVideo("BV1a4411C7i2", "视频标题1", "https://www.bilibili.com/video/BV1a4411C7i2", 1648927200, 1,
                          100000, 5000, 2000, 3000, 1000, 500, "up123456", "UP主A", 1000000),
            BilibiliVideo("BV1F5411S8h3", "视频标题2", "https://www.bilibili.com/video/BV1F5411S8h3", 1648830800, 2,
                          80000, 4000, 1500, 2000, 800, 400, "up234567", "UP主B", 800000),
            BilibiliVideo("BV1bW411d9c4", "视频标题3", "https://www.bilibili.com/video/BV1bW411d9c4", 1648734400, 3,
                          60000, 3000, 1000, 1500, 600, 300, "up345678", "UP主C", 600000),
            BilibiliVideo("BV1qy411z5k5", "视频标题4", "https://www.bilibili.com/video/BV1qy411z5k5", 1648638000, 4,
                          40000, 2000, 800, 1000, 400, 200, "up456789", "UP主D", 400000),
            BilibiliVideo("BV1R4411J0a6", "视频标题5", "https://www.bilibili.com/video/BV1R4411J0a6", 1648541600, 5,
                          20000, 1000, 500, 700, 200, 100, "up567890", "UP主E", 200000),
            BilibiliVideo("BV1gW411H8d7", "视频标题6", "https://www.bilibili.com/video/BV1gW411H8d7", 1648445200, 6,
                          10000, 500, 300, 400, 100, 50, "up678901", "UP主F", 100000),
            BilibiliVideo("BV1py411t6v8", "视频标题7", "https://www.bilibili.com/video/BV1py411t6v8", 1648348800, 7,
                          5000, 300, 200, 300, 50, 30, "up789012", "UP主G", 50000),
            BilibiliVideo("BV1Qy411S3x9", "视频标题8", "https://www.bilibili.com/video/BV1Qy411S3x9", 1648252400, 8,
                          2000, 100, 100, 200, 20, 10, "up890123", "UP主H", 20000),
            BilibiliVideo("BV1a4411C4y0", "视频标题9", "https://www.bilibili.com/video/BV1a4411C4y0", 1648156000, 9,
                          1000, 50, 50, 100, 10, 5, "up901234", "UP主I", 10000),
            BilibiliVideo("BV1F5411C2r1", "视频标题10", "https://www.bilibili.com/video/BV1F5411C2r1", 1648059600,
                          10, 500, 30, 20, 50, 5, 3, "up012345", "UP主J", 5000)
        ]
        # Write the test data to an Excel file
        tttt.write_to_excel(test_cases, 'bilibili_videos.xlsx')

    def test_c_r_a_update(self):
        """Run tttt.calculate_ratio_and_update on 'bilibili_videos.xlsx'."""
        # NOTE(review): this reads a file it does not create; presumably it relies on
        # test_WTExcel having written 'bilibili_videos.xlsx' first -- confirm test ordering.
        # Example usage
        file_path = 'bilibili_videos.xlsx'  # path of the Excel file
        sheet_name = 'Sheet'  # worksheet name
        tttt.calculate_ratio_and_update(file_path, sheet_name)
        print("Data analysis written to the Excel file.")

@ -0,0 +1,214 @@
import openpyxl
from entity.BilibiliVideo import BilibiliVideo
# 创建一个函数,用于将 BilibiliVideo 实例的属性写入 Excel 表格中
def write_to_excel(video_instances, excel_filename):
    """Dump BilibiliVideo-like objects into a fresh workbook.

    The active sheet of a new workbook receives one header row followed by
    one row per video; each header is also the name of the attribute read
    from the video object for that column.

    :param video_instances: iterable of objects exposing the column attributes
    :param excel_filename: destination path passed to ``Workbook.save``
    """
    # Column headers double as the attribute names read from each video.
    columns = (
        'bvId', 'title', 'url', 'uploadTime', 'uploadTimeText', 'topNo',
        'viewCount', 'likeCount', 'coinCount', 'favoriteCount',
        'commentCount', 'bulletCount', 'creatorId', 'creatorName',
        'creatorFanCount',
    )
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.append(list(columns))
    for video in video_instances:
        sheet.append([getattr(video, attribute) for attribute in columns])
    workbook.save(excel_filename)
if __name__ == '__main__':
    # Manual demo: build ten sample videos and write them to a workbook.
    # Sample data
    test_cases = [
        BilibiliVideo("BV1a4411C7i2", "视频标题1", "https://www.bilibili.com/video/BV1a4411C7i2", 1648927200, 1, 100000, 5000, 2000, 3000, 1000, 500, "up123456", "UP主A", 1000000),
        BilibiliVideo("BV1F5411S8h3", "视频标题2", "https://www.bilibili.com/video/BV1F5411S8h3", 1648830800, 2, 80000, 4000, 1500, 2000, 800, 400, "up234567", "UP主B", 800000),
        BilibiliVideo("BV1bW411d9c4", "视频标题3", "https://www.bilibili.com/video/BV1bW411d9c4", 1648734400, 3, 60000, 3000, 1000, 1500, 600, 300, "up345678", "UP主C", 600000),
        BilibiliVideo("BV1qy411z5k5", "视频标题4", "https://www.bilibili.com/video/BV1qy411z5k5", 1648638000, 4, 40000, 2000, 800, 1000, 400, 200, "up456789", "UP主D", 400000),
        BilibiliVideo("BV1R4411J0a6", "视频标题5", "https://www.bilibili.com/video/BV1R4411J0a6", 1648541600, 5, 20000, 1000, 500, 700, 200, 100, "up567890", "UP主E", 200000),
        BilibiliVideo("BV1gW411H8d7", "视频标题6", "https://www.bilibili.com/video/BV1gW411H8d7", 1648445200, 6, 10000, 500, 300, 400, 100, 50, "up678901", "UP主F", 100000),
        BilibiliVideo("BV1py411t6v8", "视频标题7", "https://www.bilibili.com/video/BV1py411t6v8", 1648348800, 7, 5000, 300, 200, 300, 50, 30, "up789012", "UP主G", 50000),
        BilibiliVideo("BV1Qy411S3x9", "视频标题8", "https://www.bilibili.com/video/BV1Qy411S3x9", 1648252400, 8, 2000, 100, 100, 200, 20, 10, "up890123", "UP主H", 20000),
        BilibiliVideo("BV1a4411C4y0", "视频标题9", "https://www.bilibili.com/video/BV1a4411C4y0", 1648156000, 9, 1000, 50, 50, 100, 10, 5, "up901234", "UP主I", 10000),
        BilibiliVideo("BV1F5411C2r1", "视频标题10", "https://www.bilibili.com/video/BV1F5411C2r1", 1648059600, 10, 500, 30, 20, 50, 5, 3, "up012345", "UP主J", 5000)
    ]
    # Write the sample data to an Excel file
    write_to_excel(test_cases, 'bilibili_videos_with_charts.xlsx')
import openpyxl
def calculate_ratio_and_update(file_path, sheet_name):
    """Append per-video engagement metrics to an existing workbook, in place.

    For every data row (row 2 onwards) of ``sheet_name`` the function reads
    views (column 7), likes (8), coins (9), favorites (10), comments (11),
    danmaku (12) and follower count (15), then writes:

    * column 17: likes / views
    * column 18: coins / views
    * column 19: favorites / views
    * column 20: followers / views
    * column 21: comments + danmaku

    Rows with any empty input cell are left untouched. Rows whose view count
    is zero only receive column 21, because the ratios are undefined. The
    workbook is saved back to ``file_path``.

    :param file_path: path of the ``.xlsx`` file to update
    :param sheet_name: name of the worksheet holding the video rows
    :raises ValueError: if a populated input cell cannot be converted to float
    """
    # Input columns, in the order they are unpacked below.
    input_columns = (7, 8, 9, 10, 11, 12, 15)

    wb = openpyxl.load_workbook(file_path)
    sheet = wb[sheet_name]

    # Row 1 is the header, so data starts at row 2.
    for row_num in range(2, sheet.max_row + 1):
        raw_values = [sheet.cell(row=row_num, column=col).value
                      for col in input_columns]
        if any(value is None for value in raw_values):
            # Incomplete row: nothing sensible to compute.
            continue

        # Cells may hold numbers or numeric strings; normalise all of them,
        # including the danmaku count, before doing arithmetic.
        (views, likes, coins, favorites,
         comments, danmaku, followers) = (float(value) for value in raw_values)

        sheet.cell(row=row_num, column=21).value = comments + danmaku

        if views == 0:
            # Avoid ZeroDivisionError; the per-view ratios stay blank.
            continue

        sheet.cell(row=row_num, column=17).value = likes / views
        sheet.cell(row=row_num, column=18).value = coins / views
        sheet.cell(row=row_num, column=19).value = favorites / views
        sheet.cell(row=row_num, column=20).value = followers / views

    # Persist the changes to the same file.
    wb.save(file_path)
if __name__ == '__main__':
    # Manual demo: add the computed metric columns to the sample workbook.
    # Example usage
    file_path = 'bilibili_videos_with_charts.xlsx'  # path of the Excel file
    sheet_name = 'Sheet'  # worksheet name
    calculate_ratio_and_update(file_path, sheet_name)
    print("Data analysis written to the Excel file.")
import openpyxl
from openpyxl.chart import BarChart, Reference, AreaChart
def create_bar_chart(file_name, sheet_name):
    """Add one chart per computed metric column and save the workbook.

    Five charts are anchored on ``sheet_name``, one for each of columns
    17-21. Column 19 is drawn as an area chart and the others as bar charts.
    Video titles (column 2, from row 2 down) are used as categories. Row 1 of
    each metric column is used as the series title, so the data reference
    starts at the header row.

    The workbook is saved back to ``file_name``.

    :param file_name: path of the ``.xlsx`` file to read and update
    :param sheet_name: name of the worksheet holding the data
    """
    wb = openpyxl.load_workbook(file_name)
    ws = wb[sheet_name]
    last_row = ws.max_row

    # (chart class, chart title, metric column, anchor cell)
    chart_specs = (
        (BarChart, "观众对于视频的认可度", 17, "V1"),
        (BarChart, "收益", 18, "AD1"),
        (AreaChart, "视频实用性", 19, "AL1"),
        (BarChart, "视频惊艳程度", 20, "AT1"),
        (BarChart, "视频的互动性", 21, "BB1"),
    )

    for chart_cls, title, data_col, anchor in chart_specs:
        chart = chart_cls()
        chart.title = title
        chart.y_axis.title = "Data"
        chart.x_axis.title = "Index"

        # titles_from_data=True consumes the first referenced cell as the
        # series title, so the reference must start at the header row;
        # starting at row 2 would swallow the first video's value and shift
        # every bar against its category label.
        data = Reference(ws, min_col=data_col, min_row=1, max_row=last_row)
        categories = Reference(ws, min_col=2, min_row=2, max_row=last_row)
        chart.add_data(data, titles_from_data=True)
        chart.set_categories(categories)

        ws.add_chart(chart, anchor)

    # Save to the file that was opened, not to a hard-coded name.
    wb.save(file_name)
if __name__ == '__main__':
    # Manual demo: add the charts to the sample workbook.
    create_bar_chart('bilibili_videos_with_charts.xlsx', 'Sheet')
def write_english_texts(file_path, sheet_name, texts):
    """Write header labels into row 1, starting at column 17.

    The i-th entry of ``texts`` lands in column ``17 + i``; the workbook is
    then saved back to ``file_path``.

    :param file_path: path of the ``.xlsx`` file to update in place
    :param sheet_name: name of the worksheet to label
    :param texts: iterable of label values, one per consecutive column
    """
    first_label_column = 17
    workbook = openpyxl.load_workbook(file_path)
    worksheet = workbook[sheet_name]
    for offset, label in enumerate(texts):
        target = worksheet.cell(row=1, column=first_label_column + offset)
        target.value = label
    workbook.save(file_path)
# Module-level defaults used by the manual demo below; they are assigned on
# import as well as when the module is run as a script.
file_path = 'bilibili_videos_with_charts.xlsx'  # path of the Excel file
sheet_name = 'Sheet'  # worksheet name
english_texts = [
    "approve",
    "money",
    "practical",
    "Stunning",
    "interaction"
]  # English header labels to write
if __name__ == '__main__':
    # Manual demo: label the computed metric columns of the sample workbook.
    write_english_texts(file_path, sheet_name, english_texts)
    print("Data analysis chart written to the Excel file.")
Loading…
Cancel
Save