Merge branch 'refs/heads/main' into Excel

# Conflicts:
#	service/CsvService.py
Excel
芦笙 9 months ago
commit b12e23ceab

@ -1,5 +1,17 @@
import threading
import time
import datetime
import csv
import json
import requests
from lxml import etree
from entity.BilibiliVideo import BilibiliVideo
class SpyderController:
    """Bilibili ranking crawler controller.

    Downloads the ranking list, scrapes every ranked video's page for the
    fields passed to ``BilibiliVideo``, appends one row per video to
    ``file_3.csv`` and returns the collected ``BilibiliVideo`` objects.
    """

    _RANKING_URL = "https://api.bilibili.com/x/web-interface/ranking/v2?"
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
    }
    _CSV_PATH = "file_3.csv"
    _MAX_VIDEOS = 100

    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
        """Entry point of the whole crawler.

        :param videoCount: number of videos to crawl; values above 100 are
            capped at 100 (pass 999 for "no limit"). Also capped at the
            number of entries the ranking response actually contains.
        :param threadCount: number of worker threads; forced into the range
            ``1..videoCount``.
        :param waitTime: seconds to sleep after starting each thread, so the
            crawl does not run too fast.
        :return: list[BilibiliVideo] with the successfully crawled videos
            (in worker completion order); empty when nothing could be crawled.
        """
        videoCount = min(int(videoCount), self._MAX_VIDEOS)
        threadCount = int(threadCount)
        waitTime = float(waitTime)
        # Nothing requested: return before touching the network (the original
        # hit a division by zero when the thread count was clamped to 0).
        if videoCount <= 0:
            return []

        rank_text = requests.get(url=self._RANKING_URL, headers=self._HEADERS).text
        try:
            # Parse the ranking once instead of once per video.
            ranking = json.loads(rank_text)["data"]["list"]
        except (ValueError, KeyError, TypeError) as err:
            print("unexpected ranking response: {!r}".format(err))
            return []

        videoCount = min(videoCount, len(ranking))
        if videoCount == 0:
            return []
        threadCount = max(1, min(threadCount, videoCount))

        # Workers append (csv_row, video) pairs here; the CSV file is written
        # by this thread after all workers have finished, so the file is
        # opened once and never shared between threads.
        results = []

        def crawl_data(start_index, end_index):
            for i in range(start_index, end_index):
                try:
                    results.append(self._scrape_video(ranking[i], i + 1))
                except (IndexError, KeyError, ValueError) as err:
                    # A missing page element or non-numeric counter skips this
                    # one video instead of ending the whole worker.
                    print("skip rank {}: {!r}".format(i + 1, err))

        thread_list = []
        video_per_thread = videoCount // threadCount
        for i in range(threadCount):
            start_index = i * video_per_thread
            # The last worker also takes the remainder of the division.
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            time.sleep(waitTime)
        for thread in thread_list:
            thread.join()

        with open(self._CSV_PATH, "a", encoding="GB18030", newline="") as f:
            csv.writer(f).writerows(row for row, _ in results)

        all_data_list = [video for _, video in results]
        print(all_data_list)
        return all_data_list

    def _scrape_video(self, entry, rank):
        """Scrape one ranked video; return ``(csv_row, BilibiliVideo)``.

        ``entry`` is one element of the ranking response's ``data.list`` and
        ``rank`` is its 1-based position. Raises IndexError when an expected
        page element is missing and ValueError when a counter is not numeric.
        """
        aid = str(entry["aid"])
        print(aid)
        video_url = "https://www.bilibili.com/video/av" + aid + "/?"
        video_text = requests.get(url=video_url, headers=self._HEADERS).text
        tree = etree.HTML(video_text)

        def first(xpath):
            # Only the first matching text node is used, so every field
            # occupies exactly one CSV column.
            return tree.xpath(xpath)[0]

        title = first('//div[@class="video-info-title-inner"]//text()')
        view_count = self._to_plain_number(first('//div[@class="view item"]/div/text()'))
        like_count = self._to_plain_number(
            first('//div[@class="video-like video-toolbar-left-item"]/span/text()'))
        coin_count = self._to_plain_number(
            first('//div[@class="video-coin video-toolbar-left-item"]/span/text()'))
        favorite_count = self._to_plain_number(
            first('//div[@class="video-fav video-toolbar-left-item"]/span/text()'))
        bullet_count = self._to_plain_number(first('//div[@class="dm-text"]/text()'))

        # The follow button text is read as "<label> <count>"; when the button
        # or the count token is absent the placeholder "1" is used.
        fan_nodes = tree.xpath('//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
        fan_tokens = fan_nodes[0].split() if fan_nodes else []
        fan_count = self._to_plain_number(fan_tokens[1]) if len(fan_tokens) > 1 else "1"

        bv_id = str(entry["bvid"])
        upload_time = str(entry["ctime"])
        top_no = str(rank)
        comment_count = str(entry["stat"]["reply"])
        creator_id = str(entry["owner"]["mid"])
        creator_name = str(entry["owner"]["name"])

        # Build the entity first so a non-numeric counter raises before
        # anything is recorded for this video.
        video = BilibiliVideo(bv_id, title, video_url, int(upload_time), int(top_no), int(view_count),
                              int(like_count), int(coin_count), int(favorite_count), int(comment_count),
                              int(bullet_count), creator_id, creator_name, int(fan_count))
        row = [bv_id, title, video_url, upload_time, top_no, view_count, like_count, coin_count,
               favorite_count, bullet_count, comment_count, creator_id, creator_name, fan_count]
        return row, video

    @staticmethod
    def _to_plain_number(text):
        """Convert a count such as ``"1.2万"`` to ``"12000"``; other text is only stripped."""
        text = text.strip()
        if "万" in text:
            # round() rather than int(): float products such as 4.35 * 10000
            # can land just below the intended integer.
            return str(round(float(text.replace("万", "")) * 10000))
        return text
if __name__ == '__main__':
    # Start from a fresh file_3.csv containing only the header. The column
    # order matches the rows appended by getBilibiliVideoList, which write
    # bulletCount before commentCount.
    with open("file_3.csv", "w", encoding="GB18030", newline="") as f:
        csv.writer(f).writerow(
            ["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
             "favoriteCount", "bulletCount", "commentCount", "creatorId", "creatorName", "creatorFanCount"])
    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # crawl 6 videos with 2 threads

@ -20,9 +20,9 @@ class UIController:
print("zzr1")
#一些属性
##csv文件路径
csv_path=".\csv_file"
csv_path="./csv_file"
##
excel_path=".\excel_file"
excel_path="./excel_file"
# 创建主窗口
root = tk.Tk()
root.title("Python UI")
@ -39,9 +39,10 @@ class UIController:
waitTime=entry3.get()
#创建 SpyderController对象调用其函数
SpyderController=SC.SpyderController()
global scRuslt_data
scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime)
# global scRuslt_data
self.scRuslt_data=SpyderController.getBilibiliVideoList(videoCount,threadCount,waitTime)
print("爬取完成")
# print(scRuslt_data)
@ -112,7 +113,7 @@ class UIController:
button_stop.grid(row=4, column=1,sticky=tk.W)
button_save_to_excel = tk.Button(root, text="save to excel", command=button_save_to_exce_click)
button_save_to_excel.grid(row=5, column=1, sticky=tk.W)
button_save_to_csv = tk.Button(root, text="save to excel", command=button_save_to_csv_click)
button_save_to_csv = tk.Button(root, text="save to csv", command=button_save_to_csv_click)
button_save_to_csv.grid(row=6, column=1,sticky=tk.W)
# 创建一个带展示框
tree = ttk.Treeview(root, columns=("bvid", "title","url","upload","topNo","viewCount","likeCount","coinCount","favorite","commentCount","bolletCount","creatorld","creatorName","createFanCount"))

@ -1,7 +1,21 @@
from service.IFileService import IFileService
from entity.BilibiliVideo import BilibiliVideo
from controller.SpyderController import SpyderController
import csv
class CsvService(IFileService):
    """IFileService implementation that stores a video list as a CSV file."""

    def save(self, filePath, videoList):
        """Write ``videoList`` to ``filePath + ".csv"``.

        The file is overwritten, encoded as GB18030, and starts with a header
        row followed by one row per video.

        :param filePath: target path without the ``.csv`` extension.
        :param videoList: iterable of objects exposing the attributes written
            below (topNo, bvId, title, url, uploadTimeText, ...).
        """
        # The context manager closes the file even if a row fails to serialize.
        with open(filePath + ".csv", "w", encoding="GB18030", newline="") as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(
                ["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount", "coinCount",
                 "favoriteCount", "bulletCount", "commentCount",
                 "creatorId", "creatorName", "creatorFanCount"])
            for video in videoList:
                csv_writer.writerow(
                    [video.topNo, video.bvId, video.title, video.url, video.uploadTimeText, video.viewCount,
                     video.likeCount, video.coinCount, video.favoriteCount, video.bulletCount,
                     video.commentCount, video.creatorId, video.creatorName, video.creatorFanCount])

@ -2,11 +2,22 @@ from controller.SpyderController import SpyderController
spyderController = SpyderController()
import csv
class TestSpyderController:
    """Smoke test for SpyderController (performs real network requests)."""

    def test_main(self):
        """Crawl with the "no limit" count and check the shape of the result."""
        # for debug:
        print("testing SpyderController.main")
        spyderController = SpyderController()
        lst = spyderController.getBilibiliVideoList(999, 1, 0.3)
        print(lst)
        # The crawler caps the requested count at 100, so 999 can never
        # produce more than 100 results.
        assert isinstance(lst, list)
        assert len(lst) <= 100

Loading…
Cancel
Save