You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Spyder_python/controller/SpyderController.py

127 lines
6.1 KiB

8 months ago
import threading
import time
import datetime
import csv
import json
import requests
from lxml import etree
8 months ago
class SpyderController:
    """Bilibili trending-video spider (Bilibili视频爬虫控制器).

    Fetches the Bilibili ranking API, scrapes each video's page with lxml,
    and appends one CSV row per video to ``file_3.csv``.
    """

    # Serializes CSV writes across worker threads. The original code let every
    # thread append to the same file concurrently, which can interleave rows.
    _csv_lock = threading.Lock()

    def getBilibiliVideoList(self, videoCount, threadCount, waitTime):
        """
        整个爬虫的调用程序 — entry point for the whole crawl.

        :param videoCount: number of videos to crawl (capped at 20, the API page size)
        :param threadCount: number of concurrent crawler threads
        :param waitTime: float, seconds to wait between starting threads (rate limiting)
        :return: list of per-video field lists (one entry per crawled video)
        """
        # Guard against zero/negative inputs (original divided by threadCount).
        if videoCount <= 0 or threadCount <= 0:
            return []
        if videoCount < threadCount:
            threadCount = videoCount
        if videoCount > 20:
            videoCount = 20
        url = "https://api.bilibili.com/x/web-interface/ranking/v2?"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0'
        }
        rank_text = requests.get(url=url, headers=headers).text
        # Parse the ranking JSON once, not once per video as before.
        parsed_data = json.loads(rank_text)
        results = []  # shared across threads; guarded by _csv_lock

        def convert_to_number(lst):
            # 将含有"万"的数据转换为数字:
            # convert strings like "12.3万" (wan = 10,000) into digit strings.
            result = []
            for item in lst:
                if '万' in item:
                    number = int(float(item.replace('万', '')) * 10000)
                    result.append(str(number))
                else:
                    result.append(item)
            return result

        def crawl_data(start_index, end_index):
            # Crawl videos [start_index, end_index) and buffer their rows
            # locally so the CSV write can happen in one locked batch.
            rows = []
            for i in range(start_index, end_index):
                entry = parsed_data["data"]["list"][i]
                aid = str(entry["aid"])
                print(aid)
                bvId = [str(entry['bvid'])]
                topNo = [str(i + 1)]
                video_url = "https://www.bilibili.com/video/av" + aid + "/?"
                video_text = requests.get(url=video_url, headers=headers).text
                tree = etree.HTML(video_text)
                title = tree.xpath('//div[@class="video-info-title-inner"]//text()')
                uploadTime = [str(entry["ctime"])]
                viewCount = convert_to_number(
                    tree.xpath('//div[@class="view item"]/div/text()'))
                likeCount = convert_to_number(
                    tree.xpath('//div[@class="video-like video-toolbar-left-item"]/span/text()'))
                coinCount = convert_to_number(
                    tree.xpath('//div[@class="video-coin video-toolbar-left-item"]/span/text()'))
                favoriteCount = convert_to_number(
                    tree.xpath('//div[@class="video-fav video-toolbar-left-item"]/span/text()'))
                bulletCount = convert_to_number(
                    tree.xpath('//div[@class="dm-text"]/text()'))
                creatorFanCount = tree.xpath(
                    '//div[@class="default-btn follow-btn b-gz not-follow"]/span/text()')
                # Follower text looks like "关注 12.3万"; the number is the
                # second whitespace-separated token. 转化为整数.
                followers_str = creatorFanCount[0].strip().split()[1]
                followers_num = int(float(followers_str.replace('万', '')) * 10000)
                creatorFanCount = [str(followers_num)]
                commentCount = [str(entry['stat']['reply'])]
                creatorId = [str(entry['owner']['mid'])]
                creatorName = [str(entry['owner']['name'])]
                # NOTE(review): the original also fetched a hard-coded uploader
                # page (space.bilibili.com/401742377) and discarded the result;
                # that dead request has been removed.
                all_data = (topNo + bvId + title + [video_url] + uploadTime
                            + viewCount + likeCount + coinCount + favoriteCount
                            + bulletCount + commentCount + creatorId
                            + creatorName + creatorFanCount)
                rows.append(all_data)
            # One locked batch write per thread: rows never interleave and the
            # file handle is always closed, even on error.
            with self._csv_lock:
                with open("file_3.csv", "a", encoding="UTF-8", newline="") as f:
                    csv.writer(f).writerows(rows)
                results.extend(rows)

        thread_list = []
        video_per_thread = videoCount // threadCount
        for i in range(threadCount):
            start_index = i * video_per_thread
            # Last thread picks up the remainder of the integer division.
            end_index = (i + 1) * video_per_thread if (i + 1) < threadCount else videoCount
            thread = threading.Thread(target=crawl_data, args=(start_index, end_index))
            thread_list.append(thread)
            thread.start()
            time.sleep(waitTime)  # 避免爬取过快: stagger thread starts
        for thread in thread_list:
            thread.join()
        # Fulfil the documented contract: return the crawled data.
        return results
if __name__ == '__main__':
    # Re-create the CSV with its header row before the crawler appends data.
    # `with` guarantees the handle is closed even if the write raises.
    with open("file_3.csv", "w", encoding="UTF-8", newline="") as f:
        csv.writer(f).writerow(
            ["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount",
             "coinCount", "favoriteCount", "bulletCount", "commentCount",
             "creatorId", "creatorName", "creatorFanCount"])
    spyderController = SpyderController()
    spyderController.getBilibiliVideoList(6, 2, 0.3)  # 设置线程数为2: 6 videos, 2 threads, 0.3 s stagger