|
|
|
@ -6,6 +6,8 @@ import json
|
|
|
|
|
import requests
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
from entity.BilibiliVideo import BilibiliVideo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpyderController:
|
|
|
|
|
# Bilibili视频爬虫控制器,打开网页,爬取BilibiliVideo.py中的数据,将其下载下来,保存为csv文件
|
|
|
|
@ -17,6 +19,10 @@ class SpyderController:
|
|
|
|
|
:param waitTime:float: 每个线程的等待时间,单位秒(避免爬取过快)
|
|
|
|
|
:return: list[BilibiliVideo] 返回处理完成后的videoList
|
|
|
|
|
"""
|
|
|
|
|
all_data_list = []
|
|
|
|
|
videoCount = int(videoCount)
|
|
|
|
|
threadCount = int(threadCount)
|
|
|
|
|
waitTime = float(waitTime)
|
|
|
|
|
if videoCount < threadCount:
|
|
|
|
|
threadCount = videoCount
|
|
|
|
|
if videoCount > 20:
|
|
|
|
@ -76,11 +82,14 @@ class SpyderController:
|
|
|
|
|
bulletCount = convert_to_number(bulletCount)
|
|
|
|
|
# match = re.search(r'\d+', text)
|
|
|
|
|
# number = match.group()
|
|
|
|
|
followers_str = creatorFanCount[0].strip().split()[1]
|
|
|
|
|
followers_num = float(followers_str.replace('万', '')) * 10000
|
|
|
|
|
# 转化为整数
|
|
|
|
|
followers_num = int(followers_num)
|
|
|
|
|
creatorFanCount = [str(followers_num)]
|
|
|
|
|
if not creatorFanCount:
|
|
|
|
|
creatorFanCount = [str(1)]
|
|
|
|
|
else :
|
|
|
|
|
followers_str = creatorFanCount[0].strip().split()[1]
|
|
|
|
|
followers_num = float(followers_str.replace('万', '')) * 10000
|
|
|
|
|
# 转化为整数
|
|
|
|
|
followers_num = int(followers_num)
|
|
|
|
|
creatorFanCount = [str(followers_num)]
|
|
|
|
|
commentCount = [str(parsed_data['data']['list'][i]['stat']['reply'])]
|
|
|
|
|
creatorId = [str(parsed_data['data']['list'][i]['owner']['mid'])]
|
|
|
|
|
creatorName = [str(parsed_data['data']['list'][i]['owner']['name'])]
|
|
|
|
@ -91,13 +100,16 @@ class SpyderController:
|
|
|
|
|
up_text = requests.get(url=up_url, headers=headers).text
|
|
|
|
|
tree = etree.HTML(up_text)
|
|
|
|
|
#print(up_text)
|
|
|
|
|
all_data = bvId + title + [url] + uploadTime + topNo + viewCount + likeCount + coinCount + favoriteCount + bulletCount + commentCount + creatorId + creatorName + creatorFanCount
|
|
|
|
|
|
|
|
|
|
all_data = topNo + bvId + title + [
|
|
|
|
|
video_url] + uploadTime + viewCount + likeCount + coinCount + favoriteCount + bulletCount + commentCount + creatorId + creatorName + creatorFanCount
|
|
|
|
|
csv_writer.writerow(all_data)
|
|
|
|
|
all_data_list = []
|
|
|
|
|
all_data_list = all_data_list + [topNo, bvId, title, [video_url], uploadTime, viewCount, likeCount, coinCount, favoriteCount, bulletCount, commentCount, creatorId, creatorName, creatorFanCount]
|
|
|
|
|
|
|
|
|
|
# all_data_list = all_data_list + [topNo, bvId, title, [video_url], uploadTime, viewCount, likeCount, coinCount, favoriteCount, bulletCount, commentCount, creatorId, creatorName, creatorFanCount]
|
|
|
|
|
video = BilibiliVideo(bvId[0], title[0], url, int(uploadTime[0]), int(topNo[0]), int(viewCount[0]), int(likeCount[0]), int(coinCount[0]),
|
|
|
|
|
int(favoriteCount[0]), int(commentCount[0]), int(bulletCount[0]), creatorId[0], creatorName[0], int(creatorFanCount[0]))
|
|
|
|
|
all_data_list.append(video)
|
|
|
|
|
f.close()
|
|
|
|
|
# print(all_data_list)
|
|
|
|
|
return all_data_list
|
|
|
|
|
|
|
|
|
|
thread_list = []
|
|
|
|
@ -113,15 +125,15 @@ class SpyderController:
|
|
|
|
|
|
|
|
|
|
for thread in thread_list:
|
|
|
|
|
thread.join()
|
|
|
|
|
|
|
|
|
|
print(all_data_list)
|
|
|
|
|
return all_data_list
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
f = open("file_3.csv", "w", encoding="UTF-8", newline="")
|
|
|
|
|
csv_writer = csv.writer(f)
|
|
|
|
|
csv_writer.writerow(
|
|
|
|
|
["topNo", "bvId", "title", "url", "uploadTime", "viewCount", "likeCount", "coinCount", "favoriteCount",
|
|
|
|
|
"bulletCount", "commentCount",
|
|
|
|
|
"creatorId", "creatorName", "creatorFanCount"])
|
|
|
|
|
["bvId", "title", "url", "uploadTime", "topNo", "viewCount", "likeCount", "coinCount",
|
|
|
|
|
"favoriteCount", "commentCount", "bulletCount", "creatorId", "creatorName", "creatorFanCount"])
|
|
|
|
|
f.close()
|
|
|
|
|
spyderController = SpyderController()
|
|
|
|
|
spyderController.getBilibiliVideoList(6, 2, 0.3) # 设置线程数为2
|