|
|
import argparse
|
|
|
import logging
|
|
|
import time
|
|
|
import pandas as pd
|
|
|
import jieba
|
|
|
import generate
|
|
|
import dmk
|
|
|
|
|
|
# Raise the threshold of the "jieba" logger to ERROR so that its
# lower-severity messages do not reach the console.
_jieba_logger = logging.getLogger("jieba")
_jieba_logger.setLevel(logging.ERROR)
|
|
|
|
|
|
|
|
|
class DanmakuCrawler:
    """Gather danmakus for a keyword and render the most frequent ones.

    The pipeline, as driven by the methods below:

    1. ``dmk.get_video_ids`` supplies video ids for the keyword.
    2. ``dmk.get_danmaku`` / ``dmk.filter_danmakus`` collect and filter the
       danmakus of every video.
    3. The ``top_num`` most frequent danmakus are written to an Excel file.
    4. The text of those danmakus is segmented with ``jieba``, stripped of
       stopwords and handed to ``generate.generate_wordcloud``.
    """

    # Font passed to the word-cloud generator when the caller gives none.
    # NOTE(review): this is a Windows system path; on other platforms pass
    # ``font_path`` explicitly.
    DEFAULT_FONT_PATH = "C:\\Windows\\Fonts\\msyh.ttc"

    def __init__(self, keyword, max_result, top_num, image_style, width, height,
                 font_path=None):
        """Store the run configuration and load keyword/stopword data.

        Args:
            keyword: Search keyword; also used in the Excel file name.
            max_result: Forwarded to ``dmk.get_video_ids`` (presumably the
                maximum number of videos to fetch -- confirm against ``dmk``).
            top_num: How many of the most frequent danmakus to keep.
            image_style: Forwarded unchanged to ``generate.generate_wordcloud``.
            width: Forwarded unchanged to ``generate.generate_wordcloud``.
            height: Forwarded unchanged to ``generate.generate_wordcloud``.
            font_path: Optional font file for the word cloud. Defaults to
                ``DEFAULT_FONT_PATH`` when ``None``.
        """
        self.keyword = keyword
        self.max_result = max_result
        self.top_num = top_num
        self.image_style = image_style
        self.width = width
        self.height = height
        self.font_path = self.DEFAULT_FONT_PATH if font_path is None else font_path
        # Filter terms derived from the search keyword by ``dmk``.
        self.keywords = dmk.load_keywords(self.keyword)
        # NOTE(review): relative file name; where it is resolved from depends
        # on ``dmk.load_stopwords`` -- presumably the working directory.
        self.stopwords = dmk.load_stopwords('stopwords.txt')

    def _top_danmakus(self, danmakus):
        """Return a Series counting the ``top_num`` most frequent danmakus."""
        danmaku_df = pd.DataFrame(danmakus, columns=['danmaku'])
        return danmaku_df['danmaku'].value_counts().head(self.top_num)

    def get_all_danmakus(self):
        """Collect the filtered danmakus of every video found for the keyword.

        Prints the collected list, and when it is non-empty writes the
        ``top_num`` most frequent danmakus with their counts to
        ``top_danmakus_<keyword>.xlsx``.

        Returns:
            The list of filtered danmakus, or an empty list when no video ids
            were retrieved or no danmaku survived filtering.
        """
        video_ids = dmk.get_video_ids(self.keyword, self.max_result)
        if not video_ids:
            print("No video ids retrieved; check the network and API responses.")
            return []

        all_danmakus = []
        for bvid in video_ids:
            danmakus = dmk.get_danmaku(bvid, self.keyword)
            all_danmakus.extend(dmk.filter_danmakus(danmakus, self.keywords))
        print(all_danmakus)

        if not all_danmakus:
            print("No danmakus retrieved; unable to generate word cloud.")
            # Nothing to rank: return now instead of writing an empty
            # spreadsheet.
            return all_danmakus

        # Save the top_num most frequent danmakus and their counts to Excel.
        self._top_danmakus(all_danmakus).to_excel(f'top_danmakus_{self.keyword}.xlsx')
        return all_danmakus

    def generate_word_cloud(self, danmakus):
        """Render a word cloud from the most frequent danmakus.

        Args:
            danmakus: Danmakus as returned by ``get_all_danmakus``.
        """
        danmaku_frequency = self._top_danmakus(danmakus).to_dict()
        # Only the danmaku texts are used from here on; their counts are not
        # passed to the generator.
        keys = ' '.join(danmaku_frequency.keys())
        # Segment the joined text into space-separated tokens before the
        # stopword filter runs.
        resulting_string = ' '.join(jieba.cut(keys))
        resulting_string = dmk.remove_stopwords(resulting_string, self.stopwords)

        generate.generate_wordcloud(
            resulting_string,
            self.font_path,
            self.width,
            self.height,
            self.image_style,
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # registered in the order listed.
    cli_options = (
        ('--keyword', {'default': '2024巴黎奥运会'}),
        ('--max_result', {'type': int, 'default': 300}),
        ('--top_num', {'type': int, 'default': 8}),
        ('--image_style', {'default': 'family'}),
        ('--width', {'type': int, 'default': 300}),
        ('--height', {'type': int, 'default': 300}),
    )

    arg_parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        arg_parser.add_argument(flag, **options)
    cli_args = arg_parser.parse_args()

    crawler = DanmakuCrawler(
        cli_args.keyword,
        cli_args.max_result,
        cli_args.top_num,
        cli_args.image_style,
        cli_args.width,
        cli_args.height,
    )

    # Only build the word cloud when the crawl returned a non-empty result.
    collected = crawler.get_all_danmakus()
    if collected:
        crawler.generate_word_cloud(collected)
|