You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

71 lines
2.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import argparse
import logging
import time
import pandas as pd
import jieba
import generate
import dmk
# Raise the "jieba" logger's threshold to ERROR so that its lower-severity
# messages are not emitted while the script runs.
logging.getLogger("jieba").setLevel(logging.ERROR)
class DanmakuCrawler:
    """Collect danmakus for a search keyword, rank them, and draw a word cloud.

    Fetching and filtering are delegated to the project module ``dmk`` and
    rendering to ``generate``; this class wires those calls together and does
    the frequency counting with pandas.
    """

    # Font file handed to ``generate.generate_wordcloud`` unless the caller
    # supplies another one. The default is a Windows system path, so it has to
    # be overridden on other platforms.
    font_path = "C:\\Windows\\Fonts\\msyh.ttc"

    def __init__(self, keyword, max_result, top_num, image_style, width, height,
                 font_path=None):
        """Store the run settings and load the keyword and stop-word lists.

        Args:
            keyword: Search keyword; also used in the Excel file name.
            max_result: Passed to ``dmk.get_video_ids`` together with the
                keyword (presumably an upper bound on videos -- confirm in dmk).
            top_num: How many of the most frequent danmakus to keep.
            image_style: Passed through to ``generate.generate_wordcloud``.
            width: Passed through to ``generate.generate_wordcloud``.
            height: Passed through to ``generate.generate_wordcloud``.
            font_path: Optional font file for the word cloud. ``None`` keeps
                the class-level default.
        """
        self.keyword = keyword
        self.max_result = max_result
        self.top_num = top_num
        self.image_style = image_style
        self.width = width
        self.height = height
        if font_path is not None:
            self.font_path = font_path
        self.keywords = dmk.load_keywords(self.keyword)
        self.stopwords = dmk.load_stopwords('stopwords.txt')

    def _top_danmaku_counts(self, danmakus):
        """Return a Series mapping the ``top_num`` most frequent danmakus to their counts."""
        danmaku_df = pd.DataFrame(danmakus, columns=['danmaku'])
        return danmaku_df['danmaku'].value_counts().head(self.top_num)

    def get_all_danmakus(self):
        """Fetch and filter the danmakus of every video found for the keyword.

        When at least one danmaku survives filtering, the ``top_num`` most
        frequent ones and their counts are written to
        ``top_danmakus_<keyword>.xlsx`` in the current directory.

        Returns:
            The list of filtered danmakus, or an empty list when no video ids
            or no danmakus could be retrieved.
        """
        video_ids = dmk.get_video_ids(self.keyword, self.max_result)
        if not video_ids:
            print("No video ids retrieved; check the network and API responses.")
            return []

        all_danmakus = []
        for bvid in video_ids:
            danmakus = dmk.get_danmaku(bvid, self.keyword)
            all_danmakus.extend(dmk.filter_danmakus(danmakus, self.keywords))
        print(all_danmakus)

        if not all_danmakus:
            print("No danmakus retrieved; unable to generate word cloud.")
            # Nothing to rank: return before the export so that no empty
            # workbook is written.
            return all_danmakus

        # Save the top_num most frequent danmakus and their counts to Excel.
        top_danmakus = self._top_danmaku_counts(all_danmakus)
        top_danmakus.to_excel(f'top_danmakus_{self.keyword}.xlsx')
        return all_danmakus

    def generate_word_cloud(self, danmakus):
        """Render a word cloud from the most frequent entries of ``danmakus``.

        The ``top_num`` most frequent danmakus are joined with spaces,
        segmented with ``jieba.cut``, passed through ``dmk.remove_stopwords``
        and then handed to ``generate.generate_wordcloud`` together with the
        configured font, size, and image style.
        """
        danmaku_frequency = self._top_danmaku_counts(danmakus).to_dict()
        keys = ' '.join(danmaku_frequency)
        resulting_string = ' '.join(jieba.cut(keys))
        resulting_string = dmk.remove_stopwords(resulting_string, self.stopwords)
        generate.generate_wordcloud(
            resulting_string, self.font_path, self.width, self.height, self.image_style
        )
if __name__ == "__main__":
    # Command-line options, declared in the order they are registered with
    # the parser: (flag, keyword arguments for add_argument).
    cli_options = (
        ('--keyword', {'default': '2024巴黎奥运会'}),
        ('--max_result', {'type': int, 'default': 300}),
        ('--top_num', {'type': int, 'default': 8}),
        ('--image_style', {'default': 'family'}),
        ('--width', {'type': int, 'default': 300}),
        ('--height', {'type': int, 'default': 300}),
    )
    cli = argparse.ArgumentParser()
    for flag, options in cli_options:
        cli.add_argument(flag, **options)
    settings = cli.parse_args()

    crawler = DanmakuCrawler(
        keyword=settings.keyword,
        max_result=settings.max_result,
        top_num=settings.top_num,
        image_style=settings.image_style,
        width=settings.width,
        height=settings.height,
    )
    # Only draw the word cloud when the crawl actually produced danmakus.
    collected = crawler.get_all_danmakus()
    if collected:
        crawler.generate_word_cloud(collected)