"""Danmaku (bullet-comment) statistics tool.

Reads danmaku lines from a ``.txt`` file, or recursively from every
``*.txt`` file below a directory, and writes into the output directory:

- ``弹幕统计.xlsx``: workbook with three sheets (原句 / 排名 / TopN)
- ``前8条弹幕.txt``: the TopN sentences as tab-separated text
- ``词云图.png``: word cloud image (only when ``wordcloud`` is installed)
"""
import argparse
import re
import sys
from collections import Counter
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd

# Optional dependency: word cloud generation is skipped when it is missing.
try:
    from wordcloud import WordCloud  # type: ignore
except ImportError:
    WordCloud = None


# Column names shared by the ranking table and the Excel sheets.
_COL_SENTENCE = '原句'
_COL_COUNT = '数量'
_COL_RANK = '排名'

# Fonts able to render Chinese glyphs, probed in order of preference.
_FONT_CANDIDATES = (
    r"C:\Windows\Fonts\simhei.ttf",   # SimHei
    r"C:\Windows\Fonts\msyh.ttc",     # Microsoft YaHei
    r"C:\Windows\Fonts\simsun.ttc",   # SimSun
    "/System/Library/Fonts/PingFang.ttc",
    "/System/Library/Fonts/STHeiti Medium.ttc",
    "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
    "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc",
    "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
)

# Tokens ignored when building the word cloud frequencies.
_WC_STOPWORDS = frozenset(
    """的 了 啊 吗 呢 吧 是 在 我 你 他 她 它 这 那 也 与 和 很 都 就 还 不 没 有 说
    啊呀 哦 嗯 呃 啊? 吧? 吗? 呢? ? ! ! , 。 、 . , ~ ~
    哈 哈哈 啊啊 啊啊啊 666 233""".split()
)
# Anything that is neither a word character nor a CJK ideograph separates tokens.
_WC_SPLIT_RE = re.compile(r"[^\w\u4e00-\u9fff]+")
_WC_DIGITS_RE = re.compile(r"\d+")


def read_lines_from_file(f: Path) -> List[str]:
    """Return the whitespace-stripped lines of *f* (blank lines included).

    The file is decoded as UTF-8 with undecodable bytes dropped.  A leading
    byte-order mark is removed so the first line compares equal to identical
    lines elsewhere.  Reading is best-effort: if the file cannot be read, a
    warning goes to stderr and an empty list is returned.
    """
    try:
        # 'utf-8-sig' strips a BOM if present; str.strip() would not.
        content = f.read_text(encoding='utf-8-sig', errors='ignore')
    except (OSError, ValueError) as exc:
        print(f'读取失败,已跳过: {f} ({exc})', file=sys.stderr)
        return []
    return [line.strip() for line in content.splitlines()]


def collect_texts(input_path: Path) -> List[str]:
    """Collect every non-empty danmaku line from *input_path*.

    If *input_path* is a directory, all ``*.txt`` files below it are read
    recursively in sorted path order, so the result is reproducible.
    Otherwise *input_path* itself is read as a single file.
    """
    if input_path.is_dir():
        files = sorted(p for p in input_path.rglob('*.txt') if p.is_file())
    else:
        files = [input_path]
    return [line for f in files for line in read_lines_from_file(f) if line]


def rank_exact_sentences(texts: List[str]) -> pd.DataFrame:
    """Count identical sentences and rank them by frequency.

    Returns a DataFrame with columns 原句 (sentence), 数量 (count) and
    排名 (dense rank, 1 = most frequent).  Rows are ordered by count
    descending, then by sentence ascending, so ties are deterministic.
    """
    columns = [_COL_SENTENCE, _COL_COUNT, _COL_RANK]
    if not texts:
        return pd.DataFrame(columns=columns)

    counts = Counter(texts)
    df = pd.DataFrame({
        _COL_SENTENCE: list(counts.keys()),
        _COL_COUNT: list(counts.values()),
    })
    df = df.sort_values(
        by=[_COL_COUNT, _COL_SENTENCE], ascending=[False, True]
    ).reset_index(drop=True)
    # Dense ranking: sentences with equal counts share a rank, with no gaps.
    df[_COL_RANK] = df[_COL_COUNT].rank(method='dense', ascending=False).astype(int)
    return df


def write_excel(raw_texts: List[str], rank_df: pd.DataFrame, out_dir: Path,
                topn: int) -> Tuple[Path, Path]:
    """Write the Excel workbook and the TopN text file into *out_dir*.

    The workbook has three sheets: the raw sentences, the full ranking and
    the first *topn* rows of the ranking.  The text file holds one
    ``count<TAB>sentence`` line per TopN row.  *out_dir* is created if needed.

    Returns:
        ``(excel_path, top_txt_path)``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    excel_path = out_dir / '弹幕统计.xlsx'

    # Sort here as well so callers may pass an unsorted ranking table.
    rank_sorted = rank_df.sort_values(
        by=[_COL_COUNT, _COL_SENTENCE], ascending=[False, True]
    )
    top_df = rank_sorted.head(topn)

    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        pd.DataFrame({_COL_SENTENCE: raw_texts}).to_excel(
            writer, index=False, sheet_name='原句')
        rank_sorted.to_excel(writer, index=False, sheet_name='排名')
        top_df.to_excel(writer, index=False, sheet_name='TopN')

    # Also emit the TopN rows as plain text.
    top_txt = out_dir / '前8条弹幕.txt'
    with top_txt.open('w', encoding='utf-8') as f:
        f.writelines(
            f"{int(count)}\t{sentence}\n"
            for sentence, count in zip(top_df[_COL_SENTENCE], top_df[_COL_COUNT])
        )
    return excel_path, top_txt


def _pick_chinese_font() -> str:
    """Return the first existing font file from the candidate list, or ''."""
    for candidate in _FONT_CANDIDATES:
        if Path(candidate).is_file():
            return candidate
    return ''


def _simple_tokenize_for_wc(texts: List[str]) -> Counter:
    """Split *texts* into rough tokens and count them for the word cloud.

    Tokens are runs of word characters.  Tokens shorter than two characters,
    stop words and purely numeric tokens are dropped.
    """
    joined = '\n'.join(texts)
    return Counter(
        tk
        for tk in _WC_SPLIT_RE.split(joined)
        if len(tk) >= 2
        and tk not in _WC_STOPWORDS
        and not _WC_DIGITS_RE.fullmatch(tk)
    )


def generate_wordcloud(raw_texts: List[str], out_dir: Path) -> Path:
    """Render a word cloud PNG from *raw_texts* into *out_dir*.

    Returns the path of the written image.  When ``wordcloud`` is not
    installed or no usable tokens remain, a message is printed and an empty
    ``Path()`` is returned.  Note that ``Path()`` is the current directory,
    so callers should test the result with ``is_file()``, not ``exists()``.
    """
    if WordCloud is None:
        print('未安装wordcloud,已跳过词云生成。')
        return Path()

    out_dir.mkdir(parents=True, exist_ok=True)
    freq = _simple_tokenize_for_wc(raw_texts)
    if not freq:
        print('词云:未得到有效分词,已跳过。')
        return Path()

    # Pass None when no font is found so WordCloud uses its bundled default.
    font_path = _pick_chinese_font() or None
    wc = WordCloud(
        width=1600,
        height=900,
        background_color='white',
        font_path=font_path,
        collocations=False,
        prefer_horizontal=0.9,
    )
    wc.generate_from_frequencies(freq)
    out_path = out_dir / '词云图.png'
    wc.to_file(str(out_path))
    return out_path


def main():
    """Command-line entry point: parse arguments and produce all outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True,
                        help='输入:txt 文件或目录(目录下将递归读取 *.txt)')
    parser.add_argument('-o', '--out', required=True, help='输出目录')
    parser.add_argument('-n', '--topn', type=int, default=8,
                        help='TopN 的数量(默认 8)')
    args = parser.parse_args()
    if args.topn < 1:
        # A negative value would make DataFrame.head() return
        # "all but the last n" rows, which is never what the user wants.
        parser.error('--topn 必须为正整数')

    input_path = Path(args.input)
    out_dir = Path(args.out)

    texts = collect_texts(input_path)
    if not texts:
        print('没有读取到任何弹幕文本,请检查路径。')
        return

    rank_df = rank_exact_sentences(texts)
    excel, top_txt = write_excel(texts, rank_df, out_dir, args.topn)
    wc_path = generate_wordcloud(texts, out_dir)

    print('完成:')
    print(f' 统计结果 Excel: {excel}')
    print(f' 前{args.topn} 文本: {top_txt}')
    # is_file() rather than exists(): the "skipped" value Path() is '.',
    # which always exists as a directory.
    if wc_path.is_file():
        print(f' 词云图: {wc_path}')
    print('TopN预览:')
    print(rank_df.head(args.topn).to_string(index=False))


if __name__ == '__main__':
    main()