You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

149 lines
5.1 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

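r"""
Danmaku (bullet-comment) statistics. Files written to the output directory:
- 弹幕统计.xlsx: workbook with three sheets: 原句 (raw lines), 排名 (ranking) and TopN
- 前8条弹幕.txt: the TopN sentences as text, one "count<TAB>sentence" per line
- 词云图.png: word-cloud image (only when the wordcloud package is installed)
"""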
import argparse
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

import pandas as pd
# Optional dependency: only needed for rendering the word-cloud image.
try:
    from wordcloud import WordCloud  # type: ignore
except ImportError:
    WordCloud = None  # word-cloud generation is skipped when the package is missing
def read_lines_from_file(f: Path) -> List[str]:
    """Read a text file and return its lines with surrounding whitespace stripped.

    The file is decoded as UTF-8 and undecodable bytes are dropped. A leading
    byte-order mark is removed so that it cannot stick to the first line and
    make it look different from otherwise identical lines. Blank lines are
    kept as empty strings.

    Args:
        f: Path of the file to read.

    Returns:
        The stripped lines, or an empty list when the file cannot be read.
    """
    try:
        # 'utf-8-sig' decodes like 'utf-8' but also consumes a leading BOM.
        content = f.read_text(encoding='utf-8-sig', errors='ignore')
    except (OSError, ValueError):
        # Best effort: an unreadable path simply contributes no lines.
        return []
    return [line.strip() for line in content.splitlines()]
def collect_texts(input_path: Path) -> List[str]:
    """Gather every non-empty line from the input.

    When ``input_path`` is a directory, all ``*.txt`` files below it are read
    recursively; otherwise ``input_path`` itself is read as a single file.

    Args:
        input_path: A text file or a directory containing text files.

    Returns:
        All lines returned by ``read_lines_from_file`` that are not empty.
    """
    if input_path.is_dir():
        sources = input_path.rglob('*.txt')
    else:
        sources = [input_path]
    return [
        line
        for source in sources
        for line in read_lines_from_file(source)
        if line
    ]
def rank_exact_sentences(texts: List[str]) -> pd.DataFrame:
    """Count identical sentences and rank them by how often they occur.

    Args:
        texts: The sentences to count; duplicates are what gets tallied.

    Returns:
        A DataFrame with the columns ``原句`` (sentence), ``数量`` (count) and
        ``排名`` (dense rank, 1 for the most frequent; equal counts share a
        rank). An empty input yields an empty frame with the same columns.
    """
    if not texts:
        return pd.DataFrame(columns=['原句', '数量', '排名'])

    # One row per distinct sentence, most frequent first.
    counts = pd.Series(texts, name='原句').value_counts(dropna=False)
    table = counts.rename_axis('原句').reset_index(name='数量')

    # Dense ranking: ties share a rank and no rank numbers are skipped.
    dense_rank = table['数量'].rank(method='dense', ascending=False)
    table['排名'] = dense_rank.astype(int)
    return table
def write_excel(raw_texts: List[str], rank_df: pd.DataFrame, out_dir: Path, topn: int) -> Tuple[Path, Path]:
    """Write the statistics workbook and the TopN text file into ``out_dir``.

    The workbook ``弹幕统计.xlsx`` is written with the openpyxl engine and has
    three sheets: ``原句`` (raw lines), ``排名`` (full ranking) and ``TopN``
    (the first ``topn`` rows of the ranking). The same top rows are also
    written to a UTF-8 text file, one ``count<TAB>sentence`` per line.

    Args:
        raw_texts: All collected sentences, written as-is to the ``原句`` sheet.
        rank_df: Ranking table with at least the ``原句`` and ``数量`` columns.
        out_dir: Output directory; created (with parents) when missing.
        topn: Number of top rows to export.

    Returns:
        ``(excel_path, top_txt_path)``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    excel_path = out_dir / '弹幕统计.xlsx'

    # Highest count first; ties are broken by the sentence text so the
    # output is reproducible.
    rank_sorted = rank_df.sort_values(by=['数量', '原句'], ascending=[False, True])
    top_rows = rank_sorted.head(topn)

    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        pd.DataFrame({'原句': raw_texts}).to_excel(writer, index=False, sheet_name='原句')
        rank_sorted.to_excel(writer, index=False, sheet_name='排名')
        top_rows.to_excel(writer, index=False, sheet_name='TopN')

    # The file name follows the requested size instead of a hard-coded "8";
    # with the default topn of 8 the name is still '前8条弹幕.txt'.
    top_txt = out_dir / f'前{topn}条弹幕.txt'
    with top_txt.open('w', encoding='utf-8') as f:
        f.writelines(
            f"{int(count)}\t{sentence}\n"
            for count, sentence in zip(top_rows['数量'], top_rows['原句'])
        )
    return excel_path, top_txt
def _pick_chinese_font() -> str:
"""确定字体"""
candidates = [
r"C:\\Windows\\Fonts\\simhei.ttf", # 黑体
r"C:\\Windows\\Fonts\\msyh.ttc", # 微软雅黑
r"C:\\Windows\\Fonts\\simsun.ttc", # 宋体
]
for p in candidates:
if Path(p).exists():
return p
return ''
def _simple_tokenize_for_wc(texts: List[str]) -> Counter:
#词云用的简单切分
stop = set("""的 了 啊 吗 呢 吧 是 在 我 你 他 她 它 这 那 也 与 和 很 都 就 还 不 没 有 说 啊呀 哦 嗯 呃 啊? 吧? 吗? 呢? ! 。 、 . , ~ 哈 哈哈 啊啊 啊啊啊 666 233""".split())
joined = '\n'.join(texts)
tokens: List[str] = []
rough = re.split(r"[^\w\u4e00-\u9fff]+", joined)
for tk in rough:
tk = tk.strip()
if len(tk) < 2 or tk in stop:
continue
if re.fullmatch(r"\d+", tk):
continue
tokens.append(tk)
return Counter(tokens)
def generate_wordcloud(raw_texts: List[str], out_dir: Path) -> Path:
    """Render a word-cloud image from the texts into ``out_dir``.

    Generation is skipped, with a message printed, when the optional
    ``wordcloud`` package is unavailable or when tokenizing yields no tokens.

    Args:
        raw_texts: The sentences whose tokens are drawn.
        out_dir: Output directory; created (with parents) when missing.

    Returns:
        The path of the written ``词云图.png``. When generation is skipped an
        empty ``Path()`` is returned; note that this is the current-directory
        path ``.``, so it is not distinguishable via ``exists()``.
    """
    if WordCloud is None:
        print('未安装wordcloud已跳过词云生成。')
        return Path()

    out_dir.mkdir(parents=True, exist_ok=True)

    frequencies = _simple_tokenize_for_wc(raw_texts)
    if not frequencies:
        print('词云:未得到有效分词,已跳过。')
        return Path()

    # An empty result from the font lookup means "no explicit font": pass None.
    chosen_font = _pick_chinese_font()
    options = dict(
        width=1600,
        height=900,
        background_color='white',
        font_path=chosen_font if chosen_font else None,
        collocations=False,
        prefer_horizontal=0.9,
    )
    cloud = WordCloud(**options)
    cloud.generate_from_frequencies(frequencies)

    target = out_dir / '词云图.png'
    cloud.to_file(str(target))
    return target
def main():
    """Command-line entry point.

    Parses ``--input`` (a text file or a directory), ``--out`` (output
    directory) and ``--topn`` (default 8), collects the texts, ranks identical
    sentences, writes the Excel/text outputs, attempts the word cloud, and
    prints a summary followed by a TopN preview.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='输入txt 文件或目录(目录下将递归读取 *.txt)')
    parser.add_argument('-o', '--out', required=True, help='输出目录')
    parser.add_argument('-n', '--topn', type=int, default=8, help='TopN 的数量(默认 8)')
    args = parser.parse_args()
    if args.topn < 1:
        # A negative value would make DataFrame.head() return "all but the
        # last n" rows, which is never what a TopN request means.
        parser.error('--topn 必须是正整数')

    input_path = Path(args.input)
    out_dir = Path(args.out)

    texts = collect_texts(input_path)
    if not texts:
        print('没有读取到任何弹幕文本,请检查路径。')
        return

    rank_df = rank_exact_sentences(texts)
    excel, top_txt = write_excel(texts, rank_df, out_dir, args.topn)
    wc_path = generate_wordcloud(texts, out_dir)

    print('完成:')
    print(f' 统计结果 Excel: {excel}')
    print(f'{args.topn} 文本: {top_txt}')
    # A skipped word cloud comes back as Path(), i.e. '.', which exists as a
    # directory; is_file() reports only a real image file.
    if wc_path and wc_path.is_file():
        print(f' 词云图: {wc_path}')

    print('TopN预览')
    # Use the same ordering as the exported TopN sheet so the preview matches.
    preview = rank_df.sort_values(by=['数量', '原句'], ascending=[False, True]).head(args.topn)
    print(preview.to_string(index=False))
# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()