|
|
|
@ -0,0 +1,51 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
# 作者:Halcyon(王思平102201544)
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import jieba
|
|
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容',
                               min_length=3, top_n=50):
    """Count the most frequent long words in one column of an Excel sheet.

    The text of every non-empty cell in ``column_name`` is joined, segmented
    with ``jieba.cut``, filtered by token length and counted. The ranking is
    printed to stdout and written to ``output/高频词.xlsx`` (sheet ``高频词汇``).

    Args:
        file_path: Path of the Excel workbook to analyse.
        sheet_name: Name of the worksheet to read.
        column_name: Name of the column holding the text.
        min_length: Minimum number of characters a token must have to be
            counted.
        top_n: How many of the most frequent tokens to report.

    Returns:
        None. If ``column_name`` is not present in the sheet, a message is
        printed and nothing is written.

    Errors raised by ``pd.read_excel`` or ``DataFrame.to_excel`` are not
    caught here and propagate to the caller.
    """
    # 1. Read the requested worksheet.
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    if column_name not in df.columns:
        print(f"列名 '{column_name}' 在 Excel 文件中未找到。")
        return

    # 2. Collect the text of the column as one string.
    # Empty cells are dropped first: astype(str) would otherwise render them
    # as the literal text "nan", which is long enough to pass the default
    # length filter and would show up as a bogus frequent word.
    texts = df[column_name].dropna().astype(str)
    report = ' '.join(texts)

    # 3. Segment the text into tokens.
    words = jieba.cut(report)

    # 4. Keep only tokens that are long enough.
    report_words = [word for word in words if len(word) >= min_length]

    # 5. Rank tokens by frequency.
    result = Counter(report_words).most_common(top_n)

    # 6. Print the ranking.
    print("高频词汇统计结果:")
    for word, count in result:
        print(f"{word}: {count}")

    # 7. Save the ranking to an Excel file.
    result_df = pd.DataFrame(result, columns=['词汇', '频率'])

    output_folder = 'output'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, '高频词.xlsx')

    result_df.to_excel(output_file, index=False, sheet_name='高频词汇')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: analyse the merged danmaku workbook.
    # The path is built with os.path.join so the separator matches the host
    # OS; a hard-coded backslash only resolves to the folder on Windows.
    file_path = os.path.join('output', '合并弹幕.xlsx')  # Excel file to analyse
    sheet_name = 'MergedData'  # worksheet name
    column_name = '弹幕文本'  # column that holds the danmaku text

    read_excel_and_count_words(file_path, sheet_name, column_name)
|