You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

52 lines
1.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# 作者Halcyon王思平102201544
import os
import pandas as pd
import jieba
from collections import Counter
def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容',
                               min_length=3, top_n=50):
    """Count the most frequent segmented words in one column of an Excel sheet.

    Reads ``sheet_name`` from the workbook at ``file_path``, joins every
    non-empty cell of ``column_name`` into a single text, segments it with
    ``jieba.cut``, keeps the tokens that are at least ``min_length``
    characters long, prints the ``top_n`` most common ones and saves them to
    ``output/高频词.xlsx`` (sheet ``高频词汇``).

    Args:
        file_path: Path of the Excel workbook to analyse.
        sheet_name: Name of the worksheet to read.
        column_name: Name of the column holding the text to segment.
        min_length: Minimum token length, in characters, for a token to be
            counted.
        top_n: Number of most frequent words to print and save.

    Returns:
        None. When ``column_name`` is not present in the sheet, a message is
        printed and nothing is written.
    """
    # 1. Read the requested worksheet and make sure the column exists.
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    if column_name not in df.columns:
        print(f"列名 '{column_name}' 在 Excel 文件中未找到。")
        return

    # 2. Merge all rows into one string. Empty cells are dropped first:
    #    stringified they would become 'nan', which is three characters long
    #    and would otherwise be counted as a word.
    texts = df[column_name].dropna().astype(str).tolist()
    report = ' '.join(texts)

    # 3-5. Segment the text, keep tokens that are long enough, and take the
    #      most frequent ones.
    report_words = (word for word in jieba.cut(report) if len(word) >= min_length)
    result = Counter(report_words).most_common(top_n)

    # 6. Print the result.
    print("高频词汇统计结果:")
    for word, count in result:
        print(f"{word}: {count}")

    # 7. Save the words and their frequencies to an Excel file, creating the
    #    output folder if it does not exist yet.
    result_df = pd.DataFrame(result, columns=['词汇', '频率'])
    output_folder = 'output'
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, '高频词.xlsx')
    result_df.to_excel(output_file, index=False, sheet_name='高频词汇')
if __name__ == '__main__':
    # Script entry point: analyse the merged danmaku workbook.
    # The path is built with os.path.join instead of a hard-coded backslash so
    # the script also finds the file on non-Windows systems.
    file_path = os.path.join('output', '合并弹幕.xlsx')  # Excel file to analyse
    sheet_name = 'MergedData'  # worksheet name
    column_name = '弹幕文本'  # column that holds the danmaku text
    read_excel_and_count_words(file_path, sheet_name, column_name)