parent
1d86f92d6c
commit
a0b2971f40
@ -1,42 +0,0 @@
|
||||
import pandas as pd
|
||||
import jieba
|
||||
from collections import Counter
|
||||
|
||||
|
||||
def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容',
                               min_length=3, top_n=50, output_path='高频词.xlsx'):
    """Count the most frequent words in one text column of an Excel sheet.

    The column's cells are joined into a single string, segmented with
    ``jieba.cut``, filtered by word length, and tallied. The top entries are
    printed and also written to an Excel file.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Worksheet to load from the workbook.
        column_name: Column holding the text to analyse.
        min_length: Minimum number of characters a segmented word must have
            to be counted.
        top_n: Number of most frequent words to report.
        output_path: Path of the Excel file the ranking is written to.

    Returns:
        None. If ``column_name`` is not present in the sheet, a message is
        printed and nothing is written.
    """
    # 1. Load the requested worksheet.
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    if column_name not in df.columns:
        print(f"列名 '{column_name}' 在 Excel 文件中未找到。")
        return

    # 2. Collect the column's text. Missing cells are dropped first because
    #    astype(str) would render them as the literal text "nan", which is
    #    long enough to pass the default length filter and be counted.
    texts = df[column_name].dropna().astype(str).tolist()
    report = ' '.join(texts)  # merge all rows into one string

    # 3. Segment the text into words.
    words = jieba.cut(report)

    # 4. Keep only words of at least the requested length.
    report_words = [word for word in words if len(word) >= min_length]

    # 5. Tally and keep the most frequent words.
    result = Counter(report_words).most_common(top_n)

    # 6. Print the ranking.
    print("高频词汇统计结果:")
    for word, count in result:
        print(f"{word}: {count}")

    # 7. Save the words and their frequencies to the output workbook.
    result_df = pd.DataFrame(result, columns=['词汇', '频率'])
    result_df.to_excel(output_path, index=False, sheet_name='高频词汇')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: adjust the three values below to match your data.
    file_path = '合并弹幕.xlsx'  # path of the Excel workbook to analyse
    sheet_name = 'MergedData'  # worksheet to read
    column_name = '弹幕文本'  # column that holds the text

    read_excel_and_count_words(
        file_path=file_path,
        sheet_name=sheet_name,
        column_name=column_name,
    )
|
Loading…
Reference in new issue