ADD file via upload

2 months ago · eba6d6049d
parent 74bd3ebef5
commit eba6d6049d
1 changed files with 51 additions and 0 deletions
--- a/(3)数据分析.py
+++ b/(3)数据分析.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+# 作者：Halcyon（王思平102201544）
+
+import os
+import pandas as pd
+import jieba
+from collections import Counter
+
+
+def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容'):
+    """Count the most frequent long words in one column of an Excel sheet.
+
+    Reads ``column_name`` from worksheet ``sheet_name`` of ``file_path``,
+    segments the combined text with jieba, keeps tokens that are at least
+    three characters long, prints the 50 most common ones and saves them to
+    ``output/高频词.xlsx`` (worksheet ``高频词汇``).
+
+    Args:
+        file_path: Path of the Excel workbook to analyse.
+        sheet_name: Name of the worksheet to read.
+        column_name: Name of the column that holds the text.
+
+    Returns:
+        A list of ``(word, count)`` tuples ordered from most to least
+        frequent, or an empty list when ``column_name`` is not a column of
+        the worksheet (a message is printed in that case).
+    """
+    # 1. Read the requested worksheet.
+    df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+    if column_name not in df.columns:
+        print(f"列名 '{column_name}' 在 Excel 文件中未找到。")
+        return []
+
+    # 2. Merge all rows of the column into one string.
+    #    Empty cells are dropped first: astype(str) would otherwise turn each
+    #    NaN into the literal text "nan", which is three characters long and
+    #    would be counted as a real word by the length filter below.
+    report = ' '.join(df[column_name].dropna().astype(str))
+
+    # 3-4. Segment the text and keep only tokens of three or more characters.
+    report_words = [word for word in jieba.cut(report) if len(word) >= 3]
+
+    # 5. Take the 50 most frequent tokens.
+    result = Counter(report_words).most_common(50)
+
+    # 6. Print the ranking.
+    print("高频词汇统计结果：")
+    for word, count in result:
+        print(f"{word}: {count}")
+
+    # 7. Save the words and their frequencies to an Excel file.
+    result_df = pd.DataFrame(result, columns=['词汇', '频率'])
+
+    output_folder = 'output'
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(output_folder, exist_ok=True)
+    output_file = os.path.join(output_folder, '高频词.xlsx')
+
+    result_df.to_excel(output_file, index=False, sheet_name='高频词汇')
+
+    return result
+
+
+if __name__ == '__main__':
+    # Build the path with os.path.join instead of a hard-coded backslash so
+    # the script also finds the workbook on non-Windows systems.
+    file_path = os.path.join('output', '合并弹幕.xlsx')  # workbook to analyse
+    sheet_name = 'MergedData'     # worksheet name
+    column_name = '弹幕文本'       # column that holds the text to analyse
+
+    read_excel_and_count_words(file_path, sheet_name, column_name)