# Crawler/数据分析.py

import pandas as pd
import jieba
from collections import Counter


def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容',
                               min_length=3, top_n=50, output_path='高频词.xlsx'):
    """Count the most frequent words in one text column of an Excel sheet.

    The cells of ``column_name`` are joined into a single string, segmented
    with ``jieba.cut``, filtered by token length and counted. The ranking is
    printed to stdout and written to an Excel workbook.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Worksheet to load from the workbook.
        column_name: Column whose cells hold the text to analyse.
        min_length: Minimum number of characters a token needs to be counted.
        top_n: Number of most frequent words to keep.
        output_path: Path of the workbook the ranking is written to.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count, or
        ``None`` when ``column_name`` is not present in the sheet (a message
        is printed in that case and nothing is written).
    """
    # 1. Load the requested worksheet.
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    if column_name not in df.columns:
        print(f"列名 '{column_name}' 在 Excel 文件中未找到。")
        return None

    # 2. Collect the column's text. Empty cells are dropped first: astype(str)
    #    would otherwise turn each NaN into the literal text "nan", which is
    #    long enough to pass the default length filter and be counted as a word.
    texts = df[column_name].dropna().astype(str).tolist()
    report = ' '.join(texts)  # merge all rows into one string

    # 3. Segment the text into tokens.
    words = jieba.cut(report)

    # 4. Keep only tokens that are long enough.
    report_words = [word for word in words if len(word) >= min_length]

    # 5. Rank the tokens by frequency.
    result = Counter(report_words).most_common(top_n)

    # 6. Print the ranking.
    print("高频词汇统计结果：")
    for word, count in result:
        print(f"{word}: {count}")

    # 7. Save the words and their frequencies to the output workbook.
    result_df = pd.DataFrame(result, columns=['词汇', '频率'])
    result_df.to_excel(output_path, index=False, sheet_name='高频词汇')

    return result


if __name__ == '__main__':
    # Script entry point: edit the three values below to match your workbook.
    read_excel_and_count_words(
        file_path='合并弹幕.xlsx',   # path of the Excel file to analyse
        sheet_name='MergedData',    # worksheet that holds the data
        column_name='弹幕文本',      # column that contains the text
    )
# ADD file via upload 2 months ago
			`import pandas as pd`
			`import jieba`
			`from collections import Counter`


			`def read_excel_and_count_words(file_path, sheet_name='Sheet1', column_name='内容'):`
			`"""读取指定的 Excel 文件并进行词频统计"""`
			`# 1. 读取 Excel 文件的指定表格和列`
			`df = pd.read_excel(file_path, sheet_name=sheet_name)`

			`if column_name not in df.columns:`
			`print(f"列名 '{column_name}' 在 Excel 文件中未找到。")`
			`return`

			`# 2. 获取指定列的文本内容`
			`report = ' '.join(df[column_name].astype(str).tolist()) # 将所有行合并为一个字符串`

			`# 3. 进行分词`
			`words = jieba.cut(report)`

			`# 4. 按指定长度提取词`
			`report_words = [word for word in words if len(word) >= 3]`

			`# 5. 统计高频词汇`
			`result = Counter(report_words).most_common(50)`

			`# 6. 输出结果`
			`print("高频词汇统计结果：")`
			`for word, count in result:`
			`print(f"{word}: {count}")`

			`# 7. 保存高频词及其频率至高频词.xlsx`
			`result_df = pd.DataFrame(result, columns=['词汇', '频率']) # 创建 DataFrame`
			`result_df.to_excel('高频词.xlsx', index=False, sheet_name='高频词汇') # 保存到 Excel 文件`


			`if __name__ == '__main__':`
			`file_path = '合并弹幕.xlsx' # 替换为你的 Excel 文件路径`
			`sheet_name = 'MergedData' # 替换为你需要的工作表名称`
			`column_name = '弹幕文本' # 替换为你的文本内容所在的列名`

			`read_excel_and_count_words(file_path, sheet_name, column_name)`