import pandas as pd
import jieba
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Tokenize barrage (danmaku) text with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)


def cluster_analysis():
    df = pd.read_csv('barrage.csv')

    # Build the stop-word list; NLTK ships a Chinese stop-word corpus
    # (requires a prior nltk.download('stopwords'))
    stopwords_list = list(stopwords.words('chinese'))
    stopwords_list.extend(['都', '不', '好', '5'])

    # Extract TF-IDF features from the barrage text
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])

    # Cluster with KMeans (fixed seed for reproducible labels)
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    km.fit(tfidf_matrix)

    # Attach the cluster labels to the original data
    df['cluster'] = km.labels_

    # Print the first few barrages in each cluster
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")

    # Save the clustering result to a CSV file
    df.to_csv('barrage_clustered.csv', index=False)


if __name__ == '__main__':
    cluster_analysis()
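
# ----------------------------------------------------------------------
# Note: num_clusters = 10 above is an arbitrary choice. Below is a
# minimal, illustrative sketch (not part of the original script) of
# selecting k by silhouette score; `pick_num_clusters` is a hypothetical
# helper, and the candidate range 2..14 is an assumption.
# ----------------------------------------------------------------------
from sklearn.metrics import silhouette_score


def pick_num_clusters(tfidf_matrix, candidates=range(2, 15)):
    # Fit KMeans for each candidate k and keep the k with the highest
    # mean silhouette score (higher = tighter, better-separated clusters).
    best_k, best_score = None, -1.0
    for k in candidates:
        labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(tfidf_matrix)
        score = silhouette_score(tfidf_matrix, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k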