# Danmaku (bullet-screen) comment clustering: jieba tokenization + TF-IDF features + KMeans.
import pandas as pd

import jieba
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def jieba_tokenizer(text):
    """Segment *text* into a list of Chinese tokens using jieba."""
    return list(jieba.cut(text))
|
def cluster_analysis(input_path='barrage.csv', output_path='barrage_clustered.csv',
                     num_clusters=10, random_state=42):
    """Cluster danmaku comments with TF-IDF features and KMeans.

    Reads comments from *input_path* (CSV with a 'barrage' column),
    vectorizes them via jieba segmentation + TF-IDF, groups them into
    *num_clusters* clusters, prints a sample of each cluster, and writes
    the labeled data to *output_path*.

    Parameters default to the original hard-coded values, so existing
    callers (``cluster_analysis()``) behave the same.  ``random_state``
    fixes the KMeans seed so repeated runs produce the same labels.
    """
    df = pd.read_csv(input_path)

    # NLTK's Chinese stopword list, extended with high-frequency danmaku
    # tokens that carry little meaning for clustering.
    stopwords_list = list(stopwords.words('chinese'))
    stopwords_list.extend(['都', '不', '好', '5'])

    # Feature extraction: TF-IDF over jieba-segmented comments.
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])

    # KMeans clustering; the fixed random_state makes results reproducible
    # (the original call was unseeded and non-deterministic).
    km = KMeans(n_clusters=num_clusters, random_state=random_state)
    km.fit(tfidf_matrix)

    # Attach the cluster label of each comment to the original data.
    df['cluster'] = km.labels_

    # Print the first few comments of every cluster for a quick sanity check.
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")

    # Persist the labeled comments.
    df.to_csv(output_path, index=False)
# Script entry point: run the full clustering pipeline.
if __name__ == '__main__':
    cluster_analysis()