You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 lines
1.4 KiB
44 lines
1.4 KiB
6 months ago
|
import csv
|
||
|
import re
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from wordcloud import WordCloud
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
def purification(self, new_self, column=13):
    """Extract the Chinese text of one CSV column into a new CSV file.

    Every row of the input file is read, the runs of consecutive characters
    in the range U+4E00-U+9FA5 found in the selected column are collected,
    and one output row is written per input row, with each run as its own
    field.

    Args:
        self: Path of the UTF-8 CSV file to read. Despite the name this is a
            plain path, not an instance; the name is kept for existing
            callers.
        new_self: Path of the UTF-8 CSV file to write (overwritten).
        column: Non-negative, zero-based index of the column to scan.
            Defaults to 13.

    Rows that have no field at ``column`` (short rows, blank lines) yield an
    empty output row instead of raising ``IndexError``, so the output keeps
    exactly one row per input row.
    """
    # Compiled once up front rather than once per row.
    han_runs = re.compile("[\u4e00-\u9fa5]+")

    with open(self, "r", newline='', encoding='utf-8') as source:
        extracted = [
            han_runs.findall(row[column]) if len(row) > column else []
            for row in csv.reader(source)
        ]

    # The input is fully read and closed before the output is opened, so the
    # same path can safely be used for both.
    with open(new_self, 'w', newline='', encoding='utf-8') as target:
        csv.writer(target).writerows(extracted)
|
||
|
|
||
|
def wordcloud():
    """Render a word cloud from the first column of 'analyse.csv'.

    The file is loaded with '$' as the field separator, the first column is
    de-duplicated, its values are joined with spaces into a single text, and
    the resulting word cloud is displayed in a matplotlib window.

    NOTE(review): read_csv is called without an explicit ``header`` argument,
    so pandas' default header handling applies to the first line of the
    file - confirm that this is intended.
    """
    # Load the CSV file.
    frame = pd.read_csv('analyse.csv', sep='$')

    # Take the first column (positional index 0) and drop repeated values.
    distinct_values = frame.iloc[:, 0].drop_duplicates()

    # Turn the remaining values into strings and join them with spaces.
    text = ' '.join(distinct_values.astype(str).tolist())

    # Build the word cloud; no stop words are configured.
    cloud_builder = WordCloud(
        font_path='simhei.ttf',
        width=1000,
        height=600,
        background_color='white',
        stopwords=[],
    )
    cloud = cloud_builder.generate(text)

    # Show the image without axes.
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
|
||
|
|
||
|
# Script entry: extract the Chinese text from the raw CSV into the
# intermediate file, then draw the word cloud from that file.
# NOTE(review): 'congtent.csv' may be a misspelling of 'content.csv' -
# confirm against the actual data file name before changing it.
self, new_self = 'congtent.csv', 'analyse.csv'
purification(self, new_self)
wordcloud()
|