#%%
# Exploratory analysis of 'doubandata.csv': clean the first 100 rows, derive
# year / location / category columns from the '基本信息' text, then draw a line
# chart, a pie chart, a bar chart and a word cloud of the synopses.
import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud as WC

# %%
dataset = pd.read_csv('doubandata.csv')
dataset = dataset.head(100)
# drop_duplicates parameters:
#   subset  : column label or sequence of labels, optional; restricts the
#             comparison to specific columns (default: all columns).
#   keep    : {'first', 'last', False}, default 'first'; drop duplicates and
#             keep the first occurrence.
#   inplace : boolean, default False.
dataset = dataset.drop_duplicates(subset=None, keep='first', inplace=False)
# dropna parameters:
#   axis    : axis=0 means index (rows), axis=1 means columns; default 0.
#   how     : "all" drops a row/column only when every element in it is
#             missing (NaN); "any" drops it as soon as one element is missing.
#   subset  : only look for missing values within this subset of labels;
#             missing values outside the subset do not cause a drop (axis
#             decides whether rows or columns are meant).
#   inplace : whether to modify the original object or return a new one;
#             default False, i.e. a new object is created and the original
#             is left unchanged.
dataset = dataset.dropna(axis=0, how='any', subset=None, inplace=False)
dataset

# %%
# Year: the first run of four digits found in the basic-info text.
# NOTE: findall(...)[0] raises IndexError for a row without any match.
dataset['时间'] = dataset['基本信息'].apply(lambda x: re.findall(r'\d\d\d\d', x)[0])
dataset['时间']

# %%
# Review count: the first run of digits in the column (kept as a string).
dataset['评论数'] = dataset['评论数'].apply(lambda x: re.findall(r'\d+', x)[0])
dataset['评论数']

# %%
# Everything that follows the four-digit year on the same line; it is split
# into location and category in the cells below.
dataset['类别'] = dataset['基本信息'].apply(lambda x: re.findall(r'\d\d\d\d(.+)', x)[0])
dataset['类别']

# %%
# NOTE(review): the three replace() arguments look identical here; presumably
# they were distinct whitespace characters originally - confirm against the
# raw data before simplifying this chain.
dataset['类别'] = dataset['类别'].apply(lambda x: x.replace(' ','_').replace(' ','_').replace(' ','_'))
dataset['类别']

# %%
# Location: second '/'-separated field, first '_'-separated token, with all
# remaining whitespace removed. Must run before '类别' is overwritten below.
dataset['地点'] = dataset['类别'].apply(
    lambda x: re.sub(r'\s+', '', x.split(r'/')[1].split('_')[0].replace(' ', ''))
)
dataset['地点']

# %%
# Category: third '/'-separated field, last '_'-separated token, with all
# remaining whitespace removed.
dataset['类别'] = dataset['类别'].apply(
    lambda x: re.sub(r'\s+', '', x.split(r'/')[2].split('_')[-1].replace(' ', ''))
)
dataset['类别']

# %%
dataset

# %%
# Line chart
plt.figure(figsize=(12,7),dpi=80)
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
# Order by year (the index), not by count: the chart shows how the number of
# movies changes over the years, so the x axis has to be chronological.
# The years are four-digit strings, so lexical order equals numeric order.
Y_li = dataset['时间'].value_counts().sort_index()
X_li = Y_li.index
plt.plot(list(X_li), list(Y_li), label='年份电影数', picker=4, color='r')
# Annotate every point with its count.
for x, y in zip(X_li, Y_li):
    plt.text(x, y, y)
plt.xticks(list(X_li), list(X_li), rotation=90)
plt.title('年份电影数变化折线图')
plt.xlabel('年份')
plt.ylabel('数量')
plt.legend(loc='upper left')
plt.savefig('年份电影数变化折线图')
plt.show()

# %%
# Pie chart
plt.figure(figsize=(12,7),dpi=80)
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
# Counts are sorted ascending, so the 20 most frequent locations are the last
# 20 entries; taking the first 20 would keep the rarest ones instead. With 20
# or fewer distinct locations the result is the full series either way.
Y_li = dataset['地点'].value_counts(ascending=True).tail(20)
X_li = Y_li.index
plt.pie(list(Y_li), labels=list(X_li), autopct='%.1f%%')
plt.title('地区电影数量饼图')
plt.savefig('地区电影数量饼图')
plt.show()

# %%
# Bar chart
plt.figure(figsize=(12,7),dpi=80)
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
Y_li = dataset['类别'].value_counts(ascending=True)
X_li = Y_li.index
# Integer bar positions; the category names are attached through xticks.
x_ = range(len(X_li))
plt.bar(x_, list(Y_li), width=0.7, color='blue', alpha=0.5, edgecolor='c', label='数量', picker=2)
# Put each count slightly above and left-of-centre of its bar.
for x, y in zip(x_, Y_li):
    plt.text(x-0.2, y+0.1, y)
plt.xticks(x_, list(X_li), rotation=90)
plt.title('类别电影数柱状图')
plt.xlabel('类别')
plt.ylabel('数量')
plt.legend(loc='upper left')
plt.savefig('类别电影数柱状图')
plt.show()

# %%
# Word cloud of the synopsis text
plt.rcParams['font.sans-serif']=['Microsoft YaHei']
# Concatenate all synopses in one pass instead of repeated string addition.
text = ''.join(str(i) for i in dataset['简介'])
# Segment with jieba and re-join with spaces before handing the text to
# WordCloud.generate.
txtlist = jieba.lcut(text)
string = " ".join(txtlist)
wc = WC(width=1000,
        height=700,
        background_color='white',
        font_path='msyh.ttc')
wc.generate(string)
plt.imshow(wc)
plt.savefig('电影简介词云')
plt.show()

# %%