|
|
#%%
|
|
|
import pandas as pd
|
|
|
import matplotlib.pyplot as plt
|
|
|
from wordcloud import WordCloud as WC
|
|
|
import jieba
|
|
|
import re
|
|
|
# %%
|
|
|
# Read the CSV, keep its first 100 rows, then drop exact duplicate rows and
# every row that still contains a missing value.
dataset = (
    pd.read_csv('doubandata.csv')
    .head(100)
    # drop_duplicates:
    #   subset  -- column label(s) to compare; None compares every column.
    #   keep    -- 'first' retains the first occurrence of a duplicated row.
    #   inplace -- False returns a new frame instead of editing in place.
    .drop_duplicates(subset=None, keep='first', inplace=False)
    # dropna:
    #   axis    -- 0 removes rows, 1 removes columns.
    #   how     -- 'any' removes a row as soon as one value is missing;
    #              'all' would only remove rows that are entirely missing.
    #   subset  -- labels to inspect for missing values; None inspects all.
    #   inplace -- False returns a new frame and leaves the input unchanged.
    .dropna(axis=0, how='any', subset=None, inplace=False)
)

# Cell output: show the cleaned frame.
dataset
|
|
|
# %%
|
|
|
# Store the first run of four consecutive digits found in "基本信息" as the
# "时间" column (the value stays a string). A row without such a run raises
# IndexError, exactly as before.
# The pattern is a raw string so that \d is handed to the regex engine
# instead of being parsed as an (invalid) string escape sequence.
dataset['时间'] = dataset['基本信息'].apply(
    lambda info: re.findall(r'\d\d\d\d', info)[0]
)

# Cell output: show the new column.
dataset['时间']
|
|
|
# %%
|
|
|
# Reduce each "评论数" entry to its first run of digits. The result is still
# a string of digits; no numeric conversion happens here. An entry with no
# digit raises IndexError, exactly as before.
# The pattern is a raw string so that \d is handed to the regex engine
# instead of being parsed as an (invalid) string escape sequence.
dataset['评论数'] = dataset['评论数'].apply(
    lambda raw: re.findall(r'\d+', raw)[0]
)

# Cell output: show the rewritten column.
dataset['评论数']
|
|
|
# %%
|
|
|
# Keep everything that follows the first four-digit run in "基本信息" and
# store it as the "类别" column. A row without four digits followed by at
# least one more character raises IndexError, exactly as before.
# The pattern is a raw string so that \d is handed to the regex engine
# instead of being parsed as an (invalid) string escape sequence.
dataset['类别'] = dataset['基本信息'].apply(
    lambda info: re.findall(r'\d\d\d\d(.+)', info)[0]
)

# Cell output: show the new column.
dataset['类别']
|
|
|
# %%
|
|
|
def _underscore_spaces(raw):
    """Return *raw* with the replaced space characters turned into '_'."""
    # NOTE(review): the three replace() targets look identical here;
    # presumably they were distinct whitespace characters in the original
    # file -- confirm before collapsing them into a single call.
    return raw.replace(' ','_').replace(' ','_').replace(' ','_')


dataset['类别'] = dataset['类别'].apply(_underscore_spaces)

# Cell output: show the rewritten column.
dataset['类别']
|
|
|
# %%
|
|
|
def _first_region(info):
    """Return the first '_'-separated entry of the second '/'-separated field.

    All whitespace is stripped from the result.
    Raises IndexError when *info* contains no '/'.
    """
    second_field = info.split('/')[1]
    first_entry = second_field.split('_')[0].replace(' ','')
    # Raw string: \s must reach the regex engine, not be parsed as an
    # (invalid) string escape sequence.
    return re.sub(r'\s+', '', first_entry)


# Derive the "地点" column from the still-unsplit "类别" text.
dataset['地点'] = dataset['类别'].apply(_first_region)

# Cell output: show the new column.
dataset['地点']
|
|
|
# %%
|
|
|
def _last_category(info):
    """Return the last '_'-separated entry of the third '/'-separated field.

    All whitespace is stripped from the result.
    Raises IndexError when *info* has fewer than two '/' separators.
    """
    third_field = info.split('/')[2]
    last_entry = third_field.split('_')[-1].replace(' ','')
    # Raw string: \s must reach the regex engine, not be parsed as an
    # (invalid) string escape sequence.
    return re.sub(r'\s+', '', last_entry)


# Overwrite "类别" in place with the single extracted entry; anything that
# needs the unsplit text has to read it before this statement runs.
dataset['类别'] = dataset['类别'].apply(_last_category)

# Cell output: show the rewritten column.
dataset['类别']
|
|
|
# %%
|
|
|
# Cell output: display the whole frame in its current state.
dataset
|
|
|
# %%
|
|
|
# Line chart: number of films per year.
plt.figure(figsize=(12,7),dpi=80)
# Font able to render the Chinese labels used below.
plt.rcParams['font.sans-serif']=['Microsoft YaHei']

# Count rows per year and order them by year. Sorting by count (the previous
# ascending=True) made the line rise monotonically whatever the data was.
# The years are four-digit strings, so lexical order equals chronological order.
Y_li = dataset['时间'].value_counts().sort_index()
X_li = Y_li.index

plt.plot(list(X_li),list(Y_li),label='年份电影数',picker=4,color='r')

# Annotate every point with its count.
for x,y in zip(X_li,Y_li):
    plt.text(x,y,y)

plt.xticks(list(X_li),list(X_li),rotation=90)
plt.title('年份电影数变化折线图')
plt.xlabel('年份')
plt.ylabel('数量')
plt.legend(loc='upper left')

# Save before show(), which hands the figure over to the GUI/inline backend.
plt.savefig('年份电影数变化折线图')
plt.show()
|
|
|
# %%
|
|
|
# Pie chart: share of films per region.
plt.figure(figsize=(12,7),dpi=80)
# Font able to render the Chinese labels used below.
plt.rcParams['font.sans-serif']=['Microsoft YaHei']

# Counts are sorted ascending, so the 20 most common regions are the LAST 20
# entries. The previous [:20] slice kept the 20 least common ones instead.
# With 20 or fewer distinct regions the result is unchanged.
Y_li = dataset['地点'].value_counts(ascending=True).tail(20)
X_li = Y_li.index

plt.pie(list(Y_li),labels=list(X_li),autopct='%.1f%%')
plt.title('地区电影数量饼图')

# Save before show(), which hands the figure over to the GUI/inline backend.
plt.savefig('地区电影数量饼图')
plt.show()
|
|
|
# %%
|
|
|
# Bar chart: number of films per category, smallest count first.
plt.figure(figsize=(12,7),dpi=80)
plt.rcParams['font.sans-serif']=['Microsoft YaHei']

Y_li = dataset['类别'].value_counts(ascending=True)
X_li = Y_li.index
# One integer slot per category; shared by the bars, the labels and the ticks.
x_ = range(len(X_li))

plt.bar(
    x_,
    list(Y_li),
    width=0.7,
    color='blue',
    alpha=0.5,
    edgecolor='c',
    label='数量',
    picker=2,
)

# Write each count slightly above and to the left of its bar's centre.
for x,y in zip(x_,Y_li):
    plt.text(x-0.2,y+0.1,y)

plt.xticks(x_,list(X_li),rotation=90)
plt.title('类别电影数柱状图')
plt.xlabel('类别')
plt.ylabel('数量')
plt.legend(loc='upper left')

plt.savefig('类别电影数柱状图')
plt.show()
|
|
|
# %%
|
|
|
# Word cloud built from the "简介" column.
plt.rcParams['font.sans-serif']=['Microsoft YaHei']

# Concatenate every cell with no separator. str() is applied per cell so
# that non-string values are included as text, as before. join() replaces
# the previous repeated `text = text + ...` concatenation loop.
text = ''.join(str(i) for i in dataset['简介'])

# Tokenise with jieba, then re-join the tokens with spaces before handing
# the string to WordCloud.
txtlist = jieba.lcut(text)
string = " ".join(txtlist)

# NOTE(review): 'msyh.ttc' is presumably a font file with CJK glyphs that
# must be resolvable from the working directory -- confirm.
wc = WC(
    width=1000,
    height=700,
    background_color='white',
    font_path='msyh.ttc',
)
wc.generate(string)

plt.imshow(wc)
# Save before show(), which hands the figure over to the GUI/inline backend.
plt.savefig('电影简介词云')
plt.show()
|
|
|
# %%
|