You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

106 lines
3.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#%%
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud as WC
import jieba
import re
# %%
# Load the CSV and clean it in a single chain; every step returns a new frame.
dataset = (
    pd.read_csv('doubandata.csv')
    # Keep only the first 100 rows.
    .head(100)
    # subset=None compares all columns; keep='first' retains the first
    # occurrence of each duplicated row; inplace=False returns a copy.
    .drop_duplicates(subset=None, keep='first', inplace=False)
    # axis=0 drops rows; how='any' drops a row as soon as one value is
    # missing; subset=None checks every column; inplace=False returns a copy.
    .dropna(axis=0, how='any', subset=None, inplace=False)
)
# Bare expression so the notebook cell displays the cleaned frame.
dataset
# %%
# Year column: the first run of four consecutive digits found in '基本信息'.
# The pattern is a raw string because '\d' inside a plain literal is an invalid
# escape sequence, which recent Python versions warn about.
# A row without any four-digit run raises IndexError.
dataset['时间'] = dataset['基本信息'].apply(lambda info: re.findall(r'\d{4}', info)[0])
dataset['时间']
# %%
# Review count: the first run of digits in the original '评论数' text.
# The extracted value is kept as a string, exactly as matched.
dataset['评论数'] = dataset['评论数'].apply(lambda raw: re.findall(r'\d+', raw)[0])
dataset['评论数']
# %%
# Raw tail of '基本信息': whatever follows the first four-digit match on the
# same line ('.' does not cross a newline). Raw-string pattern, because '\d'
# in a plain literal is an invalid escape sequence on recent Python versions.
# A row without such a match raises IndexError.
dataset['类别'] = dataset['基本信息'].apply(lambda info: re.findall(r'\d{4}(.+)', info)[0])
dataset['类别']
# %%
# Replace three space-like characters with '_' so that tokens inside a field
# can be split on '_' below.
# NOTE(review): the three replaced literals render alike and the file viewer
# flagged ambiguous Unicode characters — confirm which whitespace variant each
# one is before touching this line. It is intentionally left unchanged.
dataset['类别'] = dataset['类别'].apply(lambda x: x.replace(' ','_').replace(' ','_').replace(' ','_'))
dataset['类别']
# %%
# Region: second '/'-separated field, first '_'-separated token, with the
# space literal and any remaining whitespace removed.
# Raises IndexError when the text has fewer than two '/'-separated fields.
dataset['地点'] = dataset['类别'].apply(lambda x: re.sub(r'\s+','',x.split('/')[1].split('_')[0].replace(' ','')))
dataset['地点']
# %%
# Category: third '/'-separated field, last '_'-separated token, cleaned the
# same way. This overwrites the raw tail stored in '类别' above.
# Raises IndexError when the text has fewer than three '/'-separated fields.
dataset['类别'] = dataset['类别'].apply(lambda x: re.sub(r'\s+','',x.split('/')[2].split('_')[-1].replace(' ','')))
dataset['类别']
# %%
dataset
# %%
# Line chart: number of films per year.
plt.figure(figsize=(12, 7), dpi=80)
# Presumably chosen so the Chinese labels render — the font must be installed.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Order the counts by year (the index) rather than by count. The year labels
# are four-digit strings, so lexicographic index order is chronological.
# Sorting by count would put the x axis in count order and make the line
# monotonic, hiding the trend over time.
year_counts = dataset['时间'].value_counts().sort_index()
years = list(year_counts.index)
counts = list(year_counts)
plt.plot(years, counts, label='年份电影数', picker=4, color='r')
# Annotate every point with its count.
for year, count in zip(years, counts):
    plt.text(year, count, count)
plt.xticks(years, years, rotation=90)
plt.title('年份电影数变化折线图')
plt.xlabel('年份')
plt.ylabel('数量')
plt.legend(loc='upper left')
# Save before show(): the figure may be cleared once the window is closed.
plt.savefig('年份电影数变化折线图')
plt.show()
# %%
# Pie chart: share of films per region.
plt.figure(figsize=(12, 7), dpi=80)
# Presumably chosen so the Chinese labels render — the font must be installed.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Counts are sorted ascending, so the 20 most frequent regions are the LAST 20
# entries. Slicing the first 20 would keep the rarest regions and drop the
# largest ones whenever more than 20 regions exist. With 20 or fewer regions
# the result is unchanged.
region_counts = dataset['地点'].value_counts(ascending=True).tail(20)
plt.pie(list(region_counts), labels=list(region_counts.index), autopct='%.1f%%')
plt.title('地区电影数量饼图')
# Save before show(): the figure may be cleared once the window is closed.
plt.savefig('地区电影数量饼图')
plt.show()
# %%
# Bar chart: number of films per category, smallest count first.
plt.figure(figsize=(12, 7), dpi=80)
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
category_counts = dataset['类别'].value_counts(ascending=True)
categories = list(category_counts.index)
heights = list(category_counts)
# Bars sit at integer positions 0..n-1; category names are attached as ticks.
positions = range(len(categories))
plt.bar(
    positions,
    heights,
    width=0.7,
    color='blue',
    alpha=0.5,
    edgecolor='c',
    label='数量',
    picker=2,
)
# Print each count slightly left of centre and just above its bar.
for position, height in zip(positions, heights):
    plt.text(position - 0.2, height + 0.1, height)
plt.xticks(positions, categories, rotation=90)
plt.title('类别电影数柱状图')
plt.xlabel('类别')
plt.ylabel('数量')
plt.legend(loc='upper left')
plt.savefig('类别电影数柱状图')
plt.show()
# %%
# Word cloud built from the text in the '简介' column.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Stringify every entry and concatenate them with no separator.
summary_text = ''.join(str(summary) for summary in dataset['简介'])
# Segment with jieba, then space-join the tokens before handing them to WordCloud.
tokens = jieba.lcut(summary_text)
spaced_text = " ".join(tokens)
cloud = WC(
    width=1000,
    height=700,
    background_color='white',
    font_path='msyh.ttc',
)
cloud.generate(spaced_text)
plt.imshow(cloud)
plt.savefig('电影简介词云')
plt.show()
# %%