|
|
@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
import io

import matplotlib.pyplot as plt
import pandas as pd

from mydisplay import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Matplotlib text setup, done in two steps:
#   step 1 - replace the sans-serif font list with SimHei;
#   step 2 - turn off the Unicode minus glyph, which fixes how the minus sign
#            of negative axis values is displayed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the sales data set; the file is decoded as GBK.
df = pd.read_csv('sales_data_sample9.csv', encoding='gbk')

# Overall summary of the data. DataFrame.info() writes its report to stdout
# and returns None, so passing its return value to myprint would hand over
# None. Capture the report in a text buffer and display that instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
myprint(info_buffer.getvalue(), 'df.info()')

myprint(df.head(3), 'df.head(3)')  # first n rows
myprint(df.tail(3), 'df.tail(3)')  # last n rows
myprint(df.shape, 'df.shape')  # size of the data set (rows, columns)
myprint(df.dtypes, 'df.dtypes')  # data type of every column
|
|
|
|
|
|
|
|
# Number of non-null values per column. The bare expression `df.count()`
# shows nothing when run as a script, so route it through myprint like the
# other inspection calls.
myprint(df.count(), 'df.count()')

# Missing-value handling, shown as a sequence of alternatives.
# NOTE: the first dropna() already removes every row containing a NaN, so the
# later steps have nothing left to change; they are kept to demonstrate the API.
df = df.dropna()  # default: drop every row that contains any NaN
df = df.dropna(how='all')  # drop only rows that are entirely NaN
df = df.dropna(how='all', axis=1)  # drop only columns that are entirely NaN
df = df.fillna(0)  # fill missing data with 0
myprint(df)

# Fill with the column mean. Assign the result back to the column: calling
# fillna(..., inplace=True) on the selection df['金额'] is a chained in-place
# operation, which pandas deprecates and which does not update df under
# Copy-on-Write.
df['金额'] = df['金额'].fillna(df['金额'].mean())
|
|
|
|
|
|
|
|
# Basic statistical methods
# • sum()                  sum
# • max()                  maximum;  idxmax()  index label of the maximum
# • min()                  minimum;  idxmin()  index label of the minimum
# • var()                  variance; std()     standard deviation
# • median()               median;   mean()    mean
# • nlargest(n, column)    the n largest values of that column
# • nsmallest(n, column)   the n smallest values of that column
# • describe(): computes count/frequency, mean, standard deviation,
#   extremes (min/max) and quartiles for each column
# • Exercise: find the order with the largest amount in 2003, the row index
#   of the smallest unit price, and the 5 orders with the largest amounts
|
|
|
|
|
|
|
|
# Show the rows ordered by order number, then by order line number
# (ascending order is the sort_values default).
print(df.sort_values(['订单号', '订单行号']))

# Grouped aggregates.
dfg1 = df.groupby('订单号')['金额'].sum()  # total sales amount of each order
dfg2 = df.groupby(['年份', '订单号'])['金额'].sum()  # same, split by year
# size vs. count: size includes NaN values when counting, count does not.
dfg3 = df.groupby('年份').size()
dfg4 = df['订单号'].nunique()  # number of distinct order numbers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bar chart: total sales amount of each quarter in 2004.
m = df.loc[df['年份'] == 2004].groupby('季度')['金额'].sum()

plt.bar(m.index, m.values)  # draw the bars
plt.xticks([1, 2, 3, 4])  # x-axis tick positions
plt.ylim(0, 2500000)  # y-axis value range
plt.xticks([1, 2, 3, 4], ['春', '夏', '秋', '冬'])  # relabel the ticks

# Value labels: write each total just above its bar, centred horizontally.
for x, y in m.items():
    plt.text(x, y, f'{y:.2f}', ha='center', va='bottom')

plt.show()

# Exercise: compare total sales across different years
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Pie chart: share of the 2004 sales amount contributed by each quarter.
m = df.loc[df['年份'] == 2004].groupby(['季度'])['金额'].sum()

# Offset the fourth wedge from the centre; the other three stay in place.
explode = [0, 0, 0, 0.1]
plt.pie(
    m,
    labels=['春', '夏', '秋', '冬'],
    explode=explode,
    autopct='%.2f%%',  # percentage text with two decimals
)
plt.legend()
plt.show()

# Exercise: share of the number of orders per quarter in 2004
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Line chart: monthly total sales amount, one line per year.
m = df[df['年份'] == 2003].groupby(by=['月份'])['金额'].sum()  # monthly totals, 2003
n = df[df['年份'] == 2004].groupby(by=['月份'])['金额'].sum()  # monthly totals, 2004
k = df[df['年份'] == 2005].groupby(by=['月份'])['金额'].sum()  # monthly totals, 2005

# Plot every series against its own index. The 2005 line used to be drawn
# from k.index with n.values (the 2004 data), which showed the wrong numbers
# and fails outright when the two years cover a different number of months.
for year_label, monthly_total in (('2003', m), ('2004', n), ('2005', k)):
    plt.plot(monthly_total.index, monthly_total.values, label=year_label)

plt.legend()
plt.title('不同年份销售额曲线对比')
plt.grid(True)
plt.xticks(range(1, 13), ['%d月' % x for x in range(1, 13)])  # one tick per month
plt.ticklabel_format(style='plain', axis='y')  # plain numbers on the y axis
plt.show()

# Exercise: compare the number of orders per month across years
|