|
|
|
|
@ -0,0 +1,90 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
Created on Tue Dec 7 10:23:21 2021

@author: Administrator
"""
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
# Part 1: sorting, summary statistics, per-row ratios and concatenation
# on the GBK-encoded score table.
df1 = pd.read_csv("score.csv", encoding="gbk")
print(df1)

# --- Sorting ---
df2 = df1.sort_values(by=['语文'], ascending=True)  # ascending by the 语文 column
print(df2)
df3 = df1.sort_values(by=['数学'], ascending=False)  # descending by the 数学 column
print(df3)

# Sorting on two keys at once: rows are ordered by the primary key first and
# ties are broken by the secondary key. `ascending` is given per key, so this
# is 数学 descending, then 语文 ascending.
df4 = df1.sort_values(by=['数学', '语文'], ascending=[False, True])
print(df4)

# --- Basic statistics ---
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# extra record is wrapped in a one-row frame and concatenated instead.
# NOTE(review): assumes score.csv has the columns num/name/语文/数学/英语;
# any key that is not an existing column becomes a new column.
extra_row = pd.DataFrame(
    [{"num": 106, "name": "Jimmy", "语文": 90, "数学": 90, "英语": 100}]
)
df2 = pd.concat([df1, extra_row], ignore_index=True)
print(df2.describe())

print(df2.语文.max())      # maximum
print(df2.英语.idxmax())   # index label of the maximum
print(df2.语文.min())      # minimum
print(df2.语文.idxmin())   # index label of the minimum
print(df2.语文.mean())     # mean of each subject
print(df2.数学.mean())
print(df2.英语.mean())
print(df2.英语.median())   # median
print(df2.数学.sum())      # sum
print(df2.语文.var())      # variance
print(df2.nlargest(2, "数学"))   # the 2 rows with the largest 数学 values
print(df2.nsmallest(2, "数学"))  # the 2 rows with the smallest 数学 values
print(df2["英语"].unique())      # distinct values of the column, as an array
print(df2["英语"].nunique())     # number of distinct values in the column

# Average of the three subjects, rounded to 4 decimals first and then to 2.
# round() returns a new object, so the result has to be assigned back.
df2["平均分"] = ((df2["语文"] + df2["数学"] + df2["英语"]) / 3).round(4)
df2["平均分"] = df2["平均分"].round(2)
df3 = df2.sort_values(by=['平均分'], ascending=False)

# --- Ratios ---
# Divide every subject score by the row total, giving each subject's share
# of that row's combined score.
df5 = df2[["语文", "数学", "英语"]]
df6 = df5.div(df5.sum(axis=1), axis=0)

# --- Combining data sets ---
df7 = df2.copy()  # df7 is an independent copy of df2
df8 = pd.concat([df2, df7], ignore_index=True)  # stack along rows (more rows)
print(df8)
# Stack along columns (more columns); both inputs share the same column
# names, so the result contains each name twice.
df9 = pd.concat([df2, df7], axis=1)
print(df9)
|
|
|
|
|
|
|
|
|
|
# Part 2: exploring the sales sample data and grouped statistics.
# NOTE(review): the path below is an absolute path on the original author's
# machine; it is kept unchanged here.
df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\python学习\\python数据分析\\sales_data_sample.csv", encoding='gbk')
df.info()  # overall summary of the data set (info() prints by itself)
# NOTE(review): the bare expressions in this part (describe(), dtypes, count(),
# unique(), ...) discard their result when the file runs as a script;
# presumably they were meant for an interactive console - confirm.
df.describe()
print(df.shape)  # size of the data set as a (rows, columns) tuple
# Original author's note: (2823 rows, 9 columns)
df.dtypes   # data type of each column
df.count()  # number of non-missing values per column
df = df.dropna()  # drop every row that has a missing value

# Exercise: for year 2003, find the order with the largest amount, the row
# index of the smallest unit price, and the 5 orders with the largest amount.
df1 = df[df["年份"] == 2003]
print(df1.金额.idxmax())
print(df1.单价.idxmin())
# nlargest/nsmallest return the top/bottom rows directly, without a full sort.
# The exercise asks for the 5 LARGEST amounts, so nlargest is used here
# (the original called nsmallest, which returned the 5 smallest).
print(df1.nlargest(5, "金额"))

# --- Grouped statistics ---
# groupby(by=column name or list of names) returns a GroupBy object; common
# aggregations on it are sum(), mean(), size(), count(), unique(), nunique().
df2 = df.groupby(by='订单号')['金额'].sum()  # total amount per order number
df_ = df2.sort_values(ascending=False)
print(df_.index[0])  # order number with the largest total amount
print(df_.max())     # that largest total amount

# size() counts all rows in each group, NaN included; count() counts only
# non-NaN values per column.
df3 = df.groupby(by='年份').size()
lst1 = list(df3)  # the per-year row counts as a plain list
df.groupby(by='年份').count()
df.订单号.unique()   # array of distinct order numbers
# len(df.订单号.unique()) equals df.订单号.nunique()
df.订单号.nunique()  # number of distinct order numbers

# Exercise: total sales per quarter in ascending order, and the number of
# order numbers per quarter.
df4 = df.groupby(by="季度")["金额"].sum()
print(df4.sort_values())  # sort_values() is ascending by default
# NOTE(review): size() counts rows per quarter; if the exercise means distinct
# order numbers, nunique() would be the matching aggregation - confirm intent.
df5 = df.groupby(by="季度")["订单号"].size()
print(df5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|