# python/pandas库.py

# -*- coding: utf-8 -*-
"""
Created on Sun Dec  5 14:56:04 2021

@author: Administrator
"""

import pandas as pd
import numpy as np

# --- Series basics ---------------------------------------------------------
# A Series is a one-dimensional labelled structure, comparable to
# np.array([]) or a list; np.nan stands for a missing value.
# Built from a list, the index defaults to integers (an explicit index can
# be supplied through an extra argument, as for a2). a1 prints as:
# 0    1.0
# 1    2.0
# 2    3.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64
a1 = pd.Series([1, 2, 3, np.nan, 6, 8])
a2 = pd.Series(
    [10, 20, 30, np.nan, 60, 80],
    index=list("abcdef"),
)

# Built from a dict, the keys become the index labels.
dict1 = dict(he=80, huang=90, liu=100, peng=98, yang=88)
a3 = pd.Series(dict1)

# Looking at the index: the whole index, then one label by position.
print(a1.index)
print(a2.index)
print(a3.index[0])

# .values is the underlying NumPy array; a Series is that array together
# with its index -- a bit like a dict, but ordered.
x1 = a1.values
print(x1)
print(x1[0])  # value at position 0

# Slicing works much like np.array. Once a custom index has been set,
# label-based slices use the new labels rather than 0, 1, 2, ...

# Give the index itself a name; it is shown above the labels when printing:
# 索引
# 0    1.0
# 1    2.0
# 2    3.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64
a1.index.name = "索引"
print(a1)

# --- DataFrame: a two-dimensional labelled structure -----------------------
# A run of six consecutive dates, usable as a row index.
date = pd.date_range("2018 01 01", periods=6)

# Build a DataFrame from a dict: pd.DataFrame(dict, columns=..., index=...).
# Scalars ("A", "B") are broadcast to every row. A Series value is aligned
# on its index labels, so "C" has to carry the same labels as the frame;
# with the default 0..3 index nothing would match and the column would be
# entirely NaN.
row_labels = ['one', 'two', 'three', 'four']
df1 = pd.DataFrame({"A": 1.0,
                    "B": pd.Timestamp("2021 10 01"),
                    "C": pd.Series([10, 20, 30, 40], index=row_labels),
                    "D": np.array([3] * 4)},
                   index=row_labels)

# A dict of equal-length lists: each key becomes a column.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'popu': [1.5, 1.7, 3.6, 2.4, 2.9]}
df2 = pd.DataFrame(data)
print(df2)

# `columns` fixes the column order and `index` sets the row labels.
# 'debt' is not a key of `data`, so that column starts out as NaN.
df3 = pd.DataFrame(data,
                   columns=['year', 'state', 'popu', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
df3["debt"] = 1  # assigning a scalar fills the whole column
print(df3)
# --- DataFrame attributes --------------------------------------------------
# The "Out" comments are sample results from an interactive session.
df3.shape
# Out: (5, 4) -- a tuple: five rows, four columns
df3.ndim
# Out: 2
df3.dtypes
# Out:
# year       int64
# state     object
# popu     float64
# debt       int64
# dtype: object

# --- Inspecting the data ---------------------------------------------------
df3.index
# Out: Index(['one', 'two', 'three', 'four', 'five'], dtype='object')
# (an array-like Index object)
df3.columns
# Out: Index(['year', 'state', 'popu', 'debt'], dtype='object')
# (an array-like Index object)
df3.values  # the data as a multi-dimensional array
# Out:
# array([[2000, 'Ohio', 1.5, 1],
#        [2001, 'Ohio', 1.7, 1],
#        [2002, 'Ohio', 3.6, 1],
#        [2001, 'Nevada', 2.4, 1],
#        [2002, 'Nevada', 2.9, 1]], dtype=object)
df3.head(n=3)  # first three rows
# Out:
#        year state  popu  debt
# one    2000  Ohio   1.5     1
# two    2001  Ohio   1.7     1
# three  2002  Ohio   3.6     1
df3.tail(n=2)  # last two rows
# Out:
#       year   state  popu  debt
# four  2001  Nevada   2.4     1
# five  2002  Nevada   2.9     1
df3.info()
df3.describe()

# --- Basic file operations -------------------------------------------------
# Data can be loaded from csv, excel (xls) and txt files.
# Reading:
#   - pass header=None when the table has no header row
#   - pass names=[...] to supply custom column labels
# Original note: "gbk" can read Chinese text (presumably an encoding
# option -- TODO confirm whether it is relevant to read_excel).
df11 = pd.read_excel("学生体检.xlsx")
print(df11)
# Saving: df11.to_csv("score new.csv", index=False) writes df11 to a new
# csv file without the index.

# df["..."] selects a column as a pd.Series;
# df[["..."]] selects it as a pd.DataFrame.

# pandas has two indexers: .loc and .iloc.
# .loc selects by row label and column name.
e1 = df11.loc[[0], ["sex"]]  # one or more rows, given as a list of labels
print(e1)
e2 = df11.loc[[1], ["sex"]]
list(df11["weight"])  # a column converts directly to a list for plain access
print(df11.loc[0:3])
print(df11.loc[[0, 3]])
print(df11.loc[0:2, ['id', 'sex']])
# Rows and columns together: a slice is written bare, while a selection of
# individual labels goes inside a list.
print(df11.loc[:, ['year', 'weight']])

# .iloc selects by integer row and column position.
print(df3.iloc[[0, 2], [0, 2]])

# Group rows by the "sex" column and show the group whose key is 2.
df = df11.groupby("sex")
print(df.get_group(2))
# --- Adding rows and columns -----------------------------------------------
# Adding a row produces a new object, so the result has to be assigned.
dict1 = {'year': 2003, 'state': 'Louisiana', 'popu': 1.4}  # the new row
a1 = pd.Series(dict1)
a1.name = "Six"  # the Series name becomes the new row's index label

# DataFrame.append was removed in pandas 2.0, so rows are added with
# pd.concat instead. The Series is wrapped into a one-row frame first;
# columns it does not provide ("debt") are NaN in the new row.
df4 = pd.concat([df3, pd.DataFrame([a1])])

# The dict can be used directly as a one-row frame. ignore_index=True
# discards the existing labels and renumbers the rows 0, 1, 2, ...
df4 = pd.concat([df3, pd.DataFrame([dict1])], ignore_index=True)
print(df4)

# Add a column "序列" holding 1, 2, 3, ... in row order.
df4["序列"] = range(1, len(df4) + 1)
# Re-select the columns so that "序列" comes first.
df4 = pd.DataFrame(df4, columns=["序列", "year", "state", "popu", "debt"])
# A new column computed row by row from "popu" + "debt".
df4['total'] = df4['popu'] + df4['debt']

# --- Deleting ---------------------------------------------------------------
# Without inplace=True, drop returns a new object and leaves df4 untouched.
df5 = df4.drop([5])  # drop rows by index label: df4.drop([label])
print(df5)
# With inplace=True the original object is modified directly, so the result
# must not be assigned to another variable.
df4.drop('序列', axis=1, inplace=True)  # axis=1 drops a column
print(df4)

# --- Reading and changing a column's dtype ---------------------------------
df3['popu'].dtype
df3 = df3.astype({'popu': int})  # cast only the "popu" column
df3['popu'].dtype

# --- Changing individual values --------------------------------------------
# Rebuild df3 from `data` so the cast above does not carry over.
df3 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'popu', 'debt'],
)
df3["debt"] = 1
# Positions are zero-based: [2, 2] is the third row, third column.
df3.iloc[2, 2] = 1.0
print(df3)

# Set the cell at row position 5, column position 3 of df4.
df4.iloc[5, 3] = 1.0
# Recompute "total" for the row labelled 5 as that row's "popu" + "debt".
df4.at[5, 'total'] = df4.at[5, 'popu'] + df4.at[5, 'debt']
print(df4)

# --- Conditional (boolean) selection ---------------------------------------
taller_than_170 = df11["height"] > 170
df11.loc[taller_than_170]  # every row with height > 170
df11.loc[taller_than_170].head(10)  # the same rows, first 10 only

# Conditions combine with & (and) and | (or); each comparison needs its own
# parentheses.
df11.loc[(df11["height"] > 175) & (df11["sex"] == 1)]  # height > 175 and sex == 1
df11.loc[(df11["height"] < 175) | (df11["sex"] == 2)]  # height < 175 or sex == 2

# --- Handling missing values and outliers ----------------------------------
# Set up the data again.
df3 = pd.DataFrame(data,
                   columns=['year', 'state', 'popu', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
df3["debt"] = 1.0
dict1 = {'year': 2003, 'state': 'Louisiana', 'popu': 1.4}  # row without "debt"
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame
# instead. The new row has NaN in "debt".
df4 = pd.concat([df3, pd.DataFrame([dict1])], ignore_index=True)
df4["total"] = df4["popu"] + df4["debt"]  # NaN wherever "debt" is NaN

# Detecting missing values.
df4.isnull()  # True/False for every cell
df4.loc[::, ["total"]].isnull()  # only the "total" column: select, then isnull()

# Filling missing values.
# Assign the result back: calling fillna(inplace=True) on df4["debt"] acts
# on an intermediate object and is not guaranteed to update df4 itself.
df4["debt"] = df4["debt"].fillna(1.0)

# Dropping missing values.
#   axis=0 or 1 chooses rows or columns
#   inplace=True modifies df4 directly
# "total" was computed before "debt" was filled, so the added row still has
# a NaN there and is removed here.
df4.dropna(inplace=True)
# df.dropna(how='all')          # drop only rows that are entirely NaN
# df.dropna(how='all', axis=1)  # drop only columns that are entirely NaN

# --- Converting a column's data type ---------------------------------------
df = pd.read_csv('sales_data_sample.csv', encoding='gbk')
print(df.head(5))  # first five rows

# Check the dtype of the "日期" column, convert it to strings, check again.
df["日期"].dtype
df["日期"] = df["日期"].astype("str")
df["日期"].dtype