ADD file via upload

4 years ago · eaca86cb1f
parent bddb8c0169
commit eaca86cb1f
1 changed files with 223 additions and 0 deletions
--- a/pandas库.py
+++ b/pandas库.py
@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Dec  5 14:56:04 2021
+
+@author: Administrator
+"""
+
+import pandas as pd
+import numpy as np
+
+# --- Series ---
+# Build a Series from a list
+a1=pd.Series([1,2,3,np.nan,6,8])# a 1-D structure comparable to np.array([]) or a list
+# np.nan stands for a missing value
+#Out: a Series gets an integer index by default; the index= argument sets other labels
+#0    1.0
+#1    2.0
+#2    3.0
+#3    NaN
+#4    6.0
+#5    8.0
+#dtype: float64
+a2=pd.Series([10,20,30,np.nan,60,80],index=["a","b","c","d","e","f"])
+# Build a Series from a dict
+dict1={"he":80,"huang":90,"liu":100,"peng":98,"yang":88}
+a3=pd.Series(dict1)
+
+print(a1.index)# show the index of a Series
+print(a2.index)
+print(a3.index[0])# index label at position 0
+x1=a1.values# the values as an np.array; the Series shows them next to its index, a bit like an ordered dict
+#=x1.astype("int")  NOTE(review): truncated, disabled statement -- presumably x1=x1.astype("int"); confirm before re-enabling
+print(x1)
+print(a1.values[0])# value at position 0
+# Slicing works much like np.array (not repeated here); once the index is changed, label-based slices use the new labels
+# Name the index
+a1.index.name="索引"
+print(a1)
+#Out: 
+#索引
+#0    1.0
+#1    2.0
+#2    3.0
+#3    NaN
+#4    6.0
+#5    8.0
+#dtype: float64
+
+# --- DataFrame: a two-dimensional labelled structure ---
+date = pd.date_range("2018 01 01", periods=6)  # a run of six timestamps that could serve as a row index
+# Build from a dict: pd.DataFrame(dict, columns=..., index=...). A Series value is aligned on its index, so C carries the row labels (a default 0..3 index would leave C all NaN).
+df1 = pd.DataFrame({"A": 1.0,
+                    "B": pd.Timestamp("2021 10 01"),
+                    "C": pd.Series([10, 20, 30, 40], index=['one', 'two', 'three', 'four']),
+                    "D": np.array([3] * 4)}, index=['one', 'two', 'three', 'four'])
+data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
+        'year': [2000, 2001, 2002, 2001, 2002],
+        'popu': [1.5, 1.7, 3.6, 2.4, 2.9]}
+df2 = pd.DataFrame(data)
+print(df2)
+# columns= fixes the column order, index= sets the row labels
+df3 = pd.DataFrame(data,
+                   columns=['year', 'state', 'popu', 'debt'],
+                   index=['one', 'two', 'three', 'four', 'five'])
+# data has no 'debt' key, so that column starts out as NaN
+df3["debt"] = 1  # broadcast 1 to every row of debt
+print(df3)
+# DataFrame attributes
+df3.shape
+#Out[26]: (5, 4) -- a tuple: five rows, four columns
+df3.ndim
+#Out[27]: 2
+df3.dtypes
+#Out[30]: 
+#year       int64
+#state     object
+#popu     float64
+#debt       int64
+#dtype: object
+
+# Inspecting the data
+df3.index
+#Out[31]: Index(['one', 'two', 'three', 'four', 'five'], dtype='object') -- an Index object
+df3.columns
+#Index(['year', 'state', 'popu', 'debt'], dtype='object') -- an Index object
+df3.values  # the contents as a 2-D array
+#Out[33]: 
+#array([[2000, 'Ohio', 1.5, 1],
+#       [2001, 'Ohio', 1.7, 1],
+#       [2002, 'Ohio', 3.6, 1],
+#       [2001, 'Nevada', 2.4, 1],
+#       [2002, 'Nevada', 2.9, 1]], dtype=object)
+df3.head(3)  # first three rows
+#Out[34]: 
+#       year state  popu  debt
+#one    2000  Ohio   1.5     1
+#two    2001  Ohio   1.7     1
+#three  2002  Ohio   3.6     1
+df3.tail(2)  # last two rows
+#Out[35]: 
+#      year   state  popu  debt
+#four  2001  Nevada   2.4     1
+#five  2002  Nevada   2.9     1
+df3.info()
+df3.describe()
+
+# --- Basic file operations ---
+# Loading data: csv, excel (xls), txt
+# Read
+df11=pd.read_excel("学生体检.xlsx")# NOTE(review): the original note says "gbk" makes Chinese readable, but no encoding is passed in this call
+# For a sheet without a header row, pass header=None
+# names: custom column labels
+print(df11)
+# Save
+#df11.to_csv("score new.csv",index=False)  -- would write df11 to a new csv file without the index
+
+#df["   "] selects one column and yields a pd.Series
+#df[["   "]] selects a list of columns and yields a pd.DataFrame
+
+# pandas offers two indexers: loc and iloc
+# .loc: selection by row label and column label
+print(df11.loc[[0],["sex"]])# pick one or several rows
+e1=df11.loc[[0],["sex"]]
+e2=df11.loc[[1],["sex"]]
+list(df11["weight"])# a column converts straight to a list for further lookups (the result is not kept here)
+print(df11.loc[0:3])
+print(df11.loc[[0,3]])
+print(df11.loc[0:2,['id','sex']])
+print(df11.loc[::,['year','weight']])# all rows of the listed columns; a slice is written bare, a selection of separate labels goes inside []
+# .iloc: selection by integer row and column position
+print(df3.iloc[[0,2],[0,2]])
+df=df11.groupby(by="sex")# group the rows by the "sex" column
+print(df.get_group(2))# the group whose "sex" value is 2
+# --- Adding ---
+# Add a row
+# Adding a row yields a new object, so the result must be assigned
+dict1 = {'year': 2003, 'state': 'Louisiana', 'popu': 1.4}  # the new row as a dict
+a1 = pd.Series(dict1)  # the same row as a Series
+a1.name = "Six"  # the name becomes the label of the new row
+# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame replaces it
+df4 = pd.concat([df3, a1.to_frame().T.infer_objects()])  # row "Six" at the end; the missing debt is NaN
+# A dict row works the same way once wrapped in a one-row DataFrame
+# ignore_index=True discards the labels and renumbers the rows 0, 1, 2, ...
+df4 = pd.concat([df3, pd.DataFrame([dict1])], ignore_index=True)
+print(df4)
+# Add a column
+df4["序列"] = range(1, len(df4) + 1)  # new column "序列" holding 1, 2, 3, ...
+df4 = pd.DataFrame(df4, columns=["序列", "year", "state", "popu", "debt"])  # reorder the columns
+df4['total'] = df4['popu'] + df4['debt']  # new column: row-wise "popu" + "debt"
+# Deleting
+# inplace=True modifies the object itself and returns None, so do not assign the result
+df5 = df4.drop([5])  # drop rows by label: df4.drop([labels])
+print(df5)
+df4.drop('序列', axis=1, inplace=True)  # drop a column
+print(df4)
+
+# Reading and changing a column's dtype:
+df3['popu'].dtype
+df3['popu']=df3['popu'].astype(int)# cast the "popu" column to int
+df3['popu'].dtype
+# Changing values
+df3=pd.DataFrame(data,
+                 columns=['year', 'state', 'popu', 'debt'],
+                 index=['one', 'two', 'three', 'four', 'five'])
+df3["debt"]=1
+df3.iloc[2,2]=1.0# set the third row, third column (counting from one) to 1.0
+print(df3)
+df4.iloc[5,3]=1.0# set row position 5, column position 3 to 1.0
+df4.loc[5,'total'] = df4.loc[5,'popu']+df4.loc[5,'debt']# recompute "total" in the row labelled 5 as that row's "popu" plus its "debt"
+print(df4)
+
+# Conditional selection
+df11.loc[df11.height>170]# all rows with height > 170
+df11.loc[df11.height>170][:10]# the same rows, keeping only the first 10
+df11.loc[(df11.height>175) & (df11.sex==1)]# rows with height > 175 and sex == 1
+df11.loc[(df11.height<175) | (df11.sex==2)]# rows with height < 175 or sex == 2
+
+# --- Handling missing and abnormal values ---
+# Reset the data
+df3 = pd.DataFrame(data,
+                   columns=['year', 'state', 'popu', 'debt'],
+                   index=['one', 'two', 'three', 'four', 'five'])
+df3["debt"] = 1.0
+dict1 = {'year': 2003, 'state': 'Louisiana', 'popu': 1.4}  # a row without a debt value
+df4 = pd.concat([df3, pd.DataFrame([dict1])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
+df4["total"] = df4["popu"] + df4["debt"]  # row 5 has no debt, so its total is NaN
+# Detecting missing values
+df4.isnull()  # a boolean mask over the whole frame
+df4.loc[::, ["total"]].isnull()  # the mask for the "total" column only: select, then isnull()
+# Filling missing values
+df4["debt"] = df4["debt"].fillna(1.0)  # assign back: inplace fillna on a selected column is not guaranteed to reach df4
+# Dropping missing values
+# axis=0 drops rows, axis=1 drops columns
+# inplace=True modifies df4 itself
+df4.dropna(inplace=True)  # removes the rows that still hold NaN (row 5, whose total is NaN)
+#df.dropna(how='all')  would drop only rows that are entirely NaN
+#df.dropna(how='all',axis=1)  would drop only columns that are entirely NaN
+
+# --- Converting data formats ---
+df=pd.read_csv('sales_data_sample.csv',encoding='gbk')# read the csv file, decoding it as gbk
+print(df.iloc[:5])# first five rows
+df["日期"].dtypes# dtype of the "日期" column before the conversion
+df["日期"]=df["日期"].astype("str")# convert the "日期" column to str
+df["日期"].dtypes# dtype of the "日期" column after the conversion
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+