You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
hnu202109060325 b4bff4732c
Update README.md
3 years ago
README.md Update README.md 3 years ago

README.md

Titanic1

# 1. Load the training and test data, handle missing values, and inspect
#    the basic information of the data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the data.
train = pd.read_csv("/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")
test = pd.read_csv("/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv")

# Missing-value handling. The same three steps are applied to both frames;
# each frame is filled with its OWN mean age (train mean for train, test
# mean for test). Call train.info() / test.info() before or after this loop
# to inspect the per-column non-null counts.
for frame in (train, test):
    # Age: fill missing values with the column mean.
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
    # Remove the Cabin and Fare columns.
    frame.drop(['Cabin', 'Fare'], axis=1, inplace=True)
    # Embarked: fill missing values with port 'S'.
    frame['Embarked'] = frame['Embarked'].fillna('S')

# Optional inspection of the cleaned data (size and detailed information):
#     print(train.shape)
#     train.info()

# 2. Show the first 10 rows of the data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the data; nrows=10 makes pandas parse only the first 10 data rows
# instead of loading the whole file and slicing afterwards.
data2 = pd.read_csv(
    "/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv",
    nrows=10,
)
print(data2)

# 3. Show the survival ratio of male and female passengers as bar charts.
#    Relies on `train` and `plt` created in section 1.

# Survival by sex.
# Select the Sex and Survived columns, group by Sex, and plot the mean of
# Survived per group as a bar chart (presumably Survived is coded 0/1, so
# the mean is the survival ratio - confirm against the data).
train[['Sex', 'Survived']].groupby('Sex').mean().plot.bar()

# Passenger counts for every (Sex, Survived) combination.
survive_sex = train.groupby(['Sex', 'Survived'])['Survived'].count()

# Relationship between survival and sex.
# The task statement does not say which parameters the male/female survival
# plot should use, so a second chart is drawn: all recorded passengers with
# Survived on the x-axis, passenger count on the y-axis, split by sex.
Survived_Sex = train['Sex'].groupby(train['Survived'])
Survived_Sex.value_counts().unstack().plot(kind='bar')
plt.show()

# 4. Show the survival ratio per passenger class as a bar chart.
#    Relies on `train` and `plt` created in section 1.

# Select the Pclass and Survived columns, group by Pclass, and plot the mean
# of Survived per class (presumably Survived is coded 0/1 - confirm).
train[['Pclass', 'Survived']].groupby('Pclass').mean().plot.bar()
# Display the figure, as section 3 does for its charts.
plt.show()

# 5. Predict survival: choose a model and evaluate it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the data again so this section starts from the raw files.
train = pd.read_csv("/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")
test = pd.read_csv("/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv")

# Missing-value handling.
# (An earlier train-only cleaning pass was disabled here in favour of
# cleaning the combined frame below.)

# Stack train on top of test so both are cleaned in one go. The original
# row labels are kept, so later code must split positionally (iloc).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset = pd.concat([train, test], sort=False)

# Rows with a known age, kept to help decide how to treat the missing ones.
dataset_age = dataset[dataset['Age'].notnull()]
# Age: fill missing values with the mean over the combined frame.
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
# Remove the columns that are not used as model features. Embarked is
# dropped too, so its missing values need no filling here.
dataset.drop(
    ['Cabin', 'Fare', 'PassengerId', 'Name', 'Ticket', 'Embarked'],
    axis=1,
    inplace=True,
)
dataset.info()

# Encode the Sex column numerically: male -> 1, female -> 0.
# Any value not present in the mapping becomes NaN (Series.map behaviour).
sexdict = {'male': 1, 'female': 0}
dataset['Sex'] = dataset['Sex'].map(sexdict)

# Show the first row to check the encoding; printed explicitly because a
# bare expression is only displayed inside a notebook cell.
print(dataset.head(1))

# Define the training and test sets.
# `dataset` is train stacked on top of test, so the first len(train) rows
# (891 in the original script) are the training rows and the rest are the
# test rows.
n_train = len(train)

# drop() without inplace returns a new frame, which avoids the
# SettingWithCopyWarning raised by dropping in place on an iloc slice.
x_train = dataset.iloc[:n_train, :].drop(columns=['Survived'])
y_train = dataset['Survived'].iloc[:n_train]

x_test = dataset.iloc[n_train:, :].drop(columns=['Survived'])

# NOTE(review): this re-reads the same file that `test` was loaded from,
# i.e. the test FEATURES, not a file of survival labels - confirm which
# file should supply the ground truth before using y_test for scoring.
y_test = pd.read_csv('/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv')  # test set
y_test = np.squeeze(y_test)

# Printed explicitly; a bare tuple expression is only shown in a notebook.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Logistic Regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Number of trailing training rows held out for evaluation.
HOLDOUT_ROWS = 100

model = LogisticRegression()

# Fit on every training row except the held-out tail.
model.fit(x_train.iloc[:-HOLDOUT_ROWS, :], y_train.iloc[:-HOLDOUT_ROWS])

# Evaluate with accuracy_score on the held-out rows. The arguments are
# passed in the documented (y_true, y_pred) order and as 1-D data, so no
# reshape is needed.
holdout_pred = model.predict(x_train.iloc[-HOLDOUT_ROWS:, :])
holdout_accuracy = accuracy_score(y_train.iloc[-HOLDOUT_ROWS:], holdout_pred)
print(holdout_accuracy)  # the author reported a result of 0.8