You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
898 lines
57 KiB
898 lines
57 KiB
|
|
<!DOCTYPE HTML>
|
|
<html lang="zh-Hans">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
|
|
<title>探索性数据分析(EDA) · GitBook</title>
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
|
|
<meta name="description" content="">
|
|
<meta name="generator" content="GitBook 3.2.3">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../gitbook/style.css">
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<meta name="HandheldFriendly" content="true"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
|
|
<meta name="apple-mobile-web-app-capable" content="yes">
|
|
<meta name="apple-mobile-web-app-status-bar-style" content="black">
|
|
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
|
|
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
|
|
|
|
|
|
<link rel="next" href="feature engerning.html" />
|
|
|
|
|
|
<link rel="prev" href="introduction.html" />
|
|
|
|
|
|
</head>
|
|
<body>
|
|
|
|
<div class="book">
|
|
<div class="book-summary">
|
|
|
|
|
|
<div id="book-search-input" role="search">
|
|
<input type="text" placeholder="Type to search" />
|
|
</div>
|
|
|
|
|
|
<nav role="navigation">
|
|
|
|
|
|
|
|
<ul class="summary">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="chapter " data-level="1.1" data-path="../">
|
|
|
|
<a href="../">
|
|
|
|
|
|
简介
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.2" data-path="../machine_learning.html">
|
|
|
|
<a href="../machine_learning.html">
|
|
|
|
|
|
机器学习概述
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3" data-path="../algorithm.html">
|
|
|
|
<a href="../algorithm.html">
|
|
|
|
|
|
常见机器学习算法
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.3.1" data-path="../kNN.html">
|
|
|
|
<a href="../kNN.html">
|
|
|
|
|
|
近朱者赤近墨者黑-kNN
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.2" data-path="../linear_regression.html">
|
|
|
|
<a href="../linear_regression.html">
|
|
|
|
|
|
最简单的回归算法-线性回归
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.3" data-path="../logistic_regression.html">
|
|
|
|
<a href="../logistic_regression.html">
|
|
|
|
|
|
使用回归的思想进行分类-逻辑回归
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.4" data-path="../decision_tree.html">
|
|
|
|
<a href="../decision_tree.html">
|
|
|
|
|
|
最接近人类思维的算法-决策树
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.5" data-path="../random_forest.html">
|
|
|
|
<a href="../random_forest.html">
|
|
|
|
|
|
群众的力量是伟大的-随机森林
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.6" data-path="../kMeans.html">
|
|
|
|
<a href="../kMeans.html">
|
|
|
|
|
|
物以类聚人以群分-kMeans
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.3.7" data-path="../AGNES.html">
|
|
|
|
<a href="../AGNES.html">
|
|
|
|
|
|
以距离为尺-AGNES
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4" data-path="../metrics.html">
|
|
|
|
<a href="../metrics.html">
|
|
|
|
|
|
模型评估指标
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.4.1" data-path="../classification_metrics.html">
|
|
|
|
<a href="../classification_metrics.html">
|
|
|
|
|
|
分类性能评估指标
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4.2" data-path="../regression_metrics.html">
|
|
|
|
<a href="../regression_metrics.html">
|
|
|
|
|
|
回归性能评估指标
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.4.3" data-path="../cluster_metrics.html">
|
|
|
|
<a href="../cluster_metrics.html">
|
|
|
|
|
|
聚类性能评估指标
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.5" data-path="../sklearn.html">
|
|
|
|
<a href="../sklearn.html">
|
|
|
|
|
|
使用sklearn进行机器学习
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6" >
|
|
|
|
<span>
|
|
|
|
|
|
综合实战案例
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.6.1" >
|
|
|
|
<span>
|
|
|
|
|
|
泰坦尼克生还预测
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.6.1.1" data-path="introduction.html">
|
|
|
|
<a href="introduction.html">
|
|
|
|
|
|
简介
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter active" data-level="1.6.1.2" data-path="EDA.html">
|
|
|
|
<a href="EDA.html">
|
|
|
|
|
|
探索性数据分析(EDA)
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.1.3" data-path="feature engerning.html">
|
|
|
|
<a href="feature engerning.html">
|
|
|
|
|
|
特征工程
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.1.4" data-path="fit and predict.html">
|
|
|
|
<a href="fit and predict.html">
|
|
|
|
|
|
构建模型进行预测
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.1.5" data-path="tuning.html">
|
|
|
|
<a href="tuning.html">
|
|
|
|
|
|
调参
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.2" >
|
|
|
|
<span>
|
|
|
|
|
|
使用强化学习玩乒乓球游戏
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<ul class="articles">
|
|
|
|
|
|
<li class="chapter " data-level="1.6.2.1" data-path="../pingpong/what is reinforce learning.html">
|
|
|
|
<a href="../pingpong/what is reinforce learning.html">
|
|
|
|
|
|
什么是强化学习
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.2.2" data-path="../pingpong/Policy Gradient.html">
|
|
|
|
<a href="../pingpong/Policy Gradient.html">
|
|
|
|
|
|
Policy Gradient原理
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.6.2.3" data-path="../pingpong/coding.html">
|
|
|
|
<a href="../pingpong/coding.html">
|
|
|
|
|
|
使用Policy Gradient玩乒乓球游戏
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
|
|
</ul>
|
|
|
|
</li>
|
|
|
|
<li class="chapter " data-level="1.7" data-path="../recommand.html">
|
|
|
|
<a href="../recommand.html">
|
|
|
|
|
|
实训推荐
|
|
|
|
</a>
|
|
|
|
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
<li class="divider"></li>
|
|
|
|
<li>
|
|
<a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
|
|
Published with GitBook
|
|
</a>
|
|
</li>
|
|
</ul>
|
|
|
|
|
|
</nav>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="book-body">
|
|
|
|
<div class="body-inner">
|
|
|
|
|
|
|
|
<div class="book-header" role="navigation">
|
|
|
|
|
|
<!-- Title -->
|
|
<h1>
|
|
<i class="fa fa-circle-o-notch fa-spin"></i>
|
|
<a href=".." >探索性数据分析(EDA)</a>
|
|
</h1>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<div class="page-wrapper" tabindex="-1" role="main">
|
|
<div class="page-inner">
|
|
|
|
<div id="book-search-results">
|
|
<div class="search-noresults">
|
|
|
|
<section class="normal markdown-section">
|
|
|
|
<h1 id="探索性数据分析eda">探索性数据分析(EDA)</h1>
|
|
<p>探索性数据分析(EDA)说白了就是通过可视化的方式来看看数据中特征与特征之间,特征与目标之间的潜在关系,看看有什么有用的线索可以挖掘,例如哪些数据是噪声,有哪些特征的相关性比较低,后续可以造出哪些新的特征等。</p>
|
|
<h2 id="初窥">初窥</h2>
|
|
<p>当然,在EDA之前先要加载数据,我们不妨先将训练集train.csv读到内存中,并看一看。</p>
|
|
<pre><code class="lang-python"><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
|
|
<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
|
|
<span class="hljs-keyword">import</span> matplotlib.pyplot <span class="hljs-keyword">as</span> plt
|
|
<span class="hljs-keyword">import</span> seaborn <span class="hljs-keyword">as</span> sns
|
|
|
|
data=pd.read_csv(<span class="hljs-string">'./Titanic/train.csv'</span>)
|
|
|
|
<span class="hljs-comment"># 看看data的前5行</span>
|
|
data.head()
|
|
</code></pre>
|
|
<p><img src="../img/32.jpg" alt=""></p>
|
|
<p>从图中可以看出数据是由 11 个特征和 1 个标签(Survived)组成的。其中各个特征和标签的意义如下:</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>特征</th>
|
|
<th>意义</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td>Survived</td>
|
|
<td>是否生还,1表示是,0表示否</td>
|
|
</tr>
|
|
<tr>
|
|
<td>PassengerId</td>
|
|
<td>乘客ID</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Pclass</td>
|
|
<td>船票类型, 总共3种类型:1(一等舱),2(二等舱),3(三等舱)</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Name</td>
|
|
<td>船客姓名</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Sex</td>
|
|
<td>船客性别:female,male</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Age</td>
|
|
<td>船客年龄</td>
|
|
</tr>
|
|
<tr>
|
|
<td>SibSp</td>
|
|
<td>船客的兄弟姐妹妻子丈夫的数量</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Parch</td>
|
|
<td>船客的父母,孩子的数量</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Ticket</td>
|
|
<td>船票</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Fare</td>
|
|
<td>船客在船上所花的钱</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Cabin</td>
|
|
<td>船客的船舱号</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Embarked</td>
|
|
<td>船客登船的口岸:C,Q,S</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<p>了解了数据中各个属性的含义之后,我们可以看看这个数据集中有没有缺失值。</p>
|
|
<pre><code class="lang-python">data.isnull().sum()
|
|
</code></pre>
|
|
<p><img src="../img/33.jpg" alt=""></p>
|
|
<p>可以看出 Age,Cabin 和 Embarked 这三个特征中有缺失值,我们需要处理这些缺失值。怎样处理呢?先不着急,我们可以先看看数据中有哪些信息可以挖掘。</p>
|
|
<h2 id="有多少人活了下来">有多少人活了下来</h2>
|
|
<p>我们首先可以看看训练集中有多少人活了下来。</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">18</span>,<span class="hljs-number">8</span>))
|
|
<span class="hljs-comment"># 生还比例饼图</span>
|
|
data[<span class="hljs-string">'Survived'</span>].value_counts().plot.pie(explode=[<span class="hljs-number">0</span>,<span class="hljs-number">0.1</span>],autopct=<span class="hljs-string">'%1.1f%%'</span>,ax=ax[<span class="hljs-number">0</span>],shadow=<span class="hljs-keyword">True</span>)
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Survived'</span>)
|
|
ax[<span class="hljs-number">0</span>].set_ylabel(<span class="hljs-string">''</span>)
|
|
<span class="hljs-comment"># 生还数量直方图</span>
|
|
sns.countplot(<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Survived'</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/34.jpg" alt=""></p>
|
|
<p>从图中可以看出泰坦尼克沉船事件中还是凶多吉少的。因为在 891 名船客中,只有约 38% 的人幸免于难,那么接下来尝试使用数据集中不同的特征,来看看他们的生还率有多少。其实通过这样一个过程,我们可以看出大概有哪些类型的船客活了下来。</p>
|
|
<h2 id="性别与生还率的关系">性别与生还率的关系</h2>
|
|
<p>首先,看看不同性别的生还者数量。</p>
|
|
<pre><code class="lang-python">data.groupby([<span class="hljs-string">'Sex'</span>,<span class="hljs-string">'Survived'</span>])[<span class="hljs-string">'Survived'</span>].count()
|
|
</code></pre>
|
|
<p><img src="../img/35.jpg" alt=""></p>
|
|
<p>看上去好像女性船客的生还率高一些,我们不妨再可视化一下。</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">18</span>,<span class="hljs-number">8</span>))
|
|
data[[<span class="hljs-string">'Sex'</span>,<span class="hljs-string">'Survived'</span>]].groupby([<span class="hljs-string">'Sex'</span>]).mean().plot.bar(ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Survived vs Sex'</span>)
|
|
sns.countplot(<span class="hljs-string">'Sex'</span>,hue=<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Sex:Survived vs Dead'</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/36.jpg" alt=""></p>
|
|
<p>从图中可以看出一个比较有趣的现象,船上的男人是比女人多了 200 多人,但是女人生还的人数几乎是男人生还的人数的两倍,女人的存活率约为 75% ,而男人的存活率约为 19% 的样子。所以 Sex 这个特征应该是一个能够很好的区分一个人是否生还的特征。而且对于生还来说,好像是女士优先。</p>
|
|
<h2 id="船票类型与生还率的关系">船票类型与生还率的关系</h2>
|
|
<p>船票类型分三个档次,其中 1 为一等舱, 2 为二等舱, 3 为三等舱。既然船舱分三六九等,那么是不是越高级的舱,它的生还率越高呢?</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">18</span>,<span class="hljs-number">8</span>))
|
|
data[<span class="hljs-string">'Pclass'</span>].value_counts().plot.bar(ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Number Of Passengers By Pclass'</span>)
|
|
ax[<span class="hljs-number">0</span>].set_ylabel(<span class="hljs-string">'Count'</span>)
|
|
sns.countplot(<span class="hljs-string">'Pclass'</span>,hue=<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Pclass:Survived vs Dead'</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/37.jpg" alt=""></p>
|
|
<p>虽然说钱不是万能的,但从可视化结果可以看出,一等舱的生还率最高,大约为 63%,二等舱的生还率约为 48% ,而且虽然三等舱的船客人数是最多的,但生还率却是最低的。所以不难看出,金钱地位还是很重要的,也许一等舱周围有比较多的救生设备。</p>
|
|
<h2 id="上流女性与生还率的关系">上流女性与生还率的关系</h2>
|
|
<p>从前两次可视化结果可以看出,女性,上流人士成为了是否能够活下来的关键,那么上流女性(两者的结合)的生还率会不会很高呢?</p>
|
|
<pre><code class="lang-python">sns.factorplot(<span class="hljs-string">'Pclass'</span>,<span class="hljs-string">'Survived'</span>,hue=<span class="hljs-string">'Sex'</span>,data=data)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/38.jpg" alt=""></p>
|
|
<p>从这张图可以看出一等舱的女性(上流女性)的生还率非常高!几乎接近了百分之百!而且二等舱和三等舱的女性的生还率也远比男性的生还率高。这也验证了我们的猜测,在沉船后是优先女性和一等舱的船客的。</p>
|
|
<h2 id="年龄与生还率的关系">年龄与生还率的关系</h2>
|
|
<p>首先可以先看一下训练集中船客的年龄的最值和均值。</p>
|
|
<pre><code class="lang-python">print(<span class="hljs-string">'Oldest Passenger was of:'</span>,data[<span class="hljs-string">'Age'</span>].max(),<span class="hljs-string">'Years'</span>)
|
|
print(<span class="hljs-string">'Youngest Passenger was of:'</span>,data[<span class="hljs-string">'Age'</span>].min(),<span class="hljs-string">'Years'</span>)
|
|
print(<span class="hljs-string">'Average Age on the ship:'</span>,data[<span class="hljs-string">'Age'</span>].mean(),<span class="hljs-string">'Years'</span>)
|
|
</code></pre>
|
|
<p><img src="../img/39.jpg" alt=""></p>
|
|
<p>年纪最大的是80岁的老爷爷或者老太太,最小的是刚出生的小 baby, 平均年龄快 30 岁。这个还是符合常理的。接下来我们看看船舱等级,年龄和生还率的关系,以及性别,年龄和生还率的关系。</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">18</span>,<span class="hljs-number">8</span>))
|
|
sns.violinplot(<span class="hljs-string">"Pclass"</span>,<span class="hljs-string">"Age"</span>, hue=<span class="hljs-string">"Survived"</span>, data=data,split=<span class="hljs-keyword">True</span>,ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Pclass and Age vs Survived'</span>)
|
|
ax[<span class="hljs-number">0</span>].set_yticks(range(<span class="hljs-number">0</span>,<span class="hljs-number">110</span>,<span class="hljs-number">10</span>))
|
|
sns.violinplot(<span class="hljs-string">"Sex"</span>,<span class="hljs-string">"Age"</span>, hue=<span class="hljs-string">"Survived"</span>, data=data,split=<span class="hljs-keyword">True</span>,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Sex and Age vs Survived'</span>)
|
|
ax[<span class="hljs-number">1</span>].set_yticks(range(<span class="hljs-number">0</span>,<span class="hljs-number">110</span>,<span class="hljs-number">10</span>))
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/40.jpg" alt=""></p>
|
|
<p>从可视化结果可以看出:</p>
|
|
<ul>
|
|
<li><p>儿童的数量随着船舱等级的增加而增加,10 岁以下的小朋友存活率仿佛都还挺高的,跟船舱等级好像没有太大关系。</p>
|
|
</li>
|
|
<li><p>来自一等舱的 20-50 岁的船客的存活率很高,而且女性的生还率一如既往地高。</p>
|
|
</li>
|
|
<li><p>对于男性来说,年纪越大,生还率越低。</p>
|
|
</li>
|
|
</ul>
|
|
<p>不过我们的年龄是有缺失值的,如果图简单,可以使用平均年龄来填充缺失的年龄。但是这样做并不合适,比如人家只是个 5 岁的小屁孩,但是你把人家强行改成 29 岁显然是不合适的。那有没有能够更加准确地知道缺失的年龄是多少的方法呢?有!我们可以根据姓名来推断缺失的年龄,因为姓名中有很多类似 Mr 或者 Mrs 这样的前缀,所以我们可以根据姓名的前缀来填充缺失的年龄。</p>
|
|
<h2 id="填充缺失年龄">填充缺失年龄</h2>
|
|
<p>外国人的姓名和我们中国人的姓名不太一样,一般都会有 Mr 、 Mrs 、Miss 、Dr 等特殊前缀。所以我们可以先提取姓名中的前缀。</p>
|
|
<pre><code class="lang-python">data[<span class="hljs-string">'Initial'</span>]=<span class="hljs-number">0</span>
|
|
<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> data:
|
|
data[<span class="hljs-string">'Initial'</span>]=data.Name.str.extract(<span class="hljs-string">'([A-Za-z]+)\.'</span>)
|
|
</code></pre>
|
|
<p>这样我们能够提取出诸如:Capt 、Col 、Don 、Lady 、Major 、Sir 等前缀,接着我们可以将这些前缀替换成 Miss 、 Mr 、 Mrs 、 Other 这四个类别,并统计这四个类别的平均年龄。</p>
|
|
<pre><code class="lang-python">data[<span class="hljs-string">'Initial'</span>].replace([<span class="hljs-string">'Mlle'</span>,<span class="hljs-string">'Mme'</span>,<span class="hljs-string">'Ms'</span>,<span class="hljs-string">'Dr'</span>,<span class="hljs-string">'Major'</span>,<span class="hljs-string">'Lady'</span>,<span class="hljs-string">'Countess'</span>,<span class="hljs-string">'Jonkheer'</span>,<span class="hljs-string">'Col'</span>,<span class="hljs-string">'Rev'</span>,<span class="hljs-string">'Capt'</span>,<span class="hljs-string">'Sir'</span>,<span class="hljs-string">'Don'</span>],[<span class="hljs-string">'Miss'</span>,<span class="hljs-string">'Miss'</span>,<span class="hljs-string">'Miss'</span>,<span class="hljs-string">'Mr'</span>,<span class="hljs-string">'Mr'</span>,<span class="hljs-string">'Mrs'</span>,<span class="hljs-string">'Mrs'</span>,<span class="hljs-string">'Other'</span>,<span class="hljs-string">'Other'</span>,<span class="hljs-string">'Other'</span>,<span class="hljs-string">'Mr'</span>,<span class="hljs-string">'Mr'</span>,<span class="hljs-string">'Mr'</span>],inplace=<span class="hljs-keyword">True</span>)
|
|
|
|
data.groupby(<span class="hljs-string">'Initial'</span>)[<span class="hljs-string">'Age'</span>].mean()
|
|
</code></pre>
|
|
<p><img src="../img/41.jpg" alt=""></p>
|
|
<p>接着可以根据前缀来填充缺失的年龄。</p>
|
|
<pre><code class="lang-python">data.loc[(data.Age.isnull())&(data.Initial==<span class="hljs-string">'Mr'</span>),<span class="hljs-string">'Age'</span>]=<span class="hljs-number">33</span>
|
|
data.loc[(data.Age.isnull())&(data.Initial==<span class="hljs-string">'Mrs'</span>),<span class="hljs-string">'Age'</span>]=<span class="hljs-number">36</span>
|
|
data.loc[(data.Age.isnull())&(data.Initial==<span class="hljs-string">'Miss'</span>),<span class="hljs-string">'Age'</span>]=<span class="hljs-number">22</span>
|
|
data.loc[(data.Age.isnull())&(data.Initial==<span class="hljs-string">'Other'</span>),<span class="hljs-string">'Age'</span>]=<span class="hljs-number">46</span>
|
|
</code></pre>
|
|
<p>填充完缺失值后,可以尝试可视化一下。</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">20</span>,<span class="hljs-number">10</span>))
|
|
data[data[<span class="hljs-string">'Survived'</span>]==<span class="hljs-number">0</span>].Age.plot.hist(ax=ax[<span class="hljs-number">0</span>],bins=<span class="hljs-number">20</span>,edgecolor=<span class="hljs-string">'black'</span>,color=<span class="hljs-string">'red'</span>)
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Survived= 0'</span>)
|
|
x1=list(range(<span class="hljs-number">0</span>,<span class="hljs-number">85</span>,<span class="hljs-number">5</span>))
|
|
ax[<span class="hljs-number">0</span>].set_xticks(x1)
|
|
data[data[<span class="hljs-string">'Survived'</span>]==<span class="hljs-number">1</span>].Age.plot.hist(ax=ax[<span class="hljs-number">1</span>],color=<span class="hljs-string">'green'</span>,bins=<span class="hljs-number">20</span>,edgecolor=<span class="hljs-string">'black'</span>)
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Survived= 1'</span>)
|
|
x2=list(range(<span class="hljs-number">0</span>,<span class="hljs-number">85</span>,<span class="hljs-number">5</span>))
|
|
ax[<span class="hljs-number">1</span>].set_xticks(x2)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/42.jpg" alt=""></p>
|
|
<p>从图中可以看出 5 岁以下的小屁孩的生还率比较高,80 岁的老人活下来了。</p>
|
|
<pre><code class="lang-python">sns.factorplot(<span class="hljs-string">'Pclass'</span>,<span class="hljs-string">'Survived'</span>,col=<span class="hljs-string">'Initial'</span>,data=data)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/43.jpg" alt=""></p>
|
|
<p>嗯,女性和小孩的生还率比较高。</p>
|
|
<h2 id="登船口岸与生还率的关系">登船口岸与生还率的关系</h2>
|
|
<p>先把口岸和生还率的关系画出来。</p>
|
|
<pre><code class="lang-python">sns.factorplot(<span class="hljs-string">'Embarked'</span>,<span class="hljs-string">'Survived'</span>,data=data)
|
|
fig=plt.gcf()
|
|
fig.set_size_inches(<span class="hljs-number">5</span>,<span class="hljs-number">3</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/44.jpg" alt=""></p>
|
|
<p>可以看出从 C 号口岸上船的生还率最高,最低的是 S 号口岸。嗯,好像并没有什么线索,我们可以再深入一点。</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">2</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">20</span>,<span class="hljs-number">15</span>))
|
|
sns.countplot(<span class="hljs-string">'Embarked'</span>,data=data,ax=ax[<span class="hljs-number">0</span>,<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>,<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'No. Of Passengers Boarded'</span>)
|
|
sns.countplot(<span class="hljs-string">'Embarked'</span>,hue=<span class="hljs-string">'Sex'</span>,data=data,ax=ax[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Male-Female Split for Embarked'</span>)
|
|
sns.countplot(<span class="hljs-string">'Embarked'</span>,hue=<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>,<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">1</span>,<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Embarked vs Survived'</span>)
|
|
sns.countplot(<span class="hljs-string">'Embarked'</span>,hue=<span class="hljs-string">'Pclass'</span>,data=data,ax=ax[<span class="hljs-number">1</span>,<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>,<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Embarked vs Pclass'</span>)
|
|
plt.subplots_adjust(wspace=<span class="hljs-number">0.2</span>,hspace=<span class="hljs-number">0.5</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/45.jpg" alt=""></p>
|
|
<p>现在能看出很多信息了:</p>
|
|
<ul>
|
|
<li>上船人数最多的口岸是 S 号口岸,而且在 S 号口岸上船的人大多数都是三等舱的船客。</li>
|
|
<li>C 号口岸上船的生还率最高,可能大部分 C 口岸上船的人是一等舱和二等舱船客吧。</li>
|
|
<li>虽然有很多一等舱的土豪是在 S 口岸上船的,但是 S 口岸的生还率最低。这是因为 S 口岸上船的人中有很多都是三等舱的船客。</li>
|
|
<li>Q 号口岸上船的人中有 90% 多都是三等舱的船客。</li>
|
|
</ul>
|
|
<pre><code class="lang-python">sns.factorplot(<span class="hljs-string">'Pclass'</span>,<span class="hljs-string">'Survived'</span>,hue=<span class="hljs-string">'Sex'</span>,col=<span class="hljs-string">'Embarked'</span>,data=data)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/46.jpg" alt=""></p>
|
|
<p>我们可以看出:</p>
|
|
<ul>
|
|
<li>一等舱和二等舱的女性的生还率几乎为 100%, 这与女性是一等舱还是二等舱没啥关系。</li>
|
|
<li>S 号口岸上船并且是三等舱的,不管是男的还是女的,生还率都很低。金钱决定命运。。。</li>
|
|
<li>Q 号口岸上船的男性几乎团灭,因为Q 号口岸上船的基本上都是三等舱船客。</li>
|
|
</ul>
|
|
<h2 id="填充缺失口岸">填充缺失口岸</h2>
|
|
<p>由于大多数人都是从 S 号口岸上的船,我们可以假设由于人多,所以在 S 口岸登记信息时漏了几位船客,所以不妨用 S 号口岸填充缺失值。</p>
|
|
<pre><code class="lang-python">data[<span class="hljs-string">'Embarked'</span>].fillna(<span class="hljs-string">'S'</span>,inplace=<span class="hljs-keyword">True</span>)
|
|
</code></pre>
|
|
<h2 id="兄弟姐妹的数量与生还率的关系">兄弟姐妹的数量与生还率的关系</h2>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">20</span>,<span class="hljs-number">8</span>))
|
|
sns.barplot(<span class="hljs-string">'SibSp'</span>,<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'SibSp vs Survived'</span>)
|
|
sns.factorplot(<span class="hljs-string">'SibSp'</span>,<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'SibSp vs Survived'</span>)
|
|
plt.close(<span class="hljs-number">2</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/47.jpg" alt=""></p>
|
|
<p>从图可以看出,如果一位船客是单独一个人上船旅游,没有兄弟姐妹而且是单身,那么他有大约 34% 的生还率,生还率比较低。如果兄弟姐妹的数量变多,那么生还率还是呈下降趋势的。这其实挺合理的,因为如果是一个家庭在船上的话,可能会设法救他们而不是救自己,这样一来可能谁都救不了。</p>
|
|
<h2 id="父母的数量与生还率的关系">父母的数量与生还率的关系</h2>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">2</span>,figsize=(<span class="hljs-number">20</span>,<span class="hljs-number">8</span>))
|
|
sns.barplot(<span class="hljs-string">'Parch'</span>,<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Parch vs Survived'</span>)
|
|
sns.factorplot(<span class="hljs-string">'Parch'</span>,<span class="hljs-string">'Survived'</span>,data=data,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Parch vs Survived'</span>)
|
|
plt.close(<span class="hljs-number">2</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/48.jpg" alt=""></p>
|
|
<p>从图上看会发现结果和上面的比较相似,父母在船上的船客有更大的生还机会。而且对于那些在船上有 1-3 个父母的人来说,生还率还是比较高的。</p>
|
|
<h2 id="花费与生还率的关系">花费与生还率的关系</h2>
|
|
<p>首先,先看一下花费的最值和均值。</p>
|
|
<pre><code class="lang-python">print(<span class="hljs-string">'Highest Fare was:'</span>,data[<span class="hljs-string">'Fare'</span>].max())
|
|
print(<span class="hljs-string">'Lowest Fare was:'</span>,data[<span class="hljs-string">'Fare'</span>].min())
|
|
print(<span class="hljs-string">'Average Fare was:'</span>,data[<span class="hljs-string">'Fare'</span>].mean())
|
|
</code></pre>
|
|
<p><img src="../img/49.jpg" alt=""></p>
|
|
<p>惊奇的发现,居然有人可以享受免费豪华邮轮!!!!</p>
|
|
<pre><code class="lang-python">f,ax=plt.subplots(<span class="hljs-number">1</span>,<span class="hljs-number">3</span>,figsize=(<span class="hljs-number">20</span>,<span class="hljs-number">8</span>))
|
|
sns.distplot(data[data[<span class="hljs-string">'Pclass'</span>]==<span class="hljs-number">1</span>].Fare,ax=ax[<span class="hljs-number">0</span>])
|
|
ax[<span class="hljs-number">0</span>].set_title(<span class="hljs-string">'Fares in Pclass 1'</span>)
|
|
sns.distplot(data[data[<span class="hljs-string">'Pclass'</span>]==<span class="hljs-number">2</span>].Fare,ax=ax[<span class="hljs-number">1</span>])
|
|
ax[<span class="hljs-number">1</span>].set_title(<span class="hljs-string">'Fares in Pclass 2'</span>)
|
|
sns.distplot(data[data[<span class="hljs-string">'Pclass'</span>]==<span class="hljs-number">3</span>].Fare,ax=ax[<span class="hljs-number">2</span>])
|
|
ax[<span class="hljs-number">2</span>].set_title(<span class="hljs-string">'Fares in Pclass 3'</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/50.jpg" alt=""></p>
|
|
<p>从图中可以看出平均花费其实是二等舱的普遍消费水平,但是三等舱的人数是最多的,而三等舱的人群中花费人数最多的是 10 左右,因此平均 32 的花费是被有钱的大佬给提上去的。</p>
|
|
<h2 id="简单总结一下">简单总结一下</h2>
|
|
<p>看了这么多特征对于生还的影响,可能有点懵,不妨先简单总结一下根据可视化结果所获得的信息。</p>
|
|
<ul>
|
|
<li>性别:女性的生还率高</li>
|
|
<li>船舱等级:越有钱越容易活下来,头等舱的生还率最高,三等舱的生还率最低。</li>
|
|
<li>年龄:10 岁以下的小朋友的存活率比较高,15-35 岁的年轻人存活率低。可能年轻人就是炮灰吧。</li>
|
|
<li>口岸:即使大多数一等舱的船客在 S 号口岸上的船, 但生还率不是最高的。 Q 号口岸的基本上是三等舱的船客。</li>
|
|
<li>兄弟姐妹父母爱人数量:有 1-2 个兄弟姐妹,配偶在船上,或 1-3 个父母的生还率比较高,独自一人或者一个大家庭都在船上的生还率比较低。</li>
|
|
</ul>
|
|
<h2 id="特征之间的相关性系数">特征之间的相关性系数</h2>
|
|
<p>相关性分为正相关与负相关,正相关指的是:特征 A 的数值变大时,特征 B 的数值也随之变大;负相关指的是:特征 A 的数值变大时,特征 B 的数值随之变小。通常使用 [-1, 1] 区间内的数值来表示两个特征之间的相关性,这个值称为<strong>相关性系数</strong>。若该系数为 1 那么表示两个特征之间完全正相关,若为 -1 则表示完全负相关,若为 0 则表示两个特征之间没有线性相关性。</p>
|
|
<p>如果现在两个特征高度相关或者完全相关,这就意味着这两个特征都包含高度相似的信息,并且信息的差异非常小,所以其中一个特征是多余的。在构建模型时,我们应该尽量消除这种多余的特征,因为这样能减少训练的时间,也可以在某种程度上缓解过拟合。</p>
|
|
<p>所以接下来用热力图对相关性系数进行可视化。</p>
|
|
<pre><code class="lang-python">sns.heatmap(data.corr(),annot=<span class="hljs-keyword">True</span>,cmap=<span class="hljs-string">'RdYlGn'</span>,linewidths=<span class="hljs-number">0.2</span>) <span class="hljs-comment">#data.corr()-->correlation matrix</span>
|
|
fig=plt.gcf()
|
|
fig.set_size_inches(<span class="hljs-number">10</span>,<span class="hljs-number">8</span>)
|
|
plt.show()
|
|
</code></pre>
|
|
<p><img src="../img/51.jpg" alt=""></p>
|
|
<p>从热力图上可以看出这些特征之间没有太大的相关性,最高的也就 SibSp与Parch,值为 0.41 。</p>
|
|
|
|
|
|
</section>
|
|
|
|
</div>
|
|
<div class="search-results">
|
|
<div class="has-results">
|
|
|
|
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
|
|
<ul class="search-results-list"></ul>
|
|
|
|
</div>
|
|
<div class="no-results">
|
|
|
|
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
|
|
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
<a href="introduction.html" class="navigation navigation-prev " aria-label="Previous page: 简介">
|
|
<i class="fa fa-angle-left"></i>
|
|
</a>
|
|
|
|
|
|
<a href="feature engerning.html" class="navigation navigation-next " aria-label="Next page: 特征工程">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
<script>
|
|
var gitbook = gitbook || [];
|
|
gitbook.push(function() {
|
|
gitbook.page.hasChanged({"page":{"title":"探索性数据分析(EDA)","level":"1.6.1.2","depth":3,"next":{"title":"特征工程","level":"1.6.1.3","depth":3,"path":"titanic/feature engerning.md","ref":"./titanic/feature engerning.md","articles":[]},"previous":{"title":"简介","level":"1.6.1.1","depth":3,"path":"titanic/introduction.md","ref":"./titanic/introduction.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["katex"],"pluginsConfig":{"katex":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"titanic/EDA.md","mtime":"2019-07-05T02:10:48.992Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-07-06T07:31:21.537Z"},"basePath":"..","book":{"language":""}});
|
|
});
|
|
</script>
|
|
</div>
|
|
|
|
|
|
<script src="../gitbook/gitbook.js"></script>
|
|
<script src="../gitbook/theme.js"></script>
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
|
|
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
|
|
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
|
|
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
|
|
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
|
|
|
|
|
|
|
|
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
|
|
|
|
|
|
|
|
</body>
|
|
</html>
|
|
|