|
|
|
|
@ -1,194 +0,0 @@
|
|
|
|
|
# In[1]:
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
from plotly import __version__
|
|
|
|
|
# Show the installed plotly version (imported from the plotly package above).
print (__version__)
|
|
|
|
|
from plotly.offline import init_notebook_mode
|
|
|
|
|
# Switch plotly's offline mode to notebook rendering.
# NOTE(review): the figures below are all written with pio.write_html, so this
# call may be unnecessary outside a notebook — confirm.
init_notebook_mode(connected=True)
|
|
|
|
|
from plotly.graph_objs import Figure, Layout, Pie,Bar
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
import seaborn as sns
|
|
|
|
|
import plotly.graph_objects as go
|
|
|
|
|
import plotly.io as pio
|
|
|
|
|
# ## 1.数据检查与理解
|
|
|
|
|
|
|
|
|
|
# In[2]:
|
|
|
|
|
# Shared colour palette (hex codes) for the charts in this script.
colors = [
    '#e43620',
    '#f16d30',
    '#d99a6c',
    '#fed976',
    '#b3cb95',
    '#41bfb3',
    '#229bac',
    '#256894',
]

# Load the HR records from CSV and preview the first five rows.
data = pd.read_csv('main/HR_comma_sep.csv')
print(data.head(5))
|
|
|
|
|
# In[3]:
|
|
|
|
|
# Report the dataset size: rows are employee records, columns are features.
record_count, feature_count = data.shape
print("共有", record_count, "条员工记录,", feature_count, "个员工特征。")
|
|
|
|
|
# ### 1.1. 检查是否存在缺失值
|
|
|
|
|
# In[4]:
|
|
|
|
|
# Count the missing values in every column. The result is printed explicitly
# because a bare expression produces no output when this file runs as a script.
print(data.isnull().sum())
|
|
|
|
|
# ### 1.2. 适当的改名来更直观的理解和获取特征列
|
|
|
|
|
# In[5]:
|
|
|
|
|
# Give three columns clearer, consistently lower-case names.
column_renames = {
    "sales": "department",
    "promotion_last_5years": "promotion",
    "Work_accident": "work_accident",
}
df = data.rename(columns=column_renames)
print(df.columns)
|
|
|
|
|
# ### 1.3. 查看数据的信息
|
|
|
|
|
# In[6]:
|
|
|
|
|
# Summarise the renamed frame (column names, non-null counts, dtypes).
df.info()
|
|
|
|
|
# ### 1.4. 展示所有类型特征的信息
|
|
|
|
|
# In[7]:
|
|
|
|
|
# Descriptive statistics restricted to the object-dtype columns (include=['O']).
object_summary = df.describe(include=['O'])
print(object_summary)
|
|
|
|
|
# ### 1.5. 类别数字化
|
|
|
|
|
# 1. 先设置`salary`与`department`列为**Category**的数据类型
|
|
|
|
|
# 2. 保存类别与对应数值的映射字典
|
|
|
|
|
# 3. 针对`salary`和`department`这两个`Object`类型的**类别**特征,将其进行类别数字化。
|
|
|
|
|
|
|
|
|
|
# In[8]:
|
|
|
|
|
# 1. 先设置`salary`与`department`列为**Category**的数据类型
|
|
|
|
|
# Convert both label columns to the pandas 'category' dtype.
for column in ('department', 'salary'):
    df[column] = df[column].astype('category')
|
|
|
|
|
|
|
|
|
|
# In[9]:
|
|
|
|
|
# Re-inspect the frame after 'department' and 'salary' were cast to category.
df.info()
|
|
|
|
|
# In[10]:
|
|
|
|
|
# 保存类别
|
|
|
|
|
# department_categories = pd.Categorical(df['department']).categories
|
|
|
|
|
# salary_categories = pd.Categorical(df['salary']).categories
|
|
|
|
|
# In[11]:
|
|
|
|
|
# 2. 保存类别与对应数值的映射字典
|
|
|
|
|
# Map every integer category code to its label, following category order.
salary_dict = {code: label for code, label in enumerate(df['salary'].cat.categories)}
department_dict = {
    code: label for code, label in enumerate(df['department'].cat.categories)
}
print(salary_dict, department_dict)
|
|
|
|
|
# In[12]:
|
|
|
|
|
# 3. 针对`salary`和`department`这两个`Object`类型的类别特征,将其进行类别数字化。
|
|
|
|
|
# Replace every categorical column with its integer category codes.
# Columns are picked by dtype via select_dtypes rather than by comparing the
# dtype's string form, and the loop body is indented so the block parses.
# Only categorical columns are converted: casting every column to int64 would
# truncate the float features that are averaged later in the script.
for feature in df.select_dtypes(include='category').columns:
    # Store the codes as int64.
    df[feature] = df[feature].cat.codes.astype("int64")
|
|
|
|
|
# In[13]:
|
|
|
|
|
# Preview the encoded frame. Printed explicitly because a bare expression
# produces no output when this file runs as a script.
print(df.head())
|
|
|
|
|
# ### 1.6. 改变columns的顺序
|
|
|
|
|
# 1.先设置columns的顺序
|
|
|
|
|
# - 将`left`列放置于最后一列以便直观地查看
|
|
|
|
|
# 2.根据排好的列表顺序应用于dataframe上
|
|
|
|
|
# In[14]:
|
|
|
|
|
# Move the target column `left` to the end, keeping every other column in its
# current order. The column is located by name instead of by a hard-coded
# position, so the reordering stays correct if the column layout changes.
cols = [name for name in df.columns if name != 'left'] + ['left']
print('Reordered Columns:',cols)
|
|
|
|
|
# In[15]:
|
|
|
|
|
# 根据排好的列表顺序应用于dataframe上
|
|
|
|
|
# Apply the new column order to the frame and preview the result.
df = df.loc[:, cols]
print(df.head(5))
|
|
|
|
|
# In[16]:
|
|
|
|
|
# Check the preprocessed frame: dimensions first, then the info() report.
frame_shape = df.shape
print(frame_shape)
df.info()
|
|
|
|
|
|
|
|
|
|
# ## 2.数据探索与分析
|
|
|
|
|
# ### 2.1. 描述性分析
|
|
|
|
|
# **对`left`**列进行Group,进行描述性分析[[1]](https://zhuanlan.zhihu.com/p/30282012)
|
|
|
|
|
# 查看在职与离职类别下,每个特征的均值
|
|
|
|
|
# In[17]:
|
|
|
|
|
# Mean of every feature for each value of `left`.
left_summary = df.groupby('left').mean()

# In[18]:

print(left_summary)
|
|
|
|
|
# ### 2.2. 相关性分析
|
|
|
|
|
# 根据热力图显示,可以发现:
|
|
|
|
|
#
|
|
|
|
|
# - `满意度(satisfaction_level)`
|
|
|
|
|
# - 员工**满意度**(satisfaction_level)与离职(left)呈较大**负相关**(-)关系,与**完成项目数**(number_project)、**在公司的年份**(time_spend_company)也有一定的负相关性。
|
|
|
|
|
# - `绩效评估(last_evaluation)`
|
|
|
|
|
# - 上一次的**绩效评估**(last_evaluation)与**完成项目数**(number_project)和**平均每月工作时间**(average_montly_hours)这两个特征呈较大的**正相关**(+)关系,也就是说,完成项目数越多,平均每月工作时长越长,员工能获得更高的评价。
|
|
|
|
|
# - 但绩效评估与工资,晋升都没有什么相关性,所以员工得到了高绩效评价也不会升职或者涨工资。
|
|
|
|
|
# - `离职(left)`
|
|
|
|
|
# - 离职率与员工**满意度**(satisfaction_level)、过去5年是否有**晋升**(promotion)、是否有**工伤**(work_accident)、**工资薪酬**(salary)呈**负相关**(-)关系。如果员工对公司不太满意,且个人价值实现不高,那么离职的可能性会很大。
|
|
|
|
|
# - 离职率与员工的**在公司的年份**(time_spend_company)呈较大**正相关**(+)关系。与**平均每月工作时间**(average_montly_hours),所在**部门**(department)也呈些许正相关性。
|
|
|
|
|
|
|
|
|
|
# In[19]:
|
|
|
|
|
# Pairwise correlation matrix of all columns (corr() default method).
corr = df.corr()
print(corr)

# Mask for the heatmap: 1.0 on the lower triangle (diagonal included),
# 0.0 everywhere else.
mask = np.tril(np.ones_like(corr))
|
|
|
|
|
# In[20]:
|
|
|
|
|
|
|
|
|
|
# Draw the correlation heatmap.
# The figure size is set on the figure itself: calling sns.set(...) inside the
# axes_style context re-applies seaborn's default theme, which overrides the
# "white" style the context manager has just selected.
with sns.axes_style("white"):
    heatmap_fig, ax = plt.subplots(figsize=(11, 7))  # width, height
    sns.heatmap(
        corr,
        xticklabels=True,  # label every column ...
        yticklabels=True,  # ... and every row with its feature name
        cmap='RdBu',       # red/blue colour map
        mask=mask,         # draw only the cells the mask leaves visible
        fmt='.3f',         # annotate with three decimals
        annot=True,        # write the coefficient inside each cell
        linewidths=.5,     # gap between cells
        vmax=.4,           # upper end of the colour scale
        square=True,       # keep cells square
        ax=ax,
    )
    plt.title("Correlation")
    # Rotate the x tick labels 45 degrees and right-align them.
    label_x = ax.get_xticklabels()
    plt.setp(label_x, rotation=45, horizontalalignment='right')
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ### 2.3.变量分析
|
|
|
|
|
# #### 2.3.1. 公司当前员工离职与在职的比率
|
|
|
|
|
# In[21]:
|
|
|
|
|
# Count employees per `left` value. sort_index() orders the rows by code
# (0, then 1) instead of by frequency, so the row order always matches the
# ['在职', '离职'] labels hard-coded for the pie chart.
left_count = df['left'].value_counts().sort_index().reset_index(name="left_count")
|
|
|
|
|
# In[25]:
|
|
|
|
|
# Replace any missing values in df with empty strings.
# NOTE(review): if a numeric column actually contained NaN, this would
# presumably turn it into an object column — confirm the fill is still needed.
df = df.fillna('')
|
|
|
|
|
# In[27]:
|
|
|
|
|
|
|
|
|
|
# Donut chart of the two `left` groups, written to 1.html.
trace = Pie(
    labels=['在职', '离职'],
    values=left_count.left_count,
    hoverinfo="label + percent + name",
    marker=dict(colors=colors[3:]),
    hole=.6,
    pull=.1,
)
layout = Layout(title="员工在职与离职的比率", width=380, height=380)
# Build the figure once; wrapping Figure(...) in a second go.Figure(...) only
# copied an already complete figure.
fig = go.Figure(data=[trace], layout=layout)
pio.write_html(fig, file='1.html', auto_open=True)
|
|
|
|
|
# #### 2.3.2. 公司员工的满意度与入职年份的关系
|
|
|
|
|
# In[28]:
|
|
|
|
|
# Average satisfaction level for each value of time_spend_company.
satisfaction_by_tenure = df.groupby('time_spend_company')['satisfaction_level'].mean()
time_mean_satifaction = satisfaction_by_tenure.reset_index(name="average_satisfaction")
|
|
|
|
|
# In[29]:
|
|
|
|
|
# Bar chart of average satisfaction per year spent at the company, written to
# 2.html.
trace = Bar(
    x=time_mean_satifaction.time_spend_company,
    y=time_mean_satifaction.average_satisfaction,
    marker=dict(color=colors),
)
layout = Layout(
    title="员工满意度与公司在职时间有什么关联?",
    width=700,
    height=400,
    xaxis=dict(title="在公司时间(年)"),
    yaxis=dict(title="平均满意度"),
)
# Build the figure once instead of wrapping one Figure inside another.
fig1 = go.Figure(data=[trace], layout=layout)
pio.write_html(fig1, file='2.html', auto_open=True)
|
|
|
|
|
# #### 2.3.3. 公司各部门的员工离职与在职情况对比
|
|
|
|
|
# 可以看出,sales部门的离职人数最多,有1014人,其次是technical技术部门离职697人。
|
|
|
|
|
# In[30]:
|
|
|
|
|
# Cross-tabulate department (rows) against `left` (columns).
depart_left_table = pd.crosstab(df['department'], df['left'])
|
|
|
|
|
# In[31]:
|
|
|
|
|
# Stacked horizontal bars: one trace per `left` value, one bar per department,
# written to 3.html. The traces are built in a comprehension (the exported
# loop had lost its body indentation) with a descriptive loop name.
left_eles = df.left.unique()
data = [
    Bar(
        x=depart_left_table[status],
        y=depart_left_table.index,
        name=('离职' if status == 1 else '在职'),
        orientation='h',
        marker=dict(color=colors[status + 4]),
    )
    for status in left_eles
]
layout = Layout(
    title="每个部门的离职员工数与在职员工数对比",
    barmode="stack",
    width=800,
    height=500,
    # Show department labels instead of the integer codes on the y axis.
    yaxis=dict(
        title="部门",
        tickmode="array",
        tickvals=list(department_dict.keys()),
        ticktext=list(department_dict.values()),
    ),
)
# Build the figure once instead of wrapping one Figure inside another.
fig2 = go.Figure(data=data, layout=layout)
pio.write_html(fig2, file='3.html', auto_open=True)
|
|
|
|
|
# #### 2.3.4. 公司各部门的员工的工资水平
|
|
|
|
|
# 销售部门(sales)低工资水平(low salary)的最多,有2099人,其次是技术部门(technical)与后勤部门(support),分别为1372人与1146人。
|
|
|
|
|
#
|
|
|
|
|
# In[32]:
|
|
|
|
|
# Cross-tabulate department (rows) against salary level (columns).
depart_salary_table = pd.crosstab(df['department'], df['salary'])
|
|
|
|
|
# depart_salary_table
|
|
|
|
|
# In[33]:
|
|
|
|
|
# Grouped bars of head-count per department, one trace per salary level,
# written to 4.html. The salary codes and labels come from salary_dict rather
# than a hard-coded range(3), so the chart follows the categories in the data.
data = [
    Bar(
        x=depart_salary_table.index,
        y=depart_salary_table[code],
        name=label,
        marker=dict(color=colors[code + 2]),
    )
    for code, label in salary_dict.items()
]
layout = Layout(
    title="公司各部门的员工工资情况",
    width=800,
    height=450,
    # Show department labels instead of the integer codes on the x axis.
    xaxis=dict(
        tickmode="array",
        tickvals=list(department_dict.keys()),
        ticktext=list(department_dict.values()),
    ),
)
# Build the figure once instead of wrapping one Figure inside another.
fig3 = go.Figure(data=data, layout=layout)
pio.write_html(fig3, file='4.html', auto_open=True)
|
|
|
|
|
# #### 2.3.5 员工薪资与离职率
|
|
|
|
|
# 低薪与中等薪资的员工离职率偏高,分别是42%,26%,高薪员工只有7%的离职率。
|
|
|
|
|
# In[34]:
|
|
|
|
|
# Cross-tabulate salary level (rows) against `left` (columns).
salary_left_table = pd.crosstab(df['salary'], df['left'])
|
|
|
|
|
# In[35]:
|
|
|
|
|
# Grouped bars of head-count per salary level, one trace per `left` value,
# written to 5.html. The traces follow the columns of the crosstab rather than
# a hard-coded range(2).
data = [
    Bar(
        x=salary_left_table.index,
        y=salary_left_table[status],
        name=("在职" if status == 0 else "离职"),
        marker=dict(color=colors[status + 4]),
    )
    for status in salary_left_table.columns
]
layout = Layout(
    title="员工薪资对离职的影响",
    width=580,
    height=350,
    # Show salary labels instead of the integer codes on the x axis.
    xaxis=dict(
        tickmode="array",
        tickvals=list(salary_dict.keys()),
        ticktext=list(salary_dict.values()),
    ),
)
# Build the figure once instead of wrapping one Figure inside another.
fig4 = go.Figure(data=data, layout=layout)
pio.write_html(fig4, file='5.html', auto_open=True)
|