You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1 line
11 KiB
1 line
11 KiB
import sys
|
|
from PyQt5.QtWidgets import QApplication, QMainWindow, QFileDialog, QWidget
|
|
from PyQt5.QtGui import QPixmap
|
|
from functools import partial
|
|
import datetime
|
|
import numpy as np
|
|
import pandas as pd
|
|
import FirstUi
|
|
import SecondUi
|
|
import ThirdUi
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import classification_report
|
|
import matplotlib.pyplot as plt
|
|
import seaborn
|
|
from sklearn.preprocessing import OneHotEncoder as ohe
|
|
from sklearn.preprocessing import OrdinalEncoder as ode
|
|
from sklearn.preprocessing import KBinsDiscretizer
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
|
|
class Util:
    """Controller connecting the three Qt forms to the data pipeline.

    The first form (``F_Widget``) hosts the log browser and the three main
    buttons (import data, feature analysis, prediction).  The second form
    (``S_Widget`` shown through ``S_Ui``) collects three CSV paths, and the
    third form (``T_Widget`` shown through ``T_Ui``) displays the correlation
    heat map on its label.

    The CSV files are expected to carry passenger-survival columns such as
    ``Survived``, ``Pclass``, ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
    ``Fare``, ``Cabin`` and ``Embarked`` (these are the columns the feature
    processing reads).
    """

    # Layout of the timestamp that prefixes dated log entries.
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Collapses the raw honorific parsed from a name into a coarse group.
    _TITLE_GROUPS = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty",
    }

    def __init__(self, F_Widget, S_Widget, T_Widget, S_Ui, T_Ui):
        """Store the forms, reset all pipeline state and connect the signals.

        Args:
            F_Widget: UI object of the main window (buttons + ``textBrowser``).
            S_Widget: UI object of the file-selection form.
            T_Widget: UI object of the heat-map form (provides ``label``).
            S_Ui: widget hosting the file-selection form (shown/closed here).
            T_Ui: widget hosting the heat-map form.
        """
        self.F_Widget = F_Widget
        self.S_Widget = S_Widget
        self.T_Widget = T_Widget
        self.S_Ui = S_Ui
        self.T_Ui = T_Ui

        # Paths chosen by the user; None means "not selected yet".
        self.TrainFile = None
        self.TestFile = None
        self.TestSurvived = None
        self.fig = None
        self.canvas = None

        # Data frames and train/validation splits produced by the pipeline.
        self.train = None
        self.test = None
        self.y_test = None
        self.pred_x = None
        self.train_y = None
        self.train_x = None
        self.test_x = None
        self.test_y = None

        # Connect buttons to slots (data import).
        form = self.S_Widget
        self.F_Widget.pushButton.clicked.connect(self.DataInput)
        for button, line in (
            (form.pushButton, form.lineEdit),
            (form.pushButton_2, form.lineEdit_2),
            (form.pushButton_5, form.lineEdit_3),
        ):
            button.clicked.connect(partial(self.OpenFile, line))
        form.pushButton_3.clicked.connect(self.FileReady)
        form.pushButton_4.clicked.connect(self.FileClear)
        for num, line in enumerate((form.lineEdit, form.lineEdit_2, form.lineEdit_3), start=1):
            line.textChanged.connect(partial(self.SetFileInfo, line, num))

        # Connect buttons to slots (feature analysis).
        self.F_Widget.pushButton_2.clicked.connect(self.DataProcess)

        # Connect buttons to slots (model prediction).
        self.F_Widget.pushButton_3.clicked.connect(self.predicted)

    def _log(self, message):
        """Append ``message`` to the main log browser, prefixed with the current time."""
        stamp = datetime.datetime.now().strftime(self._TIME_FORMAT)
        self.F_Widget.textBrowser.append(stamp + ":" + message)

    def SetFileInfo(self, line, num):
        """Mirror the text of ``line`` into the path attribute selected by ``num``.

        Args:
            line: object exposing ``text()`` (one of the three line edits).
            num: 1 for the training file, 2 for the test file, 3 for the
                file holding the test labels; other values are ignored.

        An empty field resets the attribute to None so the stored path never
        disagrees with what the form shows.
        """
        path = line.text() or None
        if num == 1:
            self.TrainFile = path
        elif num == 2:
            self.TestFile = path
        elif num == 3:
            self.TestSurvived = path

    def DataInput(self):
        """Show the file-selection form."""
        self.S_Ui.show()

    def FileClear(self):
        """Forget all selected paths, empty the line edits and close the form.

        Nothing happens when no path has been selected yet.
        """
        if self.TrainFile is None and self.TestFile is None and self.TestSurvived is None:
            return
        self.TrainFile = None
        self.TestFile = None
        self.TestSurvived = None
        self.S_Widget.lineEdit.clear()
        self.S_Widget.lineEdit_2.clear()
        self.S_Widget.lineEdit_3.clear()
        self._log("数据清除成功")
        self.S_Ui.close()

    def FileReady(self):
        """Confirm the selection once all three paths are set and close the form.

        When a path is still missing the form stays open and the log reports
        that the data has not been imported, instead of silently doing nothing.
        """
        if None in (self.TrainFile, self.TestFile, self.TestSurvived):
            self._log("数据尚未导入")
            return
        self._log("数据导入成功")
        self.S_Ui.close()

    def OpenFile(self, line):
        """Let the user pick a file and write the chosen path into ``line``.

        Cancelling the dialog leaves ``line`` untouched, so a previously chosen
        path is not wiped from the form.
        """
        # The first argument is the parent widget of the dialog.
        filename, _selected_filter = QFileDialog.getOpenFileName(
            self.S_Ui, "选择目录", r"C:\Users\Administrator\Desktop")
        if filename:
            line.setText(filename)

    def DataProcess(self):
        """Load the three CSV files and run the feature processing.

        Logs a message and returns early when a path is missing or a file
        cannot be read, so a bad path does not raise out of the Qt slot.
        """
        self._log("开始进行数据的特征分析")
        if None in (self.TrainFile, self.TestFile, self.TestSurvived):
            self._log("数据尚未导入")
            return
        try:
            train = pd.read_csv(self.TrainFile)
            test = pd.read_csv(self.TestFile)
            y_test = pd.read_csv(self.TestSurvived)
        except (OSError, ValueError) as err:
            # pandas parser/empty-data errors and decode errors derive from ValueError.
            self._log("数据读取失败:" + str(err))
            return
        # Publish the frames only after all three loaded successfully.
        self.train, self.test, self.y_test = train, test, y_test
        self.FeatureProcess()

    def Draw_HeatMap(self, corrDf):
        """Render ``corrDf`` as a heat map and show it on the third form's label.

        The picture is written to ``HeatMap.jpg`` in the working directory and
        loaded back as a pixmap scaled to the label size.
        """
        self.T_Ui.show()
        # Draw on a dedicated figure: plotting on the implicit current figure
        # would stack a new heat map and colour bar on every call.
        fig, ax = plt.subplots()
        try:
            seaborn.heatmap(corrDf, ax=ax)
            fig.savefig('HeatMap.jpg', dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)
        label = self.T_Widget.label
        label.setPixmap(QPixmap('HeatMap.jpg').scaled(label.width(), label.height()))

    def FeatureProcess(self):
        """Clean, encode and select features, then split the training data.

        Reads ``self.train`` / ``self.test`` and fills ``self.pred_x``,
        ``self.train_x``, ``self.test_x``, ``self.train_y`` and ``self.test_y``.
        Progress and intermediate statistics are written to the log browser.
        """
        log = self.F_Widget.textBrowser.append
        rowNum_train = self.train.shape[0]

        # Stack train and test features so both get identical preprocessing.
        # (DataFrame.append no longer exists in pandas 2.x; concat is equivalent.)
        xfull = pd.concat([self.train.drop(columns="Survived"), self.test], ignore_index=True)
        ytrain = self.train["Survived"]

        # 1. Missing values: report the ratio per column, then fill.
        log("1. 查看缺失值:")
        log(xfull.isnull().mean().to_string())
        log("开始缺失值填充")
        xfull['Age'] = xfull['Age'].fillna(xfull['Age'].mean())
        xfull['Fare'] = xfull['Fare'].fillna(xfull['Fare'].mean())
        # Assign the result: an in-place fill on a selected column may act on a copy.
        xfull['Embarked'] = xfull['Embarked'].fillna('S')
        xfull['Cabin'] = xfull['Cabin'].fillna('U')

        # 2. Summary statistics, then derived features.
        log("2. 查看异常值:")
        log(xfull.describe().to_string())
        log("开始处理异常值")
        # Titles missing from the table become NaN and get no indicator column.
        xfull['Nclass'] = xfull['Name'].map(self.NameSplit).map(self._TITLE_GROUPS)
        xfull['Cabin'] = xfull['Cabin'].str[0]  # keep only the leading letter
        xfull['FamilyNum'] = xfull['Parch'] + xfull['SibSp'] + 1
        xfull = xfull.drop(columns=['Name', 'Parch', 'SibSp'])

        # Discretise the continuous columns into ordinal bin indices.
        age_bins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
        xfull['Age'] = age_bins.fit_transform(xfull['Age'].values.reshape(-1, 1)).ravel()
        fare_bins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
        xfull['Fare'] = fare_bins.fit_transform(xfull['Fare'].values.reshape(-1, 1)).ravel()
        family = pd.cut(xfull['FamilyNum'], [0, 1, 4, np.inf], labels=['小家庭', '中等家庭', '大家庭'])
        # Use the category codes (0 < 1 < 2): encoding the label strings would
        # order them by text sort order instead of by family size.
        xfull['FamilyNum'] = family.cat.codes

        hot = ['Sex', 'Embarked', 'Nclass', 'Cabin']
        ordinal = ['Pclass', 'Age', 'Fare', 'FamilyNum']
        encoded = ode().fit_transform(xfull[ordinal])
        # Replace each column outright rather than writing into the existing dtype.
        for position, name in enumerate(ordinal):
            xfull[name] = encoded[:, position]
        # Indicator columns are named "<column>_<value>"; dtype=int keeps them
        # numeric regardless of the pandas default indicator dtype.
        xfull = pd.get_dummies(xfull, columns=hot, dtype=int)

        # Correlate numeric columns only; any remaining text column cannot be
        # converted to float and would make corr() fail on recent pandas.
        numeric = pd.concat([xfull, ytrain], axis=1).select_dtypes(include=['number', 'bool'])
        corrDf = numeric.corr()
        self.Draw_HeatMap(corrDf)

        # Keep features whose absolute correlation with the label is in [0.1, 1).
        strength = corrDf['Survived'].abs()
        col = strength[(strength >= 0.1) & (strength < 1)].index
        xfullnew = xfull[col]

        # The first rowNum_train rows are the training rows (index was reset above).
        x = xfullnew.iloc[:rowNum_train]
        y = ytrain
        self.pred_x = xfullnew.iloc[rowNum_train:]
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            x, y, train_size=.8, random_state=420)

        # Report the dataset sizes.
        log("已划分好数据集")
        log("原始数据集特征:" + str(x.shape))
        log("训练数据集特征:" + str(self.train_x.shape))
        log("测试数据集特征:" + str(self.test_x.shape))
        log("原始数据集标签:" + str(y.shape))
        log("训练数据集标签:" + str(self.train_y.shape))
        log("测试数据集标签:" + str(self.test_y.shape))
        self._log("数据分析完毕")

    def NameSplit(self, x):
        """Return the honorific from a name shaped like ``"Last, Title. First"``.

        The text before the first ``.`` is split on ``,`` and the second piece
        is returned stripped.  A name without a comma yields the stripped text
        before the first ``.`` instead of raising ``IndexError``.
        """
        pieces = x.split('.')[0].split(',')
        chosen = pieces[1] if len(pieces) > 1 else pieces[0]
        return chosen.strip()

    def predicted(self):
        """Train a logistic-regression model, score it and report predictions.

        Requires the splits produced by ``FeatureProcess``; when they are
        missing a message is logged and nothing else happens.
        """
        if self.train_x is None or self.pred_x is None or self.y_test is None:
            self._log("数据尚未完成特征分析")
            return

        model = LogisticRegression(C=1.0, max_iter=100, penalty='l2', random_state=0, solver='liblinear')
        self._log("开始训练逻辑回归模型")
        model.fit(self.train_x, self.train_y)
        self._log("模型训练完毕")
        self._log("开始对模型进行评估")
        # Accuracy on the held-out 20% of the training file.
        self.F_Widget.textBrowser.append("评估分数为:" + str(model.score(self.test_x, self.test_y)))

        self._log("开始使用逻辑回归模型进行预测")
        pred_y = model.predict(self.pred_x)
        y_test = self.y_test['Survived']
        # classification_report expects (y_true, y_pred); swapping them swaps
        # precision and recall in the report.
        self.F_Widget.textBrowser.append("预测效果:" + "\n" + classification_report(y_test, pred_y))
|
|
|
|
|
|
def main():
    """Build the Qt application with its three forms and run the event loop."""
    app = QApplication(sys.argv)

    # Host objects that the generated UI classes populate.
    main_window = QMainWindow()
    import_host = QWidget()
    heatmap_host = QWidget()

    main_form = FirstUi.Ui_MainWindow()
    import_form = SecondUi.Ui_Form()
    heatmap_form = ThirdUi.Ui_Form()

    for form, host in (
        (main_form, main_window),
        (import_form, import_host),
        (heatmap_form, heatmap_host),
    ):
        form.setupUi(host)

    main_window.show()

    # Bound to a local so the controller object exists while the loop runs.
    controller = Util(main_form, import_form, heatmap_form, import_host, heatmap_host)

    exit_code = app.exec_()
    sys.exit(exit_code)
|
|
|
|
|
|
# Start the GUI only when this file is executed as a script.
if __name__ == "__main__":
    main()
|