You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1 line
11 KiB

import sys
from PyQt5.QtWidgets import QApplication, QMainWindow, QFileDialog, QWidget
from PyQt5.QtGui import QPixmap
from functools import partial
import datetime
import numpy as np
import pandas as pd
import FirstUi
import SecondUi
import ThirdUi
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn
from sklearn.preprocessing import OneHotEncoder as ohe
from sklearn.preprocessing import OrdinalEncoder as ode
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
class Util:
    """Controller that wires the three Qt forms to the Titanic data pipeline.

    The constructor arguments are used as follows in this class:

    * ``F_Widget`` / ``S_Widget`` / ``T_Widget`` -- objects exposing the child
      widgets of the main window, the file-import form and the heat-map form
      (``pushButton*``, ``lineEdit*``, ``textBrowser``, ``label``).
    * ``S_Ui`` / ``T_Ui`` -- the windows that are shown/closed for the
      file-import form and the heat-map form.

    Workflow: import three CSV paths -> ``DataProcess`` (feature engineering
    and train/test split) -> ``predicted`` (logistic-regression training,
    scoring and prediction report).
    """

    # Timestamp layout used as prefix for log lines in the main text browser.
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Raw title extracted from the passenger name -> coarse social class.
    _TITLE_MAP = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }

    def __init__(self, F_Widget, S_Widget, T_Widget, S_Ui, T_Ui):
        """Store the UI handles, reset pipeline state and connect the signals."""
        self.F_Widget = F_Widget
        self.S_Widget = S_Widget
        self.T_Widget = T_Widget
        self.S_Ui = S_Ui
        self.T_Ui = T_Ui
        # Paths chosen in the import form (None until selected).
        self.TrainFile = None
        self.TestFile = None
        self.TestSurvived = None
        self.fig = None
        self.canvas = None
        # Raw data frames loaded by DataProcess.
        self.train = None
        self.test = None
        self.y_test = None
        # Feature matrices / label vectors produced by FeatureProcess.
        self.pred_x = None
        self.train_y = None
        self.train_x = None
        self.test_x = None
        self.test_y = None
        # Connect buttons to slots (data import).
        self.F_Widget.pushButton.clicked.connect(self.DataInput)
        self.S_Widget.pushButton.clicked.connect(partial(self.OpenFile, self.S_Widget.lineEdit))
        self.S_Widget.pushButton_2.clicked.connect(partial(self.OpenFile, self.S_Widget.lineEdit_2))
        self.S_Widget.pushButton_5.clicked.connect(partial(self.OpenFile, self.S_Widget.lineEdit_3))
        self.S_Widget.pushButton_3.clicked.connect(self.FileReady)
        self.S_Widget.pushButton_4.clicked.connect(self.FileClear)
        self.S_Widget.lineEdit.textChanged.connect(partial(self.SetFileInfo, self.S_Widget.lineEdit, 1))
        self.S_Widget.lineEdit_2.textChanged.connect(partial(self.SetFileInfo, self.S_Widget.lineEdit_2, 2))
        self.S_Widget.lineEdit_3.textChanged.connect(partial(self.SetFileInfo, self.S_Widget.lineEdit_3, 3))
        # Connect buttons to slots (feature analysis).
        self.F_Widget.pushButton_2.clicked.connect(self.DataProcess)
        # Connect buttons to slots (model prediction).
        self.F_Widget.pushButton_3.clicked.connect(self.predicted)

    def _log(self, message):
        """Append ``message`` to the main text browser with a timestamp prefix."""
        stamp = datetime.datetime.now().strftime(self._TIME_FORMAT)
        self.F_Widget.textBrowser.append(stamp + ":" + message)

    def SetFileInfo(self, line, num):
        """Remember the path typed/selected in ``line``.

        ``num`` selects the target: 1 = training file, 2 = test file,
        3 = test labels file. Empty text is ignored, so clearing a line edit
        does not by itself reset the stored path.
        """
        text = line.text()
        if text == '':
            return
        if num == 1:
            self.TrainFile = text
        elif num == 2:
            self.TestFile = text
        elif num == 3:
            self.TestSurvived = text

    def DataInput(self):
        """Show the file-import form."""
        self.S_Ui.show()

    def FileClear(self):
        """Forget all selected paths, empty the line edits and close the form."""
        if self.TrainFile is not None or self.TestFile is not None or self.TestSurvived is not None:
            self.TrainFile = None
            self.TestFile = None
            self.TestSurvived = None
            self.S_Widget.lineEdit.clear()
            self.S_Widget.lineEdit_2.clear()
            self.S_Widget.lineEdit_3.clear()
            self._log("数据清除成功")
            self.S_Ui.close()

    def FileReady(self):
        """Confirm the import once all three paths are set, then close the form."""
        if self.TrainFile is not None and self.TestFile is not None and self.TestSurvived is not None:
            self._log("数据导入成功")
            self.S_Ui.close()

    def OpenFile(self, line):
        """Let the user pick a file and write its path into ``line``."""
        # The first argument is the parent widget of the dialog.
        filename, _ = QFileDialog.getOpenFileName(self.S_Ui, "选择目录", r"C:\Users\Administrator\Desktop")
        # A cancelled dialog returns an empty name; keep the previous text in
        # that case so the line edit stays in sync with the stored path.
        if filename:
            line.setText(filename)

    def DataProcess(self):
        """Load the three CSV files and run the feature analysis."""
        self._log("开始进行数据的特征分析")
        if self.TrainFile is None or self.TestFile is None or self.TestSurvived is None:
            self._log("数据尚未导入")
            return
        try:
            train = pd.read_csv(self.TrainFile)
            test = pd.read_csv(self.TestFile)
            y_test = pd.read_csv(self.TestSurvived)
        except (OSError, ValueError) as err:
            # Missing/unreadable file (OSError) or empty/malformed/mis-encoded
            # CSV (pandas parser errors and UnicodeDecodeError are ValueErrors).
            self._log("数据读取失败:" + str(err))
            return
        # Only publish the frames once all three were read successfully.
        self.train, self.test, self.y_test = train, test, y_test
        self.FeatureProcess()

    def Draw_HeatMap(self, corrDf):
        """Render ``corrDf`` as a heat map and display it in the third form."""
        self.T_Ui.show()
        # Draw on a fresh figure so repeated runs do not pile heat maps and
        # colour bars onto the implicit current axes.
        fig, ax = plt.subplots()
        try:
            seaborn.heatmap(corrDf, ax=ax)
            fig.savefig('HeatMap.jpg', dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)
        label = self.T_Widget.label
        label.setPixmap(QPixmap('HeatMap.jpg').scaled(label.width(), label.height()))

    def FeatureProcess(self):
        """Clean and encode the features, select the relevant ones and split.

        Reads ``self.train`` / ``self.test``; writes ``self.train_x``,
        ``self.test_x``, ``self.train_y``, ``self.test_y`` (80/20 split of the
        training rows) and ``self.pred_x`` (the rows coming from the test file).
        """
        report = self.F_Widget.textBrowser.append
        rowNum_train = self.train.shape[0]
        # Stack train and test features so both get identical preprocessing.
        # (DataFrame.append was removed in pandas 2.0; concat is equivalent.)
        xfull = pd.concat([self.train.drop(columns="Survived"), self.test], ignore_index=True)
        ytrain = self.train["Survived"]

        report("1. 查看缺失值:")
        report(xfull.isnull().mean().to_string())
        report("开始缺失值填充")
        xfull['Age'] = xfull['Age'].fillna(xfull['Age'].mean())
        xfull['Fare'] = xfull['Fare'].fillna(xfull['Fare'].mean())
        xfull['Embarked'] = xfull['Embarked'].fillna('S')
        xfull['Cabin'] = xfull['Cabin'].fillna('U')

        report("2. 查看异常值:")
        report(xfull.describe().to_string())
        report("开始处理异常值")
        # Name -> title -> coarse class; unknown titles become NaN.
        xfull['Nclass'] = xfull['Name'].map(self.NameSplit).map(self._TITLE_MAP)
        xfull = xfull.drop(columns='Name')
        # Keep only the deck letter of the cabin ('U' marks "unknown").
        xfull['Cabin'] = xfull['Cabin'].str[0]
        xfull['FamilyNum'] = xfull['Parch'] + xfull['SibSp'] + 1
        xfull = xfull.drop(columns=['Parch', 'SibSp'])

        # Discretise the continuous columns.
        age_bins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
        xfull['Age'] = age_bins.fit_transform(xfull['Age'].values.reshape(-1, 1)).ravel()
        fare_bins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
        xfull['Fare'] = fare_bins.fit_transform(xfull['Fare'].values.reshape(-1, 1)).ravel()
        family_size = pd.cut(xfull['FamilyNum'], [0, 1, 4, np.inf], labels=['小家庭', '中等家庭', '大家庭'])
        # Use the ordered category codes (small=0, medium=1, large=2). Feeding
        # the text labels to OrdinalEncoder would sort them by code point and
        # scramble the small < medium < large order.
        xfull['FamilyNum'] = family_size.cat.codes

        hot = ['Sex', 'Embarked', 'Nclass', 'Cabin']
        ordinal = ['Pclass', 'Age', 'Fare', 'FamilyNum']
        # Replace the columns wholesale instead of writing through .loc, which
        # would try to fit the floats into the existing column dtypes.
        xfull[ordinal] = ode().fit_transform(xfull[ordinal])
        dummies = [pd.get_dummies(xfull[name], prefix=name) for name in hot]
        xfull = pd.concat([xfull] + dummies, axis=1).drop(columns=hot)

        # Correlate numeric/boolean columns only; remaining text columns would
        # make DataFrame.corr raise on pandas >= 2.0.
        numeric = xfull.select_dtypes(include=['number', 'bool'])
        corrDf = pd.concat([numeric, ytrain], axis=1).corr()
        self.Draw_HeatMap(corrDf)

        # Keep features with 0.1 <= |corr| < 1 against the label; the label is
        # dropped explicitly so float round-off cannot let it slip through.
        strength = corrDf['Survived'].drop('Survived').abs()
        col = strength[(strength >= 0.1) & (strength < 1)].index
        xfullnew = xfull[col]
        x = xfullnew.iloc[:rowNum_train, :]
        y = ytrain
        self.pred_x = xfullnew.iloc[rowNum_train:, :]
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            x, y, train_size=.8, random_state=420)

        # Report the data set sizes.
        report("已划分好数据集")
        report("原始数据集特征:" + str(x.shape))
        report("训练数据集特征:" + str(self.train_x.shape))
        report("测试数据集特征:" + str(self.test_x.shape))
        report("原始数据集标签:" + str(y.shape))
        report("训练数据集标签:" + str(self.train_y.shape))
        report("测试数据集标签:" + str(self.test_y.shape))
        self._log("数据分析完毕")

    def NameSplit(self, x):
        """Extract the title from a name shaped like ``"Surname, Title. Given"``.

        Returns the text between the first comma and the following period,
        stripped of whitespace. Returns ``None`` for non-string input and an
        empty string when the name contains no comma.
        """
        if not isinstance(x, str):
            return None
        after_surname = x.partition(',')[2]
        return after_surname.split('.')[0].strip()

    def predicted(self):
        """Train a logistic-regression model, score it and report predictions."""
        if self.train_x is None or self.pred_x is None or self.y_test is None:
            # Feature analysis has not produced the data sets yet.
            self._log("请先进行特征分析")
            return
        report = self.F_Widget.textBrowser.append
        model = LogisticRegression(C=1.0, max_iter=100, penalty='l2', random_state=0, solver='liblinear')
        self._log("开始训练逻辑回归模型")
        model.fit(self.train_x, self.train_y)
        self._log("模型训练完毕")
        self._log("开始对模型进行评估")
        report("评估分数为:" + str(model.score(self.test_x, self.test_y)))
        self._log("开始使用逻辑回归模型进行预测")
        pred_y = model.predict(self.pred_x)
        y_true = self.y_test['Survived']
        # classification_report expects (y_true, y_pred) in that order.
        report("预测效果:" + "\n" + classification_report(y_true, pred_y))
def main():
    """Create the application, build the three forms and run the event loop."""
    app = QApplication(sys.argv)

    # Top-level containers: the main window plus two auxiliary forms.
    main_window = QMainWindow()
    import_form = QWidget()
    heatmap_form = QWidget()

    # Generated UI descriptions, each installed onto its container.
    main_ui = FirstUi.Ui_MainWindow()
    import_ui = SecondUi.Ui_Form()
    heatmap_ui = ThirdUi.Ui_Form()
    main_ui.setupUi(main_window)
    import_ui.setupUi(import_form)
    heatmap_ui.setupUi(heatmap_form)

    main_window.show()
    # Held in a local so the controller exists for the whole event loop.
    controller = Util(main_ui, import_ui, heatmap_ui, import_form, heatmap_form)
    sys.exit(app.exec_())
if __name__ == "__main__":
    # Start the GUI only when this file is executed as a script.
    main()