import sys import os # 添加项目根目录到Python路径 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report, confusion_matrix import xgboost as xgb import joblib from data.data_generator import preprocess_data def train_xgboost_model(): """ 训练XGBoost模型用于信贷风险评估 """ # 读取数据 print("读取信贷数据...") df = pd.read_csv('data/credit_data.csv') print(f"数据形状: {df.shape}") # 数据预处理 print("数据预处理...") X, y, scaler, le_education, le_home, le_purpose = preprocess_data(df) # 分割数据集 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) print(f"训练集大小: {X_train.shape}") print(f"测试集大小: {X_test.shape}") # 创建XGBoost分类器 print("创建XGBoost模型...") model = xgb.XGBClassifier( n_estimators=100, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, random_state=42 ) # 训练模型 print("训练模型...") model.fit(X_train, y_train) # 预测 print("模型预测...") y_pred = model.predict(X_test) y_pred_proba = model.predict_proba(X_test)[:, 1] # 评估模型 accuracy = accuracy_score(y_test, y_pred) print(f"模型准确率: {accuracy:.4f}") print("\n分类报告:") print(classification_report(y_test, y_pred)) print("\n混淆矩阵:") print(confusion_matrix(y_test, y_pred)) # 保存模型和预处理器 print("保存模型和预处理器...") joblib.dump(model, 'models/xgboost_model.pkl') joblib.dump(scaler, 'models/scaler.pkl') joblib.dump(le_education, 'models/le_education.pkl') joblib.dump(le_home, 'models/le_home.pkl') joblib.dump(le_purpose, 'models/le_purpose.pkl') # 特征重要性 feature_importance = pd.DataFrame({ 'feature': X.columns, 'importance': model.feature_importances_ }).sort_values('importance', ascending=False) print("\n特征重要性:") print(feature_importance) return model, scaler, le_education, le_home, le_purpose if __name__ == "__main__": model, scaler, le_education, le_home, le_purpose = train_xgboost_model() print("\n模型训练完成!")