Compare commits
No commits in common. 'main' and 'master' have entirely different histories.
@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="Black">
|
||||||
|
<option name="sdkName" value="Python 3.13 (大作业)" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (大作业)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/大作业.iml" filepath="$PROJECT_DIR$/.idea/大作业.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.13 (大作业)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
@ -0,0 +1,192 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
import joblib
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# 添加项目根目录到Python路径
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
class AutoEncoder(nn.Module):
|
||||||
|
"""
|
||||||
|
自编码器用于异常检测
|
||||||
|
"""
|
||||||
|
def __init__(self, input_dim, hidden_dim1, hidden_dim2):
|
||||||
|
super(AutoEncoder, self).__init__()
|
||||||
|
# 编码器
|
||||||
|
self.encoder = nn.Sequential(
|
||||||
|
nn.Linear(input_dim, hidden_dim1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim1, hidden_dim2),
|
||||||
|
nn.ReLU(True)
|
||||||
|
)
|
||||||
|
|
||||||
|
# 解码器
|
||||||
|
self.decoder = nn.Sequential(
|
||||||
|
nn.Linear(hidden_dim2, hidden_dim1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim1, input_dim),
|
||||||
|
nn.ReLU(True)
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
x = self.encoder(x)
|
||||||
|
x = self.decoder(x)
|
||||||
|
return x
|
||||||
|
|
||||||
|
class AdversarialAutoEncoder(nn.Module):
|
||||||
|
"""
|
||||||
|
对抗自编码器
|
||||||
|
"""
|
||||||
|
def __init__(self, input_dim, hidden_dim1, hidden_dim2, latent_dim):
|
||||||
|
super(AdversarialAutoEncoder, self).__init__()
|
||||||
|
# 编码器
|
||||||
|
self.encoder = nn.Sequential(
|
||||||
|
nn.Linear(input_dim, hidden_dim1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim1, hidden_dim2),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim2, latent_dim),
|
||||||
|
nn.ReLU(True)
|
||||||
|
)
|
||||||
|
|
||||||
|
# 解码器
|
||||||
|
self.decoder = nn.Sequential(
|
||||||
|
nn.Linear(latent_dim, hidden_dim2),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim2, hidden_dim1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim1, input_dim),
|
||||||
|
nn.Sigmoid() # 使用Sigmoid确保输出在0-1之间
|
||||||
|
)
|
||||||
|
|
||||||
|
# 判别器
|
||||||
|
self.discriminator = nn.Sequential(
|
||||||
|
nn.Linear(latent_dim, hidden_dim2),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim2, hidden_dim1),
|
||||||
|
nn.ReLU(True),
|
||||||
|
nn.Linear(hidden_dim1, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
def encode(self, x):
|
||||||
|
return self.encoder(x)
|
||||||
|
|
||||||
|
def decode(self, z):
|
||||||
|
return self.decoder(z)
|
||||||
|
|
||||||
|
def discriminate(self, z):
|
||||||
|
return self.discriminator(z)
|
||||||
|
|
||||||
|
def forward(self, x):
|
||||||
|
z = self.encode(x)
|
||||||
|
recon_x = self.decode(z)
|
||||||
|
return recon_x, z
|
||||||
|
|
||||||
|
def train_adversarial_autoencoder():
    """Train the adversarial autoencoder on the credit data set.

    Reads ``data/credit_data.csv``, standardizes the numerical features,
    trains the AAE (MSE reconstruction plus adversarial regularization of
    the latent code towards N(0, I)), and saves the model weights and the
    fitted scaler under ``models/``.

    Returns:
        (model, scaler): the trained AdversarialAutoEncoder and the fitted
        StandardScaler.
    """
    # Load the raw credit data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')

    # Only numerical features are used for autoencoder training.
    numerical_features = ['age', 'income', 'employment_length', 'loan_amount',
                          'credit_score', 'debt_to_income', 'num_credit_lines']
    X = df[numerical_features]

    # Standardize features to zero mean / unit variance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Convert to a PyTorch tensor.
    X_tensor = torch.FloatTensor(X_scaled)

    # Model hyper-parameters.
    input_dim = X_tensor.shape[1]
    hidden_dim1 = 64
    hidden_dim2 = 32
    latent_dim = 16

    model = AdversarialAutoEncoder(input_dim, hidden_dim1, hidden_dim2, latent_dim)

    # Losses: MSE for reconstruction, BCE for the adversarial game.
    reconstruction_criterion = nn.MSELoss()
    adversarial_criterion = nn.BCELoss()

    # The autoencoder optimizer updates encoder + decoder only; the
    # discriminator has its own optimizer so the two can be stepped
    # independently.
    autoencoder_optimizer = optim.Adam(
        list(model.encoder.parameters()) + list(model.decoder.parameters()),
        lr=0.001
    )
    discriminator_optimizer = optim.Adam(model.discriminator.parameters(), lr=0.001)

    num_epochs = 100
    batch_size = 64

    print("开始训练对抗自编码器...")
    for epoch in range(num_epochs):
        for i in range(0, len(X_tensor), batch_size):
            batch = X_tensor[i:i + batch_size]

            # ---- Autoencoder (generator) step ----
            autoencoder_optimizer.zero_grad()

            recon_batch, latent_batch = model(batch)
            # Label tensors are sized per batch: the final batch may be short.
            real_labels = torch.ones(batch.size(0), 1)
            fake_labels = torch.zeros(batch.size(0), 1)

            # Reconstruction loss.
            recon_loss = reconstruction_criterion(recon_batch, batch)

            # Adversarial loss - the generator wants the discriminator to
            # classify its latent codes as "real" (drawn from the prior).
            disc_fake = model.discriminate(latent_batch)
            adversarial_loss = adversarial_criterion(disc_fake, real_labels)

            # 0.1 weights the adversarial term relative to reconstruction.
            autoencoder_loss = recon_loss + 0.1 * adversarial_loss
            autoencoder_loss.backward()
            autoencoder_optimizer.step()

            # ---- Discriminator step ----
            discriminator_optimizer.zero_grad()

            # "Real" latent vectors are sampled from the standard normal prior.
            real_latent = torch.randn(batch.size(0), latent_dim)
            disc_real = model.discriminate(real_latent)
            disc_real_loss = adversarial_criterion(disc_real, real_labels)

            # Encoder output is detached so this step updates only the
            # discriminator, not the encoder.
            disc_fake = model.discriminate(latent_batch.detach())
            disc_fake_loss = adversarial_criterion(disc_fake, fake_labels)

            discriminator_loss = disc_real_loss + disc_fake_loss
            discriminator_loss.backward()
            discriminator_optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Recon Loss: {recon_loss.item():.4f}, '
                  f'Adversarial Loss: {adversarial_loss.item():.4f}, '
                  f'Discriminator Loss: {discriminator_loss.item():.4f}')

    # Persist the model weights and the fitted scaler.
    # BUGFIX: create the output directory first; torch.save / joblib.dump
    # raise FileNotFoundError if 'models/' does not exist yet.
    print("保存对抗自编码器模型...")
    os.makedirs('models', exist_ok=True)
    torch.save(model.state_dict(), 'models/adversarial_autoencoder.pth')
    joblib.dump(scaler, 'models/ae_scaler.pkl')

    return model, scaler
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Check whether a GPU is available.
    # NOTE(review): `device` is only printed here — the training routine
    # never moves the model or tensors onto it, so training always runs on
    # CPU regardless of this value.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")

    model, scaler = train_adversarial_autoencoder()
    print("对抗自编码器训练完成!")
|
||||||
@ -0,0 +1,93 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 添加项目根目录到Python路径
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||||
|
import lightgbm as lgb
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
from data.data_generator import preprocess_data
|
||||||
|
|
||||||
|
def train_lightgbm_model():
    """Train a LightGBM classifier for credit-risk assessment.

    Reads ``data/credit_data.csv``, preprocesses it via
    ``data.data_generator.preprocess_data``, trains a LightGBM classifier
    with early stopping monitored on a stratified held-out split, prints
    evaluation metrics and feature importances, and saves the model and
    preprocessors under ``models/``.

    Returns:
        (model, scaler, le_education, le_home, le_purpose)
    """
    # Load the raw credit data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')
    print(f"数据形状: {df.shape}")

    # Preprocess: scaling + label-encoding of the categorical columns.
    print("数据预处理...")
    X, y, scaler, le_education, le_home, le_purpose = preprocess_data(df)

    # Stratified split preserves the class ratio in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    # Build the LightGBM classifier.
    print("创建LightGBM模型...")
    model = lgb.LGBMClassifier(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.05,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1,
    )

    # Train with early stopping monitored on the held-out split.
    print("训练模型...")
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              eval_metric='binary_logloss',
              callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)])

    # Evaluate on the held-out split.  (The original also computed
    # predict_proba here but never used the result; removed.)
    print("模型预测...")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.4f}")

    print("\n分类报告:")
    print(classification_report(y_test, y_pred))

    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    # Persist the model and preprocessors.
    # BUGFIX: create the output directory first; joblib.dump raises
    # FileNotFoundError if 'models/' does not exist yet.
    print("保存模型和预处理器...")
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/lightgbm_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(le_education, 'models/le_education.pkl')
    joblib.dump(le_home, 'models/le_home.pkl')
    joblib.dump(le_purpose, 'models/le_purpose.pkl')

    # Rank features by importance for quick inspection.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n特征重要性:")
    print(feature_importance)

    return model, scaler, le_education, le_home, le_purpose
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: train and persist the LightGBM model.
    model, scaler, le_education, le_home, le_purpose = train_lightgbm_model()
    print("\n模型训练完成!")
|
||||||
@ -0,0 +1,88 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 添加项目根目录到Python路径
|
||||||
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||||
|
import xgboost as xgb
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
from data.data_generator import preprocess_data
|
||||||
|
|
||||||
|
def train_xgboost_model():
    """Train an XGBoost classifier for credit-risk assessment.

    Reads ``data/credit_data.csv``, preprocesses it via
    ``data.data_generator.preprocess_data``, trains an XGBoost classifier,
    prints evaluation metrics and feature importances, and saves the model
    and preprocessors under ``models/``.

    Returns:
        (model, scaler, le_education, le_home, le_purpose)
    """
    # Load the raw credit data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')
    print(f"数据形状: {df.shape}")

    # Preprocess: scaling + label-encoding of the categorical columns.
    print("数据预处理...")
    X, y, scaler, le_education, le_home, le_purpose = preprocess_data(df)

    # Stratified split preserves the class ratio in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    # Build the XGBoost classifier.
    print("创建XGBoost模型...")
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    # Train on the training split.
    print("训练模型...")
    model.fit(X_train, y_train)

    # Evaluate on the held-out split.  (The original also computed
    # predict_proba here but never used the result; removed.)
    print("模型预测...")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.4f}")

    print("\n分类报告:")
    print(classification_report(y_test, y_pred))

    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    # Persist the model and preprocessors.
    # BUGFIX: create the output directory first; joblib.dump raises
    # FileNotFoundError if 'models/' does not exist yet.
    print("保存模型和预处理器...")
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/xgboost_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(le_education, 'models/le_education.pkl')
    joblib.dump(le_home, 'models/le_home.pkl')
    joblib.dump(le_purpose, 'models/le_purpose.pkl')

    # Rank features by importance for quick inspection.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n特征重要性:")
    print(feature_importance)

    return model, scaler, le_education, le_home, le_purpose
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: train and persist the XGBoost model.
    model, scaler, le_education, le_home, le_purpose = train_xgboost_model()
    print("\n模型训练完成!")
|
||||||
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 269 KiB |
|
After Width: | Height: | Size: 57 KiB |
|
After Width: | Height: | Size: 58 KiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 57 KiB |
|
After Width: | Height: | Size: 88 KiB |
|
After Width: | Height: | Size: 58 KiB |
|
|
After Width: | Height: | Size: 87 KiB |
|
After Width: | Height: | Size: 53 KiB |
|
After Width: | Height: | Size: 54 KiB |
|
After Width: | Height: | Size: 112 KiB |
|
After Width: | Height: | Size: 263 KiB |