Compare commits
No commits in common. 'main' and 'master' have entirely different histories.
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.13 (大作业)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (大作业)" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/大作业.iml" filepath="$PROJECT_DIR$/.idea/大作业.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.13 (大作业)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
@ -0,0 +1,192 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
import joblib
|
||||
import os
|
||||
import sys
|
||||
|
||||
# 添加项目根目录到Python路径
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
class AutoEncoder(nn.Module):
    """Plain autoencoder for anomaly detection.

    Compresses the input through two hidden layers and reconstructs it;
    anomalous rows are expected to show a large reconstruction error.

    Args:
        input_dim: number of input features.
        hidden_dim1: width of the outer hidden layer.
        hidden_dim2: width of the inner (bottleneck) hidden layer.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2):
        super(AutoEncoder, self).__init__()
        # Encoder: input -> hidden1 -> hidden2
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.ReLU(True),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(True)
        )

        # Decoder: hidden2 -> hidden1 -> input.
        # The output layer is linear (no ReLU): the training data is
        # standardized (zero mean), so a non-negative output could never
        # reconstruct negative feature values and would put an artificial
        # floor on the reconstruction error.
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim2, hidden_dim1),
            nn.ReLU(True),
            nn.Linear(hidden_dim1, input_dim)
        )

    def forward(self, x):
        """Encode then decode; returns the reconstruction of ``x``."""
        return self.decoder(self.encoder(x))
|
||||
|
||||
class AdversarialAutoEncoder(nn.Module):
    """Adversarial autoencoder (AAE) for anomaly detection.

    An autoencoder whose latent code is additionally pushed towards a
    standard-normal prior by a discriminator (adversarial regularization).

    Args:
        input_dim: number of input features.
        hidden_dim1: width of the outer hidden layer.
        hidden_dim2: width of the inner hidden layer.
        latent_dim: size of the latent code.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, latent_dim):
        super(AdversarialAutoEncoder, self).__init__()
        # Encoder: input -> latent.
        # The latent layer is linear so codes can take negative values and
        # actually match the N(0, 1) prior the discriminator is trained
        # against; a final ReLU would make real/fake trivially separable.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1),
            nn.ReLU(True),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(True),
            nn.Linear(hidden_dim2, latent_dim)
        )

        # Decoder: latent -> input reconstruction.
        # Linear output (no Sigmoid): the training data is standardized to
        # zero mean, so an output bounded to (0, 1) could never reconstruct
        # negative feature values.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim2),
            nn.ReLU(True),
            nn.Linear(hidden_dim2, hidden_dim1),
            nn.ReLU(True),
            nn.Linear(hidden_dim1, input_dim)
        )

        # Discriminator: latent code -> probability it came from the prior.
        # The Sigmoid stays: training uses BCELoss, which expects (0, 1).
        self.discriminator = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim2),
            nn.ReLU(True),
            nn.Linear(hidden_dim2, hidden_dim1),
            nn.ReLU(True),
            nn.Linear(hidden_dim1, 1),
            nn.Sigmoid()
        )

    def encode(self, x):
        """Map an input batch ``x`` to its latent code."""
        return self.encoder(x)

    def decode(self, z):
        """Reconstruct an input from latent code ``z``."""
        return self.decoder(z)

    def discriminate(self, z):
        """Probability in (0, 1) that ``z`` was drawn from the prior."""
        return self.discriminator(z)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for batch ``x``."""
        z = self.encode(x)
        return self.decode(z), z
|
||||
|
||||
def train_adversarial_autoencoder(num_epochs=100, batch_size=64, adv_weight=0.1):
    """Train the adversarial autoencoder on the credit data set.

    Reads ``data/credit_data.csv``, standardizes the numerical features,
    alternates autoencoder and discriminator updates, then saves the model
    weights and the fitted scaler under ``models/``.

    Args:
        num_epochs: number of passes over the data (default 100).
        batch_size: mini-batch size (default 64).
        adv_weight: weight of the adversarial term in the generator loss.

    Returns:
        (model, scaler): the trained model and the fitted StandardScaler.
    """
    # Load the raw credit data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')

    # Only numerical features are fed to the autoencoder.
    numerical_features = ['age', 'income', 'employment_length', 'loan_amount',
                          'credit_score', 'debt_to_income', 'num_credit_lines']
    X = df[numerical_features]

    # Standardize to zero mean / unit variance.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_tensor = torch.FloatTensor(X_scaled)

    # Model hyper-parameters.
    input_dim = X_tensor.shape[1]
    hidden_dim1 = 64
    hidden_dim2 = 32
    latent_dim = 16

    model = AdversarialAutoEncoder(input_dim, hidden_dim1, hidden_dim2, latent_dim)

    # Losses: MSE for reconstruction, BCE for the adversarial game.
    reconstruction_criterion = nn.MSELoss()
    adversarial_criterion = nn.BCELoss()

    # Separate optimizers: encoder+decoder vs. discriminator.
    autoencoder_optimizer = optim.Adam(
        list(model.encoder.parameters()) + list(model.decoder.parameters()),
        lr=0.001
    )
    discriminator_optimizer = optim.Adam(model.discriminator.parameters(), lr=0.001)

    print("开始训练对抗自编码器...")
    for epoch in range(num_epochs):
        for i in range(0, len(X_tensor), batch_size):
            batch = X_tensor[i:i + batch_size]

            # --- Autoencoder (generator) step ---
            autoencoder_optimizer.zero_grad()
            recon_batch, latent_batch = model(batch)
            real_labels = torch.ones(batch.size(0), 1)
            fake_labels = torch.zeros(batch.size(0), 1)

            recon_loss = reconstruction_criterion(recon_batch, batch)
            # Generator wants the discriminator to label its codes "real".
            disc_fake = model.discriminate(latent_batch)
            adversarial_loss = adversarial_criterion(disc_fake, real_labels)

            autoencoder_loss = recon_loss + adv_weight * adversarial_loss
            autoencoder_loss.backward()
            autoencoder_optimizer.step()

            # --- Discriminator step ---
            discriminator_optimizer.zero_grad()
            # "Real" codes are drawn from the standard-normal prior.
            real_latent = torch.randn(batch.size(0), latent_dim)
            disc_real = model.discriminate(real_latent)
            disc_real_loss = adversarial_criterion(disc_real, real_labels)
            # Encoder outputs are "fake"; detach so only the discriminator learns.
            disc_fake = model.discriminate(latent_batch.detach())
            disc_fake_loss = adversarial_criterion(disc_fake, fake_labels)

            discriminator_loss = disc_real_loss + disc_fake_loss
            discriminator_loss.backward()
            discriminator_optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Recon Loss: {recon_loss.item():.4f}, '
                  f'Adversarial Loss: {adversarial_loss.item():.4f}, '
                  f'Discriminator Loss: {discriminator_loss.item():.4f}')

    # Persist weights and scaler; create the output dir so saving cannot
    # fail on a fresh checkout (the original crashed if models/ was absent).
    print("保存对抗自编码器模型...")
    os.makedirs('models', exist_ok=True)
    torch.save(model.state_dict(), 'models/adversarial_autoencoder.pth')
    joblib.dump(scaler, 'models/ae_scaler.pkl')

    return model, scaler
|
||||
|
||||
if __name__ == "__main__":
    # Report which device is available before kicking off training.
    has_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if has_cuda else 'cpu')
    print(f"使用设备: {device}")

    trained = train_adversarial_autoencoder()
    model, scaler = trained
    print("对抗自编码器训练完成!")
|
||||
@ -0,0 +1,93 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
# 添加项目根目录到Python路径
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
import lightgbm as lgb
|
||||
import joblib
|
||||
|
||||
from data.data_generator import preprocess_data
|
||||
|
||||
def train_lightgbm_model():
    """Train a LightGBM classifier for credit-risk assessment.

    Reads ``data/credit_data.csv``, preprocesses it with the shared
    ``preprocess_data`` helper, trains with early stopping on a held-out
    split, prints evaluation metrics and feature importances, and saves
    the model plus all preprocessors under ``models/``.

    Returns:
        (model, scaler, le_education, le_home, le_purpose)
    """
    # Load the raw data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')
    print(f"数据形状: {df.shape}")

    # Shared preprocessing: scaling + label-encoding of categoricals.
    print("数据预处理...")
    X, y, scaler, le_education, le_home, le_purpose = preprocess_data(df)

    # Stratified split keeps the class balance in both sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    print("创建LightGBM模型...")
    model = lgb.LGBMClassifier(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.05,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    )

    # Train with early stopping monitored on the held-out set.
    print("训练模型...")
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              eval_metric='binary_logloss',
              callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)])

    # Evaluate on the test set.  (The original also computed
    # predict_proba but never used it; dropped as dead code.)
    print("模型预测...")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    # Persist model and preprocessors; create the output dir so saving
    # cannot fail on a fresh checkout.
    print("保存模型和预处理器...")
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/lightgbm_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(le_education, 'models/le_education.pkl')
    joblib.dump(le_home, 'models/le_home.pkl')
    joblib.dump(le_purpose, 'models/le_purpose.pkl')

    # Feature importances, most important first.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\n特征重要性:")
    print(feature_importance)

    return model, scaler, le_education, le_home, le_purpose
|
||||
|
||||
if __name__ == "__main__":
    # Run the full training pipeline when executed as a script.
    results = train_lightgbm_model()
    model, scaler, le_education, le_home, le_purpose = results
    print("\n模型训练完成!")
|
||||
@ -0,0 +1,88 @@
|
||||
import sys
|
||||
import os
|
||||
|
||||
# 添加项目根目录到Python路径
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
import xgboost as xgb
|
||||
import joblib
|
||||
|
||||
from data.data_generator import preprocess_data
|
||||
|
||||
def train_xgboost_model():
    """Train an XGBoost classifier for credit-risk assessment.

    Reads ``data/credit_data.csv``, preprocesses it with the shared
    ``preprocess_data`` helper, fits the model, prints evaluation metrics
    and feature importances, and saves the model plus all preprocessors
    under ``models/``.

    Returns:
        (model, scaler, le_education, le_home, le_purpose)
    """
    # Load the raw data.
    print("读取信贷数据...")
    df = pd.read_csv('data/credit_data.csv')
    print(f"数据形状: {df.shape}")

    # Shared preprocessing: scaling + label-encoding of categoricals.
    print("数据预处理...")
    X, y, scaler, le_education, le_home, le_purpose = preprocess_data(df)

    # Stratified split keeps the class balance in both sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    print("创建XGBoost模型...")
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    print("训练模型...")
    model.fit(X_train, y_train)

    # Evaluate on the test set.  (The original also computed
    # predict_proba but never used it; dropped as dead code.)
    print("模型预测...")
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"模型准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("\n混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    # Persist model and preprocessors; create the output dir so saving
    # cannot fail on a fresh checkout.
    print("保存模型和预处理器...")
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/xgboost_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(le_education, 'models/le_education.pkl')
    joblib.dump(le_home, 'models/le_home.pkl')
    joblib.dump(le_purpose, 'models/le_purpose.pkl')

    # Feature importances, most important first.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\n特征重要性:")
    print(feature_importance)

    return model, scaler, le_education, le_home, le_purpose
|
||||
|
||||
if __name__ == "__main__":
    # Run the full training pipeline when executed as a script.
    results = train_xgboost_model()
    model, scaler, le_education, le_home, le_purpose = results
    print("\n模型训练完成!")
|
||||
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 269 KiB |
|
After Width: | Height: | Size: 57 KiB |
|
After Width: | Height: | Size: 58 KiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 57 KiB |
|
After Width: | Height: | Size: 88 KiB |
|
After Width: | Height: | Size: 58 KiB |
|
|
After Width: | Height: | Size: 87 KiB |
|
After Width: | Height: | Size: 53 KiB |
|
After Width: | Height: | Size: 54 KiB |
|
After Width: | Height: | Size: 112 KiB |
|
After Width: | Height: | Size: 263 KiB |