You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

129 lines
4.2 KiB

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
def generate_credit_data(n_samples=10000, random_state=42):
    """Generate a synthetic credit dataset with a binary default label.

    Columns of the returned frame:

    - age: age, normal(35, 10) clipped to [18, 80]
    - income: annual income, log-normal clipped to [10000, 500000]
    - employment_length: years of employment, exponential clipped to [0, 40]
    - loan_amount: loan amount, log-normal clipped to [1000, 200000]
    - credit_score: credit score, normal(650, 100) clipped to [300, 850]
    - debt_to_income: debt-to-income ratio, beta(2, 5)
    - num_credit_lines: number of credit accounts, poisson(3) clipped to [0, 15]
    - education: education level (categorical)
    - home_ownership: home ownership status (categorical)
    - loan_purpose: loan purpose (categorical)
    - default: 0/1 label drawn from a logistic model of the numeric features

    Args:
        n_samples: Number of rows to generate.
        random_state: Integer seed, ``None`` (non-deterministic), or an
            existing ``np.random.RandomState`` to draw from. The default of
            42 reproduces the dataset obtained with ``np.random.seed(42)``.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and the 11 columns above.
    """
    # Use a private RandomState instead of np.random.seed(): for the same seed
    # it produces the same stream as the legacy global functions, but calling
    # this function no longer resets the process-wide NumPy RNG.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # NOTE: the order of the draws below determines the generated values for
    # a given seed; reordering them changes the dataset.

    # Numeric features, each clipped to a plausible range.
    age = np.clip(rng.normal(35, 10, n_samples), 18, 80)
    income = np.clip(rng.lognormal(10, 0.5, n_samples), 10000, 500000)
    employment_length = np.clip(rng.exponential(2, n_samples), 0, 40)
    loan_amount = np.clip(rng.lognormal(9, 0.8, n_samples), 1000, 200000)
    credit_score = np.clip(rng.normal(650, 100, n_samples), 300, 850)
    debt_to_income = rng.beta(2, 5, n_samples)
    num_credit_lines = np.clip(rng.poisson(3, n_samples), 0, 15)

    # Categorical features, sampled with fixed class proportions.
    education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
    education = rng.choice(education_levels, n_samples, p=[0.3, 0.4, 0.2, 0.1])

    ownership_types = ['Rent', 'Mortgage', 'Own', 'Other']
    home_ownership = rng.choice(
        ownership_types, n_samples, p=[0.3, 0.4, 0.25, 0.05]
    )

    purpose_types = [
        'Debt Consolidation',
        'Home Improvement',
        'Business',
        'Personal',
        'Medical',
    ]
    loan_purpose = rng.choice(
        purpose_types, n_samples, p=[0.4, 0.2, 0.15, 0.15, 0.1]
    )

    # Target: a simplified linear risk score over the numeric features plus
    # Gaussian noise (the categorical features do not enter the score).
    risk_score = (
        -0.02 * age
        + -0.00001 * income
        + -0.1 * employment_length
        + 0.000005 * loan_amount
        + -0.005 * credit_score
        + 2 * debt_to_income
        + 0.05 * num_credit_lines
        + rng.normal(0, 0.5, n_samples)
    )

    # Squash the score through the logistic function to get a probability.
    default_prob = 1 / (1 + np.exp(-risk_score))

    # Draw the actual 0/1 default label from that probability.
    default = rng.binomial(1, default_prob, n_samples)

    return pd.DataFrame({
        'age': age,
        'income': income,
        'employment_length': employment_length,
        'loan_amount': loan_amount,
        'credit_score': credit_score,
        'debt_to_income': debt_to_income,
        'num_credit_lines': num_credit_lines,
        'education': education,
        'home_ownership': home_ownership,
        'loan_purpose': loan_purpose,
        'default': default,
    })
def preprocess_data(df):
    """Label-encode the categorical columns and standardise every feature.

    The input frame is not modified. The encoders and the scaler are fitted
    on the rows of ``df`` itself, so pass only the training subset if the
    fitted objects will later be applied to held-out data.

    Args:
        df: DataFrame with the columns 'education', 'home_ownership',
            'loan_purpose' and 'default'; every column other than 'default'
            is treated as a feature.

    Returns:
        Tuple ``(X_scaled, y, scaler, le_education, le_home, le_purpose)``:
        the standardised feature frame, the 'default' series, the fitted
        StandardScaler, and the fitted LabelEncoder for each categorical
        column (in that order).
    """
    # Work on a copy so the caller's frame is left untouched.
    data = df.copy()

    # Encode each categorical column with its own encoder so callers can
    # invert or reuse the mappings later.
    le_education = LabelEncoder()
    le_home = LabelEncoder()
    le_purpose = LabelEncoder()
    for column, encoder in (
        ('education', le_education),
        ('home_ownership', le_home),
        ('loan_purpose', le_purpose),
    ):
        data[column] = encoder.fit_transform(data[column])

    # Separate the features from the target.
    X = data.drop('default', axis=1)
    y = data['default']

    # Standardise all feature columns (including the encoded categoricals).
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(X)

    # Keep the original row index: without it X_scaled would get a fresh
    # RangeIndex while y keeps df's index, so the two would be misaligned
    # whenever df is a filtered, shuffled or split frame.
    X_scaled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

    return X_scaled, y, scaler, le_education, le_home, le_purpose
if __name__ == "__main__":
    # Script entry point: generate the sample dataset, print a short summary
    # and write it to disk as CSV.
    from pathlib import Path

    # Generate the sample data.
    print("生成信贷数据...")
    df = generate_credit_data(10000)

    print(f"数据形状: {df.shape}")
    print("\n数据前5行:")
    print(df.head())

    # Class balance of the target column.
    print("\n违约分布:")
    print(df['default'].value_counts())
    print(f"\n违约率: {df['default'].mean():.2%}")

    # Save the data. to_csv raises OSError if the target directory is
    # missing, so create it first (the path is relative to the current
    # working directory).
    output_path = 'credit_risk_system/data/credit_data.csv'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print("\n数据已保存到 credit_risk_system/data/credit_data.csv")