You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
129 lines
4.2 KiB
129 lines
4.2 KiB
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
|
def generate_credit_data(n_samples=10000, seed=42):
    """Generate a simulated credit dataset.

    Parameters
    ----------
    n_samples : int, default 10000
        Number of rows (loan applications) to generate.
    seed : int or None, default 42
        Value passed to ``np.random.seed`` before any sampling. The default
        reproduces the historical, fixed-seed output of this function.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns:

        - age: age, clipped to [18, 80]
        - income: annual income, clipped to [10000, 500000]
        - employment_length: years of employment, clipped to [0, 40]
        - loan_amount: loan amount, clipped to [1000, 200000]
        - credit_score: credit score, clipped to [300, 850]
        - debt_to_income: debt-to-income ratio (Beta(2, 5) draw)
        - num_credit_lines: number of credit lines, clipped to [0, 15]
        - education: education level (categorical string)
        - home_ownership: home ownership status (categorical string)
        - loan_purpose: purpose of the loan (categorical string)
        - default: 0/1 default label

    Notes
    -----
    This reseeds NumPy's *global* random state as a side effect. The order of
    the sampling calls below determines the generated values, so it must not
    be changed if reproducibility with earlier outputs matters.
    """
    np.random.seed(seed)

    # --- Numeric features -------------------------------------------------
    age = np.random.normal(35, 10, n_samples)
    age = np.clip(age, 18, 80)

    income = np.random.lognormal(10, 0.5, n_samples)
    income = np.clip(income, 10000, 500000)

    employment_length = np.random.exponential(2, n_samples)
    employment_length = np.clip(employment_length, 0, 40)

    loan_amount = np.random.lognormal(9, 0.8, n_samples)
    loan_amount = np.clip(loan_amount, 1000, 200000)

    credit_score = np.random.normal(650, 100, n_samples)
    credit_score = np.clip(credit_score, 300, 850)

    debt_to_income = np.random.beta(2, 5, n_samples)

    num_credit_lines = np.random.poisson(3, n_samples)
    num_credit_lines = np.clip(num_credit_lines, 0, 15)

    # --- Categorical features ---------------------------------------------
    education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
    education = np.random.choice(
        education_levels, n_samples, p=[0.3, 0.4, 0.2, 0.1]
    )

    ownership_types = ['Rent', 'Mortgage', 'Own', 'Other']
    home_ownership = np.random.choice(
        ownership_types, n_samples, p=[0.3, 0.4, 0.25, 0.05]
    )

    purpose_types = [
        'Debt Consolidation',
        'Home Improvement',
        'Business',
        'Personal',
        'Medical',
    ]
    loan_purpose = np.random.choice(
        purpose_types, n_samples, p=[0.4, 0.2, 0.15, 0.15, 0.1]
    )

    # --- Target variable --------------------------------------------------
    # Simplified model: a linear score over the numeric features plus
    # Gaussian noise, squashed through the logistic function below.
    default_score = (
        -0.02 * age
        + -0.00001 * income
        + -0.1 * employment_length
        + 0.000005 * loan_amount
        + -0.005 * credit_score
        + 2 * debt_to_income
        + 0.05 * num_credit_lines
        + np.random.normal(0, 0.5, n_samples)
    )

    # Logistic transform turns the score into a default probability.
    default_prob = 1 / (1 + np.exp(-default_score))

    # Draw the actual 0/1 default label from that per-row probability.
    default = np.random.binomial(1, default_prob, n_samples)

    data = pd.DataFrame({
        'age': age,
        'income': income,
        'employment_length': employment_length,
        'loan_amount': loan_amount,
        'credit_score': credit_score,
        'debt_to_income': debt_to_income,
        'num_credit_lines': num_credit_lines,
        'education': education,
        'home_ownership': home_ownership,
        'loan_purpose': loan_purpose,
        'default': default,
    })

    return data
|
|
|
|
def preprocess_data(df):
    """Encode categorical columns and standardize the feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``education``, ``home_ownership``,
        ``loan_purpose`` and ``default`` plus the remaining feature columns.
        The input is copied and never modified.

    Returns
    -------
    tuple
        ``(X_scaled, y, scaler, le_education, le_home, le_purpose)`` where

        - ``X_scaled`` is a DataFrame of the scaled features, with the same
          columns and the same index as the input features;
        - ``y`` is the ``default`` column;
        - ``scaler`` is the fitted ``StandardScaler``;
        - ``le_education``, ``le_home``, ``le_purpose`` are the fitted
          ``LabelEncoder`` objects for the three categorical columns.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = df.copy()

    # One encoder per categorical column, so each can be reused/inverted
    # independently by the caller.
    le_education = LabelEncoder()
    le_home = LabelEncoder()
    le_purpose = LabelEncoder()

    data['education'] = le_education.fit_transform(data['education'])
    data['home_ownership'] = le_home.fit_transform(data['home_ownership'])
    data['loan_purpose'] = le_purpose.fit_transform(data['loan_purpose'])

    # Split features from the target.
    X = data.drop('default', axis=1)
    y = data['default']

    # Note: every feature column is scaled, including the label-encoded
    # categorical ones.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=X.columns,
        # Keep the original index so X_scaled stays row-aligned with y even
        # when df does not have a default RangeIndex (e.g. a filtered subset).
        index=X.index,
    )

    return X_scaled, y, scaler, le_education, le_home, le_purpose
|
|
|
|
if __name__ == "__main__":
    import os

    # Generate the sample data and print a short summary.
    print("生成信贷数据...")
    df = generate_credit_data(10000)
    print(f"数据形状: {df.shape}")
    print("\n数据前5行:")
    print(df.head())
    print("\n违约分布:")
    print(df['default'].value_counts())
    print(f"\n违约率: {df['default'].mean():.2%}")

    # Save the data. The target directory is created first because to_csv
    # does not create missing parent directories.
    output_path = 'credit_risk_system/data/credit_data.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\n数据已保存到 {output_path}")