# project/credit_risk_system/data/data_generator.py

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

def generate_credit_data(n_samples=10000, random_state=42):
    """
    Generate a simulated credit dataset.

    Each feature is drawn independently from a fixed distribution. A linear
    score of the numeric features plus Gaussian noise is passed through the
    logistic function to obtain a default probability, and the binary
    ``default`` label is sampled from that probability.

    Columns of the returned frame:
    - age: age, clipped to [18, 80]
    - income: annual income, clipped to [10000, 500000]
    - employment_length: years of employment, clipped to [0, 40]
    - loan_amount: loan amount, clipped to [1000, 200000]
    - credit_score: credit score, clipped to [300, 850]
    - debt_to_income: debt-to-income ratio, in [0, 1]
    - num_credit_lines: number of credit lines, clipped to [0, 15]
    - education: education level
    - home_ownership: home ownership status
    - loan_purpose: purpose of the loan
    - default: 0/1 default label

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the random draws. May be an int, ``None``
            (non-deterministic seeding), or an existing
            ``np.random.RandomState`` instance. The default of 42 yields the
            same data as seeding the global NumPy generator with 42.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and the 11 columns
        listed above.
    """
    # Use a private generator so that calling this function does not reseed
    # (and thereby clobber) the process-wide NumPy random state.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw the features. The order of the draws determines the random stream,
    # so it must not be rearranged if reproducibility is to be preserved.
    age = rng.normal(35, 10, n_samples)
    age = np.clip(age, 18, 80)

    income = rng.lognormal(10, 0.5, n_samples)
    income = np.clip(income, 10000, 500000)

    employment_length = rng.exponential(2, n_samples)
    employment_length = np.clip(employment_length, 0, 40)

    loan_amount = rng.lognormal(9, 0.8, n_samples)
    loan_amount = np.clip(loan_amount, 1000, 200000)

    credit_score = rng.normal(650, 100, n_samples)
    credit_score = np.clip(credit_score, 300, 850)

    debt_to_income = rng.beta(2, 5, n_samples)

    num_credit_lines = rng.poisson(3, n_samples)
    num_credit_lines = np.clip(num_credit_lines, 0, 15)

    education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
    education = rng.choice(education_levels, n_samples, p=[0.3, 0.4, 0.2, 0.1])

    ownership_types = ['Rent', 'Mortgage', 'Own', 'Other']
    home_ownership = rng.choice(ownership_types, n_samples, p=[0.3, 0.4, 0.25, 0.05])

    purpose_types = ['Debt Consolidation', 'Home Improvement', 'Business', 'Personal', 'Medical']
    loan_purpose = rng.choice(purpose_types, n_samples, p=[0.4, 0.2, 0.15, 0.15, 0.1])

    # Simplified linear risk score (log-odds of default) plus Gaussian noise.
    # The categorical features do not enter the score.
    default_score = (
        -0.02 * age +
        -0.00001 * income +
        -0.1 * employment_length +
        0.000005 * loan_amount +
        -0.005 * credit_score +
        2 * debt_to_income +
        0.05 * num_credit_lines +
        rng.normal(0, 0.5, n_samples)
    )

    # Logistic function maps the score to a probability in (0, 1).
    default_prob = 1 / (1 + np.exp(-default_score))

    # Sample the observed 0/1 default label from that probability.
    default = rng.binomial(1, default_prob, n_samples)

    return pd.DataFrame({
        'age': age,
        'income': income,
        'employment_length': employment_length,
        'loan_amount': loan_amount,
        'credit_score': credit_score,
        'debt_to_income': debt_to_income,
        'num_credit_lines': num_credit_lines,
        'education': education,
        'home_ownership': home_ownership,
        'loan_purpose': loan_purpose,
        'default': default,
    })

def preprocess_data(df):
    """
    Encode the categorical columns and standardize the feature matrix.

    The input frame is copied, so the caller's data is not modified. The
    'education', 'home_ownership' and 'loan_purpose' columns are replaced by
    integer codes from a ``LabelEncoder`` each, the 'default' column is split
    off as the target, and all remaining columns are standardized with a
    single ``StandardScaler``.

    Args:
        df: DataFrame containing the 'education', 'home_ownership',
            'loan_purpose' and 'default' columns; every other column is
            passed to the scaler as a feature.

    Returns:
        A tuple ``(X_scaled, y, scaler, le_education, le_home, le_purpose)``:
        the standardized feature frame (same columns and index as the input
        features), the target series, the fitted scaler, and the three fitted
        label encoders.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = df.copy()

    # Encode the categorical variables as integer codes.
    le_education = LabelEncoder()
    le_home = LabelEncoder()
    le_purpose = LabelEncoder()

    data['education'] = le_education.fit_transform(data['education'])
    data['home_ownership'] = le_home.fit_transform(data['home_ownership'])
    data['loan_purpose'] = le_purpose.fit_transform(data['loan_purpose'])

    # Separate features and target.
    X = data.drop('default', axis=1)
    y = data['default']

    # Standardize the features. NOTE: the scaler is fit on every feature
    # column, including the label-encoded categorical ones.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Keep the original index: without it the frame would get a fresh
    # RangeIndex and no longer align with ``y`` when ``df`` has a
    # non-default index (e.g. after filtering or shuffling).
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    return X_scaled, y, scaler, le_education, le_home, le_purpose

if __name__ == "__main__":
    import os

    # Generate a sample dataset and print a short summary of it.
    print("生成信贷数据...")
    df = generate_credit_data(10000)
    print(f"数据形状: {df.shape}")
    print("\n数据前5行:")
    print(df.head())
    print("\n违约分布:")
    print(df['default'].value_counts())
    print(f"\n违约率: {df['default'].mean():.2%}")

    # Save the data. The path is relative to the current working directory;
    # to_csv does not create missing directories, so create the target
    # directory first instead of failing when it does not exist yet.
    output_path = 'credit_risk_system/data/credit_data.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\n数据已保存到 {output_path}")