import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


def generate_credit_data(n_samples=10000, seed=42):
    """Generate a simulated credit dataset.

    The returned frame has one row per simulated applicant and these columns:

    - age: age, drawn from a normal distribution and clipped to [18, 80]
    - income: annual income, log-normal, clipped to [10000, 500000]
    - employment_length: years employed, exponential, clipped to [0, 40]
    - loan_amount: loan amount, log-normal, clipped to [1000, 200000]
    - credit_score: credit score, normal, clipped to [300, 850]
    - debt_to_income: debt-to-income ratio, Beta(2, 5)
    - num_credit_lines: number of credit lines, Poisson, clipped to [0, 15]
    - education: education level (categorical string)
    - home_ownership: home ownership status (categorical string)
    - loan_purpose: loan purpose (categorical string)
    - default: 0/1 label sampled from a logistic model of the numeric features

    Args:
        n_samples: Number of rows to generate.
        seed: Value passed to ``np.random.seed``. The default (42) reproduces
            the historical output of this function.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and the columns above.

    Note:
        This function reseeds NumPy's *global* random state as a side effect.
    """
    np.random.seed(seed)

    # Generate the features. The order of the draws is significant: changing
    # it would change the data produced for a given seed.
    age = np.random.normal(35, 10, n_samples)
    age = np.clip(age, 18, 80)

    income = np.random.lognormal(10, 0.5, n_samples)
    income = np.clip(income, 10000, 500000)

    employment_length = np.random.exponential(2, n_samples)
    employment_length = np.clip(employment_length, 0, 40)

    loan_amount = np.random.lognormal(9, 0.8, n_samples)
    loan_amount = np.clip(loan_amount, 1000, 200000)

    credit_score = np.random.normal(650, 100, n_samples)
    credit_score = np.clip(credit_score, 300, 850)

    debt_to_income = np.random.beta(2, 5, n_samples)

    num_credit_lines = np.random.poisson(3, n_samples)
    num_credit_lines = np.clip(num_credit_lines, 0, 15)

    education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
    education = np.random.choice(
        education_levels, n_samples, p=[0.3, 0.4, 0.2, 0.1]
    )

    ownership_types = ['Rent', 'Mortgage', 'Own', 'Other']
    home_ownership = np.random.choice(
        ownership_types, n_samples, p=[0.3, 0.4, 0.25, 0.05]
    )

    purpose_types = [
        'Debt Consolidation',
        'Home Improvement',
        'Business',
        'Personal',
        'Medical',
    ]
    loan_purpose = np.random.choice(
        purpose_types, n_samples, p=[0.4, 0.2, 0.15, 0.15, 0.1]
    )

    # Target variable: a simplified linear score over the numeric features
    # plus Gaussian noise ...
    default_prob = (
        -0.02 * age +
        -0.00001 * income +
        -0.1 * employment_length +
        0.000005 * loan_amount +
        -0.005 * credit_score +
        2 * debt_to_income +
        0.05 * num_credit_lines +
        np.random.normal(0, 0.5, n_samples)
    )

    # ... squashed through the logistic function to obtain a probability ...
    default_prob = 1 / (1 + np.exp(-default_prob))

    # ... from which the actual 0/1 default label is sampled.
    default = np.random.binomial(1, default_prob, n_samples)

    data = pd.DataFrame({
        'age': age,
        'income': income,
        'employment_length': employment_length,
        'loan_amount': loan_amount,
        'credit_score': credit_score,
        'debt_to_income': debt_to_income,
        'num_credit_lines': num_credit_lines,
        'education': education,
        'home_ownership': home_ownership,
        'loan_purpose': loan_purpose,
        'default': default,
    })

    return data


def preprocess_data(df):
    """Encode the categorical columns and standardise all features.

    Args:
        df: DataFrame containing the feature columns plus a ``default``
            target column. It is not modified; a copy is processed.

    Returns:
        A tuple ``(X_scaled, y, scaler, le_education, le_home, le_purpose)``:
        the standardised feature DataFrame, the target Series, the fitted
        ``StandardScaler`` and the three fitted ``LabelEncoder`` objects.
        ``X_scaled`` carries the same index as ``y`` so the two stay
        row-aligned even when ``df`` does not have a default RangeIndex.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = df.copy()

    # Encode the categorical variables as integers, one encoder per column.
    le_education = LabelEncoder()
    le_home = LabelEncoder()
    le_purpose = LabelEncoder()

    data['education'] = le_education.fit_transform(data['education'])
    data['home_ownership'] = le_home.fit_transform(data['home_ownership'])
    data['loan_purpose'] = le_purpose.fit_transform(data['loan_purpose'])

    # Separate the features from the target variable.
    X = data.drop('default', axis=1)
    y = data['default']

    # Standardise every feature column (this includes the integer-encoded
    # categorical columns, not only the originally numeric ones).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Re-attach the original index: the scaler returns a bare ndarray, and a
    # fresh RangeIndex would no longer line up with ``y``.
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    return X_scaled, y, scaler, le_education, le_home, le_purpose


def main():
    """Generate a sample dataset, print a short summary and save it as CSV."""
    output_path = 'credit_risk_system/data/credit_data.csv'

    # Generate the sample data.
    print("生成信贷数据...")
    df = generate_credit_data(10000)
    print(f"数据形状: {df.shape}")
    print("\n数据前5行:")
    print(df.head())
    print("\n违约分布:")
    print(df['default'].value_counts())
    print(f"\n违约率: {df['default'].mean():.2%}")

    # Save the data; create the target directory first so a fresh checkout
    # does not fail on a missing folder.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\n数据已保存到 {output_path}")


if __name__ == "__main__":
    main()