import random
import json
import os
import statistics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib

# Matplotlib configuration so Chinese labels render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font containing CJK glyphs
plt.rcParams['axes.unicode_minus'] = False    # keep "-" rendering with a CJK font


# ===================== Data generation =====================
def generate_student_consume_data(
        n_train=100,
        n_test=5,
        save_paths=None,
        test_path="student_consume_test.csv"
):
    """
    Generate simulated student consumption data (with missing values and outliers).

    :param n_train: number of training rows
    :param n_test: number of test rows
    :param save_paths: dict with keys "csv"/"excel"/"json" giving the training-data
                       save paths; None selects the standard default file names
                       (fix: the original used a shared mutable dict default)
    :param test_path: test-data save path
    :return: (save_paths, test_path)
    """
    if save_paths is None:
        save_paths = {
            "csv": "student_consume_train.csv",
            "excel": "student_consume_train.xlsx",
            "json": "student_consume_train.json",
        }

    # Basic identity columns.
    total = n_train + n_test
    student_ids = [f"2024{str(i).zfill(6)}" for i in range(1, total + 1)]
    names = [f"学生{j}" for j in range(1, total + 1)]

    # Build 10 consumption records per student; normal range is 10-50 CNY,
    # with a 10% chance of a missing value and (otherwise) a 5% chance of an
    # outlier (< 0 or > 200).
    consume_data = []
    for sid, name in zip(student_ids, names):
        row = [sid, name]
        for _ in range(10):
            if random.random() < 0.1:
                row.append(None)
            elif random.random() < 0.05:
                row.append(random.choice([random.randint(-50, 0),
                                          random.randint(200, 500)]))
            else:
                row.append(round(random.uniform(10, 50), 2))
        consume_data.append(row)

    # Split into train/test partitions.
    train_data = consume_data[:n_train]
    test_data = consume_data[n_train:]
    cols = ["学号", "姓名"] + [f"消费{i + 1}" for i in range(10)]

    # Persist the training data in three formats (CSV, Excel, nested JSON).
    df_train = pd.DataFrame(train_data, columns=cols)
    df_train.to_csv(save_paths["csv"], index=False)
    df_train.to_excel(save_paths["excel"], index=False)
    json_train = [
        {"student_info": {"学号": row[0], "姓名": row[1]},
         "consume_records": row[2:]}
        for row in train_data
    ]
    with open(save_paths["json"], "w", encoding="utf-8") as f:
        json.dump(json_train, f, ensure_ascii=False, indent=2)

    # Persist the test data.
    df_test = pd.DataFrame(test_data, columns=cols)
    df_test.to_csv(test_path, index=False)

    print(f"模拟数据生成完成!训练数据{len(train_data)}条,测试数据{len(test_data)}条")
    return save_paths, test_path


# ===================== Data import =====================
def import_csv_data(file_path):
    """Import a CSV file; missing cells become None. Returns a DataFrame."""
    if not os.path.exists(file_path):
        print(f"错误:文件{file_path}不存在!")
        return pd.DataFrame()
    if os.path.getsize(file_path) == 0:
        print(f"提示:文件{file_path}为空!")
        return pd.DataFrame()
    df = pd.read_csv(file_path, na_values=["", "NaN", "None"], keep_default_na=False)
    return df.where(pd.notna(df), None)  # normalise every NA marker to None


def import_excel_data(file_path):
    """Import an Excel file (xls/xlsx); first row is the header, missing cells become None."""
    if not os.path.exists(file_path):
        print(f"错误:文件{file_path}不存在!")
        return pd.DataFrame()
    if os.path.getsize(file_path) == 0:
        print(f"提示:文件{file_path}为空!")
        return pd.DataFrame()
    df = pd.read_excel(file_path, header=0)  # first row as header
    return df.where(pd.notna(df), None)


def import_json_data(file_path):
    """Import a JSON file, flattening the nested fields; missing values become None."""
    if not os.path.exists(file_path):
        print(f"错误:文件{file_path}不存在!")
        return {}
    if os.path.getsize(file_path) == 0:
        print(f"提示:文件{file_path}为空!")
        return {}
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Flatten {"student_info": {...}, "consume_records": [...]} items.
    parsed_data = [
        {
            "学号": item.get("student_info", {}).get("学号", None),
            "姓名": item.get("student_info", {}).get("姓名", None),
            "消费记录": item.get("consume_records", [None] * 10),
        }
        for item in data
    ]
    return {"data": parsed_data}


# ===================== Data cleaning =====================
def remove_duplicates(data, key_col="学号"):
    """
    Drop duplicate rows by primary key, keeping the first occurrence.

    :param data: DataFrame, or dict of the form {"data": [...]}
    :param key_col: primary-key column name
    :return: deduplicated data (input returned unchanged on error)
    """
    if isinstance(data, pd.DataFrame):
        if key_col not in data.columns:
            print(f"错误:主键列{key_col}不存在!")
            return data
        cleaned = data.drop_duplicates(subset=[key_col], keep="first")
        n_before, n_after = len(data), len(cleaned)
    elif isinstance(data, dict) and "data" in data:
        seen_keys = set()
        cleaned_list = []
        for item in data["data"]:
            key = item.get(key_col, None)
            # Rows without a key cannot be deduplicated — always keep them.
            if key is None or key not in seen_keys:
                if key is not None:
                    seen_keys.add(key)
                cleaned_list.append(item)
        cleaned = {"data": cleaned_list}
        n_before, n_after = len(data["data"]), len(cleaned_list)
    else:
        print("错误:不支持的数据类型!")
        return data
    print(f"去重完成!原数据{n_before}条,清洗后{n_after}条")
    return cleaned


def fill_missing_values(data, strategy="mean", key_col="学号"):
    """
    Fill missing consumption values.

    :param data: DataFrame, or dict of the form {"data": [...]}
    :param strategy: fill strategy — "mean", "mode" or "ffill"
                     (docstring fix: the original advertised "median" but
                     implemented "mode")
    :param key_col: primary-key column (kept aside together with 姓名)
    :return: filled data (input returned unchanged on error)
    """
    if isinstance(data, pd.DataFrame):
        non_numeric_cols = [c for c in data.columns if c in (key_col, "姓名")]
        numeric_cols = [c for c in data.columns if c not in non_numeric_cols]
        df_numeric = data[numeric_cols].copy()

        if strategy == "mean":
            df_filled = df_numeric.fillna(df_numeric.astype(float).mean())
        elif strategy == "mode":
            df_filled = df_numeric.fillna(df_numeric.astype(float).mode().iloc[0])
        elif strategy == "ffill":
            # .ffill() replaces the deprecated fillna(method="ffill").
            df_filled = df_numeric.ffill()
        else:
            print(f"错误:不支持的填充策略{strategy}!")
            return data
        cleaned = pd.concat(
            [data[non_numeric_cols].reset_index(drop=True),
             df_filled.reset_index(drop=True)],
            axis=1,
        )
    elif isinstance(data, dict) and "data" in data:
        cleaned_list = []
        for item in data["data"]:
            consume_records = item.get("消费记录", [None] * 10)
            valid_vals = [v for v in consume_records
                          if v is not None and isinstance(v, (int, float))]
            if not valid_vals:
                filled_records = [0.0] * 10  # no usable value at all -> zeros
            elif strategy == "mean":
                fill_val = np.mean(valid_vals)
                filled_records = [fill_val if v is None else v for v in consume_records]
            elif strategy == "mode":
                # fix: np.mode does not exist (AttributeError); use statistics.mode
                fill_val = statistics.mode(valid_vals)
                filled_records = [fill_val if v is None else v for v in consume_records]
            elif strategy == "ffill":
                filled_records = []
                prev_val = 0.0  # values before the first valid entry become 0.0
                for val in consume_records:
                    if val is None or not isinstance(val, (int, float)):
                        filled_records.append(prev_val)
                    else:
                        filled_records.append(val)
                        prev_val = val
            else:
                print(f"错误:不支持的填充策略{strategy}!")
                return data
            cleaned_list.append({
                "学号": item.get("学号"),
                "姓名": item.get("姓名"),
                "消费记录": filled_records,
            })
        cleaned = {"data": cleaned_list}
    else:
        print("错误:不支持的数据类型!")
        return data
    print(f"缺失值填充完成(策略:{strategy})")
    return cleaned


def handle_outliers(data, strategy="mean", key_col="学号", alpha=3):
    """
    Detect outliers with the k-sigma rule (value outside mean ± alpha*std) and
    handle them.

    :param data: DataFrame, or dict of the form {"data": [...]}
    :param strategy: "drop" (remove the whole row), "mean", "mode" or "ffill"
    :param key_col: primary-key column
    :param alpha: sigma multiplier (3 gives the classic 3σ rule)
    :return: cleaned data (input returned unchanged on error)
    """
    if isinstance(data, pd.DataFrame):
        non_numeric_cols = [c for c in data.columns if c in (key_col, "姓名")]
        numeric_cols = [c for c in data.columns if c not in non_numeric_cols]
        df_numeric = data[numeric_cols].astype(float).copy()

        # Per-column k-sigma bounds.
        mean_vals = df_numeric.mean()
        std_vals = df_numeric.std()
        lower_bound = mean_vals - alpha * std_vals
        upper_bound = mean_vals + alpha * std_vals
        outliers_mask = (df_numeric < lower_bound) | (df_numeric > upper_bound)

        if strategy == "drop":
            # Drop every row containing at least one outlier.
            row_mask = outliers_mask.any(axis=1)
            df_cleaned = df_numeric[~row_mask]
            non_numeric_cleaned = data[non_numeric_cols][~row_mask].reset_index(drop=True)
        else:
            df_cleaned = df_numeric.copy()
            for col in numeric_cols:
                valid_vals = df_cleaned[col][~outliers_mask[col]]
                if len(valid_vals) == 0:
                    fill_val = 0.0
                elif strategy == "mean":
                    fill_val = valid_vals.mean()
                elif strategy == "mode":
                    fill_val = valid_vals.mode().iloc[0]
                elif strategy == "ffill":
                    # .ffill() replaces the deprecated fillna(method="ffill").
                    df_cleaned[col] = df_cleaned[col].mask(outliers_mask[col]).ffill()
                    continue
                else:
                    print(f"错误:不支持的异常值处理策略{strategy}!")
                    return data
                # fix: fill only this column's outliers — the original masked the
                # whole frame each pass, so every outlier ended up with the LAST
                # column's fill value.
                df_cleaned[col] = df_cleaned[col].mask(outliers_mask[col], fill_val)
            non_numeric_cleaned = data[non_numeric_cols].reset_index(drop=True)
        cleaned = pd.concat([non_numeric_cleaned, df_cleaned.reset_index(drop=True)],
                            axis=1)
    elif isinstance(data, dict) and "data" in data:
        cleaned_list = []
        for item in data["data"]:
            consume_records = item.get("消费记录", [])
            valid_vals = [v for v in consume_records
                          if v is not None and isinstance(v, (int, float))]
            if not valid_vals:
                cleaned_list.append(item)
                continue
            # Row-local k-sigma bounds (np.std is the population std, ddof=0).
            mean_val = np.mean(valid_vals)
            std_val = np.std(valid_vals)
            lower = mean_val - alpha * std_val
            upper = mean_val + alpha * std_val

            cleaned_records = []
            has_outlier = False
            prev_val = mean_val  # seed for ffill when the first value is an outlier
            for val in consume_records:
                if val is None:
                    cleaned_records.append(val)
                    continue
                if lower <= val <= upper:
                    cleaned_records.append(val)
                    prev_val = val
                    continue
                has_outlier = True
                if strategy == "drop":
                    break
                elif strategy == "mean":
                    cleaned_records.append(mean_val)
                elif strategy == "mode":
                    # fix: np.mode does not exist; use statistics.mode
                    cleaned_records.append(statistics.mode(valid_vals))
                elif strategy == "ffill":
                    cleaned_records.append(prev_val)
                else:
                    cleaned_records.append(val)  # unknown strategy: keep as-is
            # fix: the original appended NOTHING when strategy == "drop",
            # silently discarding every row; only rows that actually contain an
            # outlier should be dropped.
            if strategy == "drop" and has_outlier:
                continue
            cleaned_list.append({
                "学号": item.get("学号"),
                "姓名": item.get("姓名"),
                "消费记录": cleaned_records,
            })
        cleaned = {"data": cleaned_list}
    else:
        print("错误:不支持的数据类型!")
        return data
    print(f"异常值处理完成(策略:{strategy},3σ准则)")
    return cleaned


# ===================== Data analysis =====================
def statistical_analysis(data, key_col="学号"):
    """
    Per-student statistics: number of consumptions, mean and standard deviation.

    :param data: cleaned DataFrame
    :param key_col: primary-key column
    :return: DataFrame with columns 学号/姓名/消费次数/消费均值/消费标准差
    """
    if not isinstance(data, pd.DataFrame):
        print("错误:仅支持DataFrame类型数据的统计分析!")
        return pd.DataFrame()
    consume_cols = [c for c in data.columns if "消费" in c]
    stats = []
    for _, row in data.iterrows():
        # Keep only real numbers; also guard against NaN slipping through
        # cleaning (NaN is a float and would poison the mean).
        consume_vals = [v for v in row[consume_cols]
                        if isinstance(v, (int, float)) and not pd.isna(v)]
        count = len(consume_vals)
        stats.append({
            "学号": row[key_col],
            "姓名": row["姓名"],
            "消费次数": count,
            "消费均值": round(np.mean(consume_vals), 2) if count > 0 else 0.0,
            # population std (np.std, ddof=0); needs at least 2 values
            "消费标准差": round(np.std(consume_vals), 2) if count > 1 else 0.0,
        })
    print("统计特征分析完成!")
    return pd.DataFrame(stats)


def plot_top10_bar(stats_df, save_dir="plots"):
    """
    Draw and save Top-10 bar charts for count / mean / std of consumption.

    :param stats_df: statistics DataFrame from statistical_analysis()
    :param save_dir: output directory for the PNG files
    """
    os.makedirs(save_dir, exist_ok=True)
    # (column, chart title, y label, bar colour, output file name)
    charts = [
        ("消费次数", "学生消费次数Top10", "消费次数", "skyblue", "top10_count.png"),
        ("消费均值", "学生消费均值Top10", "消费均值(元)", "lightcoral", "top10_mean.png"),
        ("消费标准差", "学生消费标准差Top10", "消费标准差(元)", "lightgreen", "top10_std.png"),
    ]
    for col, title, ylabel, color, fname in charts:
        top10 = stats_df.nlargest(10, col)
        plt.figure(figsize=(12, 4))
        plt.bar(top10["姓名"], top10[col], color=color)
        plt.title(title)
        plt.xlabel("学生姓名")
        plt.ylabel(ylabel)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, fname))
        plt.show()
    print(f"Top10柱状图已保存至{save_dir}目录!")


def plot_pca_scatter(stats_df, save_dir="plots"):
    """
    Standardise the statistics, reduce to 2D with PCA and save a scatter plot.

    :param stats_df: statistics DataFrame from statistical_analysis()
    :param save_dir: output directory for the PNG file
    """
    os.makedirs(save_dir, exist_ok=True)
    features = ["消费次数", "消费均值", "消费标准差"]
    X = stats_df[features].values
    X_scaled = StandardScaler().fit_transform(X)
    X_pca = PCA(n_components=2).fit_transform(X_scaled)

    plt.figure(figsize=(10, 8))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.7, c="orange")
    # Annotate only the first 20 students to keep the plot readable.
    for i in range(min(20, len(stats_df))):
        plt.annotate(stats_df.iloc[i]["姓名"], (X_pca[i, 0], X_pca[i, 1]), fontsize=8)
    plt.title("学生消费特征PCA降维散点图")
    plt.xlabel("主成分1")
    plt.ylabel("主成分2")
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, "pca_scatter.png"))
    plt.show()
    print(f"PCA散点图已保存至{save_dir}目录!")


# ===================== Student portrait model =====================
def build_portrait_model(stats_df, model_path="student_portrait_model.pkl"):
    """
    Build the student-portrait model (rule-based hardship labels).

    Label rules:
      - 特别困难: mean < 15 and count < 5
      - 困难:     15 <= mean < 20 and count < 7
      - 一般困难: 20 <= mean < 25 or 7 <= count <= 8
      - 不困难:   otherwise

    :param stats_df: statistics DataFrame (modified in place: gains 困难标签)
    :param model_path: where to dump the rule "model" via joblib
    :return: stats_df with the 困难标签 column added
    """
    def get_label(row):
        mean_val = row["消费均值"]
        count = row["消费次数"]
        if mean_val < 15 and count < 5:
            return "特别困难"
        if 15 <= mean_val < 20 and count < 7:
            return "困难"
        if (20 <= mean_val < 25) or (7 <= count <= 8):
            return "一般困难"
        return "不困难"

    stats_df["困难标签"] = stats_df.apply(get_label, axis=1)
    # Simplified "model": persist the rule thresholds plus the observed label
    # distribution (could later be replaced by a trained ML model).
    model = {
        "rules": {
            "特别困难": {"消费均值<15": 15, "消费次数<5": 5},
            "困难": {"15≤消费均值<20": (15, 20), "消费次数<7": 7},
            "一般困难": {"20≤消费均值<25": (20, 25), "7≤消费次数≤8": (7, 8)},
            "不困难": "其他"
        },
        "label_counts": stats_df["困难标签"].value_counts().to_dict()
    }
    joblib.dump(model, model_path)
    print(f"画像模型构建完成!标签分布:{model['label_counts']}")
    print(f"模型已保存至{model_path}")
    return stats_df


def get_aid_strategy(label):
    """Return the targeted aid strategy for a hardship label."""
    strategies = {
        "特别困难": "提供全额助学金+生活补贴+勤工俭学优先安排",
        "困难": "提供半额助学金+生活补贴",
        "一般困难": "提供临时补助+学业辅导",
        "不困难": "无需资助,可推荐奖学金申请"
    }
    return strategies.get(label, "暂无适配的帮扶策略")


def apply_portrait_model(test_data, model_path="student_portrait_model.pkl"):
    """
    Apply the portrait model to test data: label each student and attach an
    aid suggestion.

    :param test_data: cleaned test DataFrame
    :param model_path: path of the joblib-dumped model
    :return: dict keyed by student id with name, stats, label and strategy
    """
    if not os.path.exists(model_path):
        print(f"错误:模型文件{model_path}不存在!")
        return {}
    model = joblib.load(model_path)
    rules = model["rules"]
    test_stats = statistical_analysis(test_data)

    results = {}
    for _, row in test_stats.iterrows():
        sid = row["学号"]
        name = row["姓名"]
        mean_val = row["消费均值"]
        count = row["消费次数"]
        # Unpack the persisted thresholds and apply them in priority order.
        hard_mean = rules["特别困难"]["消费均值<15"]
        hard_count = rules["特别困难"]["消费次数<5"]
        mid_range = rules["困难"]["15≤消费均值<20"]
        mid_count = rules["困难"]["消费次数<7"]
        gen_range = rules["一般困难"]["20≤消费均值<25"]
        gen_counts = rules["一般困难"]["7≤消费次数≤8"]
        if mean_val < hard_mean and count < hard_count:
            label = "特别困难"
        elif mid_range[0] <= mean_val < mid_range[1] and count < mid_count:
            label = "困难"
        elif (gen_range[0] <= mean_val < gen_range[1]) or \
                (gen_counts[0] <= count <= gen_counts[1]):
            label = "一般困难"
        else:
            label = "不困难"
        results[sid] = {
            "姓名": name,
            "消费次数": count,
            "消费均值": mean_val,
            "困难标签": label,
            "帮扶策略": get_aid_strategy(label),
        }
    print("画像模型应用完成!测试数据标签及帮扶策略如下:")
    for sid, info in results.items():
        print(f"学号:{sid},姓名:{info['姓名']},标签:{info['困难标签']},策略:{info['帮扶策略']}")
    return results


# ===================== Main (full pipeline) =====================
def main():
    # 1. Generate simulated data.
    save_paths = {"csv": "student_consume_train.csv",
                  "excel": "student_consume_train.xlsx",
                  "json": "student_consume_train.json"}
    test_path = "student_consume_test.csv"
    generate_student_consume_data(n_train=100, n_test=5,
                                  save_paths=save_paths, test_path=test_path)

    # 2. Import (CSV here; the Excel/JSON importers are interchangeable).
    train_data = import_csv_data(save_paths["csv"])
    if train_data.empty:
        return

    # 3. Clean: dedup -> fill missing (mean) -> handle outliers (mean).
    train_data_dedup = remove_duplicates(train_data)
    train_data_fill = fill_missing_values(train_data_dedup, strategy="mean")
    train_data_clean = handle_outliers(train_data_fill, strategy="mean")

    # 4. Analyse and visualise.
    stats_df = statistical_analysis(train_data_clean)
    plot_top10_bar(stats_df)
    plot_pca_scatter(stats_df)

    # 5. Build the portrait model.
    build_portrait_model(stats_df)

    # 6. Apply it to the (cleaned) test data.
    test_data = import_csv_data(test_path)
    if not test_data.empty:
        test_data_dedup = remove_duplicates(test_data)
        test_data_fill = fill_missing_values(test_data_dedup, strategy="mean")
        test_data_clean = handle_outliers(test_data_fill, strategy="mean")
        apply_portrait_model(test_data_clean)


if __name__ == "__main__":
    main()