|
|
|
|
@ -0,0 +1,73 @@
|
|
|
|
|
import os
|
|
|
|
|
import pickle
|
|
|
|
|
import numpy as np
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_pkl_files(input_dir, output_path="feature_dataset.pkl"):
    """Merge the feature/label pickles found in a directory into one pickle.

    Every ``*.pkl`` file directly inside ``input_dir`` is expected to hold a
    mapping with a ``"matrix"`` entry (2-D features, one row per sample) and
    a ``"label"`` entry (one label per sample).  Files that cannot be read,
    lack either key, or are inconsistent with the data collected so far are
    reported on stdout and skipped.  The merged result is pickled to
    ``output_path`` as ``{"matrix": <2-D array>, "label": <1-D int64 array>}``.

    Args:
        input_dir: Directory that is searched (non-recursively) for
            ``*.pkl`` files.
        output_path: Destination of the merged pickle.  If it lies inside
            ``input_dir`` it is excluded from the inputs, so re-running the
            merge does not fold an earlier result back into itself.

    Raises:
        FileNotFoundError: ``input_dir`` contains no ``*.pkl`` file at all.
        ValueError: No file produced usable data.
    """
    all_features = []
    all_labels = []

    # Sorted so the row order of the merged dataset is reproducible; the
    # order returned by glob() depends on the platform and filesystem.
    pkl_files = sorted(Path(input_dir).glob("*.pkl"))
    if not pkl_files:
        raise FileNotFoundError(f"在目录 {input_dir} 中未找到PKL文件")

    # Skip known result files and, additionally, the file this call is about
    # to write: a custom output name inside input_dir would otherwise be
    # treated as an input on the next run and duplicate every sample.
    excluded_names = {"feature_dataset.pkl", "infer_results.pkl"}
    output_file = Path(output_path).resolve()
    pkl_files = [
        f
        for f in pkl_files
        if f.name not in excluded_names and f.resolve() != output_file
    ]
    print(f"发现 {len(pkl_files)} 个有效PKL文件,开始合并...")

    for file in pkl_files:
        try:
            # NOTE(security): pickle.load can execute arbitrary code while
            # deserializing; only point this at directories you trust.
            with open(file, "rb") as f:
                data = pickle.load(f)

            if "matrix" not in data or "label" not in data:
                print(f"跳过 {file.name}:缺少'matrix'或'label'字段")
                continue

            # asarray lets plain lists through as well as ndarrays.
            features = np.asarray(data["matrix"])
            # Flatten labels to a 1-D int64 vector so that (n, 1) columns and
            # flat vectors can be concatenated with each other.
            # NOTE(review): astype truncates non-integral labels silently.
            labels = np.asarray(data["label"]).ravel().astype(np.int64)

            # Reject anything that is not (samples, dims) *before* storing
            # it; np.vstack would otherwise turn a 1-D array into a single
            # row and desynchronize features and labels.
            if features.ndim != 2:
                print(f"跳过 {file.name}:特征矩阵应为二维,实际为{features.ndim}维")
                continue

            # Features and labels must describe the same number of samples.
            if len(features) != len(labels):
                print(f"跳过 {file.name}:特征({len(features)})与标签({len(labels)})数量不匹配")
                continue

            # All files must share the feature width of the first accepted one.
            if all_features and features.shape[1] != all_features[0].shape[1]:
                print(f"跳过 {file.name}:特征维度与已有数据不一致(现有{all_features[0].shape[1]}维,当前{features.shape[1]}维)")
                continue

            all_features.append(features)
            all_labels.append(labels)
            print(f"已加载 {file.name}:{len(features)} 条样本(特征{features.shape[1]}维)")

        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole merge, so report it and move on.
            print(f"处理 {file.name} 时出错:{str(e)},已跳过")

    if not all_features:
        raise ValueError("没有有效数据可合并,请检查输入文件")

    # Stack the per-file blocks into one feature matrix / label vector.
    merged_matrix = np.vstack(all_features)
    merged_label = np.concatenate(all_labels, axis=0)

    print("\n合并结果:")
    print(f"总样本数:{len(merged_matrix)}")
    print(f"特征矩阵形状:{merged_matrix.shape}")
    # np.bincount only accepts non-negative integers; fall back to an explicit
    # value -> count table so negative labels cannot abort the run right
    # before the result is written.
    if merged_label.size == 0 or merged_label.min() >= 0:
        print(f"标签分布:{np.bincount(merged_label)} (索引对应标签值)")
    else:
        values, counts = np.unique(merged_label, return_counts=True)
        distribution = dict(zip(values.tolist(), counts.tolist()))
        print(f"标签分布:{distribution} (标签值: 样本数)")

    with open(output_path, "wb") as f:
        pickle.dump({"matrix": merged_matrix, "label": merged_label}, f)

    print(f"\n已成功保存至 {output_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: merge every feature pickle in the working folder
    # and write the combined dataset next to the inputs.
    INPUT_DIRECTORY = r"D:\SummerSchool\mat_cv\mat_cv-02"
    # Output path: a file inside the input directory itself.
    OUTPUT_FILE = INPUT_DIRECTORY + r"\feature_dataset00.pkl"
    merge_pkl_files(input_dir=INPUT_DIRECTORY, output_path=OUTPUT_FILE)
|