# EMCAD/preprocess_synapse_data.py

import os
import shutil
from time import time

import numpy as np
import SimpleITK as sitk
import nibabel as nib
import scipy.ndimage as ndimage
import h5py

splits = ['train', 'test']

# Intensity window applied to every CT volume before it is rescaled to [0, 1].
# The values do not depend on the split, so they are defined once up front.
upper = 275
lower = -125


def split_paths(split):
    """Return ``(ct_path, seg_path, save_path)`` for the given split.

    ``'train'`` maps to the TrainSet folders and the per-slice ``.npz`` output
    directory; any other value maps to the TestSet folders and the per-volume
    HDF5 output directory.

    The raw-data folders are expected to contain ``img`` and ``label``
    sub-folders. If every file sits directly in the TrainSet/TestSet root
    instead, move the images into ``img`` and the labels into ``label`` by
    hand before running this script.
    """
    if split == 'train':
        return (
            './data/synapse/Abdomen/RawData/TrainSet/img',
            './data/synapse/Abdomen/RawData/TrainSet/label',
            './data/synapse/train_npz/',  # standard output location
        )
    return (
        './data/synapse/Abdomen/RawData/TestSet/img',
        './data/synapse/Abdomen/RawData/TestSet/label',
        './data/synapse/test_vol_h5/',  # standard output location
    )


def normalize_ct(volume, low, high):
    """Clip ``volume`` to ``[low, high]`` and rescale it linearly to ``[0, 1]``."""
    return (np.clip(volume, low, high) - low) / (high - low)


def case_stem(ct_file):
    """Map an image file name to its case name, e.g. ``img0001.nii.gz`` -> ``case0001``."""
    return ct_file.split('.')[0].replace('img', 'case')


def save_test_volume(save_path, case_name, ct_array, seg_array):
    """Write a whole volume and its labels to ``<case_name>.npy.h5``.

    The file holds two datasets, ``image`` and ``label``. The context manager
    guarantees the HDF5 handle is closed even if writing a dataset fails.
    """
    with h5py.File(os.path.join(save_path, case_name + '.npy.h5'), 'w') as hf:
        hf.create_dataset('image', data=ct_array)
        hf.create_dataset('label', data=seg_array)


def save_train_slices(save_path, case_name, ct_array, seg_array):
    """Write each slice along axis 0 to ``<case_name>_sliceNNN.npz``.

    Every slice is written, including ones whose label is empty; no filtering
    is applied. ``np.savez`` appends the ``.npz`` extension itself.
    """
    for s_idx, ct_array_s in enumerate(ct_array):
        new_ct_name = '{}_slice{:03d}'.format(case_name, s_idx)
        np.savez(
            os.path.join(save_path, new_ct_name),
            image=ct_array_s,
            label=seg_array[s_idx, :, :],
        )


for split in splits:
    ct_path, seg_path, save_path = split_paths(split)

    # exist_ok avoids the check-then-create race and also creates any missing
    # parent directories.
    os.makedirs(save_path, exist_ok=True)

    start_time = time()

    if not os.path.exists(ct_path):
        print(f"Error: 找不到路径 {ct_path}，请检查你的文件夹结构是否包含 img 子文件夹")
        continue

    # Sorted so the processing order (and therefore the log) is reproducible;
    # os.listdir returns entries in arbitrary order.
    for ct_file in sorted(os.listdir(ct_path)):
        # Skip anything that is not a compressed NIfTI file.
        if not ct_file.endswith('.nii.gz'):
            continue

        seg_file = os.path.join(seg_path, ct_file.replace('img', 'label'))
        if not os.path.exists(seg_file):
            # Same print-and-skip policy as for a missing image folder, so one
            # absent label does not abort the whole run.
            print(f"Warning: label file {seg_file} not found, skipping {ct_file}")
            continue

        ct = nib.load(os.path.join(ct_path, ct_file))
        seg = nib.load(seg_file)

        # Convert to numpy arrays and normalize the image to [0, 1].
        ct_array = normalize_ct(ct.get_fdata(), lower, upper)
        seg_array = seg.get_fdata()

        # Move the last axis to the front so slices are indexed along axis 0.
        ct_array = np.transpose(ct_array, (2, 0, 1))
        seg_array = np.transpose(seg_array, (2, 0, 1))

        print('Processing:', ct_file, 'Shape:', ct_array.shape)

        case_name = case_stem(ct_file)
        if split == 'test':
            # Test volumes are stored whole, one HDF5 file per case.
            save_test_volume(save_path, case_name, ct_array, seg_array)
            continue

        # Training volumes are stored slice by slice.
        save_train_slices(save_path, case_name, ct_array, seg_array)

        print('Already used {:.3f} min'.format((time() - start_time) / 60))
        print('-----------')