From 2cbc037e959b2cf558583087d46b96c381b32a6b Mon Sep 17 00:00:00 2001 From: Ryan <3266408525@qq.com> Date: Sat, 13 Dec 2025 13:20:51 +0800 Subject: [PATCH] =?UTF-8?q?improve:=20=E6=94=B9=E8=BF=9BPID=E7=AE=97?= =?UTF-8?q?=E6=B3=95=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../algorithms/finetune/train_ti_gen_trace.py | 491 +++++++----------- 1 file changed, 186 insertions(+), 305 deletions(-) diff --git a/src/backend/app/algorithms/finetune/train_ti_gen_trace.py b/src/backend/app/algorithms/finetune/train_ti_gen_trace.py index b97d155..76a5209 100644 --- a/src/backend/app/algorithms/finetune/train_ti_gen_trace.py +++ b/src/backend/app/algorithms/finetune/train_ti_gen_trace.py @@ -68,12 +68,40 @@ from diffusers.utils.torch_utils import is_compiled_module if is_wandb_available(): import wandb -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -# check_min_version("0.30.0.dev0") +# 说明: +# 1) 本文件用于训练 Textual Inversion(仅训练一个新 token 的向量)。 +# 2) 训练过程冻结 UNet/VAE/TextEncoder 的主体权重,仅更新新 token 对应的 embedding 行。 +# 3) 训练过程会按步保存 embedding,并进行验证推理,用于观察训练效果。 +# 4) 文件还包含可视化坐标采集逻辑:X=特征范数,Y=特征方差,Z=loss,并写入 CSV。 +# 5) 为了保证推理阶段的一致性,验证推理会从基础模型加载,并再加载 learned_embeds.bin 作为增量能力。 logger = get_logger(__name__) +def _load_textual_inversion_compat(pipeline: DiffusionPipeline, emb_dir: str, token: str): + """ + 说明: + 1) 不同 diffusers 版本对 load_textual_inversion 的参数命名不一致。 + 2) 有些版本支持 token=...,有些版本支持 tokens=[...],还有些只支持路径。 + 3) 本函数用于在不同版本之间提供兼容调用,优先传入 token 名提高确定性。 + 4) 若当前版本不接受这些参数,会自动降级为仅传路径的调用方式。 + 5) 该函数不会保存或覆盖基础模型文件,只在运行时向 pipeline 注入增量 embedding。 + """ + try: + pipeline.load_textual_inversion(emb_dir, token=token) + return + except TypeError: + pass + + try: + pipeline.load_textual_inversion(emb_dir, tokens=[token]) + return + except TypeError: + pass + + pipeline.load_textual_inversion(emb_dir) + + def save_model_card( repo_id: str, images=None, @@ -84,12 +112,17 @@ def save_model_card( pipeline: DiffusionPipeline = None, placeholder_token: str = None, ): + # 说明: + # 1) 该函数用于生成并保存 README 模型卡片与示例图片,便于上传 Hub 或本地记录。 + # 2) 对于 Textual Inversion,模型文件主要是 learned_embeds.bin 与 tokenizer。 + # 3) 该模型卡会说明训练所用的 placeholder token 与训练 prompt。 + # 4) 生成的图片会被保存在 repo_folder 下,方便查看训练效果。 + # 5) 本函数不会修改模型参数,只做文档与示例资产的持久化。 img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f"![img_{i}](./image_{i}.png)\n" - # Model card updated for Textual Inversion model_description = f""" # Textual Inversion - {repo_id} @@ -123,11 +156,16 @@ def log_validation( epoch, is_final_validation=False, ): + # 说明: + # 1) 该函数用于在训练过程中做验证推理,观察当前 embedding 学到了什么。 + # 2) 会将 scheduler 替换为更适合推理的 DPMSolverMultistepScheduler。 + # 3) 会关闭安全检查器,避免被过滤导致无法看到结果。 + # 4) 既支持纯文生图,也支持某些管线的传图推理(依赖 args.validation_images)。 + # 5) 会把结果写入 tracker(tensorboard/wandb),并释放 GPU 显存。 logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" f" {args.validation_prompt}." ) - # We train on the simplified learning objective. 
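# NOTE: illustrative sketch only, not part of the patch. It shows a minimal inference-time use of the
# version-compatibility fallback that `_load_textual_inversion_compat` above implements. The base model
# id, embedding directory, and placeholder token below are assumptions for the example.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
emb_dir, token = "./textual-inversion-model", "<my-concept>"  # hypothetical output dir / token

try:
    pipe.load_textual_inversion(emb_dir, token=token)          # newer diffusers accept `token=`
except TypeError:
    try:
        pipe.load_textual_inversion(emb_dir, tokens=[token])   # some releases expect `tokens=[...]`
    except TypeError:
        pipe.load_textual_inversion(emb_dir)                   # oldest fallback: path only

pipe = pipe.to("cuda")
image = pipe(f"a photo of {token} in the snow", num_inference_steps=25).images[0]
image.save("ti_sample.png")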
If we were previously predicting a variance, we need the scheduler to ignore it scheduler_args = {} if "variance_type" in pipeline.scheduler.config: @@ -139,12 +177,11 @@ def log_validation( scheduler_args["variance_type"] = variance_type pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) - + pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) - pipeline.safety_checker = lambda images, clip_input: (images, [False for i in range(0, len(images))]) # disable safety checker + pipeline.safety_checker = lambda images, clip_input: (images, [False for i in range(0, len(images))]) - # run inference generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None if args.validation_images is None: @@ -182,6 +219,12 @@ def log_validation( def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + # 说明: + # 1) Stable Diffusion 不同变体可能使用不同的 text encoder 架构。 + # 2) 该函数读取 text_encoder 的配置,判断其 architectures 字段来确定具体类。 + # 3) 常见情况是 CLIPTextModel,也可能是 Roberta 或 T5 系列。 + # 4) 返回的类用于 from_pretrained 加载 text_encoder,保证结构匹配。 + # 5) 如果遇到未知架构会直接报错,避免后续 silent bug。 text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", @@ -206,6 +249,12 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st def parse_args(input_args=None): + # 说明: + # 1) 该函数定义所有可配置参数,支持命令行调用与被后端服务传参调用。 + # 2) 训练相关参数包含学习率、步数、批大小、混合精度、保存间隔等。 + # 3) Textual Inversion 需要 placeholder_token 与 initializer_token,并且 prompt 必须包含 placeholder。 + # 4) 验证推理参数用于在训练中生成图片,输出到指定目录用于可视化或服务返回。 + # 5) coords_* 参数用于记录 3D 可视化坐标数据,不影响训练但会增加少量开销。 parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--pretrained_model_name_or_path", @@ -240,7 +289,6 @@ def parse_args(input_args=None): required=True, help="A folder containing the training data of instance images.", ) - parser.add_argument( "--instance_prompt", type=str, @@ -276,21 +324,18 @@ def parse_args(input_args=None): " `args.validation_prompt` multiple times: `args.num_validation_images`." ), ) - parser.add_argument( "--output_dir", type=str, default="textual-inversion-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( "--validation_image_output_dir", type=str, default=None, help="The directory where validation images will be saved. If None, images will be saved inside a subdirectory of `output_dir`.", ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--resolution", @@ -310,12 +355,6 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." ), ) - # Textual Inversion only trains the embedding, not the full text encoder - # parser.add_argument( - # "--train_text_encoder", - # action="store_true", - # help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - # ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -457,12 +496,10 @@ def parse_args(input_args=None): " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." 
), ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) - parser.add_argument( "--tokenizer_max_length", type=int, @@ -489,7 +526,6 @@ def parse_args(input_args=None): default=None, help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.", ) - parser.add_argument( "--initializer_token", type=str, @@ -497,8 +533,6 @@ def parse_args(input_args=None): required=True, help="A token to use as a proxy for the concept during training. Used to initialize the new placeholder embedding.", ) - - # [START] 为可视化方案增加的通用指标参数定义 (保持不变) parser.add_argument( "--coords_save_path", type=str, @@ -511,7 +545,6 @@ def parse_args(input_args=None): default=25, help="保存坐标数据的步数间隔。", ) - # [END] 为可视化方案增加的通用指标参数定义 if input_args is not None: args = parser.parse_args(input_args) @@ -521,26 +554,17 @@ def parse_args(input_args=None): env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank - - - # if "<" not in args.placeholder_token or ">" not in args.placeholder_token: - # logger.warning( - # f"The placeholder token `{args.placeholder_token}` does not seem to be enclosed by `<` and `>`. " - # f"Please make sure it's a unique token that is unlikely to exist in the vocabulary." - # ) return args class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. - - NOTE: Renamed from DreamBoothDataset to TI_Dataset for clarity, but keeping the name if possible to maintain - compatibility with original imports/logic. Reverting to original name to maintain structural parity. 
- """ - + # 说明: + # 1) 该数据集负责读取实例图片,并把图片变换到训练所需的张量格式。 + # 2) 同时会对 instance_prompt 做 tokenizer 编码,生成 input_ids 与 attention_mask。 + # 3) Textual Inversion 不做 prior preservation,因此长度等于实例图片数量。 + # 4) 图像会先 resize 再 crop,并归一化到 [-1,1](Normalize([0.5],[0.5]))。 + # 5) 返回的字典字段会在 collate_fn 中被组装成 batch,供 UNet 前向与损失计算使用。 def __init__( self, instance_data_root, @@ -548,21 +572,19 @@ class DreamBoothDataset(Dataset): tokenizer, size=512, center_crop=False, - # Encoder hidden states pre-computation is not supported for TI as the embeddings are the target - encoder_hidden_states=None, + encoder_hidden_states=None, class_prompt_encoder_hidden_states=None, tokenizer_max_length=None, ): self.size = size self.center_crop = center_crop self.tokenizer = tokenizer - - # TI does not support pre-computed embeddings + if encoder_hidden_states is not None or class_prompt_encoder_hidden_states is not None: - raise ValueError("Textual Inversion cannot use pre-computed encoder hidden states.") - - self.encoder_hidden_states = encoder_hidden_states # Should be None - self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states # Should be None + raise ValueError("Textual Inversion cannot use pre-computed encoder hidden states.") + + self.encoder_hidden_states = encoder_hidden_states + self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states self.tokenizer_max_length = tokenizer_max_length self.instance_data_root = Path(instance_data_root) @@ -572,8 +594,7 @@ class DreamBoothDataset(Dataset): self.instance_images_path = list(Path(instance_data_root).iterdir()) self.num_instance_images = len(self.instance_images_path) self.instance_prompt = instance_prompt - self._length = self.num_instance_images # Simplified length as no prior preservation - + self._length = self.num_instance_images self.image_transforms = transforms.Compose( [ @@ -596,19 +617,20 @@ class DreamBoothDataset(Dataset): instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) - # Tokenize instance prompt - text_inputs = tokenize_prompt( - self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length - ) + text_inputs = tokenize_prompt(self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length) example["instance_prompt_ids"] = text_inputs.input_ids example["instance_attention_mask"] = text_inputs.attention_mask - # Class data logic removed for Textual Inversion - return example def collate_fn(examples): + # 说明: + # 1) 该函数负责将 Dataset 返回的若干条样本组装成一个 batch。 + # 2) 对图像张量做 stack,得到 (B,C,H,W) 的 pixel_values。 + # 3) 对 token 的 input_ids 做 cat,得到 (B,seq_len) 的输入矩阵。 + # 4) attention_mask 保持与 input_ids 对齐,用于 text encoder 的有效 token 标记。 + # 5) 输出 batch 会被训练循环直接使用,字段命名与后续代码保持一致。 has_attention_mask = "instance_attention_mask" in examples[0] input_ids = [example["instance_prompt_ids"] for example in examples] @@ -616,7 +638,7 @@ def collate_fn(examples): if has_attention_mask: attention_mask = [example["instance_attention_mask"] for example in examples] - + pixel_values = torch.stack(pixel_values) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() @@ -632,9 +654,14 @@ def collate_fn(examples): return batch -# PromptDataset and class image generation are removed as TI typically doesn't use prior preservation def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): + # 说明: + # 1) 对文本 prompt 做 tokenizer 编码,生成 input_ids 与 attention_mask。 + # 2) 使用固定长度 padding="max_length" 保证 batch 拼接简单一致。 + # 3) 
truncation=True 防止超过最大长度导致报错。 + # 4) tokenizer_max_length 允许外部指定最大长度;否则使用 tokenizer.model_max_length。 + # 5) 返回 transformers 的 BatchEncoding,后续直接取 input_ids 与 attention_mask 使用即可。 if tokenizer_max_length is not None: max_length = tokenizer_max_length else: @@ -647,11 +674,16 @@ def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): max_length=max_length, return_tensors="pt", ) - return text_inputs def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None): + # 说明: + # 1) 将 token id 输入 Text Encoder,得到用于 UNet 条件输入的 prompt_embeds。 + # 2) 如果启用 attention_mask,会把 mask 一并传入,以减少 padding token 的影响。 + # 3) 输出的 prompt_embeds 通常形状为 (B, seq_len, hidden_dim)。 + # 4) UNet 会把该 embedding 作为 cross-attention 的条件,实现文本引导。 + # 5) 该函数不涉及梯度以外的副作用,embedding 的更新由上层训练流程控制。 text_input_ids = input_ids.to(text_encoder.device) if text_encoder_use_attention_mask: @@ -665,20 +697,23 @@ def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_atte return_dict=False, ) prompt_embeds = prompt_embeds[0] - return prompt_embeds def main(args): + # 说明: + # 1) 主函数负责训练用的全部初始化:accelerate、模型加载、数据集/优化器/调度器。 + # 2) Textual Inversion 的关键是新增一个 placeholder token,并只训练该 token 的 embedding。 + # 3) 训练过程中会定期保存 learned_embeds.bin 与 tokenizer,并执行验证推理输出图片。 + # 4) 验证推理从基础模型加载,再加载 learned_embeds.bin,避免对基础模型权重产生写回影响。 + # 5) 若开启 coords_save_path,会按你原有逻辑采集并保存可视化坐标数据,不改变其行为。 if args.report_to == "wandb" and args.hub_token is not None: raise ValueError( "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." " Please use `huggingface-cli login` to authenticate with the Hub." ) - logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( @@ -688,7 +723,6 @@ def main(args): project_config=accelerator_project_config, ) - # Disable AMP for MPS. if torch.backends.mps.is_available(): accelerator.native_amp = False @@ -696,8 +730,6 @@ def main(args): if not is_wandb_available(): raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - - # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", @@ -711,13 +743,9 @@ def main(args): transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() - # If passed along, set the training seed now. 
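# NOTE: illustrative sketch of the tokenize_prompt / encode_prompt flow described above. It runs one
# prompt through a CLIP tokenizer and text encoder and prints the resulting shapes; the SD 1.5
# checkpoint and the prompt text are assumptions made for this example.
import torch
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")

text_inputs = tokenizer(
    "a photo of <my-concept>",              # hypothetical placeholder token
    padding="max_length",                   # fixed-length padding keeps batch concatenation trivial
    truncation=True,
    max_length=tokenizer.model_max_length,  # 77 for CLIP
    return_tensors="pt",
)
with torch.no_grad():
    prompt_embeds = text_encoder(text_inputs.input_ids, attention_mask=text_inputs.attention_mask)[0]

print(text_inputs.input_ids.shape)  # torch.Size([1, 77])
print(prompt_embeds.shape)          # torch.Size([1, 77, 768]) for SD 1.x text encoders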
if args.seed is not None: set_seed(args.seed) - # Prior preservation image generation logic removed - - # Handle the repository creation if accelerator.is_main_process: if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) @@ -727,7 +755,6 @@ def main(args): repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token ).repo_id - # Load the tokenizer if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) elif args.pretrained_model_name_or_path: @@ -737,34 +764,26 @@ def main(args): revision=args.revision, use_fast=False, ) + else: + raise ValueError("Must provide either --tokenizer_name or --pretrained_model_name_or_path") - # Add the placeholder token to the tokenizer vocabulary and initialize the new token embedding - # Get token IDs for initializer and placeholder tokens initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False) - placeholder_token_ids = tokenizer.encode(args.placeholder_token, add_special_tokens=False) - if len(initializer_token_ids) > 1: raise ValueError("The initializer token must be a single token.") - - if placeholder_token_ids != tokenizer.unk_token_id: - # If the placeholder is already in the vocab, it's either an existing token or was already added. - # We need to make sure it's actually the placeholder and not an existing common word. - # However, for simplicity and matching standard TI, we assume it's a new token. - # The standard approach is to *add* the placeholder token, which results in a list of new tokens. - - # Add the placeholder token to the tokenizer and get the new token ID - tokenizer.add_tokens(args.placeholder_token) - placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) - else: - # This case handles when the placeholder token is already a single, known token, which is usually fine, - # but in TI we usually want to add a *new* token. We rely on the `add_tokens` method below. - tokenizer.add_tokens(args.placeholder_token) - placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) - # import correct text encoder class + if args.placeholder_token in tokenizer.get_vocab(): + raise ValueError( + f"Placeholder token '{args.placeholder_token}' already exists in the tokenizer vocabulary. " + f"Please choose a unique, new token name (recommended: an angle-bracketed name such as '<my-new-token>')."
+ ) + + num_added = tokenizer.add_tokens(args.placeholder_token) + if num_added != 1: + raise ValueError(f"Failed to add placeholder token '{args.placeholder_token}' (num_added={num_added}).") + placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) - # Load scheduler and models noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant @@ -774,64 +793,40 @@ def main(args): args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant ) except OSError: - # IF does not have a VAE so let's just set it to None - # We don't have to error out here vae = None unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant ) - # Textual Inversion specific setup: Resize token embeddings and initialize new token text_encoder.resize_token_embeddings(len(tokenizer)) - + token_embeds = text_encoder.get_input_embeddings().weight.data initializer_token_id = tokenizer.convert_tokens_to_ids(args.initializer_token) - - # Initialize the new token embedding with the initializer token's embedding token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] - # Freeze all models and then unfreeze the embedding layer if vae is not None: vae.requires_grad_(False) text_encoder.requires_grad_(False) unet.requires_grad_(False) - # Only train the newly added embedding (Textual Inversion) embedding_layer = text_encoder.get_input_embeddings() embedding_layer.weight.requires_grad = True - - # Freeze all but the placeholder token's embedding. We create a mask/indices for the placeholder token ID. - # Note: Textual Inversion typically only trains the new token's embedding. - # We use a trick to register the embedding layer as trainable, but ensure only the new embedding is updated. - - # The simplest way is to ensure all embedding weights are trainable, and let the optimizer only update - # the ones that appear in the batch. However, a safer way is to specifically mark only the placeholder - # embedding as trainable. - - # Get the embedding tensor trainable_token_embeds = embedding_layer.weight - - # Mask to freeze all except the placeholder token's embedding + mask = torch.ones(len(tokenizer), dtype=torch.bool) - mask[placeholder_token_id] = False # We want the placeholder to be unmasked (trainable) - - # Freeze the embeddings that are NOT the placeholder token's + mask[placeholder_token_id] = False + trainable_token_embeds.data[mask] = trainable_token_embeds.data[mask].float() trainable_token_embeds.data[mask].requires_grad = False - - # Make sure the placeholder token's embedding is set to require gradients trainable_token_embeds.data[placeholder_token_id].requires_grad = True - # For mixed precision training we cast all non-trainable weights (vae, text_encoder and unet) to half-precision weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 elif accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 - # Move unet, vae and text_encoder to device and cast to weight_dtype - # Note: Only trainable parameters (new embeddings) must remain in float32 for fp16 training. 
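# NOTE: illustrative sketch of the embedding bookkeeping above, using a toy table instead of the real
# text encoder. The sizes and token ids are made up. Keeping a saved copy of the frozen rows and
# restoring it after each optimizer step is one common way to guarantee that only the new row moves.
import torch
import torch.nn as nn

vocab_size, hidden = 49409, 768        # e.g. 49408 CLIP tokens + 1 newly added placeholder (illustrative)
embedding = nn.Embedding(vocab_size, hidden)
placeholder_token_id = vocab_size - 1  # the row appended after add_tokens / resize_token_embeddings
initializer_token_id = 320             # hypothetical id of the initializer token

with torch.no_grad():
    # Start the new concept from the initializer token's vector.
    embedding.weight[placeholder_token_id] = embedding.weight[initializer_token_id].clone()

mask = torch.ones(vocab_size, dtype=torch.bool)
mask[placeholder_token_id] = False
orig_rows = embedding.weight.detach().clone()

# ... after optimizer.step() in the training loop ...
with torch.no_grad():
    embedding.weight[mask] = orig_rows[mask]  # only the placeholder row is allowed to move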
unet.to(accelerator.device, dtype=weight_dtype) if vae is not None: vae.to(accelerator.device, dtype=weight_dtype) @@ -844,7 +839,7 @@ def main(args): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17." ) unet.enable_xformers_memory_efficient_attention() else: @@ -852,82 +847,79 @@ def main(args): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - # Textual Inversion doesn't train the full text encoder, so we only need to checkpoint UNet - def unwrap_model(model): + # 说明: + # 1) accelerate 在分布式或混合精度下会包装模型,保存/取权重时需要先 unwrap。 + # 2) 如果启用 torch.compile,模型会被再次包装,需取 _orig_mod 获取真实模块。 + # 3) 该函数用于在保存 embedding、验证推理、访问模型权重时统一处理。 + # 4) 返回的模型对象是“原始模型”,便于直接访问 embedding 权重与 config。 + # 5) 该函数自身不做任何训练逻辑修改,只是一个安全的模型访问入口。 model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model return model - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): + # 说明: + # 1) 该 Hook 用于让 accelerate.save_state 保存为 Textual Inversion 需要的最小产物。 + # 2) 主要保存 learned_embeds.bin(仅包含 placeholder_token 对应的 embedding 行)。 + # 3) 同时保存 tokenizer,以便后续复现训练 token 的 id 映射与 tokenizer 配置。 + # 4) 不保存 UNet/VAE/TextEncoder 的完整权重,避免体积巨大且不符合“增量”设计。 + # 5) 保存行为只发生在主进程,避免分布式重复写盘导致文件冲突。 if accelerator.is_main_process: - # We only save the trained token embedding text_encoder_unwrapped = unwrap_model(text_encoder) - - # Find the trained embedding - trained_embeddings = text_encoder_unwrapped.get_input_embeddings().weight[placeholder_token_id:placeholder_token_id+1] - - # Create a state dict to save - learned_embeds_dict = { - args.placeholder_token: trained_embeddings.detach().cpu() - } - - # Save the embedding file (similar to Textual Inversion pipelines) + trained_embeddings = text_encoder_unwrapped.get_input_embeddings().weight[ + placeholder_token_id : placeholder_token_id + 1 + ] + learned_embeds_dict = {args.placeholder_token: trained_embeddings.detach().cpu()} torch.save(learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")) - - # make sure to pop weight so that corresponding model is not saved again - weights.pop() - - # Also save tokenizer for completeness + + if len(weights) > 0: + weights.pop() + tokenizer.save_pretrained(output_dir) def load_model_hook(models, input_dir): + # 说明: + # 1) 该 Hook 用于从 checkpoint 恢复训练时,将 learned_embeds.bin 写回到 text_encoder embedding。 + # 2) 对于 Textual Inversion,恢复的关键是 placeholder_token 对应 embedding 行,而非整个模型。 + # 3) 同时通过 checkpoint 内的 tokenizer 获取 placeholder_token 的 token_id,以保证写入位置一致。 + # 4) 若 checkpoint 缺失 learned_embeds.bin,会打印警告并跳过,允许从头开始训练。 + # 5) 该逻辑只改变当前训练进程内的权重状态,不会修改基础模型目录的文件。 text_encoder_ = None while len(models) > 0: model = models.pop() if isinstance(model, type(unwrap_model(text_encoder))): text_encoder_ = model - # UNet is not passed to the load hook for training state, only text_encoder's embedding matters - # Load the embedding file embedding_path = os.path.join(input_dir, "learned_embeds.bin") if not os.path.exists(embedding_path): logger.warning(f"Could not find 
learned_embeds.bin at {embedding_path}. This may be normal if starting a new run.") return state_dict = torch.load(embedding_path, map_location="cpu") - - # We expect a dictionary where the key is the placeholder token if args.placeholder_token not in state_dict: raise ValueError( f"Trained embedding not found for placeholder token '{args.placeholder_token}' in loaded state dict." ) - + learned_embeds = state_dict[args.placeholder_token] - - # Load embedding into the text encoder - token_embeds = text_encoder_.get_input_embeddings().weight.data - - # Ensure the current tokenizer and text encoder size is consistent with the checkpoint + token_embeds_local = text_encoder_.get_input_embeddings().weight.data + current_tokenizer = AutoTokenizer.from_pretrained(input_dir) current_placeholder_token_id = current_tokenizer.convert_tokens_to_ids(args.placeholder_token) - if current_placeholder_token_id == current_tokenizer.unk_token_id: - raise ValueError( + raise ValueError( f"Placeholder token '{args.placeholder_token}' not found in the tokenizer loaded from checkpoint at {input_dir}. " "Ensure your checkpoint contains the tokenizer with the added placeholder token." ) - - token_embeds[current_placeholder_token_id] = learned_embeds.to(token_embeds.dtype).to(token_embeds.device) + token_embeds_local[current_placeholder_token_id] = learned_embeds.to(token_embeds_local.dtype).to(token_embeds_local.device) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) - # Enable TF32 for faster training on Ampere GPUs if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True @@ -936,30 +928,21 @@ def main(args): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) - # Only upcast trainable parameters (embedding) into fp32 if mixed precision is used if accelerator.mixed_precision == "fp16": - # The embedding layer is the only part that needs to be checked cast_training_params([text_encoder], dtype=torch.float32) - - # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs if args.use_8bit_adam: try: import bitsandbytes as bnb except ImportError: - raise ImportError( - "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) - + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.") optimizer_class = bnb.optim.AdamW8bit else: optimizer_class = torch.optim.AdamW - # Optimizer creation: only includes the trainable embedding parameters params_to_optimize = list(filter(lambda p: p.requires_grad, text_encoder.parameters())) - if not params_to_optimize: - raise ValueError("No trainable parameters found. Check if the embedding layer is set to requires_grad=True.") + raise ValueError("No trainable parameters found. 
Check if the embedding layer is set to requires_grad=True.") optimizer = optimizer_class( params_to_optimize, @@ -969,13 +952,11 @@ def main(args): eps=args.adam_epsilon, ) - # Pre-computation is not supported for Textual Inversion, so this block is simplified pre_computed_encoder_hidden_states = None pre_computed_class_prompt_encoder_hidden_states = None validation_prompt_encoder_hidden_states = None validation_prompt_negative_prompt_embeds = None - - # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, @@ -995,7 +976,6 @@ def main(args): num_workers=args.dataloader_num_workers, ) - # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -1011,28 +991,20 @@ def main(args): power=args.lr_power, ) - # Prepare everything with our `accelerator`. - # Only UNet, Text Encoder and Optimizer are prepared (VAE is not optimized/frozen) unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler ) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. if accelerator.is_main_process: tracker_config = vars(copy.deepcopy(args)) tracker_config.pop("validation_images") - accelerator.init_trackers("textual-inversion", config=tracker_config) # Updated project name + accelerator.init_trackers("textual-inversion", config=tracker_config) - # Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") @@ -1043,27 +1015,25 @@ def main(args): logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 first_epoch = 0 - # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: resume_path = args.output_dir - + try: accelerator.print(f"Resuming from checkpoint at {resume_path}") accelerator.load_state(resume_path) - - # After loading state, `accelerator` updates its internal state including `step` and `epoch` + initial_global_step = accelerator.state.global_step global_step = initial_global_step - - # Recalculate first_epoch based on the loaded global_step + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) first_epoch = global_step // num_update_steps_per_epoch accelerator.print(f"Resumed at global step {global_step} and epoch {first_epoch}") - + except Exception as e: accelerator.print( f"Could not load state from '{resume_path}'. Starting a new training run. 
Error: {e}" @@ -1075,55 +1045,43 @@ def main(args): initial_global_step = 0 first_epoch = 0 - # [START] 为可视化方案增加的初始化和导入 (保持不变) coords_list = [] - # 提前定义 X, Y 指标的临时存储变量,用于跨代码块传递数据 X_i_feature_norm = float("nan") Y_i_feature_var = float("nan") - + if args.coords_save_path is not None: - logger.info( - f"可视化指标采集已启用。数据将每 {args.coords_log_interval} 步保存一次到 {args.coords_save_path}" - ) - # [END] 为可视化方案增加的初始化和导入 + logger.info(f"可视化指标采集已启用。数据将每 {args.coords_log_interval} 步保存一次到 {args.coords_save_path}") progress_bar = tqdm( range(0, args.max_train_steps), initial=initial_global_step, desc="Steps", - # Only show the progress bar once on each machine. disable=not accelerator.is_local_main_process, ) for epoch in range(first_epoch, args.num_train_epochs): - unet.train() # UNet is frozen, but keep in train mode for modules like Dropout (if any) + unet.train() text_encoder.train() + for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): pixel_values = batch["pixel_values"].to(dtype=weight_dtype) if vae is not None: - # Convert images to latent space model_input = vae.encode(pixel_values).latent_dist.sample() model_input = model_input * vae.config.scaling_factor else: model_input = pixel_values - # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) bsz, channels, height, width = model_input.shape - # Sample a random timestep for each image + timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device - ) - timesteps = timesteps.long() + ).long() - # Add noise to the model input according to the noise magnitude at each timestep - # (this is the forward diffusion process) noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) - # Get the text embedding for conditioning - # Since pre_compute_text_embeddings is false, we encode the prompt here encoder_hidden_states = encode_prompt( text_encoder, batch["input_ids"], @@ -1139,7 +1097,6 @@ def main(args): else: class_labels = None - # Predict the noise residual model_pred = unet( noisy_model_input, timesteps, @@ -1148,28 +1105,13 @@ def main(args): return_dict=False, )[0] - # If model predicts variance, throw away the prediction. if model_pred.shape[1] == 6: model_pred, _ = torch.chunk(model_pred, 2, dim=1) - # [START] 为可视化方案增加的 X轴 (特征范数) 和 Y轴 (特征方差) 计算 (通用指标) (保持不变) if args.coords_save_path is not None: - # 修正 X轴 计算:将 torch.linalg.norm 替换为传统的 torch.norm - # 传统的 torch.norm 支持对多个维度求范数 (dim=[1, 2, 3]) - # X轴: UNet 预测特征 L2 范数 (衡量预测的“强度”) - # torch.norm(..., p=2, dim=...) 表示 L2 范数 - X_i_feature_norm = torch.norm( - model_pred.detach().float(), - p=2, - dim=[1, 2, 3] # 对 C, H, W 维度求 L2 范数 - ).mean().item() # 对 Batch 维度求平均 - - # Y轴: UNet 预测特征方差 (衡量预测的“混乱度/稳定性”) - # var() 默认对所有维度求方差,我们对 C, H, W 求方差,然后对 Batch 求平均 + X_i_feature_norm = torch.norm(model_pred.detach().float(), p=2, dim=[1, 2, 3]).mean().item() Y_i_feature_var = model_pred.detach().float().var(dim=[1, 2, 3]).mean().item() - # [END] 为可视化方案增加的 X轴 (特征范数) 和 Y轴 (特征方差) 计算 (通用指标) - # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": @@ -1177,122 +1119,71 @@ def main(args): else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - # Prior preservation block removed for Textual Inversion. 
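# NOTE: illustrative sketch of the X/Y coordinate metrics computed above, using a random tensor as a
# stand-in for the real UNet prediction (B=2 samples, 4 latent channels, 64x64 latents are assumptions).
import torch

model_pred = torch.randn(2, 4, 64, 64)
step_loss = torch.tensor(0.123)  # stand-in for the MSE loss computed right below

X = torch.norm(model_pred.float(), p=2, dim=[1, 2, 3]).mean().item()  # per-sample L2 norm, batch mean
Y = model_pred.float().var(dim=[1, 2, 3]).mean().item()               # per-sample variance, batch mean
Z = step_loss.item()                                                  # LDM loss for this step
print([X, Y, Z])  # one row of the coords CSV written by the training loop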
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") accelerator.backward(loss) if accelerator.sync_gradients: - # Only clip gradient for trainable parameters - # For Textual Inversion, only the embedding requires grad - accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm) - + accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm) + optimizer.step() lr_scheduler.step() optimizer.zero_grad() - - # Ensure only the placeholder token's embedding is updated and all others are clamped - # This is the "slicing" step typical of TI to ensure only the learned token moves + if accelerator.num_processes > 1: - # For DDP/Distributed training, we need to unwrap the model to apply the mask unwrapped_text_encoder = unwrap_model(text_encoder) trainable_embeds = unwrapped_text_encoder.get_input_embeddings().weight else: trainable_embeds = text_encoder.get_input_embeddings().weight - - # Clamp the non-placeholder embeddings (ensure they don't move) + trainable_embeds.data[mask] = trainable_embeds.data[mask].float().to(trainable_embeds.device) trainable_embeds.data[placeholder_token_id] = trainable_embeds.data[placeholder_token_id].float() - - # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) global_step += 1 - - # [START] 为可视化方案增加的 X, Y, Z轴 数据记录和保存 (通用指标) (保持不变) if args.coords_save_path is not None and ( global_step % args.coords_log_interval == 0 or global_step == 1 or global_step == initial_global_step + 1 ): - - # Z轴: LDM 损失 (直接获取当前步的 loss) Z_i = loss.detach().item() - - # 记录坐标数据 (X和Y已在前面计算) coords_list.append([global_step, X_i_feature_norm, Y_i_feature_var, Z_i]) - - # 实时保存到文件 (覆盖保存,确保文件始终是最新的) + df = pd.DataFrame( coords_list, - columns=['step', 'X_Feature_L2_Norm', 'Y_Feature_Variance', 'Z_LDM_Loss'] + columns=["step", "X_Feature_L2_Norm", "Y_Feature_Variance", "Z_LDM_Loss"], ) - - # 假设 args.coords_save_path 是目标文件路径 (如 ./data/coords.csv) + save_file_path = Path(args.coords_save_path) if not save_file_path.suffix: save_file_path = save_file_path / "coords.csv" save_file_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(save_file_path, index=False) - + if global_step % (args.coords_log_interval * 10) == 0: logger.info( f"Step {global_step}: 已记录并保存可视化坐标 (X={X_i_feature_norm:.4f}, Y={Y_i_feature_var:.4f}, Z={Z_i:.4f}) 到 {save_file_path}" ) - # [END] 为可视化方案增加的 X, Y, Z轴 数据记录和保存 (通用指标) - if accelerator.is_main_process: if (global_step + 1) % args.checkpointing_steps == 0: - # 1. 保存模型参数:直接保存到 args.output_dir,覆盖上一轮 output_dir = args.output_dir - # accelerator.save_state handles saving the models using the registered hooks accelerator.save_state(output_dir) logger.info(f"Saving state to {output_dir} at step {global_step+1}") - # 2. 推理调用模型:从 args.output_dir 加载最新的模型权重 - # Textual Inversion Pipeline loading pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, - text_encoder=unwrap_model(text_encoder), # Use the unwrapped text encoder revision=args.revision, variant=args.variant, torch_dtype=weight_dtype, ) - - # Load the learned embedding into the pipeline's tokenizer/text_encoder - # (The load hook handles the actual embedding tensor update during accelerator.load_state) - # Here, we only need to load the tokenizer to ensure the pipeline has the placeholder token - pipeline.tokenizer = AutoTokenizer.from_pretrained(output_dir) - pipeline.text_encoder.resize_token_embeddings(len(pipeline.tokenizer)) - # 🌟 关键修复:手动加载 learned_embeds.bin 文件 - # 1. 
加载 learned_embeds.bin - path = os.path.join(args.output_dir, "learned_embeds.bin") - if not os.path.exists(path): - # 如果文件名为 pytorch_model.bin (accelerate保存的完整模型),我们需要从模型中提取 - # 此处假设您只保存了 learned_embeds.bin - logger.warning("learned_embeds.bin not found. Skipping manual load.") - else: - # 加载权重字典 - loaded_embeds = torch.load(path, map_location="cpu") - - # 2. 提取唯一的 key (例如 'sks') 和 embedding tensor - token_name = list(loaded_embeds.keys())[0] - embedding = loaded_embeds[token_name] - - # 3. 获取新 token 的 ID - token_id = pipeline.tokenizer.convert_tokens_to_ids(token_name) - - # 4. 将权重插入到 Text Encoder 的 Embedding Layer 中 - text_encoder_embeddings = pipeline.text_encoder.get_input_embeddings() - text_encoder_embeddings.weight.data[token_id] = embedding.to(text_encoder_embeddings.weight.dtype).to(text_encoder_embeddings.weight.device) - - # 保持 pipeline 在 GPU 上 + + _load_textual_inversion_compat(pipeline, output_dir, token=args.placeholder_token) + pipeline.to(accelerator.device) - # Set pipeline args pipeline_args = {"prompt": args.validation_prompt} images = log_validation( @@ -1303,20 +1194,16 @@ def main(args): epoch, ) - # 3. 推理生成结果保存:直接保存到指定目录/output_dir,不创建子文件夹 base_save_path = Path(args.validation_image_output_dir or args.output_dir) base_save_path.mkdir(parents=True, exist_ok=True) logger.info(f"Saving validation images to {base_save_path}") - # 图片直接保存在 base_save_path,会覆盖上一轮的同名图片 for i, image in enumerate(images): image.save(base_save_path / f"image_{i}.png") - - # Clean up pipeline to save memory + del pipeline gc.collect() torch.cuda.empty_cache() - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) @@ -1325,32 +1212,29 @@ def main(args): if global_step >= args.max_train_steps: break - - # Save the final embeddings and tokenizer accelerator.wait_for_everyone() if accelerator.is_main_process: text_encoder = unwrap_model(text_encoder) - - # Final save of the learned_embeds.bin and tokenizer - trained_embeddings = text_encoder.get_input_embeddings().weight[placeholder_token_id:placeholder_token_id+1] - + + trained_embeddings = text_encoder.get_input_embeddings().weight[ + placeholder_token_id : placeholder_token_id + 1 + ] learned_embeds_dict = { args.placeholder_token: trained_embeddings.detach().cpu() } - + torch.save(learned_embeds_dict, os.path.join(args.output_dir, "learned_embeds.bin")) tokenizer.save_pretrained(args.output_dir) - # Final inference pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype + args.pretrained_model_name_or_path, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, ) - # Load the final embedding - pipeline.tokenizer = AutoTokenizer.from_pretrained(args.output_dir) - pipeline.text_encoder.resize_token_embeddings(len(pipeline.tokenizer)) - pipeline.load_textual_inversion(args.output_dir) + _load_textual_inversion_compat(pipeline, args.output_dir, token=args.placeholder_token) + pipeline.to(accelerator.device) - # run inference images = [] if args.validation_prompt and args.num_validation_images > 0: pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25} @@ -1368,11 +1252,11 @@ def main(args): repo_id, images=images, base_model=args.pretrained_model_name_or_path, - train_text_encoder=False, # TI is not full text encoder training + train_text_encoder=False, prompt=args.instance_prompt, repo_folder=args.output_dir, pipeline=pipeline, - 
placeholder_token=args.placeholder_token, # Added for TI + placeholder_token=args.placeholder_token, ) upload_folder( repo_id=repo_id, @@ -1381,20 +1265,17 @@ def main(args): ignore_patterns=["step_*", "epoch_*"], ) - # [START] 为可视化方案增加的最终保存 (通用指标) (保持不变) if args.coords_save_path is not None and coords_list: df = pd.DataFrame( coords_list, - columns=['step', 'X_Feature_L2_Norm', 'Y_Feature_Variance', 'Z_LDM_Loss'] + columns=["step", "X_Feature_L2_Norm", "Y_Feature_Variance", "Z_LDM_Loss"], ) - # 假设 args.coords_save_path 是目标文件路径 save_file_path = Path(args.coords_save_path) if not save_file_path.suffix: save_file_path = save_file_path / "coords.csv" save_file_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(save_file_path, index=False) logger.info(f"训练结束:已将所有 {len(coords_list)} 步可视化坐标数据保存到 {save_file_path}") - # [END] 为可视化方案增加的最终保存 accelerator.end_training() -- 2.34.1
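# NOTE: illustrative post-training check, not part of the patch. It inspects the saved learned_embeds.bin
# and plots the coords CSV; both file paths are assumptions based on the defaults discussed above
# (output_dir and coords_save_path are configurable).
import torch
import pandas as pd
import matplotlib.pyplot as plt

learned = torch.load("textual-inversion-model/learned_embeds.bin", map_location="cpu")
for token, embed in learned.items():
    print(token, tuple(embed.shape))  # e.g. ('<my-concept>', (1, 768))

coords = pd.read_csv("coords.csv")    # columns: step, X_Feature_L2_Norm, Y_Feature_Variance, Z_LDM_Loss
ax = plt.figure().add_subplot(projection="3d")
ax.scatter(coords["X_Feature_L2_Norm"], coords["Y_Feature_Variance"], coords["Z_LDM_Loss"], c=coords["step"])
ax.set_xlabel("X: feature L2 norm")
ax.set_ylabel("Y: feature variance")
ax.set_zlabel("Z: LDM loss")
plt.savefig("coords_3d.png")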