From 2cbc037e959b2cf558583087d46b96c381b32a6b Mon Sep 17 00:00:00 2001 From: Ryan <3266408525@qq.com> Date: Sat, 13 Dec 2025 13:20:51 +0800 Subject: [PATCH] =?UTF-8?q?improve:=20=E6=94=B9=E8=BF=9BPID=E7=AE=97?= =?UTF-8?q?=E6=B3=95=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../algorithms/finetune/train_ti_gen_trace.py | 491 +++++++----------- 1 file changed, 186 insertions(+), 305 deletions(-) diff --git a/src/backend/app/algorithms/finetune/train_ti_gen_trace.py b/src/backend/app/algorithms/finetune/train_ti_gen_trace.py index b97d155..76a5209 100644 --- a/src/backend/app/algorithms/finetune/train_ti_gen_trace.py +++ b/src/backend/app/algorithms/finetune/train_ti_gen_trace.py @@ -68,12 +68,40 @@ from diffusers.utils.torch_utils import is_compiled_module if is_wandb_available(): import wandb -# Will error if the minimal version of diffusers is not installed. Remove at your own risks. -# check_min_version("0.30.0.dev0") +# 说明: +# 1) 本文件用于训练 Textual Inversion(仅训练一个新 token 的向量)。 +# 2) 训练过程冻结 UNet/VAE/TextEncoder 的主体权重,仅更新新 token 对应的 embedding 行。 +# 3) 训练过程会按步保存 embedding,并进行验证推理,用于观察训练效果。 +# 4) 文件还包含可视化坐标采集逻辑:X=特征范数,Y=特征方差,Z=loss,并写入 CSV。 +# 5) 为了保证推理阶段的一致性,验证推理会从基础模型加载,并再加载 learned_embeds.bin 作为增量能力。 logger = get_logger(__name__) +def _load_textual_inversion_compat(pipeline: DiffusionPipeline, emb_dir: str, token: str): + """ + 说明: + 1) 不同 diffusers 版本对 load_textual_inversion 的参数命名不一致。 + 2) 有些版本支持 token=...,有些版本支持 tokens=[...],还有些只支持路径。 + 3) 本函数用于在不同版本之间提供兼容调用,优先传入 token 名提高确定性。 + 4) 若当前版本不接受这些参数,会自动降级为仅传路径的调用方式。 + 5) 该函数不会保存或覆盖基础模型文件,只在运行时向 pipeline 注入增量 embedding。 + """ + try: + pipeline.load_textual_inversion(emb_dir, token=token) + return + except TypeError: + pass + + try: + pipeline.load_textual_inversion(emb_dir, tokens=[token]) + return + except TypeError: + pass + + pipeline.load_textual_inversion(emb_dir) + + def save_model_card( repo_id: str, images=None, @@ -84,12 +112,17 @@ def save_model_card( pipeline: DiffusionPipeline = None, placeholder_token: str = None, ): + # 说明: + # 1) 该函数用于生成并保存 README 模型卡片与示例图片,便于上传 Hub 或本地记录。 + # 2) 对于 Textual Inversion,模型文件主要是 learned_embeds.bin 与 tokenizer。 + # 3) 该模型卡会说明训练所用的 placeholder token 与训练 prompt。 + # 4) 生成的图片会被保存在 repo_folder 下,方便查看训练效果。 + # 5) 本函数不会修改模型参数,只做文档与示例资产的持久化。 img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f"![img_{i}](./image_{i}.png)\n" - # Model card updated for Textual Inversion model_description = f""" # Textual Inversion - {repo_id} @@ -123,11 +156,16 @@ def log_validation( epoch, is_final_validation=False, ): + # 说明: + # 1) 该函数用于在训练过程中做验证推理,观察当前 embedding 学到了什么。 + # 2) 会将 scheduler 替换为更适合推理的 DPMSolverMultistepScheduler。 + # 3) 会关闭安全检查器,避免被过滤导致无法看到结果。 + # 4) 既支持纯文生图,也支持某些管线的传图推理(依赖 args.validation_images)。 + # 5) 会把结果写入 tracker(tensorboard/wandb),并释放 GPU 显存。 logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" f" {args.validation_prompt}." ) - # We train on the simplified learning objective. 
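# NOTE: illustrative sketch only, not part of the patch. It shows a minimal inference-time use of the
# version-compatibility fallback that `_load_textual_inversion_compat` above implements. The base model
# id, embedding directory, and placeholder token below are assumptions for the example.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
emb_dir, token = "./textual-inversion-model", "<my-concept>"  # hypothetical output dir / token

try:
    pipe.load_textual_inversion(emb_dir, token=token)          # newer diffusers accept `token=`
except TypeError:
    try:
        pipe.load_textual_inversion(emb_dir, tokens=[token])   # some releases expect `tokens=[...]`
    except TypeError:
        pipe.load_textual_inversion(emb_dir)                   # oldest fallback: path only

pipe = pipe.to("cuda")
image = pipe(f"a photo of {token} in the snow", num_inference_steps=25).images[0]
image.save("ti_sample.png")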
If we were previously predicting a variance, we need the scheduler to ignore it scheduler_args = {} if "variance_type" in pipeline.scheduler.config: @@ -139,12 +177,11 @@ def log_validation( scheduler_args["variance_type"] = variance_type pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) - + pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) - pipeline.safety_checker = lambda images, clip_input: (images, [False for i in range(0, len(images))]) # disable safety checker + pipeline.safety_checker = lambda images, clip_input: (images, [False for i in range(0, len(images))]) - # run inference generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None if args.validation_images is None: @@ -182,6 +219,12 @@ def log_validation( def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + # 说明: + # 1) Stable Diffusion 不同变体可能使用不同的 text encoder 架构。 + # 2) 该函数读取 text_encoder 的配置,判断其 architectures 字段来确定具体类。 + # 3) 常见情况是 CLIPTextModel,也可能是 Roberta 或 T5 系列。 + # 4) 返回的类用于 from_pretrained 加载 text_encoder,保证结构匹配。 + # 5) 如果遇到未知架构会直接报错,避免后续 silent bug。 text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", @@ -206,6 +249,12 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st def parse_args(input_args=None): + # 说明: + # 1) 该函数定义所有可配置参数,支持命令行调用与被后端服务传参调用。 + # 2) 训练相关参数包含学习率、步数、批大小、混合精度、保存间隔等。 + # 3) Textual Inversion 需要 placeholder_token 与 initializer_token,并且 prompt 必须包含 placeholder。 + # 4) 验证推理参数用于在训练中生成图片,输出到指定目录用于可视化或服务返回。 + # 5) coords_* 参数用于记录 3D 可视化坐标数据,不影响训练但会增加少量开销。 parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--pretrained_model_name_or_path", @@ -240,7 +289,6 @@ def parse_args(input_args=None): required=True, help="A folder containing the training data of instance images.", ) - parser.add_argument( "--instance_prompt", type=str, @@ -276,21 +324,18 @@ def parse_args(input_args=None): " `args.validation_prompt` multiple times: `args.num_validation_images`." ), ) - parser.add_argument( "--output_dir", type=str, default="textual-inversion-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( "--validation_image_output_dir", type=str, default=None, help="The directory where validation images will be saved. If None, images will be saved inside a subdirectory of `output_dir`.", ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--resolution", @@ -310,12 +355,6 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." ), ) - # Textual Inversion only trains the embedding, not the full text encoder - # parser.add_argument( - # "--train_text_encoder", - # action="store_true", - # help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", - # ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -457,12 +496,10 @@ def parse_args(input_args=None): " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." 
), ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) - parser.add_argument( "--tokenizer_max_length", type=int, @@ -489,7 +526,6 @@ def parse_args(input_args=None): default=None, help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.", ) - parser.add_argument( "--initializer_token", type=str, @@ -497,8 +533,6 @@ def parse_args(input_args=None): required=True, help="A token to use as a proxy for the concept during training. Used to initialize the new placeholder embedding.", ) - - # [START] 为可视化方案增加的通用指标参数定义 (保持不变) parser.add_argument( "--coords_save_path", type=str, @@ -511,7 +545,6 @@ def parse_args(input_args=None): default=25, help="保存坐标数据的步数间隔。", ) - # [END] 为可视化方案增加的通用指标参数定义 if input_args is not None: args = parser.parse_args(input_args) @@ -521,26 +554,17 @@ def parse_args(input_args=None): env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank - - - # if "<" not in args.placeholder_token or ">" not in args.placeholder_token: - # logger.warning( - # f"The placeholder token `{args.placeholder_token}` does not seem to be enclosed by `<` and `>`. " - # f"Please make sure it's a unique token that is unlikely to exist in the vocabulary." - # ) return args class DreamBoothDataset(Dataset): - """ - A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. - - NOTE: Renamed from DreamBoothDataset to TI_Dataset for clarity, but keeping the name if possible to maintain - compatibility with original imports/logic. Reverting to original name to maintain structural parity. 
- """ - + # 说明: + # 1) 该数据集负责读取实例图片,并把图片变换到训练所需的张量格式。 + # 2) 同时会对 instance_prompt 做 tokenizer 编码,生成 input_ids 与 attention_mask。 + # 3) Textual Inversion 不做 prior preservation,因此长度等于实例图片数量。 + # 4) 图像会先 resize 再 crop,并归一化到 [-1,1](Normalize([0.5],[0.5]))。 + # 5) 返回的字典字段会在 collate_fn 中被组装成 batch,供 UNet 前向与损失计算使用。 def __init__( self, instance_data_root, @@ -548,21 +572,19 @@ class DreamBoothDataset(Dataset): tokenizer, size=512, center_crop=False, - # Encoder hidden states pre-computation is not supported for TI as the embeddings are the target - encoder_hidden_states=None, + encoder_hidden_states=None, class_prompt_encoder_hidden_states=None, tokenizer_max_length=None, ): self.size = size self.center_crop = center_crop self.tokenizer = tokenizer - - # TI does not support pre-computed embeddings + if encoder_hidden_states is not None or class_prompt_encoder_hidden_states is not None: - raise ValueError("Textual Inversion cannot use pre-computed encoder hidden states.") - - self.encoder_hidden_states = encoder_hidden_states # Should be None - self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states # Should be None + raise ValueError("Textual Inversion cannot use pre-computed encoder hidden states.") + + self.encoder_hidden_states = encoder_hidden_states + self.class_prompt_encoder_hidden_states = class_prompt_encoder_hidden_states self.tokenizer_max_length = tokenizer_max_length self.instance_data_root = Path(instance_data_root) @@ -572,8 +594,7 @@ class DreamBoothDataset(Dataset): self.instance_images_path = list(Path(instance_data_root).iterdir()) self.num_instance_images = len(self.instance_images_path) self.instance_prompt = instance_prompt - self._length = self.num_instance_images # Simplified length as no prior preservation - + self._length = self.num_instance_images self.image_transforms = transforms.Compose( [ @@ -596,19 +617,20 @@ class DreamBoothDataset(Dataset): instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) - # Tokenize instance prompt - text_inputs = tokenize_prompt( - self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length - ) + text_inputs = tokenize_prompt(self.tokenizer, self.instance_prompt, tokenizer_max_length=self.tokenizer_max_length) example["instance_prompt_ids"] = text_inputs.input_ids example["instance_attention_mask"] = text_inputs.attention_mask - # Class data logic removed for Textual Inversion - return example def collate_fn(examples): + # 说明: + # 1) 该函数负责将 Dataset 返回的若干条样本组装成一个 batch。 + # 2) 对图像张量做 stack,得到 (B,C,H,W) 的 pixel_values。 + # 3) 对 token 的 input_ids 做 cat,得到 (B,seq_len) 的输入矩阵。 + # 4) attention_mask 保持与 input_ids 对齐,用于 text encoder 的有效 token 标记。 + # 5) 输出 batch 会被训练循环直接使用,字段命名与后续代码保持一致。 has_attention_mask = "instance_attention_mask" in examples[0] input_ids = [example["instance_prompt_ids"] for example in examples] @@ -616,7 +638,7 @@ def collate_fn(examples): if has_attention_mask: attention_mask = [example["instance_attention_mask"] for example in examples] - + pixel_values = torch.stack(pixel_values) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() @@ -632,9 +654,14 @@ def collate_fn(examples): return batch -# PromptDataset and class image generation are removed as TI typically doesn't use prior preservation def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): + # 说明: + # 1) 对文本 prompt 做 tokenizer 编码,生成 input_ids 与 attention_mask。 + # 2) 使用固定长度 padding="max_length" 保证 batch 拼接简单一致。 + # 3) 
truncation=True 防止超过最大长度导致报错。 + # 4) tokenizer_max_length 允许外部指定最大长度;否则使用 tokenizer.model_max_length。 + # 5) 返回 transformers 的 BatchEncoding,后续直接取 input_ids 与 attention_mask 使用即可。 if tokenizer_max_length is not None: max_length = tokenizer_max_length else: @@ -647,11 +674,16 @@ def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None): max_length=max_length, return_tensors="pt", ) - return text_inputs def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=None): + # 说明: + # 1) 将 token id 输入 Text Encoder,得到用于 UNet 条件输入的 prompt_embeds。 + # 2) 如果启用 attention_mask,会把 mask 一并传入,以减少 padding token 的影响。 + # 3) 输出的 prompt_embeds 通常形状为 (B, seq_len, hidden_dim)。 + # 4) UNet 会把该 embedding 作为 cross-attention 的条件,实现文本引导。 + # 5) 该函数不涉及梯度以外的副作用,embedding 的更新由上层训练流程控制。 text_input_ids = input_ids.to(text_encoder.device) if text_encoder_use_attention_mask: @@ -665,20 +697,23 @@ def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_atte return_dict=False, ) prompt_embeds = prompt_embeds[0] - return prompt_embeds def main(args): + # 说明: + # 1) 主函数负责训练用的全部初始化:accelerate、模型加载、数据集/优化器/调度器。 + # 2) Textual Inversion 的关键是新增一个 placeholder token,并只训练该 token 的 embedding。 + # 3) 训练过程中会定期保存 learned_embeds.bin 与 tokenizer,并执行验证推理输出图片。 + # 4) 验证推理从基础模型加载,再加载 learned_embeds.bin,避免对基础模型权重产生写回影响。 + # 5) 若开启 coords_save_path,会按你原有逻辑采集并保存可视化坐标数据,不改变其行为。 if args.report_to == "wandb" and args.hub_token is not None: raise ValueError( "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." " Please use `huggingface-cli login` to authenticate with the Hub." ) - logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( @@ -688,7 +723,6 @@ def main(args): project_config=accelerator_project_config, ) - # Disable AMP for MPS. if torch.backends.mps.is_available(): accelerator.native_amp = False @@ -696,8 +730,6 @@ def main(args): if not is_wandb_available(): raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - - # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", @@ -711,13 +743,9 @@ def main(args): transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() - # If passed along, set the training seed now. 
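# NOTE: illustrative sketch of the tokenize_prompt / encode_prompt flow described above. It runs one
# prompt through a CLIP tokenizer and text encoder and prints the resulting shapes; the SD 1.5
# checkpoint and the prompt text are assumptions made for this example.
import torch
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")

text_inputs = tokenizer(
    "a photo of <my-concept>",              # hypothetical placeholder token
    padding="max_length",                   # fixed-length padding keeps batch concatenation trivial
    truncation=True,
    max_length=tokenizer.model_max_length,  # 77 for CLIP
    return_tensors="pt",
)
with torch.no_grad():
    prompt_embeds = text_encoder(text_inputs.input_ids, attention_mask=text_inputs.attention_mask)[0]

print(text_inputs.input_ids.shape)  # torch.Size([1, 77])
print(prompt_embeds.shape)          # torch.Size([1, 77, 768]) for SD 1.x text encoders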
if args.seed is not None: set_seed(args.seed) - # Prior preservation image generation logic removed - - # Handle the repository creation if accelerator.is_main_process: if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) @@ -727,7 +755,6 @@ def main(args): repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token ).repo_id - # Load the tokenizer if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) elif args.pretrained_model_name_or_path: @@ -737,34 +764,26 @@ def main(args): revision=args.revision, use_fast=False, ) + else: + raise ValueError("Must provide either --tokenizer_name or --pretrained_model_name_or_path") - # Add the placeholder token to the tokenizer vocabulary and initialize the new token embedding - # Get token IDs for initializer and placeholder tokens initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False) - placeholder_token_ids = tokenizer.encode(args.placeholder_token, add_special_tokens=False) - if len(initializer_token_ids) > 1: raise ValueError("The initializer token must be a single token.") - - if placeholder_token_ids != tokenizer.unk_token_id: - # If the placeholder is already in the vocab, it's either an existing token or was already added. - # We need to make sure it's actually the placeholder and not an existing common word. - # However, for simplicity and matching standard TI, we assume it's a new token. - # The standard approach is to *add* the placeholder token, which results in a list of new tokens. - - # Add the placeholder token to the tokenizer and get the new token ID - tokenizer.add_tokens(args.placeholder_token) - placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) - else: - # This case handles when the placeholder token is already a single, known token, which is usually fine, - # but in TI we usually want to add a *new* token. We rely on the `add_tokens` method below. - tokenizer.add_tokens(args.placeholder_token) - placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) - # import correct text encoder class + if args.placeholder_token in tokenizer.get_vocab(): + raise ValueError( + f"Placeholder token '{args.placeholder_token}' already exists in the tokenizer vocabulary. " + f"Please choose a unique, new token name (recommended: an angle-bracketed name such as '<my-new-token>')."
+ ) + + num_added = tokenizer.add_tokens(args.placeholder_token) + if num_added != 1: + raise ValueError(f"Failed to add placeholder token '{args.placeholder_token}' (num_added={num_added}).") + placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) - # Load scheduler and models noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant @@ -774,64 +793,40 @@ def main(args): args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant ) except OSError: - # IF does not have a VAE so let's just set it to None - # We don't have to error out here vae = None unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant ) - # Textual Inversion specific setup: Resize token embeddings and initialize new token text_encoder.resize_token_embeddings(len(tokenizer)) - + token_embeds = text_encoder.get_input_embeddings().weight.data initializer_token_id = tokenizer.convert_tokens_to_ids(args.initializer_token) - - # Initialize the new token embedding with the initializer token's embedding token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] - # Freeze all models and then unfreeze the embedding layer if vae is not None: vae.requires_grad_(False) text_encoder.requires_grad_(False) unet.requires_grad_(False) - # Only train the newly added embedding (Textual Inversion) embedding_layer = text_encoder.get_input_embeddings() embedding_layer.weight.requires_grad = True - - # Freeze all but the placeholder token's embedding. We create a mask/indices for the placeholder token ID. - # Note: Textual Inversion typically only trains the new token's embedding. - # We use a trick to register the embedding layer as trainable, but ensure only the new embedding is updated. - - # The simplest way is to ensure all embedding weights are trainable, and let the optimizer only update - # the ones that appear in the batch. However, a safer way is to specifically mark only the placeholder - # embedding as trainable. - - # Get the embedding tensor trainable_token_embeds = embedding_layer.weight - - # Mask to freeze all except the placeholder token's embedding + mask = torch.ones(len(tokenizer), dtype=torch.bool) - mask[placeholder_token_id] = False # We want the placeholder to be unmasked (trainable) - - # Freeze the embeddings that are NOT the placeholder token's + mask[placeholder_token_id] = False + trainable_token_embeds.data[mask] = trainable_token_embeds.data[mask].float() trainable_token_embeds.data[mask].requires_grad = False - - # Make sure the placeholder token's embedding is set to require gradients trainable_token_embeds.data[placeholder_token_id].requires_grad = True - # For mixed precision training we cast all non-trainable weights (vae, text_encoder and unet) to half-precision weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 elif accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 - # Move unet, vae and text_encoder to device and cast to weight_dtype - # Note: Only trainable parameters (new embeddings) must remain in float32 for fp16 training. 
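# NOTE: illustrative sketch of the embedding bookkeeping above, using a toy table instead of the real
# text encoder. The sizes and token ids are made up. Keeping a saved copy of the frozen rows and
# restoring it after each optimizer step is one common way to guarantee that only the new row moves.
import torch
import torch.nn as nn

vocab_size, hidden = 49409, 768        # e.g. 49408 CLIP tokens + 1 newly added placeholder (illustrative)
embedding = nn.Embedding(vocab_size, hidden)
placeholder_token_id = vocab_size - 1  # the row appended after add_tokens / resize_token_embeddings
initializer_token_id = 320             # hypothetical id of the initializer token

with torch.no_grad():
    # Start the new concept from the initializer token's vector.
    embedding.weight[placeholder_token_id] = embedding.weight[initializer_token_id].clone()

mask = torch.ones(vocab_size, dtype=torch.bool)
mask[placeholder_token_id] = False
orig_rows = embedding.weight.detach().clone()

# ... after optimizer.step() in the training loop ...
with torch.no_grad():
    embedding.weight[mask] = orig_rows[mask]  # only the placeholder row is allowed to move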
unet.to(accelerator.device, dtype=weight_dtype) if vae is not None: vae.to(accelerator.device, dtype=weight_dtype) @@ -844,7 +839,7 @@ def main(args): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17." ) unet.enable_xformers_memory_efficient_attention() else: @@ -852,82 +847,79 @@ def main(args): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - # Textual Inversion doesn't train the full text encoder, so we only need to checkpoint UNet - def unwrap_model(model): + # 说明: + # 1) accelerate 在分布式或混合精度下会包装模型,保存/取权重时需要先 unwrap。 + # 2) 如果启用 torch.compile,模型会被再次包装,需取 _orig_mod 获取真实模块。 + # 3) 该函数用于在保存 embedding、验证推理、访问模型权重时统一处理。 + # 4) 返回的模型对象是“原始模型”,便于直接访问 embedding 权重与 config。 + # 5) 该函数自身不做任何训练逻辑修改,只是一个安全的模型访问入口。 model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model return model - # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): + # 说明: + # 1) 该 Hook 用于让 accelerate.save_state 保存为 Textual Inversion 需要的最小产物。 + # 2) 主要保存 learned_embeds.bin(仅包含 placeholder_token 对应的 embedding 行)。 + # 3) 同时保存 tokenizer,以便后续复现训练 token 的 id 映射与 tokenizer 配置。 + # 4) 不保存 UNet/VAE/TextEncoder 的完整权重,避免体积巨大且不符合“增量”设计。 + # 5) 保存行为只发生在主进程,避免分布式重复写盘导致文件冲突。 if accelerator.is_main_process: - # We only save the trained token embedding text_encoder_unwrapped = unwrap_model(text_encoder) - - # Find the trained embedding - trained_embeddings = text_encoder_unwrapped.get_input_embeddings().weight[placeholder_token_id:placeholder_token_id+1] - - # Create a state dict to save - learned_embeds_dict = { - args.placeholder_token: trained_embeddings.detach().cpu() - } - - # Save the embedding file (similar to Textual Inversion pipelines) + trained_embeddings = text_encoder_unwrapped.get_input_embeddings().weight[ + placeholder_token_id : placeholder_token_id + 1 + ] + learned_embeds_dict = {args.placeholder_token: trained_embeddings.detach().cpu()} torch.save(learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")) - - # make sure to pop weight so that corresponding model is not saved again - weights.pop() - - # Also save tokenizer for completeness + + if len(weights) > 0: + weights.pop() + tokenizer.save_pretrained(output_dir) def load_model_hook(models, input_dir): + # 说明: + # 1) 该 Hook 用于从 checkpoint 恢复训练时,将 learned_embeds.bin 写回到 text_encoder embedding。 + # 2) 对于 Textual Inversion,恢复的关键是 placeholder_token 对应 embedding 行,而非整个模型。 + # 3) 同时通过 checkpoint 内的 tokenizer 获取 placeholder_token 的 token_id,以保证写入位置一致。 + # 4) 若 checkpoint 缺失 learned_embeds.bin,会打印警告并跳过,允许从头开始训练。 + # 5) 该逻辑只改变当前训练进程内的权重状态,不会修改基础模型目录的文件。 text_encoder_ = None while len(models) > 0: model = models.pop() if isinstance(model, type(unwrap_model(text_encoder))): text_encoder_ = model - # UNet is not passed to the load hook for training state, only text_encoder's embedding matters - # Load the embedding file embedding_path = os.path.join(input_dir, "learned_embeds.bin") if not os.path.exists(embedding_path): logger.warning(f"Could not find 
learned_embeds.bin at {embedding_path}. This may be normal if starting a new run.") return state_dict = torch.load(embedding_path, map_location="cpu") - - # We expect a dictionary where the key is the placeholder token if args.placeholder_token not in state_dict: raise ValueError( f"Trained embedding not found for placeholder token '{args.placeholder_token}' in loaded state dict." ) - + learned_embeds = state_dict[args.placeholder_token] - - # Load embedding into the text encoder - token_embeds = text_encoder_.get_input_embeddings().weight.data - - # Ensure the current tokenizer and text encoder size is consistent with the checkpoint + token_embeds_local = text_encoder_.get_input_embeddings().weight.data + current_tokenizer = AutoTokenizer.from_pretrained(input_dir) current_placeholder_token_id = current_tokenizer.convert_tokens_to_ids(args.placeholder_token) - if current_placeholder_token_id == current_tokenizer.unk_token_id: - raise ValueError( + raise ValueError( f"Placeholder token '{args.placeholder_token}' not found in the tokenizer loaded from checkpoint at {input_dir}. " "Ensure your checkpoint contains the tokenizer with the added placeholder token." ) - - token_embeds[current_placeholder_token_id] = learned_embeds.to(token_embeds.dtype).to(token_embeds.device) + token_embeds_local[current_placeholder_token_id] = learned_embeds.to(token_embeds_local.dtype).to(token_embeds_local.device) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) - # Enable TF32 for faster training on Ampere GPUs if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True @@ -936,30 +928,21 @@ def main(args): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) - # Only upcast trainable parameters (embedding) into fp32 if mixed precision is used if accelerator.mixed_precision == "fp16": - # The embedding layer is the only part that needs to be checked cast_training_params([text_encoder], dtype=torch.float32) - - # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs if args.use_8bit_adam: try: import bitsandbytes as bnb except ImportError: - raise ImportError( - "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) - + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.") optimizer_class = bnb.optim.AdamW8bit else: optimizer_class = torch.optim.AdamW - # Optimizer creation: only includes the trainable embedding parameters params_to_optimize = list(filter(lambda p: p.requires_grad, text_encoder.parameters())) - if not params_to_optimize: - raise ValueError("No trainable parameters found. Check if the embedding layer is set to requires_grad=True.") + raise ValueError("No trainable parameters found. 
Check if the embedding layer is set to requires_grad=True.") optimizer = optimizer_class( params_to_optimize, @@ -969,13 +952,11 @@ def main(args): eps=args.adam_epsilon, ) - # Pre-computation is not supported for Textual Inversion, so this block is simplified pre_computed_encoder_hidden_states = None pre_computed_class_prompt_encoder_hidden_states = None validation_prompt_encoder_hidden_states = None validation_prompt_negative_prompt_embeds = None - - # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, @@ -995,7 +976,6 @@ def main(args): num_workers=args.dataloader_num_workers, ) - # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: @@ -1011,28 +991,20 @@ def main(args): power=args.lr_power, ) - # Prepare everything with our `accelerator`. - # Only UNet, Text Encoder and Optimizer are prepared (VAE is not optimized/frozen) unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler ) - - # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) - # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. if accelerator.is_main_process: tracker_config = vars(copy.deepcopy(args)) tracker_config.pop("validation_images") - accelerator.init_trackers("textual-inversion", config=tracker_config) # Updated project name + accelerator.init_trackers("textual-inversion", config=tracker_config) - # Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") @@ -1043,27 +1015,25 @@ def main(args): logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 first_epoch = 0 - # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: resume_path = args.output_dir - + try: accelerator.print(f"Resuming from checkpoint at {resume_path}") accelerator.load_state(resume_path) - - # After loading state, `accelerator` updates its internal state including `step` and `epoch` + initial_global_step = accelerator.state.global_step global_step = initial_global_step - - # Recalculate first_epoch based on the loaded global_step + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) first_epoch = global_step // num_update_steps_per_epoch accelerator.print(f"Resumed at global step {global_step} and epoch {first_epoch}") - + except Exception as e: accelerator.print( f"Could not load state from '{resume_path}'. Starting a new training run. 
Error: {e}" @@ -1075,55 +1045,43 @@ def main(args): initial_global_step = 0 first_epoch = 0 - # [START] 为可视化方案增加的初始化和导入 (保持不变) coords_list = [] - # 提前定义 X, Y 指标的临时存储变量,用于跨代码块传递数据 X_i_feature_norm = float("nan") Y_i_feature_var = float("nan") - + if args.coords_save_path is not None: - logger.info( - f"可视化指标采集已启用。数据将每 {args.coords_log_interval} 步保存一次到 {args.coords_save_path}" - ) - # [END] 为可视化方案增加的初始化和导入 + logger.info(f"可视化指标采集已启用。数据将每 {args.coords_log_interval} 步保存一次到 {args.coords_save_path}") progress_bar = tqdm( range(0, args.max_train_steps), initial=initial_global_step, desc="Steps", - # Only show the progress bar once on each machine. disable=not accelerator.is_local_main_process, ) for epoch in range(first_epoch, args.num_train_epochs): - unet.train() # UNet is frozen, but keep in train mode for modules like Dropout (if any) + unet.train() text_encoder.train() + for step, batch in enumerate(train_dataloader): with accelerator.accumulate(unet): pixel_values = batch["pixel_values"].to(dtype=weight_dtype) if vae is not None: - # Convert images to latent space model_input = vae.encode(pixel_values).latent_dist.sample() model_input = model_input * vae.config.scaling_factor else: model_input = pixel_values - # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) bsz, channels, height, width = model_input.shape - # Sample a random timestep for each image + timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device - ) - timesteps = timesteps.long() + ).long() - # Add noise to the model input according to the noise magnitude at each timestep - # (this is the forward diffusion process) noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) - # Get the text embedding for conditioning - # Since pre_compute_text_embeddings is false, we encode the prompt here encoder_hidden_states = encode_prompt( text_encoder, batch["input_ids"], @@ -1139,7 +1097,6 @@ def main(args): else: class_labels = None - # Predict the noise residual model_pred = unet( noisy_model_input, timesteps, @@ -1148,28 +1105,13 @@ def main(args): return_dict=False, )[0] - # If model predicts variance, throw away the prediction. if model_pred.shape[1] == 6: model_pred, _ = torch.chunk(model_pred, 2, dim=1) - # [START] 为可视化方案增加的 X轴 (特征范数) 和 Y轴 (特征方差) 计算 (通用指标) (保持不变) if args.coords_save_path is not None: - # 修正 X轴 计算:将 torch.linalg.norm 替换为传统的 torch.norm - # 传统的 torch.norm 支持对多个维度求范数 (dim=[1, 2, 3]) - # X轴: UNet 预测特征 L2 范数 (衡量预测的“强度”) - # torch.norm(..., p=2, dim=...) 表示 L2 范数 - X_i_feature_norm = torch.norm( - model_pred.detach().float(), - p=2, - dim=[1, 2, 3] # 对 C, H, W 维度求 L2 范数 - ).mean().item() # 对 Batch 维度求平均 - - # Y轴: UNet 预测特征方差 (衡量预测的“混乱度/稳定性”) - # var() 默认对所有维度求方差,我们对 C, H, W 求方差,然后对 Batch 求平均 + X_i_feature_norm = torch.norm(model_pred.detach().float(), p=2, dim=[1, 2, 3]).mean().item() Y_i_feature_var = model_pred.detach().float().var(dim=[1, 2, 3]).mean().item() - # [END] 为可视化方案增加的 X轴 (特征范数) 和 Y轴 (特征方差) 计算 (通用指标) - # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": @@ -1177,122 +1119,71 @@ def main(args): else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - # Prior preservation block removed for Textual Inversion. 
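# NOTE: illustrative sketch of the X/Y coordinate metrics computed above, using a random tensor as a
# stand-in for the real UNet prediction (B=2 samples, 4 latent channels, 64x64 latents are assumptions).
import torch

model_pred = torch.randn(2, 4, 64, 64)
step_loss = torch.tensor(0.123)  # stand-in for the MSE loss computed right below

X = torch.norm(model_pred.float(), p=2, dim=[1, 2, 3]).mean().item()  # per-sample L2 norm, batch mean
Y = model_pred.float().var(dim=[1, 2, 3]).mean().item()               # per-sample variance, batch mean
Z = step_loss.item()                                                  # LDM loss for this step
print([X, Y, Z])  # one row of the coords CSV written by the training loop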
loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") accelerator.backward(loss) if accelerator.sync_gradients: - # Only clip gradient for trainable parameters - # For Textual Inversion, only the embedding requires grad - accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm) - + accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm) + optimizer.step() lr_scheduler.step() optimizer.zero_grad() - - # Ensure only the placeholder token's embedding is updated and all others are clamped - # This is the "slicing" step typical of TI to ensure only the learned token moves + if accelerator.num_processes > 1: - # For DDP/Distributed training, we need to unwrap the model to apply the mask unwrapped_text_encoder = unwrap_model(text_encoder) trainable_embeds = unwrapped_text_encoder.get_input_embeddings().weight else: trainable_embeds = text_encoder.get_input_embeddings().weight - - # Clamp the non-placeholder embeddings (ensure they don't move) + trainable_embeds.data[mask] = trainable_embeds.data[mask].float().to(trainable_embeds.device) trainable_embeds.data[placeholder_token_id] = trainable_embeds.data[placeholder_token_id].float() - - # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) global_step += 1 - - # [START] 为可视化方案增加的 X, Y, Z轴 数据记录和保存 (通用指标) (保持不变) if args.coords_save_path is not None and ( global_step % args.coords_log_interval == 0 or global_step == 1 or global_step == initial_global_step + 1 ): - - # Z轴: LDM 损失 (直接获取当前步的 loss) Z_i = loss.detach().item() - - # 记录坐标数据 (X和Y已在前面计算) coords_list.append([global_step, X_i_feature_norm, Y_i_feature_var, Z_i]) - - # 实时保存到文件 (覆盖保存,确保文件始终是最新的) + df = pd.DataFrame( coords_list, - columns=['step', 'X_Feature_L2_Norm', 'Y_Feature_Variance', 'Z_LDM_Loss'] + columns=["step", "X_Feature_L2_Norm", "Y_Feature_Variance", "Z_LDM_Loss"], ) - - # 假设 args.coords_save_path 是目标文件路径 (如 ./data/coords.csv) + save_file_path = Path(args.coords_save_path) if not save_file_path.suffix: save_file_path = save_file_path / "coords.csv" save_file_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(save_file_path, index=False) - + if global_step % (args.coords_log_interval * 10) == 0: logger.info( f"Step {global_step}: 已记录并保存可视化坐标 (X={X_i_feature_norm:.4f}, Y={Y_i_feature_var:.4f}, Z={Z_i:.4f}) 到 {save_file_path}" ) - # [END] 为可视化方案增加的 X, Y, Z轴 数据记录和保存 (通用指标) - if accelerator.is_main_process: if (global_step + 1) % args.checkpointing_steps == 0: - # 1. 保存模型参数:直接保存到 args.output_dir,覆盖上一轮 output_dir = args.output_dir - # accelerator.save_state handles saving the models using the registered hooks accelerator.save_state(output_dir) logger.info(f"Saving state to {output_dir} at step {global_step+1}") - # 2. 推理调用模型:从 args.output_dir 加载最新的模型权重 - # Textual Inversion Pipeline loading pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, - text_encoder=unwrap_model(text_encoder), # Use the unwrapped text encoder revision=args.revision, variant=args.variant, torch_dtype=weight_dtype, ) - - # Load the learned embedding into the pipeline's tokenizer/text_encoder - # (The load hook handles the actual embedding tensor update during accelerator.load_state) - # Here, we only need to load the tokenizer to ensure the pipeline has the placeholder token - pipeline.tokenizer = AutoTokenizer.from_pretrained(output_dir) - pipeline.text_encoder.resize_token_embeddings(len(pipeline.tokenizer)) - # 🌟 关键修复:手动加载 learned_embeds.bin 文件 - # 1. 
加载 learned_embeds.bin - path = os.path.join(args.output_dir, "learned_embeds.bin") - if not os.path.exists(path): - # 如果文件名为 pytorch_model.bin (accelerate保存的完整模型),我们需要从模型中提取 - # 此处假设您只保存了 learned_embeds.bin - logger.warning("learned_embeds.bin not found. Skipping manual load.") - else: - # 加载权重字典 - loaded_embeds = torch.load(path, map_location="cpu") - - # 2. 提取唯一的 key (例如 'sks') 和 embedding tensor - token_name = list(loaded_embeds.keys())[0] - embedding = loaded_embeds[token_name] - - # 3. 获取新 token 的 ID - token_id = pipeline.tokenizer.convert_tokens_to_ids(token_name) - - # 4. 将权重插入到 Text Encoder 的 Embedding Layer 中 - text_encoder_embeddings = pipeline.text_encoder.get_input_embeddings() - text_encoder_embeddings.weight.data[token_id] = embedding.to(text_encoder_embeddings.weight.dtype).to(text_encoder_embeddings.weight.device) - - # 保持 pipeline 在 GPU 上 + + _load_textual_inversion_compat(pipeline, output_dir, token=args.placeholder_token) + pipeline.to(accelerator.device) - # Set pipeline args pipeline_args = {"prompt": args.validation_prompt} images = log_validation( @@ -1303,20 +1194,16 @@ def main(args): epoch, ) - # 3. 推理生成结果保存:直接保存到指定目录/output_dir,不创建子文件夹 base_save_path = Path(args.validation_image_output_dir or args.output_dir) base_save_path.mkdir(parents=True, exist_ok=True) logger.info(f"Saving validation images to {base_save_path}") - # 图片直接保存在 base_save_path,会覆盖上一轮的同名图片 for i, image in enumerate(images): image.save(base_save_path / f"image_{i}.png") - - # Clean up pipeline to save memory + del pipeline gc.collect() torch.cuda.empty_cache() - logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) @@ -1325,32 +1212,29 @@ def main(args): if global_step >= args.max_train_steps: break - - # Save the final embeddings and tokenizer accelerator.wait_for_everyone() if accelerator.is_main_process: text_encoder = unwrap_model(text_encoder) - - # Final save of the learned_embeds.bin and tokenizer - trained_embeddings = text_encoder.get_input_embeddings().weight[placeholder_token_id:placeholder_token_id+1] - + + trained_embeddings = text_encoder.get_input_embeddings().weight[ + placeholder_token_id : placeholder_token_id + 1 + ] learned_embeds_dict = { args.placeholder_token: trained_embeddings.detach().cpu() } - + torch.save(learned_embeds_dict, os.path.join(args.output_dir, "learned_embeds.bin")) tokenizer.save_pretrained(args.output_dir) - # Final inference pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, revision=args.revision, variant=args.variant, torch_dtype=weight_dtype + args.pretrained_model_name_or_path, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, ) - # Load the final embedding - pipeline.tokenizer = AutoTokenizer.from_pretrained(args.output_dir) - pipeline.text_encoder.resize_token_embeddings(len(pipeline.tokenizer)) - pipeline.load_textual_inversion(args.output_dir) + _load_textual_inversion_compat(pipeline, args.output_dir, token=args.placeholder_token) + pipeline.to(accelerator.device) - # run inference images = [] if args.validation_prompt and args.num_validation_images > 0: pipeline_args = {"prompt": args.validation_prompt, "num_inference_steps": 25} @@ -1368,11 +1252,11 @@ def main(args): repo_id, images=images, base_model=args.pretrained_model_name_or_path, - train_text_encoder=False, # TI is not full text encoder training + train_text_encoder=False, prompt=args.instance_prompt, repo_folder=args.output_dir, pipeline=pipeline, - 
placeholder_token=args.placeholder_token, # Added for TI + placeholder_token=args.placeholder_token, ) upload_folder( repo_id=repo_id, @@ -1381,20 +1265,17 @@ def main(args): ignore_patterns=["step_*", "epoch_*"], ) - # [START] 为可视化方案增加的最终保存 (通用指标) (保持不变) if args.coords_save_path is not None and coords_list: df = pd.DataFrame( coords_list, - columns=['step', 'X_Feature_L2_Norm', 'Y_Feature_Variance', 'Z_LDM_Loss'] + columns=["step", "X_Feature_L2_Norm", "Y_Feature_Variance", "Z_LDM_Loss"], ) - # 假设 args.coords_save_path 是目标文件路径 save_file_path = Path(args.coords_save_path) if not save_file_path.suffix: save_file_path = save_file_path / "coords.csv" save_file_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(save_file_path, index=False) logger.info(f"训练结束:已将所有 {len(coords_list)} 步可视化坐标数据保存到 {save_file_path}") - # [END] 为可视化方案增加的最终保存 accelerator.end_training() -- 2.34.1
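# NOTE: illustrative post-training check, not part of the patch. It inspects the saved learned_embeds.bin
# and plots the coords CSV; both file paths are assumptions based on the defaults discussed above
# (output_dir and coords_save_path are configurable).
import torch
import pandas as pd
import matplotlib.pyplot as plt

learned = torch.load("textual-inversion-model/learned_embeds.bin", map_location="cpu")
for token, embed in learned.items():
    print(token, tuple(embed.shape))  # e.g. ('<my-concept>', (1, 768))

coords = pd.read_csv("coords.csv")    # columns: step, X_Feature_L2_Norm, Y_Feature_Variance, Z_LDM_Loss
ax = plt.figure().add_subplot(projection="3d")
ax.scatter(coords["X_Feature_L2_Norm"], coords["Y_Feature_Variance"], coords["Z_LDM_Loss"], c=coords["step"])
ax.set_xlabel("X: feature L2 norm")
ax.set_ylabel("Y: feature variance")
ax.set_zlabel("Z: LDM loss")
plt.savefig("coords_3d.png")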