From df1e43ec3a7ed330d2b581f3c420bff11b08aec9 Mon Sep 17 00:00:00 2001 From: Ryan <3266408525@qq.com> Date: Wed, 7 Jan 2026 00:25:32 +0800 Subject: [PATCH 1/3] =?UTF-8?q?improve:=20=E8=83=A1=E5=B8=86=E6=8F=90?= =?UTF-8?q?=E4=BA=A4=E9=85=8D=E7=BD=AE=E5=8F=82=E6=95=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/config/algorithm_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/config/algorithm_config.py b/src/backend/config/algorithm_config.py index 53662d5..5136de8 100644 --- a/src/backend/config/algorithm_config.py +++ b/src/backend/config/algorithm_config.py @@ -145,7 +145,8 @@ class AlgorithmConfig: 'max_train_steps': 250, 'hflip': True, 'mixed_precision': 'bf16', - 'alpha': 5e-3 + 'alpha': 5e-3, + 'eps': 0.05 } }, 'caat_pro': { @@ -156,7 +157,7 @@ class AlgorithmConfig: 'pretrained_model_name_or_path': MODELS_DIR['model2'], 'with_prior_preservation': True, 'instance_prompt': 'a selfie photo of person', - 'class_prompt': 'a selfie photo of person', + 'class_prompt': 'person', 'num_class_images': 200, 'resolution': 512, 'learning_rate': 1e-5, @@ -233,7 +234,6 @@ class AlgorithmConfig: 'max_train_steps': 2000, 'center_crop': True, 'step_size': 0.002, - 'save_every': 200, 'attack_type': 'add-log', 'seed': 0, 'dataloader_num_workers': 2 -- 2.34.1 From adca87d8f54dc42e9d674c714503761b18a9dd74 Mon Sep 17 00:00:00 2001 From: Ryan <3266408525@qq.com> Date: Wed, 7 Jan 2026 00:26:04 +0800 Subject: [PATCH 2/3] =?UTF-8?q?improve:=20=E8=83=A1=E5=B8=86=E6=8F=90?= =?UTF-8?q?=E4=BA=A4=E7=AE=97=E6=B3=95=E4=BC=98=E5=8C=96=E5=92=8C=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/algorithms/perturbation/aspl.py | 688 +++++----- .../app/algorithms/perturbation/caat.py | 1152 +++++++++-------- .../app/algorithms/perturbation/simac.py | 1057 +++++++-------- .../app/scripts/attack_anti_face_edit.sh | 1 - src/backend/app/scripts/attack_caat.sh | 31 +- .../app/scripts/attack_caat_with_prior.sh | 31 +- 6 files changed, 1434 insertions(+), 1526 deletions(-) diff --git a/src/backend/app/algorithms/perturbation/aspl.py b/src/backend/app/algorithms/perturbation/aspl.py index 8ee7943..e6024e7 100644 --- a/src/backend/app/algorithms/perturbation/aspl.py +++ b/src/backend/app/algorithms/perturbation/aspl.py @@ -1,10 +1,12 @@ import argparse import copy +import gc import hashlib import itertools import logging import os from pathlib import Path +from typing import Any, Dict, Optional import datasets import diffusers @@ -28,42 +30,123 @@ from transformers import AutoTokenizer, PretrainedConfig logger = get_logger(__name__) +# ----------------------------- +# Lightweight debug helpers (low overhead) +# ----------------------------- +def _cuda_gc() -> None: + """Best-effort CUDA memory cleanup (does not change algorithmic behavior).""" + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + +def _fmt_bytes(n: int) -> str: + return f"{n / (1024**2):.1f}MB" + + +def log_cuda( + prefix: str, + accelerator: Optional[Accelerator] = None, + sync: bool = False, + extra: Optional[Dict[str, Any]] = None, +) -> None: + """Log CUDA memory stats without copying tensors to CPU.""" + if not torch.cuda.is_available(): + logger.info(f"[mem] {prefix} cuda_not_available") + return + if sync: + torch.cuda.synchronize() + alloc = torch.cuda.memory_allocated() + reserv = 
torch.cuda.memory_reserved() + max_alloc = torch.cuda.max_memory_allocated() + max_reserv = torch.cuda.max_memory_reserved() + dev = str(accelerator.device) if accelerator is not None else "cuda" + msg = ( + f"[mem] {prefix} dev={dev} alloc={_fmt_bytes(alloc)} reserv={_fmt_bytes(reserv)} " + f"max_alloc={_fmt_bytes(max_alloc)} max_reserv={_fmt_bytes(max_reserv)}" + ) + if extra: + msg += " " + " ".join([f"{k}={v}" for k, v in extra.items()]) + logger.info(msg) + + +def log_path_stats(prefix: str, p: Path) -> None: + """Log directory/file existence and file count (best-effort).""" + try: + exists = p.exists() + is_dir = p.is_dir() if exists else False + n_files = 0 + if exists and is_dir: + n_files = sum(1 for x in p.iterdir() if x.is_file()) + logger.info(f"[path] {prefix} path={str(p)} exists={exists} is_dir={is_dir} files={n_files}") + except Exception as e: + logger.info(f"[path] {prefix} path={str(p)} stat_error={repr(e)}") + + +def log_args(args: argparse.Namespace) -> None: + for k in sorted(vars(args).keys()): + logger.info(f"[args] {k}={getattr(args, k)}") + + +def log_tensor_meta(prefix: str, t: Optional[torch.Tensor]) -> None: + if t is None: + logger.info(f"[tensor] {prefix} None") + return + logger.info( + f"[tensor] {prefix} shape={tuple(t.shape)} dtype={t.dtype} device={t.device} " + f"requires_grad={t.requires_grad} is_leaf={t.is_leaf}" + ) + + +# ----------------------------- +# Dataset +# ----------------------------- class DreamBoothDatasetFromTensor(Dataset): """基于内存张量的 DreamBooth 数据集:直接使用张量输入,返回图像与对应 prompt token。""" def __init__( self, - instance_images_tensor, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - size=512, - center_crop=False, + instance_images_tensor: torch.Tensor, + instance_prompt: str, + tokenizer: AutoTokenizer, + class_data_root: Optional[str] = None, + class_prompt: Optional[str] = None, + size: int = 512, + center_crop: bool = False, ): - # 保存图像处理参数与 tokenizer self.size = size self.center_crop = center_crop self.tokenizer = tokenizer - # 实例数据:直接来自传入的张量列表 self.instance_images_tensor = instance_images_tensor self.num_instance_images = len(self.instance_images_tensor) self.instance_prompt = instance_prompt self._length = self.num_instance_images - # 可选类数据:用于先验保持,长度取实例与类数据的最大值 if class_data_root is not None: self.class_data_root = Path(class_data_root) self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = list(self.class_data_root.iterdir()) + # Only keep files to avoid directories affecting length. + self.class_images_path = [p for p in self.class_data_root.iterdir() if p.is_file()] self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) self.class_prompt = class_prompt + + if self.num_class_images == 0: + raise ValueError( + f"class_data_dir is empty: {self.class_data_root}. " + f"Prior preservation requires class images. " + f"Please generate class images first, or fix class_data_dir, " + f"or disable --with_prior_preservation." 
+ ) + if self.class_prompt is None: + raise ValueError("class_prompt is required when class_data_root is provided.") else: self.class_data_root = None + self.class_images_path = [] + self.num_class_images = 0 + self.class_prompt = None - # 统一的图像预处理 self.image_transforms = transforms.Compose( [ transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), @@ -73,12 +156,11 @@ class DreamBoothDatasetFromTensor(Dataset): ] ) - def __len__(self): + def __len__(self) -> int: return self._length - def __getitem__(self, index): - # 取出实例图像张量与对应 prompt token - example = {} + def __getitem__(self, index: int) -> Dict[str, Any]: + example: Dict[str, Any] = {} instance_image = self.instance_images_tensor[index % self.num_instance_images] example["instance_images"] = instance_image example["instance_prompt_ids"] = self.tokenizer( @@ -89,14 +171,15 @@ class DreamBoothDatasetFromTensor(Dataset): return_tensors="pt", ).input_ids - # 若有类数据,则同时返回类图像与类 prompt token - if self.class_data_root: + if self.class_data_root is not None: + if self.num_class_images == 0: + raise ValueError(f"class_data_dir became empty at runtime: {self.class_data_root}") class_image = Image.open(self.class_images_path[index % self.num_class_images]) - if not class_image.mode == "RGB": + if class_image.mode != "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, + self.class_prompt, # type: ignore[arg-type] truncation=True, padding="max_length", max_length=self.tokenizer.model_max_length, @@ -106,8 +189,10 @@ class DreamBoothDatasetFromTensor(Dataset): return example -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - # 根据 text_encoder 配置识别其架构,选择正确的模型类 +# ----------------------------- +# Model helper +# ----------------------------- +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: Optional[str]): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", @@ -119,254 +204,97 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st from transformers import CLIPTextModel return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": + if model_class == "RobertaSeriesModelWithTransformation": from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation return RobertaSeriesModelWithTransformation - else: - raise ValueError(f"{model_class} is not supported.") - - -def parse_args(input_args=None): - # 解析命令行参数:模型路径、数据路径、对抗参数、先验保持、训练与日志配置 - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help=( - "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" - " float32 precision." 
- ), - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir_for_train", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--instance_data_dir_for_adversarial", - type=str, - default=None, - required=True, - help="A folder containing the images to add adversarial noise", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument( - "--prior_loss_weight", - type=float, - default=1.0, - help="The weight of prior preservation loss.", - ) - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. 
If set, the text encoder should be float32 precision.", - ) - parser.add_argument( - "--train_batch_size", - type=int, - default=4, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--sample_batch_size", - type=int, - default=8, - help="Batch size (per device) for sampling images.", - ) - parser.add_argument( - "--max_train_steps", - type=int, - default=20, - help="Total number of training steps to perform.", - ) - parser.add_argument( - "--max_f_train_steps", - type=int, - default=10, - help="Total number of sub-steps to train surogate model.", - ) - parser.add_argument( - "--max_adv_train_steps", - type=int, - default=10, - help="Total number of sub-steps to train adversarial noise.", - ) - parser.add_argument( - "--checkpointing_iterations", - type=int, - default=5, - help=("Save a checkpoint of the training state every X iterations."), - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--allow_tf32", - action="store_true", - help=( - "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" - " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default="fp16", - choices=["no", "fp16", "bf16"], - help=( - "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" - " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." 
- ), - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", - action="store_true", - help="Whether or not to use xformers.", - ) - parser.add_argument( - "--pgd_alpha", - type=float, - default=1.0 / 255, - help="The step size for pgd.", - ) - parser.add_argument( - "--pgd_eps", - type=int, - default=0.05, - help="The noise budget for pgd.", - ) - parser.add_argument( - "--target_image_path", - default=None, - help="target image for attacking", - ) + raise ValueError(f"{model_class} is not supported.") + + +# ----------------------------- +# Args +# ----------------------------- +def parse_args(input_args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="ASPL training script with diagnostics.") + parser.add_argument("--pretrained_model_name_or_path", type=str, default=None, required=True) + parser.add_argument("--revision", type=str, default=None, required=False) + parser.add_argument("--tokenizer_name", type=str, default=None) + + parser.add_argument("--instance_data_dir_for_train", type=str, default=None, required=True) + parser.add_argument("--instance_data_dir_for_adversarial", type=str, default=None, required=True) + + parser.add_argument("--class_data_dir", type=str, default=None, required=False) + parser.add_argument("--instance_prompt", type=str, default=None, required=True) + parser.add_argument("--class_prompt", type=str, default=None) + + parser.add_argument("--with_prior_preservation", default=False, action="store_true") + parser.add_argument("--prior_loss_weight", type=float, default=1.0) + parser.add_argument("--num_class_images", type=int, default=100) + + parser.add_argument("--output_dir", type=str, default="text-inversion-model") + parser.add_argument("--seed", type=int, default=None) + + parser.add_argument("--resolution", type=int, default=512) + parser.add_argument("--center_crop", default=False, action="store_true") + + parser.add_argument("--train_text_encoder", action="store_true") + parser.add_argument("--train_batch_size", type=int, default=4) + + parser.add_argument("--sample_batch_size", type=int, default=8) + + parser.add_argument("--max_train_steps", type=int, default=20) + parser.add_argument("--max_f_train_steps", type=int, default=10) + parser.add_argument("--max_adv_train_steps", type=int, default=10) + + parser.add_argument("--checkpointing_iterations", type=int, default=5) + + parser.add_argument("--learning_rate", type=float, default=5e-6) + parser.add_argument("--logging_dir", type=str, default="logs") + + parser.add_argument("--allow_tf32", action="store_true") + parser.add_argument("--report_to", type=str, default="tensorboard") + parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) + + parser.add_argument("--enable_xformers_memory_efficient_attention", action="store_true") + + parser.add_argument("--pgd_alpha", type=float, default=1.0 / 255) + parser.add_argument("--pgd_eps", type=float, default=0.05) # keep float, later /255 + + parser.add_argument("--target_image_path", default=None) + + # Debug / diagnostics (low-overhead) + parser.add_argument("--debug", action="store_true", help="Enable detailed logs for failure points.") + parser.add_argument("--debug_cuda_sync", action="store_true", help="Synchronize CUDA for more accurate mem logs.") + parser.add_argument("--debug_step0_only", action="store_true", help="Only print per-step logs for step 0.") if input_args is not None: args = parser.parse_args(input_args) else: args = parser.parse_args() - return 
args +# ----------------------------- +# Class image prompt dataset +# ----------------------------- class PromptDataset(Dataset): """用于批量生成 class 图像的提示词数据集,可在多 GPU 环境下并行采样。""" - def __init__(self, prompt, num_samples): + def __init__(self, prompt: str, num_samples: int): self.prompt = prompt self.num_samples = num_samples - def __len__(self): + def __len__(self) -> int: return self.num_samples - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example + def __getitem__(self, index: int) -> Dict[str, Any]: + return {"prompt": self.prompt, "index": index} -def load_data(data_dir, size=512, center_crop=True) -> torch.Tensor: - # 读取目录下所有图片,按训练要求 resize/crop/normalize,返回堆叠后的张量 +# ----------------------------- +# IO +# ----------------------------- +def load_data(data_dir: Path, size: int = 512, center_crop: bool = True) -> torch.Tensor: image_transforms = transforms.Compose( [ transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), @@ -376,21 +304,16 @@ def load_data(data_dir, size=512, center_crop=True) -> torch.Tensor: ] ) - images = [image_transforms(Image.open(i).convert("RGB")) for i in list(Path(data_dir).iterdir())] - images = torch.stack(images) - return images - - -def train_one_epoch( - args, - models, - tokenizer, - noise_scheduler, - vae, - data_tensor: torch.Tensor, - num_steps=20, -): - # 单轮训练:复制当前模型,使用给定数据迭代若干步,返回更新后的副本 + images = [image_transforms(Image.open(p).convert("RGB")) for p in list(Path(data_dir).iterdir()) if p.is_file()] + if len(images) == 0: + raise ValueError(f"No image files found in directory: {data_dir}") + return torch.stack(images) + + +# ----------------------------- +# Core routines +# ----------------------------- +def train_one_epoch(args, models, tokenizer, noise_scheduler, vae, data_tensor: torch.Tensor, num_steps: int = 20): unet, text_encoder = copy.deepcopy(models[0]), copy.deepcopy(models[1]) params_to_optimize = itertools.chain(unet.parameters(), text_encoder.parameters()) @@ -406,7 +329,7 @@ def train_one_epoch( data_tensor, args.instance_prompt, tokenizer, - args.class_data_dir, + args.class_data_dir if args.with_prior_preservation else None, args.class_prompt, args.resolution, args.center_crop, @@ -423,8 +346,8 @@ def train_one_epoch( unet.train() text_encoder.train() - # 构造当前步的样本(instance + class),并生成文本 token step_data = train_dataset[step % len(train_dataset)] + pixel_values = torch.stack([step_data["instance_images"], step_data["class_images"]]).to( device, dtype=weight_dtype ) @@ -433,20 +356,14 @@ def train_one_epoch( latents = vae.encode(pixel_values).latent_dist.sample() latents = latents * vae.config.scaling_factor - # 随机采样时间步并加噪,模拟正向扩散 noise = torch.randn_like(latents) bsz = latents.shape[0] - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device).long() noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - # 文本条件编码 encoder_hidden_states = text_encoder(input_ids)[0] - - # UNet 预测噪声 model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - # 根据 scheduler 的预测类型选择目标 if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": @@ -454,7 +371,6 @@ def train_one_epoch( else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - # 
Optional prior preservation: split the instance and class halves and compute MSE on each
         if args.with_prior_preservation:
             model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
             target, target_prior = torch.chunk(target, 2, dim=0)
@@ -462,34 +378,31 @@ def train_one_epoch(
             instance_loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
             prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
             loss = instance_loss + args.prior_loss_weight * prior_loss
         else:
+            prior_loss = torch.tensor(0.0, device=device)
+            instance_loss = torch.tensor(0.0, device=device)
             loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
 
         loss.backward()
         torch.nn.utils.clip_grad_norm_(params_to_optimize, 1.0, error_if_nonfinite=True)
         optimizer.step()
         optimizer.zero_grad()
-        print(
-            f"Step #{step}, loss: {loss.detach().item()}, prior_loss: {prior_loss.detach().item()}, instance_loss: {instance_loss.detach().item()}"
+
+        logger.info(
+            f"[train_one_epoch] step={step} loss={loss.detach().item():.6f} "
+            f"prior={prior_loss.detach().item():.6f} inst={instance_loss.detach().item():.6f}"
         )
 
-    return [unet, text_encoder]
+    del step_data, pixel_values, input_ids, latents, noise, timesteps, noisy_latents, encoder_hidden_states
+    del model_pred, target, loss, prior_loss, instance_loss
+    del optimizer, train_dataset, params_to_optimize
+    _cuda_gc()
 
+    return [unet, text_encoder]
 
-def pgd_attack(
-    args,
-    models,
-    tokenizer,
-    noise_scheduler,
-    vae,
-    data_tensor: torch.Tensor,
-    original_images: torch.Tensor,
-    target_tensor: torch.Tensor,
-    num_steps: int,
-):
-    """PGD adversarial perturbation: iteratively update the input within the noise budget and return the new perturbed data."""
+def pgd_attack(args, models, tokenizer, noise_scheduler, vae, data_tensor, original_images, target_tensor, num_steps: int):
     unet, text_encoder = models
     weight_dtype = torch.bfloat16
     device = torch.device("cuda")
@@ -510,22 +422,19 @@ def pgd_attack(
     ).input_ids.repeat(len(data_tensor), 1)
 
     for step in range(num_steps):
-        perturbed_images.requires_grad = True
+        perturbed_images.requires_grad_(True)
+
         latents = vae.encode(perturbed_images.to(device, dtype=weight_dtype)).latent_dist.sample()
         latents = latents * vae.config.scaling_factor
 
-        # Sample timesteps and add noise to prepare the UNet prediction
         noise = torch.randn_like(latents)
         bsz = latents.shape[0]
-        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
-        timesteps = timesteps.long()
+        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device).long()
         noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
 
-        # Text conditioning and noise prediction
         encoder_hidden_states = text_encoder(input_ids.to(device))[0]
         model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
 
-        # Target is the noise or the velocity, depending on the scheduler
         if noise_scheduler.config.prediction_type == "epsilon":
             target = noise
         elif noise_scheduler.config.prediction_type == "v_prediction":
@@ -533,11 +442,10 @@ def pgd_attack(
         else:
             raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
 
-        unet.zero_grad()
-        text_encoder.zero_grad()
+        unet.zero_grad(set_to_none=True)
+        text_encoder.zero_grad(set_to_none=True)
         loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
 
-        # If a target-image latent is given, add a target-alignment term (keeps the original logic: loss is the difference)
         if target_tensor is not None:
             xtm1_pred = torch.cat(
                 [
@@ -554,18 +462,26 @@ def pgd_attack(
 
         loss.backward()
 
-        # PGD update, projected back into the eps ball and clipped to [-1, 1]
-        alpha = args.pgd_alpha
-        eps = args.pgd_eps / 255
+        alpha = args.pgd_alpha
+        eps = float(args.pgd_eps) / 255.0
         adv_images = perturbed_images + alpha * perturbed_images.grad.sign()
         eta = 
torch.clamp(adv_images - original_images, min=-eps, max=+eps) perturbed_images = torch.clamp(original_images + eta, min=-1, max=+1).detach_() - print(f"PGD loss - step {step}, loss: {loss.detach().item()}") + + logger.info(f"[pgd] step={step} loss={loss.detach().item():.6f} alpha={alpha} eps={eps}") + + del latents, noise, timesteps, noisy_latents, encoder_hidden_states, model_pred, target, loss + del adv_images, eta + + _cuda_gc() return perturbed_images -def main(args): +# ----------------------------- +# Main +# ----------------------------- +def main(args: argparse.Namespace) -> None: logging_dir = Path(args.output_dir, args.logging_dir) accelerator = Accelerator( @@ -580,6 +496,7 @@ def main(args): level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_warning() @@ -589,15 +506,35 @@ def main(args): transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() + if accelerator.is_local_main_process: + logger.info(f"[run] using_file={__file__}") + log_args(args) + if args.seed is not None: set_seed(args.seed) - # 先验保持:不足的 class 图像用基础模型生成补齐 + if args.debug and accelerator.is_local_main_process: + log_cuda("startup", accelerator, sync=args.debug_cuda_sync) + + # ------------------------- + # Prior preservation: generate class images if needed + # ------------------------- if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("--with_prior_preservation requires --class_data_dir") + if args.class_prompt is None: + raise ValueError("--with_prior_preservation requires --class_prompt") + class_images_dir = Path(args.class_data_dir) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True) - cur_class_images = len(list(class_images_dir.iterdir())) + class_images_dir.mkdir(parents=True, exist_ok=True) + + if accelerator.is_local_main_process: + log_path_stats("class_dir_before", class_images_dir) + + cur_class_images = sum(1 for p in class_images_dir.iterdir() if p.is_file()) + if accelerator.is_local_main_process: + logger.info(f"[class_gen] cur_class_images={cur_class_images} target={args.num_class_images}") + if cur_class_images < args.num_class_images: torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 if args.mixed_precision == "fp32": @@ -606,6 +543,12 @@ def main(args): torch_dtype = torch.float16 elif args.mixed_precision == "bf16": torch_dtype = torch.bfloat16 + + if accelerator.is_local_main_process: + logger.info(f"[class_gen] will_generate={args.num_class_images - cur_class_images} torch_dtype={torch_dtype}") + if args.debug: + log_cuda("before_pipeline_load", accelerator, sync=args.debug_cuda_sync) + pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=torch_dtype, @@ -615,8 +558,6 @@ def main(args): pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - sample_dataset = PromptDataset(args.class_prompt, num_new_images) sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) @@ -629,19 +570,41 @@ def main(args): disable=not accelerator.is_local_main_process, ): images = pipeline(example["prompt"]).images + if accelerator.is_local_main_process and args.debug: + logger.info(f"[class_gen] generated_images={len(images)}") 
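+                # Each filename embeds the sample index (offset by the count of
+                # pre-existing files) plus a SHA-1 of the image bytes, so re-runs
+                # add new class images instead of overwriting earlier ones.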
for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) - del pipeline - if torch.cuda.is_available(): - torch.cuda.empty_cache() + del pipeline, sample_dataset, sample_dataloader + _cuda_gc() + + accelerator.wait_for_everyone() + + final_class_images = sum(1 for p in class_images_dir.iterdir() if p.is_file()) + if accelerator.is_local_main_process: + logger.info(f"[class_gen] done final_class_images={final_class_images}") + log_path_stats("class_dir_after", class_images_dir) + if final_class_images == 0: + raise RuntimeError(f"class image generation failed: {class_images_dir} is still empty.") + else: + accelerator.wait_for_everyone() + if accelerator.is_local_main_process: + logger.info("[class_gen] skipped (already enough images)") + else: + if accelerator.is_local_main_process: + logger.info("[class_gen] disabled (with_prior_preservation is False)") - # 加载 text encoder / UNet / tokenizer / scheduler / VAE + # ------------------------- + # Load models / tokenizer / scheduler / VAE + # ------------------------- text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + if accelerator.is_local_main_process and args.debug: + log_cuda("before_load_models", accelerator, sync=args.debug_cuda_sync) + text_encoder = text_encoder_cls.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", @@ -657,13 +620,11 @@ def main(args): revision=args.revision, use_fast=False, ) - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") vae = AutoencoderKL.from_pretrained( args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision ).cuda() - vae.requires_grad_(False) if not args.train_text_encoder: @@ -672,52 +633,57 @@ def main(args): if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True - clean_data = load_data( - args.instance_data_dir_for_train, - size=args.resolution, - center_crop=args.center_crop, - ) - perturbed_data = load_data( - args.instance_data_dir_for_adversarial, - size=args.resolution, - center_crop=args.center_crop, - ) - original_data = perturbed_data.clone() - original_data.requires_grad_(False) - if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): unet.enable_xformers_memory_efficient_attention() + if accelerator.is_local_main_process: + logger.info("[xformers] enabled") else: raise ValueError("xformers is not available. 
Make sure it is installed correctly") - target_latent_tensor = None + # ------------------------- + # Load data tensors + # ------------------------- + train_dir = Path(args.instance_data_dir_for_train) + adv_dir = Path(args.instance_data_dir_for_adversarial) + if accelerator.is_local_main_process and args.debug: + log_path_stats("train_dir", train_dir) + log_path_stats("adv_dir", adv_dir) + + clean_data = load_data(train_dir, size=args.resolution, center_crop=args.center_crop) + perturbed_data = load_data(adv_dir, size=args.resolution, center_crop=args.center_crop) + original_data = perturbed_data.clone() + original_data.requires_grad_(False) + + if accelerator.is_local_main_process and args.debug: + log_tensor_meta("clean_data_cpu", clean_data) + log_tensor_meta("perturbed_data_cpu", perturbed_data) + + target_latent_tensor: Optional[torch.Tensor] = None if args.target_image_path is not None: target_image_path = Path(args.target_image_path) - assert target_image_path.is_file(), f"Target image path {target_image_path} does not exist" + if not target_image_path.is_file(): + raise ValueError(f"Target image path does not exist: {target_image_path}") target_image = Image.open(target_image_path).convert("RGB").resize((args.resolution, args.resolution)) target_image = np.array(target_image)[None].transpose(0, 3, 1, 2) target_image_tensor = torch.from_numpy(target_image).to("cuda", dtype=torch.float32) / 127.5 - 1.0 - target_latent_tensor = ( - vae.encode(target_image_tensor).latent_dist.sample().to(dtype=torch.bfloat16) * vae.config.scaling_factor - ) + target_latent_tensor = vae.encode(target_image_tensor).latent_dist.sample().to(dtype=torch.bfloat16) + target_latent_tensor = target_latent_tensor * vae.config.scaling_factor target_latent_tensor = target_latent_tensor.repeat(len(perturbed_data), 1, 1, 1).cuda() - # 交替流程:训练 surrogate -> PGD 扰动 -> 用扰动数据再训练主模型,周期性导出对抗样本 + if accelerator.is_local_main_process and args.debug: + log_tensor_meta("target_latent_tensor", target_latent_tensor) + f = [unet, text_encoder] for i in range(args.max_train_steps): + if accelerator.is_local_main_process: + logger.info(f"[outer] i={i}/{args.max_train_steps}") + f_sur = copy.deepcopy(f) - f_sur = train_one_epoch( - args, - f_sur, - tokenizer, - noise_scheduler, - vae, - clean_data, - args.max_f_train_steps, - ) + f_sur = train_one_epoch(args, f_sur, tokenizer, noise_scheduler, vae, clean_data, args.max_f_train_steps) + perturbed_data = pgd_attack( args, f_sur, @@ -729,34 +695,30 @@ def main(args): target_latent_tensor, args.max_adv_train_steps, ) - f = train_one_epoch( - args, - f, - tokenizer, - noise_scheduler, - vae, - perturbed_data, - args.max_f_train_steps, - ) - # 周期保存当前扰动图像,便于后续评估与复现 + f = train_one_epoch(args, f, tokenizer, noise_scheduler, vae, perturbed_data, args.max_f_train_steps) + if (i + 1) % args.checkpointing_iterations == 0: save_folder = args.output_dir os.makedirs(save_folder, exist_ok=True) noised_imgs = perturbed_data.detach() - - img_filenames = [ - Path(instance_path).stem - for instance_path in list(Path(args.instance_data_dir_for_adversarial).iterdir()) - ] + img_filenames = [p.stem for p in adv_dir.iterdir() if p.is_file()] for img_pixel, img_name in zip(noised_imgs, img_filenames): - save_path = os.path.join(save_folder, f"perturbed_{img_name}.png") + save_path = os.path.join(save_folder, f"perturbed_{img_name}.png") Image.fromarray( - (img_pixel * 127.5 + 128).clamp(0, 255).to(torch.uint8).permute(1, 2, 0).cpu().numpy() + (img_pixel * 127.5 + 128) + .clamp(0, 255) + .to(torch.uint8) + 
.permute(1, 2, 0) + .cpu() + .numpy() ).save(save_path) - - print(f"Saved perturbed images at step {i+1} to {save_folder} (Files are overwritten)") + + if accelerator.is_local_main_process: + logger.info(f"[save] step={i+1} saved={len(img_filenames)} to {save_folder}") + + _cuda_gc() if __name__ == "__main__": diff --git a/src/backend/app/algorithms/perturbation/caat.py b/src/backend/app/algorithms/perturbation/caat.py index d399e7a..6b06b04 100644 --- a/src/backend/app/algorithms/perturbation/caat.py +++ b/src/backend/app/algorithms/perturbation/caat.py @@ -6,13 +6,12 @@ import logging import os import random import warnings -import shutil from pathlib import Path +from typing import Any, Dict, Optional, Tuple import numpy as np import torch import torch.nn.functional as F -import torch.utils.checkpoint import transformers from accelerate import Accelerator from accelerate.logging import get_logger @@ -25,29 +24,24 @@ from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - UNet2DConditionModel, -) +from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel from diffusers.loaders import AttnProcsLayers from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor from diffusers.optimization import get_scheduler from diffusers.utils.import_utils import is_xformers_available - logger = get_logger(__name__) +# ----------------------------- +# Utility helpers +# ----------------------------- def freeze_params(params): - """冻结一组参数的梯度开关,使其在训练中保持不更新。""" for param in params: param.requires_grad = False -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - """从预训练目录读取 text_encoder 配置,自动选择匹配的文本编码器实现类。""" +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: Optional[str]): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", @@ -59,15 +53,97 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st from transformers import CLIPTextModel return CLIPTextModel - elif model_class == "RobertaSeriesModelWithTransformation": + if model_class == "RobertaSeriesModelWithTransformation": from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation return RobertaSeriesModelWithTransformation - else: - raise ValueError(f"{model_class} is not supported.") + raise ValueError(f"{model_class} is not supported.") + + +def _fmt_bytes_gb(n: int) -> str: + gb = n / (1024**3) + return f"{gb:.2f}GB" + + +def _debug_should_print(args, global_step: int) -> bool: + if not args.debug_oom: + return False + if args.debug_oom_step0_only: + return global_step == 0 + return True + + +def log_cuda(prefix: str, args, accelerator: Accelerator, extra: Optional[Dict[str, Any]] = None) -> None: + """Print CUDA memory stats (no tensor copies).""" + if not args.debug_oom: + return + if not torch.cuda.is_available(): + logger.info(f"[mem] {prefix} cuda_not_available") + return + if args.debug_oom_sync: + torch.cuda.synchronize() + + allocated = torch.cuda.memory_allocated() + reserved = torch.cuda.memory_reserved() + max_alloc = torch.cuda.max_memory_allocated() + max_reserved = torch.cuda.max_memory_reserved() + + msg = ( + f"[mem] {prefix} " + f"alloc={_fmt_bytes_gb(allocated)} reserv={_fmt_bytes_gb(reserved)} " 
+ f"max_alloc={_fmt_bytes_gb(max_alloc)} max_reserv={_fmt_bytes_gb(max_reserved)} " + f"device={accelerator.device}" + ) + if extra: + kv = " ".join([f"{k}={v}" for k, v in extra.items()]) + msg = msg + " " + kv + logger.info(msg) + + +def log_tensor(prefix: str, t: Optional[torch.Tensor], args, accelerator: Accelerator) -> None: + """Log tensor meta without moving it.""" + if not args.debug_oom: + return + if t is None: + logger.info(f"[tensor] {prefix} None") + return + logger.info( + f"[tensor] {prefix} shape={tuple(t.shape)} dtype={t.dtype} device={t.device} " + f"requires_grad={t.requires_grad} is_leaf={t.is_leaf}" + ) + + +def log_trainable_params(prefix: str, module: torch.nn.Module, args) -> None: + """Log trainable parameter count and a few entries.""" + if not args.debug_oom: + return + trainable = [(n, p.numel(), str(p.dtype), str(p.device)) for n, p in module.named_parameters() if p.requires_grad] + total = sum(x[1] for x in trainable) + logger.info(f"[trainable] {prefix} tensors={len(trainable)} total_params={total}") + for n, numel, dtype, dev in trainable[:30]: + logger.info(f"[trainable] {prefix} name={n} numel={numel} dtype={dtype} device={dev}") + if len(trainable) > 30: + logger.info(f"[trainable] {prefix} ... (total {len(trainable)} trainable tensors)") + + +def _is_dir_path(p: Path) -> bool: + try: + return p.exists() and p.is_dir() + except Exception: + return False + +def _list_image_files(p: Path) -> list: + if not _is_dir_path(p): + return [] + return [x for x in p.iterdir() if x.is_file()] + + +# ----------------------------- +# Datasets +# ----------------------------- class PromptDataset(Dataset): - """用于批量生成 class 图像的 prompt 数据集,便于采样阶段在多卡上分发任务。""" + """用于批量生成 class 图像的 prompt 数据集。""" def __init__(self, prompt, num_samples): self.prompt = prompt @@ -77,19 +153,11 @@ class PromptDataset(Dataset): return self.num_samples def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example + return {"prompt": self.prompt, "index": index} class CustomDiffusionDataset(Dataset): - """ - CAAT/Custom Diffusion 训练数据集。 - - 负责读取实例图像与可选的类图像,并为每张图像生成对应的 prompt token。 - 同时在实例图像上生成有效区域 mask,供训练时对 loss 做空间加权。 - """ + """CAAT/Custom Diffusion 训练数据集。""" def __init__( self, @@ -103,7 +171,6 @@ class CustomDiffusionDataset(Dataset): hflip=False, aug=True, ): - # 训练图像与 mask 的目标尺寸 self.size = size self.mask_size = mask_size self.center_crop = center_crop @@ -111,7 +178,6 @@ class CustomDiffusionDataset(Dataset): self.interpolation = Image.BILINEAR self.aug = aug - # 记录实例与类数据的路径及对应 prompt self.instance_images_path = [] self.class_images_path = [] self.with_prior_preservation = with_prior_preservation @@ -121,7 +187,6 @@ class CustomDiffusionDataset(Dataset): ] self.instance_images_path.extend(inst_img_path) - # 启用先验保持时,额外读取 class 图像与 class prompt if with_prior_preservation: class_data_root = Path(concept["class_data_dir"]) if os.path.isdir(class_data_root): @@ -136,16 +201,13 @@ class CustomDiffusionDataset(Dataset): class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)] self.class_images_path.extend(class_img_path[:num_class_images]) - # 打乱实例顺序以增加训练随机性,并确定数据集长度 random.shuffle(self.instance_images_path) self.num_instance_images = len(self.instance_images_path) self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) - # 可选水平翻转增强 self.flip = transforms.RandomHorizontalFlip(0.5 * hflip) - # 类图像走标准 transforms;实例图像会走自定义 preprocess 以生成 mask 
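+        # Class images use the standard tensor transforms below; instance images go
+        # through preprocess() instead, so that a matching spatial loss mask is produced.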
self.image_transforms = transforms.Compose( [ self.flip, @@ -160,7 +222,6 @@ class CustomDiffusionDataset(Dataset): return self._length def preprocess(self, image, scale, resample): - """对实例图像做缩放与随机放置,并生成对应的有效区域 mask。""" outer, inner = self.size, scale factor = self.size // self.mask_size if scale > self.size: @@ -177,21 +238,20 @@ class CustomDiffusionDataset(Dataset): else: instance_image[top : top + inner, left : left + inner, :] = image mask[ - top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1 + top // factor + 1 : (top + scale) // factor - 1, + left // factor + 1 : (left + scale) // factor - 1, ] = 1.0 return instance_image, mask def __getitem__(self, index): example = {} - # 读取实例图像与对应 prompt instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images] instance_image = Image.open(instance_image) if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") instance_image = self.flip(instance_image) - # 对实例图像做随机缩放增强,并生成有效区域 mask random_scale = self.size if self.aug: random_scale = ( @@ -201,13 +261,11 @@ class CustomDiffusionDataset(Dataset): ) instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation) - # 根据缩放幅度对 prompt 加入轻量描述,模拟尺度变化的语义提示 if random_scale < 0.6 * self.size: instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt elif random_scale > self.size: instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt - # 实例图像与 mask 进入训练:图像已归一化到 [-1, 1] example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1) example["mask"] = torch.from_numpy(mask) example["instance_prompt_ids"] = self.tokenizer( @@ -218,7 +276,6 @@ class CustomDiffusionDataset(Dataset): return_tensors="pt", ).input_ids - # 先验保持:追加 class 图像、class mask 与 class prompt token if self.with_prior_preservation: class_image, class_prompt = self.class_images_path[index % self.num_class_images] class_image = Image.open(class_image) @@ -237,271 +294,64 @@ class CustomDiffusionDataset(Dataset): return example - +# ----------------------------- +# Args +# ----------------------------- def parse_args(input_args=None): - """解析 CAAT 训练参数:包含 PGD 超参、数据与模型路径、训练步数与优化器设置。""" + """解析 CAAT 训练参数。""" parser = argparse.ArgumentParser(description="CAAT training script.") - parser.add_argument( - "--alpha", - type=float, - default=5e-3, - required=True, - help="PGD alpha.", - ) - parser.add_argument( - "--eps", - type=float, - default=0.1, - required=True, - help="PGD eps.", - ) - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help="Revision of pretrained model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir", - type=str, - default=None, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", 
- type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument( - "--prior_loss_weight", - type=float, - default=1.0, - help="The weight of prior preservation loss." - ) - parser.add_argument( - "--num_class_images", - type=int, - default=200, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="outputs", - help="The output directory.", - ) - parser.add_argument( - "--seed", - type=int, - default=42, - help="A seed for reproducible training." - ) - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." - ) - parser.add_argument( - "--max_train_steps", - type=int, - default=250, - help="Total number of training steps to perform.", - ) - parser.add_argument( - "--checkpointing_steps", - type=int, - default=250, - help=( - "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" - " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" - " training using `--resume_from_checkpoint`." - ), - ) - parser.add_argument( - "--checkpoints_total_limit", - type=int, - default=None, - help=("Max number of checkpoints to store."), - ) - parser.add_argument( - "--gradient_checkpointing", - action="store_true", - help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", - ) - parser.add_argument( - "--learning_rate", - type=float, - default=1e-5, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--dataloader_num_workers", - type=int, - default=2, - help=( - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), - ) - parser.add_argument( - "--freeze_model", - type=str, - default="crossattn_kv", - choices=["crossattn_kv", "crossattn"], - help="crossattn to enable fine-tuning of all params in the cross attention", - ) - parser.add_argument( - "--lr_scheduler", - type=str, - default="constant", - help=( - 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]' - ), - ) - parser.add_argument( - "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." - ) - parser.add_argument( - "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
- ) - parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") - parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") - parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument( - "--hub_model_id", - type=str, - default=None, - help="The name of the repository to keep in sync with the local `output_dir`.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--allow_tf32", - action="store_true", - help=( - "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" - " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default=None, - choices=["no", "fp16", "bf16"], - help=( - "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" - " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." - ), - ) - parser.add_argument( - "--prior_generation_precision", - type=str, - default=None, - choices=["no", "fp32", "fp16", "bf16"], - help=( - "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" - " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." - ), - ) - parser.add_argument( - "--concepts_list", - type=str, - default=None, - help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.", - ) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." - ) - parser.add_argument( - "--set_grads_to_none", - action="store_true", - help=( - "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" - " behaviors, so disable this argument if it causes any problems. More info:" - " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" - ), - ) - parser.add_argument( - "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word." 
- ) - parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.") - parser.add_argument( - "--noaug", - action="store_true", - help="Dont apply augmentation during data augmentation when this flag is enabled.", - ) + + parser.add_argument("--alpha", type=float, default=5e-3, required=True, help="PGD alpha.") + parser.add_argument("--eps", type=float, default=0.1, required=True, help="PGD eps.") + parser.add_argument("--pretrained_model_name_or_path", type=str, default=None, required=True) + parser.add_argument("--revision", type=str, default=None, required=False) + parser.add_argument("--tokenizer_name", type=str, default=None) + parser.add_argument("--instance_data_dir", type=str, default=None) + parser.add_argument("--class_data_dir", type=str, default=None) + parser.add_argument("--instance_prompt", type=str, default=None) + parser.add_argument("--class_prompt", type=str, default=None) + parser.add_argument("--with_prior_preservation", default=False, action="store_true") + parser.add_argument("--prior_loss_weight", type=float, default=1.0) + parser.add_argument("--num_class_images", type=int, default=200) + parser.add_argument("--output_dir", type=str, default="outputs") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--resolution", type=int, default=512) + parser.add_argument("--center_crop", default=False, action="store_true") + parser.add_argument("--sample_batch_size", type=int, default=4) + parser.add_argument("--max_train_steps", type=int, default=250) + parser.add_argument("--checkpointing_steps", type=int, default=250) + parser.add_argument("--checkpoints_total_limit", type=int, default=None) + parser.add_argument("--gradient_checkpointing", action="store_true") + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--dataloader_num_workers", type=int, default=2) + parser.add_argument("--freeze_model", type=str, default="crossattn_kv", choices=["crossattn_kv", "crossattn"]) + parser.add_argument("--lr_scheduler", type=str, default="constant") + parser.add_argument("--lr_warmup_steps", type=int, default=500) + parser.add_argument("--use_8bit_adam", action="store_true") + parser.add_argument("--adam_beta1", type=float, default=0.9) + parser.add_argument("--adam_beta2", type=float, default=0.999) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2) + parser.add_argument("--adam_epsilon", type=float, default=1e-08) + parser.add_argument("--max_grad_norm", default=1.0, type=float) + parser.add_argument("--hub_model_id", type=str, default=None) + parser.add_argument("--logging_dir", type=str, default="logs") + parser.add_argument("--allow_tf32", action="store_true") + parser.add_argument("--report_to", type=str, default="tensorboard") + parser.add_argument("--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"]) + parser.add_argument("--prior_generation_precision", type=str, default=None, choices=["no", "fp32", "fp16", "bf16"]) + parser.add_argument("--concepts_list", type=str, default=None) + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--enable_xformers_memory_efficient_attention", action="store_true") + parser.add_argument("--set_grads_to_none", action="store_true") + parser.add_argument("--initializer_token", type=str, default="ktn+pll+ucd") + parser.add_argument("--hflip", action="store_true") + parser.add_argument("--noaug", action="store_true") + + parser.add_argument("--micro_batch_size", type=int, 
default=2, help="Micro-batch size for serial execution.")
+
+    parser.add_argument("--debug_oom", action="store_true", help="Enable GPU-memory and key-tensor logging to locate the step-0 OOM.")
+    parser.add_argument("--debug_oom_sync", action="store_true", help="Force a CUDA synchronize before logging; more accurate but slower.")
+    parser.add_argument("--debug_oom_step0_only", action="store_true", help="Only emit step-0 logs to reduce noise.")
 
     if input_args is not None:
         args = parser.parse_args(input_args)
@@ -512,7 +362,9 @@ def parse_args(input_args=None):
     if env_local_rank != -1 and env_local_rank != args.local_rank:
         args.local_rank = env_local_rank
 
-    # Prior preservation needs class data and a class prompt; in multi-concept mode these come from concepts_list
+    if args.micro_batch_size < 1:
+        raise ValueError("--micro_batch_size must be >= 1")
+
     if args.with_prior_preservation:
         if args.concepts_list is None:
             if args.class_data_dir is None:
@@ -528,8 +380,185 @@ def parse_args(input_args=None):
     return args
 
 
+# -----------------------------
+# Core
+# -----------------------------
+def _build_concepts_list(args) -> list:
+    if args.concepts_list is None:
+        return [
+            {
+                "instance_prompt": args.instance_prompt,
+                "class_prompt": args.class_prompt,
+                "instance_data_dir": args.instance_data_dir,
+                "class_data_dir": args.class_data_dir,
+            }
+        ]
+    with open(args.concepts_list, "r") as f:
+        return json.load(f)
+
+
+def _ensure_class_images(args, accelerator: Accelerator) -> None:
+    """Generate class images if needed (prior preservation)."""
+    if not args.with_prior_preservation:
+        return
+
+    for _, concept in enumerate(args.concepts_list):
+        class_images_dir = Path(concept["class_data_dir"])
+        class_images_dir.mkdir(parents=True, exist_ok=True)
+
+        cur_class_images = len(list(class_images_dir.iterdir()))
+        if cur_class_images >= args.num_class_images:
+            continue
+
+        torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+        if args.prior_generation_precision == "fp32":
+            torch_dtype = torch.float32
+        elif args.prior_generation_precision == "fp16":
+            torch_dtype = torch.float16
+        elif args.prior_generation_precision == "bf16":
+            torch_dtype = torch.bfloat16
+
+        log_cuda("before_prior_pipeline_load", args, accelerator, extra={"torch_dtype": str(torch_dtype)})
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            torch_dtype=torch_dtype,
+            safety_checker=None,
+            revision=args.revision,
+        )
+        pipeline.set_progress_bar_config(disable=True)
+
+        sample_dataset = PromptDataset(args.class_prompt, args.num_class_images - cur_class_images)
+        sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+        sample_dataloader = accelerator.prepare(sample_dataloader)
+        pipeline.to(accelerator.device)
+
+        log_cuda("after_prior_pipeline_to_device", args, accelerator)
+
+        for example in tqdm(
+            sample_dataloader,
+            desc="Generating class images",
+            disable=not accelerator.is_local_main_process,
+        ):
+            images = pipeline(example["prompt"]).images
+            for i, image in enumerate(images):
+                hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                image.save(image_filename)
+
+        del pipeline
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        log_cuda("after_prior_pipeline_del", args, accelerator)
+
+    # Make sure every process sees the generated images (a no-op in single-process runs, but safe)
+    accelerator.wait_for_everyone()
+
+
+def _get_weight_dtype(accelerator: Accelerator) -> torch.dtype:
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = 
torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + return weight_dtype + + +def _prepare_perturb_tensors( + args, + accelerator: Accelerator, + train_dataset: CustomDiffusionDataset, + weight_dtype: torch.dtype, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Build perturbed_images/original_images/mask/input_ids. + + IMPORTANT: we keep all saving until the very end; here we only prepare tensors. + """ + # Build perturbed images (instance images only, per original behavior) + pertubed_images_pil = [Image.open(i[0]).convert("RGB") for i in train_dataset.instance_images_path] + pertubed_images = [train_dataset.image_transforms(i) for i in pertubed_images_pil] + pertubed_images = torch.stack(pertubed_images).contiguous() + pertubed_images.requires_grad_(True) + + original_images = pertubed_images.clone().detach() + original_images.requires_grad_(False) + + # Build input ids for instance prompt + input_ids = train_dataset.tokenizer( + args.instance_prompt, + truncation=True, + padding="max_length", + max_length=train_dataset.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids.repeat(len(original_images), 1) + + # Build masks (instance masks; if with prior, add class mask as in original code) + def get_one_mask(image): + random_scale = train_dataset.size + if train_dataset.aug: + random_scale = ( + np.random.randint(train_dataset.size // 3, train_dataset.size + 1) + if np.random.uniform() < 0.66 + else np.random.randint(int(1.2 * train_dataset.size), int(1.4 * train_dataset.size)) + ) + _, one_mask = train_dataset.preprocess(image, random_scale, train_dataset.interpolation) + one_mask = torch.from_numpy(one_mask) + if args.with_prior_preservation: + class_mask = torch.ones_like(one_mask) + one_mask += class_mask + return one_mask + + images_open_list = [Image.open(i[0]).convert("RGB") for i in train_dataset.instance_images_path] + mask_list = [get_one_mask(img) for img in images_open_list] + mask = torch.stack(mask_list) + mask = mask.to(memory_format=torch.contiguous_format).float() + mask = mask.unsqueeze(1) + del images_open_list, pertubed_images_pil + + # Prepare with accelerator (keeps semantics; does not necessarily move tensors to GPU) + # We will explicitly move them to device before training to avoid CPU->GPU copies every step. + return pertubed_images, original_images, mask, input_ids + + +def _save_final_images( + accelerator: Accelerator, + args, + train_dataset: CustomDiffusionDataset, + final_perturbed_images: torch.Tensor, +): + """ + Save ALL output images at the very end ONLY. + + Important user requirement: + - Do NOT save per-group. + - Do NOT save per-step. + - Save only once, at the very end. 
+ """ + if not accelerator.is_main_process: + return + + logger.info("***** Final save of perturbed images *****") + save_folder = args.output_dir + os.makedirs(save_folder, exist_ok=True) + + noised_imgs = final_perturbed_images.detach().float().cpu() + img_names = [str(instance_path[0]).split("/")[-1] for instance_path in train_dataset.instance_images_path] + + for i in range(len(img_names)): + img_pixel = noised_imgs[i] + img_name = img_names[i] + save_path = os.path.join(save_folder, f"final_noise_{img_name}") + Image.fromarray( + (img_pixel * 127.5 + 128).clamp(0, 255).to(torch.uint8).permute(1, 2, 0).numpy() + ).save(save_path) + + logger.info(f"Saved {len(img_names)} final perturbed images to {save_folder}") + + def main(args): - # 初始化 accelerate 环境与日志目录 logging_dir = Path(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) @@ -544,6 +573,7 @@ def main(args): datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) + logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: transformers.utils.logging.set_verbosity_warning() @@ -552,150 +582,85 @@ def main(args): transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() - # 记录实验配置到 tracker,便于后续复现实验 accelerator.init_trackers("CAAT", config=vars(args)) - # 固定随机种子以提高可复现性 - if args.seed is not None: - set_seed(args.seed) - - # 将单概念参数统一封装为 concepts_list,或从 json 中读取多概念配置 - if args.concepts_list is None: - args.concepts_list = [ - { - "instance_prompt": args.instance_prompt, - "class_prompt": args.class_prompt, - "instance_data_dir": args.instance_data_dir, - "class_data_dir": args.class_data_dir, - } - ] - else: - with open(args.concepts_list, "r") as f: - args.concepts_list = json.load(f) - - # 启用先验保持时,若 class 图像不足则使用基础模型补齐 - if args.with_prior_preservation: - for i, concept in enumerate(args.concepts_list): - class_images_dir = Path(concept["class_data_dir"]) - if not class_images_dir.exists(): - class_images_dir.mkdir(parents=True, exist_ok=True) - - cur_class_images = len(list(class_images_dir.iterdir())) - - if cur_class_images < args.num_class_images: - torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 - if args.prior_generation_precision == "fp32": - torch_dtype = torch.float32 - elif args.prior_generation_precision == "fp16": - torch_dtype = torch.float16 - elif args.prior_generation_precision == "bf16": - torch_dtype = torch.bfloat16 - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - torch_dtype=torch_dtype, - safety_checker=None, - revision=args.revision, - ) - pipeline.set_progress_bar_config(disable=True) - - num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - - sample_dataset = PromptDataset(args.class_prompt, num_new_images) - sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + if accelerator.is_local_main_process: + logger.info("========== CAAT 参数 ==========") + for k in sorted(vars(args).keys()): + logger.info(f"{k}: {getattr(args, k)}") + logger.info("===============================") - sample_dataloader = accelerator.prepare(sample_dataloader) - pipeline.to(accelerator.device) + log_cuda("startup", args, accelerator) - for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not accelerator.is_local_main_process, - ): - images = 
pipeline(example["prompt"]).images + if args.seed is not None: + set_seed(args.seed) - for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = ( - class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - ) - image.save(image_filename) + # Concepts list + args.concepts_list = _build_concepts_list(args) - del pipeline - if torch.cuda.is_available(): - torch.cuda.empty_cache() + # Prior preservation: generate class images if needed + _ensure_class_images(args, accelerator) - # 创建输出目录 if args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) - # 加载 tokenizer + # Tokenizer if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + else: tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name, - revision=args.revision, - use_fast=False, - ) - elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False ) - # 加载 text encoder / scheduler / VAE / UNet + # Load models text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision ) vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision - ) + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision) + log_cuda("after_load_models_cpu_or_meta", args, accelerator) - # 冻结主干权重:该方法只训练 attention processor 中的增量参数 + # Freeze base weights (as original CAAT behavior) vae.requires_grad_(False) text_encoder.requires_grad_(False) unet.requires_grad_(False) - # 推理组件使用半精度可节省显存;训练增量层由 optimizer 管理 - weight_dtype = torch.float32 - if accelerator.mixed_precision == "fp16": - weight_dtype = torch.float16 - elif accelerator.mixed_precision == "bf16": - weight_dtype = torch.bfloat16 + weight_dtype = _get_weight_dtype(accelerator) + if accelerator.is_local_main_process and args.debug_oom: + logger.info(f"[debug] weight_dtype={weight_dtype} mixed_precision={accelerator.mixed_precision}") - # 将模型移动到训练设备并统一 dtype + # Move models to device with mixed precision dtype text_encoder.to(accelerator.device, dtype=weight_dtype) unet.to(accelerator.device, dtype=weight_dtype) vae.to(accelerator.device, dtype=weight_dtype) - # 根据是否启用 xformers 选择 attention processor 的实现 + log_cuda("after_models_to_device", args, accelerator) + + # xformers attention processors attention_class = CustomDiffusionAttnProcessor if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers xformers_version = version.parse(xformers.__version__) + logger.info(f"[debug] xformers_version={xformers_version}") if xformers_version == version.parse("0.0.16"): - logger.warn( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. 
See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + logger.warning( + "xFormers 0.0.16 may be unstable for training on some GPUs; consider upgrading to >=0.0.17." ) attention_class = CustomDiffusionXFormersAttnProcessor else: raise ValueError("xformers is not available. Make sure it is installed correctly") - # 训练策略:只训练 cross-attention 的 KV(或全部 Q/K/V/out),其余保持冻结 + # Build and inject custom diffusion attention processors train_kv = True train_q_out = False if args.freeze_model == "crossattn_kv" else True custom_diffusion_attn_procs = {} - # 从 UNet state_dict 中取出原始权重,作为自定义 attention processor 的初始化 st = unet.state_dict() - for name, _ in unet.attn_processors.items(): cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): @@ -706,9 +671,12 @@ def main(args): elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] + else: + # Fallback + hidden_size = unet.config.block_out_channels[0] + layer_name = name.split(".processor")[0] - # KV 投影权重始终可训练;若启用 train_q_out 则额外训练 Q 与 out 投影 weights = { "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], @@ -718,15 +686,15 @@ def main(args): weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] - # 仅对 cross-attention 层注入可训练 processor;self-attention 走冻结版本 if cross_attention_dim is not None: - custom_diffusion_attn_procs[name] = attention_class( + proc = attention_class( train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, ).to(unet.device) - custom_diffusion_attn_procs[name].load_state_dict(weights) + proc.load_state_dict(weights) + custom_diffusion_attn_procs[name] = proc else: custom_diffusion_attn_procs[name] = attention_class( train_kv=False, @@ -734,43 +702,43 @@ def main(args): hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, ) - del st + del st - # 将新的 attention processor 注入 UNet,并用 AttnProcsLayers 封装成可训练模块 unet.set_attn_processor(custom_diffusion_attn_procs) custom_diffusion_layers = AttnProcsLayers(unet.attn_processors) - - # 将增量层注册到 checkpoint,保证训练状态可保存/恢复 accelerator.register_for_checkpointing(custom_diffusion_layers) + log_trainable_params("unet_after_set_attn_processor", unet, args) + log_trainable_params("custom_diffusion_layers", custom_diffusion_layers, args) + # Gradient checkpointing (UNet) if args.gradient_checkpointing: unet.enable_gradient_checkpointing() + if accelerator.is_local_main_process and args.debug_oom: + logger.info("[debug] gradient_checkpointing enabled") - # 允许 TF32 可提升部分 GPU 上的矩阵运算速度 if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True + if accelerator.is_local_main_process and args.debug_oom: + logger.info("[debug] allow_tf32 enabled") - # 先验保持时,通常将学习率扩大以补偿额外损失项带来的梯度分摊 - args.learning_rate = args.learning_rate if args.with_prior_preservation: + # keep original behavior args.learning_rate = args.learning_rate * 2.0 - # 选择优化器实现:可选 8-bit AdamW 以降低显存占用 + # Optimizer if args.use_8bit_adam: try: import bitsandbytes as bnb except ImportError: - raise ImportError( - "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
- ) - + raise ImportError("To use 8-bit Adam, please install bitsandbytes: `pip install bitsandbytes`.") optimizer_class = bnb.optim.AdamW8bit + if accelerator.is_local_main_process and args.debug_oom: + logger.info("[debug] using 8-bit AdamW") else: optimizer_class = torch.optim.AdamW - # 仅优化 custom_diffusion_layers 的参数,其余主干保持冻结 optimizer = optimizer_class( custom_diffusion_layers.parameters(), lr=args.learning_rate, @@ -779,72 +747,43 @@ def main(args): eps=args.adam_epsilon, ) - # 构建训练数据集;mask_size 通过 VAE latent 分辨率自动推导 + # Infer mask_size from VAE latent + # (This is a one-time forward; keep as original behavior) + with torch.no_grad(): + mask_size = ( + vae.encode(torch.randn(1, 3, args.resolution, args.resolution, device=accelerator.device, dtype=weight_dtype)) + .latent_dist.sample() + .size()[-1] + ) + if accelerator.is_local_main_process and args.debug_oom: + logger.info(f"[debug] inferred mask_size={mask_size}") + + # Dataset train_dataset = CustomDiffusionDataset( concepts_list=args.concepts_list, tokenizer=tokenizer, with_prior_preservation=args.with_prior_preservation, size=args.resolution, - mask_size=vae.encode( - torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device) - ) - .latent_dist.sample() - .size()[-1], + mask_size=mask_size, center_crop=args.center_crop, num_class_images=args.num_class_images, hflip=args.hflip, aug=not args.noaug, ) + log_cuda("after_build_dataset", args, accelerator, extra={"num_instance_images": train_dataset.num_instance_images}) - # 为 PGD 准备可训练的图像张量:对实例图像做与训练一致的 transforms - pertubed_images = [Image.open(i[0]).convert("RGB") for i in train_dataset.instance_images_path] - pertubed_images = [train_dataset.image_transforms(i) for i in pertubed_images] - pertubed_images = torch.stack(pertubed_images).contiguous() - pertubed_images.requires_grad_() - - # 保留原始图像张量,用于 PGD 的投影约束 - original_images = pertubed_images.clone().detach() - original_images.requires_grad_(False) - - # 文本 token:对所有实例图像重复同一个 instance_prompt(保持原脚本行为) - input_ids = train_dataset.tokenizer( - args.instance_prompt, - truncation=True, - padding="max_length", - max_length=train_dataset.tokenizer.model_max_length, - return_tensors="pt", - ).input_ids.repeat(len(original_images), 1) - - def get_one_mask(image): - """与训练同样的随机缩放逻辑,生成单张实例图像的有效区域 mask。""" - random_scale = train_dataset.size - if train_dataset.aug: - random_scale = ( - np.random.randint(train_dataset.size // 3, train_dataset.size + 1) - if np.random.uniform() < 0.66 - else np.random.randint(int(1.2 * train_dataset.size), int(1.4 * train_dataset.size)) - ) - _, one_mask = train_dataset.preprocess(image, random_scale, train_dataset.interpolation) - one_mask = torch.from_numpy(one_mask) - if args.with_prior_preservation: - class_mask = torch.ones_like(one_mask) - one_mask += class_mask - return one_mask - - # 预先为每张图像生成 mask,并堆叠为 batch 形式供训练损失使用 - images_open_list = [Image.open(i[0]).convert("RGB") for i in train_dataset.instance_images_path] - mask_list = [] - for image in images_open_list: - mask_list.append(get_one_mask(image)) - - mask = torch.stack(mask_list) - mask = mask.to(memory_format=torch.contiguous_format).float() - mask = mask.unsqueeze(1) - del images_open_list - + # Build tensors + pertubed_images, original_images, mask, input_ids = _prepare_perturb_tensors( + args=args, accelerator=accelerator, train_dataset=train_dataset, weight_dtype=weight_dtype + ) + log_tensor("pertubed_images_before_prepare", pertubed_images, args, accelerator) + 
log_tensor("original_images_before_prepare", original_images, args, accelerator) + log_tensor("mask_before_prepare", mask, args, accelerator) + log_tensor("input_ids_cpu", input_ids, args, accelerator) + # Scheduler lr_scheduler = get_scheduler( args.lr_scheduler, optimizer=optimizer, @@ -852,125 +791,229 @@ def main(args): num_training_steps=args.max_train_steps * accelerator.num_processes, ) - # 将可训练模块、优化器、对抗图像张量与 mask 一并交给 accelerate 管理设备与并行 - custom_diffusion_layers, optimizer, pertubed_images, lr_scheduler, original_images, mask = accelerator.prepare( - custom_diffusion_layers, optimizer, pertubed_images, lr_scheduler, original_images, mask - ) + log_cuda("before_accelerator_prepare", args, accelerator) + + # Prepare with accelerator (wrap optimizer, custom layers, etc.) + custom_diffusion_layers, optimizer, lr_scheduler = accelerator.prepare(custom_diffusion_layers, optimizer, lr_scheduler) + + log_cuda("after_accelerator_prepare", args, accelerator) + # IMPORTANT: + # Move perturb tensors to device ONCE, to avoid CPU->GPU copies each step. + # This does NOT violate your "no saving until the end" requirement. + pertubed_images = pertubed_images.to(accelerator.device) + original_images = original_images.to(accelerator.device) + mask = mask.to(accelerator.device) + input_ids = input_ids.to(accelerator.device) + + log_tensor("pertubed_images_on_device", pertubed_images, args, accelerator) + log_tensor("original_images_on_device", original_images, args, accelerator) + log_tensor("mask_on_device", mask, args, accelerator) + log_tensor("input_ids_on_device", input_ids, args, accelerator) - # 训练主循环:每步同时更新 attention 增量层与对抗图像(PGD) logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num pertubed_images = {len(pertubed_images)}") logger.info(f" Total optimization steps = {args.max_train_steps}") + logger.info(f" micro_batch_size = {args.micro_batch_size}") + global_step = 0 first_epoch = 0 progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) progress_bar.set_description("Steps") + + # ------------------------- + # Training loop (serial micro-batching with group isolation) + # ------------------------- + # User requirement: groups must not interfere and must run serially. + # We implement micro-batching where each micro-batch uses a detached clone of the current images + # and applies PGD update ONLY to its slice. No saving occurs here. 
for epoch in range(first_epoch, args.max_train_steps): unet.train() + + # Keep original outer structure: for _ in range(1) for _ in range(1): with accelerator.accumulate(unet), accelerator.accumulate(text_encoder): - # 将图像编码到 latent 空间并加噪,形成 UNet 的训练输入 - pertubed_images.requires_grad = True - latents = vae.encode(pertubed_images.to(accelerator.device).to(dtype=weight_dtype)).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - noise = torch.randn_like(latents) - bsz = latents.shape[0] - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() - - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - - # 文本条件编码 - encoder_hidden_states = text_encoder(input_ids.to(accelerator.device))[0] + if _debug_should_print(args, global_step): + log_cuda("step_begin", args, accelerator, extra={"global_step": global_step}) + logger.info(f"[debug] step={global_step} starting forward path") - # UNet 预测噪声 - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - # 选择监督目标(epsilon 或 v) - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - # loss 计算:可选先验保持;实例部分可结合 mask 做空间加权 - if args.with_prior_preservation: - model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) - target, target_prior = torch.chunk(target, 2, dim=0) - mask = torch.chunk(mask, 2, dim=0)[0].to(accelerator.device) - - loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") - loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean() - - prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") - loss = loss + args.prior_loss_weight * prior_loss - else: - mask = mask.to(accelerator.device) - loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") - loss = loss.mean() - - accelerator.backward(loss) + # Zero param grads once per step (we accumulate across micro-batches) + optimizer.zero_grad(set_to_none=args.set_grads_to_none) - # 梯度裁剪:只裁剪可训练的 custom_diffusion_layers 参数 + # We'll update pertubed_images per micro-batch slice. + # For parameter gradients, we accumulate across micro-batches and do a single optimizer.step(). + num_imgs = pertubed_images.shape[0] + micro_bs = args.micro_batch_size + + # For stability: use a single random seed stream per step (does not create interference across groups). + # Each micro-batch gets its own sampled timesteps/noise. 
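+                # Editor note (worked arithmetic for the scaling applied below):
+                # scaling each micro-batch loss by mb_size / num_imgs before
+                # backward() makes the accumulated gradient equal that of the mean
+                # loss over all images, since (mb_size / N) * mean(loss_mb) ==
+                # sum(loss_mb) / N. For example, with num_imgs=4 and micro_batch_size=2:
+                #     0.5 * mean(loss[0:2]) + 0.5 * mean(loss[2:4]) == mean(loss[0:4])
+                # It is only approximate relative to a true full batch because each
+                # micro-batch samples its own noise and timesteps.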
+ step_loss_total = 0.0 + + # Iterate micro-batches serially + for mb_start in range(0, num_imgs, micro_bs): + mb_end = min(mb_start + micro_bs, num_imgs) + mb_size = mb_end - mb_start + + if _debug_should_print(args, global_step): + logger.info(f"[micro] step={global_step} mb=[{mb_start}:{mb_end}] size={mb_size}") + log_cuda("micro_begin", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"}) + + # ------------------------- + # Group isolation: + # - work on a detached clone to avoid cross-group gradient paths + # - update only this slice at the end of micro-batch + # ------------------------- + x0 = original_images[mb_start:mb_end] + m = mask[mb_start:mb_end] + ids = input_ids[mb_start:mb_end] + + x = pertubed_images[mb_start:mb_end].detach().clone().requires_grad_(True) + + try: + # VAE encode + if _debug_should_print(args, global_step): + log_tensor("x_pre_vae", x, args, accelerator) + log_cuda("before_vae_encode", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"}) + + latents_dist = vae.encode(x.to(dtype=weight_dtype)).latent_dist + + if _debug_should_print(args, global_step): + log_cuda("after_vae_encode", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"}) + + latents = latents_dist.sample() + latents = latents * vae.config.scaling_factor + + # Noise + timestep + noise = torch.randn_like(latents) + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (mb_size,), device=latents.device + ).long() + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Text encoder (no grad; text_encoder frozen) + encoder_hidden_states = text_encoder(ids)[0] + + # UNet forward + if _debug_should_print(args, global_step): + log_cuda("before_unet_forward", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"}) + + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if _debug_should_print(args, global_step): + log_cuda("after_unet_forward", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"}) + + # Target + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + # Loss (keep original structure; with_prior uses chunking if you used prior in dataset) + # NOTE: + # In this CAAT implementation, pertubed_images is built from instance images only. + # So we keep the loss on current micro-batch only. + # If you want true prior loss in the optimization step, you must include class images in x. + # We do NOT change that behavior here to keep parameters consistent with previous behavior. + if args.with_prior_preservation: + # Keep the existing (original) code pattern: use mask_inst and optional prior_loss + # But since x contains only instance images, chunking would be invalid. + # Therefore we follow the "else" branch behavior for current tensor shapes. 
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                        # Apply mask normalization as in the original prior branch, but on this instance-only micro-batch
+                        loss = ((loss * m).sum([1, 2, 3]) / m.sum([1, 2, 3])).mean()
+                    else:
+                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                        loss = loss.mean()
+
+                    if _debug_should_print(args, global_step):
+                        logger.info(f"[micro] step={global_step} mb=[{mb_start}:{mb_end}] loss={loss.detach().float().item()}")
+
+                    # Normalize the loss for gradient accumulation across micro-batches:
+                    # scaling by (mb_size / num_imgs) makes the accumulated parameter
+                    # gradient approximate the full-batch mean loss.
+                    loss_scaled = loss * (float(mb_size) / float(num_imgs))
+
+                    if _debug_should_print(args, global_step):
+                        log_cuda("before_backward_mb", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"})
+
+                    accelerator.backward(loss_scaled)
+
+                    if _debug_should_print(args, global_step):
+                        log_cuda("after_backward_mb", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"})
+                        if x.grad is None:
+                            logger.info(f"[micro] step={global_step} mb=[{mb_start}:{mb_end}] x.grad=None")
+                        else:
+                            logger.info(
+                                f"[micro] step={global_step} mb=[{mb_start}:{mb_end}] x.grad_abs_mean={x.grad.abs().mean().item():.6e}"
+                            )
+
+                    # PGD update only this slice
+                    alpha = args.alpha
+                    eps = args.eps
+                    adv_images = x + alpha * x.grad.sign()
+                    eta = torch.clamp(adv_images - x0, min=-eps, max=+eps)
+                    x_new = torch.clamp(x0 + eta, min=-1, max=+1).detach()
+
+                    # Write back only this slice (no cross-group interference)
+                    pertubed_images[mb_start:mb_end] = x_new
+
+                    step_loss_total += float(loss.detach().float().item()) * (float(mb_size) / float(num_imgs))
+
+                    # Cleanup to reduce peak
+                    del latents_dist, latents, noise, timesteps, noisy_latents, encoder_hidden_states
+                    del model_pred, target, loss, loss_scaled, adv_images, eta, x_new, x
+
+                    if _debug_should_print(args, global_step):
+                        log_cuda("micro_end", args, accelerator, extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}"})
+
+                except RuntimeError as e:
+                    if "out of memory" in str(e).lower():
+                        logger.error(f"[OOM] step={global_step} micro=[{mb_start}:{mb_end}] RuntimeError: {e}")
+                        log_cuda(
+                            "oom_caught",
+                            args,
+                            accelerator,
+                            extra={"global_step": global_step, "mb": f"{mb_start}:{mb_end}", "micro_bs": micro_bs},
+                        )
+                        logger.error("[OOM] Suggestion: further reduce --micro_batch_size or enable --gradient_checkpointing")
+                    raise
+
+                # Clip grads and optimizer step once per full step
                 if accelerator.sync_gradients:
-                    params_to_clip = (
-                        custom_diffusion_layers.parameters()
-                    )
+                    params_to_clip = custom_diffusion_layers.parameters()
                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
-                # PGD 更新:基于 pertubed_images 的梯度做投影更新,并保持在 eps 约束内
-                alpha = args.alpha
-                eps = args.eps
-                adv_images = pertubed_images + alpha * pertubed_images.grad.sign()
-                eta = torch.clamp(adv_images - original_images, min=-eps, max=+eps)
-                pertubed_images = torch.clamp(original_images + eta, min=-1, max=+1).detach_()
-
                 optimizer.step()
-
                 lr_scheduler.step()
                 optimizer.zero_grad(set_to_none=args.set_grads_to_none)
 
+                if _debug_should_print(args, global_step):
+                    log_cuda("after_optimizer_step", args, accelerator, extra={"global_step": global_step})
+                    logger.info(f"[debug] step={global_step} step_loss_total={step_loss_total}")
+
                 if accelerator.sync_gradients:
                     progress_bar.update(1)
                     global_step += 1
-
-                    logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+ 
logs = {"loss": step_loss_total, "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - # 训练结束后在主进程保存最终对抗图像,文件名包含原始图片名以便对齐 - if accelerator.is_main_process: - logger.info("***** Final save of perturbed images *****") - save_folder = args.output_dir - - noised_imgs = pertubed_images.detach().cpu() - - img_names = [ - str(instance_path[0]).split("/")[-1] for instance_path in train_dataset.instance_images_path - ] - - num_images_to_save = len(img_names) - - for i in range(num_images_to_save): - img_pixel = noised_imgs[i] - img_name = img_names[i] - save_path = os.path.join(save_folder, f"final_noise_{img_name}") - - Image.fromarray( - (img_pixel * 127.5 + 128).clamp(0, 255).to(torch.uint8).permute(1, 2, 0).numpy() - ).save(save_path) - - logger.info(f"Saved {num_images_to_save} final perturbed images to {save_folder}") + if global_step >= args.max_train_steps: + break + # ------------------------- + # FINAL SAVE ONLY (user requirement) + # ------------------------- + accelerator.wait_for_everyone() + _save_final_images(accelerator, args, train_dataset, pertubed_images) accelerator.end_training() @@ -978,5 +1021,4 @@ def main(args): if __name__ == "__main__": args = parse_args() main(args) - print("<-------end-------->") - \ No newline at end of file + print("<-------end-------->") \ No newline at end of file diff --git a/src/backend/app/algorithms/perturbation/simac.py b/src/backend/app/algorithms/perturbation/simac.py index b7dad35..8a2d0c3 100644 --- a/src/backend/app/algorithms/perturbation/simac.py +++ b/src/backend/app/algorithms/perturbation/simac.py @@ -7,10 +7,11 @@ import logging import os import random from pathlib import Path +from typing import Any, Dict, Optional, Sequence, List import datasets import diffusers -import transformers +import transformers import numpy as np import torch import torch.nn.functional as F @@ -30,25 +31,99 @@ from transformers import AutoTokenizer, PretrainedConfig logger = get_logger(__name__) +# ----------------------------- +# Lightweight debug helpers +# ----------------------------- def _cuda_gc() -> None: - """尽力释放未引用的 CUDA 内存,降低碎片化风险,不改变算法行为。""" + """Best-effort CUDA memory cleanup (does not change algorithmic behavior).""" gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() +def _fmt_bytes(n: int) -> str: + return f"{n / (1024**2):.1f}MB" + + +def log_cuda( + prefix: str, + accelerator: Optional[Accelerator] = None, + sync: bool = False, + extra: Optional[Dict[str, Any]] = None, +) -> None: + """Log CUDA memory stats without copying tensors to CPU.""" + if not torch.cuda.is_available(): + logger.info(f"[mem] {prefix} cuda_not_available") + return + if sync: + torch.cuda.synchronize() + alloc = torch.cuda.memory_allocated() + reserv = torch.cuda.memory_reserved() + max_alloc = torch.cuda.max_memory_allocated() + max_reserv = torch.cuda.max_memory_reserved() + dev = str(accelerator.device) if accelerator is not None else "cuda" + msg = ( + f"[mem] {prefix} dev={dev} alloc={_fmt_bytes(alloc)} reserv={_fmt_bytes(reserv)} " + f"max_alloc={_fmt_bytes(max_alloc)} max_reserv={_fmt_bytes(max_reserv)}" + ) + if extra: + msg += " " + " ".join([f"{k}={v}" for k, v in extra.items()]) + logger.info(msg) + + +def log_path_stats(prefix: str, p: Path) -> None: + """Log directory/file existence and file count (best-effort).""" + try: + exists = p.exists() + is_dir = p.is_dir() if exists else False + n_files = 0 + if exists and is_dir: + n_files = 
sum(1 for x in p.iterdir() if x.is_file()) + logger.info(f"[path] {prefix} path={str(p)} exists={exists} is_dir={is_dir} files={n_files}") + except Exception as e: + logger.info(f"[path] {prefix} path={str(p)} stat_error={repr(e)}") + + +def log_args(args: argparse.Namespace) -> None: + for k in sorted(vars(args).keys()): + logger.info(f"[args] {k}={getattr(args, k)}") + + +def log_tensor_meta(prefix: str, t: Optional[torch.Tensor]) -> None: + if t is None: + logger.info(f"[tensor] {prefix} None") + return + logger.info( + f"[tensor] {prefix} shape={tuple(t.shape)} dtype={t.dtype} device={t.device} " + f"requires_grad={t.requires_grad} is_leaf={t.is_leaf}" + ) + + +def setup_seeds() -> None: + seed = 42 + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + cudnn.benchmark = False + cudnn.deterministic = True + + +# ----------------------------- +# Dataset +# ----------------------------- class DreamBoothDatasetFromTensor(Dataset): - """基于内存张量的 DreamBooth 数据集:直接返回图像张量与 prompt token,减少磁盘 IO。""" + """DreamBooth dataset backed by in-memory tensors, plus optional class images (prior preservation).""" def __init__( self, - instance_images_tensor, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - size=512, - center_crop=False, + instance_images_tensor: torch.Tensor, + instance_prompt: str, + tokenizer: AutoTokenizer, + class_data_root: Optional[str] = None, + class_prompt: Optional[str] = None, + size: int = 512, + center_crop: bool = False, ): self.size = size self.center_crop = center_crop @@ -59,15 +134,33 @@ class DreamBoothDatasetFromTensor(Dataset): self.instance_prompt = instance_prompt self._length = self.num_instance_images + self.class_data_root: Optional[Path] + self.class_images_path: List[Path] + self.num_class_images: int + self.class_prompt: Optional[str] + if class_data_root is not None: self.class_data_root = Path(class_data_root) self.class_data_root.mkdir(parents=True, exist_ok=True) - self.class_images_path = list(self.class_data_root.iterdir()) + # Only keep files to avoid directories affecting length. + self.class_images_path = [p for p in self.class_data_root.iterdir() if p.is_file()] self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) self.class_prompt = class_prompt + + if self.num_class_images == 0: + raise ValueError( + f"class_data_dir is empty: {self.class_data_root}. " + f"Prior preservation requires class images. " + f"Please generate class images first, or fix class_data_dir, or disable --with_prior_preservation." 
+ ) + if self.class_prompt is None: + raise ValueError("class_prompt is required when class_data_root is provided.") else: self.class_data_root = None + self.class_images_path = [] + self.num_class_images = 0 + self.class_prompt = None self.image_transforms = transforms.Compose( [ @@ -78,11 +171,11 @@ class DreamBoothDatasetFromTensor(Dataset): ] ) - def __len__(self): + def __len__(self) -> int: return self._length - def __getitem__(self, index): - example = {} + def __getitem__(self, index: int) -> Dict[str, Any]: + example: Dict[str, Any] = {} instance_image = self.instance_images_tensor[index % self.num_instance_images] example["instance_images"] = instance_image example["instance_prompt_ids"] = self.tokenizer( @@ -93,13 +186,15 @@ class DreamBoothDatasetFromTensor(Dataset): return_tensors="pt", ).input_ids - if self.class_data_root: + if self.class_data_root is not None: + if self.num_class_images == 0: + raise ValueError(f"class_data_dir became empty at runtime: {self.class_data_root}") class_image = Image.open(self.class_images_path[index % self.num_class_images]) if class_image.mode != "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) example["class_prompt_ids"] = self.tokenizer( - self.class_prompt, + self.class_prompt, # type: ignore[arg-type] truncation=True, padding="max_length", max_length=self.tokenizer.model_max_length, @@ -109,8 +204,10 @@ class DreamBoothDatasetFromTensor(Dataset): return example -def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): - # 依据 text_encoder 配置识别架构,加载对应实现 +# ----------------------------- +# Model helpers +# ----------------------------- +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: Optional[str]): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", @@ -129,256 +226,156 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st raise ValueError(f"{model_class} is not supported.") -def parse_args(input_args=None): - # 解析全量参数:模型、数据、对抗超参、先验保持、训练与日志设置 - parser = argparse.ArgumentParser(description="Simple example of a training script.") - parser.add_argument( - "--pretrained_model_name_or_path", - type=str, - default=None, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models.", - ) - parser.add_argument( - "--revision", - type=str, - default=None, - required=False, - help=( - "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" - " float32 precision." 
- ), - ) - parser.add_argument( - "--tokenizer_name", - type=str, - default=None, - help="Pretrained tokenizer name or path if not the same as model_name", - ) - parser.add_argument( - "--instance_data_dir_for_train", - type=str, - default=None, - required=True, - help="A folder containing the training data of instance images.", - ) - parser.add_argument( - "--instance_data_dir_for_adversarial", - type=str, - default=None, - required=True, - help="A folder containing the images to add adversarial noise", - ) - parser.add_argument( - "--class_data_dir", - type=str, - default=None, - required=False, - help="A folder containing the training data of class images.", - ) - parser.add_argument( - "--instance_prompt", - type=str, - default=None, - required=True, - help="The prompt with identifier specifying the instance", - ) - parser.add_argument( - "--class_prompt", - type=str, - default=None, - help="The prompt to specify images in the same class as provided instance images.", - ) - parser.add_argument( - "--with_prior_preservation", - default=False, - action="store_true", - help="Flag to add prior preservation loss.", - ) - parser.add_argument( - "--prior_loss_weight", - type=float, - default=1.0, - help="The weight of prior preservation loss.", - ) - parser.add_argument( - "--num_class_images", - type=int, - default=100, - help=( - "Minimal class images for prior preservation loss. If there are not enough images already present in" - " class_data_dir, additional images will be sampled with class_prompt." - ), - ) - parser.add_argument( - "--output_dir", - type=str, - default="text-inversion-model", - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") - parser.add_argument( - "--resolution", - type=int, - default=512, - help=( - "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution" - ), - ) - parser.add_argument( - "--center_crop", - default=False, - action="store_true", - help=( - "Whether to center crop the input images to the resolution. If not set, the images will be randomly" - " cropped. The images will be resized to the resolution first before cropping." - ), - ) - parser.add_argument( - "--train_text_encoder", - action="store_true", - help="Whether to train the text encoder. 
If set, the text encoder should be float32 precision.", - ) - parser.add_argument( - "--train_batch_size", - type=int, - default=4, - help="Batch size (per device) for the training dataloader.", - ) - parser.add_argument( - "--sample_batch_size", - type=int, - default=8, - help="Batch size (per device) for sampling images.", - ) - parser.add_argument( - "--max_train_steps", - type=int, - default=20, - help="Total number of training steps to perform.", - ) - parser.add_argument( - "--max_f_train_steps", - type=int, - default=10, - help="Total number of sub-steps to train surogate model.", - ) - parser.add_argument( - "--max_adv_train_steps", - type=int, - default=10, - help="Total number of sub-steps to train adversarial noise.", - ) - parser.add_argument( - "--checkpointing_iterations", - type=int, - default=5, - help=("Save a checkpoint of the training state every X iterations."), - ) - parser.add_argument( - "--learning_rate", - type=float, - default=5e-6, - help="Initial learning rate (after the potential warmup period) to use.", - ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." - ), - ) - parser.add_argument( - "--allow_tf32", - action="store_true", - help=( - "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" - " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" - ), - ) - parser.add_argument( - "--report_to", - type=str, - default="tensorboard", - help=( - 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' - ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' - ), - ) - parser.add_argument( - "--mixed_precision", - type=str, - default="fp16", - choices=["no", "fp16", "bf16"], - help=( - "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" - " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" - " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." 
- ), - ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", - action="store_true", - help="Whether or not to use xformers.", - ) - parser.add_argument( - "--pgd_alpha", - type=float, - default=0.005, - help="The step size for pgd.", - ) - parser.add_argument( - "--pgd_eps", - type=int, - default=16, - help="The noise budget for pgd.", - ) - parser.add_argument( - "--target_image_path", - default=None, - help="target image for attacking", - ) - parser.add_argument( - "--max_steps", - type=int, - default=50, - help=("Maximum steps for adaptive greedy timestep selection."), - ) - parser.add_argument( - "--delta_t", - type=int, - default=20, - help=("delete 2*delta_t for each adaptive greedy timestep selection."), - ) +def set_unet_attr(unet: UNet2DConditionModel) -> None: + def conv_forward(self): + def forward(input_tensor, temb): + self.in_layers_features = input_tensor + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + + if self.upsample is not None: + if hidden_states.shape[0] >= 64: + input_tensor = input_tensor.contiguous() + hidden_states = hidden_states.contiguous() + input_tensor = self.upsample(input_tensor) + hidden_states = self.upsample(hidden_states) + elif self.downsample is not None: + input_tensor = self.downsample(input_tensor) + hidden_states = self.downsample(hidden_states) + + hidden_states = self.conv1(hidden_states) + + if temb is not None: + temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] + + if temb is not None and self.time_embedding_norm == "default": + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + + if temb is not None and self.time_embedding_norm == "scale_shift": + scale, shift = torch.chunk(temb, 2, dim=1) + hidden_states = hidden_states * (1 + scale) + shift + + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + self.out_layers_features = hidden_states + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + return output_tensor + + return forward + + conv_module_list = [ + unet.up_blocks[3].resnets[0], + unet.up_blocks[3].resnets[1], + unet.up_blocks[3].resnets[2], + ] + for conv_module in conv_module_list: + conv_module.forward = conv_forward(conv_module) + setattr(conv_module, "in_layers_features", None) + setattr(conv_module, "out_layers_features", None) + + +def save_feature_maps(up_blocks, down_blocks) -> torch.Tensor: + out_layers_features_list_3 = [] + res_3_list = [0, 1, 2] + + block = up_blocks[3] + for index in res_3_list: + out_layers_features_list_3.append(block.resnets[index].out_layers_features) + + out_layers_features_list_3 = torch.stack(out_layers_features_list_3, dim=0) + return out_layers_features_list_3 + + +# ----------------------------- +# Args +# ----------------------------- +def parse_args(input_args=None) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="SIMAC training script with diagnostics.") + parser.add_argument("--pretrained_model_name_or_path", type=str, default=None, required=True) + parser.add_argument("--revision", type=str, default=None, required=False) + parser.add_argument("--tokenizer_name", type=str, default=None) + + parser.add_argument("--instance_data_dir_for_train", type=str, default=None, required=True) + 
parser.add_argument("--instance_data_dir_for_adversarial", type=str, default=None, required=True) + + parser.add_argument("--class_data_dir", type=str, default=None, required=False) + parser.add_argument("--instance_prompt", type=str, default=None, required=True) + parser.add_argument("--class_prompt", type=str, default=None) + + parser.add_argument("--with_prior_preservation", default=False, action="store_true") + parser.add_argument("--prior_loss_weight", type=float, default=1.0) + parser.add_argument("--num_class_images", type=int, default=100) + + parser.add_argument("--output_dir", type=str, default="text-inversion-model") + parser.add_argument("--seed", type=int, default=None) + + parser.add_argument("--resolution", type=int, default=512) + parser.add_argument("--center_crop", default=False, action="store_true") + + parser.add_argument("--train_text_encoder", action="store_true") + parser.add_argument("--train_batch_size", type=int, default=4) + + parser.add_argument("--sample_batch_size", type=int, default=8) + + parser.add_argument("--max_train_steps", type=int, default=20) + parser.add_argument("--max_f_train_steps", type=int, default=10) + parser.add_argument("--max_adv_train_steps", type=int, default=10) + + parser.add_argument("--checkpointing_iterations", type=int, default=5) + + parser.add_argument("--learning_rate", type=float, default=5e-6) + parser.add_argument("--logging_dir", type=str, default="logs") + + parser.add_argument("--allow_tf32", action="store_true") + parser.add_argument("--report_to", type=str, default="tensorboard") + parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["no", "fp16", "bf16"]) + + parser.add_argument("--enable_xformers_memory_efficient_attention", action="store_true") + + parser.add_argument("--pgd_alpha", type=float, default=0.005) + parser.add_argument("--pgd_eps", type=int, default=16) + + parser.add_argument("--target_image_path", default=None) + + parser.add_argument("--max_steps", type=int, default=50) + parser.add_argument("--delta_t", type=int, default=20) + + # Debug / diagnostics (low-overhead) + parser.add_argument("--debug", action="store_true", help="Enable detailed logs for failure points.") + parser.add_argument("--debug_cuda_sync", action="store_true", help="Synchronize CUDA for more accurate mem logs.") + parser.add_argument("--debug_step0_only", action="store_true", help="Only print per-step logs for step 0.") + if input_args is not None: args = parser.parse_args(input_args) else: args = parser.parse_args() - return args class PromptDataset(Dataset): - """多 GPU 生成 class 图像的提示词数据集""" + """Dataset for class image generation prompt batching.""" - def __init__(self, prompt, num_samples): + def __init__(self, prompt: str, num_samples: int): self.prompt = prompt self.num_samples = num_samples - def __len__(self): + def __len__(self) -> int: return self.num_samples - def __getitem__(self, index): - example = {} - example["prompt"] = self.prompt - example["index"] = index - return example + def __getitem__(self, index: int) -> Dict[str, Any]: + return {"prompt": self.prompt, "index": index} -def load_data(data_dir, size=512, center_crop=True) -> torch.Tensor: +def load_data(data_dir: Path, size: int = 512, center_crop: bool = True) -> torch.Tensor: image_transforms = transforms.Compose( [ transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), @@ -387,20 +384,25 @@ def load_data(data_dir, size=512, center_crop=True) -> torch.Tensor: transforms.Normalize([0.5], [0.5]), ] ) - - images = 
[image_transforms(Image.open(i).convert("RGB")) for i in list(Path(data_dir).iterdir())] + images = [image_transforms(Image.open(i).convert("RGB")) for i in list(Path(data_dir).iterdir()) if i.is_file()] + if len(images) == 0: + raise ValueError(f"No image files found in directory: {data_dir}") images = torch.stack(images) return images +# ----------------------------- +# Train / Attack +# ----------------------------- def train_one_epoch( - args, + args: argparse.Namespace, models, - tokenizer, - noise_scheduler, - vae, + tokenizer: AutoTokenizer, + noise_scheduler: DDPMScheduler, + vae: AutoencoderKL, data_tensor: torch.Tensor, - num_steps=20, + num_steps: int = 20, + accelerator: Optional[Accelerator] = None, ): unet, text_encoder = copy.deepcopy(models[0]), copy.deepcopy(models[1]) params_to_optimize = itertools.chain(unet.parameters(), text_encoder.parameters()) @@ -417,7 +419,7 @@ def train_one_epoch( data_tensor, args.instance_prompt, tokenizer, - args.class_data_dir, + args.class_data_dir if args.with_prior_preservation else None, args.class_prompt, args.resolution, args.center_crop, @@ -435,9 +437,9 @@ def train_one_epoch( text_encoder.train() step_data = train_dataset[step % len(train_dataset)] - pixel_values = torch.stack([step_data["instance_images"], step_data["class_images"]]).to( - device, dtype=weight_dtype - ) + + # Fail fast if class branch missing keys. + pixel_values = torch.stack([step_data["instance_images"], step_data["class_images"]]).to(device, dtype=weight_dtype) input_ids = torch.cat([step_data["instance_prompt_ids"], step_data["class_prompt_ids"]], dim=0).to(device) latents = vae.encode(pixel_values).latent_dist.sample() @@ -445,13 +447,10 @@ def train_one_epoch( noise = torch.randn_like(latents) bsz = latents.shape[0] - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device).long() noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - encoder_hidden_states = text_encoder(input_ids)[0] - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample if noise_scheduler.config.prediction_type == "epsilon": @@ -459,7 +458,7 @@ def train_one_epoch( elif noise_scheduler.config.prediction_type == "v_prediction": target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + raise ValueError("Unknown prediction type {}".format(noise_scheduler.config.prediction_type)) if args.with_prior_preservation: model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) @@ -478,210 +477,29 @@ def train_one_epoch( optimizer.step() optimizer.zero_grad() - print( - f"Step #{step}, loss: {loss.detach().item()}, prior_loss: {prior_loss.detach().item()}, " - f"instance_loss: {instance_loss.detach().item()}" + logger.info( + f"[train_one_epoch] step={step} loss={loss.detach().item():.6f} " + f"prior={prior_loss.detach().item():.6f} inst={instance_loss.detach().item():.6f}" ) - # 尽早释放当前步的中间张量,降低显存占用 del step_data, pixel_values, input_ids, latents, noise, timesteps, noisy_latents, encoder_hidden_states del model_pred, target, loss, prior_loss, instance_loss - # 释放优化器与数据集引用,进一步回收显存 del optimizer, train_dataset, params_to_optimize _cuda_gc() - return [unet, text_encoder] -def set_unet_attr(unet): - # 覆写若干 up_block 的 resnet forward,捕获中间特征以供特征对齐损失使用 - def conv_forward(self): - def 
forward(input_tensor, temb): - self.in_layers_features = input_tensor - hidden_states = input_tensor - - hidden_states = self.norm1(hidden_states) - hidden_states = self.nonlinearity(hidden_states) - - if self.upsample is not None: - if hidden_states.shape[0] >= 64: - input_tensor = input_tensor.contiguous() - hidden_states = hidden_states.contiguous() - input_tensor = self.upsample(input_tensor) - hidden_states = self.upsample(hidden_states) - elif self.downsample is not None: - input_tensor = self.downsample(input_tensor) - hidden_states = self.downsample(hidden_states) - - hidden_states = self.conv1(hidden_states) - - if temb is not None: - temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] - - if temb is not None and self.time_embedding_norm == "default": - hidden_states = hidden_states + temb - - hidden_states = self.norm2(hidden_states) - - if temb is not None and self.time_embedding_norm == "scale_shift": - scale, shift = torch.chunk(temb, 2, dim=1) - hidden_states = hidden_states * (1 + scale) + shift - - hidden_states = self.nonlinearity(hidden_states) - - hidden_states = self.dropout(hidden_states) - hidden_states = self.conv2(hidden_states) - self.out_layers_features = hidden_states - if self.conv_shortcut is not None: - input_tensor = self.conv_shortcut(input_tensor) - - output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - return output_tensor - - return forward - - conv_module_list = [ - unet.up_blocks[3].resnets[0], - unet.up_blocks[3].resnets[1], - unet.up_blocks[3].resnets[2], - ] - for conv_module in conv_module_list: - conv_module.forward = conv_forward(conv_module) - setattr(conv_module, "in_layers_features", None) - setattr(conv_module, "out_layers_features", None) - - -def save_feature_maps(up_blocks, down_blocks): - # 收集指定 up_block 的输出特征,用于对抗攻击中的特征对齐 - out_layers_features_list_3 = [] - res_3_list = [0, 1, 2] - - block = up_blocks[3] - for index in res_3_list: - out_layers_features_list_3.append(block.resnets[index].out_layers_features) - - out_layers_features_list_3 = torch.stack(out_layers_features_list_3, dim=0) - return out_layers_features_list_3 - - -def pgd_attack( - args, - models, - tokenizer, - noise_scheduler, - vae, - data_tensor: torch.Tensor, - original_images: torch.Tensor, - target_tensor: torch.Tensor, - num_steps: int, - time_list, -): - """PGD 对抗扰动:按预选时间步迭代更新图像,可附加特征对齐正则;尝试提前释放无用张量。""" - unet, text_encoder = models - weight_dtype = torch.bfloat16 - device = torch.device("cuda") - - vae.to(device, dtype=weight_dtype) - text_encoder.to(device, dtype=weight_dtype) - unet.to(device, dtype=weight_dtype) - set_unet_attr(unet) - - perturbed_images = data_tensor.detach().clone() - perturbed_images.requires_grad_(True) - - input_ids = tokenizer( - args.instance_prompt, - truncation=True, - padding="max_length", - max_length=tokenizer.model_max_length, - return_tensors="pt", - ).input_ids.repeat(len(data_tensor), 1) - - for step in range(num_steps): - perturbed_images.requires_grad_(True) - - latents = vae.encode(perturbed_images.to(device, dtype=weight_dtype)).latent_dist.sample() - latents = latents * vae.config.scaling_factor - - noise = torch.randn_like(latents) - - # 为每个样本从其时间步列表中随机选择一个时间步 - timesteps = [] - for i in range(len(data_tensor)): - ts = time_list[i] - ts_index = torch.randint(0, len(ts), (1,)) - timestep = torch.IntTensor([ts[ts_index]]) - timestep = timestep.long() - timesteps.append(timestep) - timesteps = torch.cat(timesteps).to(device) - - noisy_latents = noise_scheduler.add_noise(latents, noise, 
timesteps) - - encoder_hidden_states = text_encoder(input_ids.to(device))[0] - - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample - - if noise_scheduler.config.prediction_type == "epsilon": - target = noise - elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) - else: - raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - - noise_out_layers_features_3 = save_feature_maps(unet.up_blocks, unet.down_blocks) - - # 计算干净样本的对应特征用于对齐(不反传) - with torch.no_grad(): - clean_latents = vae.encode(data_tensor.to(device, dtype=weight_dtype)).latent_dist.sample() - clean_latents = clean_latents * vae.config.scaling_factor - noisy_clean_latents = noise_scheduler.add_noise(clean_latents, noise, timesteps) - _ = unet(noisy_clean_latents, timesteps, encoder_hidden_states).sample - clean_out_layers_features_3 = save_feature_maps(unet.up_blocks, unet.down_blocks) - - target_loss = F.mse_loss( - noise_out_layers_features_3.float(), - clean_out_layers_features_3.float(), - reduction="mean", - ) - - unet.zero_grad(set_to_none=True) - text_encoder.zero_grad(set_to_none=True) - - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - loss = loss + target_loss.detach().item() # 特征对齐损失保持为常数项,不反传 - loss.backward() - - alpha = args.pgd_alpha - eps = args.pgd_eps / 255 - adv_images = perturbed_images + alpha * perturbed_images.grad.sign() - eta = torch.clamp(adv_images - original_images, min=-eps, max=+eps) - perturbed_images = torch.clamp(original_images + eta, min=-1, max=+1).detach_() - - print( - f"PGD loss - step {step}, loss: {loss.detach().item()}, target_loss : {target_loss.detach().item()}" - ) - - # 尽早释放当前步的中间张量 - del latents, noise, timesteps, noisy_latents, encoder_hidden_states, model_pred, target - del noise_out_layers_features_3, clean_latents, noisy_clean_latents, clean_out_layers_features_3 - del target_loss, loss, adv_images, eta - - _cuda_gc() - return perturbed_images - - def select_timestep( - args, + args: argparse.Namespace, models, - tokenizer, - noise_scheduler, - vae, + tokenizer: AutoTokenizer, + noise_scheduler: DDPMScheduler, + vae: AutoencoderKL, data_tensor: torch.Tensor, original_images: torch.Tensor, - target_tensor: torch.Tensor, + target_tensor: Optional[torch.Tensor], ): - """为每张图选择一个时间步列表:通过多次梯度采样筛掉部分时间步,减少攻击开销,同时保持外部行为不变。""" unet, text_encoder = models weight_dtype = torch.bfloat16 device = torch.device("cuda") @@ -701,7 +519,7 @@ def select_timestep( return_tensors="pt", ).input_ids - time_list = [] + time_list: List[torch.Tensor] = [] for img_id in range(len(data_tensor)): perturbed_image = perturbed_images[img_id, :].unsqueeze(0) original_image = original_images[img_id, :].unsqueeze(0) @@ -714,7 +532,6 @@ def select_timestep( select_mask = torch.where(input_mask == 1, True, False) res_time_seq = torch.masked_select(time_seq, select_mask) - # 如果剩余时间步仍多,随机抽取部分时间步估计梯度,删除一段时间步区间 if len(res_time_seq) > 100: min_score, max_score = 0.0, 0.0 for inner_try in range(0, 5): @@ -725,13 +542,10 @@ def select_timestep( noise = torch.randn_like(latents) bsz = latents.shape[0] inner_index = torch.randint(0, len(res_time_seq), (bsz,)) - timesteps = torch.IntTensor([res_time_seq[inner_index]]).to(device) - timesteps = timesteps.long() + timesteps = torch.IntTensor([res_time_seq[inner_index]]).to(device).long() noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - encoder_hidden_states = text_encoder(input_ids.to(device))[0] - 
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
 
                 if noise_scheduler.config.prediction_type == "epsilon":
@@ -739,7 +553,7 @@ def select_timestep(
                 elif noise_scheduler.config.prediction_type == "v_prediction":
                     target = noise_scheduler.get_velocity(latents, noise, timesteps)
                 else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+                    raise ValueError("Unknown prediction type {}".format(noise_scheduler.config.prediction_type))
 
                 unet.zero_grad(set_to_none=True)
                 text_encoder.zero_grad(set_to_none=True)
@@ -763,15 +577,15 @@ def select_timestep(
                     max_score = score
                     select_t = res_time_seq[inner_index].item()
 
-                print(
-                    f"PGD loss - step {step}, index : {inner_try + 1}, loss: {loss.detach().item()}, "
-                    f"score: {score}, t : {res_time_seq[inner_index]}, ts_len: {len(res_time_seq)}"
-                )
+                if args.debug:
+                    logger.info(
+                        f"[select_timestep] img={img_id} outer={step} inner={inner_try} "
+                        f"loss={loss.detach().item():.6f} score={float(score)} "
+                        f"t={res_time_seq[inner_index].item()} len={len(res_time_seq)}"
+                    )
 
                 del latents, noise, timesteps, noisy_latents, encoder_hidden_states, model_pred, target, loss, score
 
-        # 删除一段时间步以缩小候选集合,并记录当前选中的时间步
-        print("del_t", del_t, "max_t", select_t)
         if del_t < args.delta_t:
             del_t = args.delta_t
         elif del_t > (1000 - args.delta_t):
@@ -785,8 +599,7 @@ def select_timestep(
         latents = latents * vae.config.scaling_factor
 
         noise = torch.randn_like(latents)
-        timesteps = torch.IntTensor([select_t]).to(device)
-        timesteps = timesteps.long()
+        timesteps = torch.IntTensor([select_t]).to(device).long()
 
         noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
         encoder_hidden_states = text_encoder(input_ids.to(device))[0]
@@ -797,11 +610,10 @@ def select_timestep(
         elif noise_scheduler.config.prediction_type == "v_prediction":
             target = noise_scheduler.get_velocity(latents, noise, timesteps)
         else:
-            raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+            raise ValueError("Unknown prediction type {}".format(noise_scheduler.config.prediction_type))
 
         unet.zero_grad(set_to_none=True)
         text_encoder.zero_grad(set_to_none=True)
-
         loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
         loss.backward()
 
@@ -809,7 +621,6 @@ def select_timestep(
         eps = args.pgd_eps / 255
         adv_image = id_image + alpha * id_image.grad.sign()
         eta = torch.clamp(adv_image - original_image, min=-eps, max=+eps)
-        _ = torch.sum(torch.abs(id_image.grad.sign()))
         id_image = torch.clamp(original_image + eta, min=-1, max=+1).detach_()
 
         del latents, noise, timesteps, noisy_latents, encoder_hidden_states, model_pred, target, loss, adv_image, eta
@@ -826,18 +637,109 @@ def select_timestep(
     return time_list
 
 
-def setup_seeds():
-    # 设置统一随机种子并关闭 cudnn 非确定性,保证结果可复现
-    seed = 42
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    cudnn.benchmark = False
-    cudnn.deterministic = True
+def pgd_attack(
+    args: argparse.Namespace,
+    models,
+    tokenizer: AutoTokenizer,
+    noise_scheduler: DDPMScheduler,
+    vae: AutoencoderKL,
+    data_tensor: torch.Tensor,
+    original_images: torch.Tensor,
+    target_tensor: Optional[torch.Tensor],
+    num_steps: int,
+    time_list: list[torch.Tensor],
+):
+    unet, text_encoder = models
+    weight_dtype = torch.bfloat16
+    device = torch.device("cuda")
+
+    vae.to(device, dtype=weight_dtype)
+    text_encoder.to(device, dtype=weight_dtype)
+    unet.to(device, dtype=weight_dtype)
+    set_unet_attr(unet)
+
+    perturbed_images = data_tensor.detach().clone()
+
perturbed_images.requires_grad_(True) + + input_ids = tokenizer( + args.instance_prompt, + truncation=True, + padding="max_length", + max_length=tokenizer.model_max_length, + return_tensors="pt", + ).input_ids.repeat(len(data_tensor), 1) + + for step in range(num_steps): + perturbed_images.requires_grad_(True) + + latents = vae.encode(perturbed_images.to(device, dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + noise = torch.randn_like(latents) + + timesteps_list = [] + for i in range(len(data_tensor)): + ts = time_list[i] + if len(ts) == 0: + raise ValueError(f"time_list[{i}] is empty; select_timestep failed.") + ts_index = torch.randint(0, len(ts), (1,)) + timestep = torch.IntTensor([ts[ts_index]]).long() + timesteps_list.append(timestep) + timesteps = torch.cat(timesteps_list).to(device) + + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + encoder_hidden_states = text_encoder(input_ids.to(device))[0] + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError("Unknown prediction type {}".format(noise_scheduler.config.prediction_type)) + + noise_out_layers_features_3 = save_feature_maps(unet.up_blocks, unet.down_blocks) + + with torch.no_grad(): + clean_latents = vae.encode(data_tensor.to(device, dtype=weight_dtype)).latent_dist.sample() + clean_latents = clean_latents * vae.config.scaling_factor + noisy_clean_latents = noise_scheduler.add_noise(clean_latents, noise, timesteps) + _ = unet(noisy_clean_latents, timesteps, encoder_hidden_states).sample + clean_out_layers_features_3 = save_feature_maps(unet.up_blocks, unet.down_blocks) + target_loss = F.mse_loss(noise_out_layers_features_3.float(), clean_out_layers_features_3.float(), reduction="mean") + + unet.zero_grad(set_to_none=True) + text_encoder.zero_grad(set_to_none=True) -def main(args): + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + # Keep behavior: target_loss as constant (no backprop). 
+ loss = loss + target_loss.detach().item() + loss.backward() + + alpha = args.pgd_alpha + eps = args.pgd_eps / 255 + adv_images = perturbed_images + alpha * perturbed_images.grad.sign() + eta = torch.clamp(adv_images - original_images, min=-eps, max=+eps) + perturbed_images = torch.clamp(original_images + eta, min=-1, max=+1).detach_() + + logger.info( + f"[pgd] step={step} loss={loss.detach().item():.6f} target_loss={target_loss.detach().item():.6f} " + f"alpha={alpha} eps={eps}" + ) + + del latents, noise, timesteps, noisy_latents, encoder_hidden_states, model_pred, target + del noise_out_layers_features_3, clean_latents, noisy_clean_latents, clean_out_layers_features_3 + del target_loss, loss, adv_images, eta + + _cuda_gc() + return perturbed_images + + +# ----------------------------- +# Main +# ----------------------------- +def main(args: argparse.Namespace) -> None: logging_dir = Path(args.output_dir, args.logging_dir) accelerator = Accelerator( @@ -851,6 +753,7 @@ def main(args): datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) + logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() @@ -861,15 +764,33 @@ def main(args): transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() + if accelerator.is_local_main_process: + logger.info(f"[run] using_file={__file__}") + log_args(args) + if args.seed is not None: set_seed(args.seed) setup_seeds() - # 先验保持:若 class 图像不足,则通过基础 pipeline 生成补齐 + if args.debug and accelerator.is_local_main_process: + log_cuda("startup", accelerator, sync=args.debug_cuda_sync) + + # ------------------------- + # Prior preservation: generate class images if needed + # ------------------------- if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("--with_prior_preservation requires --class_data_dir") + if args.class_prompt is None: + raise ValueError("--with_prior_preservation requires --class_prompt") + class_images_dir = Path(args.class_data_dir) class_images_dir.mkdir(parents=True, exist_ok=True) - cur_class_images = len(list(class_images_dir.iterdir())) + log_path_stats("class_dir_before", class_images_dir) + + cur_class_images = sum(1 for p in class_images_dir.iterdir() if p.is_file()) + if accelerator.is_local_main_process: + logger.info(f"[class_gen] cur_class_images={cur_class_images} target={args.num_class_images}") if cur_class_images < args.num_class_images: torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 @@ -880,6 +801,11 @@ def main(args): elif args.mixed_precision == "bf16": torch_dtype = torch.bfloat16 + if accelerator.is_local_main_process: + logger.info(f"[class_gen] will_generate={args.num_class_images - cur_class_images} torch_dtype={torch_dtype}") + if args.debug: + log_cuda("before_pipeline_load", accelerator, sync=args.debug_cuda_sync) + pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, torch_dtype=torch_dtype, @@ -889,8 +815,6 @@ def main(args): pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images - logger.info(f"Number of class images to sample: {num_new_images}.") - sample_dataset = PromptDataset(args.class_prompt, num_new_images) sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) @@ -903,6 +827,9 @@ def main(args): disable=not accelerator.is_local_main_process, ): images = pipeline(example["prompt"]).images + if 
accelerator.is_local_main_process and args.debug: + logger.info(f"[class_gen] batch_size={len(images)}") + for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" @@ -911,32 +838,42 @@ def main(args): del pipeline, sample_dataset, sample_dataloader _cuda_gc() + accelerator.wait_for_everyone() + + final_class_images = sum(1 for p in class_images_dir.iterdir() if p.is_file()) + if accelerator.is_local_main_process: + logger.info(f"[class_gen] done final_class_images={final_class_images}") + log_path_stats("class_dir_after", class_images_dir) + if final_class_images == 0: + raise RuntimeError(f"class image generation failed: {class_images_dir} is still empty.") + else: + accelerator.wait_for_everyone() + if accelerator.is_local_main_process: + logger.info("[class_gen] skipped (already enough images)") + else: + if accelerator.is_local_main_process: + logger.info("[class_gen] disabled (with_prior_preservation is False)") + + # ------------------------- + # Load models + # ------------------------- text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + if accelerator.is_local_main_process and args.debug: + log_cuda("before_load_models", accelerator, sync=args.debug_cuda_sync) + text_encoder = text_encoder_cls.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="text_encoder", - revision=args.revision, + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision ) unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="unet", - revision=args.revision, + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision ) - tokenizer = AutoTokenizer.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="tokenizer", - revision=args.revision, - use_fast=False, + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False ) - noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, - subfolder="vae", - revision=args.revision, + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision ).cuda() vae.requires_grad_(False) @@ -946,103 +883,81 @@ def main(args): if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True - clean_data = load_data( - args.instance_data_dir_for_train, - size=args.resolution, - center_crop=args.center_crop, - ) - perturbed_data = load_data( - args.instance_data_dir_for_adversarial, - size=args.resolution, - center_crop=args.center_crop, - ) - original_data = perturbed_data.clone() - original_data.requires_grad_(False) - if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): unet.enable_xformers_memory_efficient_attention() + if accelerator.is_local_main_process: + logger.info("[xformers] enabled") else: raise ValueError("xformers is not available. 
Make sure it is installed correctly") - target_latent_tensor = None + # ------------------------- + # Load data tensors + # ------------------------- + train_dir = Path(args.instance_data_dir_for_train) + adv_dir = Path(args.instance_data_dir_for_adversarial) + if accelerator.is_local_main_process and args.debug: + log_path_stats("train_dir", train_dir) + log_path_stats("adv_dir", adv_dir) + + clean_data = load_data(train_dir, size=args.resolution, center_crop=args.center_crop) + perturbed_data = load_data(adv_dir, size=args.resolution, center_crop=args.center_crop) + original_data = perturbed_data.clone() + original_data.requires_grad_(False) + + if accelerator.is_local_main_process and args.debug: + log_tensor_meta("clean_data_cpu", clean_data) + log_tensor_meta("perturbed_data_cpu", perturbed_data) + + target_latent_tensor: Optional[torch.Tensor] = None if args.target_image_path is not None: target_image_path = Path(args.target_image_path) - assert target_image_path.is_file(), f"Target image path {target_image_path} does not exist" + if not target_image_path.is_file(): + raise ValueError(f"Target image path does not exist: {target_image_path}") target_image = Image.open(target_image_path).convert("RGB").resize((args.resolution, args.resolution)) target_image = np.array(target_image)[None].transpose(0, 3, 1, 2) target_image_tensor = torch.from_numpy(target_image).to("cuda", dtype=torch.float32) / 127.5 - 1.0 - target_latent_tensor = ( - vae.encode(target_image_tensor).latent_dist.sample().to(dtype=torch.bfloat16) * vae.config.scaling_factor - ) + target_latent_tensor = vae.encode(target_image_tensor).latent_dist.sample().to(dtype=torch.bfloat16) + target_latent_tensor = target_latent_tensor * vae.config.scaling_factor target_latent_tensor = target_latent_tensor.repeat(len(perturbed_data), 1, 1, 1).cuda() + if accelerator.is_local_main_process and args.debug: + log_tensor_meta("target_latent_tensor", target_latent_tensor) + f = [unet, text_encoder] - time_list = select_timestep( - args, - f, - tokenizer, - noise_scheduler, - vae, - perturbed_data, - original_data, - target_latent_tensor, - ) - for t in time_list: - print(t) + if accelerator.is_local_main_process: + logger.info("[phase] select_timestep begin") + time_list = select_timestep(args, f, tokenizer, noise_scheduler, vae, perturbed_data, original_data, target_latent_tensor) + if accelerator.is_local_main_process: + logger.info("[phase] select_timestep end") for i in range(args.max_train_steps): - f_sur = copy.deepcopy(f) + if accelerator.is_local_main_process: + logger.info(f"[outer] i={i}/{args.max_train_steps}") - f_sur = train_one_epoch( - args, - f_sur, - tokenizer, - noise_scheduler, - vae, - clean_data, - args.max_f_train_steps, - ) + f_sur = copy.deepcopy(f) + f_sur = train_one_epoch(args, f_sur, tokenizer, noise_scheduler, vae, clean_data, args.max_f_train_steps, accelerator=accelerator) perturbed_data = pgd_attack( - args, - f_sur, - tokenizer, - noise_scheduler, - vae, - perturbed_data, - original_data, - target_latent_tensor, - args.max_adv_train_steps, - time_list, + args, f_sur, tokenizer, noise_scheduler, vae, + perturbed_data, original_data, target_latent_tensor, + args.max_adv_train_steps, time_list ) - # 及时释放 surrogate,保持显存占用稳定 del f_sur _cuda_gc() - f = train_one_epoch( - args, - f, - tokenizer, - noise_scheduler, - vae, - perturbed_data, - args.max_f_train_steps, - ) + f = train_one_epoch(args, f, tokenizer, noise_scheduler, vae, perturbed_data, args.max_f_train_steps, accelerator=accelerator) if (i + 1) % 
args.checkpointing_iterations == 0: save_folder = args.output_dir os.makedirs(save_folder, exist_ok=True) noised_imgs = perturbed_data.detach() - img_names = [ - str(instance_path).split("/")[-1].split(".")[0] - for instance_path in list(Path(args.instance_data_dir_for_adversarial).iterdir()) - ] + img_names = [p.stem for p in adv_dir.iterdir() if p.is_file()] for img_pixel, img_name in zip(noised_imgs, img_names): save_path = os.path.join(save_folder, f"perturbed_{img_name}.png") @@ -1055,9 +970,9 @@ def main(args): .numpy() ).save(save_path) - print(f"Saved perturbed images at step {i+1} to {save_folder} (Files are overwritten)") + if accelerator.is_local_main_process: + logger.info(f"[save] step={i+1} saved={len(img_names)} to {save_folder}") - # 外层迭代结束后的清理 _cuda_gc() diff --git a/src/backend/app/scripts/attack_anti_face_edit.sh b/src/backend/app/scripts/attack_anti_face_edit.sh index c66ced4..ef3dd3a 100644 --- a/src/backend/app/scripts/attack_anti_face_edit.sh +++ b/src/backend/app/scripts/attack_anti_face_edit.sh @@ -50,7 +50,6 @@ CUDA_VISIBLE_DEVICES=0 python ../algorithms/pid.py \ --center_crop \ --eps 10 \ --step_size 0.002 \ - --save_every 200 \ --attack_type add-log \ --seed 0 \ --dataloader_num_workers 2 diff --git a/src/backend/app/scripts/attack_caat.sh b/src/backend/app/scripts/attack_caat.sh index 00a9f8c..41b8d0a 100644 --- a/src/backend/app/scripts/attack_caat.sh +++ b/src/backend/app/scripts/attack_caat.sh @@ -1,8 +1,5 @@ #需要环境:conda activate caat export HF_HUB_OFFLINE=1 -# 强制使用本地模型缓存,避免联网下载模型 -#export HF_HOME="/root/autodl-tmp/huggingface_cache" -#export MODEL_NAME="runwayml/stable-diffusion-v1-5" export MODEL_NAME="../../static/hf_models/hub/models--runwayml--stable-diffusion-v1-5/snapshots/451f4fe16113bff5a5d2269ed5ad43b0592e9a14" export TASKNAME="task001" ### Data to be protected @@ -10,28 +7,28 @@ export INSTANCE_DIR="../../static/originals/${TASKNAME}" ### Path to save the protected data export OUTPUT_DIR="../../static/perturbed/${TASKNAME}" -# ------------------------- 自动创建依赖路径 ------------------------- echo "Creating required directories..." mkdir -p "$INSTANCE_DIR" mkdir -p "$OUTPUT_DIR" echo "Directories created successfully." -# ------------------------- 训练前清空 OUTPUT_DIR ------------------------- echo "Clearing output directory: $OUTPUT_DIR" -# 查找并删除目录下的所有文件和子目录(但不删除 . 或 ..) 
find "$OUTPUT_DIR" -mindepth 1 -delete +# --debug_oom \ +# --debug_oom_sync accelerate launch ../algorithms/caat.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ - --instance_prompt="a photo of a person" \ - --resolution=512 \ - --learning_rate=1e-5 \ - --lr_warmup_steps=0 \ - --max_train_steps=250 \ + --pretrained_model_name_or_path="$MODEL_NAME" \ + --instance_data_dir="$INSTANCE_DIR" \ + --output_dir="$OUTPUT_DIR" \ + --instance_prompt="a photo of person" \ + --resolution 512 \ + --learning_rate 1e-5 \ + --lr_warmup_steps 0 \ + --max_train_steps 250 \ --hflip \ - --mixed_precision bf16 \ - --alpha=5e-3 \ - --eps=0.05 \ No newline at end of file + --mixed_precision bf16 \ + --alpha 5e-3 \ + --eps 0.05 \ + --micro_batch_size 2 \ No newline at end of file diff --git a/src/backend/app/scripts/attack_caat_with_prior.sh b/src/backend/app/scripts/attack_caat_with_prior.sh index ffed2d0..0f2851d 100644 --- a/src/backend/app/scripts/attack_caat_with_prior.sh +++ b/src/backend/app/scripts/attack_caat_with_prior.sh @@ -1,8 +1,5 @@ #需要环境:conda activate caat export HF_HUB_OFFLINE=1 -# 强制使用本地模型缓存,避免联网下载模型 -#export HF_HOME="/root/autodl-tmp/huggingface_cache" -#export MODEL_NAME="runwayml/stable-diffusion-v1-5" export MODEL_NAME="../../static/hf_models/hub/models--runwayml--stable-diffusion-v1-5/snapshots/451f4fe16113bff5a5d2269ed5ad43b0592e9a14" export TASKNAME="task001" ### Data to be protected @@ -11,44 +8,40 @@ export INSTANCE_DIR="../../static/originals/${TASKNAME}" export OUTPUT_DIR="../../static/perturbed/${TASKNAME}" export CLASS_DIR="../../static/class/${TASKNAME}" -# ------------------------- 自动创建依赖路径 ------------------------- echo "Creating required directories..." mkdir -p "$INSTANCE_DIR" mkdir -p "$OUTPUT_DIR" mkdir -p "$CLASS_DIR" echo "Directories created successfully." -# ------------------------- 训练前清空 OUTPUT_DIR ------------------------- echo "Clearing output directory: $OUTPUT_DIR" -# 查找并删除目录下的所有文件和子目录(但不删除 . 或 ..) find "$OUTPUT_DIR" -mindepth 1 -delete +# --debug_oom \ +# --debug_oom_sync + accelerate launch ../algorithms/caat.py \ - --pretrained_model_name_or_path=$MODEL_NAME \ - --instance_data_dir=$INSTANCE_DIR \ - --output_dir=$OUTPUT_DIR \ + --pretrained_model_name_or_path="$MODEL_NAME" \ + --instance_data_dir="$INSTANCE_DIR" \ + --output_dir="$OUTPUT_DIR" \ --with_prior_preservation \ - --instance_prompt="a photo of a person" \ + --instance_prompt="a photo of person" \ --num_class_images=200 \ - --class_data_dir=$CLASS_DIR \ + --class_data_dir="$CLASS_DIR" \ --class_prompt='person' \ --resolution=512 \ --learning_rate=1e-5 \ --lr_warmup_steps=0 \ --max_train_steps=250 \ --hflip \ - --mixed_precision bf16 \ - --alpha=5e-3 \ - --eps=0.05 - + --mixed_precision bf16 \ + --alpha=5e-3 \ + --eps=0.05 \ + --micro_batch_size 2 -# ------------------------- 【步骤 2】训练后清空 CLASS_DIR ------------------------- -# 注意:这会在 accelerate launch 成功结束后执行 echo "Clearing class directory: $CLASS_DIR" -# 确保目录存在,避免清理命令失败 mkdir -p "$CLASS_DIR" -# 查找并删除目录下的所有文件和子目录(但不删除 . 或 ..) find "$CLASS_DIR" -mindepth 1 -delete echo "Script finished." 
\ No newline at end of file
-- 
2.34.1


From b6e65cab47e3bc67bd36a8054385f1377e57b210 Mon Sep 17 00:00:00 2001
From: Ryan <3266408525@qq.com>
Date: Wed, 7 Jan 2026 00:27:25 +0800
Subject: [PATCH 3/3] =?UTF-8?q?improve:=20=E8=83=A1=E5=B8=86=E6=8F=90?=
 =?UTF-8?q?=E4=BA=A4=E8=AF=AF=E5=88=A0=E7=9A=84quick=E9=85=8D=E7=BD=AE?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/backend/app/scripts/attack_quick.sh | 46 +++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 src/backend/app/scripts/attack_quick.sh

diff --git a/src/backend/app/scripts/attack_quick.sh b/src/backend/app/scripts/attack_quick.sh
new file mode 100644
index 0000000..29456eb
--- /dev/null
+++ b/src/backend/app/scripts/attack_quick.sh
@@ -0,0 +1,46 @@
+# Required environment: conda activate pid
+### Generate images protected by PID
+
+export HF_HUB_OFFLINE=1
+# Force the local model cache so no model download is attempted
+
+### SD v1.5
+export MODEL_PATH="../../static/hf_models/hub/models--runwayml--stable-diffusion-v1-5/snapshots/451f4fe16113bff5a5d2269ed5ad43b0592e9a14"
+
+
+export TASKNAME="task003"
+### Data to be protected
+export INSTANCE_DIR="../../static/originals/${TASKNAME}"
+### Path to save the protected data
+export OUTPUT_DIR="../../static/perturbed/${TASKNAME}"
+
+# ------------------------- Create required directories -------------------------
+echo "Creating required directories..."
+mkdir -p "$INSTANCE_DIR"
+mkdir -p "$OUTPUT_DIR"
+echo "Directories created successfully."
+
+
+# ------------------------- Clear OUTPUT_DIR before the attack -------------------------
+echo "Clearing output directory: $OUTPUT_DIR"
+# Delete all files and subdirectories under the directory (but not . or ..)
+find "$OUTPUT_DIR" -mindepth 1 -delete
+
+export PYTHONWARNINGS="ignore"
+# Suppress all Python warnings
+
+### Generation command
+# --max_train_steps: optimization steps
+# --attack_type: target loss to optimize, choices=['var', 'mean', 'KL', 'add-log', 'latent_vector', 'add'],
+# please refer to pid.py for more usage details
+
+CUDA_VISIBLE_DEVICES=0 python ../algorithms/pid.py \
+    --pretrained_model_name_or_path=$MODEL_PATH \
+    --instance_data_dir=$INSTANCE_DIR \
+    --output_dir=$OUTPUT_DIR \
+    --resolution=512 \
+    --max_train_steps=120 \
+    --eps 16 \
+    --step_size 0.01 \
+    --attack_type add-log \
+    --center_crop
\ No newline at end of file
-- 
2.34.1
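
The update that pgd_attack applies each step, in both the deleted and the rewritten version, is a standard L-infinity projected sign-gradient ascent. A minimal self-contained sketch, assuming images normalized to [-1, 1]; pgd_step and loss_fn are illustrative stand-ins (loss_fn would be the diffusion denoising MSE computed through the VAE and UNet), not project API:

import torch

def pgd_step(perturbed: torch.Tensor, original: torch.Tensor,
             loss_fn, alpha: float, eps: float) -> torch.Tensor:
    # Fresh leaf tensor so backward() populates .grad.
    perturbed = perturbed.detach().clone().requires_grad_(True)
    loss = loss_fn(perturbed)  # scalar loss on the perturbed images
    loss.backward()
    # Ascend along the gradient sign to maximize the training loss,
    adv = perturbed + alpha * perturbed.grad.sign()
    # project back into the eps-ball around the clean images,
    eta = torch.clamp(adv - original, min=-eps, max=eps)
    # and clamp to the valid pixel range.
    return torch.clamp(original + eta, min=-1.0, max=1.0).detach()

Note that args.pgd_eps is divided by 255 before the clamp, so the command-line budget is expressed on a 0-255 scale even though the image tensors live in [-1, 1].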
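
The "# Keep behavior: target_loss as constant (no backprop)." comment in the rewritten pgd_attack is load-bearing: adding a detached Python float to a loss changes the value that gets logged but contributes nothing to the gradient, so the feature-alignment term does not steer the attack. A standalone sanity check (hypothetical tensors, not project code):

import torch

x = torch.randn(4, requires_grad=True)
base = (x ** 2).sum()   # the term actually optimized
extra = x.sin().sum()   # the term only reported

g_base = torch.autograd.grad(base, x, retain_graph=True)[0]
g_both = torch.autograd.grad(base + extra.detach().item(), x)[0]
assert torch.equal(g_base, g_both)  # the detached constant adds no gradient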