Commit 1235e6e
Parent(s): 173552f

Update training 2

Files changed: app.py (+12 -4), train_dreambooth.py (+68 -11)
app.py CHANGED

@@ -30,7 +30,7 @@ maximum_concepts = 3
 
 #Pre download the files
 model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
-
+model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
 model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
 safety_checker = snapshot_download(repo_id="multimodalart/sd-sc")
 
@@ -171,6 +171,10 @@ def train(*inputs):
         Training_Steps=1400
 
     stptxt = int((Training_Steps*Train_text_encoder_for)/100)
+    #gradient_checkpointing = False if which_model == "v1-5" else True
+    gradient_checkpointing=False
+    resolution = 512 if which_model != "v2-768" else 768
+    cache_latents = True if which_model != "v1-5" else False
     if (type_of_thing == "object" or type_of_thing == "style" or (type_of_thing == "person" and not experimental_face_improvement)):
         args_general = argparse.Namespace(
             image_captions_filename = True,
@@ -183,7 +187,7 @@ def train(*inputs):
            output_dir="output_model",
            instance_prompt="",
            seed=42,
-           resolution=512,
+           resolution=resolution,
            mixed_precision="fp16",
            train_batch_size=1,
            gradient_accumulation_steps=1,
@@ -192,6 +196,8 @@ def train(*inputs):
            lr_scheduler="polynomial",
            lr_warmup_steps = 0,
            max_train_steps=Training_Steps,
+           gradient_checkpointing=gradient_checkpointing,
+           cache_latents=cache_latents,
        )
        print("Starting single training...")
        lock_file = open("intraining.lock", "w")
@@ -211,7 +217,7 @@ def train(*inputs):
            prior_loss_weight=1.0,
            instance_prompt="",
            seed=42,
-           resolution=512,
+           resolution=resolution,
            mixed_precision="fp16",
            train_batch_size=1,
            gradient_accumulation_steps=1,
@@ -220,7 +226,9 @@ def train(*inputs):
            lr_scheduler="polynomial",
            lr_warmup_steps = 0,
            max_train_steps=Training_Steps,
-           num_class_images=200,
+           num_class_images=200,
+           gradient_checkpointing=gradient_checkpointing,
+           cache_latents=cache_latents,
        )
        print("Starting multi-training...")
        lock_file = open("intraining.lock", "w")
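Taken together, the app.py side of the commit derives three training options from the selected base model before building either argparse.Namespace. The sketch below restates that selection logic as a standalone function so it can be sanity-checked in isolation; it is a minimal sketch, and both the training_options helper name and the full set of which_model identifiers (only "v1-5" and "v2-768" appear verbatim in the diff) are assumptions for illustration.

# Minimal sketch of the per-model option selection added in app.py.
def training_options(which_model: str) -> dict:
    return {
        # v2-768 checkpoints train at 768px, everything else at 512px
        "resolution": 768 if which_model == "v2-768" else 512,
        # VAE latents are pre-computed for all non-v1-5 models
        "cache_latents": which_model != "v1-5",
        # pinned off in this commit (the v2-only variant is left commented out)
        "gradient_checkpointing": False,
    }

print(training_options("v2-768"))
# {'resolution': 768, 'cache_latents': True, 'gradient_checkpointing': False}

Pinning gradient_checkpointing off keeps the speed benefit of not recomputing activations, while cache_latents recovers memory and time on the v2 models by skipping repeated VAE encodes.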
train_dreambooth.py CHANGED

@@ -235,6 +235,13 @@ def parse_args():
         help="Train only the unet",
     )
 
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Train only the unet",
+    )
+
     parser.add_argument(
         "--Session_dir",
         type=str,
@@ -382,6 +389,16 @@ class PromptDataset(Dataset):
         example["index"] = index
         return example
 
+class LatentsDataset(Dataset):
+    def __init__(self, latents_cache, text_encoder_cache):
+        self.latents_cache = latents_cache
+        self.text_encoder_cache = text_encoder_cache
+
+    def __len__(self):
+        return len(self.latents_cache)
+
+    def __getitem__(self, index):
+        return self.latents_cache[index], self.text_encoder_cache[index]
 
 def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
     if token is None:
@@ -631,6 +648,28 @@ def run_training(args_imported):
     if not args.train_text_encoder:
         text_encoder.to(accelerator.device, dtype=weight_dtype)
 
+
+    if args.cache_latents:
+        latents_cache = []
+        text_encoder_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                batch["pixel_values"] = batch["pixel_values"].to(accelerator.device, non_blocking=True, dtype=weight_dtype)
+                batch["input_ids"] = batch["input_ids"].to(accelerator.device, non_blocking=True)
+                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+                if args.train_text_encoder:
+                    text_encoder_cache.append(batch["input_ids"])
+                else:
+                    text_encoder_cache.append(text_encoder(batch["input_ids"])[0])
+        train_dataset = LatentsDataset(latents_cache, text_encoder_cache)
+        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x, shuffle=True)
+
+        del vae
+        if not args.train_text_encoder:
+            del text_encoder
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if overrode_max_train_steps:
@@ -669,8 +708,12 @@ def run_training(args_imported):
         for step, batch in enumerate(train_dataloader):
             with accelerator.accumulate(unet):
                 # Convert images to latent space
-                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-                latents = latents * 0.18215
+                with torch.no_grad():
+                    if args.cache_latents:
+                        latents = batch[0][0]
+                    else:
+                        latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                    latents = latents * 0.18215
 
                 # Sample noise that we'll add to the latents
                 noise = torch.randn_like(latents)
@@ -684,26 +727,40 @@ def run_training(args_imported):
                 noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
 
                 # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+                if(args.cache_latents):
+                    if args.train_text_encoder:
+                        encoder_hidden_states = text_encoder(batch[0][1])[0]
+                    else:
+                        encoder_hidden_states = batch[0][1]
+                else:
+                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]
 
                 # Predict the noise residual
-                noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
                 if args.with_prior_preservation:
-                    # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
-                    noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0)
-                    noise, noise_prior = torch.chunk(noise, 2, dim=0)
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
 
                     # Compute instance loss
-                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
 
                     # Compute prior loss
-                    prior_loss = F.mse_loss(noise_pred_prior.float(), noise_prior.float(), reduction="mean")
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
 
                     # Add the prior loss to the instance loss.
                     loss = loss + args.prior_loss_weight * prior_loss
                 else:
-                    loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="none").mean([1, 2, 3]).mean()
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean()
 
                 accelerator.backward(loss)
                 if accelerator.sync_gradients:
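The heart of the train_dreambooth.py change is the --cache_latents path: when the VAE (and, if it is not being trained, the text encoder) is frozen, its outputs for every training image can be computed once up front, and the per-step loop then trains the UNet against the cached tensors. Below is a torch-only sketch of that pattern; encode_image and encode_text are stand-in modules for the frozen vae.encode(...) and text_encoder(...) calls, not the real models.

import torch
from torch.utils.data import DataLoader, Dataset

class LatentsDataset(Dataset):
    """Mirrors the LatentsDataset added in the diff: pairs of cached tensors."""
    def __init__(self, latents_cache, text_encoder_cache):
        self.latents_cache = latents_cache
        self.text_encoder_cache = text_encoder_cache

    def __len__(self):
        return len(self.latents_cache)

    def __getitem__(self, index):
        return self.latents_cache[index], self.text_encoder_cache[index]

# Stand-ins for the frozen VAE / text encoder (illustrative, not the real models).
encode_image = torch.nn.Conv2d(3, 4, kernel_size=8, stride=8)
encode_text = torch.nn.Embedding(1000, 768)

raw_batches = [
    {"pixel_values": torch.randn(1, 3, 64, 64), "input_ids": torch.randint(0, 1000, (1, 8))}
    for _ in range(4)
]

# Cache once, outside the autograd graph, as in the diff's pre-training loop.
latents_cache, text_cache = [], []
with torch.no_grad():
    for batch in raw_batches:
        latents_cache.append(encode_image(batch["pixel_values"]))
        text_cache.append(encode_text(batch["input_ids"]))

loader = DataLoader(LatentsDataset(latents_cache, text_cache),
                    batch_size=1, collate_fn=lambda x: x, shuffle=True)

for batch in loader:
    latents, encoder_hidden_states = batch[0]
    print(latents.shape, encoder_hidden_states.shape)

The collate_fn=lambda x: x keeps each DataLoader batch as a plain one-element list of (latents, embeddings) tuples, which is exactly why the commit's training loop indexes batch[0][0] and batch[0][1] when args.cache_latents is set.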