Commit e9b3585
Parent(s): 04d5d6b

change to float16

Files changed:
- app.py (+5 -2)
- leffa/model.py (+23 -11)
- leffa/pipeline.py (+0 -1)
app.py
CHANGED

@@ -40,18 +40,21 @@ class LeffaPredictor(object):
         vt_model_hd = LeffaModel(
             pretrained_model_name_or_path="./ckpts/stable-diffusion-inpainting",
             pretrained_model="./ckpts/virtual_tryon.pth",
+            dtype="float16",
         )
         self.vt_inference_hd = LeffaInference(model=vt_model_hd)
 
         vt_model_dc = LeffaModel(
             pretrained_model_name_or_path="./ckpts/stable-diffusion-inpainting",
             pretrained_model="./ckpts/virtual_tryon_dc.pth",
+            dtype="float16",
         )
         self.vt_inference_dc = LeffaInference(model=vt_model_dc)
 
         pt_model = LeffaModel(
             pretrained_model_name_or_path="./ckpts/stable-diffusion-xl-1.0-inpainting-0.1",
             pretrained_model="./ckpts/pose_transfer.pth",
+            dtype="float16",
         )
         self.pt_inference = LeffaInference(model=pt_model)
 
@@ -248,7 +251,7 @@ if __name__ == "__main__":
                     )
 
                     vt_step = gr.Number(
-                        label="Inference Steps", minimum=30, maximum=100, step=1, value=
+                        label="Inference Steps", minimum=30, maximum=100, step=1, value=30)
 
                     vt_scale = gr.Number(
                         label="Guidance Scale", minimum=0.1, maximum=5.0, step=0.1, value=2.5)
 
@@ -325,7 +328,7 @@ if __name__ == "__main__":
                     )
 
                     pt_step = gr.Number(
-                        label="Inference Steps", minimum=30, maximum=100, step=1, value=
+                        label="Inference Steps", minimum=30, maximum=100, step=1, value=30)
 
                     pt_scale = gr.Number(
                         label="Guidance Scale", minimum=0.1, maximum=5.0, step=0.1, value=2.5)
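
For reference, a minimal sketch of how the updated constructor calls read after this change. It uses the classes and checkpoint paths shown in the diff above; the LeffaInference import location is an assumption, not confirmed by this commit.

# Sketch only: mirrors the app.py initialization above.
from leffa.model import LeffaModel
from leffa.inference import LeffaInference  # import path assumed

vt_model_hd = LeffaModel(
    pretrained_model_name_or_path="./ckpts/stable-diffusion-inpainting",
    pretrained_model="./ckpts/virtual_tryon.pth",
    dtype="float16",  # new argument introduced by this commit
)
vt_inference_hd = LeffaInference(model=vt_model_hd)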
leffa/model.py
CHANGED

@@ -23,6 +23,7 @@ class LeffaModel(nn.Module):
         new_in_channels: int = 12,  # noisy_image: 4, mask: 1, masked_image: 4, densepose: 3
         height: int = 1024,
         width: int = 768,
+        dtype: str = "float16",
     ):
         super().__init__()
 
@@ -35,6 +36,9 @@ class LeffaModel(nn.Module):
             new_in_channels,
         )
 
+        if dtype == "float16":
+            self.half()
+
     def build_models(
         self,
         pretrained_model_name_or_path: str = "",
@@ -60,14 +64,16 @@ class LeffaModel(nn.Module):
             return_unused_kwargs=True,
         )
         self.vae = AutoencoderKL.from_config(vae_config, **vae_kwargs)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae_scale_factor = 2 ** (
+            len(self.vae.config.block_out_channels) - 1)
         # Reference UNet
         unet_config, unet_kwargs = ReferenceUNet.load_config(
             pretrained_model_name_or_path,
             subfolder="unet",
             return_unused_kwargs=True,
         )
-        self.unet_encoder = ReferenceUNet.from_config(unet_config, **unet_kwargs)
+        self.unet_encoder = ReferenceUNet.from_config(
+            unet_config, **unet_kwargs)
         self.unet_encoder.config.addition_embed_type = None
         # Generative UNet
         unet_config, unet_kwargs = GenerativeUNet.load_config(
 
@@ -80,7 +86,8 @@ class LeffaModel(nn.Module):
         # Change Generative UNet conv_in and conv_out
         unet_conv_in_channel_changed = self.unet.config.in_channels != new_in_channels
         if unet_conv_in_channel_changed:
-            self.unet.conv_in = self.replace_conv_in_layer(self.unet, new_in_channels)
+            self.unet.conv_in = self.replace_conv_in_layer(
+                self.unet, new_in_channels)
             self.unet.config.in_channels = new_in_channels
         unet_conv_out_channel_changed = (
             self.unet.config.out_channels != self.vae.config.latent_channels
 
@@ -114,8 +121,10 @@ class LeffaModel(nn.Module):
 
         # Load pretrained model
         if pretrained_model != "" and pretrained_model is not None:
-            self.load_state_dict(torch.load(pretrained_model, map_location="cpu"))
-            logger.info("Load pretrained model from {}".format(pretrained_model))
+            self.load_state_dict(torch.load(
+                pretrained_model, map_location="cpu"))
+            logger.info(
+                "Load pretrained model from {}".format(pretrained_model))
 
     def replace_conv_in_layer(self, unet_model, new_in_channels):
         original_conv_in = unet_model.conv_in
 
@@ -168,7 +177,8 @@ class LeffaModel(nn.Module):
         return new_conv_out
 
     def vae_encode(self, pixel_values):
-        pixel_values = pixel_values.to(device=self.vae.device, dtype=self.vae.dtype)
+        pixel_values = pixel_values.to(
+            device=self.vae.device, dtype=self.vae.dtype)
         with torch.no_grad():
             latent = self.vae.encode(pixel_values).latent_dist.sample()
             latent = latent * self.vae.config.scaling_factor
 
@@ -208,7 +218,8 @@ def remove_cross_attention(
             hidden_size = unet.config.block_out_channels[-1]
         elif name.startswith("up_blocks"):
             block_id = int(name[len("up_blocks.")])
-            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+            hidden_size = list(reversed(unet.config.block_out_channels))[
+                block_id]
         elif name.startswith("down_blocks"):
             block_id = int(name[len("down_blocks.")])
             hidden_size = unet.config.block_out_channels[block_id]
 
@@ -239,7 +250,6 @@ def remove_cross_attention(
     return adapter_modules
 
 
-
 class AttnProcessor2_0(torch.nn.Module):
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
 
@@ -315,10 +325,12 @@ class AttnProcessor2_0(torch.nn.Module):
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
 
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        query = query.view(batch_size, -1, attn.heads,
                            head_dim).transpose(1, 2)
 
         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads,
                            head_dim).transpose(1, 2)
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
 
@@ -346,4 +358,4 @@ class AttnProcessor2_0(torch.nn.Module):
 
         hidden_states = hidden_states / attn.rescale_output_factor
 
-        return hidden_states
+        return hidden_states
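
The substance of the model.py change is the new dtype argument: when it is "float16" the whole module is cast with self.half(), and vae_encode now moves incoming pixels onto the VAE's device and dtype before encoding. A minimal, self-contained sketch of that pattern follows; it uses a toy module, not the actual LeffaModel.

# Illustration of the dtype handling introduced in this commit (toy module).
import torch
import torch.nn as nn


class TinyHalfPrecisionModel(nn.Module):
    def __init__(self, dtype: str = "float16"):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(4))
        if dtype == "float16":
            # Cast all parameters and buffers to half precision,
            # mirroring the `self.half()` call added to LeffaModel.__init__.
            self.half()

    def encode(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # Match the input to the parameters' device/dtype, as vae_encode
        # now does via `pixel_values.to(device=..., dtype=...)`.
        pixel_values = pixel_values.to(
            device=self.scale.device, dtype=self.scale.dtype)
        with torch.no_grad():
            return pixel_values * self.scale


model = TinyHalfPrecisionModel(dtype="float16")
out = model.encode(torch.randn(2, 4))  # float32 input is cast to float16
print(out.dtype)                       # torch.float16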
leffa/pipeline.py
CHANGED

@@ -106,7 +106,6 @@ class LeffaPipeline(object):
         )
         reference_features = list(reference_features)
 
-
         with tqdm.tqdm(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 # expand the latent if we are doing classifier free guidance