Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- README.md +7 -6
- app.py +56 -0
- inference.py +113 -0
- model.py +924 -0
- utils.py +43 -0
README.md
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 4.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Text To Image EfficientCLIP GAN
|
| 3 |
+
emoji: 📸
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.22.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
+
short_description: Create images from text utilizing the EfficientCLIP-GAN
|
| 12 |
---
|
| 13 |
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import requests
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
from utils import read_css_from_file
|
| 8 |
+
from inference import generate_image_from_text, generate_image_from_text_with_persistent_storage
|
| 9 |
+
|
| 10 |
+
# Read CSS from file
|
| 11 |
+
css = read_css_from_file("style.css")
|
| 12 |
+
|
| 13 |
+
DESCRIPTION = '''
|
| 14 |
+
<div id="content_align">
|
| 15 |
+
<span style="color:darkred;font-size:32px;font-weight:bold">
|
| 16 |
+
WordCraft : Visuals from Verbs
|
| 17 |
+
</span>
|
| 18 |
+
</div>
|
| 19 |
+
<div id="content_align">
|
| 20 |
+
<span style="color:blue;font-size:18px;font-weight:bold;">
|
| 21 |
+
<br>A small, lighting fast efficient AI image generator
|
| 22 |
+
</span>
|
| 23 |
+
</div>
|
| 24 |
+
<div id="content_align" style="margin-top: 10px;font-weight:bold;">
|
| 25 |
+
<br>This 💻 demo uses the EfficientCLIP-GAN model trained on CUB 🐦🐥 and CC12M 📸🌃🌉 dataset.
|
| 26 |
+
<br>Keep your prompt coherent to domain of the selected model.
|
| 27 |
+
<br>If you like the demo, don't forget to click on the like 💖 button.
|
| 28 |
+
</div>
|
| 29 |
+
'''
|
| 30 |
+
available_models = [
|
| 31 |
+
("EfficientCLIP-GAN trained on CUB dataset (Restricted to birds)", "CUB"),
|
| 32 |
+
("EfficientCLIP-GAN trained on CC12M dataset (More flexible)", "CC12M")
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
# Creating Gradio interface
|
| 36 |
+
with gr.Blocks(css=css) as app:
|
| 37 |
+
gr.Markdown(DESCRIPTION)
|
| 38 |
+
with gr.Row():
|
| 39 |
+
with gr.Column():
|
| 40 |
+
text_prompt = gr.Textbox(label="Input Prompt", value="this tiny bird has a very small bill, a belly covered with white delicate feathers and has a set of black rounded eyes.", lines=3)
|
| 41 |
+
model_selector = gr.Dropdown(choices=available_models, value="CUB", label="Select Model", info="Select a model with which you want to generate images")
|
| 42 |
+
generate_button = gr.Button("Generate Images", variant='primary')
|
| 43 |
+
|
| 44 |
+
with gr.Row():
|
| 45 |
+
with gr.Column():
|
| 46 |
+
image_output1 = gr.Image(label="Generated Image 1")
|
| 47 |
+
image_output2 = gr.Image(label="Generated Image 2")
|
| 48 |
+
|
| 49 |
+
with gr.Column():
|
| 50 |
+
image_output3 = gr.Image(label="Generated Image 3")
|
| 51 |
+
image_output4 = gr.Image(label="Generated Image 4")
|
| 52 |
+
|
| 53 |
+
generate_button.click(generate_image_from_text, inputs=[text_prompt, model_selector], outputs=[image_output1, image_output2, image_output3, image_output4])
|
| 54 |
+
|
| 55 |
+
# Launch the app
|
| 56 |
+
app.launch()
|
inference.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import io
|
| 4 |
+
import torch
|
| 5 |
+
import torchvision
|
| 6 |
+
import clip
|
| 7 |
+
import numpy as np
|
| 8 |
+
from huggingface_hub import hf_hub_download
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from torchvision.transforms.functional import to_pil_image
|
| 11 |
+
|
| 12 |
+
from utils import load_model_weights
|
| 13 |
+
from model import NetG, CLIP_TXT_ENCODER
|
| 14 |
+
|
| 15 |
+
# checking the device
|
| 16 |
+
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
|
| 17 |
+
|
| 18 |
+
# Getting the HF token
|
| 19 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 20 |
+
|
| 21 |
+
# repository of the model
|
| 22 |
+
repo_id = "VinayHajare/EfficientCLIP-GAN"
|
| 23 |
+
cub_model = "saved_models/state_epoch_1480.pth"
|
| 24 |
+
cc12m_model = "saved_models/EfficientCLIP-GAN-CC12M.pth"
|
| 25 |
+
|
| 26 |
+
# clip model wrapped with the custom encoder
|
| 27 |
+
clip_text = "ViT-B/32"
|
| 28 |
+
clip_model, preprocessor = clip.load(clip_text, device=device)
|
| 29 |
+
clip_model = clip_model.eval()
|
| 30 |
+
text_encoder = CLIP_TXT_ENCODER(clip_model).to(device)
|
| 31 |
+
|
| 32 |
+
# loading the models from the repository and extracting the generator model
|
| 33 |
+
cub_model_path = hf_hub_download(repo_id = repo_id, filename = cub_model, token = HF_TOKEN)
|
| 34 |
+
checkpoint_cub = torch.load(cub_model_path, map_location=torch.device(device))
|
| 35 |
+
cc12m_model_path = hf_hub_download(repo_id = repo_id, filename = cc12m_model, token = HF_TOKEN)
|
| 36 |
+
checkpoint_cc12m = torch.load(cc12m_model_path, map_location=torch.device(device))
|
| 37 |
+
|
| 38 |
+
# Create a new Generator model and initialize it with the pre-trained weights
|
| 39 |
+
netG = NetG(64, 100, 512, 256, 3, False, clip_model).to(device)
|
| 40 |
+
#cub = load_model_weights(netG, checkpoint_cub['model']['netG'], multi_gpus=False)
|
| 41 |
+
#cc12m = load_model_weights(netG, checkpoint_cc12m['model']['netG'], multi_gpus=False)
|
| 42 |
+
|
| 43 |
+
# Function to generate images from text
|
| 44 |
+
def generate_image_from_text(caption, model, batch_size=4):
|
| 45 |
+
if model == "CUB":
|
| 46 |
+
generator = load_model_weights(netG, checkpoint_cub['model']['netG'], multi_gpus=False)
|
| 47 |
+
else:
|
| 48 |
+
generator = load_model_weights(netG, checkpoint_cc12m['model']['netG'], multi_gpus=False)
|
| 49 |
+
|
| 50 |
+
# Create the noise tensor
|
| 51 |
+
noise = torch.randn((batch_size, 100)).to(device)
|
| 52 |
+
with torch.no_grad():
|
| 53 |
+
# Tokenize caption
|
| 54 |
+
tokenized_text = clip.tokenize([caption]).to(device)
|
| 55 |
+
# Extract the sentence and word embedding from Custom CLIP ENCODER
|
| 56 |
+
sent_emb, word_emb = text_encoder(tokenized_text)
|
| 57 |
+
# Repeat the sentence embedding to match the batch size
|
| 58 |
+
sent_emb = sent_emb.repeat(batch_size, 1)
|
| 59 |
+
# generate the images
|
| 60 |
+
generated_images = generator(noise, sent_emb, eval=True).float()
|
| 61 |
+
|
| 62 |
+
# Convert the tensor images to PIL format
|
| 63 |
+
pil_images = []
|
| 64 |
+
for image_tensor in generated_images.unbind(0):
|
| 65 |
+
# Rescale tensor values to [0, 1]
|
| 66 |
+
image_tensor = image_tensor.data.clamp(-1, 1)
|
| 67 |
+
image_tensor = (image_tensor + 1.0) / 2.0
|
| 68 |
+
|
| 69 |
+
# Convert tensor to numpy array
|
| 70 |
+
image_numpy = image_tensor.permute(1, 2, 0).cpu().numpy()
|
| 71 |
+
|
| 72 |
+
# Clip numpy array values to [0, 1]
|
| 73 |
+
image_numpy = np.clip(image_numpy, 0, 1)
|
| 74 |
+
|
| 75 |
+
# Create a PIL image from the numpy array
|
| 76 |
+
pil_image = Image.fromarray((image_numpy * 255).astype(np.uint8))
|
| 77 |
+
|
| 78 |
+
pil_images.append(pil_image)
|
| 79 |
+
|
| 80 |
+
return pil_images
|
| 81 |
+
|
| 82 |
+
# Function to generate images from text
|
| 83 |
+
def generate_image_from_text_with_persistent_storage(caption, model, batch_size=4):
|
| 84 |
+
if model == "CUB":
|
| 85 |
+
generator = load_model_weights(netG, checkpoint_cub['model']['netG'], multi_gpus=False)
|
| 86 |
+
else:
|
| 87 |
+
generator = load_model_weights(netG, checkpoint_cc12m['model']['netG'], multi_gpus=False)
|
| 88 |
+
|
| 89 |
+
# Create the noise tensor
|
| 90 |
+
noise = torch.randn((batch_size, 100)).to(device)
|
| 91 |
+
with torch.no_grad():
|
| 92 |
+
# Tokenize caption
|
| 93 |
+
tokenized_text = clip.tokenize([caption]).to(device)
|
| 94 |
+
# Extract the sentence and word embedding from Custom CLIP ENCODER
|
| 95 |
+
sent_emb, word_emb = text_encoder(tokenized_text)
|
| 96 |
+
# Repeat the sentence embedding to match the batch size
|
| 97 |
+
sent_emb = sent_emb.repeat(batch_size, 1)
|
| 98 |
+
# generate the images
|
| 99 |
+
generated_images = generator(noise, sent_emb, eval=True).float()
|
| 100 |
+
|
| 101 |
+
# Create a permanent directory if it doesn't exist
|
| 102 |
+
permanent_dir = "generated_images"
|
| 103 |
+
if not os.path.exists(permanent_dir):
|
| 104 |
+
os.makedirs(permanent_dir)
|
| 105 |
+
|
| 106 |
+
image_paths = []
|
| 107 |
+
for idx, image_tensor in enumerate(generated_images.unbind(0)):
|
| 108 |
+
# Save the image tensor to a permanent file
|
| 109 |
+
image_path = os.path.join(permanent_dir, f"image_{idx}.png")
|
| 110 |
+
torchvision.utils.save_image(image_tensor.data, image_path, value_range=(-1, 1), normalize=True)
|
| 111 |
+
image_paths.append(image_path)
|
| 112 |
+
|
| 113 |
+
return image_paths
|
model.py
ADDED
|
@@ -0,0 +1,924 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from collections import OrderedDict
|
| 6 |
+
from utils import dummy_context_mgr
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CLIP_IMG_ENCODER(nn.Module):
|
| 10 |
+
"""
|
| 11 |
+
CLIP_IMG_ENCODER module for encoding images using CLIP's visual transformer.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, CLIP):
|
| 15 |
+
"""
|
| 16 |
+
Initialize the CLIP_IMG_ENCODER module.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
CLIP (CLIP): Pre-trained CLIP model.
|
| 20 |
+
"""
|
| 21 |
+
super(CLIP_IMG_ENCODER, self).__init__()
|
| 22 |
+
model = CLIP.visual
|
| 23 |
+
self.define_module(model)
|
| 24 |
+
# freeze the parameters of the CLIP model
|
| 25 |
+
for param in self.parameters():
|
| 26 |
+
param.requires_grad = False
|
| 27 |
+
|
| 28 |
+
def define_module(self, model):
|
| 29 |
+
"""
|
| 30 |
+
Define the individual layers and modules of the CLIP visual transformer model.
|
| 31 |
+
Args:
|
| 32 |
+
model (nn.Module): CLIP visual transformer model.
|
| 33 |
+
"""
|
| 34 |
+
# Extract required modules from the CLIP model
|
| 35 |
+
self.conv1 = model.conv1 # Convolutional layer
|
| 36 |
+
self.class_embedding = model.class_embedding # Class embedding layer
|
| 37 |
+
self.positional_embedding = model.positional_embedding # Positional embedding layer
|
| 38 |
+
self.ln_pre = model.ln_pre # Linear Normalization layer for pre-normalization
|
| 39 |
+
self.transformer = model.transformer # Transformer block
|
| 40 |
+
self.ln_post = model.ln_post # Linear Normalization layer for post-normalization
|
| 41 |
+
self.proj = model.proj # projection matrix
|
| 42 |
+
|
| 43 |
+
@property
|
| 44 |
+
def dtype(self):
|
| 45 |
+
"""
|
| 46 |
+
Get the data type of the convolutional layer weights.
|
| 47 |
+
"""
|
| 48 |
+
return self.conv1.weight.dtype
|
| 49 |
+
|
| 50 |
+
def transf_to_CLIP_input(self, inputs):
|
| 51 |
+
"""
|
| 52 |
+
Transform input images to the format expected by CLIP.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
inputs (torch.Tensor): Input images.
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
torch.Tensor: Transformed images.
|
| 59 |
+
"""
|
| 60 |
+
device = inputs.device
|
| 61 |
+
# Check the size of the input image tensor
|
| 62 |
+
if len(inputs.size()) != 4:
|
| 63 |
+
raise ValueError('Expect the (B, C, X, Y) tensor.')
|
| 64 |
+
else:
|
| 65 |
+
# Normalize input images
|
| 66 |
+
mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).unsqueeze(-1).unsqueeze(-1).unsqueeze(0).to(device)
|
| 67 |
+
var = torch.tensor([0.26862954, 0.26130258, 0.27577711]).unsqueeze(-1).unsqueeze(-1).unsqueeze(0).to(device)
|
| 68 |
+
inputs = F.interpolate(inputs * 0.5 + 0.5, size=(224, 224))
|
| 69 |
+
inputs = ((inputs + 1) * 0.5 - mean) / var
|
| 70 |
+
return inputs
|
| 71 |
+
|
| 72 |
+
def forward(self, img: torch.Tensor):
|
| 73 |
+
"""
|
| 74 |
+
Forward pass of the CLIP_IMG_ENCODER module.
|
| 75 |
+
|
| 76 |
+
Args:
|
| 77 |
+
img (torch.Tensor): Input images.
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
torch.Tensor: Local features extracted from the image.
|
| 81 |
+
torch.Tensor: Encoded image embeddings.
|
| 82 |
+
"""
|
| 83 |
+
# Transform input images to the format expected by CLIP and set its datatype appropriately
|
| 84 |
+
x = self.transf_to_CLIP_input(img)
|
| 85 |
+
x = x.type(self.dtype)
|
| 86 |
+
|
| 87 |
+
# Pass the image through Convolutional layer
|
| 88 |
+
x = self.conv1(x) # shape = [*, width, grid, grid]
|
| 89 |
+
grid = x.size(-1)
|
| 90 |
+
|
| 91 |
+
# Reshape and permute the tensor for transformer input
|
| 92 |
+
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
| 93 |
+
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
| 94 |
+
|
| 95 |
+
# Add class and positional embeddings
|
| 96 |
+
x = torch.cat(
|
| 97 |
+
[self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
|
| 98 |
+
x], dim=1) # shape = [*, grid ** 2 + 1, width]
|
| 99 |
+
x = x + self.positional_embedding.to(x.dtype)
|
| 100 |
+
x = self.ln_pre(x)
|
| 101 |
+
|
| 102 |
+
# NLD (Batch Size - Length - Dimension) -> LND (Length - Batch Size - Dimension)
|
| 103 |
+
x = x.permute(1, 0, 2)
|
| 104 |
+
|
| 105 |
+
# Extract local features using transformer blocks
|
| 106 |
+
selected = [1, 4, 8]
|
| 107 |
+
local_features = []
|
| 108 |
+
for i in range(12):
|
| 109 |
+
x = self.transformer.resblocks[i](x)
|
| 110 |
+
if i in selected:
|
| 111 |
+
local_features.append(
|
| 112 |
+
x.permute(1, 0, 2)[:, 1:, :].permute(0, 2, 1).reshape(-1, 768, grid, grid).contiguous().type(
|
| 113 |
+
img.dtype))
|
| 114 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
| 115 |
+
x = self.ln_post(x[:, 0, :])
|
| 116 |
+
if self.proj is not None:
|
| 117 |
+
x = x @ self.proj # Perform matrix multiplication with projection matrix and tensor
|
| 118 |
+
return torch.stack(local_features, dim=1), x.type(img.dtype)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class CLIP_TXT_ENCODER(nn.Module):
|
| 122 |
+
"""
|
| 123 |
+
CLIP_TXT_ENCODER module for encoding text inputs using CLIP's transformer.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
def __init__(self, CLIP):
|
| 127 |
+
"""
|
| 128 |
+
Initialize the CLIP_TXT_ENCODER module.
|
| 129 |
+
|
| 130 |
+
Args:
|
| 131 |
+
CLIP (CLIP): Pre-trained CLIP model.
|
| 132 |
+
"""
|
| 133 |
+
super(CLIP_TXT_ENCODER, self).__init__()
|
| 134 |
+
self.define_module(CLIP)
|
| 135 |
+
# Freeze the parameters of the CLIP model
|
| 136 |
+
for param in self.parameters():
|
| 137 |
+
param.requires_grad = False
|
| 138 |
+
|
| 139 |
+
def define_module(self, CLIP):
|
| 140 |
+
"""
|
| 141 |
+
Define the individual modules of the CLIP transformer model.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
CLIP (CLIP): Pre-trained CLIP model.
|
| 145 |
+
"""
|
| 146 |
+
self.transformer = CLIP.transformer # Transformer block
|
| 147 |
+
self.vocab_size = CLIP.vocab_size # Size of the vocabulary of the transformer
|
| 148 |
+
self.token_embedding = CLIP.token_embedding # token embedding block
|
| 149 |
+
self.positional_embedding = CLIP.positional_embedding # positional embedding block
|
| 150 |
+
self.ln_final = CLIP.ln_final # Linear Normalization layer
|
| 151 |
+
self.text_projection = CLIP.text_projection # Projection matrix for text
|
| 152 |
+
|
| 153 |
+
@property
|
| 154 |
+
def dtype(self):
|
| 155 |
+
"""
|
| 156 |
+
Get the data type of the first layer's weights in the transformer.
|
| 157 |
+
"""
|
| 158 |
+
return self.transformer.resblocks[0].mlp.c_fc.weight.dtype
|
| 159 |
+
|
| 160 |
+
def forward(self, text):
|
| 161 |
+
"""
|
| 162 |
+
Forward pass of the CLIP_TXT_ENCODER module.
|
| 163 |
+
|
| 164 |
+
Args:
|
| 165 |
+
text (torch.Tensor): Input text tokens.
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
torch.Tensor: Encoded sentence embeddings.
|
| 169 |
+
torch.Tensor: Transformer output for the input text.
|
| 170 |
+
"""
|
| 171 |
+
# Embed input text tokens
|
| 172 |
+
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
|
| 173 |
+
# Add positional embeddings
|
| 174 |
+
x = x + self.positional_embedding.type(self.dtype)
|
| 175 |
+
# Permute dimensions for transformer input
|
| 176 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
| 177 |
+
# Pass input through the transformer
|
| 178 |
+
x = self.transformer(x)
|
| 179 |
+
# Permute dimensions back to original shape
|
| 180 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
| 181 |
+
# Apply layer normalization
|
| 182 |
+
x = self.ln_final(x).type(self.dtype) # shape = [batch_size, n_ctx, transformer.width]
|
| 183 |
+
# Extract sentence embeddings from the end-of-text (eot_token : is the highest number in each sequence)
|
| 184 |
+
sent_emb = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
|
| 185 |
+
|
| 186 |
+
# Return the sentence embedding and transformer ouput
|
| 187 |
+
return sent_emb, x
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class CLIP_Mapper(nn.Module):
|
| 191 |
+
"""
|
| 192 |
+
CLIP_Mapper module for mapping images with prompts using CLIP's transformer.
|
| 193 |
+
"""
|
| 194 |
+
|
| 195 |
+
def __init__(self, CLIP):
|
| 196 |
+
"""
|
| 197 |
+
Initialize the CLIP_Mapper module.
|
| 198 |
+
|
| 199 |
+
Args:
|
| 200 |
+
CLIP (CLIP): Pre-trained CLIP model.
|
| 201 |
+
"""
|
| 202 |
+
super(CLIP_Mapper, self).__init__()
|
| 203 |
+
model = CLIP.visual
|
| 204 |
+
self.define_module(model)
|
| 205 |
+
# Freeze the parameters of the CLIP visual model
|
| 206 |
+
for param in model.parameters():
|
| 207 |
+
param.requires_grad = False
|
| 208 |
+
|
| 209 |
+
def define_module(self, model):
|
| 210 |
+
"""
|
| 211 |
+
Define the individual modules of the CLIP visual model.
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
model: Pre-trained CLIP visual model.
|
| 215 |
+
"""
|
| 216 |
+
self.conv1 = model.conv1
|
| 217 |
+
self.class_embedding = model.class_embedding
|
| 218 |
+
self.positional_embedding = model.positional_embedding
|
| 219 |
+
self.ln_pre = model.ln_pre
|
| 220 |
+
self.transformer = model.transformer
|
| 221 |
+
|
| 222 |
+
@property
|
| 223 |
+
def dtype(self):
|
| 224 |
+
"""
|
| 225 |
+
Get the data type of the weights of the first convolutional layer.
|
| 226 |
+
"""
|
| 227 |
+
return self.conv1.weight.dtype
|
| 228 |
+
|
| 229 |
+
def forward(self, img: torch.Tensor, prompts: torch.Tensor):
|
| 230 |
+
"""
|
| 231 |
+
Forward pass of the CLIP_Mapper module.
|
| 232 |
+
|
| 233 |
+
Args:
|
| 234 |
+
img (torch.Tensor): Input image tensor.
|
| 235 |
+
prompts (torch.Tensor): Prompt tokens for mapping.
|
| 236 |
+
|
| 237 |
+
Returns:
|
| 238 |
+
torch.Tensor: Mapped features from the CLIP model.
|
| 239 |
+
"""
|
| 240 |
+
|
| 241 |
+
# Convert input image and prompts to the appropriate data type
|
| 242 |
+
x = img.type(self.dtype)
|
| 243 |
+
prompts = prompts.type(self.dtype)
|
| 244 |
+
grid = x.size(-1)
|
| 245 |
+
|
| 246 |
+
# Reshape the input image tensor
|
| 247 |
+
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
|
| 248 |
+
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
|
| 249 |
+
|
| 250 |
+
# Append the class embeddings to input tensors
|
| 251 |
+
x = torch.cat(
|
| 252 |
+
[self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
|
| 253 |
+
x],
|
| 254 |
+
dim=1
|
| 255 |
+
) # shape = [*, grid ** 2 + 1, width]
|
| 256 |
+
|
| 257 |
+
# Append the positional embeddings to the input tensor
|
| 258 |
+
x = x + self.positional_embedding.to(x.dtype)
|
| 259 |
+
|
| 260 |
+
# Perform the layer normalization
|
| 261 |
+
x = self.ln_pre(x)
|
| 262 |
+
# NLD -> LND
|
| 263 |
+
x = x.permute(1, 0, 2)
|
| 264 |
+
# Local features
|
| 265 |
+
selected = [1, 2, 3, 4, 5, 6, 7, 8]
|
| 266 |
+
begin, end = 0, 12
|
| 267 |
+
prompt_idx = 0
|
| 268 |
+
for i in range(begin, end):
|
| 269 |
+
# Add prompt to the input tensor
|
| 270 |
+
if i in selected:
|
| 271 |
+
prompt = prompts[:, prompt_idx, :].unsqueeze(0)
|
| 272 |
+
prompt_idx = prompt_idx + 1
|
| 273 |
+
x = torch.cat((x, prompt), dim=0)
|
| 274 |
+
x = self.transformer.resblocks[i](x)
|
| 275 |
+
x = x[:-1, :, :]
|
| 276 |
+
else:
|
| 277 |
+
x = self.transformer.resblocks[i](x)
|
| 278 |
+
# Reshape and return mapped features
|
| 279 |
+
return x.permute(1, 0, 2)[:, 1:, :].permute(0, 2, 1).reshape(-1, 768, grid, grid).contiguous().type(img.dtype)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
class CLIP_Adapter(nn.Module):
|
| 283 |
+
"""
|
| 284 |
+
CLIP_Adapter module for adapting features from a generator to match the CLIP model's input requirements.
|
| 285 |
+
"""
|
| 286 |
+
|
| 287 |
+
def __init__(self, in_ch, mid_ch, out_ch, G_ch, CLIP_ch, cond_dim, k, s, p, map_num, CLIP):
|
| 288 |
+
"""
|
| 289 |
+
Initialize the CLIP_Adapter module.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
in_ch (int): Number of input channels.
|
| 293 |
+
mid_ch (int): Number of channels in the intermediate layers.
|
| 294 |
+
out_ch (int): Number of output channels.
|
| 295 |
+
G_ch (int): Number of channels in the generator's output.
|
| 296 |
+
CLIP_ch (int): Number of channels in the CLIP model's input.
|
| 297 |
+
cond_dim (int): Dimension of the conditioning vector.
|
| 298 |
+
k (int): Kernel size for convolutional layers.
|
| 299 |
+
s (int): Stride for convolutional layers.
|
| 300 |
+
p (int): Padding for convolutional layers.
|
| 301 |
+
map_num (int): Number of mapping blocks.
|
| 302 |
+
CLIP: Pre-trained CLIP model.
|
| 303 |
+
"""
|
| 304 |
+
super(CLIP_Adapter, self).__init__()
|
| 305 |
+
self.CLIP_ch = CLIP_ch
|
| 306 |
+
self.FBlocks = nn.ModuleList([])
|
| 307 |
+
# Define Mapping blocks (M_Block) and them to Feature blocks (FBlock) for given number of mapping blocks.
|
| 308 |
+
self.FBlocks.append(M_Block(in_ch, mid_ch, out_ch, cond_dim, k, s, p))
|
| 309 |
+
for i in range(map_num - 1):
|
| 310 |
+
self.FBlocks.append(M_Block(out_ch, mid_ch, out_ch, cond_dim, k, s, p))
|
| 311 |
+
# Convolutional layer to fuse adapted features
|
| 312 |
+
self.conv_fuse = nn.Conv2d(out_ch, CLIP_ch, 5, 1, 2)
|
| 313 |
+
# CLIP Mapper module to map adapted features to CLIP's input space
|
| 314 |
+
self.CLIP_ViT = CLIP_Mapper(CLIP)
|
| 315 |
+
# Convolutional layer to further process mapped features
|
| 316 |
+
self.conv = nn.Conv2d(768, G_ch, 5, 1, 2)
|
| 317 |
+
# Fully connected layer for conditioning
|
| 318 |
+
self.fc_prompt = nn.Linear(cond_dim, CLIP_ch * 8)
|
| 319 |
+
|
| 320 |
+
def forward(self, out, c):
|
| 321 |
+
"""
|
| 322 |
+
Forward pass of the CLIP_Adapter module. Takes output features from the generator and conditioning vector
|
| 323 |
+
as input, adapts features using the Feature block having multiple mapping blocks, fuses them, map them to
|
| 324 |
+
CLIPs input space and returns the processed features
|
| 325 |
+
|
| 326 |
+
Args:
|
| 327 |
+
out (torch.Tensor): Output features from the generator.
|
| 328 |
+
c (torch.Tensor): Conditioning vector.
|
| 329 |
+
|
| 330 |
+
Returns:
|
| 331 |
+
torch.Tensor: Adapted and mapped features for the generator.
|
| 332 |
+
"""
|
| 333 |
+
|
| 334 |
+
# Generate prompts from the conditioning vector
|
| 335 |
+
prompts = self.fc_prompt(c).view(c.size(0), -1, self.CLIP_ch)
|
| 336 |
+
|
| 337 |
+
# Pass features through feature block consisting of multiple mapping blocks
|
| 338 |
+
for FBlock in self.FBlocks:
|
| 339 |
+
out = FBlock(out, c)
|
| 340 |
+
# Fuse adapted features
|
| 341 |
+
fuse_feat = self.conv_fuse(out)
|
| 342 |
+
# Map fused features to CLIP's input space
|
| 343 |
+
map_feat = self.CLIP_ViT(fuse_feat, prompts)
|
| 344 |
+
# Further process mapped features and return
|
| 345 |
+
return self.conv(fuse_feat + 0.1 * map_feat)
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
class NetG(nn.Module):
|
| 349 |
+
"""
|
| 350 |
+
Generator network for synthesizing images conditioned on text and noise
|
| 351 |
+
"""
|
| 352 |
+
|
| 353 |
+
def __init__(self, ngf, nz, cond_dim, imsize, ch_size, mixed_precision, CLIP):
|
| 354 |
+
"""
|
| 355 |
+
Initializes the Generator network.
|
| 356 |
+
|
| 357 |
+
Parameters:
|
| 358 |
+
ngf (int): Number of generator filters.
|
| 359 |
+
nz (int): Dimensionality of the input noise vector.
|
| 360 |
+
cond_dim (int): Dimensionality of the conditioning vector.
|
| 361 |
+
imsize (int): Size of the generated images.
|
| 362 |
+
ch_size (int): Number of output channels for the generated images.
|
| 363 |
+
mixed_precision (bool): Whether to use mixed precision training.
|
| 364 |
+
CLIP: CLIP model for feature adaptation.
|
| 365 |
+
|
| 366 |
+
"""
|
| 367 |
+
super(NetG, self).__init__()
|
| 368 |
+
# Define attributes
|
| 369 |
+
self.ngf = ngf
|
| 370 |
+
self.mixed_precision = mixed_precision
|
| 371 |
+
|
| 372 |
+
# Build CLIP Mapper
|
| 373 |
+
self.code_sz, self.code_ch, self.mid_ch = 7, 64, 32
|
| 374 |
+
self.CLIP_ch = 768
|
| 375 |
+
# fully connected layer to convert the noise vector into a feature map of dimensions (code_sz * code_sz * code_ch)
|
| 376 |
+
self.fc_code = nn.Linear(nz, self.code_sz * self.code_sz * self.code_ch)
|
| 377 |
+
self.mapping = CLIP_Adapter(self.code_ch, self.mid_ch, self.code_ch, ngf * 8, self.CLIP_ch, cond_dim + nz, 3, 1,
|
| 378 |
+
1, 4, CLIP)
|
| 379 |
+
# Build GBlocks
|
| 380 |
+
self.GBlocks = nn.ModuleList([])
|
| 381 |
+
in_out_pairs = list(get_G_in_out_chs(ngf, imsize))
|
| 382 |
+
imsize = 4
|
| 383 |
+
for idx, (in_ch, out_ch) in enumerate(in_out_pairs):
|
| 384 |
+
if idx < (len(in_out_pairs) - 1):
|
| 385 |
+
imsize = imsize * 2
|
| 386 |
+
else:
|
| 387 |
+
imsize = 224
|
| 388 |
+
self.GBlocks.append(G_Block(cond_dim + nz, in_ch, out_ch, imsize))
|
| 389 |
+
|
| 390 |
+
# To RGB image conversion using the sequential layers having leakyReLU activation function
|
| 391 |
+
self.to_rgb = nn.Sequential(
|
| 392 |
+
nn.LeakyReLU(0.2, inplace=True),
|
| 393 |
+
nn.Conv2d(out_ch, ch_size, 3, 1, 1),
|
| 394 |
+
)
|
| 395 |
+
|
| 396 |
+
def forward(self, noise, c, eval=False): # x=noise, c=ent_emb
|
| 397 |
+
"""
|
| 398 |
+
Forward pass of the generator network.
|
| 399 |
+
|
| 400 |
+
Args:
|
| 401 |
+
noise (torch.Tensor): Input noise vector.
|
| 402 |
+
c (torch.Tensor): Conditioning information, typically an embedding representing attributes of the output.
|
| 403 |
+
eval (bool, optional): Flag indicating whether the network is in evaluation mode. Defaults to False.
|
| 404 |
+
|
| 405 |
+
Returns:
|
| 406 |
+
torch.Tensor: Generated RGB images.
|
| 407 |
+
"""
|
| 408 |
+
# Context manager for enabling automatic mixed precision training
|
| 409 |
+
with torch.cuda.amp.autocast() if self.mixed_precision and not eval else dummy_context_mgr() as mp:
|
| 410 |
+
# Concatenate noise and conditioning information
|
| 411 |
+
cond = torch.cat((noise, c), dim=1)
|
| 412 |
+
|
| 413 |
+
# Pass noise through fully connected layer to generate feature map and adapt features using CLIP Mapper
|
| 414 |
+
out = self.mapping(self.fc_code(noise).view(noise.size(0), self.code_ch, self.code_sz, self.code_sz), cond)
|
| 415 |
+
|
| 416 |
+
# Apply GBlocks to progressively upsample feature representation, fuse text and visual features
|
| 417 |
+
for GBlock in self.GBlocks:
|
| 418 |
+
out = GBlock(out, cond)
|
| 419 |
+
|
| 420 |
+
# Convert final feature representation to RGB images
|
| 421 |
+
out = self.to_rgb(out)
|
| 422 |
+
|
| 423 |
+
return out
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
class NetD(nn.Module):
|
| 427 |
+
"""
|
| 428 |
+
Discriminator network for evaluating the realism of images.
|
| 429 |
+
Attributes:
|
| 430 |
+
DBlocks (nn.ModuleList): List of D_Block modules for processing feature maps.
|
| 431 |
+
main (D_Block): Main D_Block module for final processing.
|
| 432 |
+
"""
|
| 433 |
+
|
| 434 |
+
def __init__(self, ndf, imsize, ch_size, mixed_precision):
|
| 435 |
+
"""
|
| 436 |
+
Initializes the Discriminator network
|
| 437 |
+
|
| 438 |
+
Args:
|
| 439 |
+
ndf (int): Number of channels in the initial features.
|
| 440 |
+
imsize (int): Size of the input images (assumed square).
|
| 441 |
+
ch_size (int): Number of channels in the output feature maps.
|
| 442 |
+
mixed_precision (bool): Flag indicating whether to use mixed precision training.
|
| 443 |
+
"""
|
| 444 |
+
super(NetD, self).__init__()
|
| 445 |
+
self.mixed_precision = mixed_precision
|
| 446 |
+
# Define the DBlock
|
| 447 |
+
self.DBlocks = nn.ModuleList([
|
| 448 |
+
D_Block(768, 768, 3, 1, 1, res=True, CLIP_feat=True),
|
| 449 |
+
D_Block(768, 768, 3, 1, 1, res=True, CLIP_feat=True),
|
| 450 |
+
])
|
| 451 |
+
# Define the main DBlock for the final processing
|
| 452 |
+
self.main = D_Block(768, 512, 3, 1, 1, res=True, CLIP_feat=False)
|
| 453 |
+
|
| 454 |
+
def forward(self, h):
|
| 455 |
+
"""
|
| 456 |
+
Forward pass of the discriminator network.
|
| 457 |
+
Args:
|
| 458 |
+
h (torch.Tensor): Input feature maps.
|
| 459 |
+
Returns:
|
| 460 |
+
torch.Tensor: Discriminator output.
|
| 461 |
+
"""
|
| 462 |
+
with torch.cuda.amp.autocast() if self.mixed_precision else dummy_context_mgr() as mpc:
|
| 463 |
+
# Initial feature map
|
| 464 |
+
out = h[:, 0]
|
| 465 |
+
# Pass the input feature through each DBlock
|
| 466 |
+
for idx in range(len(self.DBlocks)):
|
| 467 |
+
out = self.DBlocks[idx](out, h[:, idx + 1])
|
| 468 |
+
# Final processing through the main DBlock
|
| 469 |
+
out = self.main(out)
|
| 470 |
+
return out
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
class NetC(nn.Module):
|
| 474 |
+
"""
|
| 475 |
+
Classifier / Comparator network for classifying the joint features of the generator output and condition text.
|
| 476 |
+
Attributes:
|
| 477 |
+
cond_dim (int): Dimensionality of the conditioning information.
|
| 478 |
+
mixed_precision (bool): Flag indicating whether to use mixed precision training.
|
| 479 |
+
joint_conv (nn.Sequential): Sequential module defining the classifier layers.
|
| 480 |
+
"""
|
| 481 |
+
def __init__(self, ndf, cond_dim, mixed_precision):
|
| 482 |
+
"""
|
| 483 |
+
|
| 484 |
+
"""
|
| 485 |
+
super(NetC, self).__init__()
|
| 486 |
+
self.cond_dim = cond_dim
|
| 487 |
+
self.mixed_precision = mixed_precision
|
| 488 |
+
# Define the classifier layers, sequential convolutional 2D layer with LeakyReLU as the activation function
|
| 489 |
+
self.joint_conv = nn.Sequential(
|
| 490 |
+
nn.Conv2d(512 + 512, 128, 4, 1, 0, bias=False),
|
| 491 |
+
nn.LeakyReLU(0.2, inplace=True),
|
| 492 |
+
nn.Conv2d(128, 1, 4, 1, 0, bias=False),
|
| 493 |
+
)
|
| 494 |
+
|
| 495 |
+
def forward(self, out, cond):
|
| 496 |
+
"""
|
| 497 |
+
Forward pass of the classifier network.
|
| 498 |
+
|
| 499 |
+
Args:
|
| 500 |
+
out (torch.Tensor): Generator output feature map.
|
| 501 |
+
cond (torch.Tensor): Conditioning information vector
|
| 502 |
+
"""
|
| 503 |
+
with torch.cuda.amp.autocast() if self.mixed_precision else dummy_context_mgr() as mpc:
|
| 504 |
+
# Reshape and repeat conditioning information vector to match the feature map size
|
| 505 |
+
cond = cond.view(-1, self.cond_dim, 1, 1)
|
| 506 |
+
cond = cond.repeat(1, 1, 7, 7)
|
| 507 |
+
|
| 508 |
+
# Concatenate feature map and conditioned information
|
| 509 |
+
h_c_code = torch.cat((out, cond), 1)
|
| 510 |
+
|
| 511 |
+
# Pass through the classifier layers
|
| 512 |
+
out = self.joint_conv(h_c_code)
|
| 513 |
+
return out
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
class M_Block(nn.Module):
|
| 517 |
+
"""
|
| 518 |
+
Multi-scale block consisting of convolutional layers and conditioning.
|
| 519 |
+
|
| 520 |
+
Attributes:
|
| 521 |
+
conv1 (nn.Conv2d): First convolutional layer.
|
| 522 |
+
fuse1 (DFBlock): Conditioning block for the first convolutional layer.
|
| 523 |
+
conv2 (nn.Conv2d): Second convolutional layer.
|
| 524 |
+
fuse2 (DFBlock): Conditioning block for the second convolutional layer.
|
| 525 |
+
learnable_sc (bool): Flag indicating whether the shortcut connection is learnable.
|
| 526 |
+
c_sc (nn.Conv2d): Convolutional layer for the shortcut connection.
|
| 527 |
+
|
| 528 |
+
"""
|
| 529 |
+
def __init__(self, in_ch, mid_ch, out_ch, cond_dim, k, s, p):
|
| 530 |
+
"""
|
| 531 |
+
Initializes the Multi-scale block.
|
| 532 |
+
|
| 533 |
+
Args:
|
| 534 |
+
in_ch (int): Number of input channels.
|
| 535 |
+
mid_ch (int): Number of channels in the intermediate layers.
|
| 536 |
+
out_ch (int): Number of output channels.
|
| 537 |
+
cond_dim (int): Dimensionality of the conditioning information.
|
| 538 |
+
k (int): Kernel size for convolutional layers.
|
| 539 |
+
s (int): Stride for convolutional layers.
|
| 540 |
+
p (int): Padding for convolutional layers.
|
| 541 |
+
|
| 542 |
+
"""
|
| 543 |
+
super(M_Block, self).__init__()
|
| 544 |
+
|
| 545 |
+
# Define convolutional layers and conditioning blocks
|
| 546 |
+
self.conv1 = nn.Conv2d(in_ch, mid_ch, k, s, p)
|
| 547 |
+
self.fuse1 = DFBLK(cond_dim, mid_ch)
|
| 548 |
+
self.conv2 = nn.Conv2d(mid_ch, out_ch, k, s, p)
|
| 549 |
+
self.fuse2 = DFBLK(cond_dim, out_ch)
|
| 550 |
+
|
| 551 |
+
# Learnable shortcut connection
|
| 552 |
+
self.learnable_sc = in_ch != out_ch
|
| 553 |
+
if self.learnable_sc:
|
| 554 |
+
self.c_sc = nn.Conv2d(in_ch, out_ch, 1, stride=1, padding=0)
|
| 555 |
+
|
| 556 |
+
def shortcut(self, x):
|
| 557 |
+
"""
|
| 558 |
+
Defines the shortcut connection.
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
x (torch.Tensor): Input tensor.
|
| 562 |
+
|
| 563 |
+
Returns:
|
| 564 |
+
torch.Tensor: Shortcut connection output.
|
| 565 |
+
"""
|
| 566 |
+
if self.learnable_sc:
|
| 567 |
+
x = self.c_sc(x)
|
| 568 |
+
return x
|
| 569 |
+
|
| 570 |
+
def residual(self, h, text):
|
| 571 |
+
"""
|
| 572 |
+
Defines the residual path with conditioning.
|
| 573 |
+
|
| 574 |
+
Args:
|
| 575 |
+
h (torch.Tensor): Input tensor.
|
| 576 |
+
text (torch.Tensor): Conditioning information.
|
| 577 |
+
|
| 578 |
+
Returns:
|
| 579 |
+
torch.Tensor: Residual path output.
|
| 580 |
+
"""
|
| 581 |
+
h = self.conv1(h)
|
| 582 |
+
h = self.fuse1(h, text)
|
| 583 |
+
h = self.conv2(h)
|
| 584 |
+
h = self.fuse2(h, text)
|
| 585 |
+
return h
|
| 586 |
+
|
| 587 |
+
def forward(self, h, c):
|
| 588 |
+
"""
|
| 589 |
+
Forward pass of the multi-scale block.
|
| 590 |
+
|
| 591 |
+
Args:
|
| 592 |
+
h (torch.Tensor): Input tensor.
|
| 593 |
+
c (torch.Tensor): Conditioning information.
|
| 594 |
+
|
| 595 |
+
Returns:
|
| 596 |
+
torch.Tensor: Output tensor.
|
| 597 |
+
"""
|
| 598 |
+
return self.shortcut(h) + self.residual(h, c)
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
class G_Block(nn.Module):
|
| 602 |
+
"""
|
| 603 |
+
Generator block consisting of convolutional layers and conditioning.
|
| 604 |
+
|
| 605 |
+
Attributes:
|
| 606 |
+
imsize (int): Size of the output image.
|
| 607 |
+
learnable_sc (bool): Flag indicating whether the shortcut connection is learnable.
|
| 608 |
+
c1 (nn.Conv2d): First convolutional layer.
|
| 609 |
+
c2 (nn.Conv2d): Second convolutional layer.
|
| 610 |
+
fuse1 (DFBLK): Conditioning block for the first convolutional layer.
|
| 611 |
+
fuse2 (DFBLK): Conditioning block for the second convolutional layer.
|
| 612 |
+
c_sc (nn.Conv2d): Convolutional layer for the shortcut connection.
|
| 613 |
+
"""
|
| 614 |
+
|
| 615 |
+
def __init__(self, cond_dim, in_ch, out_ch, imsize):
|
| 616 |
+
"""
|
| 617 |
+
Initialize the Generator block.
|
| 618 |
+
|
| 619 |
+
Args:
|
| 620 |
+
cond_dim (int): Dimensionality of the conditioning information.
|
| 621 |
+
in_ch (int): Number of input channels.
|
| 622 |
+
out_ch (int): Number of output channels.
|
| 623 |
+
imsize (int): Size of the output image.
|
| 624 |
+
"""
|
| 625 |
+
super(G_Block, self).__init__()
|
| 626 |
+
|
| 627 |
+
# Initialize attributes
|
| 628 |
+
self.imsize = imsize
|
| 629 |
+
self.learnable_sc = in_ch != out_ch
|
| 630 |
+
|
| 631 |
+
# Define convolution layers and conditioning blocks
|
| 632 |
+
self.c1 = nn.Conv2d(in_ch, out_ch, 3, 1, 1)
|
| 633 |
+
self.c2 = nn.Conv2d(out_ch, out_ch, 3, 1, 1)
|
| 634 |
+
self.fuse1 = DFBLK(cond_dim, in_ch)
|
| 635 |
+
self.fuse2 = DFBLK(cond_dim, out_ch)
|
| 636 |
+
|
| 637 |
+
# Learnable shortcut connection
|
| 638 |
+
if self.learnable_sc:
|
| 639 |
+
self.c_sc = nn.Conv2d(in_ch, out_ch, 1, stride=1, padding=0)
|
| 640 |
+
|
| 641 |
+
def shortcut(self, x):
|
| 642 |
+
"""
|
| 643 |
+
Defines the shortcut connection.
|
| 644 |
+
|
| 645 |
+
Args:
|
| 646 |
+
x (torch.Tensor): Input tensor.
|
| 647 |
+
|
| 648 |
+
Returns:
|
| 649 |
+
torch.Tensor: Shortcut connection output.
|
| 650 |
+
"""
|
| 651 |
+
if self.learnable_sc:
|
| 652 |
+
x = self.c_sc(x)
|
| 653 |
+
return x
|
| 654 |
+
|
| 655 |
+
def residual(self, h, y):
|
| 656 |
+
"""
|
| 657 |
+
Defines the residual path with conditioning.
|
| 658 |
+
|
| 659 |
+
Args:
|
| 660 |
+
h (torch.Tensor): Input tensor.
|
| 661 |
+
y (torch.Tensor): Conditioning information.
|
| 662 |
+
|
| 663 |
+
Returns:
|
| 664 |
+
torch.Tensor: Residual path output.
|
| 665 |
+
"""
|
| 666 |
+
h = self.fuse1(h, y)
|
| 667 |
+
h = self.c1(h)
|
| 668 |
+
h = self.fuse2(h, y)
|
| 669 |
+
h = self.c2(h)
|
| 670 |
+
return h
|
| 671 |
+
|
| 672 |
+
def forward(self, h, y):
|
| 673 |
+
"""
|
| 674 |
+
Forward pass of the generator block.
|
| 675 |
+
|
| 676 |
+
Args:
|
| 677 |
+
h (torch.Tensor): Input tensor.
|
| 678 |
+
y (torch.Tensor): Conditioning information.
|
| 679 |
+
|
| 680 |
+
Returns:
|
| 681 |
+
torch.Tensor: Output tensor.
|
| 682 |
+
"""
|
| 683 |
+
h = F.interpolate(h, size=(self.imsize, self.imsize))
|
| 684 |
+
return self.shortcut(h) + self.residual(h, y)
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
class D_Block(nn.Module):
|
| 688 |
+
"""
|
| 689 |
+
Discriminator block.
|
| 690 |
+
"""
|
| 691 |
+
def __init__(self, fin, fout, k, s, p, res, CLIP_feat):
|
| 692 |
+
"""
|
| 693 |
+
Initializes Discriminator block.
|
| 694 |
+
|
| 695 |
+
Args:
|
| 696 |
+
- fin (int): Number of input channels.
|
| 697 |
+
- fout (int): Number of output channels.
|
| 698 |
+
- k (int): Kernel size for convolutional layers.
|
| 699 |
+
- s (int): Stride for convolutional layers.
|
| 700 |
+
- p (int): Padding for convolutional layers.
|
| 701 |
+
- res (bool): Whether to use residual connection.
|
| 702 |
+
- CLIP_feat (bool): Whether to incorporate CLIP features.
|
| 703 |
+
"""
|
| 704 |
+
super(D_Block, self).__init__()
|
| 705 |
+
self.res, self.CLIP_feat = res, CLIP_feat
|
| 706 |
+
self.learned_shortcut = (fin != fout)
|
| 707 |
+
|
| 708 |
+
# Convolutional layers for residual path
|
| 709 |
+
self.conv_r = nn.Sequential(
|
| 710 |
+
nn.Conv2d(fin, fout, k, s, p, bias=False),
|
| 711 |
+
nn.LeakyReLU(0.2, inplace=True),
|
| 712 |
+
nn.Conv2d(fout, fout, k, s, p, bias=False),
|
| 713 |
+
nn.LeakyReLU(0.2, inplace=True),
|
| 714 |
+
)
|
| 715 |
+
|
| 716 |
+
# Convolutional layers for shortcut connection
|
| 717 |
+
self.conv_s = nn.Conv2d(fin, fout, 1, stride=1, padding=0)
|
| 718 |
+
|
| 719 |
+
# Parameters for learned residual and CLIP features
|
| 720 |
+
if self.res == True:
|
| 721 |
+
self.gamma = nn.Parameter(torch.zeros(1))
|
| 722 |
+
if self.CLIP_feat == True:
|
| 723 |
+
self.beta = nn.Parameter(torch.zeros(1))
|
| 724 |
+
|
| 725 |
+
def forward(self, x, CLIP_feat=None):
|
| 726 |
+
"""
|
| 727 |
+
Forward pass of the discriminator block.
|
| 728 |
+
|
| 729 |
+
Args:
|
| 730 |
+
- x (torch.Tensor): Input tensor.
|
| 731 |
+
- CLIP_feat (torch.Tensor): Optional CLIP features tensor.
|
| 732 |
+
|
| 733 |
+
Returns:
|
| 734 |
+
- torch.Tensor: Output tensor.
|
| 735 |
+
"""
|
| 736 |
+
# Compute the residual features
|
| 737 |
+
res = self.conv_r(x)
|
| 738 |
+
|
| 739 |
+
# Compute the shortcut connection
|
| 740 |
+
if self.learned_shortcut:
|
| 741 |
+
x = self.conv_s(x)
|
| 742 |
+
|
| 743 |
+
# Incorporate learned residual and CLIP features if enabled
|
| 744 |
+
if (self.res == True) and (self.CLIP_feat == True):
|
| 745 |
+
return x + self.gamma * res + self.beta * CLIP_feat
|
| 746 |
+
elif (self.res == True) and (self.CLIP_feat != True):
|
| 747 |
+
return x + self.gamma * res
|
| 748 |
+
elif (self.res != True) and (self.CLIP_feat == True):
|
| 749 |
+
return x + self.beta * CLIP_feat
|
| 750 |
+
else:
|
| 751 |
+
return x
|
| 752 |
+
|
| 753 |
+
|
| 754 |
+
class DFBLK(nn.Module):
|
| 755 |
+
"""
|
| 756 |
+
Diffusion Block of the Generator network with Conditional feature block
|
| 757 |
+
"""
|
| 758 |
+
def __init__(self, cond_dim, in_ch):
|
| 759 |
+
"""
|
| 760 |
+
Initializing the Conditional feature block of the DFBlock.
|
| 761 |
+
|
| 762 |
+
Args:
|
| 763 |
+
- cond_dim (int): Dimensionality of the conditional input.
|
| 764 |
+
- in_ch (int): Number of input channels.
|
| 765 |
+
"""
|
| 766 |
+
super(DFBLK, self).__init__()
|
| 767 |
+
# Define conditional affine transformations
|
| 768 |
+
self.affine0 = Affine(cond_dim, in_ch)
|
| 769 |
+
self.affine1 = Affine(cond_dim, in_ch)
|
| 770 |
+
|
| 771 |
+
def forward(self, x, y=None):
|
| 772 |
+
"""
|
| 773 |
+
Forward pass of the conditional feature block.
|
| 774 |
+
|
| 775 |
+
Args:
|
| 776 |
+
- x (torch.Tensor): Input tensor.
|
| 777 |
+
- y (torch.Tensor, optional): Conditional input tensor. Default is None.
|
| 778 |
+
|
| 779 |
+
Returns:
|
| 780 |
+
- torch.Tensor: Output tensor.
|
| 781 |
+
"""
|
| 782 |
+
# Apply the first affine transformation and activation function
|
| 783 |
+
h = self.affine0(x, y)
|
| 784 |
+
h = nn.LeakyReLU(0.2, inplace=True)(h)
|
| 785 |
+
# Apply second affine transformation and activation function
|
| 786 |
+
h = self.affine1(h, y)
|
| 787 |
+
h = nn.LeakyReLU(0.2, inplace=True)(h)
|
| 788 |
+
return h
|
| 789 |
+
|
| 790 |
+
|
| 791 |
+
class QuickGELU(nn.Module):
|
| 792 |
+
"""
|
| 793 |
+
Efficient and faster version of GELU,
|
| 794 |
+
for non-linearity and to learn complex patterns
|
| 795 |
+
"""
|
| 796 |
+
def forward(self, x: torch.Tensor):
|
| 797 |
+
"""
|
| 798 |
+
Forward pass of the QuickGELU activation function.
|
| 799 |
+
|
| 800 |
+
Args:
|
| 801 |
+
- x (torch.Tensor): Input tensor.
|
| 802 |
+
|
| 803 |
+
Returns:
|
| 804 |
+
- torch.Tensor: Output tensor.
|
| 805 |
+
"""
|
| 806 |
+
# Apply QuickGELU activation function
|
| 807 |
+
return x * torch.sigmoid(1.702 * x)
|
| 808 |
+
|
| 809 |
+
|
| 810 |
+
# Taken from the RAT-GAN repository
|
| 811 |
+
class Affine(nn.Module):
|
| 812 |
+
"""
|
| 813 |
+
Affine transformation module that applies conditional scaling and shifting to input features,
|
| 814 |
+
to incorporate additional control over the generated output based on input conditions.
|
| 815 |
+
"""
|
| 816 |
+
def __init__(self, cond_dim, num_features):
|
| 817 |
+
"""
|
| 818 |
+
Initialize the affine transformation module.
|
| 819 |
+
Args:
|
| 820 |
+
cond_dim (int): Dimensionality of the conditioning information.
|
| 821 |
+
num_features (int): Number of input features.
|
| 822 |
+
"""
|
| 823 |
+
super(Affine, self).__init__()
|
| 824 |
+
# Define 2 fully connected networks to compute gamma and beta parameters
|
| 825 |
+
# each 2 linear layers with RELU activation in between
|
| 826 |
+
self.fc_gamma = nn.Sequential(OrderedDict([
|
| 827 |
+
('linear1', nn.Linear(cond_dim, num_features)),
|
| 828 |
+
('relu1', nn.ReLU(inplace=True)),
|
| 829 |
+
('linear2', nn.Linear(num_features, num_features)),
|
| 830 |
+
]))
|
| 831 |
+
self.fc_beta = nn.Sequential(OrderedDict([
|
| 832 |
+
('linear1', nn.Linear(cond_dim, num_features)),
|
| 833 |
+
('relu1', nn.ReLU(inplace=True)),
|
| 834 |
+
('linear2', nn.Linear(num_features, num_features)),
|
| 835 |
+
]))
|
| 836 |
+
# Initializes the weights and biases of the network
|
| 837 |
+
self._initialize()
|
| 838 |
+
|
| 839 |
+
def _initialize(self):
|
| 840 |
+
"""
|
| 841 |
+
Initializes the weights and biases of the linear layers responsible for computing gamma and beta
|
| 842 |
+
"""
|
| 843 |
+
nn.init.zeros_(self.fc_gamma.linear2.weight.data)
|
| 844 |
+
nn.init.ones_(self.fc_gamma.linear2.bias.data)
|
| 845 |
+
nn.init.zeros_(self.fc_beta.linear2.weight.data)
|
| 846 |
+
nn.init.zeros_(self.fc_beta.linear2.bias.data)
|
| 847 |
+
|
| 848 |
+
def forward(self, x, y=None):
|
| 849 |
+
"""
|
| 850 |
+
Forward pass of the Affine transformation module.
|
| 851 |
+
|
| 852 |
+
Args:
|
| 853 |
+
x (torch.Tensor): Input tensor.
|
| 854 |
+
y (torch.Tensor, optional): Conditioning information tensor. Default is None.
|
| 855 |
+
|
| 856 |
+
Returns:
|
| 857 |
+
torch.Tensor: Transformed tensor after applying affine transformation.
|
| 858 |
+
"""
|
| 859 |
+
# Compute gamma and beta parameters
|
| 860 |
+
weight = self.fc_gamma(y)
|
| 861 |
+
bias = self.fc_beta(y)
|
| 862 |
+
|
| 863 |
+
# Ensure proper shape for weight and bias tensors
|
| 864 |
+
if weight.dim() == 1:
|
| 865 |
+
weight = weight.unsqueeze(0)
|
| 866 |
+
if bias.dim() == 1:
|
| 867 |
+
bias = bias.unsqueeze(0)
|
| 868 |
+
|
| 869 |
+
# Expand weight and bias tensors to match input tensor shape
|
| 870 |
+
size = x.size()
|
| 871 |
+
weight = weight.unsqueeze(-1).unsqueeze(-1).expand(size)
|
| 872 |
+
bias = bias.unsqueeze(-1).unsqueeze(-1).expand(size)
|
| 873 |
+
|
| 874 |
+
# Apply affine transformation
|
| 875 |
+
return weight * x + bias
|
| 876 |
+
|
| 877 |
+
|
| 878 |
+
def get_G_in_out_chs(nf, imsize):
|
| 879 |
+
"""
|
| 880 |
+
Compute input-output channel pairs for generator blocks based on given number of channels and image size.
|
| 881 |
+
|
| 882 |
+
Args:
|
| 883 |
+
nf (int): Number of input channels.
|
| 884 |
+
imsize (int): Size of the input image.
|
| 885 |
+
|
| 886 |
+
Returns:
|
| 887 |
+
list: List of tuples containing input-output channel pairs for generator blocks.
|
| 888 |
+
"""
|
| 889 |
+
# Determine the number of layers based on image size
|
| 890 |
+
layer_num = int(np.log2(imsize)) - 1
|
| 891 |
+
|
| 892 |
+
# Compute the number of channels for each layer
|
| 893 |
+
channel_nums = [nf * min(2 ** idx, 8) for idx in range(layer_num)]
|
| 894 |
+
|
| 895 |
+
# Reverse the channel numbers to start with the highest channel count
|
| 896 |
+
channel_nums = channel_nums[::-1]
|
| 897 |
+
|
| 898 |
+
# Generate input-output channel pairs for generator blocks
|
| 899 |
+
in_out_pairs = zip(channel_nums[:-1], channel_nums[1:])
|
| 900 |
+
|
| 901 |
+
return in_out_pairs
|
| 902 |
+
|
| 903 |
+
|
| 904 |
+
def get_D_in_out_chs(nf, imsize):
|
| 905 |
+
"""
|
| 906 |
+
Compute input-output channel pairs for discriminator blocks based on given number of channels and image size.
|
| 907 |
+
|
| 908 |
+
Args:
|
| 909 |
+
nf (int): Number of input channels.
|
| 910 |
+
imsize (int): Size of the input image.
|
| 911 |
+
|
| 912 |
+
Returns:
|
| 913 |
+
list: List of tuples containing input-output channel pairs for discriminator blocks.
|
| 914 |
+
"""
|
| 915 |
+
# Determine the number of layers based on image size
|
| 916 |
+
layer_num = int(np.log2(imsize)) - 1
|
| 917 |
+
|
| 918 |
+
# Compute the number of channels for each layer
|
| 919 |
+
channel_nums = [nf * min(2 ** idx, 8) for idx in range(layer_num)]
|
| 920 |
+
|
| 921 |
+
# Generate input-output channel pairs for discriminator blocks
|
| 922 |
+
in_out_pairs = zip(channel_nums[:-1], channel_nums[1:])
|
| 923 |
+
|
| 924 |
+
return in_out_pairs
|
utils.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def load_model_weights(model, weights, multi_gpus, train=True):
|
| 2 |
+
"""
|
| 3 |
+
Load the model weights from the given checkpoint file
|
| 4 |
+
"""
|
| 5 |
+
# If model was originally trained on a single GPU but needs to be loaded onto multiple ones,
|
| 6 |
+
# it removes the "module" prefix from the weight keys
|
| 7 |
+
if list(weights.keys())[0].find('module') == -1:
|
| 8 |
+
pretrained_with_multi_gpu = False
|
| 9 |
+
else:
|
| 10 |
+
pretrained_with_multi_gpu = True
|
| 11 |
+
|
| 12 |
+
if (multi_gpus is False) or (train is False):
|
| 13 |
+
if pretrained_with_multi_gpu:
|
| 14 |
+
state_dict = {
|
| 15 |
+
key[7:]: value
|
| 16 |
+
for key, value in weights.items()
|
| 17 |
+
}
|
| 18 |
+
else:
|
| 19 |
+
state_dict = weights
|
| 20 |
+
else:
|
| 21 |
+
state_dict = weights
|
| 22 |
+
|
| 23 |
+
# load the model from the state_dict
|
| 24 |
+
model.load_state_dict(state_dict)
|
| 25 |
+
return model
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Class to work with if mixed precision is failing
|
| 29 |
+
class dummy_context_mgr:
|
| 30 |
+
def __init__(self):
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
def __enter__(self):
|
| 34 |
+
return None
|
| 35 |
+
|
| 36 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
| 37 |
+
return False
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Function to read CSS from file
|
| 41 |
+
def read_css_from_file(filename):
|
| 42 |
+
with open(filename, 'r') as file:
|
| 43 |
+
return file.read()
|