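# File: RelightVid / configs/instruct_v2v_ori.yaml
# NOTE(review): the three items below look like web-page residue captured
# with the file (avatar caption, commit message, commit hash). They are kept
# as comments so the document parses as YAML:
#   aleafy's picture
#   Start fresh
#   0a63786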
# Experiment identity. Both values are interpolated into the checkpoint
# `dirpath` ("${expt_dir}/${expt_name}") used by the callbacks section.
expt_dir: experiments
expt_name: instruct_v2v

# Trainer settings.
# NOTE(review): nesting restored from a copy whose indentation was lost. The
# key names match PyTorch Lightning `Trainer` arguments, so they are
# presumably forwarded to it - confirm against the launcher script.
trainer_args:
  max_epochs: 10
  accelerator: "gpu"
  devices: [0]
  limit_train_batches: 2048
  limit_val_batches: 1
  # strategy: "ddp"
  strategy: "deepspeed_stage_2"
  accumulate_grad_batches: 256
  check_val_every_n_epoch: 5
# Diffusion trainer: the class named by `target`, configured by `params`.
# NOTE(review): nesting restored from a copy whose indentation was lost. In
# particular, which keys belong to `beta_schedule_args` (here: everything up
# to `thresholding`) is inferred - confirm against the trainer's signature.
diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
    prediction_type: epsilon
    loss_fn: l2
    optim_args:
      # Written with an explicit mantissa dot: a bare `1e-5` is read as a
      # string by YAML 1.1 loaders (e.g. PyYAML) rather than as a float.
      lr: 1.0e-5
    # Checkpoints to initialise from; two files are listed for the UNet.
    unet_init_weights:
      - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt
    vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
    scale_factor: 0.18215
    guidance_scale: 5  # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
    cond_image_dropout: 0.1
    prompt_type: edit_prompt
# Video UNet: the class named by `target`, configured by `params`.
# NOTE(review): nesting restored from a copy whose indentation was lost.
# Treating `unet` as a top-level key, and placing every key after
# `motion_module_kwargs` inside it, is inferred - confirm against the loader.
unet:
  target: modules.video_unet_temporal.unet.UNet3DConditionModel
  params:
    in_channels: 8
    out_channels: 4
    act_fn: silu
    attention_head_dim: 8
    block_out_channels:
      - 320
      - 640
      - 1280
      - 1280
    cross_attention_dim: 768
    down_block_types:
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - DownBlock3D
    up_block_types:
      - UpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
    downsample_padding: 1
    layers_per_block: 2
    mid_block_scale_factor: 1
    # Written with an explicit mantissa dot: a bare `1e-05` is read as a
    # string by YAML 1.1 loaders (e.g. PyYAML) rather than as a float.
    norm_eps: 1.0e-5
    norm_num_groups: 32
    sample_size: 64
    # Motion-module (temporal layer) options.
    use_motion_module: true
    motion_module_resolutions:
      - 1
      - 2
      - 4
      - 8
    motion_module_mid_block: false
    motion_module_decoder_only: false
    motion_module_type: Vanilla
    motion_module_kwargs:
      num_attention_heads: 8
      num_transformer_block: 1
      attention_block_types:
        - Temporal_Self
        - Temporal_Self
      temporal_position_encoding: true
      temporal_position_encoding_max_len: 32
      temporal_attention_dim_div: 1
# Autoencoder: the class named by `target`, configured by `params`.
# NOTE(review): nesting restored from a copy whose indentation was lost.
# `ddconfig` is assumed to end at `dropout`, with `lossconfig` as its
# sibling under `params` - confirm against the autoencoder's constructor.
vae:
  target: modules.kl_autoencoder.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
# Text encoder: the class named by `target`, configured by `params`.
# NOTE(review): nesting restored from a copy whose indentation was lost;
# treating `text_model` as a top-level key is inferred - confirm.
text_model:
  target: modules.openclip.modules.FrozenCLIPEmbedder
  params:
    freeze: true
# Data section: batch sizes plus one dataset spec each for train and val.
# NOTE(review): nesting restored from a copy whose indentation was lost.
# Both splits use the same dataset class and augmentation values; `val`
# lists only the first of the two training root directories.
data:
  batch_size: 1
  val_batch_size: 1
  train:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
    params:
      root_dirs:
        - video_ptp/raw_generated
        - video_ptp/raw_generated_webvid
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
  val:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
    params:
      root_dirs:
        - video_ptp/raw_generated
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
# Callbacks: a list of `target` / `params` entries.
# NOTE(review): nesting restored from a copy whose indentation was lost.
callbacks:
  # Checkpointing: monitors `epoch` with mode `max`, keeps 5 checkpoints
  # plus the last one.
  - target: pytorch_lightning.callbacks.ModelCheckpoint
    params:
      dirpath: "${expt_dir}/${expt_name}"
      filename: "{epoch:04d}"
      monitor: epoch
      mode: max
      save_top_k: 5
      save_last: true
  - target: callbacks.instruct_p2p_video.InstructP2PLogger
    params:
      max_num_images: 1
    # NOTE(review): `require_wandb` is placed beside `target`/`params` on the
    # assumption that the launcher reads it rather than the callback's
    # constructor - confirm; if the callback takes it, move it into `params`.
    require_wandb: true