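# Training configuration: instruct-pix2pix-style video editing with a temporal
# UNet (InstructPix2Pix spatial weights plus AnimateDiff-style motion modules),
# trained via PyTorch Lightning with DeepSpeed ZeRO stage 2.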
expt_dir: experiments
expt_name: instruct_v2v
trainer_args:
  max_epochs: 10
  accelerator: "gpu"
  devices: [0]
  limit_train_batches: 2048
  limit_val_batches: 1
  # strategy: "ddp"
  strategy: "deepspeed_stage_2"
  accumulate_grad_batches: 256
  check_val_every_n_epoch: 5
diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
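      # scaled_linear is the Stable Diffusion schedule:
      # betas = linspace(sqrt(beta_start), sqrt(beta_end), num_train_timesteps)^2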
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
    prediction_type: epsilon
    loss_fn: l2
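    # epsilon-prediction: the UNet regresses the added noise, trained with an
    # L2 (MSE) loss between predicted and true noise.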
    optim_args:
      lr: 1e-5
    unet_init_weights: 
      - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt
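    # Spatial layers are initialized from the InstructPix2Pix UNet; temporal
    # layers from the AnimateDiff mm_sd_v15 motion module checkpoint.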
    vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
    scale_factor: 0.18215
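    # 0.18215 is the standard Stable Diffusion latent scaling factor applied
    # to VAE-encoded latents before diffusion.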
    guidance_scale: 5 # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
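    # Two-way classifier-free guidance, presumably following the
    # InstructPix2Pix formulation given the text_cfg/img_cfg pair:
    # eps = eps(z, 0, 0)
    #     + img_cfg  * (eps(z, c_img, 0)     - eps(z, 0, 0))
    #     + text_cfg * (eps(z, c_img, c_txt) - eps(z, c_img, 0))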
    cond_image_dropout: 0.1
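    # The conditioning video is dropped on ~10% of training steps so the model
    # also learns the unconditional branches required by the guidance above.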
    prompt_type: edit_prompt
unet:
  target: modules.video_unet_temporal.unet.UNet3DConditionModel
  params:
    in_channels: 8
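    # 8 = 4 noisy latent channels + 4 channels of the encoded conditioning
    # video, concatenated along the channel dim (InstructPix2Pix input layout).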
    out_channels: 4
    act_fn: silu
    attention_head_dim: 8
    block_out_channels: 
      - 320
      - 640
      - 1280
      - 1280
    cross_attention_dim: 768
    down_block_types: 
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - CrossAttnDownBlock3D
      - DownBlock3D
    up_block_types: 
      - UpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
      - CrossAttnUpBlock3D
    downsample_padding: 1
    layers_per_block: 2
    mid_block_scale_factor: 1
    norm_eps: 1e-05
    norm_num_groups: 32
    sample_size: 64
    use_motion_module: true
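    # AnimateDiff-style temporal self-attention modules, inserted at all four
    # UNet resolutions (1, 2, 4, 8) but not in the mid block; their weights
    # come from mm_sd_v15.ckpt above.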
    motion_module_resolutions:
      - 1
      - 2
      - 4
      - 8
    motion_module_mid_block: false
    motion_module_decoder_only: false
    motion_module_type: Vanilla
    motion_module_kwargs:
      num_attention_heads: 8
      num_transformer_block: 1
      attention_block_types:
        - Temporal_Self
        - Temporal_Self
      temporal_position_encoding: true
      temporal_position_encoding_max_len: 32
      temporal_attention_dim_div: 1
vae:
  target: modules.kl_autoencoder.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
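    # Standard SD KL autoencoder: a ch_mult of length 4 means three
    # downsampling stages, i.e. 8x spatial compression into 4-channel latents.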
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult:
        - 1
        - 2
        - 4
        - 4
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
    lossconfig:
      target: torch.nn.Identity
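      # No autoencoder loss: the VAE serves as a frozen encoder/decoder only.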
text_model:
  target: modules.openclip.modules.FrozenCLIPEmbedder
  params:
    freeze: true
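    # FrozenCLIPEmbedder (presumably CLIP ViT-L/14, as in the LDM codebase)
    # yields 768-dim token embeddings, matching cross_attention_dim: 768 above.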
data:
  batch_size: 1
  val_batch_size: 1
  train:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
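    # Paired prompt-to-prompt video data with synthetic camera-motion
    # augmentation; judging by the parameter names, clips are randomly zoomed
    # (up to 1.25x) and translated (up to 0.2 of the frame size).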
    params:
      root_dirs: 
        - video_ptp/raw_generated
        - video_ptp/raw_generated_webvid
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
  val:
    target: dataset.videoP2P.VideoPromptToPromptMotionAug
    params:
      root_dirs: 
        - video_ptp/raw_generated
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
callbacks:
  - target: pytorch_lightning.callbacks.ModelCheckpoint
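    # Monitoring `epoch` with mode: max simply keeps the five most recent
    # epoch checkpoints, plus last.ckpt via save_last.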
    params:
      dirpath: "${expt_dir}/${expt_name}"
      filename: "{epoch:04d}"
      monitor: epoch
      mode: max
      save_top_k: 5
      save_last: true
  - target: callbacks.instruct_p2p_video.InstructP2PLogger
    params:
      max_num_images: 1
    require_wandb: true