# Trainer entry point, instantiated via the __object__ convention.
__object__:
  path: projects.video_diffusion_sr.train
  name: VideoDiffusionTrainer

dit:
  model:
    __object__:
      path: models.dit_v2.nadit
      name: NaDiT
      args: as_params
    vid_in_channels: 33
    vid_out_channels: 16
    vid_dim: 2560
    vid_out_norm: fusedrms
    txt_in_dim: 5120
    txt_in_norm: fusedln
    txt_dim: ${.vid_dim}
    emb_dim: ${eval:'6 * ${.vid_dim}'} # 6 * 2560 = 15360
    heads: 20
    head_dim: 128 # llm-like
    expand_ratio: 4
    norm: fusedrms
    norm_eps: 1.0e-05
    ada: single
    qk_bias: False
    qk_norm: fusedrms
    patch_size: [1, 2, 2]
    num_layers: 32 # llm-like
    mm_layers: 10
    mlp_type: swiglu
    msa_type: None
    block_type: ${eval:'${.num_layers} * ["mmdit_sr"]'} # space-full
    window: ${eval:'${.num_layers} * [(4,3,3)]'} # space-full
    # Alternates plain and shifted 720p windows across the 32 layers.
    window_method: ${eval:'${.num_layers} // 2 * ["720pwin_by_size_bysize","720pswin_by_size_bysize"]'} # space-full
    rope_type: mmrope3d
    rope_dim: 128
  compile: False
  gradient_checkpoint: True
  fsdp:
    # PyTorch FSDP hybrid strategy: ZeRO-2 sharding within a node, replication across nodes.
    sharding_strategy: _HYBRID_SHARD_ZERO2
  ema:
    decay: 0.9998

vae:
  model:
    __inherit__: models/video_vae_v3/s8_c16_t4_inflation_sd3.yaml
    freeze_encoder: False
    # gradient_checkpoint: True
  slicing:
    split_size: 4
    memory_device: same
  memory_limit:
    conv_max_mem: 0.5
    norm_max_mem: 0.5
  checkpoint: ./ckpts/ema_vae.pth
  scaling_factor: 0.9152 # latent scaling factor
  compile: False
  grouping: False
  dtype: bfloat16

diffusion:
  schedule:
    type: lerp
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      steps: 50
      transform: True
  loss:
    type: v_lerp
  cfg:
    scale: 7.5
    rescale: 0
  condition:
    i2v: 0.0
    v2v: 0.0
    sr: 1.0
    noise_scale: 0.25
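
# Usage sketch (assumptions, kept as comments so this file stays valid YAML):
# the ${.key} interpolations and ${eval:'...'} expressions suggest this config
# is loaded with OmegaConf plus a registered `eval` resolver. The Python below
# is a minimal illustration of that loading path, not the project's actual
# entry point; the filename "main.yaml" is a placeholder.
#
#   from omegaconf import OmegaConf
#
#   # Register the resolver assumed by the ${eval:'...'} expressions above.
#   OmegaConf.register_new_resolver("eval", eval)
#
#   config = OmegaConf.load("main.yaml")
#   print(config.dit.model.emb_dim)     # 15360, i.e. 6 * vid_dim
#   print(config.dit.model.block_type)  # a list of 32 "mmdit_sr" entries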