# Trainer entry point, instantiated via the __object__ convention.
__object__:
  path: projects.video_diffusion_sr.train
  name: VideoDiffusionTrainer

dit:
  model:
    __object__:
      path: models.dit_v2.nadit
      name: NaDiT
      args: as_params
    vid_in_channels: 33
    vid_out_channels: 16
    vid_dim: 2560
    vid_out_norm: fusedrms
    txt_in_dim: 5120
    txt_in_norm: fusedln
    txt_dim: ${.vid_dim}
    emb_dim: ${eval:'6 * ${.vid_dim}'} # 6 * 2560 = 15360
    heads: 20
    head_dim: 128 # llm-like
    expand_ratio: 4
    norm: fusedrms
    norm_eps: 1.0e-05
    ada: single
    qk_bias: False
    qk_norm: fusedrms
    patch_size: [1, 2, 2]
    num_layers: 32 # llm-like
    mm_layers: 10
    mlp_type: swiglu
    msa_type: None
    block_type: ${eval:'${.num_layers} * ["mmdit_sr"]'} # space-full
    window: ${eval:'${.num_layers} * [(4,3,3)]'} # space-full
    # Alternates plain and shifted 720p windows across the 32 layers.
    window_method: ${eval:'${.num_layers} // 2 * ["720pwin_by_size_bysize","720pswin_by_size_bysize"]'} # space-full
    rope_type: mmrope3d
    rope_dim: 128
  compile: False
  gradient_checkpoint: True
  fsdp:
    # PyTorch FSDP hybrid strategy: ZeRO-2 sharding within a node, replication across nodes.
    sharding_strategy: _HYBRID_SHARD_ZERO2
  ema:
    decay: 0.9998

vae:
  model:
    __inherit__: models/video_vae_v3/s8_c16_t4_inflation_sd3.yaml
    freeze_encoder: False
    # gradient_checkpoint: True
  slicing:
    split_size: 4
    memory_device: same
  memory_limit:
    conv_max_mem: 0.5
    norm_max_mem: 0.5
  checkpoint: ./ckpts/ema_vae.pth
  scaling_factor: 0.9152 # latent scaling factor
  compile: False
  grouping: False
  dtype: bfloat16

diffusion:
  schedule:
    type: lerp
    T: 1000.0
  sampler:
    type: euler
    prediction_type: v_lerp
  timesteps:
    training:
      type: logitnormal
      loc: 0.0
      scale: 1.0
    sampling:
      type: uniform_trailing
      steps: 50
      transform: True
  loss:
    type: v_lerp
  cfg:
    scale: 7.5
    rescale: 0
  condition:
    i2v: 0.0
    v2v: 0.0
    sr: 1.0
    noise_scale: 0.25
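
# Usage sketch (assumptions, kept as comments so this file stays valid YAML):
# the ${.key} interpolations and ${eval:'...'} expressions suggest this config
# is loaded with OmegaConf plus a registered `eval` resolver. The Python below
# is a minimal illustration of that loading path, not the project's actual
# entry point; the filename "main.yaml" is a placeholder.
#
#   from omegaconf import OmegaConf
#
#   # Register the resolver assumed by the ${eval:'...'} expressions above.
#   OmegaConf.register_new_resolver("eval", eval)
#
#   config = OmegaConf.load("main.yaml")
#   print(config.dit.model.emb_dim)     # 15360, i.e. 6 * vid_dim
#   print(config.dit.model.block_type)  # a list of 32 "mmdit_sr" entries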