expt_dir: experiments
expt_name: instruct_v2v_ic_pexels_text_hdr_test_lr0.5_aug_lossc_fix_bs1 #! note: this name is written into the logs; do not change it on every run
trainer_args:
  max_epochs: 10
  accelerator: "gpu"
  devices: [0, 1, 2, 3, 4, 5, 6, 7] #! change to use more cards
  limit_train_batches: 2048
  limit_val_batches: 3 #! limits how many validation batches run per epoch
  # strategy: "ddp"
  strategy: "deepspeed_stage_2"
  # autotune_only_on_rank_zero: true # make sure only one process runs the autotuning-table operation
  accumulate_grad_batches: 128 #! watch this value: 256 -> 128
  check_val_every_n_epoch: 1 #! check whether this value is related to logging...
  # precision: 16 # enable half precision (FP16)
diffusion:
  target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
  params:
    beta_schedule_args:
      beta_schedule: scaled_linear
      num_train_timesteps: 1000
      beta_start: 0.00085
      beta_end: 0.012
      clip_sample: false
      thresholding: false
    prediction_type: epsilon
    loss_fn: l2
    optim_args:
      lr: 1e-5 #! originally 1e-5
    unet_init_weights: #! note: training can also start directly from an iv2v checkpoint
      - unet/diffusion_pytorch_model.safetensors # iclight, unet, safetensors
      - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably AnimateDiff weights
      - pretrained_models/iclight/iclight_sd15_fc.safetensors # IC-Light LoRA weights
    base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
    # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
    # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! these two can simply be set to None and loaded via from_pretrained
    scale_factor: 0.18215
    guidance_scale: 5 # not used
    ddim_sampling_steps: 20
    text_cfg: 7.5
    img_cfg: 1.2
    hdr_cfg: 7.5
    cond_image_dropout: 0.1
    cond_text_dropout: 0.1
    cond_hdr_dropout: 0.1
    ic_condition: fg
    hdr_train: True
    prompt_type: edit_prompt
    unet:
      target: modules.video_unet_temporal.unet.UNet3DConditionModel
      params:
        in_channels: 4 #! change: 8 -> 12, set to 12 for IC-Light; double-check...
        out_channels: 4
        act_fn: silu
        attention_head_dim: 8
        block_out_channels:
          - 320
          - 640
          - 1280
          - 1280
        cross_attention_dim: 768
        down_block_types:
          - CrossAttnDownBlock3D
          - CrossAttnDownBlock3D
          - CrossAttnDownBlock3D
          - DownBlock3D
        up_block_types:
          - UpBlock3D
          - CrossAttnUpBlock3D
          - CrossAttnUpBlock3D
          - CrossAttnUpBlock3D
        downsample_padding: 1
        layers_per_block: 2
        mid_block_scale_factor: 1
        norm_eps: 1e-05
        norm_num_groups: 32
        sample_size: 64
        use_motion_module: true #!!! when only testing IC-Light, the motion module can be disabled, i.e. set to false
        motion_module_resolutions:
          - 1
          - 2
          - 4
          - 8
        motion_module_mid_block: false
        motion_module_decoder_only: false
        motion_module_type: Vanilla
        motion_module_kwargs:
          num_attention_heads: 8
          num_transformer_block: 1
          attention_block_types:
            - Temporal_Self
            - Temporal_Self
          temporal_position_encoding: true
          temporal_position_encoding_max_len: 32
          temporal_attention_dim_div: 1
    text_model:
      target: modules.openclip.modules.FrozenCLIPEmbedder
      params:
        freeze: true
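# Effective batch size, as a sketch assuming standard PyTorch Lightning
# semantics (samples per optimizer step = batch_size x num_devices x
# accumulate_grad_batches):
#   1 (data.batch_size) x 8 (devices) x 128 (accumulate_grad_batches) = 1024
# With limit_train_batches: 2048, that is 2048 / 128 = 16 optimizer steps
# per device per epoch.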
data:
  batch_size: 1
  val_batch_size: 1
  train:
    target: dataset.videoP2P.VideoPromptToPromptMotionAugPexelsHDR
    params:
      # note: update the training paths and the related loading code, e.g. how to handle parameters when there is no meta.yaml
      root_dirs: #! note: root_dirs has been changed
        # - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
        - /mnt/hwfile/mllm/sunzeyi/iclight_video/rendered_data_rgb_fixlast
      hdr_dir: /mnt/hwfile/mllm/sunzeyi/iclight_video/haven_hdr_rgb
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
      is_train: True
      ic_condition: fg
  val:
    target: dataset.videoP2P.VideoPromptToPromptMotionAugPexelsHDR
    params:
      root_dirs:
        # - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
        - /mnt/hwfile/mllm/sunzeyi/iclight_video/rendered_data_rgb_fixlast
      hdr_dir: /mnt/hwfile/mllm/sunzeyi/iclight_video/haven_hdr_rgb
      num_frames: 16
      zoom_ratio: 0.2
      max_zoom: 1.25
      translation_ratio: 0.7
      translation_range: [0, 0.2]
      ic_condition: fg
callbacks:
  - target: pytorch_lightning.callbacks.ModelCheckpoint
    params:
      dirpath: "${expt_dir}/${expt_name}"
      # filename: "{epoch:04d}"
      filename: "{step:06d}"
      every_n_train_steps: 1
      save_last: false
      # monitor: epoch
      # mode: max
      # save_top_k: 3
      # save_last: false
  - target: callbacks.instruct_p2p_video.InstructP2PLogger
    params:
      max_num_images: 1
      expt_name: instruct_v2v_ic_pexels_text_hdr_test_lr0.5_aug_lossc_fix_bs1
      # accumulate_grad_batches: 128
      require_wandb: true
  - target: pytorch_lightning.callbacks.DeviceStatsMonitor
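# Guidance note (a sketch; the actual combination is implemented in
# pl_trainer.instruct_p2p_video, not in this file): text_cfg / img_cfg / hdr_cfg
# presumably drive an InstructPix2Pix-style multi-condition classifier-free
# guidance at sampling time, schematically for the text and image terms:
#   eps = eps(z, 0, 0)
#       + img_cfg  * (eps(z, c_img, 0)      - eps(z, 0, 0))
#       + text_cfg * (eps(z, c_img, c_text) - eps(z, c_img, 0))
# with an analogous delta scaled by hdr_cfg for the HDR condition. The
# cond_image/text/hdr_dropout values of 0.1 drop each condition ~10% of the
# time during training, which is what enables this guidance.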