jdh-algo
/

JoyHallo-v1

+data:  # dataset settings; per-key notes below are assumptions to confirm against the loader
+  train_bs: 4  # presumably the training batch size
+  val_bs: 1  # presumably the validation batch size
+  train_width: 512
+  train_height: 512
+  fps: 25  # presumably video frames per second
+  sample_rate: 16000  # presumably the audio sample rate in Hz
+  n_motion_frames: 2
+  n_sample_frames: 16
+  audio_margin: 2
+  train_meta_paths:  # list of metadata files; a single JSON entry here
+    - "./data/inference.json"
+# wav2vec2 settings; bracketed comments list the alternatives noted by the
+# original author.
+wav2vec_config:
+  audio_type: "vocals"  # [audio, vocals]
+  model_scale: "base"  # [base, large]
+  features: "all"  # [last, avg, all]
+  model_path: "./pretrained_models/chinese-wav2vec2-base"
+audio_separator:
+  # Local path to an ONNX model file.
+  model_path: "./pretrained_models/audio_separator/Kim_Vocal_2.onnx"
+face_expand_ratio: 1.2  # NOTE(review): presumably the enlargement factor for the detected face box — confirm
+solver:
+  # Optimisation settings. Booleans use the canonical lowercase true/false.
+  gradient_accumulation_steps: 1
+  mixed_precision: "no"  # quoted so it stays the string "no", not boolean false
+  enable_xformers_memory_efficient_attention: true
+  gradient_checkpointing: true
+  max_train_steps: 30000
+  max_grad_norm: 1.0
+  # lr
+  # Written with a mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve a bare
+  # `1e-5` as a string rather than a float.
+  learning_rate: 1.0e-5
+  scale_lr: false
+  lr_warmup_steps: 1
+  lr_scheduler: "constant"
+  # optimizer
+  use_8bit_adam: true
+  adam_beta1: 0.9
+  adam_beta2: 0.999
+  adam_weight_decay: 1.0e-2
+  adam_epsilon: 1.0e-8
+val:
+  validation_steps: 1000  # NOTE(review): presumably the interval, in training steps, between validation runs — confirm
+noise_scheduler_kwargs:  # NOTE(review): name suggests keyword arguments forwarded to a noise scheduler — confirm
+  num_train_timesteps: 1000
+  beta_start: 0.00085
+  beta_end: 0.012
+  beta_schedule: "linear"
+  steps_offset: 1
+  clip_sample: false
+unet_additional_kwargs:
+  # NOTE(review): key name suggests these are extra keyword arguments for the
+  # UNet constructor — confirm against the consumer.
+  use_inflated_groupnorm: true
+  unet_use_cross_frame_attention: false
+  unet_use_temporal_attention: false
+  use_motion_module: true
+  use_audio_module: true
+  motion_module_resolutions: [1, 2, 4, 8]
+  motion_module_mid_block: true
+  motion_module_decoder_only: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads: 8
+    num_transformer_block: 1
+    attention_block_types:
+      - Temporal_Self
+      - Temporal_Self
+    temporal_position_encoding: true
+    temporal_position_encoding_max_len: 32
+    temporal_attention_dim_div: 1
+  audio_attention_dim: 768
+  stack_enable_blocks_name:
+    - "up"
+    - "down"
+    - "mid"
+  stack_enable_blocks_depth: [0, 1, 2, 3]
+trainable_para:  # NOTE(review): presumably the parameter groups left trainable — confirm
+  - audio_modules
+  - motion_modules
+# Pretrained model locations (relative to the working directory).
+base_model_path: "./pretrained_models/stable-diffusion-v1-5"
+vae_model_path: "./pretrained_models/sd-vae-ft-mse"
+face_analysis_model_path: "./pretrained_models/face_analysis"
+mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"
+weight_dtype: "fp16"  # [fp16, fp32]
+# Ratios and loss-related scalars.
+uncond_img_ratio: 0.05
+uncond_audio_ratio: 0.05
+uncond_ia_ratio: 0.05
+start_ratio: 0.05
+noise_offset: 0.05
+snr_gamma: 5.0
+enable_zero_snr: true
+stage1_ckpt_dir: "./exp_output/stage1/"
+# Inference / sampling settings.
+single_inference_times: 10
+inference_steps: 40
+cfg_scale: 3.5
+seed: 42
+# Run bookkeeping.
+resume_from_checkpoint: "latest"
+checkpointing_steps: 500
+exp_name: "joyhallo"
+output_dir: "./opts"
+# NOTE(review): despite the `_dir` suffix, this value is a file path (net.pth).
+audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"
+ref_img_path:  # seven reference images
+  - "examples/reference_images/1.jpg"
+  - "examples/reference_images/2.jpg"
+  - "examples/reference_images/3.jpg"
+  - "examples/reference_images/4.jpg"
+  - "examples/reference_images/5.jpg"
+  - "examples/reference_images/6.jpg"
+  - "examples/reference_images/7.jpg"
+audio_path:  # seven entries, all the same clip; count matches ref_img_path (presumably paired by index — confirm)
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"
+  - "examples/driving_audios/0.wav"