data:
  train_bs: 4
  val_bs: 1
  train_width: 512
  train_height: 512
  fps: 25
  sample_rate: 16000
  n_motion_frames: 2
  n_sample_frames: 16
  audio_margin: 2
  train_meta_paths:
    - "./data/inference.json"
wav2vec_config:
  audio_type: "vocals" # "audio" (raw track) or "vocals" (separated)
  model_scale: "base" # "base" or "large"
  features: "all" # "last", "avg", or "all"
  model_path: ./pretrained_models/chinese-wav2vec2-base
audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
face_expand_ratio: 1.2
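
# Optimizer and training-loop settings, consumed by an accelerate/diffusers-style
# trainer. mixed_precision "no" keeps the training step in fp32;
# use_8bit_adam requires the bitsandbytes package.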
solver:
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: True
  gradient_checkpointing: True
  max_train_steps: 30000
  max_grad_norm: 1.0
  # lr
  learning_rate: 1e-5
  scale_lr: False
  lr_warmup_steps: 1
  lr_scheduler: "constant"
  # optimizer
  use_8bit_adam: True
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8
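
# Validation cadence and the diffusion noise schedule. The beta range
# (0.00085-0.012) is the Stable Diffusion v1.5 setting; a "linear"
# beta_schedule is the usual choice for AnimateDiff-style motion-module
# training.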
val:
  validation_steps: 1000

noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false
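
# Extra arguments for the inflated (2D -> pseudo-3D) denoising UNet:
# AnimateDiff-style temporal motion modules plus audio cross-attention modules
# injected into the down, mid, and up blocks at the listed resolutions.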
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
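  # Per-module settings for the Vanilla temporal transformer: one transformer
  # block with two temporal self-attention layers, using positional encoding
  # over windows of up to 32 frames.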
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]
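
# Only the parameter groups listed below receive gradients in this stage; the
# rest of the network (SD backbone, VAE, and the reference modules trained in
# stage 1) stays frozen.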
trainable_para:
  - audio_modules
  - motion_modules
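
# Pretrained weights: the Stable Diffusion v1.5 backbone, the sd-vae-ft-mse
# VAE, face-analysis models, and the AnimateDiff mm_sd_v15_v2 motion-module
# checkpoint used to initialise the temporal layers.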
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"
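
# Conditioning dropout for classifier-free guidance (the image condition, the
# audio condition, and both together are each dropped 5% of the time), plus a
# small noise offset, Min-SNR loss weighting (snr_gamma), and zero-terminal-SNR
# rescaling of the noise schedule.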
weight_dtype: "fp16" # [fp16, fp32]
uncond_img_ratio: 0.05
uncond_audio_ratio: 0.05
uncond_ia_ratio: 0.05
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0
enable_zero_snr: True
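
# Stage-1 (spatial modules) checkpoint to initialise from, and sampling
# settings for the clips rendered at validation time; single_inference_times
# is presumably the number of validation clips generated per pass.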
stage1_ckpt_dir: "./exp_output/stage1/"
single_inference_times: 10
inference_steps: 40
cfg_scale: 3.5
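
# Run bookkeeping: fixed seed, resume from the latest checkpoint under
# output_dir, and save a checkpoint every 500 steps. audio_ckpt_dir points at
# the pretrained JoyHallo weights.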
seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500
exp_name: "joyhallo"
output_dir: "./opts"
audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"
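
# Validation pairs: ref_img_path[i] is driven by audio_path[i]; here each
# reference image reuses the same driving clip.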
ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"
audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"