data:
  train_bs: 4
  val_bs: 1
  train_width: 512
  train_height: 512
  fps: 25
  sample_rate: 16000
  n_motion_frames: 2
  n_sample_frames: 16
  audio_margin: 2
  train_meta_paths:
    - "./data/inference.json"
wav2vec_config:
  audio_type: "vocals" # "audio" (raw track) or "vocals" (separated)
  model_scale: "base" # "base" or "large"
  features: "all" # "last", "avg", or "all"
  model_path: ./pretrained_models/chinese-wav2vec2-base
audio_separator:
  model_path: ./pretrained_models/audio_separator/Kim_Vocal_2.onnx
face_expand_ratio: 1.2
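
# Optimizer and training-loop settings, consumed by an accelerate/diffusers-style
# trainer. mixed_precision "no" keeps the training step in fp32;
# use_8bit_adam requires the bitsandbytes package.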
solver:
  gradient_accumulation_steps: 1
  mixed_precision: "no"
  enable_xformers_memory_efficient_attention: True
  gradient_checkpointing: True
  max_train_steps: 30000
  max_grad_norm: 1.0
  # lr
  learning_rate: 1e-5
  scale_lr: False
  lr_warmup_steps: 1
  lr_scheduler: "constant"
  # optimizer
  use_8bit_adam: True
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8
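
# Validation cadence and the diffusion noise schedule. The beta range
# (0.00085-0.012) is the Stable Diffusion v1.5 setting; a "linear"
# beta_schedule is the usual choice for AnimateDiff-style motion-module
# training.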
val:
  validation_steps: 1000

noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false
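
# Extra arguments for the inflated (2D -> pseudo-3D) denoising UNet:
# AnimateDiff-style temporal motion modules plus audio cross-attention modules
# injected into the down, mid, and up blocks at the listed resolutions.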
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
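  # Per-module settings for the Vanilla temporal transformer: one transformer
  # block with two temporal self-attention layers, using positional encoding
  # over windows of up to 32 frames.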
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]
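
# Only the parameter groups listed below receive gradients in this stage; the
# rest of the network (SD backbone, VAE, and the reference modules trained in
# stage 1) stays frozen.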
trainable_para:
  - audio_modules
  - motion_modules
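
# Pretrained weights: the Stable Diffusion v1.5 backbone, the sd-vae-ft-mse
# VAE, face-analysis models, and the AnimateDiff mm_sd_v15_v2 motion-module
# checkpoint used to initialise the temporal layers.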
base_model_path: "./pretrained_models/stable-diffusion-v1-5"
vae_model_path: "./pretrained_models/sd-vae-ft-mse"
face_analysis_model_path: "./pretrained_models/face_analysis"
mm_path: "./pretrained_models/motion_module/mm_sd_v15_v2.ckpt"
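
# Conditioning dropout for classifier-free guidance (the image condition, the
# audio condition, and both together are each dropped 5% of the time), plus a
# small noise offset, Min-SNR loss weighting (snr_gamma), and zero-terminal-SNR
# rescaling of the noise schedule.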
weight_dtype: "fp16" # [fp16, fp32]
uncond_img_ratio: 0.05
uncond_audio_ratio: 0.05
uncond_ia_ratio: 0.05
start_ratio: 0.05
noise_offset: 0.05
snr_gamma: 5.0
enable_zero_snr: True
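
# Stage-1 (spatial modules) checkpoint to initialise from, and sampling
# settings for the clips rendered at validation time; single_inference_times
# is presumably the number of validation clips generated per pass.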
stage1_ckpt_dir: "./exp_output/stage1/"
single_inference_times: 10
inference_steps: 40
cfg_scale: 3.5
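
# Run bookkeeping: fixed seed, resume from the latest checkpoint under
# output_dir, and save a checkpoint every 500 steps. audio_ckpt_dir points at
# the pretrained JoyHallo weights.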
seed: 42
resume_from_checkpoint: "latest"
checkpointing_steps: 500
exp_name: "joyhallo"
output_dir: "./opts"
audio_ckpt_dir: "./pretrained_models/joyhallo/net.pth"
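
# Validation pairs: ref_img_path[i] is driven by audio_path[i]; here each
# reference image reuses the same driving clip.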
ref_img_path:
  - "examples/reference_images/1.jpg"
  - "examples/reference_images/2.jpg"
  - "examples/reference_images/3.jpg"
  - "examples/reference_images/4.jpg"
  - "examples/reference_images/5.jpg"
  - "examples/reference_images/6.jpg"
  - "examples/reference_images/7.jpg"
audio_path:
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"
  - "examples/driving_audios/0.wav"