defaults:
  - _self_
  - dset: musdb44
  - svd: default
  - variant: default
  - override hydra/hydra_logging: colorlog
  - override hydra/job_logging: colorlog

dummy:
dset:
  musdb: /checkpoint/defossez/datasets/musdbhq
  musdb_samplerate: 44100
  use_musdb: true   # set to false to not use musdb as training data.
  wav:   # path to custom wav dataset
  wav2:  # second custom wav dataset
  segment: 11
  shift: 1
  train_valid: false
  full_cv: true
  samplerate: 44100
  channels: 2
  normalize: true
  metadata: ./metadata
  sources: ['drums', 'bass', 'other', 'vocals']
  valid_samples:  # valid dataset size
  backend: null   # if provided select torchaudio backend.

test:
  save: False
  best: True
  workers: 2
  every: 20
  split: true
  shifts: 1
  overlap: 0.25
  sdr: true
  metric: 'loss'  # metric used for best model selection on the valid set, can also be nsdr
  nonhq:  # path to non hq MusDB for evaluation

epochs: 360
batch_size: 64
max_batches:  # limit the number of batches per epoch, useful for debugging
              # or if your dataset is gigantic.
optim:
  lr: 3e-4
  momentum: 0.9
  beta2: 0.999
  loss: l1    # l1 or mse
  optim: adam
  weight_decay: 0
  clip_grad: 0

seed: 42
debug: false
valid_apply: true
flag:
save_every:
weights: [1., 1., 1., 1.]  # weights over each source for the training/valid loss.

augment:
  shift_same: false
  repitch:
    proba: 0.2
    max_tempo: 12
  remix:
    proba: 1
    group_size: 4
  scale:
    proba: 1
    min: 0.25
    max: 1.25
  flip: true

continue_from:        # continue from other XP, give the XP Dora signature.
continue_pretrained:  # signature of a pretrained XP, this cannot be a bag of models.
pretrained_repo:      # repo for pretrained model (default is official AWS)
continue_best: true
continue_opt: false

misc:
  num_workers: 10
  num_prints: 4
  show: false
  verbose: false

# List of decay for EMA at batch or epoch level, e.g. 0.999.
# Batch level EMA are kept on GPU for speed.
ema:
  epoch: []
  batch: []

use_train_segment: true  # to remove
model_segment:  # override the segment parameter for the model, usually 4 times the training segment.
model: demucs   # see demucs/train.py for the possibilities, and config for each model hereafter.
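
# A hedged usage sketch, not part of the config itself: this file is composed
# by Hydra and experiments are launched through the Dora manager, so any value
# above or below can be overridden with Hydra's dotted key=value syntax. The
# path and values here are hypothetical examples, not recommended settings:
#   dora run -d model=htdemucs epochs=180 dset.musdb=/path/to/musdbhq
# (`dora run -d` is the launch command documented in the Demucs training docs;
# `model=...` selects which of the per-model sections below is used.)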
demucs:  # see demucs/demucs.py for a detailed description
  # Channels
  channels: 64
  growth: 2
  # Main structure
  depth: 6
  rewrite: true
  lstm_layers: 0
  # Convolutions
  kernel_size: 8
  stride: 4
  context: 1
  # Activations
  gelu: true
  glu: true
  # Normalization
  norm_groups: 4
  norm_starts: 4
  # DConv residual branch
  dconv_depth: 2
  dconv_mode: 1  # 1 = branch in encoder, 2 = in decoder, 3 = in both.
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-4
  # Pre/post treatment
  resample: true
  normalize: false
  # Weight init
  rescale: 0.1

hdemucs:  # see demucs/hdemucs.py for a detailed description
  # Channels
  channels: 48
  channels_time:
  growth: 2
  # STFT
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 6
  rewrite: true
  hybrid: true
  hybrid_old: false
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # Normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 1
  dconv_depth: 2
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-3
  # Weight init
  rescale: 0.1

# Torchaudio implementation of HDemucs
torch_hdemucs:
  # Channels
  channels: 48
  growth: 2
  # STFT
  nfft: 4096
  # Main structure
  depth: 6
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # Normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_depth: 2
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-3

htdemucs:  # see demucs/htdemucs.py for a detailed description
  # Channels
  channels: 48
  channels_time:
  growth: 2
  # STFT
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 4
  rewrite: true
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # Normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 1
  dconv_depth: 2
  dconv_comp: 8
  dconv_init: 1e-3
  # Before the Transformer
  bottom_channels: 0
  # CrossTransformer
  # ------ Common to all
  # Regular parameters
  t_layers: 5
  t_hidden_scale: 4.0
  t_heads: 8
  t_dropout: 0.0
  t_layer_scale: True
  t_gelu: True
  # ------------- Positional Embedding
  t_emb: sin
  t_max_positions: 10000  # for the scaled embedding
  t_max_period: 10000.0
  t_weight_pos_embed: 1.0
  t_cape_mean_normalize: True
  t_cape_augment: True
  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
  t_sin_random_shift: 0
  # ------------- norm before a transformer encoder
  t_norm_in: True
  t_norm_in_group: False
  # ------------- norm inside the encoder
  t_group_norm: False
  t_norm_first: True
  t_norm_out: True
  # ------------- optim
  t_weight_decay: 0.0
  t_lr:
  # ------------- sparsity
  t_sparse_self_attn: False
  t_sparse_cross_attn: False
  t_mask_type: diag
  t_mask_random_seed: 42
  t_sparse_attn_window: 400
  t_global_window: 100
  t_sparsity: 0.95
  t_auto_sparsity: False
  # Cross Encoder First (False)
  t_cross_first: False
  # Weight init
  rescale: 0.1

svd:  # see svd.py for documentation
  penalty: 0
  min_size: 0.1
  dim: 1
  niters: 2
  powm: false
  proba: 1
  conv_only: false
  convtr: false
  bs: 1

quant:  # quantization hyper params
  diffq:  # diffq penalty, typically 1e-4 or 3e-4
  qat:    # use QAT with a fixed number of bits (not as good as diffq)
  min_size: 0.2
  group_size: 8

dora:
  dir: outputs
  exclude: ["misc.*", "slurm.*", 'test.reval', 'flag', 'dset.backend']

slurm:
  time: 4320
  constraint: volta32gb
  setup: ['module load cudnn/v8.4.1.50-cuda.11.6 NCCL/2.11.4-6-cuda.11.6 cuda/11.6']

# Hydra config
hydra:
  job_logging:
    formatters:
      colorlog:
        datefmt: "%m-%d %H:%M:%S"
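
# A minimal sketch for inspecting the composed config outside of a training
# run, assuming this file lives at conf/config.yaml relative to the caller and
# that Hydra (plus the colorlog plugin) is installed; the override shown is
# just an example, not a recommended setting:
#   from hydra import compose, initialize
#   with initialize(version_base=None, config_path="conf"):
#       cfg = compose(config_name="config", overrides=["model=htdemucs"])
#   print(cfg.model, cfg.htdemucs.depth)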