DEBUG interpolation of voice style

Browse files

Files changed (4) hide show

Modules/hifigan.py +10 -9
Modules/utils.py +0 -14
models.py +30 -138
msinference.py +31 -71

Modules/hifigan.py CHANGED Viewed

@@ -3,11 +3,12 @@ import torch.nn.functional as F
 import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-from .utils import init_weights, get_padding
 import math
 import random
-import numpy as np
 LRELU_SLOPE = 0.1
@@ -42,7 +43,7 @@ class AdaINResBlock1(torch.nn.Module):
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                                padding=get_padding(kernel_size, dilation[2])))
         ])
-        self.convs1.apply(init_weights)
         self.convs2 = nn.ModuleList([
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
@@ -52,7 +53,7 @@ class AdaINResBlock1(torch.nn.Module):
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                                padding=get_padding(kernel_size, 1)))
         ])
-        self.convs2.apply(init_weights)
         self.adain1 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
@@ -274,8 +275,6 @@ class SourceModuleHnNSF(torch.nn.Module):
         # source for noise branch, in the same shape as uv
         noise = torch.randn_like(uv) * self.sine_amp / 3
         return sine_merge, noise, uv
-def padDiff(x):
-    return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
 class Generator(torch.nn.Module):
     def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
@@ -323,8 +322,7 @@ class Generator(torch.nn.Module):
                 self.resblocks.append(resblock(ch, k, d, style_dim))
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
     def forward(self, x, s, f0):
@@ -365,6 +363,9 @@ class Generator(torch.nn.Module):
 class AdainResBlk1d(nn.Module):
     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                  upsample='none', dropout_p=0.0):
         super().__init__()

 import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 import math
 import random
+import numpy as np
def get_padding(kernel_size: int, dilation: int = 1) -> int:
    """Return the per-side padding for a dilated 1-D convolution.

    Computes ``(kernel_size - 1) * dilation // 2``: half of the extra span
    that a dilated kernel covers beyond a single sample.  For a stride-1
    convolution with an odd ``kernel_size`` this keeps the output as long
    as the input.

    Args:
        kernel_size: Kernel width, expected to be >= 1.
        dilation: Spacing between kernel taps, expected to be >= 1.

    Returns:
        The padding as an ``int``, rounded down when the span is odd.
    """
    # Integer arithmetic only: for valid (positive) arguments this equals
    # int((kernel_size*dilation - dilation) / 2) without going through float.
    return (kernel_size - 1) * dilation // 2
 LRELU_SLOPE = 0.1
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                                padding=get_padding(kernel_size, dilation[2])))
         ])
+        # self.convs1.apply(init_weights)
         self.convs2 = nn.ModuleList([
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                                padding=get_padding(kernel_size, 1)))
         ])
+        # self.convs2.apply(init_weights)
         self.adain1 = nn.ModuleList([
             AdaIN1d(style_dim, channels),
         # source for noise branch, in the same shape as uv
         noise = torch.randn_like(uv) * self.sine_amp / 3
         return sine_merge, noise, uv
 class Generator(torch.nn.Module):
     def __init__(self, style_dim, resblock_kernel_sizes, upsample_rates, upsample_initial_channel, resblock_dilation_sizes, upsample_kernel_sizes):
                 self.resblocks.append(resblock(ch, k, d, style_dim))
         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
     def forward(self, x, s, f0):
 class AdainResBlk1d(nn.Module):
+    # also used in ProsodyPredictor()
     def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                  upsample='none', dropout_p=0.0):
         super().__init__()

Modules/utils.py DELETED Viewed

@@ -1,14 +0,0 @@
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-def apply_weight_norm(m):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        weight_norm(m)
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size*dilation - dilation)/2)

models.py CHANGED Viewed

@@ -8,7 +8,7 @@ import torch.nn.functional as F
 from torch.nn.utils import weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-from Modules.hifigan import AdaIN1d
 import yaml
@@ -18,9 +18,11 @@ class LearnedDownSample(nn.Module):
         self.layer_type = layer_type
         if self.layer_type == 'none':
-            self.conv = nn.Identity()
         elif self.layer_type == 'timepreserve':
-            self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
         elif self.layer_type == 'half':
             self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
         else:
@@ -48,20 +50,7 @@ class DownSample(nn.Module):
             raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-class UpSample(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        elif self.layer_type == 'timepreserve':
-            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
-        elif self.layer_type == 'half':
-            return F.interpolate(x, scale_factor=2, mode='nearest')
-        else:
-            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
 class ResBlk(nn.Module):
@@ -137,9 +126,11 @@ class StyleEncoder(nn.Module):
         h = self.shared(x)  # [bs, 512, 1, 11]
         h = h.mean(3, keepdims=True)  # UN COMMENT FOR TIME INVARIANT GLOBAL SPEAKER STYLE
         h = h.transpose(1, 3)
         s = self.unshared(h)
         return s
@@ -249,114 +240,37 @@ class TextEncoder(nn.Module):
         self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
-    def forward(self, x, input_lengths, m):
         x = self.embedding(x)  # [B, T, emb]
         x = x.transpose(1, 2)  # [B, emb, T]
-        m = m.to(input_lengths.device).unsqueeze(1)
-        x.masked_fill_(m, 0.0)
         for c in self.cnn:
-            x = c(x)
-            x.masked_fill_(m, 0.0)
         x = x.transpose(1, 2)  # [B, T, chn]
         input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths, batch_first=True, enforce_sorted=False)
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
         x, _ = nn.utils.rnn.pad_packed_sequence(
             x, batch_first=True)
         x = x.transpose(-1, -2)
-        x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
-        x_pad[:, :, :x.shape[-1]] = x
-        x = x_pad.to(x.device)
-        x.masked_fill_(m, 0.0)
         return x
-    def inference(self, x):
-        x = self.embedding(x)
-        x = x.transpose(1, 2)
-        x = self.cnn(x)
-        x = x.transpose(1, 2)
-        self.lstm.flatten_parameters()
-        x, _ = self.lstm(x)
-        return x
-    def length_to_mask(self, lengths):
-        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-        mask = torch.gt(mask+1, lengths.unsqueeze(1))
-        return mask
-class UpSample1d(nn.Module):
-    def __init__(self, layer_type):
-        super().__init__()
-        self.layer_type = layer_type
-    def forward(self, x):
-        if self.layer_type == 'none':
-            return x
-        else:
-            return F.interpolate(x, scale_factor=2, mode='nearest')
-class AdainResBlk1d(nn.Module):
-    # only instantiated in ProsodyPredictor
-    def __init__(self, dim_in,
-                 dim_out,
-                 style_dim=64,
-                 actv=nn.LeakyReLU(0.2),
-                 upsample='none',
-                 dropout_p=0.0):
-        super().__init__()
-        self.actv = actv
-        self.upsample_type = upsample
-        self.upsample = UpSample1d(upsample)
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out, style_dim)
-        self.dropout = nn.Dropout(dropout_p)
-        if upsample == 'none':
-            self.pool = nn.Identity()
-        else:
-            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
-    def _build_weights(self, dim_in, dim_out, style_dim):
-        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
-        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
-        self.norm1 = AdaIN1d(style_dim, dim_in)
-        self.norm2 = AdaIN1d(style_dim, dim_out)
-        if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
-    def _shortcut(self, x):
-        x = self.upsample(x)
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        return x
-    def _residual(self, x, s):
-        x = self.norm1(x, s)
-        x = self.actv(x)
-        x = self.pool(x)
-        x = self.conv1(self.dropout(x))
-        x = self.norm2(x, s)
-        x = self.actv(x)
-        x = self.conv2(self.dropout(x))
-        return x
-    def forward(self, x, s):
-        out = self._residual(x, s)
-        out = (out + self._shortcut(x)) / math.sqrt(2)
-        return out
 class AdaLayerNorm(nn.Module):
@@ -423,11 +337,6 @@ class ProsodyPredictor(nn.Module):
         return F0.squeeze(1), N.squeeze(1)
-    def length_to_mask(self, lengths):
-        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-        mask = torch.gt(mask+1, lengths.unsqueeze(1))
-        return mask
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
@@ -447,21 +356,13 @@ class DurationEncoder(nn.Module):
         self.d_model = d_model
         self.sty_dim = sty_dim
-    def forward(self, x, style, text_lengths, m):
-        masks = m.to(text_lengths.device)
-        # x : [bs, 512, 987]
-        # print('DURATION ENCODER', x.shape, style.shape, masks.shape)
-        # s = style.expand(x.shape[0], x.shape[1], -1)
         style = style[:, :, 0, :].transpose(2, 1)  # [bs, 128, 11]
-        # print("S IN DURATION ENC", style.shape, x.shape)
-        style = F.interpolate(style, x.shape[2])
-        print(f'L468 IN DURATION ENC {x.shape=}, {style.shape=} {masks.shape=}')  # mask = [1,75]
         x = torch.cat([x, style], axis=1)  # [bs, 640, 75]
-        x.masked_fill_(masks[:, None, :], 0.0)
         input_lengths = text_lengths.cpu().numpy()
@@ -471,7 +372,7 @@ class DurationEncoder(nn.Module):
                 print(f'\n=========ENTER ADALAYNORM L479 models.py {x.shape=}, {style.shape=}')
                 x = block(x, style)   # [bs, 75, 512]
                 x = torch.cat([x.transpose(1, 2), style], axis=1) # [bs, 512, 75]
-                x.masked_fill_(masks[:, None, :], 0.0)
             else:
                 # print(f'{x.shape=} ENTER LSTM')  # [bs, 640, 75]  LSTM reduce ch 640 -> 512
                 x = x.transpose(-1, -2)
@@ -483,15 +384,6 @@ class DurationEncoder(nn.Module):
                     x, batch_first=True)
                 x = F.dropout(x, p=self.dropout, training=self.training)
                 x = x.transpose(-1, -2)
-                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
-                x_pad[:, :, :x.shape[-1]] = x
-                x = x_pad.to(x.device)
-                # print(f'{x.shape=} EXIR LSTM')  # [bs, 512, 75]
-#         print('Calling Duration Encoder\n\n\n\n',x.shape, x.min(), x.max())
-#         Calling Duration Encoder
-#  torch.Size([1, 640, 107]) tensor(-3.0903, device='cuda:0') tensor(2.3089, device='cuda:0')
         return x.transpose(-1, -2)

 from torch.nn.utils import weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
+from Modules.hifigan import AdainResBlk1d
 import yaml
         self.layer_type = layer_type
         if self.layer_type == 'none':
+            raise ValueError
+            # self.conv = nn.Identity()
         elif self.layer_type == 'timepreserve':
+            raise ValueError
+            # self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
         elif self.layer_type == 'half':
             self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
         else:
             raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
 class ResBlk(nn.Module):
         h = self.shared(x)  # [bs, 512, 1, 11]
         h = h.mean(3, keepdims=True)  # UN COMMENT FOR TIME INVARIANT GLOBAL SPEAKER STYLE
+        # h = .7 * h + .25 * h.mean(3, keepdims=True)
         h = h.transpose(1, 3)
         s = self.unshared(h)
         return s
         self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
+    def forward(self, x, input_lengths):
         x = self.embedding(x)  # [B, T, emb]
         x = x.transpose(1, 2)  # [B, emb, T]
         for c in self.cnn:
+            x = c(x)
         x = x.transpose(1, 2)  # [B, T, chn]
         input_lengths = input_lengths.cpu().numpy()
         x = nn.utils.rnn.pack_padded_sequence(
+            x, input_lengths,
+            batch_first=True,
+            enforce_sorted=False)
         self.lstm.flatten_parameters()
         x, _ = self.lstm(x)
         x, _ = nn.utils.rnn.pad_packed_sequence(
             x, batch_first=True)
         x = x.transpose(-1, -2)
         return x
+    # def inference(self, x):
+    #     x = self.embedding(x)
+    #     x = x.transpose(1, 2)
+    #     x = self.cnn(x)
+    #     x = x.transpose(1, 2)
+    #     self.lstm.flatten_parameters()
+    #     x, _ = self.lstm(x)
+    #     return x
+    # def length_to_mask(self, lengths):
+    #     mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+    #     mask = torch.gt(mask+1, lengths.unsqueeze(1))
+    #     return mask
 class AdaLayerNorm(nn.Module):
         return F0.squeeze(1), N.squeeze(1)
 class DurationEncoder(nn.Module):
     def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
         self.d_model = d_model
         self.sty_dim = sty_dim
+    def forward(self, x, style, text_lengths):
         style = style[:, :, 0, :].transpose(2, 1)  # [bs, 128, 11]
+        style = F.interpolate(style, x.shape[2], mode='nearest')
         x = torch.cat([x, style], axis=1)  # [bs, 640, 75]
         input_lengths = text_lengths.cpu().numpy()
                 print(f'\n=========ENTER ADALAYNORM L479 models.py {x.shape=}, {style.shape=}')
                 x = block(x, style)   # [bs, 75, 512]
                 x = torch.cat([x.transpose(1, 2), style], axis=1) # [bs, 512, 75]
             else:
                 # print(f'{x.shape=} ENTER LSTM')  # [bs, 640, 75]  LSTM reduce ch 640 -> 512
                 x = x.transpose(-1, -2)
                     x, batch_first=True)
                 x = F.dropout(x, p=self.dropout, training=self.training)
                 x = x.transpose(-1, -2)
         return x.transpose(-1, -2)

msinference.py CHANGED Viewed

@@ -51,23 +51,11 @@ to_mel = torchaudio.transforms.MelSpectrogram(
     n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 mean, std = -4, 4
-# START UTIL
 def alpha_num(f):
     """Squeeze runs of spaces in ``f``, then drop every char outside A-Z, a-z, 0-9 and space.
     The squeeze runs first, so removing a character that sat between two
     spaces can still leave two adjacent spaces in the result.
     """
     single_spaced = re.sub(' +', ' ', f)
     return re.sub(r'[^A-Z a-z0-9 ]+', '', single_spaced)
-# ======== UTILS ABOVE
-def length_to_mask(lengths):
-    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
-    mask = torch.gt(mask+1, lengths.unsqueeze(1))
-    return mask
 def preprocess(wave):
     wave_tensor = torch.from_numpy(wave).float()
     mel_tensor = to_mel(wave_tensor)
@@ -201,51 +189,31 @@ params = params_whole['net']
 # --
 from collections import OrderedDict
-new_state_dict = OrderedDict()
-for k, v in params['bert'].items():
-    new_state_dict[k[7:]] = v    # del 'module.'
-bert.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['bert_encoder'].items():
-    new_state_dict[k[7:]] = v    # del 'module.'
-bert_encoder.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['predictor'].items():
-    new_state_dict[k[7:]] = v    # del 'module.'
-predictor.load_state_dict(new_state_dict, strict=True)  # XTRA non-ckpt LSTMs nlayers add slowiness to voice
-# --
-new_state_dict = OrderedDict()
-for k, v in params['decoder'].items():
-    new_state_dict[k[7:]] = v
-decoder.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['text_encoder'].items():
-    new_state_dict[k[7:]] = v
-text_encoder.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['predictor_encoder'].items():
-    new_state_dict[k[7:]] = v
-predictor_encoder.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['style_encoder'].items():
-    new_state_dict[k[7:]] = v
-style_encoder.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['text_aligner'].items():
-    new_state_dict[k[7:]] = v    # del 'module.'
-text_aligner.load_state_dict(new_state_dict, strict=True)
-# --
-new_state_dict = OrderedDict()
-for k, v in params['pitch_extractor'].items():
-    new_state_dict[k[7:]] = v
-pitch_extractor.load_state_dict(new_state_dict, strict=True)
 def inference(text,
               ref_s,
@@ -267,7 +235,7 @@ def inference(text,
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-        text_mask = length_to_mask(input_lengths).to(device)
         # -----------------------
         # WHO TRANSLATES these tokens to sylla
         # print(text_mask.shape, '\n__\n', tokens, '\n__\n',  text_mask.min(), text_mask.max())
@@ -282,13 +250,9 @@ def inference(text,
                         #   54, 156,  63, 158, 147,  83,  56,  16,   4]], device='cuda:0')
-        t_en = text_encoder(tokens, input_lengths, text_mask)
-        bert_dur = bert(tokens, attention_mask=(~text_mask).int())
         d_en = bert_encoder(bert_dur).transpose(-1, -2)
-        # print('BERTdu', bert_dur.shape, tokens.shape, '\n') # bert what is the 768 per token -> IS USED in sampler
-        # BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
         ref = ref_s[:, :, :, :128] # [bs, 11, 1, 128]
         s = ref_s[:, :, :, 128:]   # have channels as last dim so it can go through nn.Linear layers
@@ -299,13 +263,13 @@ def inference(text,
         # s = .74 * s  # prosody / arousal & fading unvoiced syllabes [x0.7 - x1.2]
-        print(f'{d_en.shape=}  {s.shape=} {input_lengths.shape=}  {text_mask.shape=}')
         d = predictor.text_encoder(d_en,
                                          s,
-                                         input_lengths,
-                                         text_mask)
         x, _ = predictor.lstm(d)
         duration = predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)
@@ -364,14 +328,12 @@ def inference(text,
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 import os
 import re
 import tempfile
 import torch
 import sys
-import numpy as np
-import audiofile
 from huggingface_hub import hf_hub_download
 # Setup TTS env
@@ -393,8 +355,6 @@ with open(f"Utils/all_langs.csv") as f:
 # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
 # ==============================================================================================
-import re
-from num2words import num2words
 PHONEME_MAP = {
         'služ' : 'sloooozz', # 'službeno'

     n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 mean, std = -4, 4
 def alpha_num(f):
     """Squeeze runs of spaces in ``f``, then drop every char outside A-Z, a-z, 0-9 and space.
     The squeeze runs first, so removing a character that sat between two
     spaces can still leave two adjacent spaces in the result.
     """
     single_spaced = re.sub(' +', ' ', f)
     return re.sub(r'[^A-Z a-z0-9 ]+', '', single_spaced)
 def preprocess(wave):
     wave_tensor = torch.from_numpy(wave).float()
     mel_tensor = to_mel(wave_tensor)
 # --
 from collections import OrderedDict
+def _del_prefix(d):
+    # del ".module"
+    out = OrderedDict()
+    for k, v in d.items():
+        out[k[7:]] = v
+    return out
+bert.load_state_dict(        _del_prefix(params['bert']), strict=True)
+bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
+predictor.load_state_dict(   _del_prefix(params['predictor']), strict=True)  # NOTE(review): original remark - extra LSTM layers (nlayers) that are not in the checkpoint make the voice slower; TODO confirm
+decoder.load_state_dict(     _del_prefix(params['decoder']), strict=True)
+text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
+predictor_encoder.load_state_dict(_del_prefix(params['predictor_encoder']), strict=True)
+style_encoder.load_state_dict(_del_prefix(params['style_encoder']), strict=True)
+text_aligner.load_state_dict( _del_prefix(params['text_aligner']), strict=True)
+pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=True)
+# def _shift(x):
+#     # [bs, samples] shift circular each batch elem of sound
+#     n = x.shape[1]
+#     for i, batch_elem in enumerate(x):
+#         offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD
+#         x[i, ...] = torch.roll(batch_elem, offset, dims=1)  # batch_elem = [400000, ]
+#     return x
 def inference(text,
               ref_s,
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         # -----------------------
         # WHO TRANSLATES these tokens to sylla
         # print(text_mask.shape, '\n__\n', tokens, '\n__\n',  text_mask.min(), text_mask.max())
                         #   54, 156,  63, 158, 147,  83,  56,  16,   4]], device='cuda:0')
+        t_en = text_encoder(tokens, input_lengths)
+        bert_dur = bert(tokens, attention_mask=None)
         d_en = bert_encoder(bert_dur).transpose(-1, -2)
         ref = ref_s[:, :, :, :128] # [bs, 11, 1, 128]
         s = ref_s[:, :, :, 128:]   # have channels as last dim so it can go through nn.Linear layers
         # s = .74 * s  # prosody / arousal & fading unvoiced syllabes [x0.7 - x1.2]
+        print(f'{d_en.shape=}  {s.shape=} {input_lengths.shape=}')
         d = predictor.text_encoder(d_en,
                                          s,
+                                         input_lengths)
         x, _ = predictor.lstm(d)
+        print(d.shape, x.shape, 'Lstm')
         duration = predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+from num2words import num2words
 import os
 import re
 import tempfile
 import torch
 import sys
 from huggingface_hub import hf_hub_download
 # Setup TTS env
 # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
 # ==============================================================================================
 PHONEME_MAP = {
         'služ' : 'sloooozz', # 'službeno'