Rename argument `scene` to `soundscape`
Browse files- api.py +14 -14
- msinference.py +19 -11
- tts.py +16 -10
api.py
CHANGED
|
@@ -18,7 +18,7 @@ from flask_cors import CORS
|
|
| 18 |
from moviepy.editor import *
|
| 19 |
from audiocraft.builders import AudioGen
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
-
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same
|
| 22 |
|
| 23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
| 24 |
|
|
@@ -82,14 +82,14 @@ def _shift(x):
|
|
| 82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
| 83 |
return x #* fade_in # silence this
|
| 84 |
|
| 85 |
-
def overlay(x,
|
| 86 |
|
| 87 |
-
if
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
-
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {
|
| 91 |
background = sound_generator.generate(
|
| 92 |
-
[
|
| 93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
| 94 |
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
|
@@ -113,7 +113,7 @@ def overlay(x, scene=None):
|
|
| 113 |
# background = _shift(background)
|
| 114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
| 116 |
-
x = .
|
| 117 |
else:
|
| 118 |
print('sound_background = None')
|
| 119 |
return x
|
|
@@ -121,7 +121,7 @@ def overlay(x, scene=None):
|
|
| 121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
| 122 |
text=None,
|
| 123 |
voice=None,
|
| 124 |
-
|
| 125 |
speed=None):
|
| 126 |
'''create 24kHZ np.array with tts
|
| 127 |
|
|
@@ -129,7 +129,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
| 129 |
to perform affective TTS.
|
| 130 |
text : string
|
| 131 |
voice : string or None (falls to styleTTS)
|
| 132 |
-
|
| 133 |
'''
|
| 134 |
|
| 135 |
|
|
@@ -161,7 +161,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
| 161 |
|
| 162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
| 163 |
|
| 164 |
-
return overlay(x,
|
| 165 |
|
| 166 |
|
| 167 |
|
|
@@ -201,7 +201,7 @@ def serve_wav():
|
|
| 201 |
affective = r.get('affective')[0],
|
| 202 |
voice = r.get('voice')[0],
|
| 203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
| 204 |
-
|
| 205 |
)
|
| 206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
| 207 |
|
|
@@ -399,7 +399,7 @@ def serve_wav():
|
|
| 399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
| 400 |
precomputed_style_vector=precomputed_style_vector,
|
| 401 |
voice=args.voice,
|
| 402 |
-
|
| 403 |
speed=args.speed)
|
| 404 |
)
|
| 405 |
total = np.concatenate(pieces, 0)
|
|
@@ -420,7 +420,7 @@ def serve_wav():
|
|
| 420 |
x = tts_multi_sentence(text=text,
|
| 421 |
precomputed_style_vector=precomputed_style_vector,
|
| 422 |
voice=args.voice,
|
| 423 |
-
|
| 424 |
speed=args.speed)
|
| 425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
| 426 |
|
|
@@ -439,7 +439,7 @@ def serve_wav():
|
|
| 439 |
x = tts_multi_sentence(text=text,
|
| 440 |
precomputed_style_vector=precomputed_style_vector,
|
| 441 |
voice=args.voice,
|
| 442 |
-
|
| 443 |
speed=args.speed
|
| 444 |
)
|
| 445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
|
@@ -468,7 +468,7 @@ def serve_wav():
|
|
| 468 |
x = tts_multi_sentence(text=text,
|
| 469 |
precomputed_style_vector=precomputed_style_vector,
|
| 470 |
voice=args.voice,
|
| 471 |
-
|
| 472 |
speed=args.speed)
|
| 473 |
OUT_FILE = 'tmp.wav'
|
| 474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
|
|
|
| 18 |
from moviepy.editor import *
|
| 19 |
from audiocraft.builders import AudioGen
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
+
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
| 22 |
|
| 23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
| 24 |
|
|
|
|
| 82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
| 83 |
return x #* fade_in # silence this
|
| 84 |
|
| 85 |
+
def overlay(x, soundscape=None):
|
| 86 |
|
| 87 |
+
if soundscape is not None:
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
+
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
|
| 91 |
background = sound_generator.generate(
|
| 92 |
+
[soundscape] * NUM_SOUND_GENERATIONS
|
| 93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
| 94 |
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
|
|
|
| 113 |
# background = _shift(background)
|
| 114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
| 116 |
+
x = .6 * x + .4 * background[:len(x)]
|
| 117 |
else:
|
| 118 |
print('sound_background = None')
|
| 119 |
return x
|
|
|
|
| 121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
| 122 |
text=None,
|
| 123 |
voice=None,
|
| 124 |
+
soundscape=None,
|
| 125 |
speed=None):
|
| 126 |
'''create 24kHZ np.array with tts
|
| 127 |
|
|
|
|
| 129 |
to perform affective TTS.
|
| 130 |
text : string
|
| 131 |
voice : string or None (falls to styleTTS)
|
| 132 |
+
soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
|
| 133 |
'''
|
| 134 |
|
| 135 |
|
|
|
|
| 161 |
|
| 162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
| 163 |
|
| 164 |
+
return overlay(x, soundscape=soundscape)
|
| 165 |
|
| 166 |
|
| 167 |
|
|
|
|
| 201 |
affective = r.get('affective')[0],
|
| 202 |
voice = r.get('voice')[0],
|
| 203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
| 204 |
+
soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
|
| 205 |
)
|
| 206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
| 207 |
|
|
|
|
| 399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
| 400 |
precomputed_style_vector=precomputed_style_vector,
|
| 401 |
voice=args.voice,
|
| 402 |
+
soundscape=args.soundscape,
|
| 403 |
speed=args.speed)
|
| 404 |
)
|
| 405 |
total = np.concatenate(pieces, 0)
|
|
|
|
| 420 |
x = tts_multi_sentence(text=text,
|
| 421 |
precomputed_style_vector=precomputed_style_vector,
|
| 422 |
voice=args.voice,
|
| 423 |
+
soundscape=args.soundscape,
|
| 424 |
speed=args.speed)
|
| 425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
| 426 |
|
|
|
|
| 439 |
x = tts_multi_sentence(text=text,
|
| 440 |
precomputed_style_vector=precomputed_style_vector,
|
| 441 |
voice=args.voice,
|
| 442 |
+
soundscape=args.soundscape,
|
| 443 |
speed=args.speed
|
| 444 |
)
|
| 445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
|
|
|
| 468 |
x = tts_multi_sentence(text=text,
|
| 469 |
precomputed_style_vector=precomputed_style_vector,
|
| 470 |
voice=args.voice,
|
| 471 |
+
soundscape=args.soundscape,
|
| 472 |
speed=args.speed)
|
| 473 |
OUT_FILE = 'tmp.wav'
|
| 474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
msinference.py
CHANGED
|
@@ -373,13 +373,16 @@ class TextForeign(object):
|
|
| 373 |
def foreign(text=None, # list of text
|
| 374 |
lang='romanian',
|
| 375 |
speed=None):
|
|
|
|
|
|
|
|
|
|
| 376 |
# https://huggingface.co/spaces/mms-meta/MMS
|
| 377 |
|
| 378 |
-
if 'hun' in lang
|
| 379 |
|
| 380 |
lang_code = 'hun'
|
| 381 |
|
| 382 |
-
elif 'ser' in lang
|
| 383 |
|
| 384 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
| 385 |
|
|
@@ -389,14 +392,22 @@ def foreign(text=None, # list of text
|
|
| 389 |
|
| 390 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
| 391 |
|
| 392 |
-
elif 'rom' in lang
|
| 393 |
|
| 394 |
lang_code = 'ron'
|
| 395 |
speed = 1.24 if speed is None else speed
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
else:
|
|
|
|
| 398 |
lang_code = lang.split()[0].strip()
|
| 399 |
-
|
|
|
|
|
|
|
| 400 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
| 401 |
vocab_file = hf_hub_download(
|
| 402 |
repo_id="facebook/mms-tts",
|
|
@@ -444,8 +455,10 @@ def foreign(text=None, # list of text
|
|
| 444 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 445 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 446 |
|
| 447 |
-
_t = _t.lower().replace("ţ", "ț").replace('ț','ts')
|
|
|
|
| 448 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
|
|
|
| 449 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
| 450 |
stn_tst = text_mapper.get_text(_t, hps)
|
| 451 |
with torch.no_grad():
|
|
@@ -464,16 +477,11 @@ def foreign(text=None, # list of text
|
|
| 464 |
|
| 465 |
x /= np.abs(x).max() + 1e-7
|
| 466 |
|
| 467 |
-
#
|
| 468 |
-
# x = hyp #, text
|
| 469 |
-
print(x.shape, x.min(), x.max(), hps.data.sampling_rate) # (hps.data.sampling_rate,
|
| 470 |
|
| 471 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 472 |
original_rate=16000,
|
| 473 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
return x
|
| 478 |
|
| 479 |
|
|
|
|
| 373 |
def foreign(text=None, # list of text
|
| 374 |
lang='romanian',
|
| 375 |
speed=None):
|
| 376 |
+
|
| 377 |
+
lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
|
| 378 |
+
|
| 379 |
# https://huggingface.co/spaces/mms-meta/MMS
|
| 380 |
|
| 381 |
+
if 'hun' in lang:
|
| 382 |
|
| 383 |
lang_code = 'hun'
|
| 384 |
|
| 385 |
+
elif 'ser' in lang:
|
| 386 |
|
| 387 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
| 388 |
|
|
|
|
| 392 |
|
| 393 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
| 394 |
|
| 395 |
+
elif 'rom' in lang:
|
| 396 |
|
| 397 |
lang_code = 'ron'
|
| 398 |
speed = 1.24 if speed is None else speed
|
| 399 |
|
| 400 |
+
elif 'ger' in lang:
|
| 401 |
+
|
| 402 |
+
lang_code = 'deu'
|
| 403 |
+
speed = 1.14 if speed is None else speed
|
| 404 |
+
|
| 405 |
else:
|
| 406 |
+
|
| 407 |
lang_code = lang.split()[0].strip()
|
| 408 |
+
|
| 409 |
+
# Load VITS
|
| 410 |
+
|
| 411 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
| 412 |
vocab_file = hf_hub_download(
|
| 413 |
repo_id="facebook/mms-tts",
|
|
|
|
| 455 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 456 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 457 |
|
| 458 |
+
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
| 459 |
+
|
| 460 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
| 461 |
+
|
| 462 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
| 463 |
stn_tst = text_mapper.get_text(_t, hps)
|
| 464 |
with torch.no_grad():
|
|
|
|
| 477 |
|
| 478 |
x /= np.abs(x).max() + 1e-7
|
| 479 |
|
| 480 |
+
# print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
|
|
|
|
|
|
|
| 481 |
|
| 482 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 483 |
original_rate=16000,
|
| 484 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
|
|
|
|
|
|
|
|
|
| 485 |
return x
|
| 486 |
|
| 487 |
|
tts.py
CHANGED
|
@@ -42,6 +42,14 @@ def command_line_args():
|
|
| 42 |
default='sample.txt',
|
| 43 |
type=str,
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
parser.add_argument(
|
| 46 |
'--native',
|
| 47 |
help="""
|
|
@@ -83,21 +91,24 @@ def command_line_args():
|
|
| 83 |
|
| 84 |
def send_to_server(args):
|
| 85 |
url = "http://192.168.88.209:5000"
|
|
|
|
|
|
|
| 86 |
|
| 87 |
payload = {
|
| 88 |
'affective': args.affective,
|
| 89 |
'voice': args.voice,
|
|
|
|
| 90 |
'native': args.native,
|
| 91 |
'text': args.text,
|
| 92 |
'image': args.image,
|
| 93 |
'video': args.video,
|
| 94 |
'speed': args.speed,
|
|
|
|
| 95 |
# 'out_file': args.out_file # let serve save as temp
|
| 96 |
}
|
| 97 |
|
| 98 |
-
#
|
| 99 |
-
|
| 100 |
-
# In files= sent actual files if provided
|
| 101 |
text_file = open(args.text, 'rb')
|
| 102 |
|
| 103 |
image_file, video_file, native_file = None, None, None
|
|
@@ -107,7 +118,6 @@ def send_to_server(args):
|
|
| 107 |
image_file = open(args.image, 'rb')
|
| 108 |
except FileNotFoundError:
|
| 109 |
pass
|
| 110 |
-
|
| 111 |
|
| 112 |
if args.video is not None:
|
| 113 |
print('\nLOADING vid\n')
|
|
@@ -122,14 +132,10 @@ def send_to_server(args):
|
|
| 122 |
native_file = open(args.native, 'rb')
|
| 123 |
except FileNotFoundError:
|
| 124 |
pass
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
# --------------------- send this extra
|
| 129 |
|
| 130 |
-
#
|
| 131 |
|
| 132 |
-
response = requests.post(url, data=payload,
|
| 133 |
files=[(args.text, text_file),
|
| 134 |
(args.image, image_file),
|
| 135 |
(args.video, video_file),
|
|
|
|
| 42 |
default='sample.txt',
|
| 43 |
type=str,
|
| 44 |
)
|
| 45 |
+
parser.add_argument(
|
| 46 |
+
'--soundscape',
|
| 47 |
+
help='soundscape - MUST BE IN BRACKETS: \"forest\"',
|
| 48 |
+
default='wind fjord',
|
| 49 |
+
nargs='?',
|
| 50 |
+
type=str,
|
| 51 |
+
const=None,
|
| 52 |
+
)
|
| 53 |
parser.add_argument(
|
| 54 |
'--native',
|
| 55 |
help="""
|
|
|
|
| 91 |
|
| 92 |
def send_to_server(args):
|
| 93 |
url = "http://192.168.88.209:5000"
|
| 94 |
+
|
| 95 |
+
# Args
|
| 96 |
|
| 97 |
payload = {
|
| 98 |
'affective': args.affective,
|
| 99 |
'voice': args.voice,
|
| 100 |
+
'soundscape': args.soundscape,
|
| 101 |
'native': args.native,
|
| 102 |
'text': args.text,
|
| 103 |
'image': args.image,
|
| 104 |
'video': args.video,
|
| 105 |
'speed': args.speed,
|
| 106 |
+
|
| 107 |
# 'out_file': args.out_file # let serve save as temp
|
| 108 |
}
|
| 109 |
|
| 110 |
+
# Send Files
|
| 111 |
+
|
|
|
|
| 112 |
text_file = open(args.text, 'rb')
|
| 113 |
|
| 114 |
image_file, video_file, native_file = None, None, None
|
|
|
|
| 118 |
image_file = open(args.image, 'rb')
|
| 119 |
except FileNotFoundError:
|
| 120 |
pass
|
|
|
|
| 121 |
|
| 122 |
if args.video is not None:
|
| 123 |
print('\nLOADING vid\n')
|
|
|
|
| 132 |
native_file = open(args.native, 'rb')
|
| 133 |
except FileNotFoundError:
|
| 134 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
+
#
|
| 137 |
|
| 138 |
+
response = requests.post(url, data=payload, # contains str
|
| 139 |
files=[(args.text, text_file),
|
| 140 |
(args.image, image_file),
|
| 141 |
(args.video, video_file),
|