Updated the inference code. Added a notebook and a demo audio. (#1)

- Updated the inference code. Added a notebook and a demo audio. (4d65b3767f874d49d374f3d164978816aba11308)

Co-authored-by: Yassine Ennaour <[email protected]>

Files changed (1) hide show

README.md CHANGED Viewed

@@ -42,29 +42,36 @@ python -m mlx_audio.tts.generate --model Marvis-AI/marvis-tts-250m-v0.1  --strea
 ## Using transformers
-**Without Voice Cloning**
 ```python
 import torch
 from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
 from tokenizers.processors import TemplateProcessing
 import soundfile as sf
-model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"
 device = "cuda"if torch.cuda.is_available() else "cpu"
 # load the model and the processor
 processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
 # prepare the inputs
 text = "[0]Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices." # `[0]` for speaker id 0
-inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device).pop("token_type_ids")
 # infer the model
-audio = model.generate(**inputs, output_audio=True)
 sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")
 ```
 # Model Description

 ## Using transformers
+**Without Voice Cloning** ([Colab Notebook](https://colab.research.google.com/drive/1m9pdNFGlWMZW8gyXwkN9MNgbBEWP5lfO?usp=sharing))
 ```python
 import torch
 from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
 from tokenizers.processors import TemplateProcessing
 import soundfile as sf
+model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"
 device = "cuda"if torch.cuda.is_available() else "cpu"
 # load the model and the processor
 processor = AutoProcessor.from_pretrained(model_id)
+model = CsmForConditionalGeneration.from_pretrained(model_id).to(device)
 # prepare the inputs
 text = "[0]Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices." # `[0]` for speaker id 0
+inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device)
 # infer the model
+audio = model.generate(input_ids=inputs['input_ids'], output_audio=True)
 sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")
 ```
+**Output:**
+<audio controls>
+  <source src="https://audio.jukehost.co.uk/gqWAk28VaBoRaX3UPdnMBedGWgXLJ8Mt" type="audio/mpeg">
+</audio>
+---
 # Model Description