Updated the inference code. Added a notebook and a demo audio. (#1)
Browse files
- Updated the inference code. Added a notebook and a demo audio. (4d65b3767f874d49d374f3d164978816aba11308)
Co-authored-by: Yassine Ennaour <[email protected]>
README.md
CHANGED
|
@@ -42,29 +42,36 @@ python -m mlx_audio.tts.generate --model Marvis-AI/marvis-tts-250m-v0.1 --strea
|
|
| 42 |
|
| 43 |
## Using transformers
|
| 44 |
|
| 45 |
-
**Without Voice Cloning**
|
| 46 |
```python
|
| 47 |
import torch
|
| 48 |
from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
|
| 49 |
from tokenizers.processors import TemplateProcessing
|
| 50 |
import soundfile as sf
|
| 51 |
|
| 52 |
-
model_id = "Marvis-AI/marvis-tts-
|
| 53 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 54 |
|
| 55 |
# load the model and the processor
|
| 56 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 57 |
-
model = CsmForConditionalGeneration.from_pretrained(model_id
|
| 58 |
|
| 59 |
# prepare the inputs
|
| 60 |
text = "[0]Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices." # `[0]` for speaker id 0
|
| 61 |
-
inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device)
|
| 62 |
# infer the model
|
| 63 |
-
audio = model.generate(
|
| 64 |
sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")
|
| 65 |
|
| 66 |
```
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
# Model Description
|
| 70 |
|
|
|
|
| 42 |
|
| 43 |
## Using transformers
|
| 44 |
|
| 45 |
+
**Without Voice Cloning** ([Colab Notebook](https://colab.research.google.com/drive/1m9pdNFGlWMZW8gyXwkN9MNgbBEWP5lfO?usp=sharing))
|
| 46 |
```python
|
| 47 |
import torch
|
| 48 |
from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
|
| 49 |
from tokenizers.processors import TemplateProcessing
|
| 50 |
import soundfile as sf
|
| 51 |
|
| 52 |
+
model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"
|
| 53 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 54 |
|
| 55 |
# load the model and the processor
|
| 56 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 57 |
+
model = CsmForConditionalGeneration.from_pretrained(model_id).to(device)
|
| 58 |
|
| 59 |
# prepare the inputs
|
| 60 |
text = "[0]Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices." # `[0]` for speaker id 0
|
| 61 |
+
inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device)
|
| 62 |
# infer the model
|
| 63 |
+
audio = model.generate(input_ids=inputs['input_ids'], output_audio=True)
|
| 64 |
sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")
|
| 65 |
|
| 66 |
```
|
| 67 |
|
| 68 |
+
**Output:**
|
| 69 |
+
|
| 70 |
+
<audio controls>
|
| 71 |
+
<source src="https://audio.jukehost.co.uk/gqWAk28VaBoRaX3UPdnMBedGWgXLJ8Mt" type="audio/mpeg">
|
| 72 |
+
</audio>
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
|
| 76 |
# Model Description
|
| 77 |
|