Spaces:
Sleeping
Sleeping
| import io | |
| import os | |
| import math | |
| from queue import Queue | |
| from threading import Thread | |
| from typing import Optional | |
| import numpy as np | |
| import gradio as gr | |
| import torch | |
| import nltk | |
| from pydub import AudioSegment | |
| from transformers import AutoTokenizer, AutoFeatureExtractor | |
| from parler_tts import ParlerTTSForConditionalGeneration | |
# Sentence-tokenizer data required by nltk.sent_tokenize() in generate().
# Recent NLTK releases (3.9+) look up the "punkt_tab" resource instead of the
# older pickled "punkt" models, so fetch both to work with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Prefer CUDA, then Apple MPS, and fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# bfloat16 on any accelerator, full precision on CPU.
torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32

# Use only the pretrained model
repo_id = "ai4bharat/indic-parler-tts-pretrained"
model = ParlerTTSForConditionalGeneration.from_pretrained(
    repo_id, attn_implementation="eager", torch_dtype=torch_dtype,
).to(device)

# `tokenizer` encodes the text to be spoken; `description_tokenizer` encodes
# the free-form voice description (see their use in generate()).
tokenizer = AutoTokenizer.from_pretrained(repo_id)
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sample rate reported by the model's audio encoder; used when encoding MP3.
sampling_rate = model.audio_encoder.config.sampling_rate
# Example rows for the demo: [text to speak, voice description].
# Each row has exactly one value per input component wired into gr.Examples
# (the text box and the description box), matching generate(text, description).
examples = [
    [
        "मुले बागेत खेळत आहेत आणि पक्षी किलबिलाट करत आहेत.",
        "Sunita speaks slowly in a calm, moderate-pitched voice, delivering the news with a neutral tone. The recording is very high quality with no background noise.",
    ],
    [
        "This is the best time of my life, Bartley,' she said happily",
        "A male speaker with a low-pitched voice speaks with a British accent at a fast pace in a small, confined space with very clear audio and an animated tone.",
    ],
    [
        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
        "A female speaker with a slightly low-pitched, quite monotone voice speaks with an American accent at a slightly faster-than-average pace in a confined space with very clear audio.",
    ],
    [
        "बगीचे में बच्चे खेल रहे हैं और पक्षी चहचहा रहे हैं।",
        "Rohit speaks with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
    ],
]
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a single-channel waveform as MP3 and return the encoded bytes.

    Floating-point input is peak-normalised to the signed 16-bit range and
    converted to ``int16``. Input of any other dtype is handed to the encoder
    unchanged, using the dtype's item size as the sample width.

    Args:
        audio_array: NumPy array of samples, written as one channel.
        sampling_rate: Frame rate given to the audio segment.

    Returns:
        The MP3 data as ``bytes`` (exported at a 320k bitrate).
    """
    if np.issubdtype(audio_array.dtype, np.floating):
        # An empty clip has no maximum and a silent clip has a peak of zero;
        # scaling by that peak would raise or fill the buffer with NaNs, so
        # only normalise when there is a non-zero peak to scale by.
        peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
        if peak > 0:
            audio_array = audio_array / peak * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # The context manager releases the in-memory buffer once the bytes
    # have been copied out.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()
def _group_sentences(sentences, max_words):
    """Greedily pack consecutive sentences into chunks of fewer than *max_words* words.

    A single sentence that already reaches *max_words* becomes a chunk of its
    own. Empty sentences are skipped and no empty chunk is ever produced.
    """
    chunks = []
    current = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        candidate = f"{current} {sentence}" if current else sentence
        # Only flush when there is something to flush; otherwise an over-long
        # first sentence would emit an empty chunk.
        if current and len(candidate.split()) >= max_words:
            chunks.append(current)
            current = sentence
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def generate(text, description):
    """Synthesize *text* in the voice described by *description*.

    The text is split into sentences, grouped into short chunks, and each
    chunk is generated separately with the same description; the resulting
    waveforms are concatenated and encoded as MP3.

    Args:
        text: The text to be spoken.
        description: Free-form description of the speaker and recording.

    Returns:
        MP3-encoded audio as ``bytes``.

    Raises:
        gr.Error: If *text* is empty or the model returns no audio.
    """
    chunk_size = 25

    text = (text or "").strip()
    chunks = _group_sentences(nltk.sent_tokenize(text), chunk_size) if text else []
    if not chunks:
        raise gr.Error("Please enter some text to synthesize.")

    # The description is the same for every chunk, so tokenize it once.
    inputs = description_tokenizer(description, return_tensors="pt").to(device)

    all_audio = []
    for chunk in chunks:
        prompt = tokenizer(chunk, return_tensors="pt").to(device)
        with torch.no_grad():
            generation = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                prompt_input_ids=prompt.input_ids,
                prompt_attention_mask=prompt.attention_mask,
                do_sample=True,
                return_dict_in_generate=True,
            )
        if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
            # Keep only the valid samples of the first (only) batch item.
            audio = generation.sequences[0, :generation.audios_length[0]]
            # reshape(-1) always yields a 1-D array, so even a one-sample
            # result can be concatenated.
            all_audio.append(audio.to(torch.float32).cpu().numpy().reshape(-1))

    if not all_audio:
        raise gr.Error("The model did not return any audio for this input.")

    combined_audio = np.concatenate(all_audio)
    return numpy_to_mp3(combined_audio, sampling_rate=sampling_rate)
# Custom stylesheet handed to gr.Blocks(css=css) below.
# All rules target the element ids "share-btn-container" and "share-btn".
# NOTE(review): the layout defined in this file does not create elements with
# those ids, so these rules look like leftovers — confirm before removing.
css = """
#share-btn-container {
display: flex;
padding-left: 0.5rem !important;
padding-right: 0.5rem !important;
background-color: #000000;
justify-content: center;
align-items: center;
border-radius: 9999px !important;
width: 13rem;
margin-top: 10px;
margin-left: auto;
flex: unset !important;
}
#share-btn {
all: initial;
color: #ffffff;
font-weight: 600;
cursor: pointer;
font-family: 'IBM Plex Sans', sans-serif;
margin-left: 0.5rem !important;
padding-top: 0.25rem !important;
padding-bottom: 0.25rem !important;
right:0;
}
#share-btn * {
all: unset !important;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
"""
# Build the demo page: title, intro text, a two-column input/output row,
# clickable examples, and a footer with links.
with gr.Blocks(css=css) as block:
    # Page title.
    gr.HTML("""
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<h1 style="font-weight: 900; margin-bottom: 7px;">Parler-TTS 🗣️</h1>
</div>
""")
    # Short description of what the demo does.
    gr.HTML("""
<p>This demo uses the <b>Pretrained Indic Parler-TTS</b> model for expressive, controllable text-to-speech in Indian languages. Describe the speaker and context in the description box to guide the model.</p>
""")
    with gr.Row():
        # Left column: the two text inputs (pre-filled from the first example)
        # and the button that triggers generation.
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=examples[0][0])
            description = gr.Textbox(label="Description", lines=2, value=examples[0][1])
            run_button = gr.Button("Generate Audio", variant="primary")
        # Right column: player for the MP3 bytes returned by generate().
        with gr.Column():
            audio_out = gr.Audio(label="Generated Audio", format="mp3", autoplay=True)
    # Component lists shared by the examples widget and the button handler;
    # their order matches generate(text, description).
    inputs = [input_text, description]
    outputs = [audio_out]
    # cache_examples=False: example outputs are not pre-computed at startup.
    gr.Examples(examples=examples, fn=generate, inputs=inputs, outputs=outputs, cache_examples=False)
    run_button.click(fn=generate, inputs=inputs, outputs=outputs, queue=True)
    # Footer with links to the model card and the upstream repository.
    gr.HTML("""
<p>To learn more, visit the <a href="https://huggingface.co/ai4bharat/indic-parler-tts-pretrained">Indic Parler-TTS Pretrained</a> model card or check out the <a href="https://github.com/huggingface/parler-tts">Parler-TTS GitHub repo</a>.</p>
""")
# Enable request queuing, then start the server.
block.queue()
# NOTE(review): share=True asks Gradio for a public share link; this is
# presumably unnecessary when hosted on Hugging Face Spaces — confirm.
block.launch(share=True)