# Llama-3B-Mono-Ceylia
Llama-3B-Mono-Ceylia is a Llama-based speech LLM designed for high-quality, empathetic text-to-speech generation. Fine-tuned from the base model canopylabs/orpheus-3b-0.1-ft on mono recordings of a single female voice, 'Ceylia', it delivers human-like speech synthesis with strong clarity, expressiveness, and real-time streaming performance.
 
In some cases, the results may be inconsistent, particularly when handling complex speech transformations.
 
## Model Details
 
- **Base Model:** canopylabs/orpheus-3b-0.1-ft
- **Languages Supported:** English
- **License:** Llama 3.2
- **Model Version:** N/A
 
 
## Paralinguistic Elements
 
The model can generate speech with the following paralinguistic elements:

| Elements | Elements | Elements       |
|----------|----------|----------------|
| laugh    | chuckle  | sigh           |
| sniffle  | groan    | yawn           |
| gasp     | uhm      | giggles & more |
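These are typically embedded inline in the input text. The sketch below assumes the angle-bracket tag convention of the Orpheus base model; the exact syntax accepted by this fine-tune may differ:

```python
# Assumed Orpheus-style inline tags; verify the syntax against this fine-tune.
text = "I really didn't see that coming <laugh> you got me good <sigh>."
```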
 
## Run with Transformers 🤗
 
Authenticate with the Hugging Face Hub (needed if the checkpoint requires gated access):

```python
from huggingface_hub import notebook_login

notebook_login()
```
 
### Install Dependencies
 
```python
%%capture
!pip install snac accelerate
!pip install transformers
!pip install gradio
```
 
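Both the model and the SNAC decoder are placed on CUDA in the code below, so it is worth confirming a GPU is visible first:

```python
import torch

# The usage example moves the model and decoder to CUDA, so a GPU is required.
assert torch.cuda.is_available(), "A CUDA-capable GPU is required for this demo."
print(torch.cuda.get_device_name(0))
```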
### Usage

The prompt wraps the input text in the control tokens expected by the Orpheus base model; generated audio tokens start at ID 128266 (subtracted back out in `redistribute_codes`) and generation stops at the EOS token ID 128258.
 
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from snac import SNAC


def redistribute_codes(row):
    """
    Convert a sequence of generated token IDs into an audio waveform using SNAC.
    Each group of 7 tokens encodes one SNAC frame across the three codebook layers.
    """
    # Trim to a whole number of 7-token frames.
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    # Shift token IDs back into codebook space (audio tokens start at ID 128266).
    code_list = [t - 128266 for t in trimmed_row]

    layer_1, layer_2, layer_3 = [], [], []

    # Demultiplex each frame: 1 coarse code, 2 medium codes, 4 fine codes.
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i][None])
        layer_2.append(code_list[7 * i + 1][None] - 4096)
        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))

    with torch.no_grad():
        codes = [
            torch.cat(layer_1),
            torch.cat(layer_2),
            torch.cat(layer_3),
        ]
        # Clamp stray negative codes and add a batch dimension for the decoder.
        for i in range(len(codes)):
            codes[i][codes[i] < 0] = 0
            codes[i] = codes[i][None]

        audio_hat = snac_model.decode(codes)
        return audio_hat.cpu()[0, 0]


snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia')
model = AutoModelForCausalLM.from_pretrained(
    'prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16
).cuda()


def generate_audio(text, temperature, top_p, max_new_tokens):
    """Given input text, generate speech audio as a (sample_rate, waveform) tuple."""
    speaker = "Ceylia"
    prompt = f'<custom_token_3><|begin_of_text|>{speaker}: {text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')

    with torch.no_grad():
        generated_ids = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=128258,
        )

    # Keep only the newly generated tokens (drop the prompt).
    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
    y_tensor = redistribute_codes(row)
    y_np = y_tensor.detach().cpu().numpy()
    return (24000, y_np)


with gr.Blocks() as demo:
    gr.Markdown("# Llama-3B-Mono-Ceylia - Single Speaker Audio Generation")
    gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Ceylia` model.")

    with gr.Row():
        text_input = gr.Textbox(lines=4, label="Input Text")

    with gr.Row():
        temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
        tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")

    output_audio = gr.Audio(type="numpy", label="Generated Audio")
    generate_button = gr.Button("Generate Audio")

    generate_button.click(
        fn=generate_audio,
        inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()
```
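`generate_audio` can also be called directly, outside the Gradio UI. A minimal sketch, assuming the `soundfile` package (not installed above) for writing the result to disk:

```python
import soundfile as sf  # assumed extra dependency: pip install soundfile

# Same defaults as the UI sliders above.
sample_rate, waveform = generate_audio(
    "Hello! It's lovely to meet you.",
    temperature=0.9,
    top_p=0.8,
    max_new_tokens=1200,
)
sf.write("ceylia_sample.wav", waveform, sample_rate)  # 24 kHz mono WAV
```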
 
Alternatively, run the same pipeline without the speaker-name prefix; only the `prompt` line in `generate_audio` changes:
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from snac import SNAC


def redistribute_codes(row):
    """
    Convert a sequence of generated token IDs into an audio waveform using SNAC.
    Each group of 7 tokens encodes one SNAC frame across the three codebook layers.
    """
    # Trim to a whole number of 7-token frames.
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    # Shift token IDs back into codebook space (audio tokens start at ID 128266).
    code_list = [t - 128266 for t in trimmed_row]

    layer_1, layer_2, layer_3 = [], [], []

    # Demultiplex each frame: 1 coarse code, 2 medium codes, 4 fine codes.
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7 * i][None])
        layer_2.append(code_list[7 * i + 1][None] - 4096)
        layer_3.append(code_list[7 * i + 2][None] - (2 * 4096))
        layer_3.append(code_list[7 * i + 3][None] - (3 * 4096))
        layer_2.append(code_list[7 * i + 4][None] - (4 * 4096))
        layer_3.append(code_list[7 * i + 5][None] - (5 * 4096))
        layer_3.append(code_list[7 * i + 6][None] - (6 * 4096))

    with torch.no_grad():
        codes = [
            torch.cat(layer_1),
            torch.cat(layer_2),
            torch.cat(layer_3),
        ]
        # Clamp stray negative codes and add a batch dimension for the decoder.
        for i in range(len(codes)):
            codes[i][codes[i] < 0] = 0
            codes[i] = codes[i][None]

        audio_hat = snac_model.decode(codes)
        return audio_hat.cpu()[0, 0]


snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")
tokenizer = AutoTokenizer.from_pretrained('prithivMLmods/Llama-3B-Mono-Ceylia')
model = AutoModelForCausalLM.from_pretrained(
    'prithivMLmods/Llama-3B-Mono-Ceylia', torch_dtype=torch.bfloat16
).cuda()


def generate_audio(text, temperature, top_p, max_new_tokens):
    """Generate speech audio from text, without a speaker-name prefix in the prompt."""
    prompt = f'<custom_token_3><|begin_of_text|>{text}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>'
    input_ids = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to('cuda')

    with torch.no_grad():
        generated_ids = model.generate(
            **input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=128258,
        )

    # Keep only the newly generated tokens (drop the prompt).
    row = generated_ids[0, input_ids['input_ids'].shape[1]:]
    y_tensor = redistribute_codes(row)
    y_np = y_tensor.detach().cpu().numpy()
    return (24000, y_np)


with gr.Blocks() as demo:
    gr.Markdown("# Llama-3B-Mono-Ceylia - Single Speaker Audio Generation")
    gr.Markdown("Generate speech audio using the `prithivMLmods/Llama-3B-Mono-Ceylia` model.")

    with gr.Row():
        text_input = gr.Textbox(lines=4, label="Input Text")

    with gr.Row():
        temp_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=0.9, label="Temperature")
        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.8, label="Top-p")
        tokens_slider = gr.Slider(minimum=100, maximum=2000, step=50, value=1200, label="Max New Tokens")

    output_audio = gr.Audio(type="numpy", label="Generated Audio")
    generate_button = gr.Button("Generate Audio")

    generate_button.click(
        fn=generate_audio,
        inputs=[text_input, temp_slider, top_p_slider, tokens_slider],
        outputs=output_audio
    )

if __name__ == "__main__":
    demo.launch()
```
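Because `do_sample=True`, each call produces a slightly different rendition. For repeatable output, seed PyTorch's RNG before generating:

```python
import torch

torch.manual_seed(42)  # seeds the CPU and CUDA RNGs used during sampling
sr, wav = generate_audio("Testing reproducibility.", 0.9, 0.8, 1200)
```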
 
 
## Intended Use
 
- Designed for high-quality, single-speaker text-to-speech generation.
- Ideal for applications requiring human-like speech synthesis.
- Supports a range of emotions for expressive speech output.
- Suitable for AI voice assistants, storytelling, and accessibility applications.