# Install dependencies in application code, as we don't have access to a GPU at build time
# Thanks to https://huggingface.co/Steveeeeeeen for their code to handle this!
import os
import shlex
import subprocess

subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
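# NOTE: the prebuilt wheels above are pinned to Python 3.10, CUDA 12, and
# torch 2.4 (see the cp310/cu12torch2.4 tags in the filenames); if the Space's
# runtime image changes, these URLs must be updated to match.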

import spaces
import gradio as gr
import numpy as np
from typing import Tuple, Dict, Any, Optional
from taproot import Task

# Configuration
is_hf_spaces = os.getenv("SYSTEM", "") == "spaces"
max_characters = 2000
header_markdown = """
# Zonos v0.1

State of the art text-to-speech model [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f). [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1), [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)

## Unleashed

Use this space to generate long-form speech up to roughly two minutes in length. To generate speech of unlimited length, clone this space and run it locally.

### Tips
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The cleaner the speaker audio, the better the speaker conditioning will be. Speaker audio is only sampled at 16kHz, however, so you do not need to provide high-bitrate recordings. Prefix audio, in contrast, should be high quality, as it is sampled at the full 44.1kHz.
- The appropriate ranges for Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
- The emotion sliders do not always behave intuitively, and require some experimentation to achieve the desired effect.
""".strip()

# Create pipelines, downloading required files as necessary
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load()

transformer_task = Task.get(
    "speech-synthesis", model="zonos-transformer", available_only=False
)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()
if is_hf_spaces:
    # Must load all models on GPU when using ZERO
    transformer_pipe.load()
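
# Note the asymmetry: the hybrid pipeline is always loaded eagerly, while the
# transformer pipeline is only pre-loaded on ZeroGPU (where pipelines cannot be
# swapped per-request); when running locally, update_ui() below loads and
# unloads pipelines on demand as the user switches variants.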

# Global state
pipelines = {
    "Zonos Transformer v0.1": transformer_pipe,
    "Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages  # Same for both pipes
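
# Dictionary order matters here: pipeline_names[0] ("Zonos Transformer v0.1")
# becomes the default selection in the model dropdown below.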

# Model toggle
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
    """
    Dynamically show/hide UI elements based on the model's conditioners.
    """
    if not is_hf_spaces:
        # When not using ZERO, we can onload/offload pipes
        for pipeline_name, pipeline in pipelines.items():
            if pipeline_name == pipeline_choice:
                pipeline.load()
            else:
                pipeline.unload()
    pipe = pipelines[pipeline_choice]
    cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
    vqscore_update = gr.update(visible=("vqscore_8" in cond_names))
    emotion_update = gr.update(visible=("emotion" in cond_names))
    fmax_update = gr.update(visible=("fmax" in cond_names))
    pitch_update = gr.update(visible=("pitch_std" in cond_names))
    speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
    dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
    speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))
    return (
        vqscore_update,
        emotion_update,
        fmax_update,
        pitch_update,
        speaking_rate_update,
        dnsmos_update,
        speaker_noised_update,
    )

# Invocation method
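# Assumption: under ZeroGPU, GPU work must run inside a function decorated with
# @spaces.GPU (this is the only use of the `spaces` import above); the
# decorator is a no-op when running outside of a Hugging Face Space.
@spaces.GPU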
def generate_audio(
    pipeline_choice: str,
    text: str,
    language: str,
    speaker_audio: Optional[str],
    prefix_audio: Optional[str],
    e1: float,
    e2: float,
    e3: float,
    e4: float,
    e5: float,
    e6: float,
    e7: float,
    e8: float,
    vq_single: float,
    fmax: float,
    pitch_std: float,
    speaking_rate: float,
    dnsmos_ovrl: float,
    speaker_noised: bool,
    cfg_scale: float,
    min_p: float,
    seed: int,
    max_chunk_length: int,
    cross_fade_duration: float,
    punctuation_pause_duration: float,
    target_rms: float,
    randomize_seed: bool,
    skip_dnsmos: bool,
    skip_vqscore: bool,
    skip_fmax: bool,
    skip_pitch: bool,
    skip_speaking_rate: bool,
    skip_emotion: bool,
    skip_speaker: bool,
    progress=gr.Progress(),
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
    """
    Generates audio based on the provided UI parameters.
    """
    selected_pipeline = pipelines[pipeline_choice]
    if randomize_seed:
        seed = np.random.randint(0, 2**32)

    # Forward pipeline progress updates to the Gradio progress bar
    def on_progress(step: int, total: int) -> None:
        progress((step, total))

    selected_pipeline.on_progress(on_progress)
    try:
        wav_out = selected_pipeline(
            text=text,
            language=language,
            reference_audio=speaker_audio,
            prefix_audio=prefix_audio,
            seed=seed,
            max_chunk_length=max_chunk_length,
            cross_fade_duration=cross_fade_duration,
            punctuation_pause_duration=punctuation_pause_duration,
            target_rms=target_rms,
            cfg_scale=cfg_scale,
            min_p=min_p,
            fmax=fmax,
            pitch_std=pitch_std,
            emotion_happiness=e1,
            emotion_sadness=e2,
            emotion_disgust=e3,
            emotion_fear=e4,
            emotion_surprise=e5,
            emotion_anger=e6,
            emotion_other=e7,
            emotion_neutral=e8,
            speaking_rate=speaking_rate,
            vq_score=vq_single,
            speaker_noised=speaker_noised,
            dnsmos=dnsmos_ovrl,
            skip_speaker=skip_speaker,
            skip_dnsmos=skip_dnsmos,
            skip_vq_score=skip_vqscore,
            skip_fmax=skip_fmax,
            skip_pitch=skip_pitch,
            skip_speaking_rate=skip_speaking_rate,
            skip_emotion=skip_emotion,
            output_format="float",
        )
        return (44100, wav_out.squeeze().numpy()), seed
    finally:
        selected_pipeline.off_progress()
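
# generate_audio returns ((sample_rate, waveform), seed): the audio tuple is
# what gr.Audio(type="numpy") expects, and the seed is echoed back so the UI
# shows the value actually used when "Randomize Seed" is checked.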

# Interface
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown(header_markdown)
            gr.Image(
                value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
                container=False,
                interactive=False,
                show_label=False,
                show_share_button=False,
                show_fullscreen_button=False,
                show_download_button=False,
            )
        with gr.Row(equal_height=True):
            pipeline_choice = gr.Dropdown(
                choices=pipeline_names,
                value=pipeline_names[0],
                label="Zonos Model Variant",
            )
            language = gr.Dropdown(
                choices=supported_language_codes,
                value="en-us",
                label="Language",
            )
        with gr.Row():
            if not is_hf_spaces:
                limit_text = "Unlimited"
            else:
                limit_text = f"Up to {max_characters}"
            text = gr.Textbox(
                label=f"Speech Text ({limit_text} Characters)",
                value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
                lines=4,
                max_lines=20,
                max_length=max_characters if is_hf_spaces else None,
            )
        with gr.Row():
            generate_button = gr.Button("Generate Audio")
        with gr.Row():
            output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)

        with gr.Row():
            gr.Markdown("## Long-Form Parameters")
        with gr.Column(variant="panel"):
            with gr.Row(equal_height=True):
                max_chunk_length = gr.Slider(
                    1, 300, 150, 1,
                    label="Max Chunk Length (Characters)",
                    info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so.",
                )
                target_rms = gr.Slider(
                    0.0, 1.0, 0.10, 0.01,
                    label="Target RMS",
                    info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels.",
                )
            with gr.Row(equal_height=True):
                punctuation_pause_duration = gr.Slider(
                    0, 1, 0.10, 0.01,
                    label="Punctuation Pause Duration (Seconds)",
                    info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) receives this entire duration, while other punctuation receives half of it.",
                )
                cross_fade_duration = gr.Slider(
                    0, 1, 0.15, 0.01,
                    label="Chunk Cross-Fade Duration (Seconds)",
                    info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration.",
                )

        with gr.Row():
            gr.Markdown("## Generation Parameters")
        with gr.Row(variant="panel", equal_height=True):
            with gr.Column():
                prefix_audio = gr.Audio(
                    label="Optional Prefix Audio (continue from this audio)",
                    type="filepath",
                )
            with gr.Column(scale=3):
                cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=6475309, precision=0)
                randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
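
        # Note: seed_number is also wired as an output of the generate handler
        # below, so when "Randomize Seed" is checked the box is overwritten
        # with the seed that was actually used.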
        with gr.Row():
            gr.Markdown(
                "## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
            )
        with gr.Row(variant="panel", equal_height=True) as speaker_row:
            with gr.Column():
                speaker_uncond = gr.Checkbox(label="Skip Speaker")
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker", value=False)
            speaker_audio = gr.Audio(
                label="Optional Speaker Audio (for cloning)",
                type="filepath",
                scale=3,
            )
        with gr.Row(variant="panel", equal_height=True) as emotion_row:
            emotion_uncond = gr.Checkbox(label="Skip Emotion")
            with gr.Column(scale=3):
                with gr.Row():
                    emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
                    emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
                    emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
                    emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
                with gr.Row():
                    emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
                    emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
                    emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
                    emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
        with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
            dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
            dnsmos_slider = gr.Slider(
                1.0,
                5.0,
                value=4.0,
                step=0.1,
                label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
                scale=3,
            )
        with gr.Row(variant="panel", equal_height=True) as vq_score_row:
            vq_uncond = gr.Checkbox(label="Skip VQScore")
            vq_single_slider = gr.Slider(
                0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as fmax_row:
            fmax_uncond = gr.Checkbox(label="Skip Fmax")
            fmax_slider = gr.Slider(
                0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as pitch_row:
            pitch_uncond = gr.Checkbox(label="Skip Pitch")
            pitch_std_slider = gr.Slider(
                0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
            )
        with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
            speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
            speaking_rate_slider = gr.Slider(
                5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
            )
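
        # update_ui returns seven gr.update() objects; the outputs listed in
        # the two bindings below must match that order exactly (VQScore,
        # emotion, fmax, pitch, speaking rate, DNSMOS, speaker-noised).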
        pipeline_choice.change(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )

        # Trigger UI update on load
        demo.load(
            fn=update_ui,
            inputs=[pipeline_choice],
            outputs=[
                vq_score_row,
                emotion_row,
                fmax_row,
                pitch_row,
                speaking_rate_row,
                dnsmos_row,
                speaker_noised_checkbox,
            ],
        )

        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                pipeline_choice,
                text,
                language,
                speaker_audio,
                prefix_audio,
                emotion1,
                emotion2,
                emotion3,
                emotion4,
                emotion5,
                emotion6,
                emotion7,
                emotion8,
                vq_single_slider,
                fmax_slider,
                pitch_std_slider,
                speaking_rate_slider,
                dnsmos_slider,
                speaker_noised_checkbox,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
                max_chunk_length,
                cross_fade_duration,
                punctuation_pause_duration,
                target_rms,
                randomize_seed_toggle,
                dnsmos_uncond,
                vq_uncond,
                fmax_uncond,
                pitch_uncond,
                speaking_rate_uncond,
                emotion_uncond,
                speaker_uncond,
            ],
            outputs=[output_audio, seed_number],
        )

    demo.launch()