# Vocos-Mel-24kHz: Mel-Spectrogram Neural Vocoder (Transformers-compatible version)
The Vocos model was proposed in [Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis](https://arxiv.org/abs/2306.00814).

This model generates 24 kHz audio from mel spectrograms. This checkpoint is a Transformers-compatible version of [charactr/vocos-mel-24khz](https://huggingface.co/charactr/vocos-mel-24khz).
Audio samples are provided below.
## Example usage
```python
from datasets import Audio, load_dataset
from scipy.io.wavfile import write as write_wav
from transformers import VocosFeatureExtractor, VocosModel

# load model and feature extractor
model_id = "hf-audio/vocos-mel-24khz"
feature_extractor = VocosFeatureExtractor.from_pretrained(model_id)
model = VocosModel.from_pretrained(model_id, device_map="auto")
sampling_rate = feature_extractor.sampling_rate

# load audio sample
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio = ds[0]["audio"]["array"]

inputs = feature_extractor(audio=audio, sampling_rate=sampling_rate).to(model.device)
print(inputs.audio_spectrogram.shape)
# (batch, mel, frame): [1, 100, 549]

outputs = model(**inputs)
audio_vocos = outputs.audio
print(audio_vocos.shape)
# (batch, time): [1, 140288]

# save audio to file
write_wav("vocos_mel.wav", sampling_rate, audio_vocos[0].detach().cpu().numpy())
```
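The example above round-trips real audio through the feature extractor. In practice, a vocoder is usually fed mel spectrograms produced by an upstream model, such as a TTS acoustic model. Below is a minimal sketch of that direct path; it assumes the model's forward accepts the same `audio_spectrogram` tensor the feature extractor produces, and the random tensor only illustrates the expected shapes (a real input must use the same 100-bin mel configuration this checkpoint was trained with):

```python
import torch
from transformers import VocosModel

model = VocosModel.from_pretrained("hf-audio/vocos-mel-24khz")

# stand-in mel spectrogram with the (batch, mel, frame) layout printed above;
# a real input would come from e.g. a TTS acoustic model using the matching
# 100-bin mel configuration
mel = torch.randn(1, 100, 200)

with torch.no_grad():
    waveform = model(audio_spectrogram=mel).audio  # (batch, time) at 24 kHz
print(waveform.shape)
```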
## Audio samples

- Original
- Mel-based Vocos (this model)
- EnCodec-based Vocos ([hf-audio/vocos-encodec-24khz](https://huggingface.co/hf-audio/vocos-encodec-24khz))
## Batch processing
For batch processing, the `padding_mask` returned by `VocosFeatureExtractor` can be used to obtain outputs equivalent to processing each file on its own.
```python
from datasets import Audio, load_dataset
from scipy.io.wavfile import write as write_wav
from transformers import VocosFeatureExtractor, VocosModel

n_audio = 2  # number of audio samples to process in a batch

# load model and feature extractor
model_id = "hf-audio/vocos-mel-24khz"
feature_extractor = VocosFeatureExtractor.from_pretrained(model_id)
model = VocosModel.from_pretrained(model_id, device_map="auto")
sampling_rate = feature_extractor.sampling_rate

# load audio samples
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio = [audio_sample["array"] for audio_sample in ds[-n_audio:]["audio"]]
print(f"Input audio shape: {[_sample.shape for _sample in audio]}")
# Input audio shape: [(170760,), (107520,)]

# prepare batch
inputs = feature_extractor(audio=audio, sampling_rate=sampling_rate, device=model.device)
print(inputs.audio_spectrogram.shape)
# torch.Size([2, 100, 669])

# apply model
outputs = model(**inputs)
audio_vocos = outputs.audio
print(audio_vocos.shape)
# torch.Size([2, 171008])

# save audio to file
for i in range(n_audio):
    # remove padding
    padding_mask = inputs.padding_mask[i].bool()
    valid_audio = audio_vocos[i][padding_mask].detach().cpu().numpy()
    print(f"Output audio shape {i}: {valid_audio.shape}")
    # Output audio shape 0: (170760,)
    # Output audio shape 1: (107520,)
    write_wav(f"vocos_mel_{i}.wav", sampling_rate, valid_audio)

# save original audio to file
for i in range(n_audio):
    write_wav(f"original_{i}.wav", sampling_rate, audio[i])
```
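To see that batching with `padding_mask` really does match single-file processing, one can compare the trimmed batched output against a standalone run of the same sample. This is an illustrative check continuing from the script above, not part of the model card; the tolerance and the length alignment are assumptions:

```python
import torch

# run the first sample of the batch through the pipeline on its own
single_inputs = feature_extractor(audio=audio[0], sampling_rate=sampling_rate).to(model.device)
single_audio = model(**single_inputs).audio[0]

# trim the batched output for the same sample with its padding mask
batched_audio = audio_vocos[0][inputs.padding_mask[0].bool()]

# compare over the overlapping length, allowing small numerical differences
n = min(single_audio.shape[-1], batched_audio.shape[-1])
print(torch.allclose(single_audio[:n], batched_audio[:n], atol=1e-4))
```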