Adding train/evaluate/metrics files
- evaluate.py +32 -0
- metrics.py +57 -0
- train.py +59 -0
evaluate.py
ADDED
import numpy as np
import librosa

from utils.metrics import (
    calculate_msd,
    calculate_f0_correlation,
    calculate_phoneme_accuracy,  # needs phoneme sequences; see note below
    calculate_spectral_convergence
)
from inference import run_tts


def evaluate_bd_tts(model, test_dataset, sr=22050):
    """Synthesize each test sentence and average the objective metrics
    against the reference recordings."""
    msd, f0_corr, spec_conv = [], [], []

    for text, target_path in test_dataset:
        pred = run_tts(text)                          # synthesized waveform (assumed np.ndarray)
        target, _ = librosa.load(target_path, sr=sr)  # reference waveform

        # Trim both waveforms to the same length so the frame-level
        # metrics line up.
        n = min(len(pred), len(target))
        pred, target = pred[:n], target[:n]

        msd.append(calculate_msd(pred, target, sr=sr))
        f0_corr.append(calculate_f0_correlation(pred, target, sr=sr))
        spec_conv.append(calculate_spectral_convergence(pred, target))

    metrics = {
        'mel_spectral_distance': float(np.mean(msd)),
        'f0_correlation': float(np.mean(f0_corr)),
        'spectral_convergence': float(np.mean(spec_conv)),
    }

    # calculate_phoneme_accuracy compares phoneme sequences, not audio, so it
    # only applies once a Bangla phonemizer / forced aligner is wired in.
    # The accent classifier is usually a pretrained model.
    # Placeholder: you'd plug in your Bangla accent classifier here.
    metrics['accent_score'] = 0.85

    return metrics


if __name__ == "__main__":
    test_dataset = [("আমি বাংলা বলি।", "reference.wav")]  # dummy dataset
    print(evaluate_bd_tts(None, test_dataset))
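The accent score above is hard-coded. One way to make it real is a pretrained audio classifier; here is a minimal sketch using the transformers audio-classification pipeline, where the checkpoint id "your-org/bangla-accent-classifier" and the "bangla" label are hypothetical placeholders, not actual artifacts:

# Hypothetical accent scorer. The checkpoint id and the "bangla" label are
# placeholders for whatever accent classifier the project actually trains.
import numpy as np
from transformers import pipeline

accent_clf = pipeline("audio-classification",
                      model="your-org/bangla-accent-classifier")  # placeholder id

def accent_score(waveforms, sr=22050):
    """Mean probability the classifier assigns to the 'bangla' label."""
    scores = []
    for wav in waveforms:
        preds = accent_clf({"raw": wav, "sampling_rate": sr})
        scores.append(next((p["score"] for p in preds if p["label"] == "bangla"), 0.0))
    return float(np.mean(scores)) if scores else 0.0

With something like this in place, the placeholder line in evaluate_bd_tts would become metrics['accent_score'] = accent_score(pred_waveforms, sr).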
metrics.py
ADDED
# utils/metrics.py
import numpy as np
import librosa


def calculate_msd(pred_audio, target_audio, sr=22050):
    """
    Mel Spectral Distance (MSD) between predicted and target audio.
    Both waveforms must have the same length so the mel frames align.
    """
    # Convert to mel spectrograms
    pred_mel = librosa.feature.melspectrogram(y=pred_audio, sr=sr)
    target_mel = librosa.feature.melspectrogram(y=target_audio, sr=sr)

    # Convert to dB
    pred_db = librosa.power_to_db(pred_mel, ref=np.max)
    target_db = librosa.power_to_db(target_mel, ref=np.max)

    # Mean squared difference
    return np.mean((pred_db - target_db) ** 2)


def calculate_f0_correlation(pred_audio, target_audio, sr=22050):
    """
    Pitch (F0) correlation between predicted and target audio.
    """
    f0_pred, _, _ = librosa.pyin(pred_audio, fmin=50, fmax=500, sr=sr)
    f0_target, _, _ = librosa.pyin(target_audio, fmin=50, fmax=500, sr=sr)

    # Keep only frames where both signals are voiced
    # (pyin returns NaN for unvoiced frames)
    mask = ~np.isnan(f0_pred) & ~np.isnan(f0_target)
    if np.sum(mask) == 0:
        return 0.0
    return np.corrcoef(f0_pred[mask], f0_target[mask])[0, 1]


def calculate_phoneme_accuracy(pred_phonemes, target_phonemes):
    """
    Simple phoneme accuracy: the fraction of positions where the predicted
    and target phoneme sequences agree (both are lists of symbols).
    """
    if len(target_phonemes) == 0:
        return 0.0
    correct = sum(p == t for p, t in zip(pred_phonemes, target_phonemes))
    return correct / len(target_phonemes)


def calculate_spectral_convergence(pred_audio, target_audio):
    """
    Spectral convergence: Frobenius-norm distance between the predicted and
    target magnitude spectra, relative to the target's norm (0 = identical).
    """
    pred_spec = np.abs(librosa.stft(pred_audio))
    target_spec = np.abs(librosa.stft(target_audio))

    numerator = np.linalg.norm(target_spec - pred_spec, 'fro')
    denominator = np.linalg.norm(target_spec, 'fro')

    return numerator / (denominator + 1e-8)
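As a quick sanity check, the audio-level metrics can be exercised on a synthetic gliding tone and a lightly perturbed copy of it; everything below is illustrative and not part of the commit:

import numpy as np
from utils.metrics import (
    calculate_msd, calculate_f0_correlation, calculate_spectral_convergence
)

sr = 22050
freq = np.linspace(150, 300, sr)  # pitch glide, 150 -> 300 Hz over one second
target = (0.5 * np.sin(2 * np.pi * np.cumsum(freq) / sr)).astype(np.float32)
pred = target + 0.01 * np.random.randn(sr).astype(np.float32)

print(calculate_msd(pred, target, sr=sr))             # small but non-zero
print(calculate_f0_correlation(pred, target, sr=sr))  # close to 1.0 on a glide
print(calculate_spectral_convergence(pred, target))   # close to 0.0 for a near match

A perfect reconstruction would give MSD 0, correlation 1, and spectral convergence 0, so lower is better for the two distances and higher is better for the correlation.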
train.py
ADDED
import os

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from utils.dataset import BDTtsDataset
from inference import tts  # reuse your model

training_config = {
    "learning_rate": 1e-4,
    "batch_size": 16,
    "warmup_steps": 1000,
    "gradient_accumulation_steps": 4,
    "mixed_precision": True,
    "save_strategy": "steps",
    "save_steps": 500,
    "eval_steps": 100,  # declared, but periodic evaluation is not wired in yet
    "num_epochs": 5
}


def train():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dataset = BDTtsDataset("./data/train")
    dataloader = DataLoader(dataset, batch_size=training_config["batch_size"], shuffle=True)

    accum_steps = training_config["gradient_accumulation_steps"]
    optimizer = AdamW(tts.model.parameters(), lr=training_config["learning_rate"])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_config["warmup_steps"],
        # one scheduler step per optimizer update, not per batch
        num_training_steps=len(dataloader) * training_config["num_epochs"] // accum_steps
    )

    use_amp = training_config["mixed_precision"] and device == "cuda"
    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    os.makedirs("checkpoints", exist_ok=True)
    tts.model.to(device)
    tts.model.train()
    step = 0
    optimizer.zero_grad()

    for epoch in range(training_config["num_epochs"]):
        for batch in dataloader:
            inputs, targets = batch  # assumes the dataset yields tensor pairs
            inputs, targets = inputs.to(device), targets.to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = tts.model(inputs)
                loss = outputs.loss if hasattr(outputs, "loss") else torch.nn.functional.mse_loss(outputs, targets)
                # Scale so accumulated gradients match the true batch mean
                loss = loss / accum_steps

            if scaler:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            step += 1
            # Only update weights every `accum_steps` batches
            if step % accum_steps == 0:
                if scaler:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

            if step % training_config["save_steps"] == 0:
                torch.save(tts.model.state_dict(), f"checkpoints/model_step{step}.pth")
                print(f"Saved checkpoint at step {step}")


if __name__ == "__main__":
    train()
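train.py imports BDTtsDataset from utils/dataset.py, which is not part of this commit. Below is a minimal sketch of the contract the loop above relies on: each item is an (inputs, targets) pair of fixed-shape tensors. The file layout (a <name>.txt beside each <name>.wav) and the toy char-level tokenizer are assumptions, not the project's actual loader:

# utils/dataset.py — NOT in this commit; a hypothetical sketch only.
import os
import librosa
import torch
from torch.utils.data import Dataset


class BDTtsDataset(Dataset):
    """Assumed layout: ./data/train/<name>.wav with a matching <name>.txt."""

    def __init__(self, root, sr=22050, max_seconds=10, max_chars=256):
        self.root, self.sr = root, sr
        self.max_samples = sr * max_seconds
        self.max_chars = max_chars
        self.names = sorted(
            os.path.splitext(f)[0] for f in os.listdir(root) if f.endswith(".wav")
        )

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        with open(os.path.join(self.root, name + ".txt"), encoding="utf-8") as fh:
            text = fh.read().strip()
        wav, _ = librosa.load(os.path.join(self.root, name + ".wav"), sr=self.sr)

        # Toy char-level ids, zero-padded so the default collate can stack
        # the batch; a real loader would run the model's own tokenizer.
        ids = [ord(c) for c in text[: self.max_chars]]
        ids += [0] * (self.max_chars - len(ids))

        # Pad/trim audio to a fixed length for the same reason.
        wav = librosa.util.fix_length(wav, size=self.max_samples)
        return torch.tensor(ids, dtype=torch.long), torch.from_numpy(wav)

Shapes and tokenization here are placeholders; the only contract the training loop depends on is that every batch unpacks into two stackable tensors.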