Ravi-9 committed
Commit 86e31ba · verified · Parent: d7a8830

Adding train/evaluate/metrics files

Files changed (3)
  1. evaluate.py +41 -0
  2. metrics.py +57 -0
  3. train.py +81 -0
evaluate.py ADDED
@@ -0,0 +1,41 @@
+ import numpy as np
+ import librosa
+
+ from utils.metrics import (
+     calculate_msd,
+     calculate_f0_correlation,
+     calculate_spectral_convergence,
+ )
+ from inference import run_tts
+
+
+ def evaluate_bd_tts(model, test_dataset, sr=22050):
+     """Synthesise each test sentence and average the audio metrics."""
+     msd, f0_corr, spec_conv = [], [], []
+
+     for text, target_path in test_dataset:
+         pred = run_tts(text)
+         target, _ = librosa.load(target_path, sr=sr)  # references are wav paths
+         msd.append(calculate_msd(pred, target, sr=sr))
+         f0_corr.append(calculate_f0_correlation(pred, target, sr=sr))
+         spec_conv.append(calculate_spectral_convergence(pred, target))
+
+     metrics = {
+         'mel_spectral_distance': float(np.mean(msd)),
+         'f0_correlation': float(np.mean(f0_corr)),
+         'spectral_convergence': float(np.mean(spec_conv)),
+     }
+
+     # calculate_phoneme_accuracy compares phoneme sequences, not raw audio,
+     # so it needs an ASR/forced-aligner front end before it fits in here.
+
+     # Accent scoring usually uses a pretrained classifier; plug in your
+     # Bangla accent classifier here. A fixed placeholder is used for now.
+     metrics['accent_score'] = 0.85
+
+     return metrics
+
+
+ if __name__ == "__main__":
+     test_dataset = [("আমি বাংলা বলি।", "reference.wav")]  # dummy dataset
+     print(evaluate_bd_tts(None, test_dataset))
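The `accent_score` above is a hard-coded placeholder. As a rough sketch of what a real scorer could look like, here is one way to wrap a pretrained accent classifier with the `transformers` audio-classification pipeline; the model id and label name below are hypothetical placeholders, not a published checkpoint:

```python
# Hypothetical sketch: the model id and label name are placeholders.
from transformers import pipeline

def accent_score(wav_path, model_id="your-org/bangla-accent-classifier"):
    clf = pipeline("audio-classification", model=model_id)
    # The pipeline returns a list of {"label", "score"} dicts for the clip.
    for result in clf(wav_path):
        if result["label"] == "bangladeshi_bangla":  # label depends on the model
            return result["score"]
    return 0.0
```

With something like this in place, `evaluate_bd_tts` would average `accent_score(...)` over the synthesised clips instead of returning the constant 0.85.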
metrics.py ADDED
@@ -0,0 +1,57 @@
+ # utils/metrics.py
+ import numpy as np
+ import librosa
+
+
+ def calculate_msd(pred_audio, target_audio, sr=22050):
+     """Mel Spectral Distance (MSD) between predicted and target audio."""
+     # Convert to mel spectrograms.
+     pred_mel = librosa.feature.melspectrogram(y=pred_audio, sr=sr)
+     target_mel = librosa.feature.melspectrogram(y=target_audio, sr=sr)
+
+     # Convert power to dB.
+     pred_db = librosa.power_to_db(pred_mel, ref=np.max)
+     target_db = librosa.power_to_db(target_mel, ref=np.max)
+
+     # The clips may differ in length; compare only the overlapping frames.
+     n = min(pred_db.shape[1], target_db.shape[1])
+     return np.mean((pred_db[:, :n] - target_db[:, :n]) ** 2)
+
+
+ def calculate_f0_correlation(pred_audio, target_audio, sr=22050):
+     """Pitch (F0) correlation between predicted and target audio."""
+     f0_pred, _, _ = librosa.pyin(pred_audio, fmin=50, fmax=500, sr=sr)
+     f0_target, _, _ = librosa.pyin(target_audio, fmin=50, fmax=500, sr=sr)
+
+     # Align lengths, then drop frames unvoiced (NaN) in either track.
+     n = min(len(f0_pred), len(f0_target))
+     f0_pred, f0_target = f0_pred[:n], f0_target[:n]
+     mask = ~np.isnan(f0_pred) & ~np.isnan(f0_target)
+     if np.sum(mask) < 2:  # np.corrcoef needs at least two points
+         return 0.0
+     return np.corrcoef(f0_pred[mask], f0_target[mask])[0, 1]
+
+
+ def calculate_phoneme_accuracy(pred_phonemes, target_phonemes):
+     """
+     Position-wise phoneme accuracy.
+     (pred_phonemes and target_phonemes are lists of symbols.)
+     """
+     if len(target_phonemes) == 0:
+         return 0.0
+     correct = sum(p == t for p, t in zip(pred_phonemes, target_phonemes))
+     return correct / len(target_phonemes)
+
+
+ def calculate_spectral_convergence(pred_audio, target_audio):
+     """Spectral convergence: distance between the predicted and target
+     magnitude spectra, normalised by the target's Frobenius norm."""
+     pred_spec = np.abs(librosa.stft(pred_audio))
+     target_spec = np.abs(librosa.stft(target_audio))
+
+     # Compare only the overlapping frames.
+     n = min(pred_spec.shape[1], target_spec.shape[1])
+     numerator = np.linalg.norm(target_spec[:, :n] - pred_spec[:, :n], 'fro')
+     denominator = np.linalg.norm(target_spec[:, :n], 'fro')
+
+     return numerator / (denominator + 1e-8)
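A quick sanity check of these metrics on synthetic signals: an identical pair should score perfectly, and a lightly noised copy should stay close. The vibrato is there because a perfectly flat pitch track has zero variance, which makes the correlation undefined:

```python
import numpy as np
from utils.metrics import (
    calculate_msd,
    calculate_f0_correlation,
    calculate_spectral_convergence,
)

sr = 22050
t = np.arange(int(2.0 * sr)) / sr
# A 220 Hz tone with slow vibrato, so the F0 track actually varies.
f0 = 220 + 20 * np.sin(2 * np.pi * 2 * t)
clean = np.sin(2 * np.pi * np.cumsum(f0) / sr).astype(np.float32)

np.random.seed(0)
noisy = (clean + 0.01 * np.random.randn(len(clean))).astype(np.float32)

# An identical pair should score "perfectly".
assert calculate_msd(clean, clean) == 0.0
assert calculate_spectral_convergence(clean, clean) == 0.0
assert np.isclose(calculate_f0_correlation(clean, clean, sr=sr), 1.0)

# A lightly noised copy should stay close to those ideals.
print(calculate_f0_correlation(noisy, clean, sr=sr))  # expected near 1.0
print(calculate_spectral_convergence(noisy, clean))   # expected near 0.0
```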
train.py ADDED
@@ -0,0 +1,81 @@
+ import os
+
+ import torch
+ from torch.optim import AdamW  # transformers' AdamW is deprecated
+ from torch.utils.data import DataLoader
+ from transformers import get_linear_schedule_with_warmup
+
+ from utils.dataset import BDTtsDataset
+ from inference import tts  # reuse the already-loaded model
+
+ training_config = {
+     "learning_rate": 1e-4,
+     "batch_size": 16,
+     "warmup_steps": 1000,
+     "gradient_accumulation_steps": 4,
+     "mixed_precision": True,
+     "save_strategy": "steps",
+     "save_steps": 500,
+     "eval_steps": 100,
+     "num_epochs": 5
+ }
+
+
+ def train():
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dataset = BDTtsDataset("./data/train")
+     dataloader = DataLoader(dataset, batch_size=training_config["batch_size"], shuffle=True)
+
+     accum_steps = training_config["gradient_accumulation_steps"]
+     optimizer = AdamW(tts.model.parameters(), lr=training_config["learning_rate"])
+     # One scheduler step per optimizer step, so divide by the accumulation factor.
+     num_training_steps = len(dataloader) * training_config["num_epochs"] // accum_steps
+     scheduler = get_linear_schedule_with_warmup(
+         optimizer,
+         num_warmup_steps=training_config["warmup_steps"],
+         num_training_steps=num_training_steps
+     )
+
+     use_amp = training_config["mixed_precision"] and device == "cuda"
+     scaler = torch.cuda.amp.GradScaler() if use_amp else None
+
+     os.makedirs("checkpoints", exist_ok=True)
+     tts.model.to(device)
+     tts.model.train()
+
+     step = 0
+     optimizer.zero_grad()
+
+     for epoch in range(training_config["num_epochs"]):
+         for i, (inputs, targets) in enumerate(dataloader):
+             inputs, targets = inputs.to(device), targets.to(device)
+
+             with torch.cuda.amp.autocast(enabled=use_amp):
+                 outputs = tts.model(inputs)
+                 loss = outputs.loss if hasattr(outputs, "loss") else torch.nn.functional.mse_loss(outputs, targets)
+                 # Average gradients over the accumulated mini-batches.
+                 loss = loss / accum_steps
+
+             if scaler:
+                 scaler.scale(loss).backward()
+             else:
+                 loss.backward()
+
+             # Only step the optimizer every accum_steps batches.
+             if (i + 1) % accum_steps == 0:
+                 if scaler:
+                     scaler.step(optimizer)
+                     scaler.update()
+                 else:
+                     optimizer.step()
+                 optimizer.zero_grad()
+                 scheduler.step()
+                 step += 1
+
+                 if step % training_config["save_steps"] == 0:
+                     torch.save(tts.model.state_dict(), f"checkpoints/model_step{step}.pth")
+                     print(f"Saved checkpoint at step {step}")
+
+
+ if __name__ == "__main__":
+     train()
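`train.py` imports `BDTtsDataset` from `utils/dataset.py`, which is not part of this commit. Below is a minimal sketch of the interface the loop assumes: fixed-size `(input, target)` tensor pairs so the default DataLoader collation works. The file layout, code-point tokenisation, and mel shape here are all assumptions, not the author's implementation:

```python
# utils/dataset.py -- hypothetical sketch; everything beyond the class name is assumed.
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset

class BDTtsDataset(Dataset):
    """Yields (input, target) tensor pairs for the training loop above.

    Assumes paired files in `root`: <name>.txt (Bangla text) and <name>.wav.
    """

    def __init__(self, root, sr=22050, max_chars=256, max_frames=400):
        self.root, self.sr = root, sr
        self.max_chars, self.max_frames = max_chars, max_frames
        self.names = sorted(
            os.path.splitext(f)[0] for f in os.listdir(root) if f.endswith(".txt")
        )

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        with open(os.path.join(self.root, name + ".txt"), encoding="utf-8") as fh:
            text = fh.read().strip()
        wav, _ = librosa.load(os.path.join(self.root, name + ".wav"), sr=self.sr)

        # Placeholder tokenisation: fixed-size array of Unicode code points.
        ids = np.zeros(self.max_chars, dtype=np.int64)
        for i, ch in enumerate(text[: self.max_chars]):
            ids[i] = ord(ch)

        # Fixed-size mel target so default batching works in this sketch.
        mel = librosa.feature.melspectrogram(y=wav, sr=self.sr, n_mels=80)
        mel = librosa.power_to_db(mel, ref=np.max)[:, : self.max_frames]
        mel = np.pad(mel, ((0, 0), (0, self.max_frames - mel.shape[1])))

        return torch.from_numpy(ids), torch.from_numpy(mel.astype(np.float32))
```

A real implementation would use the model's own tokenizer and a padding `collate_fn` for variable-length batches instead of fixed-size arrays.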