Ravi-9 committed on
Commit 07b3a05 · verified · 1 Parent(s): 2f69d13

Update app.py

Files changed (1)
  1. app.py +594 -573
app.py CHANGED
@@ -1,755 +1,776 @@
1
- # -*- coding: utf-8 -*-
2
- """ML Engineer Assignment: Bangladeshi Bangla TTS Finetuning.ipynb
3
 
4
- Automatically generated by Colab.
5
 
6
- Original file is located at
7
- https://colab.research.google.com/drive/12ZrU_dlECt3YzVZ7k7qpwySH3eXUS7bj
8
- """
9
 
 
 
10
 
11
- # pip install transformers datasets torch torchaudio librosa
12
- # pip install coqui-tts phonemizer espeak-ng
13
- # pip install wandb tensorboard matplotlib seaborn
14
 
15
- # git lfs install
16
- git clone https://huggingface.co/bangla-speech-processing/bangla_tts_female
 
 
 
 
 
17
 
18
- ls bangla_tts_female
 
19
 
20
- tts --model_path bangla_tts_female/pytorch_model.pth \
21
- --config_path bangla_tts_female/config.json \
22
- --text "আমি বাংলাদেশ থেকে এসেছি।" \
23
- --out_path baseline.wav
24
 
25
- from IPython.display import Audio
26
- Audio("baseline.wav")
27
 
28
- sentences = [
29
- "আমি বাংলাদেশ থেকে এসেছি।",
30
- "আজকের আবহাওয়া সুন্দর।",
31
- "তুমি কোথায় যাচ্ছ?",
32
- "আমরা ঢাকায় থাকি।",
33
- "এটা আমার প্রিয় বই।"
34
- ]
35
 
36
- for i, text in enumerate(sentences, 1):
37
- safe_text = text.replace('"', '\\"')
38
- tts --model_path bangla_tts_female/pytorch_model.pth \
39
- --config_path bangla_tts_female/config.json \
40
- --text "{safe_text}" \
41
- --out_path "baseline_{i}.wav"
42
 
43
- from IPython.display import Audio
44
- Audio("baseline_2.wav")
45
 
46
- """Checking the config.json"""
 
 
47
 
48
- import json
 
49
 
50
- with open("bangla_tts_female/config.json", "r", encoding="utf-8") as f:
51
- config = json.load(f)
52
 
53
- print(json.dumps(config, indent=2, ensure_ascii=False))
 
 
 
54
 
55
- """Count parameters"""
 
56
 
57
- from TTS.utils.synthesizer import Synthesizer
58
- import torch
 
 
 
 
 
59
 
60
- synthesizer = Synthesizer(
61
- tts_checkpoint="bangla_tts_female/pytorch_model.pth",
62
- tts_config_path="bangla_tts_female/config.json",
63
- use_cuda=torch.cuda.is_available()
64
- )
 
65
 
66
- model_params = sum(p.numel() for p in synthesizer.tts_model.parameters())
67
- print(f"Total parameters: {model_params:,}")
68
 
69
- """Check tokenizer / phoneme system"""
70
 
71
- print("Phonemizer:", config.get("phonemizer", "Not specified"))
72
- print("Characters:", config.get("characters", "Not specified"))
73
 
74
- """# Task 2"""
 
75
 
76
- !wget https://www.openslr.org/resources/53/asr_bengali_6.zip
77
 
78
- !unzip asr_bengali_6.zip -d openslr_53
79
 
80
- !find /content -type d -name "*asr_bengali*"
 
81
 
82
- !ls /content/openslr_53/asr_bengali
 
 
 
 
83
 
84
- import pandas as pd
 
85
 
86
- tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
87
- df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
88
- print(df.head())
89
 
90
- import os
 
91
 
92
- audio_dir = "/content/openslr_53/asr_bengali/data"
93
- df["audio_path"] = df["utt_id"].apply(lambda x: os.path.join(audio_dir, f"{x}.wav"))
94
- print(df.head())
95
 
96
- df = df[df["audio_path"].apply(os.path.exists)]
97
- print(f"Total usable audio files: {len(df)}")
98
 
99
- import os, glob
100
- import pandas as pd
101
 
 
102
 
103
- tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
104
- df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
105
 
 
106
 
107
- file_dict = {
108
- os.path.splitext(os.path.basename(f))[0]: f
109
- for f in glob.glob("/content/openslr_53/asr_bengali/data/**/*.flac", recursive=True)
110
- }
111
 
112
- df["audio_path"] = df["utt_id"].map(file_dict)
113
 
114
- df = df[df["audio_path"].notnull()]
115
- print(f"Usable audio files: {len(df)}")
116
- print(df.head())
117
 
118
- !find /content/openslr_53/asr_bengali/data -type f | head -20
 
119
 
120
- import librosa
121
- import numpy as np
122
 
123
- durations = []
124
- for path in df["audio_path"].sample(100):
125
- y, sr = librosa.load(path, sr=None)
126
- durations.append(len(y) / sr)
127
 
128
- print(f"Total samples: {len(df)}")
129
- print(f"Duration: min={np.min(durations):.2f}s, mean={np.mean(durations):.2f}s, max={np.max(durations):.2f}s")
130
- print(f"Unique speakers: {df['speaker_id'].nunique()}")
131
 
132
- import pandas as pd
133
 
134
- sample_df = df.sample(300, random_state=42)
135
- sample_df.to_csv("accent_labeling_sample.csv", index=False)
 
 
136
 
137
- from google.colab import files
138
- files.download("accent_labeling_sample.csv")
139
 
140
- from google.colab import files
141
- uploaded = files.upload()
 
142
 
143
- import pandas as pd
144
- labeled_df = pd.read_csv("accent_labeling_sample.csv")
145
 
146
- print(labeled_df.columns)
 
147
 
148
- sample_df = df.sample(300, random_state=42)
149
- sample_df.to_csv("accent_labeling_sample.csv", index=False)
 
 
150
 
151
- import pandas as pd
 
 
152
 
153
- label_df = df.sample(50, random_state=42).reset_index(drop=True)
154
- label_df["accent_label"] = None
155
 
156
- label_df.to_csv("labeling_in_progress.csv", index=False)
 
157
 
158
- from IPython.display import Audio, display
159
- import ipywidgets as widgets
160
 
161
- label_df = pd.read_csv("labeling_in_progress.csv")
 
162
 
163
- def label_clip(idx, label):
164
- label_df.loc[idx, "accent_label"] = label
165
- label_df.to_csv("labeling_in_progress.csv", index=False)
166
- print(f"Labeled index {idx} as {'BD' if label==1 else 'IN'}")
167
 
168
- def play_and_label(idx):
169
- if idx >= len(label_df):
170
- print("✅ All clips labeled!")
171
- return
172
 
173
- row = label_df.iloc[idx]
174
- print(f"Index: {idx} | Speaker: {row['speaker_id']}")
175
- print(f"Text: {row['text']}")
176
- display(Audio(row["audio_path"]))
177
 
178
- bd_btn = widgets.Button(description="BD Accent (1)", button_style='success')
179
- in_btn = widgets.Button(description="IN Accent (0)", button_style='danger')
180
- skip_btn = widgets.Button(description="Skip", button_style='warning')
181
 
182
- def on_bd(b):
183
- label_clip(idx, 1)
184
- play_and_label(idx+1)
185
- def on_in(b):
186
- label_clip(idx, 0)
187
- play_and_label(idx+1)
188
- def on_skip(b):
189
- label_clip(idx, None)
190
- play_and_label(idx+1)
191
 
192
- bd_btn.on_click(on_bd)
193
- in_btn.on_click(on_in)
194
- skip_btn.on_click(on_skip)
195
 
196
- display(widgets.HBox([bd_btn, in_btn, skip_btn]))
 
197
 
198
- play_and_label(0)
199
 
200
- final_labels = pd.read_csv("labeling_in_progress.csv")
201
- final_labels = final_labels.dropna(subset=["accent_label"])
202
- final_labels.to_csv("accent_labeling_sample_labeled.csv", index=False)
203
- print(f"Saved {len(final_labels)} labeled samples.")
204
 
205
- import librosa
206
- import numpy as np
207
- import pandas as pd
208
- from sklearn.ensemble import RandomForestClassifier
209
- from sklearn.model_selection import train_test_split
210
- from sklearn.metrics import classification_report
211
 
212
- labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")
 
 
 
213
 
214
- def extract_mfcc(path, n_mfcc=13):
215
- y, sr = librosa.load(path, sr=22050)
216
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
217
- return np.mean(mfcc, axis=1)
218
 
219
- X = np.array([extract_mfcc(p) for p in labeled_df["audio_path"]])
220
- y = np.array(labeled_df["accent_label"])
 
 
 
 
 
 
 
221
 
 
 
 
222
 
223
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
224
 
225
- clf = RandomForestClassifier(n_estimators=200, random_state=42)
226
- clf.fit(X_train, y_train)
227
 
 
 
 
 
228
 
229
- y_pred = clf.predict(X_test)
230
- print(classification_report(y_test, y_pred))
 
 
 
 
231
 
232
- df["accent_label"] = df["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
233
- bd_df = df[df["accent_label"] == 1]
234
- print(f"Bangladeshi-accent samples: {len(bd_df)}")
235
 
236
- bd_df.to_csv("bd_openslr53.csv", index=False)
 
 
 
237
 
238
- wget https://www.openslr.org/resources/53/asr_bengali_a.zip
 
239
 
240
- unzip asr_bengali_a.zip -d asr_bengali_a
241
 
242
- ls asr_bengali_a
243
- find asr_bengali_a -type f | head -20
244
 
245
- find /content -type d -name "*asr_bengali*"
 
246
 
247
- ls /content/asr_bengali_a/asr_bengali
248
 
249
- import pandas as pd
250
- import glob, os
251
 
252
- tsv_path = "/content/asr_bengali_a/asr_bengali/utt_spk_text.tsv"
253
- df_a = pd.read_csv(tsv_path, sep="\t", names=["utt_id", "speaker_id", "text"])
 
254
 
 
255
 
256
- audio_files = glob.glob("asr_bengali_a/data/**/*.flac", recursive=True)
257
- audio_map = {os.path.splitext(os.path.basename(f))[0]: f for f in audio_files}
258
 
 
259
 
260
- df_a["audio_path"] = df_a["utt_id"].map(audio_map)
 
261
 
 
262
 
263
- df_a = df_a.dropna(subset=["audio_path"])
264
- print(df_a.head())
265
 
266
- df_a["accent_label"] = df_a["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
267
- bd_df_a = df_a[df_a["accent_label"] == 1]
268
- print(f"Bangladeshi-accent samples: {len(bd_df_a)}")
269
 
270
- bd_df_a.to_csv("bd_asr_bengali_a.csv", index=False)
 
271
 
272
- final_df = pd.concat([
273
- pd.read_csv("bd_openslr53.csv"),
274
- pd.read_csv("bd_asr_bengali_a.csv")
275
- ])
276
- final_df.to_csv("bd_combined_dataset.csv", index=False)
277
 
278
- import soundfile as sf
279
- import os
280
 
281
- os.makedirs("processed_bd_audio", exist_ok=True)
282
- meta_lines = []
283
 
284
- for i, row in final_df.iterrows():
285
- y, sr = librosa.load(row["audio_path"], sr=22050)
286
- y, _ = librosa.effects.trim(y)
287
- y = y / (np.max(np.abs(y)) + 1e-9)
288
- out_path = f"processed_bd_audio/{i}.wav"
289
- sf.write(out_path, y, 22050)
290
- meta_lines.append(f"{out_path}|{row['text']}|bd_speaker")
291
 
292
- with open("metadata.csv", "w", encoding="utf-8") as f:
293
- f.write("\n".join(meta_lines))
294
 
295
- """# TASK 3"""
 
296
 
297
- # pip install librosa soundfile scikit-learn joblib numpy tqdm
 
 
298
 
299
- import os
300
- import numpy as np
301
- import pandas as pd
302
- import librosa
303
- from tqdm import tqdm
304
- from sklearn.ensemble import RandomForestClassifier
305
- from sklearn.model_selection import train_test_split
306
- from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
307
- import joblib
308
 
 
 
 
 
 
309
 
310
- SR = 22050
311
- N_MFCC = 13
312
 
 
 
313
 
314
- def extract_accent_features(audio_path, sr=SR, n_mfcc=N_MFCC):
315
- try:
316
- y, orig_sr = librosa.load(audio_path, sr=None)
317
- except:
318
- return None
 
 
319
 
320
- if orig_sr != sr:
321
- y = librosa.resample(y=y, orig_sr=orig_sr, target_sr=sr)
322
 
323
- y, _ = librosa.effects.trim(y, top_db=20)
324
- if y.size == 0:
325
- return None
326
 
327
- y = y / (np.max(np.abs(y)) + 1e-9)
328
- features = []
329
 
330
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
331
- delta = librosa.feature.delta(mfcc)
332
- features += list(np.mean(mfcc, axis=1))
333
- features += list(np.std(mfcc, axis=1))
334
- features += list(np.mean(delta, axis=1))
335
- features += list(np.std(delta, axis=1))
 
 
 
336
 
337
- cent = librosa.feature.spectral_centroid(y=y, sr=sr)
338
- bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
339
- rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
340
- zcr = librosa.feature.zero_crossing_rate(y)
341
- rms = librosa.feature.rms(y=y)
342
- features += [np.mean(cent), np.std(cent)]
343
- features += [np.mean(bw), np.std(bw)]
344
- features += [np.mean(rolloff), np.std(rolloff)]
345
- features += [np.mean(zcr), np.std(zcr)]
346
- features += [np.mean(rms), np.std(rms)]
347
 
348
- try:
349
- f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=50, fmax=600, sr=sr)
350
- if f0 is None:
351
- f0_stats = [0,0,0,0]
352
- else:
353
- voiced = ~np.isnan(f0)
354
- if voiced.sum() == 0:
355
- f0_stats = [0,0,0,0]
356
- else:
357
- f0_vals = f0[voiced]
358
- f0_stats = [
359
- np.mean(f0_vals),
360
- np.std(f0_vals),
361
- np.median(f0_vals),
362
- float(np.sum(voiced)) / len(f0)
363
- ]
364
- except:
365
- f0_stats = [0,0,0,0]
366
- features += f0_stats
367
-
368
- features += [len(y) / sr]
369
-
370
- return np.array(features)
371
-
372
- labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv") # Must have: audio_path, accent_label
373
- X, y = [], []
374
-
375
- for _, row in tqdm(labeled_df.iterrows(), total=len(labeled_df)):
376
- feats = extract_accent_features(row["audio_path"])
377
- if feats is not None:
378
- X.append(feats)
379
- y.append(int(row["accent_label"]))
380
-
381
- X = np.vstack(X)
382
- y = np.array(y)
383
-
384
- X_train, X_test, y_train, y_test = train_test_split(
385
- X, y, test_size=0.2, random_state=42
386
- )
387
 
388
 
389
- clf = RandomForestClassifier(
390
- n_estimators=300, random_state=42, n_jobs=-1
391
- )
392
- clf.fit(X_train, y_train)
 
393
 
 
 
394
 
395
- y_pred = clf.predict(X_test)
396
- print("✅ Accuracy:", accuracy_score(y_test, y_pred))
397
- print(classification_report(y_test, y_pred))
398
- print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
399
 
400
- joblib.dump(clf, "accent_rf_model.joblib")
401
- np.save("feature_shape.npy", X.shape[1])
402
- print("💾 Model saved as accent_rf_model.joblib")
403
 
404
- """# TASK 4"""
 
 
 
 
 
405
 
406
- from transformers import VitsModel
 
 
 
 
 
 
 
 
 
407
 
408
- class BDVitsModel(VitsModel):
409
- def __init__(self, config):
410
- super().__init__(config)
411
- self.bd_accent_adapter = torch.nn.Linear(config.hidden_size, config.hidden_size)
 
412
 
413
- def forward(self, input_ids, attention_mask=None, **kwargs):
414
- outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
415
- hidden_states = outputs.last_hidden_state
416
- hidden_states = self.bd_accent_adapter(hidden_states)
417
- return outputs
418
 
419
- def bd_text_normalize(text):
420
- text = text.replace("ড়", "র")
421
- text = text.replace("ঋ", "রি")
422
- text = text.replace("ই", "ঈ") if "..." else text
423
- return text
424
 
425
- sample_text = "ঋণী ছেলে বড় রাস্তা দিয়ে যাবে।"
426
- normalized_text = bd_text_normalize(sample_text)
427
 
428
- print("Original text: ", sample_text)
429
- print("Normalized text:", normalized_text)
430
 
431
- def bd_accent_loss(pred_mel, target_mel, pred_phonemes, target_phonemes, accent_weight=0.1, phoneme_weight=0.5):
432
- mel_loss = F.mse_loss(pred_mel, target_mel)
433
- phoneme_loss = F.cross_entropy(pred_phonemes, target_phonemes)
434
- accent_loss = accent_discriminator_loss(pred_mel)
435
- total_loss = mel_loss + phoneme_weight * phoneme_loss + accent_weight * accent_loss
436
 
437
- print(f"Mel Loss: {mel_loss.item():.4f} | Phoneme Loss: {phoneme_loss.item():.4f} | "
438
- f"Accent Loss: {accent_loss:.4f} | Total Loss: {total_loss.item():.4f}")
439
- return total_loss
440
 
441
- """# TASK 5"""
442
 
443
- # !pip install torch torchaudio transformers datasets librosa soundfile wandb accelerate
444
- # !pip install tqdm librosa
445
 
446
- import os, time, math, random
447
- import torch
448
- import torch.nn.functional as F
449
- from torch import nn, optim
450
- from torch.utils.data import DataLoader, Dataset
451
- from torch.cuda.amp import autocast, GradScaler
452
- import librosa, soundfile as sf, numpy as np
453
- from tqdm.auto import tqdm
454
- import joblib
455
- import wandb
456
 
457
- training_config = {
458
- "learning_rate": 1e-4,
459
- "batch_size": 16,
460
- "warmup_steps": 1000,
461
- "gradient_accumulation_steps": 4,
462
- "mixed_precision": True,
463
- "save_strategy": "steps",
464
- "save_steps": 500,
465
- "eval_steps": 100,
466
- "num_train_epochs": 3,
467
- "device": "cuda" if torch.cuda.is_available() else "cpu",
468
- "output_dir": "/content/drive/MyDrive/bd_tts_finetune",
469
- }
470
- os.makedirs(training_config["output_dir"], exist_ok=True)
471
 
472
- import pandas as pd
 
473
 
474
- df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text", "accent_label"])
475
 
476
- print(df.head())
477
- print(df.shape)
478
 
479
- head -n 10 metadata.csv
 
480
 
481
- df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text"])
482
 
483
- df.to_csv("metadata_clean.csv", index=False)
 
 
 
 
 
 
484
 
485
- """# TASK 6"""
 
 
 
 
 
 
 
486
 
487
- import torch
488
- import numpy as np
 
 
489
 
490
- sample = {
491
- 'text_input': "আমার নাম রাজি",
492
- 'mel_spectrogram': torch.randn(80, 200),
493
 
494
- 'audio_waveform': np.random.randn(44100).astype(np.float32),
495
 
496
- 'phonemes': ["a", "m", "a", "r", "n", "a", "m", "r", "a", "j", "i"]
497
- }
498
 
499
- import librosa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
- audio_path = "/content/processed_bd_audio/audio.wav"
502
- audio, sr = librosa.load(audio_path, sr=22050)
 
503
 
504
- mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=80)
505
- mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
506
 
507
- import matplotlib.pyplot as plt
 
508
 
509
- plt.figure(figsize=(10, 4))
510
- plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
511
- plt.colorbar(format='%+2.0f dB')
512
- plt.title('Mel Spectrogram (dB)')
513
- plt.xlabel('Time frames')
514
- plt.ylabel('Mel frequency bins')
515
- plt.show()
516
 
517
- plt.figure(figsize=(10, 4))
518
- plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
519
- plt.colorbar(format='%+2.0f dB')
520
- plt.title('Mel Spectrogram (dB)')
521
- plt.xlabel('Time frames')
522
- plt.ylabel('Mel frequency bins')
523
- plt.savefig("/content/mel_spectrogram.png")
524
- plt.close()
525
 
526
- from IPython.display import Image
527
- Image("/content/mel_spectrogram.png")
528
 
529
- import torch
 
 
 
 
 
 
530
 
531
- mel_tensor = torch.tensor(mel_spectrogram_db).unsqueeze(0) # add batch dim if needed
532
- torch.save(mel_tensor, "/content/mel_spectrogram.pt")
 
 
 
 
 
 
 
 
 
 
533
 
534
- """# TASK 7"""
535
 
536
- import torch
537
- import torch.nn as nn
538
 
539
- class RelativePositionMultiHeadAttention(nn.Module):
540
- def __init__(self, num_heads=8, k_channels=64):
541
- super().__init__()
542
- self.num_heads = num_heads
543
- self.k_channels = k_channels
544
- self.conv_k = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
545
- self.conv_v = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
546
- self.conv_o = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
547
 
548
- @torch.jit.ignore
549
- def attention(self, query, key, value, mask=None):
550
- b = key.size(0)
551
- d = key.size(1)
552
- t_s = key.size(2)
553
- t_t = query.size(2)
 
 
 
 
 
 
 
 
554
 
555
- query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
556
- key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
557
- value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
558
 
559
- scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)
 
 
 
 
 
560
 
561
- if mask is not None:
562
- scores = scores.masked_fill(mask == 0, float('-inf'))
 
563
 
564
- attn = torch.softmax(scores, dim=-1)
565
- out = torch.matmul(attn, value)
566
 
567
- out = out.transpose(2, 3).contiguous().view(b, d, t_t)
 
568
 
569
- return out, attn
 
570
 
571
- def forward(self, c, attn_mask=None):
572
- q = c
573
- k = self.conv_k(c)
574
- v = self.conv_v(c)
575
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
576
- x = self.conv_o(x)
577
- return x
578
 
579
- if __name__ == "__main__":
580
- batch_size = 2
581
- d_model = 512
582
- seq_len = 50
583
- num_heads = 8
584
- k_channels = d_model // num_heads
 
585
 
586
- model = RelativePositionMultiHeadAttention(num_heads=num_heads, k_channels=k_channels)
 
 
 
 
587
 
588
- c = torch.randn(batch_size, d_model, seq_len)
589
- output = model(c)
590
- print("Output shape:", output.shape)
 
591
 
 
 
 
592
 
593
- scripted_model = torch.jit.script(model)
594
- print("TorchScript model compiled successfully.")
595
 
596
- b, d, t = 2, 512, 50
597
- dummy_input = torch.randn(b, d, t)
598
- model = RelativePositionMultiHeadAttention(num_heads=8, k_channels=d//8)
 
599
 
600
- output = model(dummy_input)
601
- print(output.shape)
 
 
 
 
602
 
603
- import torch
604
- import torch.nn as nn
605
- import gradio as gr
606
- import numpy as np
607
- import librosa
608
-
609
- class RelativePositionMultiHeadAttention(nn.Module):
610
- def __init__(self, d_model=512, num_heads=8):
611
- super().__init__()
612
- self.num_heads = num_heads
613
- self.k_channels = d_model // num_heads
614
-
615
- self.conv_k = nn.Conv1d(d_model, d_model, kernel_size=1)
616
- self.conv_v = nn.Conv1d(d_model, d_model, kernel_size=1)
617
- self.conv_o = nn.Conv1d(d_model, d_model, kernel_size=1)
618
-
619
- @torch.jit.ignore
620
- def attention(self, query, key, value, mask=None):
621
- b = key.size(0)
622
- d = key.size(1)
623
- t_s = key.size(2)
624
- t_t = query.size(2)
625
-
626
- query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
627
- key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
628
- value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
629
-
630
- scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)
631
-
632
- if mask is not None:
633
- scores = scores.masked_fill(mask == 0, float('-inf'))
634
-
635
- attn = torch.softmax(scores, dim=-1)
636
- out = torch.matmul(attn, value)
637
-
638
- out = out.transpose(2, 3).contiguous().view(b, d, t_t)
639
- return out, attn
640
-
641
- def forward(self, c, attn_mask=None):
642
- q = c
643
- k = self.conv_k(c)
644
- v = self.conv_v(c)
645
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
646
- x = self.conv_o(x)
647
- return x
648
-
649
- def preprocess_text(text):
650
- bengali_chars = "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ড়"
651
- char_to_idx = {ch: i+1 for i, ch in enumerate(bengali_chars)}
652
- tokens = [char_to_idx.get(ch, 0) for ch in text if ch.strip() != '']
653
- return tokens
654
-
655
- class TokenEmbedding(nn.Module):
656
- def __init__(self, vocab_size, d_model):
657
- super().__init__()
658
- self.embedding = nn.Embedding(vocab_size + 1, d_model, padding_idx=0)
659
-
660
- def forward(self, tokens):
661
- embedded = self.embedding(tokens)
662
- return embedded.transpose(1, 2)
663
-
664
- def mel_to_audio(mel_spectrogram, n_iter=60, sr=22050, n_fft=1024, hop_length=256):
665
-
666
- mel_power = librosa.db_to_power(mel_spectrogram)
667
- S = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
668
- audio = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length)
669
- return audio
670
-
671
- d_model = 512
672
- vocab_size = 50
673
- embedding = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
674
- attention_model = RelativePositionMultiHeadAttention(d_model=d_model, num_heads=8)
675
- embedding.eval()
676
- attention_model.eval()
677
-
678
- def tts_pipeline(user_text):
679
- tokens = preprocess_text(user_text)
680
- if len(tokens) == 0:
681
- return None
682
-
683
- input_tensor = torch.tensor(tokens).unsqueeze(0)
684
-
685
- with torch.no_grad():
686
- embedded = embedding(input_tensor)
687
- output = attention_model(embedded)
688
- mel = output.squeeze(0).cpu().numpy()
689
- mel = mel[:80, :]
690
-
691
- mel_db = 20 * np.log10(np.maximum(mel, 1e-5))
692
-
693
- audio = mel_to_audio(mel_db)
694
-
695
- return (22050, audio.astype(np.float32))
696
- import numpy as np
697
 
698
- import gradio as gr
699
 
700
- iface = gr.Interface(
701
- fn=tts_pipeline,
702
- inputs=gr.Textbox(label="Enter Bengali Text"),
703
- outputs=gr.Audio(label="Generated Speech"),
704
- title="Bangladeshi Bengali TTS Demo"
705
- )
706
 
707
- iface.launch()
708
 
709
- import subprocess
710
- import os
711
- import gradio as gr
712
 
 
 
713
 
714
- MODEL_PATH = "bangla_tts_female/pytorch_model.pth"
715
- CONFIG_PATH = "bangla_tts_female/config.json"
716
 
717
- def tts_from_cli(text):
718
- if not text.strip():
719
- return None
 
 
 
720
 
721
- safe_text = text.replace('"', '\\"')
722
 
723
- output_wav = "output.wav"
 
 
724
 
725
 
726
- cmd = [
727
- "tts",
728
- "--model_path", MODEL_PATH,
729
- "--config_path", CONFIG_PATH,
730
- "--text", safe_text,
731
- "--out_path", output_wav
732
- ]
733
 
 
 
 
734
 
735
- result = subprocess.run(cmd, capture_output=True, text=True)
736
 
737
- if result.returncode != 0:
738
- print("Error:", result.stderr)
739
- return None
740
 
741
- if os.path.exists(output_wav):
742
- return output_wav
743
- else:
744
- print("Output audio not found")
745
- return None
746
 
 
 
 
 
 
 
 
747
 
748
- iface = gr.Interface(
749
- fn=tts_from_cli,
750
- inputs=gr.Textbox(lines=2, placeholder="Enter Bengali text here..."),
751
- outputs=gr.Audio(type="filepath"),
752
- title="Bengali TTS with CLI Model"
753
- )
754
 
755
- iface.launch()
 
1
+ # # -*- coding: utf-8 -*-
2
+ # """ML Engineer Assignment: Bangladeshi Bangla TTS Finetuning.ipynb
3
 
4
+ # Automatically generated by Colab.
5
 
6
+ # Original file is located at
7
+ # https://colab.research.google.com/drive/12ZrU_dlECt3YzVZ7k7qpwySH3eXUS7bj
8
+ # """
9
 
10
+ import gradio as gr
+ from inference import run_tts
+
+ def text_to_speech(text):
+     return run_tts(text)
+
+ demo = gr.Interface(
+     fn=text_to_speech,
+     inputs="text",
+     outputs="audio",
+     title="Bangla Text to Speech",
+     description="Enter Bangla text and hear the generated audio."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
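The new app.py depends on run_tts from a local inference module that is not part of this diff. A minimal sketch of what such a helper might look like — assuming it wraps the same Coqui `tts` CLI and bangla_tts_female checkpoint used by the tts_from_cli helper in the removed code above; the module name, paths, and signature are assumptions, not part of this commit:

# inference.py — hypothetical sketch, not included in this commit.
# Assumes the Coqui TTS CLI (`tts`) is installed and the bangla_tts_female
# checkpoint cloned earlier in this file is present in the working directory.
import os
import subprocess

MODEL_PATH = "bangla_tts_female/pytorch_model.pth"
CONFIG_PATH = "bangla_tts_female/config.json"

def run_tts(text, out_path="output.wav"):
    """Synthesize Bangla speech for `text`; return the wav filepath, or None on failure."""
    if not text or not text.strip():
        return None
    cmd = [
        "tts",
        "--model_path", MODEL_PATH,
        "--config_path", CONFIG_PATH,
        "--text", text,
        "--out_path", out_path,
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0 or not os.path.exists(out_path):
        return None
    return out_path  # a filepath is a valid return value for a gr.Audio output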
26
 
 
 
 
 
27
 
 
 
28
 
 
 
 
 
 
 
 
29
 
 
 
 
 
 
 
30
 
 
 
31
 
32
+ # # pip install transformers datasets torch torchaudio librosa
33
+ # # pip install coqui-tts phonemizer espeak-ng
34
+ # # pip install wandb tensorboard matplotlib seaborn
35
 
36
+ # # git lfs install
37
+ # git clone https://huggingface.co/bangla-speech-processing/bangla_tts_female
38
 
39
+ # ls bangla_tts_female
 
40
 
41
+ # tts --model_path bangla_tts_female/pytorch_model.pth \
42
+ # --config_path bangla_tts_female/config.json \
43
+ # --text "আমি বাংলাদেশ থেকে এসেছি।" \
44
+ # --out_path baseline.wav
45
 
46
+ # from IPython.display import Audio
47
+ # Audio("baseline.wav")
48
 
49
+ # sentences = [
50
+ # "আমি বাংলাদেশ থেকে এসেছি।",
51
+ # "আজকের আবহাওয়া সুন্দর।",
52
+ # "তুমি কোথায় যাচ্ছ?",
53
+ # "আমরা ঢাকায় থাকি।",
54
+ # "এটা আমার প্রিয় বই।"
55
+ # ]
56
 
57
+ # for i, text in enumerate(sentences, 1):
58
+ # safe_text = text.replace('"', '\\"')
59
+ # tts --model_path bangla_tts_female/pytorch_model.pth \
60
+ # --config_path bangla_tts_female/config.json \
61
+ # --text "{safe_text}" \
62
+ # --out_path "baseline_{i}.wav"
63
 
64
+ # from IPython.display import Audio
65
+ # Audio("baseline_2.wav")
66
 
67
+ # """Checking the config.json"""
68
 
69
+ # import json
 
70
 
71
+ # with open("bangla_tts_female/config.json", "r", encoding="utf-8") as f:
72
+ # config = json.load(f)
73
 
74
+ # print(json.dumps(config, indent=2, ensure_ascii=False))
75
 
76
+ # """Count parameters"""
77
 
78
+ # from TTS.utils.synthesizer import Synthesizer
79
+ # import torch
80
 
81
+ # synthesizer = Synthesizer(
82
+ # tts_checkpoint="bangla_tts_female/pytorch_model.pth",
83
+ # tts_config_path="bangla_tts_female/config.json",
84
+ # use_cuda=torch.cuda.is_available()
85
+ # )
86
 
87
+ # model_params = sum(p.numel() for p in synthesizer.tts_model.parameters())
88
+ # print(f"Total parameters: {model_params:,}")
89
 
90
+ # """Check tokenizer / phoneme system"""
 
 
91
 
92
+ # print("Phonemizer:", config.get("phonemizer", "Not specified"))
93
+ # print("Characters:", config.get("characters", "Not specified"))
94
 
95
+ # """# Task 2"""
 
 
96
 
97
+ # !wget https://www.openslr.org/resources/53/asr_bengali_6.zip
 
98
 
99
+ # !unzip asr_bengali_6.zip -d openslr_53
 
100
 
101
+ # !find /content -type d -name "*asr_bengali*"
102
 
103
+ # !ls /content/openslr_53/asr_bengali
 
104
 
105
+ # import pandas as pd
106
 
107
+ # tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
108
+ # df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
109
+ # print(df.head())
 
110
 
111
+ # import os
112
 
113
+ # audio_dir = "/content/openslr_53/asr_bengali/data"
114
+ # df["audio_path"] = df["utt_id"].apply(lambda x: os.path.join(audio_dir, f"{x}.wav"))
115
+ # print(df.head())
116
 
117
+ # df = df[df["audio_path"].apply(os.path.exists)]
118
+ # print(f"Total usable audio files: {len(df)}")
119
 
120
+ # import os, glob
121
+ # import pandas as pd
122
 
 
 
 
 
123
 
124
+ # tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
125
+ # df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
 
126
 
 
127
 
128
+ # file_dict = {
129
+ # os.path.splitext(os.path.basename(f))[0]: f
130
+ # for f in glob.glob("/content/openslr_53/asr_bengali/data/**/*.flac", recursive=True)
131
+ # }
132
 
133
+ # df["audio_path"] = df["utt_id"].map(file_dict)
 
134
 
135
+ # df = df[df["audio_path"].notnull()]
136
+ # print(f"Usable audio files: {len(df)}")
137
+ # print(df.head())
138
 
139
+ # !find /content/openslr_53/asr_bengali/data -type f | head -20
 
140
 
141
+ # import librosa
142
+ # import numpy as np
143
 
144
+ # durations = []
145
+ # for path in df["audio_path"].sample(100):
146
+ # y, sr = librosa.load(path, sr=None)
147
+ # durations.append(len(y) / sr)
148
 
149
+ # print(f"Total samples: {len(df)}")
150
+ # print(f"Duration: min={np.min(durations):.2f}s, mean={np.mean(durations):.2f}s, max={np.max(durations):.2f}s")
151
+ # print(f"Unique speakers: {df['speaker_id'].nunique()}")
152
 
153
+ # import pandas as pd
 
154
 
155
+ # sample_df = df.sample(300, random_state=42)
156
+ # sample_df.to_csv("accent_labeling_sample.csv", index=False)
157
 
158
+ # from google.colab import files
159
+ # files.download("accent_labeling_sample.csv")
160
 
161
+ # from google.colab import files
162
+ # uploaded = files.upload()
163
 
164
+ # import pandas as pd
165
+ # labeled_df = pd.read_csv("accent_labeling_sample.csv")
 
 
166
 
167
+ # print(labeled_df.columns)
 
 
 
168
 
169
+ # sample_df = df.sample(300, random_state=42)
170
+ # sample_df.to_csv("accent_labeling_sample.csv", index=False)
 
 
171
 
172
+ # import pandas as pd
 
 
173
 
174
+ # label_df = df.sample(50, random_state=42).reset_index(drop=True)
175
+ # label_df["accent_label"] = None
 
 
 
 
 
 
 
176
 
177
+ # label_df.to_csv("labeling_in_progress.csv", index=False)
 
 
178
 
179
+ # from IPython.display import Audio, display
180
+ # import ipywidgets as widgets
181
 
182
+ # label_df = pd.read_csv("labeling_in_progress.csv")
183
 
184
+ # def label_clip(idx, label):
185
+ # label_df.loc[idx, "accent_label"] = label
186
+ # label_df.to_csv("labeling_in_progress.csv", index=False)
187
+ # print(f"Labeled index {idx} as {'BD' if label==1 else 'IN'}")
188
 
189
+ # def play_and_label(idx):
190
+ # if idx >= len(label_df):
191
+ # print("✅ All clips labeled!")
192
+ # return
 
 
193
 
194
+ # row = label_df.iloc[idx]
195
+ # print(f"Index: {idx} | Speaker: {row['speaker_id']}")
196
+ # print(f"Text: {row['text']}")
197
+ # display(Audio(row["audio_path"]))
198
 
199
+ # bd_btn = widgets.Button(description="BD Accent (1)", button_style='success')
200
+ # in_btn = widgets.Button(description="IN Accent (0)", button_style='danger')
201
+ # skip_btn = widgets.Button(description="Skip", button_style='warning')
 
202
 
203
+ # def on_bd(b):
204
+ # label_clip(idx, 1)
205
+ # play_and_label(idx+1)
206
+ # def on_in(b):
207
+ # label_clip(idx, 0)
208
+ # play_and_label(idx+1)
209
+ # def on_skip(b):
210
+ # label_clip(idx, None)
211
+ # play_and_label(idx+1)
212
 
213
+ # bd_btn.on_click(on_bd)
214
+ # in_btn.on_click(on_in)
215
+ # skip_btn.on_click(on_skip)
216
 
217
+ # display(widgets.HBox([bd_btn, in_btn, skip_btn]))
218
 
219
+ # play_and_label(0)
 
220
 
221
+ # final_labels = pd.read_csv("labeling_in_progress.csv")
222
+ # final_labels = final_labels.dropna(subset=["accent_label"])
223
+ # final_labels.to_csv("accent_labeling_sample_labeled.csv", index=False)
224
+ # print(f"Saved {len(final_labels)} labeled samples.")
225
 
226
+ # import librosa
227
+ # import numpy as np
228
+ # import pandas as pd
229
+ # from sklearn.ensemble import RandomForestClassifier
230
+ # from sklearn.model_selection import train_test_split
231
+ # from sklearn.metrics import classification_report
232
 
233
+ # labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")
 
 
234
 
235
+ # def extract_mfcc(path, n_mfcc=13):
236
+ # y, sr = librosa.load(path, sr=22050)
237
+ # mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
238
+ # return np.mean(mfcc, axis=1)
239
 
240
+ # X = np.array([extract_mfcc(p) for p in labeled_df["audio_path"]])
241
+ # y = np.array(labeled_df["accent_label"])
242
 
 
243
 
244
+ # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
245
 
246
+ # clf = RandomForestClassifier(n_estimators=200, random_state=42)
247
+ # clf.fit(X_train, y_train)
248
 
 
249
 
250
+ # y_pred = clf.predict(X_test)
251
+ # print(classification_report(y_test, y_pred))
252
 
253
+ # df["accent_label"] = df["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
254
+ # bd_df = df[df["accent_label"] == 1]
255
+ # print(f"Bangladeshi-accent samples: {len(bd_df)}")
256
 
257
+ # bd_df.to_csv("bd_openslr53.csv", index=False)
258
 
259
+ # wget https://www.openslr.org/resources/53/asr_bengali_a.zip
 
260
 
261
+ # unzip asr_bengali_a.zip -d asr_bengali_a
262
 
263
+ # ls asr_bengali_a
264
+ # find asr_bengali_a -type f | head -20
265
 
266
+ # find /content -type d -name "*asr_bengali*"
267
 
268
+ # ls /content/asr_bengali_a/asr_bengali
 
269
 
270
+ # import pandas as pd
271
+ # import glob, os
 
272
 
273
+ # tsv_path = "/content/asr_bengali_a/asr_bengali/utt_spk_text.tsv"
274
+ # df_a = pd.read_csv(tsv_path, sep="\t", names=["utt_id", "speaker_id", "text"])
275
 
 
 
 
 
 
276
 
277
+ # audio_files = glob.glob("asr_bengali_a/data/**/*.flac", recursive=True)
278
+ # audio_map = {os.path.splitext(os.path.basename(f))[0]: f for f in audio_files}
279
 
 
 
280
 
281
+ # df_a["audio_path"] = df_a["utt_id"].map(audio_map)
 
 
 
 
 
 
282
 
 
 
283
 
284
+ # df_a = df_a.dropna(subset=["audio_path"])
285
+ # print(df_a.head())
286
 
287
+ # df_a["accent_label"] = df_a["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
288
+ # bd_df_a = df_a[df_a["accent_label"] == 1]
289
+ # print(f"Bangladeshi-accent samples: {len(bd_df_a)}")
290
 
291
+ # bd_df_a.to_csv("bd_asr_bengali_a.csv", index=False)
 
 
 
 
 
 
 
 
292
 
293
+ # final_df = pd.concat([
294
+ # pd.read_csv("bd_openslr53.csv"),
295
+ # pd.read_csv("bd_asr_bengali_a.csv")
296
+ # ])
297
+ # final_df.to_csv("bd_combined_dataset.csv", index=False)
298
 
299
+ # import soundfile as sf
300
+ # import os
301
 
302
+ # os.makedirs("processed_bd_audio", exist_ok=True)
303
+ # meta_lines = []
304
 
305
+ # for i, row in final_df.iterrows():
306
+ # y, sr = librosa.load(row["audio_path"], sr=22050)
307
+ # y, _ = librosa.effects.trim(y)
308
+ # y = y / (np.max(np.abs(y)) + 1e-9)
309
+ # out_path = f"processed_bd_audio/{i}.wav"
310
+ # sf.write(out_path, y, 22050)
311
+ # meta_lines.append(f"{out_path}|{row['text']}|bd_speaker")
312
 
313
+ # with open("metadata.csv", "w", encoding="utf-8") as f:
314
+ # f.write("\n".join(meta_lines))
315
 
316
+ # """# TASK 3"""
 
 
317
 
318
+ # # pip install librosa soundfile scikit-learn joblib numpy tqdm
 
319
 
320
+ # import os
321
+ # import numpy as np
322
+ # import pandas as pd
323
+ # import librosa
324
+ # from tqdm import tqdm
325
+ # from sklearn.ensemble import RandomForestClassifier
326
+ # from sklearn.model_selection import train_test_split
327
+ # from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
328
+ # import joblib
329
 
 
 
 
 
 
 
 
 
 
 
330
 
331
+ # SR = 22050
332
+ # N_MFCC = 13
333
 
334
 
335
+ # def extract_accent_features(audio_path, sr=SR, n_mfcc=N_MFCC):
336
+ # try:
337
+ # y, orig_sr = librosa.load(audio_path, sr=None)
338
+ # except:
339
+ # return None
340
 
341
+ # if orig_sr != sr:
342
+ # y = librosa.resample(y=y, orig_sr=orig_sr, target_sr=sr)
343
 
344
+ # y, _ = librosa.effects.trim(y, top_db=20)
345
+ # if y.size == 0:
346
+ # return None
 
347
 
348
+ # y = y / (np.max(np.abs(y)) + 1e-9)
349
+ # features = []
 
350
 
351
+ # mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
352
+ # delta = librosa.feature.delta(mfcc)
353
+ # features += list(np.mean(mfcc, axis=1))
354
+ # features += list(np.std(mfcc, axis=1))
355
+ # features += list(np.mean(delta, axis=1))
356
+ # features += list(np.std(delta, axis=1))
357
 
358
+ # cent = librosa.feature.spectral_centroid(y=y, sr=sr)
359
+ # bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
360
+ # rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
361
+ # zcr = librosa.feature.zero_crossing_rate(y)
362
+ # rms = librosa.feature.rms(y=y)
363
+ # features += [np.mean(cent), np.std(cent)]
364
+ # features += [np.mean(bw), np.std(bw)]
365
+ # features += [np.mean(rolloff), np.std(rolloff)]
366
+ # features += [np.mean(zcr), np.std(zcr)]
367
+ # features += [np.mean(rms), np.std(rms)]
368
 
369
+ # try:
370
+ # f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=50, fmax=600, sr=sr)
371
+ # if f0 is None:
372
+ # f0_stats = [0,0,0,0]
373
+ # else:
374
+ # voiced = ~np.isnan(f0)
375
+ # if voiced.sum() == 0:
376
+ # f0_stats = [0,0,0,0]
377
+ # else:
378
+ # f0_vals = f0[voiced]
379
+ # f0_stats = [
380
+ # np.mean(f0_vals),
381
+ # np.std(f0_vals),
382
+ # np.median(f0_vals),
383
+ # float(np.sum(voiced)) / len(f0)
384
+ # ]
385
+ # except:
386
+ # f0_stats = [0,0,0,0]
387
+ # features += f0_stats
388
+
389
+ # features += [len(y) / sr]
390
+
391
+ # return np.array(features)
392
+
393
+ # labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv") # Must have: audio_path, accent_label
394
+ # X, y = [], []
395
+
396
+ # for _, row in tqdm(labeled_df.iterrows(), total=len(labeled_df)):
397
+ # feats = extract_accent_features(row["audio_path"])
398
+ # if feats is not None:
399
+ # X.append(feats)
400
+ # y.append(int(row["accent_label"]))
401
+
402
+ # X = np.vstack(X)
403
+ # y = np.array(y)
404
+
405
+ # X_train, X_test, y_train, y_test = train_test_split(
406
+ # X, y, test_size=0.2, random_state=42
407
+ # )
408
+
409
+
410
+ # clf = RandomForestClassifier(
411
+ # n_estimators=300, random_state=42, n_jobs=-1
412
+ # )
413
+ # clf.fit(X_train, y_train)
414
+
415
+
416
+ # y_pred = clf.predict(X_test)
417
+ # print("✅ Accuracy:", accuracy_score(y_test, y_pred))
418
+ # print(classification_report(y_test, y_pred))
419
+ # print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
420
+
421
+ # joblib.dump(clf, "accent_rf_model.joblib")
422
+ # np.save("feature_shape.npy", X.shape[1])
423
+ # print("💾 Model saved as accent_rf_model.joblib")
424
+
425
+ # """# TASK 4"""
426
+
427
+ # from transformers import VitsModel
428
+
429
+ # class BDVitsModel(VitsModel):
430
+ # def __init__(self, config):
431
+ # super().__init__(config)
432
+ # self.bd_accent_adapter = torch.nn.Linear(config.hidden_size, config.hidden_size)
433
+
434
+ # def forward(self, input_ids, attention_mask=None, **kwargs):
435
+ # outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
436
+ # hidden_states = outputs.last_hidden_state
437
+ # hidden_states = self.bd_accent_adapter(hidden_states)
438
+ # return outputs
439
+
440
+ # def bd_text_normalize(text):
441
+ # text = text.replace("ড়", "র")
442
+ # text = text.replace("ঋ", "রি")
443
+ # text = text.replace("ই", "ঈ") if "..." else text
444
+ # return text
445
+
446
+ # sample_text = "ঋণী ছেলে বড় রাস্তা দিয়ে যাবে।"
447
+ # normalized_text = bd_text_normalize(sample_text)
448
+
449
+ # print("Original text: ", sample_text)
450
+ # print("Normalized text:", normalized_text)
451
+
452
+ # def bd_accent_loss(pred_mel, target_mel, pred_phonemes, target_phonemes, accent_weight=0.1, phoneme_weight=0.5):
453
+ # mel_loss = F.mse_loss(pred_mel, target_mel)
454
+ # phoneme_loss = F.cross_entropy(pred_phonemes, target_phonemes)
455
+ # accent_loss = accent_discriminator_loss(pred_mel)
456
+ # total_loss = mel_loss + phoneme_weight * phoneme_loss + accent_weight * accent_loss
457
+
458
+ # print(f"Mel Loss: {mel_loss.item():.4f} | Phoneme Loss: {phoneme_loss.item():.4f} | "
459
+ # f"Accent Loss: {accent_loss:.4f} | Total Loss: {total_loss.item():.4f}")
460
+ # return total_loss
461
+
462
+ # """# TASK 5"""
463
+
464
+ # # !pip install torch torchaudio transformers datasets librosa soundfile wandb accelerate
465
+ # # !pip install tqdm librosa
466
+
467
+ # import os, time, math, random
468
+ # import torch
469
+ # import torch.nn.functional as F
470
+ # from torch import nn, optim
471
+ # from torch.utils.data import DataLoader, Dataset
472
+ # from torch.cuda.amp import autocast, GradScaler
473
+ # import librosa, soundfile as sf, numpy as np
474
+ # from tqdm.auto import tqdm
475
+ # import joblib
476
+ # import wandb
477
+
478
+ # training_config = {
479
+ # "learning_rate": 1e-4,
480
+ # "batch_size": 16,
481
+ # "warmup_steps": 1000,
482
+ # "gradient_accumulation_steps": 4,
483
+ # "mixed_precision": True,
484
+ # "save_strategy": "steps",
485
+ # "save_steps": 500,
486
+ # "eval_steps": 100,
487
+ # "num_train_epochs": 3,
488
+ # "device": "cuda" if torch.cuda.is_available() else "cpu",
489
+ # "output_dir": "/content/drive/MyDrive/bd_tts_finetune",
490
+ # }
491
+ # os.makedirs(training_config["output_dir"], exist_ok=True)
492
 
493
+ # import pandas as pd
 
 
 
 
494
 
495
+ # df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text", "accent_label"])
 
 
 
 
496
 
497
+ # print(df.head())
498
+ # print(df.shape)
499
 
500
+ # head -n 10 metadata.csv
 
501
 
502
+ # df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text"])
 
 
 
 
503
 
504
+ # df.to_csv("metadata_clean.csv", index=False)
 
 
505
 
506
+ # """# TASK 6"""
507
 
508
+ # import torch
509
+ # import numpy as np
510
 
511
+ # sample = {
512
+ # 'text_input': "আমার নাম রাজি",
513
+ # 'mel_spectrogram': torch.randn(80, 200),
 
 
 
 
 
 
 
514
 
515
+ # 'audio_waveform': np.random.randn(44100).astype(np.float32),
 
 
 
 
 
 
 
 
 
 
 
 
 
516
 
517
+ # 'phonemes': ["a", "m", "a", "r", "n", "a", "m", "r", "a", "j", "i"]
518
+ # }
519
 
520
+ # import librosa
521
 
522
+ # audio_path = "/content/processed_bd_audio/audio.wav"
523
+ # audio, sr = librosa.load(audio_path, sr=22050)
524
 
525
+ # mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=80)
526
+ # mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
527
 
528
+ # import matplotlib.pyplot as plt
529
 
530
+ # plt.figure(figsize=(10, 4))
531
+ # plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
532
+ # plt.colorbar(format='%+2.0f dB')
533
+ # plt.title('Mel Spectrogram (dB)')
534
+ # plt.xlabel('Time frames')
535
+ # plt.ylabel('Mel frequency bins')
536
+ # plt.show()
537
 
538
+ # plt.figure(figsize=(10, 4))
539
+ # plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
540
+ # plt.colorbar(format='%+2.0f dB')
541
+ # plt.title('Mel Spectrogram (dB)')
542
+ # plt.xlabel('Time frames')
543
+ # plt.ylabel('Mel frequency bins')
544
+ # plt.savefig("/content/mel_spectrogram.png")
545
+ # plt.close()
546
 
547
+ # from IPython.display import Image
548
+ # Image("/content/mel_spectrogram.png")
549
+
550
+ # import torch
551
 
552
+ # mel_tensor = torch.tensor(mel_spectrogram_db).unsqueeze(0) # add batch dim if needed
553
+ # torch.save(mel_tensor, "/content/mel_spectrogram.pt")
 
554
 
555
+ # """# TASK 7"""
556
 
557
+ # import torch
558
+ # import torch.nn as nn
559
 
560
+ # class RelativePositionMultiHeadAttention(nn.Module):
561
+ # def __init__(self, num_heads=8, k_channels=64):
562
+ # super().__init__()
563
+ # self.num_heads = num_heads
564
+ # self.k_channels = k_channels
565
+ # self.conv_k = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
566
+ # self.conv_v = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
567
+ # self.conv_o = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
568
+
569
+ # @torch.jit.ignore
570
+ # def attention(self, query, key, value, mask=None):
571
+ # b = key.size(0)
572
+ # d = key.size(1)
573
+ # t_s = key.size(2)
574
+ # t_t = query.size(2)
575
 
576
+ # query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
577
+ # key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
578
+ # value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
579
 
580
+ # scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)
 
581
 
582
+ # if mask is not None:
583
+ # scores = scores.masked_fill(mask == 0, float('-inf'))
584
 
585
+ # attn = torch.softmax(scores, dim=-1)
586
+ # out = torch.matmul(attn, value)
 
 
 
 
 
587
 
588
+ # out = out.transpose(2, 3).contiguous().view(b, d, t_t)
 
 
 
 
 
 
 
589
 
590
+ # return out, attn
 
591
 
592
+ # def forward(self, c, attn_mask=None):
593
+ # q = c
594
+ # k = self.conv_k(c)
595
+ # v = self.conv_v(c)
596
+ # x, self.attn = self.attention(q, k, v, mask=attn_mask)
597
+ # x = self.conv_o(x)
598
+ # return x
599
 
600
+ # if __name__ == "__main__":
601
+ # batch_size = 2
602
+ # d_model = 512
603
+ # seq_len = 50
604
+ # num_heads = 8
605
+ # k_channels = d_model // num_heads
606
+
607
+ # model = RelativePositionMultiHeadAttention(num_heads=num_heads, k_channels=k_channels)
608
+
609
+ # c = torch.randn(batch_size, d_model, seq_len)
610
+ # output = model(c)
611
+ # print("Output shape:", output.shape)
612
 
 
613
 
614
+ # scripted_model = torch.jit.script(model)
615
+ # print("TorchScript model compiled successfully.")
616
 
617
+ # b, d, t = 2, 512, 50
618
+ # dummy_input = torch.randn(b, d, t)
619
+ # model = RelativePositionMultiHeadAttention(num_heads=8, k_channels=d//8)
 
 
 
 
 
620
 
621
+ # output = model(dummy_input)
622
+ # print(output.shape)
623
+
624
+ # import torch
625
+ # import torch.nn as nn
626
+ # import gradio as gr
627
+ # import numpy as np
628
+ # import librosa
629
+
630
+ # class RelativePositionMultiHeadAttention(nn.Module):
631
+ # def __init__(self, d_model=512, num_heads=8):
632
+ # super().__init__()
633
+ # self.num_heads = num_heads
634
+ # self.k_channels = d_model // num_heads
635
 
636
+ # self.conv_k = nn.Conv1d(d_model, d_model, kernel_size=1)
637
+ # self.conv_v = nn.Conv1d(d_model, d_model, kernel_size=1)
638
+ # self.conv_o = nn.Conv1d(d_model, d_model, kernel_size=1)
639
 
640
+ # @torch.jit.ignore
641
+ # def attention(self, query, key, value, mask=None):
642
+ # b = key.size(0)
643
+ # d = key.size(1)
644
+ # t_s = key.size(2)
645
+ # t_t = query.size(2)
646
 
647
+ # query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
648
+ # key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
649
+ # value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
650
 
651
+ # scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)
 
652
 
653
+ # if mask is not None:
654
+ # scores = scores.masked_fill(mask == 0, float('-inf'))
655
 
656
+ # attn = torch.softmax(scores, dim=-1)
657
+ # out = torch.matmul(attn, value)
658
 
659
+ # out = out.transpose(2, 3).contiguous().view(b, d, t_t)
660
+ # return out, attn
 
 
 
 
 
661
 
662
+ # def forward(self, c, attn_mask=None):
663
+ # q = c
664
+ # k = self.conv_k(c)
665
+ # v = self.conv_v(c)
666
+ # x, self.attn = self.attention(q, k, v, mask=attn_mask)
667
+ # x = self.conv_o(x)
668
+ # return x
669
 
670
+ # def preprocess_text(text):
671
+ # bengali_chars = "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ড়"
672
+ # char_to_idx = {ch: i+1 for i, ch in enumerate(bengali_chars)}
673
+ # tokens = [char_to_idx.get(ch, 0) for ch in text if ch.strip() != '']
674
+ # return tokens
675
 
676
+ # class TokenEmbedding(nn.Module):
677
+ # def __init__(self, vocab_size, d_model):
678
+ # super().__init__()
679
+ # self.embedding = nn.Embedding(vocab_size + 1, d_model, padding_idx=0)
680
 
681
+ # def forward(self, tokens):
682
+ # embedded = self.embedding(tokens)
683
+ # return embedded.transpose(1, 2)
684
 
685
+ # def mel_to_audio(mel_spectrogram, n_iter=60, sr=22050, n_fft=1024, hop_length=256):
 
686
 
687
+ # mel_power = librosa.db_to_power(mel_spectrogram)
688
+ # S = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
689
+ # audio = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length)
690
+ # return audio
691
 
692
+ # d_model = 512
693
+ # vocab_size = 50
694
+ # embedding = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
695
+ # attention_model = RelativePositionMultiHeadAttention(d_model=d_model, num_heads=8)
696
+ # embedding.eval()
697
+ # attention_model.eval()
698
 
699
+ # def tts_pipeline(user_text):
700
+ # tokens = preprocess_text(user_text)
701
+ # if len(tokens) == 0:
702
+ # return None
 
703
 
704
+ # input_tensor = torch.tensor(tokens).unsqueeze(0)
705
 
706
+ # with torch.no_grad():
707
+ # embedded = embedding(input_tensor)
708
+ # output = attention_model(embedded)
709
+ # mel = output.squeeze(0).cpu().numpy()
710
+ # mel = mel[:80, :]
 
711
 
712
+ # mel_db = 20 * np.log10(np.maximum(mel, 1e-5))
713
 
714
+ # audio = mel_to_audio(mel_db)
 
 
715
 
716
+ # return (22050, audio.astype(np.float32))
717
+ # import numpy as np
718
 
719
+ # import gradio as gr
 
720
 
721
+ # iface = gr.Interface(
722
+ # fn=tts_pipeline,
723
+ # inputs=gr.Textbox(label="Enter Bengali Text"),
724
+ # outputs=gr.Audio(label="Generated Speech"),
725
+ # title="Bangladeshi Bengali TTS Demo"
726
+ # )
727
 
728
+ # iface.launch()
729
 
730
+ # import subprocess
731
+ # import os
732
+ # import gradio as gr
733
 
734
 
735
+ # MODEL_PATH = "bangla_tts_female/pytorch_model.pth"
736
+ # CONFIG_PATH = "bangla_tts_female/config.json"
 
 
 
 
 
737
 
738
+ # def tts_from_cli(text):
739
+ # if not text.strip():
740
+ # return None
741
 
742
+ # safe_text = text.replace('"', '\\"')
743
 
744
+ # output_wav = "output.wav"
 
 
745
 
 
 
 
 
 
746
 
747
+ # cmd = [
748
+ # "tts",
749
+ # "--model_path", MODEL_PATH,
750
+ # "--config_path", CONFIG_PATH,
751
+ # "--text", safe_text,
752
+ # "--out_path", output_wav
753
+ # ]
754
 
 
 
 
 
 
 
755
 
756
+ # result = subprocess.run(cmd, capture_output=True, text=True)
757
+
758
+ # if result.returncode != 0:
759
+ # print("Error:", result.stderr)
760
+ # return None
761
+
762
+ # if os.path.exists(output_wav):
763
+ # return output_wav
764
+ # else:
765
+ # print("Output audio not found")
766
+ # return None
767
+
768
+
769
+ # iface = gr.Interface(
770
+ # fn=tts_from_cli,
771
+ # inputs=gr.Textbox(lines=2, placeholder="Enter Bengali text here..."),
772
+ # outputs=gr.Audio(type="filepath"),
773
+ # title="Bengali TTS with CLI Model"
774
+ # )
775
+
776
+ # iface.launch()