Ravi-9 committed on
Commit 9da2a2c · 1 Parent(s): 2a45039

Initial commit with app.py, inference.py, requirements.txt, and README

Files changed (3)
  1. app.py +756 -0
  2. inference.py +12 -0
  3. requirement.txt +15 -0
app.py ADDED
@@ -0,0 +1,756 @@
# -*- coding: utf-8 -*-
"""ML Engineer Assignment: Bangladeshi Bangla TTS Finetuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12ZrU_dlECt3YzVZ7k7qpwySH3eXUS7bj
"""

!nvidia-smi

!pip install transformers datasets torch torchaudio librosa
!pip install coqui-tts phonemizer
!apt-get install -y espeak-ng  # espeak-ng is a system package, not a pip one
!pip install wandb tensorboard matplotlib seaborn

!git lfs install
!git clone https://huggingface.co/bangla-speech-processing/bangla_tts_female

!ls bangla_tts_female

!tts --model_path bangla_tts_female/pytorch_model.pth \
     --config_path bangla_tts_female/config.json \
     --text "আমি বাংলাদেশ থেকে এসেছি।" \
     --out_path baseline.wav

from IPython.display import Audio
Audio("baseline.wav")

sentences = [
    "আমি বাংলাদেশ থেকে এসেছি।",
    "আজকের আবহাওয়া সুন্দর।",
    "তুমি কোথায় যাচ্ছ?",
    "আমরা ঢাকায় থাকি।",
    "এটা আমার প্রিয় বই।"
]

for i, text in enumerate(sentences, 1):
    safe_text = text.replace('"', '\\"')
    !tts --model_path bangla_tts_female/pytorch_model.pth \
         --config_path bangla_tts_female/config.json \
         --text "{safe_text}" \
         --out_path "baseline_{i}.wav"

from IPython.display import Audio
Audio("baseline_2.wav")

"""Checking the config.json"""

import json

with open("bangla_tts_female/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

print(json.dumps(config, indent=2, ensure_ascii=False))

"""Count parameters"""

from TTS.utils.synthesizer import Synthesizer
import torch

synthesizer = Synthesizer(
    tts_checkpoint="bangla_tts_female/pytorch_model.pth",
    tts_config_path="bangla_tts_female/config.json",
    use_cuda=torch.cuda.is_available()
)

model_params = sum(p.numel() for p in synthesizer.tts_model.parameters())
print(f"Total parameters: {model_params:,}")

"""Check tokenizer / phoneme system"""

print("Phonemizer:", config.get("phonemizer", "Not specified"))
print("Characters:", config.get("characters", "Not specified"))

"""# Task 2"""

!wget https://www.openslr.org/resources/53/asr_bengali_6.zip

!unzip asr_bengali_6.zip -d openslr_53

!find /content -type d -name "*asr_bengali*"

!ls /content/openslr_53/asr_bengali

import pandas as pd

tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
print(df.head())

import os

audio_dir = "/content/openslr_53/asr_bengali/data"
df["audio_path"] = df["utt_id"].apply(lambda x: os.path.join(audio_dir, f"{x}.wav"))
print(df.head())

df = df[df["audio_path"].apply(os.path.exists)]
print(f"Total usable audio files: {len(df)}")

import os, glob
import pandas as pd

tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])

# The corpus audio is .flac nested under data/, so map utt_id -> path by basename
file_dict = {
    os.path.splitext(os.path.basename(f))[0]: f
    for f in glob.glob("/content/openslr_53/asr_bengali/data/**/*.flac", recursive=True)
}

df["audio_path"] = df["utt_id"].map(file_dict)

df = df[df["audio_path"].notnull()]
print(f"Usable audio files: {len(df)}")
print(df.head())

!find /content/openslr_53/asr_bengali/data -type f | head -20

import librosa
import numpy as np

durations = []
for path in df["audio_path"].sample(100):
    y, sr = librosa.load(path, sr=None)
    durations.append(len(y) / sr)

print(f"Total samples: {len(df)}")
print(f"Duration: min={np.min(durations):.2f}s, mean={np.mean(durations):.2f}s, max={np.max(durations):.2f}s")
print(f"Unique speakers: {df['speaker_id'].nunique()}")

import pandas as pd

sample_df = df.sample(300, random_state=42)
sample_df.to_csv("accent_labeling_sample.csv", index=False)

from google.colab import files
files.download("accent_labeling_sample.csv")

# After labeling offline, re-upload the CSV under the same name
from google.colab import files
uploaded = files.upload()

import pandas as pd
labeled_df = pd.read_csv("accent_labeling_sample.csv")

print(labeled_df.columns)

sample_df = df.sample(300, random_state=42)
sample_df.to_csv("accent_labeling_sample.csv", index=False)

import pandas as pd

label_df = df.sample(50, random_state=42).reset_index(drop=True)
label_df["accent_label"] = None

label_df.to_csv("labeling_in_progress.csv", index=False)

from IPython.display import Audio, display
import ipywidgets as widgets

label_df = pd.read_csv("labeling_in_progress.csv")

state = {"idx": 0}  # current clip index, shared with the button callbacks

def label_clip(idx, label):
    label_df.loc[idx, "accent_label"] = label
    label_df.to_csv("labeling_in_progress.csv", index=False)
    print(f"Labeled index {idx} as {'BD' if label == 1 else 'IN'}")

def play_and_label(idx):
    if idx >= len(label_df):
        print("✅ All clips labeled!")
        return

    state["idx"] = idx
    row = label_df.iloc[idx]
    print(f"Index: {idx} | Speaker: {row['speaker_id']}")
    print(f"Text: {row['text']}")
    display(Audio(row["audio_path"]))

bd_btn = widgets.Button(description="BD Accent (1)", button_style='success')
in_btn = widgets.Button(description="IN Accent (0)", button_style='danger')
skip_btn = widgets.Button(description="Skip", button_style='warning')

# The callbacks read the current index from `state` (the original cell
# referenced an undefined `idx` here, which raised NameError on click)
def on_bd(b):
    label_clip(state["idx"], 1)
    play_and_label(state["idx"] + 1)

def on_in(b):
    label_clip(state["idx"], 0)
    play_and_label(state["idx"] + 1)

def on_skip(b):
    play_and_label(state["idx"] + 1)  # leave the label as None and move on

bd_btn.on_click(on_bd)
in_btn.on_click(on_in)
skip_btn.on_click(on_skip)

display(widgets.HBox([bd_btn, in_btn, skip_btn]))

play_and_label(0)

final_labels = pd.read_csv("labeling_in_progress.csv")
final_labels = final_labels.dropna(subset=["accent_label"])
final_labels.to_csv("accent_labeling_sample_labeled.csv", index=False)
print(f"Saved {len(final_labels)} labeled samples.")

import librosa
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")

def extract_mfcc(path, n_mfcc=13):
    y, sr = librosa.load(path, sr=22050)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfcc, axis=1)

X = np.array([extract_mfcc(p) for p in labeled_df["audio_path"]])
y = np.array(labeled_df["accent_label"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Classify every clip in the full dataframe (slow: one load + MFCC per file)
df["accent_label"] = df["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
bd_df = df[df["accent_label"] == 1]
print(f"Bangladeshi-accent samples: {len(bd_df)}")

bd_df.to_csv("bd_openslr53.csv", index=False)
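
# Optional sanity check (my addition, not in the original commit): with only
# ~50 labeled clips an 80/20 split is noisy, so cross-validate the classifier.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv=5)
print(f"5-fold CV accuracy: {scores.mean():.3f} ± {scores.std():.3f}")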

!wget https://www.openslr.org/resources/53/asr_bengali_a.zip

!unzip asr_bengali_a.zip -d asr_bengali_a

!ls asr_bengali_a
!find asr_bengali_a -type f | head -20

!find /content -type d -name "*asr_bengali*"

!ls /content/asr_bengali_a/asr_bengali

import pandas as pd
import glob, os

tsv_path = "/content/asr_bengali_a/asr_bengali/utt_spk_text.tsv"
df_a = pd.read_csv(tsv_path, sep="\t", names=["utt_id", "speaker_id", "text"])

# Glob under the extracted asr_bengali/ subfolder (the original path skipped it)
audio_files = glob.glob("asr_bengali_a/asr_bengali/data/**/*.flac", recursive=True)
audio_map = {os.path.splitext(os.path.basename(f))[0]: f for f in audio_files}

df_a["audio_path"] = df_a["utt_id"].map(audio_map)

df_a = df_a.dropna(subset=["audio_path"])
print(df_a.head())

df_a["accent_label"] = df_a["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
bd_df_a = df_a[df_a["accent_label"] == 1]
print(f"Bangladeshi-accent samples: {len(bd_df_a)}")

bd_df_a.to_csv("bd_asr_bengali_a.csv", index=False)

final_df = pd.concat([
    pd.read_csv("bd_openslr53.csv"),
    pd.read_csv("bd_asr_bengali_a.csv")
], ignore_index=True)  # reset row indices so output wav names don't collide
final_df.to_csv("bd_combined_dataset.csv", index=False)

import soundfile as sf
import os

os.makedirs("processed_bd_audio", exist_ok=True)
meta_lines = []

for i, row in final_df.iterrows():
    y, sr = librosa.load(row["audio_path"], sr=22050)
    y, _ = librosa.effects.trim(y)
    y = y / (np.max(np.abs(y)) + 1e-9)  # peak-normalize; epsilon guards silence
    out_path = f"processed_bd_audio/{i}.wav"
    sf.write(out_path, y, 22050)
    meta_lines.append(f"{out_path}|{row['text']}|bd_speaker")

with open("metadata.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(meta_lines))
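
# Quick validation (my addition, not in the original commit): each row should
# parse into path|text|speaker and point at a wav that exists.
with open("metadata.csv", encoding="utf-8") as f:
    rows = [line.split("|") for line in f.read().splitlines()]
assert all(len(r) == 3 for r in rows), "every row should be path|text|speaker"
assert all(os.path.exists(r[0]) for r in rows), "missing audio file"
print(f"metadata.csv OK: {len(rows)} rows")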

"""# TASK 3"""

!pip install librosa soundfile scikit-learn joblib numpy tqdm

import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

SR = 22050
N_MFCC = 13

def extract_accent_features(audio_path, sr=SR, n_mfcc=N_MFCC):
    try:
        y, orig_sr = librosa.load(audio_path, sr=None)
    except Exception:
        return None

    if orig_sr != sr:
        y = librosa.resample(y=y, orig_sr=orig_sr, target_sr=sr)

    y, _ = librosa.effects.trim(y, top_db=20)
    if y.size == 0:
        return None

    y = y / (np.max(np.abs(y)) + 1e-9)
    features = []

    # MFCCs plus deltas (mean/std over time)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta = librosa.feature.delta(mfcc)
    features += list(np.mean(mfcc, axis=1))
    features += list(np.std(mfcc, axis=1))
    features += list(np.mean(delta, axis=1))
    features += list(np.std(delta, axis=1))

    # Spectral shape, energy, and voicing statistics
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)
    features += [np.mean(cent), np.std(cent)]
    features += [np.mean(bw), np.std(bw)]
    features += [np.mean(rolloff), np.std(rolloff)]
    features += [np.mean(zcr), np.std(zcr)]
    features += [np.mean(rms), np.std(rms)]

    # Pitch (F0) statistics; fall back to zeros when pyin finds no voicing
    try:
        f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=50, fmax=600, sr=sr)
        if f0 is None:
            f0_stats = [0, 0, 0, 0]
        else:
            voiced = ~np.isnan(f0)
            if voiced.sum() == 0:
                f0_stats = [0, 0, 0, 0]
            else:
                f0_vals = f0[voiced]
                f0_stats = [
                    np.mean(f0_vals),
                    np.std(f0_vals),
                    np.median(f0_vals),
                    float(np.sum(voiced)) / len(f0)
                ]
    except Exception:
        f0_stats = [0, 0, 0, 0]
    features += f0_stats

    features += [len(y) / sr]  # clip duration

    return np.array(features)

labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")  # must have: audio_path, accent_label
X, y = [], []

for _, row in tqdm(labeled_df.iterrows(), total=len(labeled_df)):
    feats = extract_accent_features(row["audio_path"])
    if feats is not None:
        X.append(feats)
        y.append(int(row["accent_label"]))

X = np.vstack(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(
    n_estimators=300, random_state=42, n_jobs=-1
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

joblib.dump(clf, "accent_rf_model.joblib")
np.save("feature_shape.npy", X.shape[1])
print("💾 Model saved as accent_rf_model.joblib")

"""# TASK 4"""

import torch
from transformers import VitsModel

class BDVitsModel(VitsModel):
    def __init__(self, config):
        super().__init__(config)
        self.bd_accent_adapter = torch.nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
        # Feed the hidden states through the accent adapter and write the result
        # back (the original cell computed this but returned `outputs` unchanged)
        outputs.last_hidden_state = self.bd_accent_adapter(outputs.last_hidden_state)
        return outputs
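
# Sketch (my addition, not from the notebook): adapter-style finetuning would
# normally freeze the pretrained weights and train only the new layer.
# "facebook/mms-tts-ben" is a public Bengali VITS checkpoint used here purely
# for illustration; the adapter weights are randomly initialized on load.
model = BDVitsModel.from_pretrained("facebook/mms-tts-ben")
for p in model.parameters():
    p.requires_grad = False
for p in model.bd_accent_adapter.parameters():
    p.requires_grad = True
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable:,}")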

def bd_text_normalize(text):
    text = text.replace("ড়", "র")
    text = text.replace("ঋ", "রি")
    # The original had `text = text.replace("ই", "ঈ") if "..." else text`, whose
    # condition is always true; disabled until a real condition is supplied:
    # text = text.replace("ই", "ঈ")
    return text

sample_text = "ঋণী ছেলে বড় রাস্তা দিয়ে যাবে।"
normalized_text = bd_text_normalize(sample_text)

print("Original text:  ", sample_text)
print("Normalized text:", normalized_text)

import torch.nn.functional as F

def bd_accent_loss(pred_mel, target_mel, pred_phonemes, target_phonemes, accent_weight=0.1, phoneme_weight=0.5):
    mel_loss = F.mse_loss(pred_mel, target_mel)
    phoneme_loss = F.cross_entropy(pred_phonemes, target_phonemes)
    accent_loss = accent_discriminator_loss(pred_mel)  # assumed defined elsewhere
    total_loss = mel_loss + phoneme_weight * phoneme_loss + accent_weight * accent_loss

    print(f"Mel Loss: {mel_loss.item():.4f} | Phoneme Loss: {phoneme_loss.item():.4f} | "
          f"Accent Loss: {accent_loss.item():.4f} | Total Loss: {total_loss.item():.4f}")
    return total_loss
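
# Smoke test (my addition): exercise bd_accent_loss with random tensors and a
# stub discriminator loss, since the real one is not defined in this file.
def accent_discriminator_loss(mel):
    return mel.abs().mean()  # stand-in; a trained accent discriminator would go here

pred_mel = torch.randn(4, 80, 100)
target_mel = torch.randn(4, 80, 100)
pred_phonemes = torch.randn(4, 50)       # logits over a 50-symbol inventory
target_phonemes = torch.randint(0, 50, (4,))
loss = bd_accent_loss(pred_mel, target_mel, pred_phonemes, target_phonemes)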

"""# TASK 5"""

!pip install torch torchaudio transformers datasets librosa soundfile wandb accelerate
!pip install tqdm librosa

import os, time, math, random
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
import librosa, soundfile as sf, numpy as np
from tqdm.auto import tqdm
import joblib
import wandb

training_config = {
    "learning_rate": 1e-4,
    "batch_size": 16,
    "warmup_steps": 1000,
    "gradient_accumulation_steps": 4,
    "mixed_precision": True,
    "save_strategy": "steps",
    "save_steps": 500,
    "eval_steps": 100,
    "num_train_epochs": 3,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "output_dir": "/content/drive/MyDrive/bd_tts_finetune",
}
os.makedirs(training_config["output_dir"], exist_ok=True)
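
# Sketch (my addition): a linear-warmup learning-rate schedule wired to the
# config above, attached to a dummy parameter so the cell runs standalone.
dummy_param = torch.nn.Parameter(torch.zeros(1))
optimizer = optim.AdamW([dummy_param], lr=training_config["learning_rate"])

def warmup_lambda(step, warmup=training_config["warmup_steps"]):
    return min(1.0, (step + 1) / warmup)  # ramps 0 -> 1 over `warmup` steps

scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup_lambda)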

import pandas as pd

df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text", "accent_label"])

print(df.head())
print(df.shape)

!head -n 10 metadata.csv

# Keep all three pipe-separated fields; the original re-read with only two
# names, which shifts audio_path into the index
df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text", "speaker"])

df.to_csv("metadata_clean.csv", index=False)

"""# TASK 6"""

import torch
import numpy as np

sample = {
    'text_input': "আমার নাম রাজি",
    'mel_spectrogram': torch.randn(80, 200),
    'audio_waveform': np.random.randn(44100).astype(np.float32),
    'phonemes': ["a", "m", "a", "r", "n", "a", "m", "r", "a", "j", "i"]
}

import librosa

# Use one of the files written earlier (they are named 0.wav, 1.wav, ...);
# the original pointed at a non-existent "audio.wav"
audio_path = "/content/processed_bd_audio/0.wav"
audio, sr = librosa.load(audio_path, sr=22050)

mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=80)
mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram (dB)')
plt.xlabel('Time frames')
plt.ylabel('Mel frequency bins')
plt.show()

# Same plot again, saved to disk this time
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram (dB)')
plt.xlabel('Time frames')
plt.ylabel('Mel frequency bins')
plt.savefig("/content/mel_spectrogram.png")
plt.close()

from IPython.display import Image
Image("/content/mel_spectrogram.png")

import torch

mel_tensor = torch.tensor(mel_spectrogram_db).unsqueeze(0)  # add batch dim if needed
torch.save(mel_tensor, "/content/mel_spectrogram.pt")
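
# Optional round trip (my addition): invert the dB mel back to audio with
# Griffin-Lim to hear what this representation preserves.
import soundfile as sf

mel_power = librosa.db_to_power(mel_spectrogram_db)
recon = librosa.feature.inverse.mel_to_audio(mel_power, sr=22050, n_iter=32)
sf.write("/content/mel_roundtrip.wav", recon, 22050)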

"""# TASK 7"""

import torch
import torch.nn as nn

class RelativePositionMultiHeadAttention(nn.Module):
    def __init__(self, num_heads=8, k_channels=64):
        super().__init__()
        self.num_heads = num_heads
        self.k_channels = k_channels
        self.conv_k = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
        self.conv_v = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
        self.conv_o = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)

    @torch.jit.ignore
    def attention(self, query, key, value, mask=None):
        b = key.size(0)
        d = key.size(1)
        t_s = key.size(2)
        t_t = query.size(2)

        # (b, d, t) -> (b, heads, t, k_channels)
        query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, value)

        # (b, heads, t_t, k_channels) -> (b, d, t_t)
        out = out.transpose(2, 3).contiguous().view(b, d, t_t)

        return out, attn

    def forward(self, c, attn_mask=None):
        q = c  # query uses the input directly; no conv_q projection here
        k = self.conv_k(c)
        v = self.conv_v(c)
        x, self.attn = self.attention(q, k, v, mask=attn_mask)
        x = self.conv_o(x)
        return x

if __name__ == "__main__":
    batch_size = 2
    d_model = 512
    seq_len = 50
    num_heads = 8
    k_channels = d_model // num_heads

    model = RelativePositionMultiHeadAttention(num_heads=num_heads, k_channels=k_channels)

    c = torch.randn(batch_size, d_model, seq_len)
    output = model(c)
    print("Output shape:", output.shape)

    scripted_model = torch.jit.script(model)
    print("TorchScript model compiled successfully.")

b, d, t = 2, 512, 50
dummy_input = torch.randn(b, d, t)
model = RelativePositionMultiHeadAttention(num_heads=8, k_channels=d//8)

output = model(dummy_input)
print(output.shape)

import torch
import torch.nn as nn
import gradio as gr
import numpy as np
import librosa

class RelativePositionMultiHeadAttention(nn.Module):
    def __init__(self, d_model=512, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.k_channels = d_model // num_heads

        self.conv_k = nn.Conv1d(d_model, d_model, kernel_size=1)
        self.conv_v = nn.Conv1d(d_model, d_model, kernel_size=1)
        self.conv_o = nn.Conv1d(d_model, d_model, kernel_size=1)

    @torch.jit.ignore
    def attention(self, query, key, value, mask=None):
        b = key.size(0)
        d = key.size(1)
        t_s = key.size(2)
        t_t = query.size(2)

        query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, value)

        out = out.transpose(2, 3).contiguous().view(b, d, t_t)
        return out, attn

    def forward(self, c, attn_mask=None):
        q = c
        k = self.conv_k(c)
        v = self.conv_v(c)
        x, self.attn = self.attention(q, k, v, mask=attn_mask)
        x = self.conv_o(x)
        return x

def preprocess_text(text):
    # Bengali character inventory (duplicate "ড়" removed from the original)
    bengali_chars = "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়"
    char_to_idx = {ch: i + 1 for i, ch in enumerate(bengali_chars)}
    tokens = [char_to_idx.get(ch, 0) for ch in text if ch.strip() != '']
    return tokens

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, d_model, padding_idx=0)

    def forward(self, tokens):
        embedded = self.embedding(tokens)
        return embedded.transpose(1, 2)  # (batch, d_model, seq_len) for Conv1d

def mel_to_audio(mel_spectrogram, n_iter=60, sr=22050, n_fft=1024, hop_length=256):
    mel_power = librosa.db_to_power(mel_spectrogram)
    S = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
    audio = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length)
    return audio

d_model = 512
vocab_size = 50
embedding = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
attention_model = RelativePositionMultiHeadAttention(d_model=d_model, num_heads=8)
embedding.eval()
attention_model.eval()

def tts_pipeline(user_text):
    # NOTE: the embedding and attention weights are randomly initialized, so
    # this demo wires up the pipeline end to end but produces noise-like audio
    tokens = preprocess_text(user_text)
    if len(tokens) == 0:
        return None

    input_tensor = torch.tensor(tokens).unsqueeze(0)

    with torch.no_grad():
        embedded = embedding(input_tensor)
        output = attention_model(embedded)
        mel = output.squeeze(0).cpu().numpy()
        mel = mel[:80, :]  # keep 80 channels as a stand-in mel spectrogram

    mel_db = 20 * np.log10(np.maximum(mel, 1e-5))

    audio = mel_to_audio(mel_db)

    return (22050, audio.astype(np.float32))

iface = gr.Interface(
    fn=tts_pipeline,
    inputs=gr.Textbox(label="Enter Bengali Text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Bangladeshi Bengali TTS Demo"
)

iface.launch()

import subprocess
import os
import gradio as gr

MODEL_PATH = "bangla_tts_female/pytorch_model.pth"
CONFIG_PATH = "bangla_tts_female/config.json"

def tts_from_cli(text):
    if not text.strip():
        return None

    output_wav = "output.wav"

    # subprocess.run with an argument list bypasses the shell, so the text
    # needs no quote escaping (the original escaped '"' unnecessarily)
    cmd = [
        "tts",
        "--model_path", MODEL_PATH,
        "--config_path", CONFIG_PATH,
        "--text", text,
        "--out_path", output_wav
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print("Error:", result.stderr)
        return None

    if os.path.exists(output_wav):
        return output_wav
    else:
        print("Output audio not found")
        return None

iface = gr.Interface(
    fn=tts_from_cli,
    inputs=gr.Textbox(lines=2, placeholder="Enter Bengali text here..."),
    outputs=gr.Audio(type="filepath"),
    title="Bengali TTS with CLI Model"
)

iface.launch()
inference.py ADDED
@@ -0,0 +1,12 @@
from TTS.api import TTS
import subprocess

# Load the model through the Python API (pass the local checkpoint and config
# explicitly; the original passed "./" as a model name)
tts = TTS(model_path="bangla_tts_female/pytorch_model.pth",
          config_path="bangla_tts_female/config.json")

# Equivalent CLI invocation, runnable from a plain .py file (the original used
# notebook "!" magic, which is not valid Python)
subprocess.run([
    "tts",
    "--model_path", "bangla_tts_female/pytorch_model.pth",
    "--config_path", "bangla_tts_female/config.json",
    "--text", "আমি বাংলাদেশ থেকে এসেছি।",
    "--out_path", "baseline.wav",
], check=True)

# Listen to the result when running inside a notebook
from IPython.display import Audio
Audio("baseline.wav")
requirement.txt ADDED
@@ -0,0 +1,15 @@
torch
transformers
datasets
soundfile
librosa
gradio
numpy
torchaudio
phonemizer
# espeak-ng is a system dependency (apt-get install espeak-ng), not a pip package
coqui-tts
joblib
tqdm
pandas
matplotlib
scikit-learn