Initial commit with app.py, inference.py, requirements.txt, and README
- app.py +756 -0
- inference.py +12 -0
- requirement.txt +15 -0
app.py
ADDED
@@ -0,0 +1,756 @@
# -*- coding: utf-8 -*-
"""ML Engineer Assignment: Bangladeshi Bangla TTS Finetuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12ZrU_dlECt3YzVZ7k7qpwySH3eXUS7bj
"""

!nvidia-smi

!pip install transformers datasets torch torchaudio librosa
!pip install coqui-tts phonemizer espeak-ng
!pip install wandb tensorboard matplotlib seaborn

!git lfs install
!git clone https://huggingface.co/bangla-speech-processing/bangla_tts_female

!ls bangla_tts_female

!tts --model_path bangla_tts_female/pytorch_model.pth \
     --config_path bangla_tts_female/config.json \
     --text "আমি বাংলাদেশ থেকে এসেছি।" \
     --out_path baseline.wav

from IPython.display import Audio
Audio("baseline.wav")

sentences = [
    "আমি বাংলাদেশ থেকে এসেছি।",
    "আজকের আবহাওয়া সুন্দর।",
    "তুমি কোথায় যাচ্ছ?",
    "আমরা ঢাকায় থাকি।",
    "এটা আমার প্রিয় বই।"
]

for i, text in enumerate(sentences, 1):
    safe_text = text.replace('"', '\\"')
    !tts --model_path bangla_tts_female/pytorch_model.pth \
         --config_path bangla_tts_female/config.json \
         --text "{safe_text}" \
         --out_path "baseline_{i}.wav"

from IPython.display import Audio
Audio("baseline_2.wav")

"""Checking the config.json"""

import json

with open("bangla_tts_female/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

print(json.dumps(config, indent=2, ensure_ascii=False))

"""Count parameters"""

from TTS.utils.synthesizer import Synthesizer
import torch

synthesizer = Synthesizer(
    tts_checkpoint="bangla_tts_female/pytorch_model.pth",
    tts_config_path="bangla_tts_female/config.json",
    use_cuda=torch.cuda.is_available()
)

model_params = sum(p.numel() for p in synthesizer.tts_model.parameters())
print(f"Total parameters: {model_params:,}")

"""Check tokenizer / phoneme system"""

print("Phonemizer:", config.get("phonemizer", "Not specified"))
print("Characters:", config.get("characters", "Not specified"))
"""# Task 2"""

!wget https://www.openslr.org/resources/53/asr_bengali_6.zip

!unzip asr_bengali_6.zip -d openslr_53

!find /content -type d -name "*asr_bengali*"

!ls /content/openslr_53/asr_bengali

import pandas as pd

tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])
print(df.head())

import os

audio_dir = "/content/openslr_53/asr_bengali/data"
df["audio_path"] = df["utt_id"].apply(lambda x: os.path.join(audio_dir, f"{x}.wav"))
print(df.head())

df = df[df["audio_path"].apply(os.path.exists)]
print(f"Total usable audio files: {len(df)}")

import os, glob
import pandas as pd

tsv_path = "/content/openslr_53/asr_bengali/utt_spk_text.tsv"
df = pd.read_csv(tsv_path, sep="\t", header=None, names=["utt_id", "speaker_id", "text"])

# The archive stores audio as .flac files in nested folders, so map utterance IDs to paths.
file_dict = {
    os.path.splitext(os.path.basename(f))[0]: f
    for f in glob.glob("/content/openslr_53/asr_bengali/data/**/*.flac", recursive=True)
}

df["audio_path"] = df["utt_id"].map(file_dict)

df = df[df["audio_path"].notnull()]
print(f"Usable audio files: {len(df)}")
print(df.head())

!find /content/openslr_53/asr_bengali/data -type f | head -20

import librosa
import numpy as np

durations = []
for path in df["audio_path"].sample(100):
    y, sr = librosa.load(path, sr=None)
    durations.append(len(y) / sr)

print(f"Total samples: {len(df)}")
print(f"Duration: min={np.min(durations):.2f}s, mean={np.mean(durations):.2f}s, max={np.max(durations):.2f}s")
print(f"Unique speakers: {df['speaker_id'].nunique()}")

import pandas as pd

sample_df = df.sample(300, random_state=42)
sample_df.to_csv("accent_labeling_sample.csv", index=False)

from google.colab import files
files.download("accent_labeling_sample.csv")

from google.colab import files
uploaded = files.upload()

import pandas as pd
labeled_df = pd.read_csv("accent_labeling_sample.csv")

print(labeled_df.columns)

sample_df = df.sample(300, random_state=42)
sample_df.to_csv("accent_labeling_sample.csv", index=False)

import pandas as pd

label_df = df.sample(50, random_state=42).reset_index(drop=True)
label_df["accent_label"] = None

label_df.to_csv("labeling_in_progress.csv", index=False)

# Lightweight in-notebook labeling UI: play a clip, then click BD / IN / Skip.
from IPython.display import Audio, display
import ipywidgets as widgets

label_df = pd.read_csv("labeling_in_progress.csv")

def label_clip(idx, label):
    label_df.loc[idx, "accent_label"] = label
    label_df.to_csv("labeling_in_progress.csv", index=False)
    print(f"Labeled index {idx} as {'BD' if label == 1 else 'IN'}")

def play_and_label(idx):
    if idx >= len(label_df):
        print("✅ All clips labeled!")
        return

    row = label_df.iloc[idx]
    print(f"Index: {idx} | Speaker: {row['speaker_id']}")
    print(f"Text: {row['text']}")
    display(Audio(row["audio_path"]))

    bd_btn = widgets.Button(description="BD Accent (1)", button_style='success')
    in_btn = widgets.Button(description="IN Accent (0)", button_style='danger')
    skip_btn = widgets.Button(description="Skip", button_style='warning')

    def on_bd(b):
        label_clip(idx, 1)
        play_and_label(idx + 1)

    def on_in(b):
        label_clip(idx, 0)
        play_and_label(idx + 1)

    def on_skip(b):
        label_clip(idx, None)
        play_and_label(idx + 1)

    bd_btn.on_click(on_bd)
    in_btn.on_click(on_in)
    skip_btn.on_click(on_skip)

    display(widgets.HBox([bd_btn, in_btn, skip_btn]))

play_and_label(0)

final_labels = pd.read_csv("labeling_in_progress.csv")
final_labels = final_labels.dropna(subset=["accent_label"])
final_labels.to_csv("accent_labeling_sample_labeled.csv", index=False)
print(f"Saved {len(final_labels)} labeled samples.")

import librosa
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")

def extract_mfcc(path, n_mfcc=13):
    y, sr = librosa.load(path, sr=22050)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(mfcc, axis=1)

X = np.array([extract_mfcc(p) for p in labeled_df["audio_path"]])
y = np.array(labeled_df["accent_label"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

df["accent_label"] = df["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
bd_df = df[df["accent_label"] == 1]
print(f"Bangladeshi-accent samples: {len(bd_df)}")

bd_df.to_csv("bd_openslr53.csv", index=False)

!wget https://www.openslr.org/resources/53/asr_bengali_a.zip

!unzip asr_bengali_a.zip -d asr_bengali_a

!ls asr_bengali_a
!find asr_bengali_a -type f | head -20

!find /content -type d -name "*asr_bengali*"

!ls /content/asr_bengali_a/asr_bengali

import pandas as pd
import glob, os

tsv_path = "/content/asr_bengali_a/asr_bengali/utt_spk_text.tsv"
df_a = pd.read_csv(tsv_path, sep="\t", names=["utt_id", "speaker_id", "text"])

audio_files = glob.glob("asr_bengali_a/data/**/*.flac", recursive=True)
audio_map = {os.path.splitext(os.path.basename(f))[0]: f for f in audio_files}

df_a["audio_path"] = df_a["utt_id"].map(audio_map)

df_a = df_a.dropna(subset=["audio_path"])
print(df_a.head())

df_a["accent_label"] = df_a["audio_path"].apply(lambda p: clf.predict([extract_mfcc(p)])[0])
bd_df_a = df_a[df_a["accent_label"] == 1]
print(f"Bangladeshi-accent samples: {len(bd_df_a)}")

bd_df_a.to_csv("bd_asr_bengali_a.csv", index=False)

final_df = pd.concat([
    pd.read_csv("bd_openslr53.csv"),
    pd.read_csv("bd_asr_bengali_a.csv")
])
final_df.to_csv("bd_combined_dataset.csv", index=False)

import soundfile as sf
import os

os.makedirs("processed_bd_audio", exist_ok=True)
meta_lines = []

for i, row in final_df.iterrows():
    y, sr = librosa.load(row["audio_path"], sr=22050)
    y, _ = librosa.effects.trim(y)
    y = y / (np.max(np.abs(y)) + 1e-9)
    out_path = f"processed_bd_audio/{i}.wav"
    sf.write(out_path, y, 22050)
    meta_lines.append(f"{out_path}|{row['text']}|bd_speaker")

with open("metadata.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(meta_lines))
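
# --- Added sketch (not in the original notebook): hold out a small eval split. ---
# The fine-tuning config further below uses eval_steps, so a train/eval split of
# the metadata is assumed here; the file names train_metadata.csv and
# eval_metadata.csv are illustrative, not required by any library.
import random

random.seed(42)
random.shuffle(meta_lines)
n_eval = max(1, int(0.05 * len(meta_lines)))

with open("eval_metadata.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(meta_lines[:n_eval]))
with open("train_metadata.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(meta_lines[n_eval:]))

print(f"Train clips: {len(meta_lines) - n_eval} | Eval clips: {n_eval}")
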
"""# TASK 3"""

!pip install librosa soundfile scikit-learn joblib numpy tqdm

import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

SR = 22050
N_MFCC = 13

def extract_accent_features(audio_path, sr=SR, n_mfcc=N_MFCC):
    try:
        y, orig_sr = librosa.load(audio_path, sr=None)
    except Exception:
        return None

    if orig_sr != sr:
        y = librosa.resample(y=y, orig_sr=orig_sr, target_sr=sr)

    y, _ = librosa.effects.trim(y, top_db=20)
    if y.size == 0:
        return None

    y = y / (np.max(np.abs(y)) + 1e-9)
    features = []

    # MFCCs and their deltas (mean/std over time)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta = librosa.feature.delta(mfcc)
    features += list(np.mean(mfcc, axis=1))
    features += list(np.std(mfcc, axis=1))
    features += list(np.mean(delta, axis=1))
    features += list(np.std(delta, axis=1))

    # Spectral shape, zero-crossing and energy statistics
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)
    features += [np.mean(cent), np.std(cent)]
    features += [np.mean(bw), np.std(bw)]
    features += [np.mean(rolloff), np.std(rolloff)]
    features += [np.mean(zcr), np.std(zcr)]
    features += [np.mean(rms), np.std(rms)]

    # Pitch (F0) statistics and voiced ratio
    try:
        f0, voiced_flag, voiced_prob = librosa.pyin(y, fmin=50, fmax=600, sr=sr)
        if f0 is None:
            f0_stats = [0, 0, 0, 0]
        else:
            voiced = ~np.isnan(f0)
            if voiced.sum() == 0:
                f0_stats = [0, 0, 0, 0]
            else:
                f0_vals = f0[voiced]
                f0_stats = [
                    np.mean(f0_vals),
                    np.std(f0_vals),
                    np.median(f0_vals),
                    float(np.sum(voiced)) / len(f0)
                ]
    except Exception:
        f0_stats = [0, 0, 0, 0]
    features += f0_stats

    # Clip duration in seconds
    features += [len(y) / sr]

    return np.array(features)

labeled_df = pd.read_csv("accent_labeling_sample_labeled.csv")  # Must have: audio_path, accent_label
X, y = [], []

for _, row in tqdm(labeled_df.iterrows(), total=len(labeled_df)):
    feats = extract_accent_features(row["audio_path"])
    if feats is not None:
        X.append(feats)
        y.append(int(row["accent_label"]))

X = np.vstack(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(
    n_estimators=300, random_state=42, n_jobs=-1
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

joblib.dump(clf, "accent_rf_model.joblib")
np.save("feature_shape.npy", X.shape[1])
print("💾 Model saved as accent_rf_model.joblib")
"""# TASK 4"""

from transformers import VitsModel

class BDVitsModel(VitsModel):
    def __init__(self, config):
        super().__init__(config)
        # Extra projection intended to nudge hidden states toward a Bangladeshi accent.
        self.bd_accent_adapter = torch.nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = outputs.last_hidden_state
        hidden_states = self.bd_accent_adapter(hidden_states)
        # NOTE: the adapted hidden states are computed but not fed back into the
        # returned outputs, so this class is an illustrative sketch rather than a
        # working accent adapter.
        return outputs

def bd_text_normalize(text):
    text = text.replace("ড়", "র")
    text = text.replace("ঋ", "রি")
    # The "..." condition below is a placeholder from the original draft; it is
    # always truthy, so this replacement always runs.
    text = text.replace("ই", "ঈ") if "..." else text
    return text

sample_text = "ঋণী ছেলে বড় রাস্তা দিয়ে যাবে।"
normalized_text = bd_text_normalize(sample_text)

print("Original text: ", sample_text)
print("Normalized text:", normalized_text)

def bd_accent_loss(pred_mel, target_mel, pred_phonemes, target_phonemes, accent_weight=0.1, phoneme_weight=0.5):
    mel_loss = F.mse_loss(pred_mel, target_mel)
    phoneme_loss = F.cross_entropy(pred_phonemes, target_phonemes)
    accent_loss = accent_discriminator_loss(pred_mel)
    total_loss = mel_loss + phoneme_weight * phoneme_loss + accent_weight * accent_loss

    print(f"Mel Loss: {mel_loss.item():.4f} | Phoneme Loss: {phoneme_loss.item():.4f} | "
          f"Accent Loss: {accent_loss.item():.4f} | Total Loss: {total_loss.item():.4f}")
    return total_loss
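
# --- Added sketch (not in the original notebook). ---
# bd_accent_loss() above references `F` and `accent_discriminator_loss`, neither
# of which is defined at this point in the file. A minimal, hedged placeholder:
# a tiny (untrained) discriminator that scores how Bangladeshi-accented a mel
# spectrogram sounds, with the loss pushing generated mels toward the "BD" class.
# In practice this module would be trained on mels from the accent-labeled clips above.
import torch
import torch.nn.functional as F

class TinyAccentDiscriminator(torch.nn.Module):
    """Toy stand-in: mean-pool the mel over time and map it to a BD-accent logit."""

    def __init__(self, n_mels=80):
        super().__init__()
        self.proj = torch.nn.Linear(n_mels, 1)

    def forward(self, mel):                    # mel: (batch, n_mels, frames)
        pooled = mel.mean(dim=-1)              # (batch, n_mels)
        return self.proj(pooled).squeeze(-1)   # (batch,) logits

_accent_disc = TinyAccentDiscriminator()

def accent_discriminator_loss(pred_mel):
    # Encourage generated mels to be classified as Bangladeshi-accented (label 1).
    logits = _accent_disc(pred_mel)
    return F.binary_cross_entropy_with_logits(logits, torch.ones_like(logits))
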
"""# TASK 5"""

!pip install torch torchaudio transformers datasets librosa soundfile wandb accelerate
!pip install tqdm librosa

import os, time, math, random
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
import librosa, soundfile as sf, numpy as np
from tqdm.auto import tqdm
import joblib
import wandb

training_config = {
    "learning_rate": 1e-4,
    "batch_size": 16,
    "warmup_steps": 1000,
    "gradient_accumulation_steps": 4,
    "mixed_precision": True,
    "save_strategy": "steps",
    "save_steps": 500,
    "eval_steps": 100,
    "num_train_epochs": 3,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "output_dir": "/content/drive/MyDrive/bd_tts_finetune",
}
os.makedirs(training_config["output_dir"], exist_ok=True)
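
# --- Added sketch (not in the original notebook). ---
# training_config above is defined but no training loop appears in this file. A
# minimal skeleton showing how these settings (mixed precision, gradient
# accumulation, warmup, periodic checkpoints) could drive fine-tuning; `model`,
# `dataloader`, and `compute_loss` are assumed to be supplied by the caller.
import os
import torch
from torch import optim
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm

def train_loop(model, dataloader, compute_loss, config=training_config):
    device = config["device"]
    use_amp = config["mixed_precision"] and device == "cuda"
    model.to(device).train()

    optimizer = optim.AdamW(model.parameters(), lr=config["learning_rate"])
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: min(1.0, (step + 1) / config["warmup_steps"]),  # linear warmup
    )
    scaler = GradScaler(enabled=use_amp)

    global_step = 0
    for epoch in range(config["num_train_epochs"]):
        for i, batch in enumerate(tqdm(dataloader, desc=f"epoch {epoch}")):
            with autocast(enabled=use_amp):
                loss = compute_loss(model, batch) / config["gradient_accumulation_steps"]
            scaler.scale(loss).backward()

            if (i + 1) % config["gradient_accumulation_steps"] == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

                if global_step % config["save_steps"] == 0:
                    torch.save(
                        model.state_dict(),
                        os.path.join(config["output_dir"], f"checkpoint_{global_step}.pt"),
                    )
    return model
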
import pandas as pd

# metadata.csv has three "|"-separated fields: audio_path|text|speaker.
df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text", "accent_label"])

print(df.head())
print(df.shape)

!head -n 10 metadata.csv

# NOTE: this re-read passes only two names for three fields, so pandas shifts
# the columns (the leading field becomes the index); check metadata_clean.csv
# before using it for training.
df = pd.read_csv("metadata.csv", sep="|", names=["audio_path", "text"])

df.to_csv("metadata_clean.csv", index=False)

"""# TASK 6"""

import torch
import numpy as np

# Example of the fields a single training sample is expected to carry.
sample = {
    'text_input': "আমার নাম রাজি",
    'mel_spectrogram': torch.randn(80, 200),
    'audio_waveform': np.random.randn(44100).astype(np.float32),
    'phonemes': ["a", "m", "a", "r", "n", "a", "m", "r", "a", "j", "i"]
}

import librosa

# Assumes a clip named audio.wav exists in processed_bd_audio (the preprocessing
# loop above writes numbered files such as 0.wav); adjust the path as needed.
audio_path = "/content/processed_bd_audio/audio.wav"
audio, sr = librosa.load(audio_path, sr=22050)

mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=80)
mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)

import matplotlib.pyplot as plt

# Display the mel spectrogram inline.
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram (dB)')
plt.xlabel('Time frames')
plt.ylabel('Mel frequency bins')
plt.show()

# Save the same figure to disk.
plt.figure(figsize=(10, 4))
plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower', cmap='magma')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram (dB)')
plt.xlabel('Time frames')
plt.ylabel('Mel frequency bins')
plt.savefig("/content/mel_spectrogram.png")
plt.close()

from IPython.display import Image
Image("/content/mel_spectrogram.png")

import torch

mel_tensor = torch.tensor(mel_spectrogram_db).unsqueeze(0)  # add batch dim if needed
torch.save(mel_tensor, "/content/mel_spectrogram.pt")
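
# --- Added sketch (not in the original notebook): invert the saved mel as a
# sanity check, using the same Griffin-Lim approach as mel_to_audio() in the
# demo section further below. The output file name is illustrative.
import librosa
import soundfile as sf
from IPython.display import Audio

recon_power = librosa.db_to_power(mel_spectrogram_db)
recon_stft = librosa.feature.inverse.mel_to_stft(recon_power, sr=22050, n_fft=2048)
recon_audio = librosa.griffinlim(recon_stft, n_iter=32)

sf.write("/content/mel_reconstruction_check.wav", recon_audio, 22050)
Audio("/content/mel_reconstruction_check.wav")
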
"""# TASK 7"""

import torch
import torch.nn as nn

class RelativePositionMultiHeadAttention(nn.Module):
    def __init__(self, num_heads=8, k_channels=64):
        super().__init__()
        self.num_heads = num_heads
        self.k_channels = k_channels
        self.conv_k = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
        self.conv_v = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)
        self.conv_o = nn.Conv1d(in_channels=k_channels * num_heads, out_channels=k_channels * num_heads, kernel_size=1)

    @torch.jit.ignore
    def attention(self, query, key, value, mask=None):
        b = key.size(0)
        d = key.size(1)
        t_s = key.size(2)
        t_t = query.size(2)

        # Split channels into heads: (b, heads, t, k_channels)
        query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)

        # Scaled dot-product attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, value)

        # Merge heads back to (b, d, t_t)
        out = out.transpose(2, 3).contiguous().view(b, d, t_t)

        return out, attn

    def forward(self, c, attn_mask=None):
        # The query is used unprojected (there is no conv_q) and, despite the class
        # name, no relative-position terms are applied; this is a simplified block.
        q = c
        k = self.conv_k(c)
        v = self.conv_v(c)
        x, self.attn = self.attention(q, k, v, mask=attn_mask)
        x = self.conv_o(x)
        return x

if __name__ == "__main__":
    batch_size = 2
    d_model = 512
    seq_len = 50
    num_heads = 8
    k_channels = d_model // num_heads

    model = RelativePositionMultiHeadAttention(num_heads=num_heads, k_channels=k_channels)

    c = torch.randn(batch_size, d_model, seq_len)
    output = model(c)
    print("Output shape:", output.shape)

    scripted_model = torch.jit.script(model)
    print("TorchScript model compiled successfully.")

b, d, t = 2, 512, 50
dummy_input = torch.randn(b, d, t)
model = RelativePositionMultiHeadAttention(num_heads=8, k_channels=d // 8)

output = model(dummy_input)
print(output.shape)

import torch
import torch.nn as nn
import gradio as gr
import numpy as np
import librosa

class RelativePositionMultiHeadAttention(nn.Module):
    """Re-declared for the Gradio demo with d_model-style arguments."""

    def __init__(self, d_model=512, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.k_channels = d_model // num_heads

        self.conv_k = nn.Conv1d(d_model, d_model, kernel_size=1)
        self.conv_v = nn.Conv1d(d_model, d_model, kernel_size=1)
        self.conv_o = nn.Conv1d(d_model, d_model, kernel_size=1)

    @torch.jit.ignore
    def attention(self, query, key, value, mask=None):
        b = key.size(0)
        d = key.size(1)
        t_s = key.size(2)
        t_t = query.size(2)

        query = query.view(b, self.num_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.num_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query, key.transpose(-2, -1)) / (self.k_channels ** 0.5)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, value)

        out = out.transpose(2, 3).contiguous().view(b, d, t_t)
        return out, attn

    def forward(self, c, attn_mask=None):
        q = c
        k = self.conv_k(c)
        v = self.conv_v(c)
        x, self.attn = self.attention(q, k, v, mask=attn_mask)
        x = self.conv_o(x)
        return x

def preprocess_text(text):
    # Simple character-level tokenizer over the Bengali alphabet; unknown
    # characters map to 0 (the padding index).
    bengali_chars = "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহড়ঢ়য়ড়"
    char_to_idx = {ch: i + 1 for i, ch in enumerate(bengali_chars)}
    tokens = [char_to_idx.get(ch, 0) for ch in text if ch.strip() != '']
    return tokens

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, d_model, padding_idx=0)

    def forward(self, tokens):
        embedded = self.embedding(tokens)
        return embedded.transpose(1, 2)  # (batch, d_model, seq_len) for the Conv1d layers

def mel_to_audio(mel_spectrogram, n_iter=60, sr=22050, n_fft=1024, hop_length=256):
    # Invert a dB-scaled mel spectrogram back to a waveform with Griffin-Lim.
    mel_power = librosa.db_to_power(mel_spectrogram)
    S = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
    audio = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length)
    return audio

d_model = 512
vocab_size = 50
embedding = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
attention_model = RelativePositionMultiHeadAttention(d_model=d_model, num_heads=8)
embedding.eval()
attention_model.eval()

def tts_pipeline(user_text):
    # NOTE: the embedding and attention layers are randomly initialized, so this
    # demo produces placeholder audio rather than intelligible speech.
    tokens = preprocess_text(user_text)
    if len(tokens) == 0:
        return None

    input_tensor = torch.tensor(tokens).unsqueeze(0)

    with torch.no_grad():
        embedded = embedding(input_tensor)
        output = attention_model(embedded)
        mel = output.squeeze(0).cpu().numpy()
        mel = mel[:80, :]

    mel_db = 20 * np.log10(np.maximum(mel, 1e-5))
    audio = mel_to_audio(mel_db)

    return (22050, audio.astype(np.float32))

import numpy as np
import gradio as gr

iface = gr.Interface(
    fn=tts_pipeline,
    inputs=gr.Textbox(label="Enter Bengali Text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Bangladeshi Bengali TTS Demo"
)

iface.launch()

# A second demo that shells out to the Coqui TTS CLI using the downloaded checkpoint.
import subprocess
import os
import gradio as gr

MODEL_PATH = "bangla_tts_female/pytorch_model.pth"
CONFIG_PATH = "bangla_tts_female/config.json"

def tts_from_cli(text):
    if not text.strip():
        return None

    safe_text = text.replace('"', '\\"')
    output_wav = "output.wav"

    cmd = [
        "tts",
        "--model_path", MODEL_PATH,
        "--config_path", CONFIG_PATH,
        "--text", safe_text,
        "--out_path", output_wav
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print("Error:", result.stderr)
        return None

    if os.path.exists(output_wav):
        return output_wav
    else:
        print("Output audio not found")
        return None

iface = gr.Interface(
    fn=tts_from_cli,
    inputs=gr.Textbox(lines=2, placeholder="Enter Bengali text here..."),
    outputs=gr.Audio(type="filepath"),
    title="Bengali TTS with CLI Model"
)

iface.launch()
inference.py
ADDED
@@ -0,0 +1,12 @@
from TTS.api import TTS
import subprocess
from IPython.display import Audio  # playback works in a notebook environment

tts = TTS("./")  # local model path or Hugging Face ID (loaded here but unused below)

# The original notebook used the `!tts ...` shell magic; as a plain script the
# same synthesis call goes through subprocess.
subprocess.run([
    "tts",
    "--model_path", "bangla_tts_female/pytorch_model.pth",
    "--config_path", "bangla_tts_female/config.json",
    "--text", "আমি বাংলাদেশ থেকে এসেছি।",
    "--out_path", "baseline.wav",
], check=True)

Audio("baseline.wav")
requirement.txt
ADDED
@@ -0,0 +1,15 @@
torch
transformers
datasets
soundfile
librosa
gradio
numpy
torchaudio
phonemizer
espeak-ng
coqui-tts
joblib
tqdm
pandas
scikit-learn