update eval
Browse files
eval.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
| 2 |
import argparse
|
| 3 |
import functools
|
| 4 |
import re
|
|
|
|
|
|
|
| 5 |
from typing import Dict
|
| 6 |
|
| 7 |
from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
|
|
@@ -50,9 +52,17 @@ def log_results(result: Dataset, args: Dict[str, str]):
|
|
| 50 |
def normalize_text(text: str) -> str:
|
| 51 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 52 |
|
| 53 |
-
chars_to_ignore_regex = '[
|
| 54 |
|
| 55 |
-
text = re.sub(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
# In addition, we can normalize the target text, e.g. by removing newline characters, etc.
|
| 58 |
# note that order is important here!
|
|
@@ -107,7 +117,7 @@ def main(args):
|
|
| 107 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
| 108 |
|
| 109 |
# for testing: only process the first ten examples
|
| 110 |
-
dataset = dataset.select(range(10))
|
| 111 |
|
| 112 |
# load processor
|
| 113 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
|
|
|
| 2 |
import argparse
|
| 3 |
import functools
|
| 4 |
import re
|
| 5 |
+
import string
|
| 6 |
+
import unidecode
|
| 7 |
from typing import Dict
|
| 8 |
|
| 9 |
from datasets import Audio, Dataset, DatasetDict, load_dataset, load_metric
|
|
|
|
| 52 |
def normalize_text(text: str) -> str:
|
| 53 |
"""DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
|
| 54 |
|
| 55 |
+
chars_to_ignore_regex = f'[{re.escape(string.punctuation)}]' # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
|
| 56 |
|
| 57 |
+
text = re.sub(
|
| 58 |
+
chars_to_ignore_regex,
|
| 59 |
+
"",
|
| 60 |
+
re.sub("['`´]", "’", # elsewhere probably meant as glottal stop
|
| 61 |
+
re.sub("([og])['`´]", "\g<1>‘", # after o/g indicate modified char
|
| 62 |
+
unidecode.unidecode(text).lower()
|
| 63 |
+
)
|
| 64 |
+
)
|
| 65 |
+
) + " "
|
| 66 |
|
| 67 |
# In addition, we can normalize the target text, e.g. by removing newline characters, etc.
|
| 68 |
# note that order is important here!
|
|
|
|
| 117 |
dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
|
| 118 |
|
| 119 |
# for testing: uncomment the next line to process only the first ten examples
|
| 120 |
+
# dataset = dataset.select(range(10))
|
| 121 |
|
| 122 |
# load processor
|
| 123 |
feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
|
mozilla-foundation_common_voice_8_0_uz_test_eval_results.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
WER: 0.4056227604601665
|
| 2 |
+
CER: 0.082530664990714
|