add lm info

Browse files

Files changed (5) hide show

build_lm_processor.ipynb +22 -34
eval.sh +1 -1
preprocessor_config.json +1 -0
special_tokens_map.json +1 -1
tokenizer_config.json +1 -1

build_lm_processor.ipynb CHANGED Viewed

@@ -2,8 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
-   "id": "5393aa33",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,8 +24,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "2d34d3b8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,30 +35,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "f0354cb2",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Loading the LM will be faster if you build a binary file.\n",
-      "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
-      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
-      "****************************************************************************************************\n"
-     ]
-    }
-   ],
    "source": [
-    "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "109f28e9",
    "metadata": {},
    "outputs": [
     {
@@ -77,8 +65,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
-   "id": "300cec39",
    "metadata": {},
    "outputs": [
     {
@@ -88,8 +76,8 @@
       "Loading the LM will be faster if you build a binary file.\n",
       "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
-      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
-      "****************************************************************************************************\n"
      ]
     }
    ],
@@ -102,8 +90,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "27dd8427",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -116,8 +104,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "94eb248e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -126,7 +114,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8f9b3dcc",
    "metadata": {},
    "source": [
     "## Save Model"
@@ -135,7 +123,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "8b584690",
    "metadata": {},
    "outputs": [
     {
@@ -160,7 +148,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "3712c030",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,7 +158,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b5d8de20",
    "metadata": {},
    "outputs": [],
    "source": []

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 8,
+   "id": "4ceb07da",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
+   "id": "adaa2f36",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
+   "id": "4f07fc9d",
    "metadata": {},
+   "outputs": [],
    "source": [
+    "processor = AutoProcessor.from_pretrained(\"vitouphy/wav2vec2-xls-r-1b-km\")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
+   "id": "17473aee",
    "metadata": {},
    "outputs": [
     {
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
+   "id": "33fa6838",
    "metadata": {},
    "outputs": [
     {
       "Loading the LM will be faster if you build a binary file.\n",
       "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "****************************************************************************************************\n",
+      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
+   "id": "ae0d32e9",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
+   "id": "d1acffc0",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "markdown",
+   "id": "499eb495",
    "metadata": {},
    "source": [
     "## Save Model"
   {
    "cell_type": "code",
    "execution_count": 9,
+   "id": "bdd7821c",
    "metadata": {},
    "outputs": [
     {
   {
    "cell_type": "code",
    "execution_count": 12,
+   "id": "3c78a0bf",
    "metadata": {},
    "outputs": [],
    "source": [
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "202fbb76",
    "metadata": {},
    "outputs": [],
    "source": []

eval.sh CHANGED Viewed

@@ -1,5 +1,5 @@
 ./eval.py \
---model_id ./ \
 --dataset openslr \
 --config km \
 --split test \

 ./eval.py \
+--model_id vitouphy/wav2vec2-xls-r-1b-km \
 --dataset openslr \
 --config km \
 --split test \

preprocessor_config.json CHANGED Viewed

@@ -4,6 +4,7 @@
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
   "return_attention_mask": true,
   "sampling_rate": 16000
 }

   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
+  "processor_class": "Wav2Vec2ProcessorWithLM",
   "return_attention_mask": true,
   "sampling_rate": 16000
 }

special_tokens_map.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}

tokenizer_config.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "\|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}


1	+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "\|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "vitouphy/wav2vec2-xls-r-1b-km", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}