Update app.py

app.py CHANGED
```diff
@@ -41,21 +41,15 @@ def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
         df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        my_bar = st.progress(0)
         counter = Counter(tokenizer.all_special_tokens)
-
-
-
+        for i, text in enumerate(df.text):
+            counter.update(tok for tok in tokenizer.tokenize(text))
+            my_bar.progress((i + 1) / len(df), text=f"{(i + 1) / len(df) * 100:.0f}%")
+        return set(counter)
     else:
         raise FileNotFoundError

-def get_pruned_vocabulary(language: str):
-    filtered_tokens_file = f"data.nosync/{language}_filtered_tokens.txt"
-    if os.path.exists(filtered_tokens_file):
-        with open(filtered_tokens_file, "r") as f:
-            return set(f.read().splitlines())
-    else:
-        raise FileNotFoundError(f"No filtered tokens file found for language {language}. Please run `estimate_pruned_vocabulary` first.")
-
 @st.cache_resource
 def load_model_and_tokenizer(model_name: str):
     model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
```
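The removed `get_pruned_vocabulary` read a precomputed `{language}_filtered_tokens.txt`; the used-token set is now estimated live by streaming the Leipzig sentence file through the tokenizer, with a progress bar. A minimal standalone sketch of the same counting idea outside Streamlit (the model name and corpus path are illustrative, not taken from app.py):

```python
import csv
from collections import Counter

import pandas as pd
from transformers import AutoTokenizer

# Illustrative inputs: any fast tokenizer and a Leipzig-style "id<TAB>sentence" file.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
df = pd.read_csv("french_news_2020_1M-sentences.txt", sep="\t", header=None,
                 quoting=csv.QUOTE_NONE, names=["id", "text"])

counter = Counter(tokenizer.all_special_tokens)  # special tokens must always survive pruning
for text in df.text:
    counter.update(tokenizer.tokenize(text))

used = set(counter)
print(f"{len(used)} of {tokenizer.vocab_size} tokens used ({len(used) / tokenizer.vocab_size:.1%})")
```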
```diff
@@ -78,14 +72,12 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
     return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']

 def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
-    print(f"'{hf_token}'")
-    _ = whoami(token=hf_token)
     api = HfApi(endpoint="https://huggingface.co", token=hf_token)
     repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
     api.create_repo(repo_id=repo_id, repo_type="model", private=private)
     api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")

-def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
+def prune_model(model_name: str, language: str, hf_username: str, hf_token: str, keep_english: bool):
     st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")

     # Load the model and its tokenizer
```
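The deleted `print(f"'{hf_token}'")` was a debug statement that leaked the access token into the Space's logs, and the `whoami` call moves out of `push_to_hub` into `main()` (final hunk), so invalid credentials fail fast before any pruning work starts. A sketch of that validation pattern with `huggingface_hub` (the helper name is hypothetical, not app.py's):

```python
from huggingface_hub import whoami
from huggingface_hub.utils import HfHubHTTPError

def token_is_valid(hf_token: str) -> bool:
    """Return True if the token resolves to a Hugging Face account."""
    try:
        whoami(token=hf_token)  # raises on an invalid or expired token
        return True
    except HfHubHTTPError:
        return False
```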
```diff
@@ -103,13 +95,23 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
         f"with **{encoder_params/1e6:.1f}M** parameters only."
     )

-
-
-
-
-
-
-
+    with st.status(f"Computing the {language.capitalize()} vocabulary...", expanded=True) as status:
+        filtered_tokens = estimate_pruned_vocabulary(tokenizer, language)
+        num_filtered_tokens = len(filtered_tokens)
+        st.write(
+            f"{language.capitalize()} only uses **{num_filtered_tokens/tokenizer.vocab_size*100:.0f}%** "+
+            f"of the model vocabulary (i.e., {num_filtered_tokens} out of the original {tokenizer.vocab_size} tokens)."
+        )
+        status.update(state="complete", expanded=True)
+
+    if keep_english:
+        with st.status(f"Computing the English vocabulary...", expanded=True) as status:
+            english_tokens = estimate_pruned_vocabulary(tokenizer, "english")
+            filtered_tokens.update(english_tokens)
+            st.write(f"Considering the **English** tokens adds **{len(filtered_tokens) - num_filtered_tokens}** tokens to the vocabulary.")
+            num_filtered_tokens = len(filtered_tokens)
+            status.update(state="complete", expanded=True)
+
     with st.status("Pruning the model...", expanded=True) as status:
         st.write("- *Updating the tokenizer*")
         outdir = f"{language}-{model_name.split('/')[-1]}"
```
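For a sense of scale (illustrative numbers, not app output): with XLM-R's 250,002-token vocabulary, a language whose 1M-sentence corpus yields 50,000 distinct tokens would be reported as using 20% of the vocabulary, and the `keep_english` set union can only add English tokens not already in that set, since duplicates are absorbed.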
```diff
@@ -165,7 +167,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)

     with st.status("Testing the conversion...", expanded=True) as status:
         st.write(f"- *Checking the pruned tokenizer*")
-        assert len(new_tokenizer) ==
+        assert len(new_tokenizer) == num_filtered_tokens, f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({num_filtered_tokens})"
         assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"

         st.write(f"- *Checking the pruned model*")
```
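These asserts pin down the invariant behind the whole app: ids `0..len(new_tokenizer)-1` must enumerate exactly the filtered token set. The pruning step itself sits outside this diff; as a rough sketch of the kind of embedding surgery such a check guards (a generic technique, not app.py's actual code):

```python
import torch

def prune_embedding(embedding: torch.nn.Embedding,
                    old_vocab: dict[str, int],
                    kept_tokens: set[str]) -> tuple[torch.nn.Embedding, dict[str, int]]:
    """Keep only the rows of `embedding` whose tokens survive pruning."""
    kept = sorted(kept_tokens, key=lambda tok: old_vocab[tok])  # preserve original id order
    new_vocab = {tok: i for i, tok in enumerate(kept)}
    rows = torch.tensor([old_vocab[tok] for tok in kept])
    pruned = torch.nn.Embedding(len(kept), embedding.embedding_dim)
    pruned.weight.data.copy_(embedding.weight.data[rows])
    return pruned, new_vocab
```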
```diff
@@ -247,7 +249,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
         f.write(readme_content)

     with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
-
+        push_to_hub(hf_username, hf_token, outdir)
         shutil.rmtree(outdir)
         status.update(state="complete", expanded=False)

```
```diff
@@ -279,22 +281,31 @@ def main():
     """)

     model_name = st.selectbox("Choose a multilingual model", MODELS)
-
-
-        options=list(LANGUAGES.keys()),
-        format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
-    )
-    col1, col2 = st.columns(2)
+
+    col1, col2 = st.columns([3, 1])
     with col1:
-
+        language = st.selectbox(
+            "Pick your target language",
+            options=list(LANGUAGES.keys()),
+            format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
+        )
     with col2:
+        st.write("")
+        st.write("")
+        keep_english = st.checkbox("Keep English", value=False, help="Keep English tokens in addition to the selected language")
+
+    col3, col4 = st.columns(2)
+    with col3:
+        hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
+    with col4:
         hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

-    if st.button("Prune
+    if st.button("Prune model"):
         if not hf_username or not hf_token:
-            st.error("Your HF username and access token
+            st.error("Your HF username and access token are required to save the pruned model on your account.")
         else:
-
+            _ = whoami(token=hf_token)
+            prune_model(model_name, language, hf_username, hf_token, keep_english)

     st.markdown(
         """
```