Update app.py

app.py CHANGED
```diff
@@ -41,21 +41,15 @@ def estimate_pruned_vocabulary(tokenizer: PreTrainedTokenizerFast, language: str
     sentences_file = f'data.nosync/{language}_news_2020_1M-sentences.txt'
     if os.path.exists(sentences_file):
         df = pd.read_csv(sentences_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, names=['id', 'text'])
+        my_bar = st.progress(0)
         counter = Counter(tokenizer.all_special_tokens)
-
-
-
+        for i, text in enumerate(df.text):
+            counter.update(tok for tok in tokenizer.tokenize(text))
+            my_bar.progress((i + 1) / len(df), text=f"{(i + 1) / len(df) * 100:.0f}%")
+        return set(counter)
     else:
         raise FileNotFoundError

-def get_pruned_vocabulary(language: str):
-    filtered_tokens_file = f"data.nosync/{language}_filtered_tokens.txt"
-    if os.path.exists(filtered_tokens_file):
-        with open(filtered_tokens_file, "r") as f:
-            return set(f.read().splitlines())
-    else:
-        raise FileNotFoundError(f"No filtered tokens file found for language {language}. Please run `estimate_pruned_vocabulary` first.")
-
 @st.cache_resource
 def load_model_and_tokenizer(model_name: str):
     model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
```
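The removed `get_pruned_vocabulary` read a precomputed `{language}_filtered_tokens.txt`; the used-token set is now estimated live by streaming the Leipzig sentence file through the tokenizer, with a progress bar. A minimal standalone sketch of the same counting idea outside Streamlit (the model name and corpus path are illustrative, not taken from app.py):

```python
import csv
from collections import Counter

import pandas as pd
from transformers import AutoTokenizer

# Illustrative inputs: any fast tokenizer and a Leipzig-style "id<TAB>sentence" file.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
df = pd.read_csv("french_news_2020_1M-sentences.txt", sep="\t", header=None,
                 quoting=csv.QUOTE_NONE, names=["id", "text"])

counter = Counter(tokenizer.all_special_tokens)  # special tokens must always survive pruning
for text in df.text:
    counter.update(tokenizer.tokenize(text))

used = set(counter)
print(f"{len(used)} of {tokenizer.vocab_size} tokens used ({len(used) / tokenizer.vocab_size:.1%})")
```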
```diff
@@ -78,14 +72,12 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
     return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']

 def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
-    print(f"'{hf_token}'")
-    _ = whoami(token=hf_token)
     api = HfApi(endpoint="https://huggingface.co", token=hf_token)
     repo_id = f"{hf_username}/{model_dir.split('/')[-1]}"
     api.create_repo(repo_id=repo_id, repo_type="model", private=private)
     api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")

-def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
+def prune_model(model_name: str, language: str, hf_username: str, hf_token: str, keep_english: bool):
     st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")

     # Load the model and its tokenizer
```
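The deleted `print(f"'{hf_token}'")` was a debug statement that leaked the access token into the Space's logs, and the `whoami` call moves out of `push_to_hub` into `main()` (final hunk), so invalid credentials fail fast before any pruning work starts. A sketch of that validation pattern with `huggingface_hub` (the helper name is hypothetical, not app.py's):

```python
from huggingface_hub import whoami
from huggingface_hub.utils import HfHubHTTPError

def token_is_valid(hf_token: str) -> bool:
    """Return True if the token resolves to a Hugging Face account."""
    try:
        whoami(token=hf_token)  # raises on an invalid or expired token
        return True
    except HfHubHTTPError:
        return False
```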
```diff
@@ -103,13 +95,23 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
         f"with **{encoder_params/1e6:.1f}M** parameters only."
     )

-
-
-
-
-
-
-
+    with st.status(f"Computing the {language.capitalize()} vocabulary...", expanded=True) as status:
+        filtered_tokens = estimate_pruned_vocabulary(tokenizer, language)
+        num_filtered_tokens = len(filtered_tokens)
+        st.write(
+            f"{language.capitalize()} only uses **{num_filtered_tokens/tokenizer.vocab_size*100:.0f}%** "+
+            f"of the model vocabulary (i.e., {num_filtered_tokens} out of the original {tokenizer.vocab_size} tokens)."
+        )
+        status.update(state="complete", expanded=True)
+
+    if keep_english:
+        with st.status(f"Computing the English vocabulary...", expanded=True) as status:
+            english_tokens = estimate_pruned_vocabulary(tokenizer, "english")
+            filtered_tokens.update(english_tokens)
+            st.write(f"Considering the **English** tokens adds **{len(filtered_tokens) - num_filtered_tokens}** tokens to the vocabulary.")
+            num_filtered_tokens = len(filtered_tokens)
+            status.update(state="complete", expanded=True)
+
     with st.status("Pruning the model...", expanded=True) as status:
         st.write("- *Updating the tokenizer*")
         outdir = f"{language}-{model_name.split('/')[-1]}"
```
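For a sense of scale (illustrative numbers, not app output): with XLM-R's 250,002-token vocabulary, a language whose 1M-sentence corpus yields 50,000 distinct tokens would be reported as using 20% of the vocabulary, and the `keep_english` set union can only add English tokens not already in that set, since duplicates are absorbed.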
```diff
@@ -165,7 +167,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)

     with st.status("Testing the conversion...", expanded=True) as status:
         st.write(f"- *Checking the pruned tokenizer*")
-        assert len(new_tokenizer) ==
+        assert len(new_tokenizer) == num_filtered_tokens, f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({num_filtered_tokens})"
         assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match the set of filtered tokens"

         st.write(f"- *Checking the pruned model*")
```
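These asserts pin down the invariant behind the whole app: ids `0..len(new_tokenizer)-1` must enumerate exactly the filtered token set. The pruning step itself sits outside this diff; as a rough sketch of the kind of embedding surgery such a check guards (a generic technique, not app.py's actual code):

```python
import torch

def prune_embedding(embedding: torch.nn.Embedding,
                    old_vocab: dict[str, int],
                    kept_tokens: set[str]) -> tuple[torch.nn.Embedding, dict[str, int]]:
    """Keep only the rows of `embedding` whose tokens survive pruning."""
    kept = sorted(kept_tokens, key=lambda tok: old_vocab[tok])  # preserve original id order
    new_vocab = {tok: i for i, tok in enumerate(kept)}
    rows = torch.tensor([old_vocab[tok] for tok in kept])
    pruned = torch.nn.Embedding(len(kept), embedding.embedding_dim)
    pruned.weight.data.copy_(embedding.weight.data[rows])
    return pruned, new_vocab
```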
```diff
@@ -247,7 +249,7 @@ def prune_model(model_name: str, language: str, hf_username: str, hf_token: str)
         f.write(readme_content)

     with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
-
+        push_to_hub(hf_username, hf_token, outdir)
         shutil.rmtree(outdir)
         status.update(state="complete", expanded=False)

```
```diff
@@ -279,22 +281,31 @@ def main():
     """)

     model_name = st.selectbox("Choose a multilingual model", MODELS)
-
-
-        options=list(LANGUAGES.keys()),
-        format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
-    )
-    col1, col2 = st.columns(2)
+
+    col1, col2 = st.columns([3, 1])
     with col1:
-
+        language = st.selectbox(
+            "Pick your target language",
+            options=list(LANGUAGES.keys()),
+            format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
+        )
     with col2:
+        st.write("")
+        st.write("")
+        keep_english = st.checkbox("Keep English", value=False, help="Keep English tokens in addition to the selected language")
+
+    col3, col4 = st.columns(2)
+    with col3:
+        hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
+    with col4:
         hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

-    if st.button("Prune
+    if st.button("Prune model"):
         if not hf_username or not hf_token:
-            st.error("Your HF username and access token
+            st.error("Your HF username and access token are required to save the pruned model on your account.")
         else:
-
+            _ = whoami(token=hf_token)
+            prune_model(model_name, language, hf_username, hf_token, keep_english)

     st.markdown(
         """
```