"""Compare SentencePiece models on a sample of a sentence corpus.

For every ``*.model`` file found in ``models_dir`` the script tokenizes the
first ``MAX_SENTENCES`` non-empty lines of ``test_file`` and reports:

* ``avg_tokens``    - mean number of tokens per sentence,
* ``avg_token_len`` - mean number of characters per produced piece,
* ``unk_rate_%``    - share of token ids equal to the model's unknown id.

The results are printed as a table sorted by ``avg_tokens`` and the average
token count per model is shown as a bar chart.
"""
import os
from itertools import islice

import matplotlib.pyplot as plt
import pandas as pd
import sentencepiece as spm

# Path to the test corpus and the directory that holds the trained models.
test_file = "kyrgyz_clean_sentences.txt"
models_dir = "models"

# Maximum number of non-empty sentences used for the evaluation.
MAX_SENTENCES = 10000

# Column order of the results table; passing it explicitly keeps the
# DataFrame well-formed (and sortable) even when no model was evaluated.
RESULT_COLUMNS = ["model", "avg_tokens", "avg_token_len", "unk_rate_%"]

# Load the first MAX_SENTENCES non-empty lines. Reading lazily avoids
# materialising the whole corpus only to discard most of it.
with open(test_file, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    sentences = list(islice((s for s in stripped_lines if s), MAX_SENTENCES))

num_sentences = len(sentences)

# Find every .model file in the models directory (sorted for a stable order).
models = sorted(
    name for name in os.listdir(models_dir) if name.endswith(".model")
)

results = []

for model_file in models:
    model_path = os.path.join(models_dir, model_file)
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)
    unk_id = sp.unk_id()  # constant per model, so look it up once

    total_tokens = 0
    total_unk = 0
    total_chars = 0

    for sentence in sentences:
        ids = sp.encode(sentence)
        pieces = sp.encode(sentence, out_type=str)

        total_tokens += len(ids)
        total_unk += ids.count(unk_id)
        total_chars += sum(len(p) for p in pieces)

    # Guard every ratio: an empty corpus or a model that emits no tokens
    # would otherwise abort the whole comparison with ZeroDivisionError.
    avg_tokens = total_tokens / num_sentences if num_sentences else 0.0
    avg_token_len = total_chars / total_tokens if total_tokens else 0.0
    unk_rate = total_unk / total_tokens if total_tokens else 0.0

    results.append({
        "model": model_file,
        "avg_tokens": round(avg_tokens, 2),
        "avg_token_len": round(avg_token_len, 2),
        "unk_rate_%": round(unk_rate * 100, 2),
    })

df = pd.DataFrame(results, columns=RESULT_COLUMNS)
df = df.sort_values("avg_tokens")
print(df)

# Chart 1: average number of tokens per sentence for each model.
plt.figure(figsize=(12, 6))
plt.bar(df["model"], df["avg_tokens"])
plt.title("📏 Average Number of Tokens per Sentence")
plt.ylabel("Average Token Count")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()