agentic-language-partner / src /app /difficulty_scorer.py
yusenthebot
Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
aa3fdef
# -*- coding: utf-8 -*-
"""
Difficulty Scorer - Multi-language Support
Supports 6 languages with proficiency test databases:
- English (en): CEFR A1-C2
- Chinese (zh-cn): HSK 1-6
- German (de): CEFR A1-C2
- Spanish (es): CEFR A1-C2
- Japanese (ja): JLPT N5-N1
- Korean (ko): TOPIK 1-6
"""
import json
from typing import Dict, Any, List, Optional
from pathlib import Path
class DifficultyScorer:
    """Score vocabulary difficulty from proficiency word lists and word length.

    On construction the scorer loads whichever JSON word lists exist under
    ``data_dir`` and flattens them into per-language ``word -> score`` tables.
    Words that are not found in a table fall back to a length-based estimate.
    """

    # Language code -> proficiency test system reported for that language.
    LANGUAGE_TESTS = {
        'en': 'cefr',
        'de': 'cefr',
        'es': 'cefr',
        'fr': 'cefr',
        'it': 'cefr',
        'zh-cn': 'hsk',
        'zh-tw': 'hsk',
        'ja': 'jlpt',
        'ko': 'topik',
        'ru': 'cefr',
    }

    # JLPT level label -> numeric rank. Not referenced inside this class;
    # kept because it is part of the public class namespace.
    JLPT_MAPPING = {
        'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5
    }

    # Each database lives at <data_dir>/<name>/<name>_words.json.
    _DATABASE_NAMES = ('cefr', 'hsk', 'jlpt', 'topik')

    # The CEFR file keeps one word list per language code inside each level.
    _CEFR_LANGUAGES = ('en', 'de', 'es', 'fr', 'it', 'ru')

    # The other files keep a single 'words' list per level, shared by the
    # language codes listed here.
    _WORD_LIST_LANGUAGES = (
        ('hsk', ('zh-cn', 'zh-tw')),
        ('jlpt', ('ja',)),
        ('topik', ('ko',)),
    )

    # Languages whose words are looked up verbatim (no lower-casing).
    _CASE_SENSITIVE_LANGUAGES = frozenset({'zh-cn', 'zh-tw', 'ja', 'ko'})

    # (maximum length, score) bands, checked in order; the trailing default
    # applies to anything longer than the last band.
    _FALLBACK_BANDS = ((3, 2.0), (6, 3.5), (10, 4.5))
    _FALLBACK_DEFAULT = 5.5
    _LENGTH_BANDS = ((1, 1.0), (3, 2.0), (6, 3.0), (10, 4.0), (15, 5.0))
    _LENGTH_DEFAULT = 6.0

    def __init__(self, data_dir: Optional[str] = None):
        """
        Initialize multi-language difficulty scorer

        Args:
            data_dir: Path to data directory containing proficiency databases.
                When omitted, ``data`` two levels above this file's directory
                is used.
        """
        if data_dir is None:
            current_dir = Path(__file__).parent
            project_root = current_dir.parent.parent
            data_dir = project_root / "data"
        self.data_dir = Path(data_dir)
        self.databases = self._load_all_databases()
        self.word_lookups = self._create_word_lookups()

    def _load_all_databases(self) -> Dict[str, Dict]:
        """Load every proficiency database that exists under ``data_dir``.

        Returns:
            Mapping of database name ('cefr', 'hsk', 'jlpt', 'topik') to the
            parsed JSON content. Missing files are skipped silently; files
            that fail to load are reported on stdout and skipped.
        """
        databases = {}
        for name in self._DATABASE_NAMES:
            path = self.data_dir / name / f"{name}_words.json"
            if not path.exists():
                continue
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    databases[name] = json.load(f)
            except Exception as e:
                # Best-effort on purpose: one unreadable file must not stop
                # the remaining databases from loading.
                print(f"[DifficultyScorer] Failed to load {name.upper()}: {e}")
        return databases

    @staticmethod
    def _build_lookup(database: Any, words_key: str, *,
                      lowercase: bool = False) -> Dict[str, int]:
        """Flatten one database into a ``word -> score`` table.

        Args:
            database: Parsed JSON content of one proficiency database.
            words_key: Key inside each level entry that holds the word list.
            lowercase: Store words lower-cased when True.

        Returns:
            The lookup table. Malformed parts of the database (non-dict
            levels, non-string words) are skipped instead of raising, so a
            damaged data file cannot break construction of the scorer.
        """
        lookup = {}
        levels = database.get('levels') if isinstance(database, dict) else None
        if not isinstance(levels, dict):
            return lookup
        for data in levels.values():
            if not isinstance(data, dict):
                continue
            score = data.get('score', 3)
            for word in data.get(words_key) or []:
                if not isinstance(word, str):
                    continue
                lookup[word.lower() if lowercase else word] = score
        return lookup

    def _create_word_lookups(self) -> Dict[str, Dict[str, int]]:
        """Create word-to-score lookup tables for all loaded databases.

        Returns:
            Mapping of language code to its ``word -> score`` table. A
            language only gets an entry when its database was loaded.
        """
        lookups = {}

        if 'cefr' in self.databases:
            cefr = self.databases['cefr']
            for lang_code in self._CEFR_LANGUAGES:
                lookups[lang_code] = self._build_lookup(
                    cefr, lang_code, lowercase=True
                )

        for name, lang_codes in self._WORD_LIST_LANGUAGES:
            if name not in self.databases:
                continue
            table = self._build_lookup(self.databases[name], 'words')
            for lang_code in lang_codes:
                # Independent copy per language so tables never alias.
                lookups[lang_code] = dict(table)

        return lookups

    def get_proficiency_score(self, word: str, language: str) -> float:
        """
        Get proficiency test score for a word

        Args:
            word: Word or phrase; surrounding whitespace is ignored
            language: Language code (case-insensitive)

        Returns:
            Score 1-6 (1=easiest, 6=hardest). Falls back to a length-based
            estimate when the language has no table or the word is not in it.
        """
        language = language.strip().lower()
        word = word.strip()
        lookup = self.word_lookups.get(language)
        if lookup is None:
            return self._estimate_by_length(word)
        if language in self._CASE_SENSITIVE_LANGUAGES:
            search_word = word
        else:
            search_word = word.lower()
        if search_word in lookup:
            return float(lookup[search_word])
        return self._estimate_by_length(word)

    def _estimate_by_length(self, word: str) -> float:
        """Estimate difficulty by word length (fallback for unknown words)."""
        length = len(word)
        for limit, score in self._FALLBACK_BANDS:
            if length <= limit:
                return score
        return self._FALLBACK_DEFAULT

    def get_length_score(self, word: str) -> float:
        """Score based on word length (1.0 for up to one character, 6.0 above 15)."""
        length = len(word)
        # The first band is "<= 1" so an empty string cannot score higher
        # than a single character.
        for limit, score in self._LENGTH_BANDS:
            if length <= limit:
                return score
        return self._LENGTH_DEFAULT

    def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]:
        """
        Calculate comprehensive difficulty score

        Weights:
        - Proficiency level: 60%
        - Word length: 40%

        Args:
            word: Word or phrase; surrounding whitespace is ignored
            language: Language code (case-insensitive)

        Returns:
            Dict with ``overall_score`` (rounded to 2 decimals), ``level``
            ("beginner" up to 2.5, "intermediate" up to 4.5, otherwise
            "advanced") and a ``factors`` breakdown.
        """
        # Strip once so padding affects neither the lookup nor the length.
        word = word.strip()
        proficiency_score = self.get_proficiency_score(word, language)
        length_score = self.get_length_score(word)
        overall_score = proficiency_score * 0.6 + length_score * 0.4

        if overall_score <= 2.5:
            level = "beginner"
        elif overall_score <= 4.5:
            level = "intermediate"
        else:
            level = "advanced"

        test_name = self.LANGUAGE_TESTS.get(language.strip().lower(), 'unknown')
        return {
            "overall_score": round(overall_score, 2),
            "level": level,
            "factors": {
                "proficiency_score": round(proficiency_score, 2),
                "length": len(word),
                "length_score": round(length_score, 2),
                "test_system": test_name.upper()
            }
        }

    def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of ``card`` with a ``difficulty`` entry added.

        The word is read from ``card['front']`` and the language from
        ``card['language']``; a missing or empty value falls back to ``''``
        and ``'en'`` respectively.
        """
        # "or" also covers keys that are present but None/empty, which the
        # .get() default alone would let through.
        word = card.get('front') or ''
        language = card.get('language') or 'en'
        card_with_difficulty = card.copy()
        card_with_difficulty['difficulty'] = self.calculate_difficulty(word, language)
        return card_with_difficulty

    def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Score all flashcards, returning new card dicts in the same order."""
        return [self.score_flashcard(card) for card in flashcards]

    def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate difficulty statistics.

        Args:
            flashcards: Cards, normally already processed by
                ``score_flashcard``. Cards without a ``difficulty`` entry are
                counted in ``total_cards`` only.

        Returns:
            Empty dict for empty input; otherwise totals, per-level counts,
            per-language count/average, and overall average/min/max scores.
        """
        if not flashcards:
            return {}

        level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
        scores = []
        scores_by_language = {}

        for card in flashcards:
            if 'difficulty' not in card:
                continue
            difficulty = card['difficulty']
            # Tolerate level labels other than the three built-in ones
            # rather than failing the whole report with a KeyError.
            level = difficulty['level']
            level_counts[level] = level_counts.get(level, 0) + 1
            score = difficulty['overall_score']
            scores.append(score)
            lang = card.get('language', 'unknown')
            scores_by_language.setdefault(lang, []).append(score)

        by_language = {
            lang: {
                "count": len(lang_scores),
                "avg_score": round(sum(lang_scores) / len(lang_scores), 2),
            }
            for lang, lang_scores in scores_by_language.items()
        }

        return {
            "total_cards": len(flashcards),
            "by_level": level_counts,
            "by_language": by_language,
            "average_score": round(sum(scores) / len(scores), 2) if scores else 0,
            "min_score": round(min(scores), 2) if scores else 0,
            "max_score": round(max(scores), 2) if scores else 0
        }
# Module-wide shared scorer; stays None until the first request for it.
_difficulty_scorer = None


def get_difficulty_scorer() -> DifficultyScorer:
    """Return the shared DifficultyScorer, constructing it on first use."""
    global _difficulty_scorer
    if _difficulty_scorer is not None:
        return _difficulty_scorer
    _difficulty_scorer = DifficultyScorer()
    return _difficulty_scorer