# -*- coding: utf-8 -*- """ Difficulty Scorer - Multi-language Support Supports 6 languages with proficiency test databases: - English (en): CEFR A1-C2 - Chinese (zh-cn): HSK 1-6 - German (de): CEFR A1-C2 - Spanish (es): CEFR A1-C2 - Japanese (ja): JLPT N5-N1 - Korean (ko): TOPIK 1-6 """ import json from typing import Dict, Any, List, Optional from pathlib import Path class DifficultyScorer: """Multi-language difficulty scoring system""" LANGUAGE_TESTS = { 'en': 'cefr', 'de': 'cefr', 'es': 'cefr', 'fr': 'cefr', 'it': 'cefr', 'zh-cn': 'hsk', 'zh-tw': 'hsk', 'ja': 'jlpt', 'ko': 'topik', 'ru': 'cefr', } JLPT_MAPPING = { 'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5 } def __init__(self, data_dir: str = None): """ Initialize multi-language difficulty scorer Args: data_dir: Path to data directory containing proficiency databases """ if data_dir is None: current_dir = Path(__file__).parent project_root = current_dir.parent.parent data_dir = project_root / "data" self.data_dir = Path(data_dir) self.databases = self._load_all_databases() self.word_lookups = self._create_word_lookups() def _load_all_databases(self) -> Dict[str, Dict]: """Load all language proficiency databases""" databases = {} # Load CEFR (English, German, Spanish, etc.) cefr_path = self.data_dir / "cefr" / "cefr_words.json" if cefr_path.exists(): try: with open(cefr_path, 'r', encoding='utf-8') as f: databases['cefr'] = json.load(f) except Exception as e: print(f"[DifficultyScorer] Failed to load CEFR: {e}") # Load HSK (Chinese) hsk_path = self.data_dir / "hsk" / "hsk_words.json" if hsk_path.exists(): try: with open(hsk_path, 'r', encoding='utf-8') as f: databases['hsk'] = json.load(f) except Exception as e: print(f"[DifficultyScorer] Failed to load HSK: {e}") # Load JLPT (Japanese) jlpt_path = self.data_dir / "jlpt" / "jlpt_words.json" if jlpt_path.exists(): try: with open(jlpt_path, 'r', encoding='utf-8') as f: databases['jlpt'] = json.load(f) except Exception as e: print(f"[DifficultyScorer] Failed to load JLPT: {e}") # Load TOPIK (Korean) topik_path = self.data_dir / "topik" / "topik_words.json" if topik_path.exists(): try: with open(topik_path, 'r', encoding='utf-8') as f: databases['topik'] = json.load(f) except Exception as e: print(f"[DifficultyScorer] Failed to load TOPIK: {e}") return databases def _create_word_lookups(self) -> Dict[str, Dict[str, int]]: """Create word-to-score lookup tables for all languages""" lookups = {} # CEFR lookups if 'cefr' in self.databases: cefr = self.databases['cefr'] for lang_code in ['en', 'de', 'es', 'fr', 'it', 'ru']: lookups[lang_code] = {} if 'levels' in cefr: for level, data in cefr['levels'].items(): score = data.get('score', 3) if lang_code in data: for word in data[lang_code]: lookups[lang_code][word.lower()] = score # HSK lookup (Chinese) if 'hsk' in self.databases: lookups['zh-cn'] = {} lookups['zh-tw'] = {} if 'levels' in self.databases['hsk']: for level, data in self.databases['hsk']['levels'].items(): score = data.get('score', 3) for word in data.get('words', []): lookups['zh-cn'][word] = score lookups['zh-tw'][word] = score # JLPT lookup (Japanese) if 'jlpt' in self.databases: lookups['ja'] = {} if 'levels' in self.databases['jlpt']: for level, data in self.databases['jlpt']['levels'].items(): score = data.get('score', 3) for word in data.get('words', []): lookups['ja'][word] = score # TOPIK lookup (Korean) if 'topik' in self.databases: lookups['ko'] = {} if 'levels' in self.databases['topik']: for level, data in self.databases['topik']['levels'].items(): score = data.get('score', 3) for word in data.get('words', []): lookups['ko'][word] = score return lookups def get_proficiency_score(self, word: str, language: str) -> float: """ Get proficiency test score for a word Args: word: Word or phrase language: Language code Returns: Score 1-6 (1=easiest, 6=hardest) """ language = language.lower() if language not in self.word_lookups: return self._estimate_by_length(word) lookup = self.word_lookups[language] search_word = word if language in ['zh-cn', 'zh-tw', 'ja', 'ko'] else word.lower() if search_word in lookup: return float(lookup[search_word]) return self._estimate_by_length(word) def _estimate_by_length(self, word: str) -> float: """Estimate difficulty by word length (fallback)""" length = len(word) if length <= 3: return 2.0 elif length <= 6: return 3.5 elif length <= 10: return 4.5 else: return 5.5 def get_length_score(self, word: str) -> float: """Score based on word length""" length = len(word) if length == 1: return 1.0 elif length <= 3: return 2.0 elif length <= 6: return 3.0 elif length <= 10: return 4.0 elif length <= 15: return 5.0 else: return 6.0 def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]: """ Calculate comprehensive difficulty score Weights: - Proficiency level: 60% - Word length: 40% """ proficiency_score = self.get_proficiency_score(word, language) length_score = self.get_length_score(word) overall_score = proficiency_score * 0.6 + length_score * 0.4 if overall_score <= 2.5: level = "beginner" elif overall_score <= 4.5: level = "intermediate" else: level = "advanced" test_name = self.LANGUAGE_TESTS.get(language.lower(), 'unknown') return { "overall_score": round(overall_score, 2), "level": level, "factors": { "proficiency_score": round(proficiency_score, 2), "length": len(word), "length_score": round(length_score, 2), "test_system": test_name.upper() } } def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]: """Add difficulty score to flashcard""" word = card.get('front', '') language = card.get('language', 'en') difficulty = self.calculate_difficulty(word, language) card_with_difficulty = card.copy() card_with_difficulty['difficulty'] = difficulty return card_with_difficulty def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Score all flashcards""" return [self.score_flashcard(card) for card in flashcards] def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]: """Generate difficulty statistics""" if not flashcards: return {} level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0} scores = [] by_language = {} for card in flashcards: if 'difficulty' in card: level = card['difficulty']['level'] level_counts[level] += 1 scores.append(card['difficulty']['overall_score']) lang = card.get('language', 'unknown') if lang not in by_language: by_language[lang] = {"count": 0, "scores": []} by_language[lang]["count"] += 1 by_language[lang]["scores"].append(card['difficulty']['overall_score']) for lang in by_language: lang_scores = by_language[lang]["scores"] by_language[lang]["avg_score"] = round(sum(lang_scores) / len(lang_scores), 2) del by_language[lang]["scores"] return { "total_cards": len(flashcards), "by_level": level_counts, "by_language": by_language, "average_score": round(sum(scores) / len(scores), 2) if scores else 0, "min_score": round(min(scores), 2) if scores else 0, "max_score": round(max(scores), 2) if scores else 0 } # Global instance (lazy initialization) _difficulty_scorer = None def get_difficulty_scorer() -> DifficultyScorer: """Get or create the global DifficultyScorer instance""" global _difficulty_scorer if _difficulty_scorer is None: _difficulty_scorer = DifficultyScorer() return _difficulty_scorer