yusenthebot
Integrate advanced OCR, FlashcardGenerator, DifficultyScorer, and AI Quiz from language project
aa3fdef
| # -*- coding: utf-8 -*- | |
| """ | |
| Difficulty Scorer - Multi-language Support | |
| Supports 6 languages with proficiency test databases: | |
| - English (en): CEFR A1-C2 | |
| - Chinese (zh-cn): HSK 1-6 | |
| - German (de): CEFR A1-C2 | |
| - Spanish (es): CEFR A1-C2 | |
| - Japanese (ja): JLPT N5-N1 | |
| - Korean (ko): TOPIK 1-6 | |
| """ | |
| import json | |
| from typing import Dict, Any, List, Optional | |
| from pathlib import Path | |
| class DifficultyScorer: | |
| """Multi-language difficulty scoring system""" | |
| LANGUAGE_TESTS = { | |
| 'en': 'cefr', | |
| 'de': 'cefr', | |
| 'es': 'cefr', | |
| 'fr': 'cefr', | |
| 'it': 'cefr', | |
| 'zh-cn': 'hsk', | |
| 'zh-tw': 'hsk', | |
| 'ja': 'jlpt', | |
| 'ko': 'topik', | |
| 'ru': 'cefr', | |
| } | |
| JLPT_MAPPING = { | |
| 'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5 | |
| } | |
| def __init__(self, data_dir: str = None): | |
| """ | |
| Initialize multi-language difficulty scorer | |
| Args: | |
| data_dir: Path to data directory containing proficiency databases | |
| """ | |
| if data_dir is None: | |
| current_dir = Path(__file__).parent | |
| project_root = current_dir.parent.parent | |
| data_dir = project_root / "data" | |
| self.data_dir = Path(data_dir) | |
| self.databases = self._load_all_databases() | |
| self.word_lookups = self._create_word_lookups() | |
| def _load_all_databases(self) -> Dict[str, Dict]: | |
| """Load all language proficiency databases""" | |
| databases = {} | |
| # Load CEFR (English, German, Spanish, etc.) | |
| cefr_path = self.data_dir / "cefr" / "cefr_words.json" | |
| if cefr_path.exists(): | |
| try: | |
| with open(cefr_path, 'r', encoding='utf-8') as f: | |
| databases['cefr'] = json.load(f) | |
| except Exception as e: | |
| print(f"[DifficultyScorer] Failed to load CEFR: {e}") | |
| # Load HSK (Chinese) | |
| hsk_path = self.data_dir / "hsk" / "hsk_words.json" | |
| if hsk_path.exists(): | |
| try: | |
| with open(hsk_path, 'r', encoding='utf-8') as f: | |
| databases['hsk'] = json.load(f) | |
| except Exception as e: | |
| print(f"[DifficultyScorer] Failed to load HSK: {e}") | |
| # Load JLPT (Japanese) | |
| jlpt_path = self.data_dir / "jlpt" / "jlpt_words.json" | |
| if jlpt_path.exists(): | |
| try: | |
| with open(jlpt_path, 'r', encoding='utf-8') as f: | |
| databases['jlpt'] = json.load(f) | |
| except Exception as e: | |
| print(f"[DifficultyScorer] Failed to load JLPT: {e}") | |
| # Load TOPIK (Korean) | |
| topik_path = self.data_dir / "topik" / "topik_words.json" | |
| if topik_path.exists(): | |
| try: | |
| with open(topik_path, 'r', encoding='utf-8') as f: | |
| databases['topik'] = json.load(f) | |
| except Exception as e: | |
| print(f"[DifficultyScorer] Failed to load TOPIK: {e}") | |
| return databases | |
| def _create_word_lookups(self) -> Dict[str, Dict[str, int]]: | |
| """Create word-to-score lookup tables for all languages""" | |
| lookups = {} | |
| # CEFR lookups | |
| if 'cefr' in self.databases: | |
| cefr = self.databases['cefr'] | |
| for lang_code in ['en', 'de', 'es', 'fr', 'it', 'ru']: | |
| lookups[lang_code] = {} | |
| if 'levels' in cefr: | |
| for level, data in cefr['levels'].items(): | |
| score = data.get('score', 3) | |
| if lang_code in data: | |
| for word in data[lang_code]: | |
| lookups[lang_code][word.lower()] = score | |
| # HSK lookup (Chinese) | |
| if 'hsk' in self.databases: | |
| lookups['zh-cn'] = {} | |
| lookups['zh-tw'] = {} | |
| if 'levels' in self.databases['hsk']: | |
| for level, data in self.databases['hsk']['levels'].items(): | |
| score = data.get('score', 3) | |
| for word in data.get('words', []): | |
| lookups['zh-cn'][word] = score | |
| lookups['zh-tw'][word] = score | |
| # JLPT lookup (Japanese) | |
| if 'jlpt' in self.databases: | |
| lookups['ja'] = {} | |
| if 'levels' in self.databases['jlpt']: | |
| for level, data in self.databases['jlpt']['levels'].items(): | |
| score = data.get('score', 3) | |
| for word in data.get('words', []): | |
| lookups['ja'][word] = score | |
| # TOPIK lookup (Korean) | |
| if 'topik' in self.databases: | |
| lookups['ko'] = {} | |
| if 'levels' in self.databases['topik']: | |
| for level, data in self.databases['topik']['levels'].items(): | |
| score = data.get('score', 3) | |
| for word in data.get('words', []): | |
| lookups['ko'][word] = score | |
| return lookups | |
| def get_proficiency_score(self, word: str, language: str) -> float: | |
| """ | |
| Get proficiency test score for a word | |
| Args: | |
| word: Word or phrase | |
| language: Language code | |
| Returns: | |
| Score 1-6 (1=easiest, 6=hardest) | |
| """ | |
| language = language.lower() | |
| if language not in self.word_lookups: | |
| return self._estimate_by_length(word) | |
| lookup = self.word_lookups[language] | |
| search_word = word if language in ['zh-cn', 'zh-tw', 'ja', 'ko'] else word.lower() | |
| if search_word in lookup: | |
| return float(lookup[search_word]) | |
| return self._estimate_by_length(word) | |
| def _estimate_by_length(self, word: str) -> float: | |
| """Estimate difficulty by word length (fallback)""" | |
| length = len(word) | |
| if length <= 3: | |
| return 2.0 | |
| elif length <= 6: | |
| return 3.5 | |
| elif length <= 10: | |
| return 4.5 | |
| else: | |
| return 5.5 | |
| def get_length_score(self, word: str) -> float: | |
| """Score based on word length""" | |
| length = len(word) | |
| if length == 1: | |
| return 1.0 | |
| elif length <= 3: | |
| return 2.0 | |
| elif length <= 6: | |
| return 3.0 | |
| elif length <= 10: | |
| return 4.0 | |
| elif length <= 15: | |
| return 5.0 | |
| else: | |
| return 6.0 | |
| def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]: | |
| """ | |
| Calculate comprehensive difficulty score | |
| Weights: | |
| - Proficiency level: 60% | |
| - Word length: 40% | |
| """ | |
| proficiency_score = self.get_proficiency_score(word, language) | |
| length_score = self.get_length_score(word) | |
| overall_score = proficiency_score * 0.6 + length_score * 0.4 | |
| if overall_score <= 2.5: | |
| level = "beginner" | |
| elif overall_score <= 4.5: | |
| level = "intermediate" | |
| else: | |
| level = "advanced" | |
| test_name = self.LANGUAGE_TESTS.get(language.lower(), 'unknown') | |
| return { | |
| "overall_score": round(overall_score, 2), | |
| "level": level, | |
| "factors": { | |
| "proficiency_score": round(proficiency_score, 2), | |
| "length": len(word), | |
| "length_score": round(length_score, 2), | |
| "test_system": test_name.upper() | |
| } | |
| } | |
| def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]: | |
| """Add difficulty score to flashcard""" | |
| word = card.get('front', '') | |
| language = card.get('language', 'en') | |
| difficulty = self.calculate_difficulty(word, language) | |
| card_with_difficulty = card.copy() | |
| card_with_difficulty['difficulty'] = difficulty | |
| return card_with_difficulty | |
| def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]: | |
| """Score all flashcards""" | |
| return [self.score_flashcard(card) for card in flashcards] | |
| def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]: | |
| """Generate difficulty statistics""" | |
| if not flashcards: | |
| return {} | |
| level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0} | |
| scores = [] | |
| by_language = {} | |
| for card in flashcards: | |
| if 'difficulty' in card: | |
| level = card['difficulty']['level'] | |
| level_counts[level] += 1 | |
| scores.append(card['difficulty']['overall_score']) | |
| lang = card.get('language', 'unknown') | |
| if lang not in by_language: | |
| by_language[lang] = {"count": 0, "scores": []} | |
| by_language[lang]["count"] += 1 | |
| by_language[lang]["scores"].append(card['difficulty']['overall_score']) | |
| for lang in by_language: | |
| lang_scores = by_language[lang]["scores"] | |
| by_language[lang]["avg_score"] = round(sum(lang_scores) / len(lang_scores), 2) | |
| del by_language[lang]["scores"] | |
| return { | |
| "total_cards": len(flashcards), | |
| "by_level": level_counts, | |
| "by_language": by_language, | |
| "average_score": round(sum(scores) / len(scores), 2) if scores else 0, | |
| "min_score": round(min(scores), 2) if scores else 0, | |
| "max_score": round(max(scores), 2) if scores else 0 | |
| } | |
# Module-wide scorer shared by all callers; built on first request.
_difficulty_scorer = None


def get_difficulty_scorer() -> DifficultyScorer:
    """Return the shared DifficultyScorer, constructing it on first use."""
    global _difficulty_scorer
    scorer = _difficulty_scorer
    if scorer is None:
        # First call: build the scorer with its default data directory
        # and remember it for subsequent calls.
        scorer = DifficultyScorer()
        _difficulty_scorer = scorer
    return scorer