# -*- coding: utf-8 -*-
"""
Flashcard Generator - Extracts vocabulary with context from OCR results
Supports multi-language extraction and context sentence generation
"""

import json
import re
from typing import List, Dict, Any

from deep_translator import GoogleTranslator


class FlashcardGenerator:
    """Generate flashcards from OCR results with multi-language support"""

    def __init__(self):
        self.supported_languages = {
            'zh-cn': 'Chinese (Simplified)',
            'zh-tw': 'Chinese (Traditional)',
            'ja': 'Japanese',
            'ko': 'Korean',
            'en': 'English',
            'fr': 'French',
            'de': 'German',
            'es': 'Spanish',
            'ru': 'Russian',
        }
        # Map internal language codes to the codes deep_translator expects
        self.lang_map = {
            'zh-cn': 'zh-CN',
            'zh-tw': 'zh-TW',
            'ja': 'ja',
            'ko': 'ko',
            'ru': 'ru',
        }
        self.translator_cache = {}

        # Stop words for filtering common words
        self.stop_words = {
            'zh-cn': {
                '的', '了', '是', '在', '我', '有', '和', '就', '不', '人',
                '都', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
                '会', '着', '没有', '看', '好', '自己', '这', '他', '她', '它',
                '们', '个', '吗', '呢', '吧', '啊', '哦', '嗯', '呀'
            },
            'en': {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                'for', 'of', 'with', 'by', 'from', 'is', 'am', 'are', 'was',
                'were', 'be', 'been', 'being', 'this', 'that', 'these',
                'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'my',
                'your', 'his', 'her', 'its'
            },
            'de': {
                'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine',
                'einer', 'und', 'oder', 'aber', 'in', 'an', 'auf', 'für',
                'mit', 'von', 'zu', 'ist', 'sind', 'war', 'waren', 'ich',
                'du', 'er', 'sie', 'es'
            },
            'es': {
                'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas', 'y',
                'o', 'pero', 'en', 'a', 'de', 'con', 'por', 'para', 'es',
                'son', 'era', 'yo', 'tú', 'él', 'ella', 'nosotros',
                'vosotros', 'ellos', 'ellas'
            },
            'ja': {
                'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し',
                'れ', 'さ', 'ある', 'いる', 'も', 'する', 'から', 'な', 'こ', 'そ'
            },
            'ko': {
                '은', '는', '이', '가', '을', '를', '의', '에', '에서', '로',
                '와', '과', '도', '만', '까지', '부터', '하다', '되다', '있다', '없다'
            },
            'ru': {
                'и', 'в', 'на', 'с', 'к', 'по', 'за', 'из', 'у', 'о', 'а',
                'но', 'что', 'это', 'как', 'он', 'она', 'они', 'мы', 'вы'
            }
        }

    def extract_chinese_text(self, text: str) -> List[str]:
        """Extract Chinese characters/phrases"""
        pattern = re.compile(r'[\u4e00-\u9fff]+')
        return pattern.findall(text)

    def extract_japanese_text(self, text: str) -> List[str]:
        """Extract Japanese text (kanji + hiragana + katakana)"""
        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]+')
        return pattern.findall(text)

    def extract_korean_text(self, text: str) -> List[str]:
        """Extract Korean words (Hangul syllable block)"""
        pattern = re.compile(r'[\uAC00-\uD7AF]+')
        return pattern.findall(text)

    def extract_european_words(self, text: str) -> List[str]:
        """Extract words from European languages (Latin and Cyrillic scripts)"""
        pattern = re.compile(r'[a-zA-ZäöüßÄÖÜáéíóúñÁÉÍÓÚÑàèìòùÀÈÌÒÙ\u0400-\u04FF]+')
        return pattern.findall(text)

    def filter_by_length(self, items: List[str], min_len: int = 2,
                         max_len: int = 15) -> List[str]:
        """Filter items by character length"""
        return [item for item in items if min_len <= len(item) <= max_len]

    def filter_stop_words(self, items: List[str], language: str) -> List[str]:
        """Remove common stop words"""
        stop_words = self.stop_words.get(language, set())
        # Case-insensitive comparison for languages with letter case
        if language in ['en', 'de', 'es', 'ru']:
            return [item for item in items if item.lower() not in stop_words]
        return [item for item in items if item not in stop_words]

    def extract_vocabulary_by_language(self, text: str, language: str) -> List[str]:
        """Extract vocabulary based on language type"""
        if language in ['zh-cn', 'zh-tw']:
            return self.extract_chinese_text(text)
        elif language == 'ja':
            return self.extract_japanese_text(text)
        elif language == 'ko':
            return self.extract_korean_text(text)
        else:
            return self.extract_european_words(text)

    def get_sentence_delimiter(self, language: str) -> str:
        """Get sentence delimiter pattern for a language"""
        # Fullwidth (CJK) and ASCII sentence-ending punctuation, plus newlines;
        # currently the same pattern is used for every language
        return r'[。！？.!?\n]+'

    def extract_context_sentence(self, word: str, text: str,
                                 language: str = 'zh-cn') -> str:
        """Extract context around the word"""
        delimiter = self.get_sentence_delimiter(language)
        sentences = re.split(delimiter, text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return ""

        # Find the first sentence containing the word
        word_sentence_idx = -1
        for idx, sentence in enumerate(sentences):
            if word in sentence:
                word_sentence_idx = idx
                break
        if word_sentence_idx == -1:
            return ""

        word_sentence = sentences[word_sentence_idx]

        # A sentence consisting only of the word itself (ignoring spaces) is
        # treated as a title when it appears near the start of the text or
        # is not the final sentence
        is_same_as_sentence = (word_sentence == word or
                               word_sentence.replace(' ', '') == word.replace(' ', ''))
        is_title = (is_same_as_sentence and
                    (word_sentence_idx <= 3 or word_sentence_idx < len(sentences) - 1))

        context_sentences = []
        if is_title:
            # Use the title plus the first substantial sentence that follows
            context_sentences.append(word_sentence)
            for i in range(word_sentence_idx + 1,
                           min(word_sentence_idx + 3, len(sentences))):
                next_sentence = sentences[i]
                if len(next_sentence) > 3:
                    context_sentences.append(next_sentence)
                    break
        else:
            # Surround the match with its neighboring sentences
            if word_sentence_idx > 0:
                prev_sentence = sentences[word_sentence_idx - 1]
                if len(prev_sentence) > 5:
                    context_sentences.append(prev_sentence)
            context_sentences.append(word_sentence)
            if word_sentence_idx < len(sentences) - 1:
                next_sentence = sentences[word_sentence_idx + 1]
                if len(next_sentence) > 5:
                    context_sentences.append(next_sentence)

        # CJK text is joined without spaces; other languages use spaces
        if language in ['zh-cn', 'zh-tw', 'ja']:
            context = ''.join(context_sentences)
        else:
            context = ' '.join(context_sentences)

        # Truncate overly long contexts
        if len(context) > 150:
            context = context[:150] + '...'
        return context

    def translate_to_target(self, text: str, source_lang: str,
                            target_lang: str = 'en') -> str:
        """Translate text to target language, caching results"""
        cache_key = f"{source_lang}:{target_lang}:{text}"
        if cache_key in self.translator_cache:
            return self.translator_cache[cache_key]
        try:
            source = self.lang_map.get(source_lang, source_lang)
            target = self.lang_map.get(target_lang, target_lang)
            translator = GoogleTranslator(source=source, target=target)
            translation = translator.translate(text)
            self.translator_cache[cache_key] = translation
            return translation
        except Exception:
            return f"[Translation failed: {text}]"

    def extract_learnable_items(self, ocr_result: Dict[str, Any],
                                target_lang: str = 'en') -> List[Dict[str, Any]]:
        """Extract vocabulary items from OCR result"""
        original_text = ocr_result.get('original_text', '') or ocr_result.get('text', '')
        language = ocr_result.get('detected_language', 'unknown')
        filename = ocr_result.get('filename', '')

        if not original_text or language == 'unknown':
            return []
        language = language.lower()

        # Extract vocabulary
        vocabulary_items = self.extract_vocabulary_by_language(original_text, language)
        if not vocabulary_items:
            return []

        # Length constraints per script: CJK words are short in characters
        if language in ['zh-cn', 'zh-tw', 'ja']:
            min_len, max_len = 2, 6
        elif language == 'ko':
            min_len, max_len = 2, 10
        else:
            min_len, max_len = 3, 15

        filtered_items = self.filter_by_length(vocabulary_items,
                                               min_len=min_len, max_len=max_len)
        filtered_items = self.filter_stop_words(filtered_items, language)

        # Remove duplicates while preserving order; cap at 10 items per result
        unique_items = list(dict.fromkeys(filtered_items))[:10]
        if not unique_items:
            return []

        items = []
        for idx, item in enumerate(unique_items):
            # Get translation
            if language == target_lang:
                translation = item
            else:
                translation = self.translate_to_target(item, language, target_lang)

            # Skip if translation is same as original (nothing to learn)
            if translation.strip().lower() == item.strip().lower():
                continue

            # Extract context
            context = self.extract_context_sentence(item, original_text, language)
            context_translated = ""
            if context and language != target_lang:
                context_translated = self.translate_to_target(context, language, target_lang)

            items.append({
                'id': idx + 1,
                'front': item,
                'back': translation,
                'context': context,
                'context_en': context_translated,
                'language': language,
                'content_type': 'ocr_vocab',
                'source_file': filename,
            })
        return items

    def generate_flashcards(self, ocr_results: List[Dict[str, Any]],
                            target_lang: str = 'en') -> Dict[str, Any]:
        """Generate flashcards from OCR results"""
        all_cards = []
        for result in ocr_results:
            learnable_items = self.extract_learnable_items(result, target_lang)
            all_cards.extend(learnable_items)
        return {
            'total_cards': len(all_cards),
            'cards': all_cards,
            'metadata': {
                'generator': 'FlashcardGenerator v2.0',
                'method': 'context-extraction',
            }
        }

    def save_flashcards(self, flashcards: Dict[str, Any], output_path: str):
        """Save flashcards to JSON file"""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(flashcards, f, ensure_ascii=False, indent=2)

    def load_ocr_results(self, input_path: str) -> List[Dict[str, Any]]:
        """Load OCR results from JSON file"""
        with open(input_path, 'r', encoding='utf-8') as f:
            return json.load(f)
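

# Minimal usage sketch. Assumptions: the upstream OCR pipeline emits dicts
# with the 'original_text' (or 'text'), 'detected_language', and 'filename'
# keys read by extract_learnable_items above; the sample text and output
# path are hypothetical. Note that translate_to_target makes live requests
# through deep_translator, so this requires network access.
if __name__ == '__main__':
    generator = FlashcardGenerator()
    sample_results = [{
        'original_text': '今天天气很好。我们一起去公园散步吧。',
        'detected_language': 'zh-cn',
        'filename': 'sample_photo.jpg',
    }]
    flashcards = generator.generate_flashcards(sample_results, target_lang='en')
    generator.save_flashcards(flashcards, 'flashcards.json')
    print(f"Generated {flashcards['total_cards']} cards")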