# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
"""
import io
import re
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator

# Try to import optional dependencies; feature flags gate their use below.
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False

try:
    from langdetect import detect
    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

try:
    from paddleocr import PaddleOCR
    HAS_PADDLEOCR = True
    _paddle_ocr = None  # lazily created singleton, see _get_paddle_ocr()
except ImportError:
    HAS_PADDLEOCR = False
    _paddle_ocr = None

# Language code mapping: detected codes -> GoogleTranslator codes
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',
    'zh-tw': 'zh-TW',
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}

# Tesseract language codes for each supported language
TESSERACT_LANG_MAP = {
    'en': 'eng', 'english': 'eng',
    'zh-cn': 'chi_sim', 'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',
    'ja': 'jpn', 'japanese': 'jpn',
    'ko': 'kor', 'korean': 'kor',
    'de': 'deu', 'german': 'deu',
    'es': 'spa', 'spanish': 'spa',
    'ru': 'rus', 'russian': 'rus',
    'fr': 'fra', 'french': 'fra',
}


def _get_paddle_ocr():
    """Lazily initialize the module-level PaddleOCR engine.

    Returns the cached engine, or None when PaddleOCR is unavailable or
    its initialization failed (failure is logged, never raised).
    """
    global _paddle_ocr
    if HAS_PADDLEOCR and _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr


def filter_pinyin_keep_chinese(text: str) -> str:
    """Filter out pinyin and keep only Chinese characters.

    Lines that are pure pinyin (Latin letters incl. tone-marked vowels)
    are dropped; from lines containing Chinese, only the CJK runs are
    kept and concatenated. Returns the surviving lines joined by '\\n'.
    """
    filtered_lines = []
    for line in text.split('\n'):
        line_stripped = line.strip()
        if not line_stripped:
            continue
        # CJK Unified Ideographs + Extension A
        has_chinese = bool(re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', line))
        # Pure pinyin: ASCII letters plus the tone-marked vowel set
        is_pinyin = bool(re.match(
            r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8'
            r'\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b'
            r'\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc'
            r'\u0144\u0148\u01f9\s]+$',
            line_stripped))
        if is_pinyin:
            continue
        if has_chinese:
            chinese_parts = re.findall(r'[\u4e00-\u9fff\u3400-\u4dbf]+', line)
            if chinese_parts:
                filtered_lines.append(''.join(chinese_parts))
    return '\n'.join(filtered_lines)


def detect_language_from_text(text: str) -> str:
    """Detect language, with special handling for CJK scripts.

    Script-range checks take priority (Chinese, then Japanese kana, then
    Hangul); otherwise fall back to langdetect when installed, and
    finally default to 'en'.
    """
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'
    # Hiragana / Katakana
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'
    # Hangul syllables
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'
    if HAS_LANGDETECT:
        try:
            return detect(text)
        except Exception:
            # langdetect raises on empty/undetectable input; fall through.
            pass
    return 'en'


def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Apply image preprocessing for better OCR accuracy.

    Args:
        img_array: RGB or grayscale image as a numpy array.
        method: one of 'simple', 'adaptive', 'clahe', 'denoised',
            'advanced'; any other value returns the grayscale image.

    Returns the processed (binarized/equalized) array; the input is
    returned unchanged when OpenCV is not available.
    """
    if not HAS_CV2:
        return img_array
    # Convert to grayscale if needed
    if len(img_array.shape) == 3:
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
    else:
        gray = img_array
    if method == 'simple':
        # Global Otsu binarization
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)
    elif method == 'clahe':
        # Contrast equalization before Otsu helps low-contrast scans
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'denoised':
        # Morphological opening removes small speckle noise
        kernel = np.ones((2, 2), np.uint8)
        denoised = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel, iterations=1)
        _, binary = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return binary
    elif method == 'advanced':
        # CLAHE + non-local-means denoising + adaptive threshold
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)
    else:
        return gray


def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Use PaddleOCR for text extraction (best for Chinese).

    Returns (text, confidence_percent) or (None, 0) on any failure.
    """
    paddle = _get_paddle_ocr()
    if paddle is None:
        return None, 0
    try:
        img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_array = np.array(img)
        result = paddle.ocr(img_array, cls=True)
        if not result or len(result) == 0 or result[0] is None:
            return None, 0
        texts = []
        scores = []
        # Each line is [box, (text, score)]
        for line in result[0]:
            if line and len(line) >= 2:
                text_info = line[1]
                if isinstance(text_info, tuple) and len(text_info) >= 2:
                    texts.append(text_info[0])
                    scores.append(text_info[1])
        if not texts:
            return None, 0
        full_text = '\n'.join(texts)
        avg_confidence = sum(scores) / len(scores) if scores else 0
        # Paddle scores are 0..1; report as a percentage like Tesseract.
        return full_text, avg_confidence * 100
    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0


def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Use Tesseract with multiple preprocessing methods.

    Tries each preprocessing variant and keeps the result with the
    highest mean word confidence.

    Returns (best_text, best_confidence, best_method_name).
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)
    best_text = ""
    best_confidence = 0
    best_method = ""
    methods = ['simple', 'adaptive', 'clahe', 'denoised']
    if HAS_CV2:
        methods.append('advanced')
    for method in methods:
        try:
            if HAS_CV2:
                processed = _preprocess_image(img_array, method)
                processed_img = Image.fromarray(processed)
            else:
                processed_img = img
            # Get per-word confidences alongside the plain text
            data = pytesseract.image_to_data(processed_img, lang=lang,
                                             output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)
            # conf entries may be strings or floats depending on the
            # pytesseract version; -1 marks non-word boxes and is skipped.
            confidences = [float(conf) for conf in data['conf'] if float(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method
        except Exception:
            # A failing preprocessing/OCR variant is simply skipped.
            continue
    return best_text.strip(), best_confidence, best_method


def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence, method
    """
    best_text = ""
    best_method = ""
    best_confidence = 0

    # Determine Tesseract language string: a source hint narrows it,
    # otherwise search across all supported scripts.
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        mapped = TESSERACT_LANG_MAP.get(source_lang.lower())
        if mapped:
            tess_lang = mapped

    # Try PaddleOCR first (best for Chinese)
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            best_text = paddle_text
            best_method = "PaddleOCR"
            best_confidence = paddle_conf

    # Tesseract fallback when PaddleOCR produced nothing
    if not best_text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text and (tess_conf > best_confidence or not best_text):
            best_text = tess_text
            best_method = f"Tesseract-{tess_method}"
            best_confidence = tess_conf

    if not best_text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # Filter pinyin for Chinese text; keep the raw text if filtering
    # removed everything (e.g. non-Chinese input).
    filtered_text = filter_pinyin_keep_chinese(best_text)
    if not filtered_text.strip():
        filtered_text = best_text

    detected_lang = detect_language_from_text(filtered_text)

    # Translate; translation failure is non-fatal and yields "".
    try:
        source = LANG_CODE_MAP.get(detected_lang, detected_lang)
        target = LANG_CODE_MAP.get(target_lang, target_lang)
        translator = GoogleTranslator(source=source, target=target)
        translated = translator.translate(filtered_text)
    except Exception:
        translated = ""

    return {
        "original_text": filtered_text.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(best_confidence, 2),
        "method": best_method
    }


def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Runs OCR on a batch of images with advanced processing.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)

    Returns:
        List of dicts with OCR results
    """
    results = []
    for img_bytes in images:
        result = ocr_single_image(
            image_bytes=img_bytes,
            target_lang=target_lang,
            use_paddle=prefer_ocr_local and HAS_PADDLEOCR
        )
        # Convert to expected format for backward compatibility
        results.append({
            "text": result.get("original_text", ""),
            "translation": result.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": result.get("detected_language", "unknown"),
            "confidence": result.get("confidence", 0),
            "method": result.get("method", "unknown"),
        })
    return results


# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Simple OCR using pytesseract (backward compatibility)"""
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    text = pytesseract.image_to_string(img)
    return text.strip()