# Imported from commit aa3fdef ("Integrate advanced OCR, FlashcardGenerator,
# DifficultyScorer, and AI Quiz from language project", author: yusenthebot)
# -*- coding: utf-8 -*-
"""
OCR Tools - Advanced text extraction with multi-language support
Supports: English, Chinese, Japanese, Korean, German, Spanish, Russian
"""
import io
import re
from typing import Any, Dict, List, Optional
import numpy as np
from PIL import Image
import pytesseract
from deep_translator import GoogleTranslator
# Try to import optional dependencies
try:
import cv2
HAS_CV2 = True
except ImportError:
HAS_CV2 = False
try:
from langdetect import detect
HAS_LANGDETECT = True
except ImportError:
HAS_LANGDETECT = False
try:
from paddleocr import PaddleOCR
HAS_PADDLEOCR = True
_paddle_ocr = None
except ImportError:
HAS_PADDLEOCR = False
_paddle_ocr = None
# Language code mapping: lowercase detector codes (langdetect /
# detect_language_from_text output) -> codes accepted by GoogleTranslator.
LANG_CODE_MAP = {
    'zh-cn': 'zh-CN',  # Simplified Chinese
    'zh-tw': 'zh-TW',  # Traditional Chinese
    'en': 'en',
    'ja': 'ja',
    'ko': 'ko',
    'fr': 'fr',
    'de': 'de',
    'es': 'es',
    'ru': 'ru',
}
# Tesseract language codes for each supported language.
# Keys accept both short codes ('ja') and English names ('japanese');
# values are Tesseract traineddata pack names.
TESSERACT_LANG_MAP = {
    'en': 'eng',
    'english': 'eng',
    'zh-cn': 'chi_sim',   # Simplified Chinese
    'chinese': 'chi_sim',
    'zh-tw': 'chi_tra',   # Traditional Chinese
    'ja': 'jpn',
    'japanese': 'jpn',
    'ko': 'kor',
    'korean': 'kor',
    'de': 'deu',
    'german': 'deu',
    'es': 'spa',
    'spanish': 'spa',
    'ru': 'rus',
    'russian': 'rus',
    'fr': 'fra',
    'french': 'fra',
}
def _get_paddle_ocr():
    """Return the shared PaddleOCR engine, creating it on first use.

    Returns None when paddleocr is not installed or initialization
    failed; a failed initialization is retried on the next call.
    """
    global _paddle_ocr
    if not HAS_PADDLEOCR:
        return None
    if _paddle_ocr is None:
        try:
            _paddle_ocr = PaddleOCR(use_textline_orientation=True, lang='ch', show_log=False)
        except Exception as e:
            print(f"[OCR] PaddleOCR init failed: {e}")
    return _paddle_ocr
def filter_pinyin_keep_chinese(text: str) -> str:
    """
    Remove pinyin romanization from *text*, keeping only the Chinese
    characters: one output line per input line that contained any
    Han characters, with all non-Han characters stripped out.
    """
    # Han ideographs: CJK Unified block plus Extension A.
    han_run = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+')
    # A "pinyin line" is latin letters, tone-marked vowels, and whitespace only.
    pinyin_line = re.compile(r'^[a-zA-Z\u0101\u00e1\u01ce\u00e0\u0113\u00e9\u011b\u00e8\u012b\u00ed\u01d0\u00ec\u014d\u00f3\u01d2\u00f2\u016b\u00fa\u01d4\u00f9\u00fc\u01d6\u01d8\u01da\u01dc\u0144\u0148\u01f9\s]+$')

    kept = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        if pinyin_line.match(stripped):
            # Pure pinyin annotation line — drop it entirely.
            continue
        if han_run.search(raw_line):
            runs = han_run.findall(raw_line)
            if runs:
                kept.append(''.join(runs))
    return '\n'.join(kept)
def detect_language_from_text(text: str) -> str:
    """Detect the language of *text*.

    CJK scripts are identified directly from Unicode ranges (checked in
    this priority order: Han -> Chinese, kana -> Japanese, hangul ->
    Korean, so kanji-bearing Japanese text resolves as 'zh-cn' — this
    mirrors the original priority). Other text falls back to langdetect
    when available, else 'en'.

    Args:
        text: Text whose language should be identified.

    Returns:
        Lowercase language code such as 'zh-cn', 'ja', 'ko', or 'en'.
    """
    # Han ideographs (CJK Unified + Extension A) -> Simplified Chinese.
    if re.search(r'[\u4e00-\u9fff\u3400-\u4dbf]', text):
        return 'zh-cn'
    # Hiragana / Katakana are unique to Japanese.
    if re.search(r'[\u3040-\u309f\u30a0-\u30ff]', text):
        return 'ja'
    # Hangul syllables.
    if re.search(r'[\uac00-\ud7af]', text):
        return 'ko'
    if HAS_LANGDETECT:
        try:
            return detect(text)
        except Exception:
            # Was a bare `except:` which also swallowed SystemExit /
            # KeyboardInterrupt; langdetect raises on short/ambiguous input.
            pass
    return 'en'
def _preprocess_image(img_array: np.ndarray, method: str = 'simple') -> np.ndarray:
    """Return a preprocessed version of *img_array* tuned for OCR.

    Methods: 'simple' (Otsu binarization), 'adaptive' (Gaussian adaptive
    threshold), 'clahe' (contrast enhancement + Otsu), 'denoised'
    (morphological open + Otsu), 'advanced' (CLAHE + non-local-means
    denoise + adaptive threshold). Unknown methods return the grayscale
    image; without OpenCV the input is returned unchanged.
    """
    if not HAS_CV2:
        return img_array

    # Collapse RGB input to a single channel first.
    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) if len(img_array.shape) == 3 else img_array

    def _otsu(src):
        # Otsu chooses the binarization threshold automatically.
        _, out = cv2.threshold(src, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return out

    if method == 'simple':
        return _otsu(gray)
    if method == 'adaptive':
        return cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    if method == 'clahe':
        enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        return _otsu(enhanced)
    if method == 'denoised':
        opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, np.ones((2, 2), np.uint8), iterations=1)
        return _otsu(opened)
    if method == 'advanced':
        enhanced = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)
        denoised = cv2.fastNlMeansDenoising(enhanced, None, 10, 7, 21)
        return cv2.adaptiveThreshold(denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return gray
def _ocr_with_paddleocr(image_bytes: bytes) -> tuple:
    """Run PaddleOCR (strongest on Chinese) over raw image bytes.

    Returns:
        (text, confidence_percent), or (None, 0) when the engine is
        unavailable, nothing is recognized, or an error occurs.
    """
    engine = _get_paddle_ocr()
    if engine is None:
        return None, 0
    try:
        pixels = np.array(Image.open(io.BytesIO(image_bytes)).convert("RGB"))
        result = engine.ocr(pixels, cls=True)
        # result[0] holds per-line entries of the form (box, (text, score)).
        if not result or result[0] is None:
            return None, 0
        recognized = []
        for entry in result[0]:
            if not entry or len(entry) < 2:
                continue
            payload = entry[1]
            if isinstance(payload, tuple) and len(payload) >= 2:
                recognized.append((payload[0], payload[1]))
        if not recognized:
            return None, 0
        texts, scores = zip(*recognized)
        # Scores come back in [0, 1]; report a percentage.
        return '\n'.join(texts), (sum(scores) / len(scores)) * 100
    except Exception as e:
        print(f"[OCR] PaddleOCR error: {e}")
        return None, 0
def _ocr_with_tesseract(image_bytes: bytes, lang: str = 'eng+chi_sim+jpn+kor') -> tuple:
    """Run Tesseract OCR, trying several preprocessing pipelines and
    keeping the result with the highest mean word confidence.

    Args:
        image_bytes: Raw image file bytes.
        lang: Tesseract language string (e.g. 'eng+chi_sim').

    Returns:
        (best_text, best_confidence, best_method); ('', 0, '') when
        nothing was recognized by any pipeline.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(img)
    best_text = ""
    best_confidence = 0
    best_method = ""
    if HAS_CV2:
        methods = ['simple', 'adaptive', 'clahe', 'denoised', 'advanced']
    else:
        # Without OpenCV every "method" would OCR the identical raw image;
        # the original looped four times for the same result — one pass
        # is sufficient and equivalent (first method always won the tie).
        methods = ['simple']
    for method in methods:
        try:
            if HAS_CV2:
                processed_img = Image.fromarray(_preprocess_image(img_array, method))
            else:
                processed_img = img
            # image_to_data yields per-word confidences (<= 0 for non-word rows).
            data = pytesseract.image_to_data(processed_img, lang=lang, output_type=pytesseract.Output.DICT)
            text = pytesseract.image_to_string(processed_img, lang=lang)
            confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            if text.strip() and avg_confidence > best_confidence:
                best_text = text
                best_confidence = avg_confidence
                best_method = method
        except Exception:
            # A failing preprocessing/OCR combination just moves on to the next.
            continue
    return best_text.strip(), best_confidence, best_method
def ocr_single_image(
    image_bytes: bytes,
    source_lang: Optional[str] = None,
    target_lang: str = "en",
    use_paddle: bool = True,
) -> Dict[str, Any]:
    """
    Extract text from a single image and translate it.

    Args:
        image_bytes: Raw image bytes
        source_lang: Source language hint (auto-detect if None)
        target_lang: Target language for translation
        use_paddle: Whether to try PaddleOCR first

    Returns:
        Dict with original_text, translated_text, detected_language,
        confidence, method (plus an "error" key when nothing is detected)
    """
    # Narrow the Tesseract language packs when the caller supplied a hint.
    tess_lang = 'eng+chi_sim+chi_tra+jpn+kor+deu+spa+rus+fra'
    if source_lang:
        hinted = TESSERACT_LANG_MAP.get(source_lang.lower())
        if hinted:
            tess_lang = hinted

    text, method, confidence = "", "", 0
    # PaddleOCR first — it is markedly better on Chinese text.
    if use_paddle and HAS_PADDLEOCR:
        paddle_text, paddle_conf = _ocr_with_paddleocr(image_bytes)
        if paddle_text and paddle_text.strip():
            text, method, confidence = paddle_text, "PaddleOCR", paddle_conf

    # Fall back to Tesseract when PaddleOCR produced nothing usable.
    if not text.strip():
        tess_text, tess_conf, tess_method = _ocr_with_tesseract(image_bytes, tess_lang)
        if tess_text and (tess_conf > confidence or not text):
            text, method, confidence = tess_text, f"Tesseract-{tess_method}", tess_conf

    if not text.strip():
        return {
            "original_text": "",
            "translated_text": "",
            "detected_language": "unknown",
            "confidence": 0,
            "method": "none",
            "error": "No text detected"
        }

    # Strip pinyin lines; if that removes everything, keep the raw OCR text.
    cleaned = filter_pinyin_keep_chinese(text)
    if not cleaned.strip():
        cleaned = text

    detected_lang = detect_language_from_text(cleaned)

    # Translation is best-effort: any failure yields an empty translation.
    try:
        translated = GoogleTranslator(
            source=LANG_CODE_MAP.get(detected_lang, detected_lang),
            target=LANG_CODE_MAP.get(target_lang, target_lang),
        ).translate(cleaned)
    except Exception:
        translated = ""

    return {
        "original_text": cleaned.strip(),
        "translated_text": translated.strip() if translated else "",
        "detected_language": detected_lang,
        "confidence": round(confidence, 2),
        "method": method
    }
def ocr_and_translate_batch(
    images: List[bytes],
    target_lang: str = "en",
    prefer_ocr_local: bool = True,
) -> List[Dict]:
    """
    Run OCR with advanced processing over a batch of images.

    Args:
        images: List of image bytes
        target_lang: Target language for translation
        prefer_ocr_local: Whether to prefer local OCR (PaddleOCR)

    Returns:
        List of dicts with OCR results in the legacy key layout
        ('text' / 'translation' instead of 'original_text' / 'translated_text')
    """
    def _legacy_shape(res: Dict) -> Dict:
        # Older callers expect 'text'/'translation' keys — remap here.
        return {
            "text": res.get("original_text", ""),
            "translation": res.get("translated_text", ""),
            "target_lang": target_lang,
            "detected_language": res.get("detected_language", "unknown"),
            "confidence": res.get("confidence", 0),
            "method": res.get("method", "unknown"),
        }

    return [
        _legacy_shape(ocr_single_image(
            image_bytes=blob,
            target_lang=target_lang,
            use_paddle=prefer_ocr_local and HAS_PADDLEOCR,
        ))
        for blob in images
    ]
# Keep old function for backward compatibility
def _simple_ocr(image_bytes: bytes) -> str:
    """Single-pass pytesseract OCR with no preprocessing (legacy helper)."""
    rgb = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    return pytesseract.image_to_string(rgb).strip()