| | import argparse |
| | import pymupdf |
| | from deep_translator import ( |
| | GoogleTranslator, |
| | ChatGptTranslator, |
| | ) |
| |
|
| | |
# Registry of supported translation backends, keyed by the name accepted
# on the command line (``--translator``) and by ``translate_pdf``.
TRANSLATORS = dict(
    google=GoogleTranslator,
    chatgpt=ChatGptTranslator,
)
| |
|
def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Text",
                  translator_name: str = "google", text_color: str = "darkred", keep_original: bool = True):
    """
    Translate a PDF file from source language to target language.

    Every text block of every page is translated and written back into the
    block's bounding box on an optional content group (OCG) layer. The
    result is saved next to the input file as ``<input stem>-<target_lang>.pdf``.

    Args:
        input_file: Path to input PDF file
        source_lang: Source language code (e.g. 'en', 'fr')
        target_lang: Target language code (e.g. 'ko', 'ja')
        layer: Name of the OCG layer (default: "Text")
        translator_name: Name of the translator to use (default: "google")
        text_color: Color of translated text (default: "darkred");
            unknown names fall back to "darkred"
        keep_original: Whether to keep original text visible (default: True)

    Raises:
        ValueError: If ``translator_name`` is not a key of ``TRANSLATORS``.
    """
    # Local imports keep this block self-contained without touching the
    # file-level import section.
    import html
    import os

    # Fail fast on a bad translator name before touching any file.
    if translator_name not in TRANSLATORS:
        raise ValueError(f"Unsupported translator: {translator_name}. Available translators: {', '.join(TRANSLATORS.keys())}")
    translator = TRANSLATORS[translator_name](source=source_lang, target=target_lang)

    WHITE = pymupdf.pdfcolor["white"]

    # Supported text colors as RGB fractions in the 0..1 range.
    COLOR_MAP = {
        "darkred": (0.8, 0, 0),
        "black": (0, 0, 0),
        "blue": (0, 0, 0.8),
        "darkgreen": (0, 0.5, 0),
        "purple": (0.5, 0, 0.5),
    }
    rgb_color = COLOR_MAP.get(text_color.lower(), COLOR_MAP["darkred"])

    # The CSS is identical for every block, so build it once.
    red, green, blue = (int(channel * 255) for channel in rgb_color)
    original_css = "* {font-family: sans-serif;}"
    translated_css = f"* {{font-family: sans-serif; color: rgb({red}, {green}, {blue});}}"

    # Join words that were hyphenated across line breaks during extraction.
    textflags = pymupdf.TEXT_DEHYPHENATE

    # splitext only strips a real extension of the final path component, so
    # dots in directory names and extension-less files are handled correctly.
    output_file = os.path.splitext(input_file)[0] + f'-{target_lang}.pdf'

    # The context manager closes the document even if translation fails.
    with pymupdf.open(input_file) as doc:
        # Layer holding the translated text, visible by default.
        ocg_trans = doc.add_ocg(layer, on=True)

        # When the original is not kept in the base layer, a copy of it goes
        # onto a separate layer that is switched off by default.
        ocg_orig = None if keep_original else doc.add_ocg("Original", on=False)

        for page in doc:
            for block in page.get_text("blocks", flags=textflags):
                bbox = block[:4]
                text = block[4]

                # Nothing to translate or redraw for empty blocks.
                if not text.strip():
                    continue

                # Fall back to the source text if the backend returns nothing,
                # so the block is never blanked out without a replacement.
                translated = translator.translate(text) or text

                if keep_original:
                    # The cover is part of the translation layer, so hiding
                    # that layer reveals the untouched original text.
                    page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_trans)
                else:
                    # Cover the base-layer text permanently first, then put
                    # the original copy on top of the cover; drawn in the
                    # opposite order the opaque rectangle would hide the
                    # "Original" layer even when it is switched on.
                    page.draw_rect(bbox, color=None, fill=WHITE)
                    page.insert_htmlbox(
                        bbox,
                        # insert_htmlbox parses its input as HTML; escape so
                        # '<' and '&' in the text render literally.
                        html.escape(text, quote=False),
                        css=original_css,
                        oc=ocg_orig
                    )

                page.insert_htmlbox(
                    bbox,
                    html.escape(translated, quote=False),
                    css=translated_css,
                    oc=ocg_trans
                )

        doc.subset_fonts()
        doc.ez_save(output_file)

    print(f"Translated PDF saved as: {output_file}")
| |
|
def main():
    """
    Command-line entry point: parse the arguments and translate one PDF.

    Can be invoked like this:
    ```
    # Basic usage
    python translator_cli.py --source english --target zh-CN input.pdf

    # With custom color and hiding original text
    python translator_cli.py --source english --target zh-CN --color blue --no-original input.pdf

    # Using ChatGPT translator
    export OPENAI_API_KEY=sk-proj-xxxx
    export OPENAI_API_BASE=https://api.xxxx.com/v1
    # ... or, for a local LLM API, use this base URL instead:
    export OPENAI_API_BASE=http://localhost:8080/v1
    export OPENAI_MODEL=default_model

    python translator_cli.py --source english --translator chatgpt --target zh-CN input.pdf

    # do not keep original text as an optional layer:
    python translator_cli.py --source english --translator chatgpt --target zh-CN --no-original input.pdf

    ```

    The translated content is an optional content layer in the new PDF file.
    The optional layer can be hidden in Acrobat PDF Reader and Foxit Reader.

    On any error the message is printed to stderr and the process exits
    with status 1.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import sys

    parser = argparse.ArgumentParser(description='Translate PDF documents.')
    parser.add_argument('input_file', help='Input PDF file path')
    parser.add_argument('--source', '-s', default='en',
                        help='Source language code (default: en)')
    parser.add_argument('--target', '-t', default='zh-CN',
                        help='Target language code (default: zh-CN)')
    parser.add_argument('--layer', '-l', default='Text',
                        help='Name of the OCG layer (default: Text)')
    parser.add_argument('--translator', '-tr', default='google',
                        choices=list(TRANSLATORS.keys()),
                        help='Translator to use (default: google)')
    parser.add_argument('--color', '-c', default='darkred',
                        choices=['darkred', 'black', 'blue', 'darkgreen', 'purple'],
                        help='Color of translated text (default: darkred)')
    parser.add_argument('--no-original', action='store_true',
                        help='Do not keep original text in base layer (default: False)')

    args = parser.parse_args()

    try:
        translate_pdf(
            args.input_file,
            args.source,
            args.target,
            layer=args.layer,
            translator_name=args.translator,
            text_color=args.color,
            # The flag is negative on the command line, positive in the API.
            keep_original=not args.no_original,
        )
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure to the
        # shell. sys.exit is used because the bare exit() helper is only
        # injected by the site module and may be missing (e.g. python -S).
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
| |
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()