"""Doc2Page: convert PDFs and images into standalone HTML pages.

The pipeline is: upload -> PP-StructureV3 layout-parsing API (markdown) ->
ERNIE chat completion (styled HTML), with a local ``markdown`` fallback when
no ERNIE client is configured or the call fails.

NOTE(review): this file was recovered from a copy whose line breaks were
collapsed and whose ``<...>`` spans (HTML tags and some code between them)
were stripped. Every reconstructed span is marked ``NOTE(review)`` below and
should be checked against the authoritative version.
"""

import base64
import os
import re
import tempfile
from pathlib import Path
from typing import Tuple

import gradio as gr
import markdown
import requests
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

# API Configuration
API_URL = os.getenv("API_URL", "")
API_TOKEN = os.getenv("API_TOKEN", "")

# Extensions accepted by both the upload widget and process_document().
SUPPORTED_EXTENSIONS = frozenset(
    {".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"}
)

# Matches one CJK unified ideograph; used when joining pages so that no
# space is inserted inside running Chinese text.
_CJK_CHAR = re.compile(r"[\u4e00-\u9fff]")


class Doc2PageConverter:
    """Turn a document into markdown (via OCR API) and then into HTML."""

    def __init__(self):
        """Create the ERNIE client when ``QIANFAN_TOKEN`` is set.

        ``self.client`` stays ``None`` without a token, in which case
        HTML generation uses the local markdown fallback.
        """
        self.qianfan_token = os.getenv('QIANFAN_TOKEN')
        self.qianfan_model = "ernie-x1.1-preview"
        self.client = None
        if self.qianfan_token:
            self.client = OpenAI(
                base_url="https://qianfan.baidubce.com/v2",
                api_key=self.qianfan_token
            )

    @staticmethod
    def _build_payload(file_data: str, file_type: int) -> dict:
        """Return the JSON request body for the layout-parsing API.

        Two keys were misspelled in the previous version
        (``textTetLimitSideLen``, ``useWirelessWableCellsTransToHtml``),
        so those two settings were presumably not picked up by the
        service; they are spelled correctly here.
        """
        return {
            "file": file_data,
            "fileType": file_type,
            "useFormulaRecognition": True,
            "useChartRecognition": False,
            "useDocOrientationClassify": False,
            "useDocUnwarping": False,
            "useTextlineOrientation": False,
            "useSealRecognition": True,
            "useRegionDetection": True,
            "useTableRecognition": True,
            "layoutThreshold": 0.5,
            "layoutNms": True,
            "layoutUnclipRatio": 1.0,
            "textDetLimitType": "min",
            "textDetLimitSideLen": 736,
            "textDetThresh": 0.30,
            "textDetBoxThresh": 0.60,
            "textDetUnclipRatio": 1.5,
            "textRecScoreThresh": 0.00,
            "sealDetLimitType": "min",
            "sealDetLimitSideLen": 736,
            "sealDetThresh": 0.20,
            "sealDetBoxThresh": 0.60,
            "sealDetUnclipRatio": 0.5,
            "sealRecScoreThresh": 0.00,
            "useOcrResultsWithTableCells": True,
            "useE2eWiredTableRecModel": False,
            "useE2eWirelessTableRecModel": False,
            "useWiredTableCellsTransToHtml": False,
            "useWirelessTableCellsTransToHtml": False,
            "useTableOrientationClassify": True,
        }

    def extract_text_with_api(self, file_path: str) -> str:
        """Extract text and structure using PP-StructureV3 API.

        Args:
            file_path: Path of the PDF or image to send.

        Returns:
            The concatenated markdown of all pages, or ``""`` when the
            response cannot be interpreted (the error is printed).

        Raises:
            ValueError: ``API_URL`` or ``API_TOKEN`` is not configured.
            RuntimeError: The HTTP request failed or returned an error
                status.
            OSError: The input file cannot be read.
        """
        # Validate configuration before any best-effort handling so the
        # caller sees this message instead of an empty extraction.
        if not API_URL or not API_TOKEN:
            raise ValueError(
                "API_URL and API_TOKEN must be configured in .env file")

        # fileType: 0 = PDF, 1 = image
        file_type = 0 if Path(file_path).suffix.lower() == ".pdf" else 1

        with open(file_path, "rb") as f:
            file_data = base64.b64encode(f.read()).decode("ascii")

        headers = {
            "Authorization": f"token {API_TOKEN}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                API_URL,
                json=self._build_payload(file_data, file_type),
                headers=headers,
                timeout=300,  # 5 minutes timeout
            )
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"API request failed: {str(e)}") from e

        # Interpreting the response stays best-effort, as before: an
        # unexpected shape is reported on stdout and yields "".
        try:
            layout_results = result.get("result", {}).get(
                "layoutParsingResults", [])

            markdown_list = []
            for res in layout_results:
                markdown_data = res["markdown"]
                page = markdown_data.copy()
                page["text"] = self.embed_images_into_markdown_text(
                    markdown_data["text"], markdown_data["images"]
                )
                markdown_list.append(page)

            return self.concatenate_markdown_pages(markdown_list)
        except Exception as e:
            print(f"Error in API extraction: {e}")
            return ""

    def embed_images_into_markdown_text(self, markdown_text, markdown_images):
        """Embed images into markdown text.

        Rewrites each image reference from the path reported by the API
        to the corresponding value of ``markdown_images``.

        NOTE(review): the replace() arguments and the return statement
        were lost in the recovered source; this assumes ``<img src="...">``
        tags keyed by path - confirm.
        """
        for img_path, img_url in (markdown_images or {}).items():
            markdown_text = markdown_text.replace(
                f'<img src="{img_path}"', f'<img src="{img_url}"'
            )
        return markdown_text

    def concatenate_markdown_pages(self, markdown_list):
        """Join per-page markdown dicts into one document.

        NOTE(review): this method was entirely missing from the recovered
        source (only its call site survived). It is rebuilt on the
        assumption that each page dict carries ``text`` plus the optional
        ``isStart`` / ``isEnd`` paragraph-boundary flags - confirm against
        the API documentation.
        """
        combined = ""
        previous_page_ended_paragraph = True

        for page in markdown_list:
            text = page.get("text") or ""
            starts_paragraph = page.get("isStart", True)

            if not starts_paragraph and not previous_page_ended_paragraph:
                # A paragraph continues across the page break: join with a
                # space unless either side is a CJK character.
                last_char = combined[-1:] if combined else ""
                first_char = text[:1]
                touches_cjk = bool(
                    _CJK_CHAR.match(last_char) or _CJK_CHAR.match(first_char)
                )
                combined += text if touches_cjk else " " + text
            else:
                combined += "\n\n" + text

            previous_page_ended_paragraph = page.get("isEnd", True)

        return combined

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding ``` / ```html fence and outer whitespace."""
        text = text.strip()  # strip first so the fence checks can match
        if text.startswith('```html'):
            text = text[7:]
        elif text.startswith('```'):
            text = text[3:]
        if text.endswith('```'):
            text = text[:-3]
        return text.strip()

    # NOTE(review): the def line below was lost in the recovered source;
    # name and parameter are taken from the call sites in this class.
    def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
        """Convert markdown to HTML using ERNIE API.

        Falls back to :meth:`basic_markdown_to_html` when no client is
        configured, when the API call raises, or when the model returns
        no usable content.
        """
        if not self.client:
            # Fallback to basic markdown conversion if no API client
            return self.basic_markdown_to_html(markdown_text)

        try:
            # NOTE(review): "<!DOCTYPE html>" and "</html>" in the last
            # paragraph were stripped from the recovered source - confirm.
            prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.

Design requirements:
- Use Inter font from Google Fonts
- Clean, modern spacing and typography
- Subtle shadows and rounded corners
- Good color contrast and hierarchy
- Responsive design that works on all devices
- Include proper HTML structure with head, body, and semantic elements

Important: Add a footer at the bottom with "Powered by PaddleOCR and ERNIE" where PaddleOCR links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.

Markdown content:
{markdown_text}

IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""

            messages = [{"role": "user", "content": prompt}]
            response = self.client.chat.completions.create(
                model=self.qianfan_model,
                messages=messages,
                max_tokens=64000,
            )

            # content may be None; treat it like an empty answer
            html_content = self._strip_code_fence(
                response.choices[0].message.content or ""
            )
            if not html_content:
                # An empty page would be reported as a failure upstream
                return self.basic_markdown_to_html(markdown_text)
            return html_content

        except Exception as e:
            print(f"Error calling ERNIE API: {e}")
            return self.basic_markdown_to_html(markdown_text)

    def basic_markdown_to_html(self, markdown_text: str) -> str:
        """Fallback markdown to HTML conversion.

        NOTE(review): only the title text and the ``{html}`` slot of the
        original template survived extraction; the markup and styling
        below are a minimal stand-in - replace with the original.
        """
        html = markdown.markdown(markdown_text)

        # Plain (non-f) string so the CSS braces need no escaping.
        template = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Converted Document</title>
    <style>
        body {
            font-family: 'Inter', system-ui, sans-serif;
            line-height: 1.6;
            color: #1a1a1a;
            background: #fafafa;
            margin: 0;
            padding: 32px 16px;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            background: #ffffff;
            border-radius: 12px;
            padding: 32px;
        }
        img, table { max-width: 100%; }
    </style>
</head>
<body>
    <div class="container">
__DOC2PAGE_CONTENT__
    </div>
</body>
</html>
"""
        return template.replace("__DOC2PAGE_CONTENT__", html)

    def process_document(self, file_path: str) -> Tuple[str, str]:
        """Process uploaded document and convert to HTML.

        Returns:
            ``(markdown, html)`` on success, or ``(message, "")`` when the
            format is unsupported, nothing was extracted, or an error
            occurred.
        """
        try:
            if Path(file_path).suffix.lower() not in SUPPORTED_EXTENSIONS:
                return ("Error: Unsupported file format. "
                        "Please upload PDF or image files."), ""

            # Process with PP-StructureV3 API
            markdown_content = self.extract_text_with_api(file_path)

            if not markdown_content.strip():
                return ("Warning: No text content extracted "
                        "from the document."), ""

            # Convert markdown to HTML using ERNIE or fallback
            html_content = self.markdown_to_html_with_ernie(markdown_content)
            return markdown_content, html_content

        except Exception as e:
            return f"Error processing document: {str(e)}", ""


# Initialize converter
converter = Doc2PageConverter()


def _uploaded_path(file) -> str:
    """Return the filesystem path of a Gradio upload.

    Accepts both an object exposing ``.name`` and a plain path string,
    since the value Gradio passes depends on its version/configuration.
    """
    return str(getattr(file, "name", file))


def process_upload(file):
    """Process uploaded file and return ``(status, markdown, html)``."""
    if file is None:
        return "Please upload a file.", "", ""

    try:
        markdown_result, html_result = converter.process_document(
            _uploaded_path(file)
        )
    except Exception as e:
        return f"Error: {str(e)}", "", ""

    if html_result:
        return "Document processed successfully!", markdown_result, html_result
    # On failure process_document puts the message in the first slot
    return markdown_result, "", ""


def save_html_file(html_content, filename="converted_page"):
    """Save HTML content to a temporary file for download.

    Returns:
        The path of the written file, or ``None`` when there is no
        content. The file is not deleted automatically.
    """
    if not html_content:
        return None

    # Explicit UTF-8: the page may hold emoji/CJK text that a platform
    # default encoding cannot represent.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.html',
        delete=False,
        prefix=f"{filename}_",
        encoding="utf-8",
    ) as temp_file:
        temp_file.write(html_content)
    return temp_file.name


# Create custom theme for a clean, modern look
custom_theme = gr.themes.Default(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="gray",
    font=("Inter", "system-ui", "sans-serif"),
    font_mono=("SF Mono", "Consolas", "monospace")
).set(
    body_background_fill="#fafafa",
    background_fill_primary="#ffffff",
    background_fill_secondary="#f8f9fa",
    border_color_primary="#e5e7eb",
    button_primary_background_fill="#6366f1",
    button_primary_background_fill_hover="#4f46e5",
    button_primary_text_color="#ffffff",
)

APP_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto;
    padding: 32px 16px;
}

/* Enhanced button styling */
.gr-button {
    font-weight: 500;
    border-radius: 10px;
    font-size: 14px;
    transition: all 0.2s ease;
    box-shadow: 0 2px 4px rgba(99, 102, 241, 0.1);
}
.gr-button:hover {
    transform: translateY(-1px);
    box-shadow: 0 4px 8px rgba(99, 102, 241, 0.2);
}

/* Input styling */
.gr-textbox, .gr-file {
    border-radius: 10px;
    font-family: 'Inter', system-ui, sans-serif;
    border: 1px solid #e5e7eb;
    transition: border-color 0.2s ease;
}
.gr-textbox:focus, .gr-file:focus {
    border-color: #6366f1;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1);
}

/* Typography */
h1 {
    font-weight: 700;
    color: #1a1a1a;
    margin-bottom: 8px;
    font-size: 2.5rem;
}
.app-description {
    color: #6b7280;
    font-size: 18px;
    margin-bottom: 40px;
    font-weight: 400;
}

/* Tab styling */
.gr-tab {
    border-radius: 8px 8px 0 0;
    font-weight: 500;
}

/* Card-like sections */
.gr-column {
    background: rgba(255, 255, 255, 0.5);
    border-radius: 12px;
    padding: 16px;
    margin: 8px;
}

/* Status styling */
.gr-textbox[data-testid*="status"] {
    background-color: #f8fafc;
    border: 1px solid #e2e8f0;
}

/* Download section styling */
.download-section {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    padding: 20px;
    color: white;
    margin-top: 20px;
}
"""

# NOTE(review): the HTML wrappers of the placeholder/banner/footer strings
# below were stripped from the recovered source; only their visible text
# survived. The tags used here are stand-ins - confirm.
_PREVIEW_INITIAL_HTML = (
    '<div style="text-align: center; padding: 40px; color: #6b7280;">'
    'Your converted webpage will appear here'
    '</div>'
)

_PREVIEW_EMPTY_HTML = """
<div style="text-align: center; padding: 40px; color: #6b7280;">
    <h3>No preview available</h3>
    <p>Convert a document to see the preview</p>
</div>
"""

# Create Gradio interface
# NOTE(review): container nesting below is inferred from statement order;
# the original indentation was lost - confirm the layout.
with gr.Blocks(
    title="Doc2Page - Simple Document Converter",
    theme=custom_theme,
    css=APP_CSS,
) as app:

    # Header
    gr.Markdown(
        "# Doc2Page",
        elem_classes="main-title"
    )
    gr.Markdown(
        "🥃 Transform your documents into beautiful webpages!",
        elem_classes="app-description"
    )

    # Main interface
    with gr.Row():
        with gr.Column(scale=1, min_width=350):
            with gr.Group():
                gr.Markdown("### 📄 Upload Document")
                file_input = gr.File(
                    label="Choose your file",
                    file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
                    file_count="single",
                    height=140
                )

                process_btn = gr.Button(
                    "✨ Convert to Webpage",
                    variant="primary",
                    size="lg",
                    scale=1
                )

            status_output = gr.Textbox(
                label="Status",
                placeholder="Ready to convert your document...",
                interactive=False,
                lines=3,
                max_lines=3
            )

        with gr.Column(scale=2):
            gr.Markdown("### 📋 Results")
            with gr.Tabs():
                with gr.TabItem("❤️ Preview", id="preview"):
                    html_preview = gr.HTML(
                        label="",
                        value=_PREVIEW_INITIAL_HTML,
                    )

                with gr.TabItem("📝 Markdown Source", id="markdown"):
                    markdown_output = gr.Textbox(
                        label="",
                        placeholder="Extracted markdown content will appear here...",
                        lines=22,
                        interactive=False,
                        show_copy_button=True
                    )

                with gr.TabItem("🌐 HTML Source", id="html"):
                    html_output = gr.Code(
                        label="",
                        language="html",
                        lines=22,
                        interactive=False
                    )

    # Success & Download section (hidden until a conversion succeeds)
    with gr.Row(visible=False) as download_section:
        with gr.Column():
            gr.Markdown("""
            <div class="download-section">
                <h3>✅ Conversion Successful!</h3>
                <p>Your document has been converted to a beautiful webpage</p>
            </div>
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📥 Download Your Webpage")
                    download_btn = gr.File(
                        label="HTML File",
                        visible=True
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### 🚀 Quick Deploy Guide")
                    gr.Markdown("""
                    1. **GitHub Pages**: Upload as `index.html` to your repo
                    2. **Netlify**: Drag & drop the file to netlify.app
                    3. **Vercel**: Use their simple file deployment
                    4. **Local**: Double-click to open in browser
                    """, elem_classes="deploy-guide")

    # Event handlers
    def process_and_update(file):
        """Run the conversion and build the values for every output."""
        status, markdown_content, html_content = process_upload(file)

        # Create download file if HTML was generated
        download_file = None
        show_download = False
        if html_content:
            filename = (
                Path(_uploaded_path(file)).stem if file else "converted_page"
            )
            download_file = save_html_file(html_content, filename)
            show_download = True

        # Show a placeholder in the preview tab when there is no HTML
        preview_content = html_content if html_content else _PREVIEW_EMPTY_HTML

        return (
            status,  # status_output
            markdown_content,  # markdown_output
            html_content,  # html_output
            preview_content,  # html_preview
            download_file,  # download_btn
            gr.update(visible=show_download)  # download_section
        )

    process_btn.click(
        fn=process_and_update,
        inputs=[file_input],
        outputs=[
            status_output,
            markdown_output,
            html_output,
            html_preview,
            download_btn,
            download_section
        ]
    )

    # Footer
    gr.Markdown(
        """
        <div style="text-align: center; color: #6b7280; margin-top: 32px;">
            Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR">PaddleOCR</a> for text extraction and <a href="https://huggingface.co/BAIDU">ERNIE</a> for HTML generation
        </div>
        """,
        elem_id="footer"
    )

    # Tips section
    with gr.Accordion("💡 Tips for Best Results", open=False):
        gr.Markdown("""
        **File Types:** PDF, PNG, JPG, JPEG, BMP, TIFF

        **For Best OCR Results:**
        - Use high-resolution, clear images
        - Ensure good contrast between text and background
        - Avoid skewed or rotated documents
        - PDFs generally produce the best results

        **🚀 Deploy to GitHub Pages:**
        1. Create a new GitHub repository or use an existing one
        2. Download the generated HTML file from above
        3. Upload it to your repository as `index.html`
        4. Go to repository Settings → Pages
        5. Select "Deploy from a branch" → Choose "main" branch
        6. Your page will be live at `https://yourusername.github.io/yourrepository`

        **💡 Pro Tips:**
        - Enable custom domains in GitHub Pages settings
        - Use GitHub Actions for automated deployments
        - Consider using Jekyll themes for enhanced styling
        """)

if __name__ == "__main__":
    app.launch()