In [None]:
%%capture
!pip install gradio transformers transformers-stream-generator qwen-vl-utils
!pip install torchvision torch huggingface_hub spaces accelerate ipython
!pip install pillow av python-docx requests numpy reportlab fpdf hf_xet
# Hold tight, installing the dependencies takes around 1-2 minutes.

In [None]:
# ================================================
#        Qwen3-VL-4B-Thinking-abliterated
# ================================================

# Model used in the app:
# https://huggingface.co/prithivMLmods/Qwen3-VL-4B-Thinking-abliterated

import os
import random
import uuid
import json
import time
import asyncio
from threading import Thread
from typing import Iterable

import gradio as gr
import spaces
import torch
import numpy as np
from PIL import Image
import io
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import docx
from docx.enum.text import WD_ALIGN_PARAGRAPH

from transformers import (
    Qwen3VLForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer
)
from qwen_vl_utils import process_vision_info
from gradio.themes import Soft
from gradio.themes.utils import colors, fonts, sizes

# --- Theme and CSS Definition ---

# Shade table for the custom "steel_blue" hue, keyed by the c50..c950
# keyword names that colors.Color is called with.
_STEEL_BLUE_SHADES = {
    "c50": "#EBF3F8",
    "c100": "#D3E5F0",
    "c200": "#A8CCE1",
    "c300": "#7DB3D2",
    "c400": "#529AC3",
    "c500": "#4682B4",  # SteelBlue base color
    "c600": "#3E72A0",
    "c700": "#36638C",
    "c800": "#2E5378",
    "c900": "#264364",
    "c950": "#1E3450",
}

# Attach the palette to the gradio colors namespace so it can be used as
# the default secondary hue of SteelBlueTheme.
colors.steel_blue = colors.Color(name="steel_blue", **_STEEL_BLUE_SHADES)

class SteelBlueTheme(Soft):
    """Gradio theme derived from ``Soft`` with steel-blue accents.

    The constructor forwards the hue, text-size and font choices to ``Soft``
    and then overrides a fixed set of theme variables: page and block
    backgrounds, primary-button gradients, slider colour and block chrome.
    All parameters are keyword-only.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        # Everything the caller can customise goes straight to the base theme.
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Fixed variable overrides applied on top of the base theme.
        variable_overrides = {
            # Page and block backgrounds (light / dark).
            "background_fill_primary": "*primary_50",
            "background_fill_primary_dark": "*primary_900",
            "body_background_fill": "linear-gradient(135deg, *primary_200, *primary_100)",
            "body_background_fill_dark": "linear-gradient(135deg, *primary_900, *primary_800)",
            # Primary button text and gradient fills.
            "button_primary_text_color": "white",
            "button_primary_text_color_hover": "white",
            "button_primary_background_fill": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *secondary_600, *secondary_800)",
            "button_primary_background_fill_hover_dark": "linear-gradient(90deg, *secondary_500, *secondary_500)",
            # Slider accent.
            "slider_color": "*secondary_500",
            "slider_color_dark": "*secondary_600",
            # Block chrome and remaining accents.
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_primary_shadow": "*shadow_drop_lg",
            "button_large_padding": "11px",
            "color_accent_soft": "*primary_100",
            "block_label_background_fill": "*primary_200",
        }
        super().set(**variable_overrides)

# Theme instance handed to gr.Blocks when the UI is built.
steel_blue_theme = SteelBlueTheme()

# --- Model and App Setup ---

# Define model options: display name (shown in the dropdown) -> Hub repo id.
MODEL_OPTIONS = {
    "Qwen3-VL-4B-Thinking-abliterated": "prithivMLmods/Qwen3-VL-4B-Thinking-abliterated",
}

# Preload models and processors into CUDA.
# Both dicts are keyed by the display name and read by qwen_inference.
models = {}
processors = {}
for name, model_id in MODEL_OPTIONS.items():
    # The emoji used to be mojibake ("ðŸ¤—": its UTF-8 bytes read as cp1252).
    print(f"Loading {name}🤗. Hold tight, this will take around 4-6 minutes..")
    models[name] = Qwen3VLForConditionalGeneration.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16
    ).to("cuda").eval()
    processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Mapping whose keys are the file extensions Pillow has registered;
# qwen_inference uses the keys to recognise image paths by suffix.
image_extensions = Image.registered_extensions()

def identify_and_save_blob(blob_path):
    """Verify that a file holds an image and save a copy under a temp name.

    The file is read fully into memory, checked with Pillow's ``verify()``,
    and its bytes are written unchanged to ``temp_<uuid>_media.png`` in the
    current working directory. The ``.png`` suffix is nominal: the bytes are
    not re-encoded.

    Args:
        blob_path: Path of the uploaded file to inspect.

    Returns:
        A ``(filename, media_type)`` tuple; ``media_type`` is always
        ``"image"``.

    Raises:
        ValueError: If the file does not exist, is not a valid image, or any
            other error occurs while reading or writing. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        # Read everything up front so the source handle is closed before the
        # copy is written.
        with open(blob_path, "rb") as source:
            blob_content = source.read()

        try:
            Image.open(io.BytesIO(blob_content)).verify()
        except (IOError, SyntaxError) as exc:
            raise ValueError("Unsupported media type. Please upload a valid image.") from exc

        extension = ".png"
        media_type = "image"
        filename = f"temp_{uuid.uuid4()}_media{extension}"
        with open(filename, "wb") as target:
            target.write(blob_content)

        return filename, media_type

    except FileNotFoundError as exc:
        raise ValueError(f"The file {blob_path} was not found.") from exc
    except ValueError:
        # Already a caller-facing message (e.g. "Unsupported media type");
        # re-raise as-is instead of wrapping it in a second message.
        raise
    except Exception as exc:
        raise ValueError(f"An error occurred while processing the file: {exc}") from exc

@spaces.GPU
def qwen_inference(model_name, media_input, text_input=None, max_new_tokens=2048, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2):
    """Stream the selected model's answer about an uploaded image.

    Args:
        model_name: Key into the module-level ``models`` / ``processors`` dicts.
        media_input: Filesystem path of the uploaded image.
        text_input: The user's question; ``None`` is treated as an empty prompt.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The text generated so far (the whole accumulated buffer, not just the
        newest chunk), with any ``<|im_end|>`` marker removed.

    Raises:
        gr.Error: If no file was provided or the file is not a usable image.
    """
    model = models[model_name]
    processor = processors[model_name]

    if not media_input:
        raise gr.Error("Please upload an image.")

    media_path = media_input
    # Compare case-insensitively so that e.g. "photo.JPG" is recognised by its
    # suffix instead of being re-read and copied by identify_and_save_blob.
    if media_path.lower().endswith(tuple(image_extensions)):
        media_type = "image"
    else:
        try:
            media_path, media_type = identify_and_save_blob(media_input)
        except Exception as e:
            # gr.Error (like the check above) so the message reaches the UI.
            raise gr.Error(f"Unsupported media type. Please upload a valid image. Error: {e}") from e

    messages = [
        {
            "role": "user",
            "content": [
                {"type": media_type, media_type: media_path},
                # The signature allows text_input=None; never put None in the prompt.
                {"type": "text", "text": text_input or ""},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        # Token counts must be integral; coerce in case the UI passes floats.
        max_new_tokens=int(max_new_tokens),
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        repetition_penalty=repetition_penalty,
    )

    # generate() runs in a worker thread and pushes text into the streamer,
    # which this generator drains and relays to the caller.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        yield buffer

    # The streamer is exhausted, so generation is over; reap the worker
    # instead of leaving it dangling.
    thread.join()

def format_plain_text(output_text):
    r"""Return the text with the delimiters \( \) \[ \] removed.

    The four two-character sequences are stripped one after another in that
    order; all other content is left untouched.
    """
    cleaned = output_text
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        cleaned = cleaned.replace(delimiter, "")
    return cleaned

def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):
    """Build a PDF or DOCX containing the input image and the cleaned output text.

    Raises ``gr.Error`` when no image path is given. The text is passed
    through ``format_plain_text`` and the job is handed to ``generate_pdf``
    or ``generate_docx`` depending on ``file_format`` (``"pdf"`` / ``"docx"``).
    Returns the generated file's name, or ``None`` for any other format.
    """
    if not media_path:
        raise gr.Error("Cannot generate document without an input image.")

    cleaned_text = format_plain_text(output_text)
    # Both writers take exactly the same positional arguments.
    writer_args = (media_path, cleaned_text, font_size, line_spacing, alignment, image_size)

    if file_format == "pdf":
        return generate_pdf(*writer_args)
    if file_format == "docx":
        return generate_docx(*writer_args)
    return None

def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):
    """Generate an A4 PDF with the image followed by the text.

    Args:
        media_path: Path of the image to embed.
        plain_text: Text to place under the image; newlines become line breaks.
        font_size: Font size in points (anything ``int()`` accepts).
        line_spacing: Multiplier applied to the font size to get the leading.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``
            (2, 4 or 6 inch bounding box).

    Returns:
        The name of the written file, ``output_<uuid>.pdf``.

    Raises:
        KeyError: If ``alignment`` or ``image_size`` is not one of the
            listed values.
    """
    # Function-scope import so this block does not depend on a file-level one.
    from xml.sax.saxutils import escape

    filename = f"output_{uuid.uuid4()}.pdf"
    doc = SimpleDocTemplate(
        filename, pagesize=A4, rightMargin=inch, leftMargin=inch, topMargin=inch, bottomMargin=inch
    )

    body_style = getSampleStyleSheet()["Normal"]
    body_style.fontSize = int(font_size)
    body_style.leading = int(font_size) * line_spacing
    body_style.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]

    image_sizes = {"Small": (2 * inch, 2 * inch), "Medium": (4 * inch, 4 * inch), "Large": (6 * inch, 6 * inch)}
    box_width, box_height = image_sizes[image_size]
    # Fit the picture inside the box while keeping its aspect ratio, rather
    # than stretching it to a square (generate_docx also sets only a width).
    img = RLImage(media_path, width=box_width, height=box_height, kind="proportional")

    # Paragraph parses its input as markup, so escape &, < and > in the
    # model's text first; only the <br/> tags inserted here stay as markup.
    markup = escape(plain_text).replace("\n", "<br/>")

    story = [img, Spacer(1, 12), Paragraph(markup, body_style)]
    doc.build(story)
    return filename

def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):
    """Write a DOCX holding the image, an empty paragraph, then the text.

    ``image_size`` ("Small" / "Medium" / "Large") selects a picture width of
    2, 4 or 6 inches; ``alignment`` ("Left" / "Center" / "Right" /
    "Justified") and ``line_spacing`` are applied to the text paragraph, and
    ``font_size`` (points) to its single run. Returns the saved file's name,
    ``output_<uuid>.docx``.
    """
    target_name = f"output_{uuid.uuid4()}.docx"
    document = docx.Document()

    # Picture first, sized by width only.
    width_by_label = {
        "Small": docx.shared.Inches(2),
        "Medium": docx.shared.Inches(4),
        "Large": docx.shared.Inches(6),
    }
    document.add_picture(media_path, width=width_by_label[image_size])

    # Empty paragraph separating the picture from the text.
    document.add_paragraph()

    body = document.add_paragraph()
    body_format = body.paragraph_format
    body_format.line_spacing = line_spacing
    alignment_by_label = {
        "Left": WD_ALIGN_PARAGRAPH.LEFT,
        "Center": WD_ALIGN_PARAGRAPH.CENTER,
        "Right": WD_ALIGN_PARAGRAPH.RIGHT,
        "Justified": WD_ALIGN_PARAGRAPH.JUSTIFY,
    }
    body_format.alignment = alignment_by_label[alignment]

    text_run = body.add_run(plain_text)
    text_run.font.size = docx.shared.Pt(int(font_size))

    document.save(target_name)
    return target_name

# CSS passed to gr.Blocks. It targets the "download-btn" class, which the
# "Get Document" button receives via elem_classes: a blue background with
# white text, and a brighter blue on hover.
css = """
.download-btn {
    background-color: #35a6d6 !important;
    color: white !important;
}
.download-btn:hover {
    background-color: #22bcff !important;
}
"""

# Gradio app setup
# Layout: a single "Image Input" tab with an input column (model, file,
# question, sampling sliders) next to an output column (raw and plain text),
# followed by document-export settings and the "Get Document" button.
with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
    gr.Markdown("# **Qwen3-VL-4B-Thinking-abliterated**")

    with gr.Tab(label="Image Input"):
        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=2):
                model_choice = gr.Dropdown(
                    label="Model Selection",
                    choices=list(MODEL_OPTIONS.keys()),
                    value="Qwen3-VL-4B-Thinking-abliterated"
                )
                # type="filepath": the handler receives a path string.
                input_media = gr.File(label="Upload Image", type="filepath")
                text_input = gr.Textbox(label="Question", value="Describe the Image")

                # Sampling controls; the initial values match the keyword
                # defaults in qwen_inference's signature.
                with gr.Accordion("Advanced options", open=False):
                    max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=4096, step=1, value=2048)
                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                    top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                    top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                    repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

                submit_btn = gr.Button(value="Submit", variant="primary")

            # Right column: model output and its cleaned-up variant.
            with gr.Column(scale=3):
                output_text = gr.Textbox(label="Output Text", lines=15, show_copy_button=True)
                with gr.Accordion("Plain Text", open=False):
                    plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=10, show_copy_button=True)

        # Formatting options forwarded to generate_document.
        with gr.Accordion("Docx/PDF Settings", open=False):
            with gr.Row():
                with gr.Column():
                    line_spacing = gr.Dropdown(choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0], value=1.5, label="Line Spacing")
                    # Font sizes are strings here; the writers convert them with int().
                    font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"], value="16", label="Font Size")
                with gr.Column():
                    alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
                    image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size")
                with gr.Column():
                    file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")

        # elem_classes ties this button to the .download-btn rules in `css`.
        get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")

    # --- Event Handlers ---
    # Submit: qwen_inference is a generator whose yielded buffers are sent to
    # output_text; the chained .then() step afterwards fills plain_text_output
    # with the format_plain_text version of that output.
    submit_btn.click(
        qwen_inference,
        [model_choice, input_media, text_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        [output_text]
    ).then(
        lambda out_text: format_plain_text(out_text), [output_text], [plain_text_output]
    )

    # Get Document: builds the PDF/DOCX from the uploaded image and the raw
    # output text. The output File component is instantiated inline here,
    # inside the Blocks context but outside the tab.
    get_document_btn.click(
        generate_document,
        [input_media, output_text, file_format, font_size, line_spacing, alignment, image_size],
        gr.File(label="Download Document")
    )

# Start the app with debug mode enabled.
demo.launch(debug=True)


## **Demo Inference**

![Screenshot 2025-10-15 at 15-09-19 Gradio](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/tBgePT_Y0oNfwwJwrvYkO.png)

![Screenshot 2025-10-15 at 15-00-03 Gradio](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/6BPowjfSEAeZM6aVZUxKW.png)