File size: 19,175 Bytes

8b72ef4

{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "!pip install gradio transformers transformers-stream-generator qwen-vl-utils\n",
        "!pip install torchvision torch huggingface_hub spaces accelerate ipython\n",
        "!pip install pillow av python-docx requests numpy reportlab fpdf hf_xet\n",
        "#Hold tight, this will take around 1-2 minutes."
      ],
      "metadata": {
        "id": "QpRRe6WQ8buc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tElKr2Fkp1bO"
      },
      "outputs": [],
      "source": [
        "# ================================================\n",
        "#        Qwen3-VL-4B-Thinking-abliterated\n",
        "# ================================================\n",
        "\n",
        "# Model used in the app:\n",
        "# https://huggingface.co/prithivMLmods/Qwen3-VL-4B-Thinking-abliterated\n",
        "\n",
        "import os\n",
        "import random\n",
        "import uuid\n",
        "import json\n",
        "import time\n",
        "import asyncio\n",
        "from threading import Thread\n",
        "from typing import Iterable\n",
        "\n",
        "import gradio as gr\n",
        "import spaces\n",
        "import torch\n",
        "import numpy as np\n",
        "from PIL import Image\n",
        "import io\n",
        "from reportlab.lib.pagesizes import A4\n",
        "from reportlab.lib.styles import getSampleStyleSheet\n",
        "from reportlab.lib import colors\n",
        "from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
        "from reportlab.lib.units import inch\n",
        "from reportlab.pdfbase import pdfmetrics\n",
        "from reportlab.pdfbase.ttfonts import TTFont\n",
        "import docx\n",
        "from docx.enum.text import WD_ALIGN_PARAGRAPH\n",
        "\n",
        "from transformers import (\n",
        "    Qwen3VLForConditionalGeneration,\n",
        "    AutoProcessor,\n",
        "    TextIteratorStreamer\n",
        ")\n",
        "from qwen_vl_utils import process_vision_info\n",
        "from gradio.themes import Soft\n",
        "from gradio.themes.utils import colors, fonts, sizes\n",
        "\n",
        "# --- Theme and CSS Definition ---\n",
        "\n",
        "# NOTE: `colors` here is gradio.themes.utils.colors; that import rebinds the\n",
        "# name first bound by `from reportlab.lib import colors` earlier in this cell.\n",
        "# Attach a custom steel-blue palette (c50 lightest through c950 darkest) to it\n",
        "# so the theme class below can use colors.steel_blue as a default hue.\n",
        "colors.steel_blue = colors.Color(\n",
        "    name=\"steel_blue\",\n",
        "    c50=\"#EBF3F8\",\n",
        "    c100=\"#D3E5F0\",\n",
        "    c200=\"#A8CCE1\",\n",
        "    c300=\"#7DB3D2\",\n",
        "    c400=\"#529AC3\",\n",
        "    c500=\"#4682B4\",  # SteelBlue base color\n",
        "    c600=\"#3E72A0\",\n",
        "    c700=\"#36638C\",\n",
        "    c800=\"#2E5378\",\n",
        "    c900=\"#264364\",\n",
        "    c950=\"#1E3450\",\n",
        ")\n",
        "\n",
        "class SteelBlueTheme(Soft):\n",
        "    \"\"\"Variant of the Soft theme using gray as primary and steel blue as secondary hue.\"\"\"\n",
        "\n",
        "    def __init__(\n",
        "        self,\n",
        "        *,\n",
        "        primary_hue: colors.Color | str = colors.gray,\n",
        "        secondary_hue: colors.Color | str = colors.steel_blue,\n",
        "        neutral_hue: colors.Color | str = colors.slate,\n",
        "        text_size: sizes.Size | str = sizes.text_lg,\n",
        "        font: fonts.Font | str | Iterable[fonts.Font | str] = (\n",
        "            fonts.GoogleFont(\"Outfit\"), \"Arial\", \"sans-serif\",\n",
        "        ),\n",
        "        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (\n",
        "            fonts.GoogleFont(\"IBM Plex Mono\"), \"ui-monospace\", \"monospace\",\n",
        "        ),\n",
        "    ):\n",
        "        \"\"\"Pass hue, text-size and font choices to Soft, then apply the style overrides.\"\"\"\n",
        "        super().__init__(\n",
        "            primary_hue=primary_hue,\n",
        "            secondary_hue=secondary_hue,\n",
        "            neutral_hue=neutral_hue,\n",
        "            text_size=text_size,\n",
        "            font=font,\n",
        "            font_mono=font_mono,\n",
        "        )\n",
        "        # Theme variable overrides, grouped by the part of the UI they affect.\n",
        "        style_overrides = {\n",
        "            # Page and block backgrounds\n",
        "            \"background_fill_primary\": \"*primary_50\",\n",
        "            \"background_fill_primary_dark\": \"*primary_900\",\n",
        "            \"body_background_fill\": \"linear-gradient(135deg, *primary_200, *primary_100)\",\n",
        "            \"body_background_fill_dark\": \"linear-gradient(135deg, *primary_900, *primary_800)\",\n",
        "            # Primary button\n",
        "            \"button_primary_text_color\": \"white\",\n",
        "            \"button_primary_text_color_hover\": \"white\",\n",
        "            \"button_primary_background_fill\": \"linear-gradient(90deg, *secondary_500, *secondary_600)\",\n",
        "            \"button_primary_background_fill_hover\": \"linear-gradient(90deg, *secondary_600, *secondary_700)\",\n",
        "            \"button_primary_background_fill_dark\": \"linear-gradient(90deg, *secondary_600, *secondary_800)\",\n",
        "            \"button_primary_background_fill_hover_dark\": \"linear-gradient(90deg, *secondary_500, *secondary_500)\",\n",
        "            # Sliders\n",
        "            \"slider_color\": \"*secondary_500\",\n",
        "            \"slider_color_dark\": \"*secondary_600\",\n",
        "            # Blocks, shadows and spacing\n",
        "            \"block_title_text_weight\": \"600\",\n",
        "            \"block_border_width\": \"3px\",\n",
        "            \"block_shadow\": \"*shadow_drop_lg\",\n",
        "            \"button_primary_shadow\": \"*shadow_drop_lg\",\n",
        "            \"button_large_padding\": \"11px\",\n",
        "            \"color_accent_soft\": \"*primary_100\",\n",
        "            \"block_label_background_fill\": \"*primary_200\",\n",
        "        }\n",
        "        super().set(**style_overrides)\n",
        "\n",
        "# Single theme instance handed to gr.Blocks further down.\n",
        "steel_blue_theme = SteelBlueTheme()\n",
        "\n",
        "# --- Model and App Setup ---\n",
        "\n",
        "# Model options: display name (shown in the UI dropdown) -> Hugging Face repo id.\n",
        "MODEL_OPTIONS = {\n",
        "    \"Qwen3-VL-4B-Thinking-abliterated\": \"prithivMLmods/Qwen3-VL-4B-Thinking-abliterated\",\n",
        "}\n",
        "\n",
        "# Preload every model and its processor at cell execution time, keyed by display name.\n",
        "# Weights are loaded in float16, moved to the \"cuda\" device and put in eval mode,\n",
        "# so this cell requires a GPU runtime.\n",
        "models = {}\n",
        "processors = {}\n",
        "for name, model_id in MODEL_OPTIONS.items():\n",
        "    print(f\"Loading {name}🤗. Hold tight, this will take around 4-6 minutes..\")\n",
        "    models[name] = Qwen3VLForConditionalGeneration.from_pretrained(\n",
        "        model_id,\n",
        "        trust_remote_code=True,\n",
        "        torch_dtype=torch.float16\n",
        "    ).to(\"cuda\").eval()\n",
        "    processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n",
        "\n",
        "# File extensions Pillow has registered; qwen_inference uses the keys to decide\n",
        "# whether an uploaded path already looks like an image file.\n",
        "image_extensions = Image.registered_extensions()\n",
        "\n",
        "def identify_and_save_blob(blob_path):\n",
        "    \"\"\"Check that a file holds image data and save a copy under an image file name.\n",
        "\n",
        "    Args:\n",
        "        blob_path: Path of the file to inspect.\n",
        "\n",
        "    Returns:\n",
        "        tuple: (filename, media_type). filename is a new temp_<uuid>_media<ext> file\n",
        "        written to the current working directory; media_type is always \"image\".\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If the file is missing, is not a valid image, or cannot be processed.\n",
        "    \"\"\"\n",
        "    try:\n",
        "        # Read everything first so the input handle is closed before the copy is written.\n",
        "        with open(blob_path, 'rb') as file:\n",
        "            blob_content = file.read()\n",
        "\n",
        "        try:\n",
        "            probe = Image.open(io.BytesIO(blob_content))\n",
        "            # Capture the format before verify(); the image object is not reused afterwards.\n",
        "            detected_format = probe.format\n",
        "            probe.verify()\n",
        "        except (IOError, SyntaxError):\n",
        "            raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
        "\n",
        "        # Name the copy after the real image format instead of always claiming PNG;\n",
        "        # fall back to .png when Pillow reports no format.\n",
        "        extension = f\".{detected_format.lower()}\" if detected_format else \".png\"\n",
        "        media_type = \"image\"\n",
        "\n",
        "        filename = f\"temp_{uuid.uuid4()}_media{extension}\"\n",
        "        with open(filename, \"wb\") as f:\n",
        "            f.write(blob_content)\n",
        "\n",
        "        return filename, media_type\n",
        "\n",
        "    except FileNotFoundError:\n",
        "        raise ValueError(f\"The file {blob_path} was not found.\")\n",
        "    except ValueError:\n",
        "        # Already a user-facing message; do not wrap it a second time below.\n",
        "        raise\n",
        "    except Exception as e:\n",
        "        raise ValueError(f\"An error occurred while processing the file: {e}\")\n",
        "\n",
        "@spaces.GPU\n",
        "def qwen_inference(model_name, media_input, text_input=None, max_new_tokens=2048, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2):\n",
        "    \"\"\"Stream the selected model's answer about an uploaded image.\n",
        "\n",
        "    Args:\n",
        "        model_name: Key into the module-level models / processors dicts.\n",
        "        media_input: Filesystem path of the uploaded image.\n",
        "        text_input: Prompt text; None is treated as an empty prompt.\n",
        "        max_new_tokens: Upper bound on generated tokens.\n",
        "        temperature, top_p, top_k, repetition_penalty: Sampling options\n",
        "            forwarded to model.generate.\n",
        "\n",
        "    Yields:\n",
        "        str: The whole response accumulated so far, once per streamed chunk.\n",
        "\n",
        "    Raises:\n",
        "        gr.Error: If no file was uploaded.\n",
        "        ValueError: If the upload cannot be identified as an image.\n",
        "    \"\"\"\n",
        "    model = models[model_name]\n",
        "    processor = processors[model_name]\n",
        "\n",
        "    if not media_input:\n",
        "        raise gr.Error(\"Please upload an image.\")\n",
        "\n",
        "    media_path = media_input\n",
        "    temp_copy = None  # path of the copy written by identify_and_save_blob, if any\n",
        "    # Lowercase the path so upper-case extensions such as .JPG are recognised too.\n",
        "    if media_path.lower().endswith(tuple(image_extensions)):\n",
        "        media_type = \"image\"\n",
        "    else:\n",
        "        try:\n",
        "            media_path, media_type = identify_and_save_blob(media_input)\n",
        "        except Exception as e:\n",
        "            raise ValueError(f\"Unsupported media type. Please upload a valid image. Error: {e}\")\n",
        "        temp_copy = media_path\n",
        "\n",
        "    try:\n",
        "        messages = [\n",
        "            {\n",
        "                \"role\": \"user\",\n",
        "                \"content\": [\n",
        "                    {\"type\": media_type, media_type: media_path},\n",
        "                    # Never hand None to the chat template.\n",
        "                    {\"type\": \"text\", \"text\": text_input or \"\"},\n",
        "                ],\n",
        "            }\n",
        "        ]\n",
        "\n",
        "        text = processor.apply_chat_template(\n",
        "            messages, tokenize=False, add_generation_prompt=True\n",
        "        )\n",
        "        image_inputs, _ = process_vision_info(messages)\n",
        "        # Follow the model's device instead of hard-coding it.\n",
        "        inputs = processor(\n",
        "            text=[text],\n",
        "            images=image_inputs,\n",
        "            padding=True,\n",
        "            return_tensors=\"pt\",\n",
        "        ).to(model.device)\n",
        "\n",
        "        streamer = TextIteratorStreamer(\n",
        "            processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
        "        )\n",
        "\n",
        "        generation_kwargs = dict(\n",
        "            inputs,\n",
        "            streamer=streamer,\n",
        "            max_new_tokens=int(max_new_tokens),\n",
        "            do_sample=True,\n",
        "            temperature=temperature,\n",
        "            top_p=top_p,\n",
        "            top_k=int(top_k),\n",
        "            repetition_penalty=repetition_penalty,\n",
        "        )\n",
        "\n",
        "        # generate() blocks, so it runs in a worker thread while this generator\n",
        "        # relays chunks from the streamer.\n",
        "        thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
        "        thread.start()\n",
        "\n",
        "        buffer = \"\"\n",
        "        for new_text in streamer:\n",
        "            buffer += new_text\n",
        "            buffer = buffer.replace(\"<|im_end|>\", \"\")\n",
        "            yield buffer\n",
        "\n",
        "        # The streamer is exhausted, so the worker is finishing; reap it.\n",
        "        thread.join()\n",
        "    finally:\n",
        "        # Remove the temporary copy so repeated requests do not pile up files.\n",
        "        if temp_copy and os.path.exists(temp_copy):\n",
        "            os.remove(temp_copy)\n",
        "\n",
        "def format_plain_text(output_text):\n",
        "    \"\"\"Return output_text with the LaTeX delimiters \\\\( \\\\) \\\\[ \\\\] removed.\"\"\"\n",
        "    cleaned = output_text\n",
        "    # Strip each delimiter in turn; the content between delimiters is kept.\n",
        "    for delimiter in (\"\\\\(\", \"\\\\)\", \"\\\\[\", \"\\\\]\"):\n",
        "        cleaned = cleaned.replace(delimiter, \"\")\n",
        "    return cleaned\n",
        "\n",
        "def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Build a PDF or DOCX containing the input image followed by the plain-text output.\n",
        "\n",
        "    Args:\n",
        "        media_path: Path of the input image; required.\n",
        "        output_text: Model output; None is treated as empty text.\n",
        "        file_format: Either \"pdf\" or \"docx\".\n",
        "        font_size, line_spacing, alignment, image_size: Layout options passed\n",
        "            through unchanged to the format-specific generator.\n",
        "\n",
        "    Returns:\n",
        "        str: File name of the generated document.\n",
        "\n",
        "    Raises:\n",
        "        gr.Error: If no image was provided or the file format is not supported.\n",
        "    \"\"\"\n",
        "    if not media_path:\n",
        "        raise gr.Error(\"Cannot generate document without an input image.\")\n",
        "\n",
        "    writers = {\"pdf\": generate_pdf, \"docx\": generate_docx}\n",
        "    writer = writers.get(file_format)\n",
        "    if writer is None:\n",
        "        # Fail loudly instead of silently returning None to the download component.\n",
        "        raise gr.Error(f\"Unsupported file format: {file_format}\")\n",
        "\n",
        "    plain_text = format_plain_text(output_text or \"\")\n",
        "    return writer(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
        "\n",
        "def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Write an A4 PDF with the image on top and the text below it.\n",
        "\n",
        "    Args:\n",
        "        media_path: Path of the image to embed.\n",
        "        plain_text: Body text; newlines become line breaks.\n",
        "        font_size: Font size in points (int or numeric string).\n",
        "        line_spacing: Multiplier applied to the font size to get the line leading.\n",
        "        alignment: One of \"Left\", \"Center\", \"Right\", \"Justified\".\n",
        "        image_size: One of \"Small\", \"Medium\", \"Large\".\n",
        "\n",
        "    Returns:\n",
        "        str: Name of the generated output_<uuid>.pdf file.\n",
        "\n",
        "    Raises:\n",
        "        KeyError: If alignment or image_size is not one of the listed options.\n",
        "    \"\"\"\n",
        "    from xml.sax.saxutils import escape  # stdlib; imported here to keep the block self-contained\n",
        "\n",
        "    filename = f\"output_{uuid.uuid4()}.pdf\"\n",
        "    doc = SimpleDocTemplate(\n",
        "        filename, pagesize=A4, rightMargin=inch, leftMargin=inch, topMargin=inch, bottomMargin=inch\n",
        "    )\n",
        "\n",
        "    body_style = getSampleStyleSheet()[\"Normal\"]\n",
        "    body_style.fontSize = int(font_size)\n",
        "    body_style.leading = int(font_size) * line_spacing\n",
        "    body_style.alignment = {\"Left\": 0, \"Center\": 1, \"Right\": 2, \"Justified\": 4}[alignment]\n",
        "\n",
        "    image_sizes = {\"Small\": (2 * inch, 2 * inch), \"Medium\": (4 * inch, 4 * inch), \"Large\": (6 * inch, 6 * inch)}\n",
        "    box_width, box_height = image_sizes[image_size]\n",
        "    # kind=\"proportional\" fits the picture inside the box while keeping its aspect\n",
        "    # ratio, rather than stretching every image into a square.\n",
        "    img = RLImage(media_path, width=box_width, height=box_height, kind=\"proportional\")\n",
        "\n",
        "    # Paragraph interprets its text as XML-like markup, so escape &, < and > in\n",
        "    # the model output before inserting the <br/> line breaks.\n",
        "    markup = escape(plain_text).replace(\"\\n\", \"<br/>\")\n",
        "\n",
        "    story = [img, Spacer(1, 12), Paragraph(markup, body_style)]\n",
        "    doc.build(story)\n",
        "    return filename\n",
        "\n",
        "def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Write a DOCX file with the image first and the text in a single paragraph below.\n",
        "\n",
        "    Returns the name of the generated output_<uuid>.docx file.\n",
        "    \"\"\"\n",
        "    filename = f\"output_{uuid.uuid4()}.docx\"\n",
        "    document = docx.Document()\n",
        "\n",
        "    # Only the width is set for the picture.\n",
        "    width_by_size = {\n",
        "        \"Small\": docx.shared.Inches(2),\n",
        "        \"Medium\": docx.shared.Inches(4),\n",
        "        \"Large\": docx.shared.Inches(6),\n",
        "    }\n",
        "    document.add_picture(media_path, width=width_by_size[image_size])\n",
        "    document.add_paragraph()  # empty paragraph between image and text\n",
        "\n",
        "    body = document.add_paragraph()\n",
        "    body_format = body.paragraph_format\n",
        "    body_format.line_spacing = line_spacing\n",
        "    alignment_by_name = {\n",
        "        \"Left\": WD_ALIGN_PARAGRAPH.LEFT,\n",
        "        \"Center\": WD_ALIGN_PARAGRAPH.CENTER,\n",
        "        \"Right\": WD_ALIGN_PARAGRAPH.RIGHT,\n",
        "        \"Justified\": WD_ALIGN_PARAGRAPH.JUSTIFY\n",
        "    }\n",
        "    body_format.alignment = alignment_by_name[alignment]\n",
        "\n",
        "    text_run = body.add_run(plain_text)\n",
        "    text_run.font.size = docx.shared.Pt(int(font_size))\n",
        "\n",
        "    document.save(filename)\n",
        "    return filename\n",
        "\n",
        "# CSS for output styling\n",
        "css = \"\"\"\n",
        ".download-btn {\n",
        "    background-color: #35a6d6 !important;\n",
        "    color: white !important;\n",
        "}\n",
        ".download-btn:hover {\n",
        "    background-color: #22bcff !important;\n",
        "}\n",
        "\"\"\"\n",
        "\n",
        "# Gradio app setup: one tab with the inputs on the left, the streamed output on\n",
        "# the right, and document-export settings underneath.\n",
        "with gr.Blocks(css=css, theme=steel_blue_theme) as demo:\n",
        "    gr.Markdown(\"# **Qwen3-VL-4B-Thinking-abliterated**\")\n",
        "\n",
        "    with gr.Tab(label=\"Image Input\"):\n",
        "        with gr.Row():\n",
        "            # Left column: model choice, image upload, prompt and sampling options.\n",
        "            with gr.Column(scale=2):\n",
        "                model_choice = gr.Dropdown(\n",
        "                    label=\"Model Selection\",\n",
        "                    choices=list(MODEL_OPTIONS.keys()),\n",
        "                    value=\"Qwen3-VL-4B-Thinking-abliterated\"\n",
        "                )\n",
        "                input_media = gr.File(label=\"Upload Image\", type=\"filepath\")\n",
        "                text_input = gr.Textbox(label=\"Question\", value=\"Describe the Image\")\n",
        "\n",
        "                # Slider defaults mirror the keyword defaults of qwen_inference.\n",
        "                with gr.Accordion(\"Advanced options\", open=False):\n",
        "                    max_new_tokens = gr.Slider(label=\"Max new tokens\", minimum=1, maximum=4096, step=1, value=2048)\n",
        "                    temperature = gr.Slider(label=\"Temperature\", minimum=0.1, maximum=4.0, step=0.1, value=0.6)\n",
        "                    top_p = gr.Slider(label=\"Top-p (nucleus sampling)\", minimum=0.05, maximum=1.0, step=0.05, value=0.9)\n",
        "                    top_k = gr.Slider(label=\"Top-k\", minimum=1, maximum=1000, step=1, value=50)\n",
        "                    repetition_penalty = gr.Slider(label=\"Repetition penalty\", minimum=1.0, maximum=2.0, step=0.05, value=1.2)\n",
        "\n",
        "                submit_btn = gr.Button(value=\"Submit\", variant=\"primary\")\n",
        "\n",
        "            # Right column: raw streamed output plus a delimiter-stripped copy.\n",
        "            with gr.Column(scale=3):\n",
        "                output_text = gr.Textbox(label=\"Output Text\", lines=15, show_copy_button=True)\n",
        "                with gr.Accordion(\"Plain Text\", open=False):\n",
        "                    plain_text_output = gr.Textbox(label=\"Standardized Plain Text\", lines=10, show_copy_button=True)\n",
        "\n",
        "        # Layout options consumed by generate_document.\n",
        "        with gr.Accordion(\"Docx/PDF Settings\", open=False):\n",
        "            with gr.Row():\n",
        "                with gr.Column():\n",
        "                    line_spacing = gr.Dropdown(choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0], value=1.5, label=\"Line Spacing\")\n",
        "                    # Font sizes are strings here; the generators convert them with int().\n",
        "                    font_size = gr.Dropdown(choices=[\"8\", \"10\", \"12\", \"14\", \"16\", \"18\", \"20\", \"22\", \"24\"], value=\"16\", label=\"Font Size\")\n",
        "                with gr.Column():\n",
        "                    alignment = gr.Dropdown(choices=[\"Left\", \"Center\", \"Right\", \"Justified\"], value=\"Justified\", label=\"Text Alignment\")\n",
        "                    image_size = gr.Dropdown(choices=[\"Small\", \"Medium\", \"Large\"], value=\"Medium\", label=\"Image Size\")\n",
        "                with gr.Column():\n",
        "                    file_format = gr.Radio([\"pdf\", \"docx\"], label=\"File Format\", value=\"pdf\")\n",
        "\n",
        "        get_document_btn = gr.Button(value=\"Get Document\", elem_classes=\"download-btn\")\n",
        "\n",
        "    # --- Event Handlers ---\n",
        "    # Submit runs inference into the output box; the chained step then fills the\n",
        "    # plain-text box from the output box's contents via format_plain_text.\n",
        "    submit_btn.click(\n",
        "        qwen_inference,\n",
        "        [model_choice, input_media, text_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],\n",
        "        [output_text]\n",
        "    ).then(\n",
        "        lambda out_text: format_plain_text(out_text), [output_text], [plain_text_output]\n",
        "    )\n",
        "\n",
        "    # The download component is created inline as the output of this handler\n",
        "    # (presumably rendered after the components above -- TODO confirm placement).\n",
        "    get_document_btn.click(\n",
        "        generate_document,\n",
        "        [input_media, output_text, file_format, font_size, line_spacing, alignment, image_size],\n",
        "        gr.File(label=\"Download Document\")\n",
        "    )\n",
        "\n",
        "demo.launch(debug=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "\n",
        "## **Demo Inference**\n",
        "\n",
        "![Screenshot 2025-10-15 at 15-09-19 Gradio](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/tBgePT_Y0oNfwwJwrvYkO.png)\n",
        "\n",
        "![Screenshot 2025-10-15 at 15-00-03 Gradio](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/6BPowjfSEAeZM6aVZUxKW.png)"
      ],
      "metadata": {
        "id": "WUGrooGHCcix"
      }
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}