import gradio as gr
from PIL import Image
import torch
import os

from inference import load_for_inference, predict

# TAMGA VQA model (Turkish visual question answering)
TAMGA_REPO = "Mueris/TurkishVLMTAMGA"
tamga_model, tamga_tokenizer, tamga_device = load_for_inference(TAMGA_REPO)

from transformers import BlipProcessor, BlipForConditionalGeneration

# Fine-tuned BLIP captioning model; the processor comes from the base BLIP checkpoint
CAPTION_REPO = "Mueris/TurkishVLMTAMGA-CaptioningModel"
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained(CAPTION_REPO)
caption_model.to("cuda" if torch.cuda.is_available() else "cpu")
caption_device = caption_model.device


def answer(model_choice, image, question):
    """Route the request to the TAMGA VQA model or the BLIP captioning model."""
    if image is None:
        return "**Lütfen bir görsel yükleyin.**"  # "Please upload an image."

    pil_image = Image.fromarray(image)

    if model_choice == "TAMGA VQA":
        if not question.strip():
            return "**Bu model soru gerektirir.**"  # "This model requires a question."
        resp = predict(tamga_model, tamga_tokenizer, tamga_device, pil_image, question)
        return f"**Cevap:** {resp}"  # "Answer:"

    # BLIP captioning branch: the question box is ignored
    inputs = caption_processor(images=pil_image, return_tensors="pt").to(caption_device)
    output = caption_model.generate(**inputs, max_new_tokens=64)
    caption = caption_processor.decode(output[0], skip_special_tokens=True)
    return f"**Açıklama:** {caption}"  # "Caption:"


def toggle_question(model_choice):
    """Disable and clear the question box when the captioning model is selected."""
    if model_choice == "BLIP Caption (Fine-Tuned)":
        return gr.update(interactive=False, value="")
    return gr.update(interactive=True)


def load_example_image(path):
    # Gallery selections may arrive as a list (value, index), so handle the list case
    if isinstance(path, list):
        path = path[0]
    if os.path.exists(path):
        return Image.open(path)
    return None


css = """
#col-container { max-width: 1100px; margin: auto; }
.output-box {
    background-color: white;
    border-radius: 10px;
    padding: 15px;
    border: 1px solid #d0d0d0;
    font-size: 1.1rem;
    min-height: 220px;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("