import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)

from kofi import SCRIPT

# Handle spaces.GPU decorator for GPU allocation in Spaces
if "SPACES_ZERO_GPU" in os.environ.keys():
    import spaces
else:

    class spaces:
        @staticmethod
        def GPU(func=None, duration=300):
            def decorator(f):
                def wrapper(*args, **kwargs):
                    return f(*args, **kwargs)

                return wrapper

            if func is None:
                return decorator
            return decorator(func)


# Define constants
HEADLINE = "# Qwen-VL Object-Detection"
SUBLINE = "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
EXAMPLES_DIR = Path(__file__).parent / "examples"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_IDS = [
    "Qwen/Qwen2-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
    "Qwen/Qwen2-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-3B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct
    "Qwen/Qwen2.5-VL-7B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
    "Qwen/Qwen2.5-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct
    "Qwen/Qwen2.5-VL-72B-Instruct",  # https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct
    "Qwen/Qwen3-VL-2B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct
    "Qwen/Qwen3-VL-4B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct
    "Qwen/Qwen3-VL-8B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct
    "Qwen/Qwen3-VL-32B-Instruct",  # https://huggingface.co/Qwen/Qwen3-VL-32B-Instruct
]
DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
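
# An illustrative, hand-written example of the response format requested above
# (labels and coordinates are made up, e.g. for a prompt like "detect sailboat, person"):
#   [{"bbox_2d": [120, 45, 380, 300], "label": "sailboat"},
#    {"bbox_2d": [402, 210, 468, 331], "label": "person"}]
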
EXAMPLES = [
    [
        EXAMPLES_DIR / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect sailboat, rowboat, person",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
        1024,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect basketball, player with white jersey, player with black jersey",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect app to find great places, app to take beautiful photos, app to listen music",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect person, bicycle, netherlands flag",
        1920,
        "Yes",
        1920,
    ],
]

# Global variables to track loaded model
current_model = None
current_processor = None
current_model_id = None

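
# Small local dispatcher (not transformers.AutoModel): it selects the matching
# *ForConditionalGeneration class from the model ID prefix and loads the checkpoint,
# by default on CPU; generate() moves the model to the GPU later when one is available.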
class AutoModel:
    @staticmethod
    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
        if model_id.startswith("Qwen/Qwen2-VL"):
            model_loader = Qwen2VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen2.5-VL"):
            model_loader = Qwen2_5_VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen3-VL"):
            model_loader = Qwen3VLForConditionalGeneration
        else:
            raise ValueError(f"Unsupported model ID: {model_id}")
        return model_loader.from_pretrained(
            model_id, dtype=dtype, device_map=device_map
        )


def resize_image(image, target_size=1000):
    width, height = image.size
    if max(width, height) <= target_size:
        return image
    if width >= height:
        new_width = target_size
        new_height = int((target_size / width) * height)
    else:
        new_height = target_size
        new_width = int((target_size / height) * width)
    return image.resize((new_width, new_height))


def image_to_base64(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


with gr.Blocks(js=SCRIPT) as demo:
    gr.Markdown(HEADLINE)
    gr.Markdown(SUBLINE)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )
            gr.Markdown("## Settings")
            input_model_id = gr.Dropdown(
                choices=MODEL_IDS,
                label="✨ Select Model ID",
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=DEFAULT_SYSTEM_PROMPT,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )
            image_resize = gr.Radio(
                label="Resize Image",
                choices=["Yes", "No"],
                value="Yes",
                interactive=True,
                scale=2,
            )
            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
                scale=2,
            )
        with gr.Column():
            gr.Markdown("## Outputs")
            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )
            gr.Markdown("## Detections")
            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )

    with gr.Row():
        run_button = gr.Button("Run")

    def load_model(
        model_id: str,
    ):
        global current_model, current_processor, current_model_id

        # Only load model if it's different from the currently loaded one
        if current_model_id != model_id or current_model is None:
            # Clear previous model from memory
            if current_model is not None:
                del current_model
                current_model = None
            if current_processor is not None:
                del current_processor
                current_processor = None

            # Force garbage collection and clear CUDA cache
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            gr.Info(
                f"Downloading and loading {model_id.removeprefix('Qwen/')} model files ...",
                duration=10,
            )
            current_model = AutoModel.from_pretrained(
                model_id, dtype="auto", device_map="cpu"
            )
            current_processor = AutoProcessor.from_pretrained(model_id)
            current_model_id = model_id

        return current_model, current_processor

    @spaces.GPU(duration=300)
    def generate(
        model,
        processor,
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int,
        image_resize: str,
        image_target_size: int | None,
    ):
        # Move model to CUDA if available (inside @spaces.GPU decorated function)
        model = model.to(DEVICE)
        model.eval()

        base64_image = image_to_base64(
            resize_image(image, image_target_size)
            if image_resize == "Yes" and image_target_size
            else image
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        output_text = str(output_text[0])
        output_text = repair_json(output_text)
        output_json = json.loads(output_text)
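
        # Scaling below reflects the assumption encoded in this app: Qwen2.5-VL models
        # return absolute pixel coordinates, while Qwen2-VL and Qwen3-VL models return
        # coordinates normalized to a 0-1000 range that must be rescaled to the image size.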
        scale = False if model_id.startswith("Qwen/Qwen2.5-VL") else True
        x_scale = float(image.width / 1000) if scale else 1.0
        y_scale = float(image.height / 1000) if scale else 1.0

        bboxes = []
        for detection in output_json:
            if "bbox_2d" not in detection:
                continue
            if len(detection["bbox_2d"]) != 4:
                continue
            if "label" not in detection:
                continue
            xmin, ymin, xmax, ymax = detection["bbox_2d"]
            label = detection.get("label", "")
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, label))

        return [(image, bboxes), str(json.dumps(output_json))]

    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
        # Load the model and processor (on CPU)
        model, processor = load_model(model_id)
        # Run inference (on GPU, if available)
        return generate(
            model,
            processor,
            image,
            model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        )
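
    # With cache_examples=True and cache_mode="eager", the example outputs below are
    # computed once at startup and cached; clicking an example then shows the cached
    # result rather than triggering a new generation (run_on_click=False).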
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")
            gr.Examples(
                fn=run,
                cache_examples=True,
                cache_mode="eager",
                run_on_click=False,
                examples=EXAMPLES,
                inputs=[
                    image_input,
                    input_model_id,
                    system_prompt,
                    user_prompt,
                    max_new_tokens,
                    image_resize,
                    image_target_size,
                ],
                outputs=[
                    output_annotated_image,
                    output_text,
                ],
            )

    with gr.Row():
        with gr.Column():
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate it with a CUDA-enabled runtime."
                )
            gr.HTML('')

    # Connect the button to the detection function
    run_button.click(
        fn=run,
        inputs=[
            image_input,
            input_model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )


if __name__ == "__main__":
    demo.launch(
        share=False,
    )
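
# Running locally (a sketch; assumes this file is saved as app.py and that gradio,
# torch, transformers, qwen-vl-utils, json-repair and the local `kofi` module are
# installed):
#   python app.py
# Gradio serves the UI on http://localhost:7860 by default.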