import sys

# Mock audio modules to avoid installing them
sys.modules["audioop"] = type("audioop", (), {"__file__": ""})()
sys.modules["pyaudioop"] = type("pyaudioop", (), {"__file__": ""})()

import torch
import gradio as gr
import supervision as sv
import spaces
from transformers import AutoProcessor, Owlv2ForObjectDetection

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def init_model(model_id):
    processor = AutoProcessor.from_pretrained(model_id)
    model = Owlv2ForObjectDetection.from_pretrained(model_id)
    model.eval()
    model.to(DEVICE)
    return processor, model
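
# Run OWLv2 on the target image with either free-text prompts or a visual
# (query-image) prompt, then draw the resulting boxes with supervision.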
def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
    processor, model = init_model(model_id)
    result = None
    class_names = {}
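    # Text prompts: score the target image against the comma-separated class names.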
    if prompt_type == "Text":
        inputs = processor(
            images=target_image,
            text=prompts["texts"],
            return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs)
        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_grounded_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh
        )[0]
        class_names = {k: v for k, v in enumerate(prompts["texts"])}
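    # Visual prompt: use a query image of the object with OWLv2's image-guided detection head.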
    elif prompt_type == "Visual":
        inputs = processor(
            images=target_image,
            query_images=prompts["images"],
            return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model.image_guided_detection(**inputs)
        # Post-process results
        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_image_guided_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh,
            nms_threshold=iou_thresh
        )[0]
        # prepare for supervision: add 0 label for all boxes
        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
        class_names = {0: "object"}
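
    # Convert the HF results into a supervision Detections object and draw boxes + labels.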
    detections = sv.Detections.from_transformers(result, class_names)

    resolution_wh = target_image.size
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
    text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)
    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(detections['class_name'], detections.confidence)
    ]

    annotated_image = target_image.copy()
    annotated_image = sv.BoxAnnotator(
        color_lookup=sv.ColorLookup.INDEX, thickness=thickness
    ).annotate(scene=annotated_image, detections=detections)
    annotated_image = sv.LabelAnnotator(
        color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, smart_position=True
    ).annotate(scene=annotated_image, detections=detections, labels=labels)
    return annotated_image
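
# Build the demo UI: a target image input, "Visual" (prompt image) and "Text" prompt tabs,
# model/threshold controls, and example galleries for both prompt types.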
def app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    target_image = gr.Image(type="pil", label="Target Image", visible=True, interactive=True)
                detect_button = gr.Button(value="Detect Objects")
                prompt_type = gr.Textbox(value='Visual', visible=False)  # Default prompt type
                with gr.Tab("Visual") as visual_tab:
                    with gr.Row():
                        prompt_image = gr.Image(type="pil", label="Prompt Image", visible=True, interactive=True)
                with gr.Tab("Text") as text_tab:
                    texts = gr.Textbox(label="Input Texts", value='', placeholder='person,bus', visible=True, interactive=True)
                visual_tab.select(
                    fn=lambda: ("Visual", gr.update(visible=True)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )
                text_tab.select(
                    fn=lambda: ("Text", gr.update(value=None, visible=False)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )
                model_id = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/owlv2-base-patch16-ensemble",
                        "google/owlv2-large-patch14"
                    ],
                    value="google/owlv2-base-patch16-ensemble",
                )
                conf_thresh = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
                iou_thresh = gr.Slider(
                    label="IoU Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.70,
                )
            with gr.Column():
                output_image = gr.Image(type="numpy", label="Annotated Image", visible=True)
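
        # Collect the UI values into the prompt dict expected by inference().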
        def run_inference(prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type):
            # add text prompts
            if prompt_type == "Text":
                texts = [text.strip() for text in texts.split(',')]
                prompts = {
                    "texts": texts
                }
            # add visual prompt
            elif prompt_type == "Visual":
                prompts = {
                    "images": prompt_image,
                }
            return inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type)

        detect_button.click(
            fn=run_inference,
            inputs=[prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type],
            outputs=[output_image],
        )
        ###################### Examples ##########################
        image_examples_list = [
            [
                "test-data/target1.jpg",
                "test-data/prompt1.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target2.jpg",
                "test-data/prompt2.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target3.jpg",
                "test-data/prompt3.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target4.jpg",
                "test-data/prompt4.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
        ]
        text_examples = gr.Examples(
            examples=[
                ["test-data/target1.jpg", "logo", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target2.jpg", "cat,remote", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target3.jpg", "frog,spider,lizard", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target4.jpg", "cat", "google/owlv2-base-patch16-ensemble", 0.3],
            ],
            inputs=[target_image, texts, model_id, conf_thresh],
            visible=False, cache_examples=False, label="Text Prompt Examples")
        image_examples = gr.Examples(
            examples=image_examples_list,
            inputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh],
            visible=True, cache_examples=False, label="Visual Prompt Examples")

        # Toggle which example gallery (and whether the IoU slider) is visible
        # when the user switches between the Text and Visual tabs.
        def update_text_examples():
            return gr.Dataset(visible=True), gr.Dataset(visible=False), gr.update(visible=False)

        def update_visual_examples():
            return gr.Dataset(visible=False), gr.Dataset(visible=True), gr.update(visible=True)

        text_tab.select(
            fn=update_text_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )
        visual_tab.select(
            fn=update_visual_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

    return target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list

gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
        <h1 style='text-align: center'>OWLv2: Zero-shot detection with visual prompts</h1>
        """)
    gr.Markdown("""
This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts.
You can provide either a text prompt or a prompt image (visual prompt) to detect objects in the target image.
For visual prompting, the following sample code, taken from the HF documentation, is used:
```python
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection

processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

target_image = Image.open(...)
prompt_image = Image.open(...)
inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model.image_guided_detection(**inputs)

target_sizes = torch.Tensor([target_image.size[::-1]])
results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
```
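
For text prompting, the demo uses the standard grounded object detection path. Here is a minimal sketch mirroring the `Text` branch of this app (it assumes `processor`, `model`, and `target_image` are set up as above, with placeholder class names):
```python
# Class names; the app builds this list from the comma-separated textbox input.
inputs = processor(images=target_image, text=["person", "bus"], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
target_sizes = torch.Tensor([target_image.size[::-1]])
results = processor.post_process_grounded_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.25)
```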
For some reason, visual prompting works much worse than text prompting; this may be an issue with the HF implementation.
""")
    with gr.Row():
        with gr.Column():
            # Create a list of all UI components
            ui_components = app()
            # Unpack the components
            target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list = ui_components
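
    # Pre-populate the UI with the second visual-prompt example when the page loads.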
    gradio_app.load(
        fn=lambda: image_examples_list[1],
        outputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh]
    )

gradio_app.launch(allowed_paths=["figures"])