import sys

# Mock audio modules to avoid installing them
sys.modules["audioop"] = type("audioop", (), {"__file__": ""})()
sys.modules["pyaudioop"] = type("pyaudioop", (), {"__file__": ""})()

import torch
import gradio as gr
import supervision as sv
import spaces
from transformers import AutoProcessor, Owlv2ForObjectDetection

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def init_model(model_id):
    processor = AutoProcessor.from_pretrained(model_id)
    model = Owlv2ForObjectDetection.from_pretrained(model_id)
    model.eval()
    model.to(DEVICE)
    return processor, model
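
# Run OWLv2 on the target image with either free-text prompts or a visual
# (query-image) prompt, then draw the resulting boxes with supervision.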
def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
    processor, model = init_model(model_id)
    result = None
    class_names = {}
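    # Text prompts: score the target image against the comma-separated class names.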
    if prompt_type == "Text":
        inputs = processor(
            images=target_image,
            text=prompts["texts"],
            return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model(**inputs)
        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_grounded_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh
        )[0]
        class_names = {k: v for k, v in enumerate(prompts["texts"])}
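    # Visual prompt: use a query image of the object with OWLv2's image-guided detection head.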
    elif prompt_type == "Visual":
        inputs = processor(
            images=target_image,
            query_images=prompts["images"],
            return_tensors="pt"
        ).to(DEVICE)
        with torch.no_grad():
            outputs = model.image_guided_detection(**inputs)
        # Post-process results
        target_sizes = torch.tensor([target_image.size[::-1]])
        result = processor.post_process_image_guided_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh,
            nms_threshold=iou_thresh
        )[0]
        # prepare for supervision: add 0 label for all boxes
        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
        class_names = {0: "object"}
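
    # Convert the HF results into a supervision Detections object and draw boxes + labels.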
    detections = sv.Detections.from_transformers(result, class_names)

    resolution_wh = target_image.size
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
    text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)
    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(detections['class_name'], detections.confidence)
    ]

    annotated_image = target_image.copy()
    annotated_image = sv.BoxAnnotator(
        color_lookup=sv.ColorLookup.INDEX, thickness=thickness
    ).annotate(scene=annotated_image, detections=detections)
    annotated_image = sv.LabelAnnotator(
        color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, smart_position=True
    ).annotate(scene=annotated_image, detections=detections, labels=labels)
    return annotated_image
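
# Build the demo UI: a target image input, "Visual" (prompt image) and "Text" prompt tabs,
# model/threshold controls, and example galleries for both prompt types.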
def app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    target_image = gr.Image(type="pil", label="Target Image", visible=True, interactive=True)
                detect_button = gr.Button(value="Detect Objects")
                prompt_type = gr.Textbox(value='Visual', visible=False)  # Default prompt type
                with gr.Tab("Visual") as visual_tab:
                    with gr.Row():
                        prompt_image = gr.Image(type="pil", label="Prompt Image", visible=True, interactive=True)
                with gr.Tab("Text") as text_tab:
                    texts = gr.Textbox(label="Input Texts", value='', placeholder='person,bus', visible=True, interactive=True)
                visual_tab.select(
                    fn=lambda: ("Visual", gr.update(visible=True)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )
                text_tab.select(
                    fn=lambda: ("Text", gr.update(value=None, visible=False)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )
                model_id = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/owlv2-base-patch16-ensemble",
                        "google/owlv2-large-patch14"
                    ],
                    value="google/owlv2-base-patch16-ensemble",
                )
                conf_thresh = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
                iou_thresh = gr.Slider(
                    label="IoU Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.70,
                )
            with gr.Column():
                output_image = gr.Image(type="numpy", label="Annotated Image", visible=True)
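
        # Collect the UI values into the prompt dict expected by inference().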
        def run_inference(prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type):
            # add text prompts
            if prompt_type == "Text":
                texts = [text.strip() for text in texts.split(',')]
                prompts = {
                    "texts": texts
                }
            # add visual prompt
            elif prompt_type == "Visual":
                prompts = {
                    "images": prompt_image,
                }
            return inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type)

        detect_button.click(
            fn=run_inference,
            inputs=[prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type],
            outputs=[output_image],
        )
        ###################### Examples ##########################
        image_examples_list = [
            [
                "test-data/target1.jpg",
                "test-data/prompt1.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target2.jpg",
                "test-data/prompt2.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target3.jpg",
                "test-data/prompt3.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target4.jpg",
                "test-data/prompt4.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
        ]
        text_examples = gr.Examples(
            examples=[
                ["test-data/target1.jpg", "logo", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target2.jpg", "cat,remote", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target3.jpg", "frog,spider,lizard", "google/owlv2-base-patch16-ensemble", 0.3],
                ["test-data/target4.jpg", "cat", "google/owlv2-base-patch16-ensemble", 0.3],
            ],
            inputs=[target_image, texts, model_id, conf_thresh],
            visible=False, cache_examples=False, label="Text Prompt Examples")
        image_examples = gr.Examples(
            examples=image_examples_list,
            inputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh],
            visible=True, cache_examples=False, label="Visual Prompt Examples")

        # Toggle which example gallery (and whether the IoU slider) is visible
        # when the user switches between the Text and Visual tabs.
        def update_text_examples():
            return gr.Dataset(visible=True), gr.Dataset(visible=False), gr.update(visible=False)

        def update_visual_examples():
            return gr.Dataset(visible=False), gr.Dataset(visible=True), gr.update(visible=True)

        text_tab.select(
            fn=update_text_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )
        visual_tab.select(
            fn=update_visual_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

    return target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list

gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
        <h1 style='text-align: center'>OWLv2: Zero-shot detection with visual prompts</h1>
        """)
    gr.Markdown("""
This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts.
You can provide either a text prompt or a prompt image (visual prompt) to detect objects in the target image.
For visual prompting, the following sample code, taken from the HF documentation, is used:
```python
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection

processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

target_image = Image.open(...)
prompt_image = Image.open(...)
inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")

# forward pass
with torch.no_grad():
    outputs = model.image_guided_detection(**inputs)

target_sizes = torch.Tensor([target_image.size[::-1]])
results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
```
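
For text prompting, the demo uses the standard grounded object detection path. Here is a minimal sketch mirroring the `Text` branch of this app (it assumes `processor`, `model`, and `target_image` are set up as above, with placeholder class names):
```python
# Class names; the app builds this list from the comma-separated textbox input.
inputs = processor(images=target_image, text=["person", "bus"], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
target_sizes = torch.Tensor([target_image.size[::-1]])
results = processor.post_process_grounded_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.25)
```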
For some reason, visual prompting works much worse than text prompting; this may be an issue with the HF implementation.
""")
    with gr.Row():
        with gr.Column():
            # Create a list of all UI components
            ui_components = app()
            # Unpack the components
            target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list = ui_components
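
    # Pre-populate the UI with the second visual-prompt example when the page loads.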
    gradio_app.load(
        fn=lambda: image_examples_list[1],
        outputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh]
    )

gradio_app.launch(allowed_paths=["figures"])