EdgeSAM

Runtime error

App Files Files Community

chongzhou committed on Dec 13, 2023

Commit

69420c9

1 Parent(s): 9dd21b3

switch to ONNX backend

Browse files

Files changed (4) hide show

app.py +95 -146
requirements.txt +2 -1
segment_anything/onnx/__init__.py +1 -0
segment_anything/onnx/predictor_onnx.py +106 -0

app.py CHANGED Viewed

@@ -1,14 +1,18 @@
 # Code credit: [FastSAM Demo](https://huggingface.co/spaces/An-619/FastSAM).
 import gradio as gr
 import numpy as np
-import torch
-from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
 from PIL import ImageDraw
 from utils.tools_gradio import fast_process
 import copy
 import argparse
 parser = argparse.ArgumentParser(
     description="Host EdgeSAM as a local web service."
 )
@@ -16,13 +20,19 @@ parser.add_argument(
     "--checkpoint",
     default="weights/edge_sam_3x.pth",
     type=str,
-    help="The path to the EdgeSAM model checkpoint."
 )
 parser.add_argument(
-    "--enable-everything-mode",
-    action="store_true",
-    help="Since EdgeSAM follows the same encoder-decoder architecture as SAM, the everything mode will infer the "
-         "decoder 32x32=1024 times, which is inefficient, thus a longer processing time is expected.",
 )
 parser.add_argument(
     "--server-name",
@@ -39,12 +49,32 @@ parser.add_argument(
 args = parser.parse_args()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-sam = sam_model_registry["edge_sam"](checkpoint=args.checkpoint, upsample_mode="bicubic")
-sam = sam.to(device=device)
-sam.eval()
-mask_generator = SamAutomaticMaskGenerator(sam)
-predictor = SamPredictor(sam)
 # Description
 title = "<center><strong><font size='8'>EdgeSAM<font></strong> <a href='https://github.com/chongzhou96/EdgeSAM'><font size='6'>[GitHub]</font></a> </center>"
@@ -68,35 +98,6 @@ description_b = """ # Instructions for box mode
               """
-description_e = """ # Everything mode is NOT recommended.
-                Since EdgeSAM follows the same encoder-decoder architecture as SAM, the everything mode will infer the decoder 32x32=1024 times, which is inefficient, thus a longer processing time is expected.
-                1. Upload an image or click one of the provided examples.
-                2. Click Start to get the segmentation mask.
-                3. The Reset button resets the image and masks.
-              """
-examples = [
-    ["assets/1.jpeg"],
-    ["assets/2.jpeg"],
-    ["assets/3.jpeg"],
-    ["assets/4.jpeg"],
-    ["assets/5.jpeg"],
-    ["assets/6.jpeg"],
-    ["assets/7.jpeg"],
-    ["assets/8.jpeg"],
-    ["assets/9.jpeg"],
-    ["assets/10.jpeg"],
-    ["assets/11.jpeg"],
-    ["assets/12.jpeg"],
-    ["assets/13.jpeg"],
-    ["assets/14.jpeg"],
-    ["assets/15.jpeg"],
-    ["assets/16.jpeg"]
-]
 css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }"
 global_points = []
@@ -119,6 +120,7 @@ def reset():
     global_image_with_prompt = None
     return None
 def reset_all():
     global global_points
     global global_point_label
@@ -130,10 +132,7 @@ def reset_all():
     global_box = []
     global_image = None
     global_image_with_prompt = None
-    if args.enable_everything_mode:
-        return None, None, None
-    else:
-        return None, None
 def clear():
@@ -185,14 +184,15 @@ def convert_box(xyxy):
     xyxy[1][1] = max_y
     return xyxy
 def segment_with_points(
-    label,
-    evt: gr.SelectData,
-    input_size=1024,
-    better_quality=False,
-    withContours=True,
-    use_retina=True,
-    mask_random_color=False,
 ):
     global global_points
     global global_point_label
@@ -213,26 +213,30 @@ def segment_with_points(
     )
     image = global_image_with_prompt
-    global_points_np = np.array(global_points)
-    global_point_label_np = np.array(global_point_label)
-    num_multimask_outputs = 4
-    masks, scores, logits = predictor.predict(
-        point_coords=global_points_np,
-        point_labels=global_point_label_np,
-        num_multimask_outputs=num_multimask_outputs,
-        use_stability_score=True
-    )
     print(f'scores: {scores}')
     area = masks.sum(axis=(1, 2))
     print(f'area: {area}')
-    if num_multimask_outputs == 1:
-        annotations = masks
-    else:
-        annotations = np.expand_dims(masks[scores.argmax()], axis=0)
     seg = fast_process(
         annotations=annotations,
@@ -250,12 +254,12 @@ def segment_with_points(
 def segment_with_box(
-    evt: gr.SelectData,
-    input_size=1024,
-    better_quality=False,
-    withContours=True,
-    use_retina=True,
-    mask_random_color=False,
 ):
     global global_box
     global global_image
@@ -292,12 +296,20 @@ def segment_with_box(
         )
         global_box_np = np.array(global_box)
-        masks, scores, logits = predictor.predict(
-            box=global_box_np,
-            num_multimask_outputs=1,
-        )
-        annotations = masks
         seg = fast_process(
             annotations=annotations,
@@ -313,44 +325,10 @@ def segment_with_box(
         return seg
     return image
-def segment_everything(
-    image,
-    input_size=1024,
-    better_quality=False,
-    withContours=True,
-    use_retina=True,
-    mask_random_color=True,
-):
-    nd_image = np.array(image)
-    masks = mask_generator.generate(nd_image)
-    annotations = masks
-    seg = fast_process(
-        annotations=annotations,
-        image=image,
-        device=device,
-        scale=(1024 // input_size),
-        better_quality=better_quality,
-        mask_random_color=mask_random_color,
-        bbox=None,
-        use_retina=use_retina,
-        withContours=withContours,
-    )
-    return seg
 img_p = gr.Image(label="Input with points", type="pil")
 img_b = gr.Image(label="Input with box", type="pil")
-img_e = gr.Image(label="Input (everything)", type="pil")
-if args.enable_everything_mode:
-    all_outputs = [img_p, img_b, img_e]
-else:
-    all_outputs = [img_p, img_b]
 with gr.Blocks(css=css, title="EdgeSAM") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             # Title
@@ -410,53 +388,24 @@ with gr.Blocks(css=css, title="EdgeSAM") as demo:
                     run_on_click=True
                 )
-    if args.enable_everything_mode:
-        with gr.Tab("Everything mode") as tab_e:
-            # Images
-            with gr.Row(variant="panel"):
-                with gr.Column(scale=1):
-                    img_e.render()
-                with gr.Column(scale=1):
-                    with gr.Row():
-                        with gr.Column():
-                            segment_btn_e = gr.Button("Start", variant="primary")
-                            reset_btn_e = gr.Button("Reset", variant="secondary")
-                            gr.Markdown(description_e)
-            # Submit & Clear
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("Try some of the examples below ⬇️")
-                    gr.Examples(
-                        examples=examples,
-                        inputs=[img_e],
-                        examples_per_page=8,
-                    )
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("<center><img src='https://visitor-badge.laobi.icu/badge?page_id=chongzhou/edgesam' alt='visitors'></center>")
     img_p.upload(on_image_upload, img_p, [img_p])
     img_p.select(segment_with_points, [add_or_remove], img_p)
     clear_btn_p.click(clear, outputs=[img_p])
     reset_btn_p.click(reset, outputs=[img_p])
-    tab_p.select(fn=reset_all, outputs=all_outputs)
     img_b.upload(on_image_upload, img_b, [img_b])
     img_b.select(segment_with_box, outputs=[img_b])
     clear_btn_b.click(clear, outputs=[img_b])
     reset_btn_b.click(reset, outputs=[img_b])
-    tab_b.select(fn=reset_all, outputs=all_outputs)
-    if args.enable_everything_mode:
-        segment_btn_e.click(
-            segment_everything, inputs=[img_e], outputs=img_e
-        )
-        reset_btn_e.click(reset, outputs=[img_e])
-        tab_e.select(fn=reset_all, outputs=all_outputs)
 demo.queue()
 # demo.launch(server_name=args.server_name, server_port=args.port)

 # Code credit: [FastSAM Demo](https://huggingface.co/spaces/An-619/FastSAM).
+import torch
 import gradio as gr
 import numpy as np
+from segment_anything import sam_model_registry, SamPredictor
+from segment_anything.onnx import SamPredictorONNX
 from PIL import ImageDraw
 from utils.tools_gradio import fast_process
 import copy
 import argparse
+# Use ONNX to speed up the inference.
+ENABLE_ONNX = True
 parser = argparse.ArgumentParser(
     description="Host EdgeSAM as a local web service."
 )
     "--checkpoint",
     default="weights/edge_sam_3x.pth",
     type=str,
+    help="The path to the PyTorch checkpoint of EdgeSAM."
+)
+parser.add_argument(
+    "--encoder-onnx-path",
+    default="weights/edge_sam_3x_encoder.onnx",
+    type=str,
+    help="The path to the ONNX model of EdgeSAM's encoder."
 )
 parser.add_argument(
+    "--decoder-onnx-path",
+    default="weights/edge_sam_3x_decoder.onnx",
+    type=str,
+    help="The path to the ONNX model of EdgeSAM's decoder."
 )
 parser.add_argument(
     "--server-name",
 args = parser.parse_args()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+if ENABLE_ONNX:
+    predictor = SamPredictorONNX(args.encoder_onnx_path, args.decoder_onnx_path)
+else:
+    sam = sam_model_registry["edge_sam"](checkpoint=args.checkpoint, upsample_mode="bicubic")
+    sam = sam.to(device=device)
+    sam.eval()
+    predictor = SamPredictor(sam)
+examples = [
+    ["assets/1.jpeg"],
+    ["assets/2.jpeg"],
+    ["assets/3.jpeg"],
+    ["assets/4.jpeg"],
+    ["assets/5.jpeg"],
+    ["assets/6.jpeg"],
+    ["assets/7.jpeg"],
+    ["assets/8.jpeg"],
+    ["assets/9.jpeg"],
+    ["assets/10.jpeg"],
+    ["assets/11.jpeg"],
+    ["assets/12.jpeg"],
+    ["assets/13.jpeg"],
+    ["assets/14.jpeg"],
+    ["assets/15.jpeg"],
+    ["assets/16.jpeg"]
+]
 # Description
 title = "<center><strong><font size='8'>EdgeSAM<font></strong> <a href='https://github.com/chongzhou96/EdgeSAM'><font size='6'>[GitHub]</font></a> </center>"
               """
 css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }"
 global_points = []
     global_image_with_prompt = None
     return None
 def reset_all():
     global global_points
     global global_point_label
     global_box = []
     global_image = None
     global_image_with_prompt = None
+    return None, None
 def clear():
     xyxy[1][1] = max_y
     return xyxy
 def segment_with_points(
+        label,
+        evt: gr.SelectData,
+        input_size=1024,
+        better_quality=False,
+        withContours=True,
+        use_retina=True,
+        mask_random_color=False,
 ):
     global global_points
     global global_point_label
     )
     image = global_image_with_prompt
+    if ENABLE_ONNX:
+        global_points_np = np.array(global_points)[None]
+        global_point_label_np = np.array(global_point_label)[None]
+        masks, scores, _ = predictor.predict(
+            point_coords=global_points_np,
+            point_labels=global_point_label_np,
+        )
+        masks = masks.squeeze(0)
+        scores = scores.squeeze(0)
+    else:
+        global_points_np = np.array(global_points)
+        global_point_label_np = np.array(global_point_label)
+        masks, scores, logits = predictor.predict(
+            point_coords=global_points_np,
+            point_labels=global_point_label_np,
+            num_multimask_outputs=4,
+            use_stability_score=True
+        )
     print(f'scores: {scores}')
     area = masks.sum(axis=(1, 2))
     print(f'area: {area}')
+    annotations = np.expand_dims(masks[scores.argmax()], axis=0)
     seg = fast_process(
         annotations=annotations,
 def segment_with_box(
+        evt: gr.SelectData,
+        input_size=1024,
+        better_quality=False,
+        withContours=True,
+        use_retina=True,
+        mask_random_color=False,
 ):
     global global_box
     global global_image
         )
         global_box_np = np.array(global_box)
+        if ENABLE_ONNX:
+            point_coords = global_box_np.reshape(2, 2)[None]
+            point_labels = np.array([2, 3])[None]
+            masks, _, _ = predictor.predict(
+                point_coords=point_coords,
+                point_labels=point_labels,
+            )
+            annotations = masks[:, 0, :, :]
+        else:
+            masks, scores, _ = predictor.predict(
+                box=global_box_np,
+                num_multimask_outputs=1,
+            )
+            annotations = masks
         seg = fast_process(
             annotations=annotations,
         return seg
     return image
 img_p = gr.Image(label="Input with points", type="pil")
 img_b = gr.Image(label="Input with box", type="pil")
 with gr.Blocks(css=css, title="EdgeSAM") as demo:
     with gr.Row():
         with gr.Column(scale=1):
             # Title
                     run_on_click=True
                 )
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown(
+                "<center><img src='https://visitor-badge.laobi.icu/badge?page_id=chongzhou/edgesam' alt='visitors'></center>")
     img_p.upload(on_image_upload, img_p, [img_p])
     img_p.select(segment_with_points, [add_or_remove], img_p)
     clear_btn_p.click(clear, outputs=[img_p])
     reset_btn_p.click(reset, outputs=[img_p])
+    tab_p.select(fn=reset_all, outputs=[img_p, img_b])
     img_b.upload(on_image_upload, img_b, [img_b])
     img_b.select(segment_with_box, outputs=[img_b])
     clear_btn_b.click(clear, outputs=[img_b])
     reset_btn_b.click(reset, outputs=[img_b])
+    tab_b.select(fn=reset_all, outputs=[img_p, img_b])
 demo.queue()
 # demo.launch(server_name=args.server_name, server_port=args.port)

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 torch
 torchvision
 opencv-python
-timm

 torch
 torchvision
 opencv-python
+timm
+onnxruntime

segment_anything/onnx/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .predictor_onnx import SamPredictorONNX

segment_anything/onnx/predictor_onnx.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import cv2
+import onnxruntime
+from typing import Optional, Tuple
+from ..utils.transforms import ResizeLongestSide
class SamPredictorONNX:
    """Point-prompted mask predictor backed by two ONNX Runtime sessions.

    One session runs the image encoder and the other runs the mask decoder.
    Typical use is ``set_image(...)`` once per image, followed by any number
    of ``predict(...)`` calls that reuse the cached image embedding.
    """

    # Logits strictly above this value are reported as foreground.
    mask_threshold: float = 0.0
    # Channel order the encoder is fed with; other inputs are flipped to match.
    image_format: str = "RGB"
    # Side length of the square, padded encoder input.
    img_size = 1024
    # Per-channel normalisation constants, shaped to broadcast over NCHW.
    pixel_mean = np.array([123.675, 116.28, 103.53])[None, :, None, None]
    pixel_std = np.array([58.395, 57.12, 57.375])[None, :, None, None]

    def __init__(
            self,
            encoder_path: str,
            decoder_path: str
    ) -> None:
        """Create the encoder/decoder sessions and the resize transform.

        Args:
            encoder_path: Path to the ONNX file of the image encoder.
            decoder_path: Path to the ONNX file of the mask decoder.
        """
        super().__init__()
        # Providers are chosen at construction time: recent ONNX Runtime
        # releases require an explicit list on GPU-enabled builds, and keeping
        # the CPU provider as a fallback lets CUDA-unsupported ops still run.
        if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        else:
            providers = ['CPUExecutionProvider']
        self.encoder = onnxruntime.InferenceSession(encoder_path, providers=providers)
        self.decoder = onnxruntime.InferenceSession(decoder_path, providers=providers)
        self.transform = ResizeLongestSide(self.img_size)
        self.reset_image()

    def set_image(
            self,
            image: np.ndarray,
            image_format: str = "RGB",
    ) -> None:
        """Encode ``image`` and cache its embedding for later ``predict`` calls.

        Args:
            image: Image array with the channel axis last (the channel axis is
                reversed when ``image_format`` differs from the class format).
                NOTE(review): presumably HxWx3 uint8 -- confirm against
                ``ResizeLongestSide.apply_image``.
            image_format: Either ``"RGB"`` or ``"BGR"``.

        Raises:
            AssertionError: If ``image_format`` is neither ``"RGB"`` nor ``"BGR"``.
        """
        assert image_format in [
            "RGB",
            "BGR",
        ], f"image_format must be in ['RGB', 'BGR'], is {image_format}."
        if image_format != self.image_format:
            image = image[..., ::-1]

        # Transform the image to the form expected by the model
        input_image = self.transform.apply_image(image)
        input_image = input_image.transpose(2, 0, 1)[None, :, :, :]

        # Drop any state from a previous image before caching the new one.
        self.reset_image()
        self.original_size = image.shape[:2]
        self.input_size = tuple(input_image.shape[-2:])

        input_image = self.preprocess(input_image).astype(np.float32)
        outputs = self.encoder.run(None, {'image': input_image})
        self.features = outputs[0]
        self.is_image_set = True

    def predict(
            self,
            point_coords: Optional[np.ndarray] = None,
            point_labels: Optional[np.ndarray] = None,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the decoder on the cached embedding for the given point prompts.

        Args:
            point_coords: Prompt coordinates in the original image frame.
                NOTE(review): presumably shaped (1, N, 2) -- confirm against
                the exported decoder.
            point_labels: One label per prompt point, matching ``point_coords``.

        Returns:
            ``(masks, scores, low_res_masks)`` where ``masks`` is the boolean
            result of thresholding the upscaled logits at ``mask_threshold``,
            and ``scores`` / ``low_res_masks`` are the first and second decoder
            outputs, returned unchanged.

        Raises:
            RuntimeError: If no image has been set.
            ValueError: If either prompt array is missing.
        """
        if not self.is_image_set:
            raise RuntimeError("An image must be set with .set_image(...) before mask prediction.")
        if point_coords is None or point_labels is None:
            # The decoder has no prompt-free mode; fail here with a clear
            # message instead of an opaque error inside the transform.
            raise ValueError("Both point_coords and point_labels must be provided.")

        point_coords = self.transform.apply_coords(point_coords, self.original_size)
        outputs = self.decoder.run(None, {
            'image_embeddings': self.features,
            'point_coords': point_coords.astype(np.float32),
            'point_labels': point_labels.astype(np.float32)
        })

        scores, low_res_masks = outputs[0], outputs[1]
        masks = self.postprocess_masks(low_res_masks)
        masks = masks > self.mask_threshold

        return masks, scores, low_res_masks

    def reset_image(self) -> None:
        """Resets the currently set image."""
        self.is_image_set = False
        self.features = None
        # These are the attributes set_image() writes and the other methods
        # read; clearing them prevents sizes from a previous image leaking.
        self.original_size = None
        self.input_size = None
        # Retained so that code inspecting these older attributes still works.
        self.orig_h = None
        self.orig_w = None
        self.input_h = None
        self.input_w = None

    def preprocess(self, x: np.ndarray):
        """Normalise an NCHW batch and zero-pad it to ``img_size`` x ``img_size``.

        Padding is added on the bottom and right only, so the image content
        stays anchored at the top-left corner.
        """
        x = (x - self.pixel_mean) / self.pixel_std
        h, w = x.shape[-2:]
        padh = self.img_size - h
        padw = self.img_size - w
        x = np.pad(x, ((0, 0), (0, 0), (0, padh), (0, padw)), mode='constant', constant_values=0)
        return x

    def postprocess_masks(self, mask: np.ndarray):
        """Upscale decoder logits back to the original image resolution.

        Args:
            mask: Decoder logits with a leading batch axis of size 1,
                i.e. shaped (1, C, h, w).

        Returns:
            Float logits shaped (1, C, H, W), where (H, W) is the size of the
            image passed to ``set_image``.
        """
        # cv2.resize works on channel-last arrays: (1, C, h, w) -> (h, w, C).
        mask = mask.squeeze(0).transpose(1, 2, 0)
        mask = self._resize_keep_channels(mask, (self.img_size, self.img_size))
        # Crop away the region that corresponds to the encoder's zero padding.
        mask = mask[:self.input_size[0], :self.input_size[1], :]
        # cv2 takes the target size as (width, height).
        mask = self._resize_keep_channels(mask, (self.original_size[1], self.original_size[0]))
        mask = mask.transpose(2, 0, 1)[None, :, :, :]
        return mask

    @staticmethod
    def _resize_keep_channels(array: np.ndarray, dsize: Tuple[int, int]) -> np.ndarray:
        """Bilinearly resize an HxWxC array, keeping the channel axis even when C == 1."""
        resized = cv2.resize(array, dsize, interpolation=cv2.INTER_LINEAR)
        if resized.ndim == 2:
            # cv2.resize drops a trailing singleton channel axis; restore it so
            # the later slicing and transposes always see three axes.
            resized = resized[:, :, None]
        return resized