Create handler.py
handler.py (ADDED, +82 -0)
from typing import Dict, List, Any
import io

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import torch
from PIL import Image
import numpy as np
import librosa

class EndpointHandler:
    def __init__(self, path=""):
        """
        Initialize the handler. This loads the tokenizer and model required for inference.
        We load the `ronai-multimodal-perceiver-tsx` model for multimodal input handling.
        """
        # Load the tokenizer and the ONNX Runtime model
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = ORTModelForSequenceClassification.from_pretrained(path)

    def preprocess(self, data: Dict[str, Any]) -> Any:
        """
        Preprocess input data based on the modality.
        This handler supports text, image, and audio data.
        """
        inputs = data.get("inputs", None)

        if isinstance(inputs, str):
            # Text input: tokenize into a dict of tensors
            return self.tokenizer(inputs, return_tensors="pt")

        elif isinstance(inputs, Image.Image):
            # Image input: convert the PIL image to a float tensor and add a batch dimension
            image = np.array(inputs)
            return torch.tensor(image, dtype=torch.float32).unsqueeze(0)

        elif isinstance(inputs, np.ndarray):
            # Raw array input (e.g. audio samples, point clouds): add a batch dimension
            return torch.tensor(inputs).unsqueeze(0)

        elif isinstance(inputs, bytes):
            # Encoded audio bytes: decode, then convert to a mel spectrogram.
            # librosa.load expects a path or file-like object, so wrap the bytes in BytesIO.
            audio, sr = librosa.load(io.BytesIO(inputs), sr=None)
            mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)
            # Add batch and channel dimensions
            return torch.tensor(mel_spectrogram).unsqueeze(0).unsqueeze(0)

        else:
            raise ValueError(
                "Unsupported input type. Must be a string (text), PIL.Image (image), "
                "np.ndarray (audio, point clouds, etc.), or bytes (encoded audio)."
            )

    def postprocess(self, outputs: Any) -> List[Dict[str, Any]]:
        """
        Post-process the model output into a human-readable format.
        For text classification, this returns the label and score.
        """
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class_id = probabilities.argmax().item()
        score = probabilities[0, predicted_class_id].item()

        return [{"label": self.model.config.id2label[predicted_class_id], "score": score}]

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handles the incoming request, processes the input, runs inference, and returns results.

        Args:
            data (Dict[str, Any]): The input data for inference.
                - data["inputs"] can be a string (text), a PIL.Image (image),
                  an np.ndarray (audio or point clouds), or bytes (encoded audio).

        Returns:
            A list of dictionaries containing the model's prediction.
        """
        # Step 1: Preprocess input data into model-ready tensors
        preprocessed_data = self.preprocess(data)

        # Step 2: Perform model inference. Text preprocessing yields a dict of
        # tensors; the other modalities yield a single tensor.
        if isinstance(preprocessed_data, dict):
            outputs = self.model(**preprocessed_data)
        else:
            outputs = self.model(preprocessed_data)

        # Step 3: Post-process and return the predictions
        return self.postprocess(outputs)
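For context, a handler like this is usually smoke-tested locally before the endpoint is deployed. The sketch below shows one way to do that; the checkpoint path and the sample inputs are illustrative assumptions, not part of this commit.

# Local smoke test for EndpointHandler (illustrative sketch; the model path
# and sample file below are assumed, not part of this commit).
from handler import EndpointHandler

# Point at a local checkout of the model repository (hypothetical path)
handler = EndpointHandler(path="./ronai-multimodal-perceiver-tsx")

# Text request: exercises the tokenizer branch of preprocess()
print(handler({"inputs": "The onboarding flow was smooth and quick."}))
# -> [{"label": "...", "score": ...}]  (labels depend on the model's id2label config)

# Audio request: raw bytes of an encoded file exercise the librosa branch
with open("sample.wav", "rb") as f:  # hypothetical sample file
    print(handler({"inputs": f.read()}))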