Gabriel
/

Qwen2-VL-7B-Instruct-AWQ

@@ -1,40 +1,18 @@
 from typing import Dict, Any
 import torch
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from PIL import Image
 import io
 import base64
 import requests
-from qwen_vl_utils import process_vision_info
 class EndpointHandler():
     def __init__(self, path=""):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            path,
-            torch_dtype="auto",
-            device_map="auto"
-        ).to(self.device)
         self.processor = AutoProcessor.from_pretrained(path)
-        # Optionally, adjust min_pixels and max_pixels if needed
-        # min_pixels = 256*28*28
-        # max_pixels = 1280*28*28
-        # self.processor = AutoProcessor.from_pretrained(path, min_pixels=min_pixels, max_pixels=max_pixels)
     def __call__(self, data: Any) -> Dict[str, Any]:
-        """
-        Args:
-            data (Any): The input data, which can be:
-                - Binary image data in the request body.
-                - A dictionary with 'image' and 'text' keys:
-                    - 'image': Base64-encoded image string or image URL.
-                    - 'text': The text prompt.
-        Returns:
-            Dict[str, Any]: The generated text output from the model.
-        """
         default_prompt = "Describe this image."
         if isinstance(data, (bytes, bytearray)):
@@ -46,8 +24,7 @@ class EndpointHandler():
             if image_input is None:
                 return {"error": "No image provided."}
             if image_input.startswith('http'):
-                response = requests.get(image_input)
-                image = Image.open(io.BytesIO(response.content)).convert('RGB')
             else:
                 image_data = base64.b64decode(image_input)
                 image = Image.open(io.BytesIO(image_data)).convert('RGB')
@@ -58,34 +35,24 @@ class EndpointHandler():
             {
                 "role": "user",
                 "content": [
-                    {
-                        "type": "image",
-                        "image": image,
-                    },
                     {"type": "text", "text": text_input},
                 ],
             }
         ]
-        text = self.processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
             text=[text],
-            images=image_inputs,
-            videos=video_inputs,
             padding=True,
             return_tensors="pt",
-        )
-        inputs = inputs.to(self.device)
-        generated_ids = self.model.generate(**inputs, max_new_tokens=128)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
         output_text = self.processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        return {"generated_text": output_text[0]}

 from typing import Dict, Any
 import torch
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 from PIL import Image
 import io
 import base64
 import requests
 class EndpointHandler():
     """Callable request handler that runs a Qwen2-VL model on an image plus a text prompt.

     The model and its processor are loaded once in ``__init__``; each call
     builds a single-turn chat message, runs generation, and returns the
     decoded text in a dict.
     """
     def __init__(self, path=""):
         """Load the model and processor found at ``path``.

         Args:
             path: Location handed to both ``from_pretrained`` calls.
         """
         # Use CUDA when torch reports it as available, otherwise fall back to CPU.
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # Loaded with library defaults (no dtype / device_map arguments), then moved to the chosen device.
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(path).to(self.device)
         self.processor = AutoProcessor.from_pretrained(path)
     def __call__(self, data: Any) -> Dict[str, Any]:
         """Generate text for one request.

         Args:
             data: Either raw image bytes (``bytes`` / ``bytearray``) or, presumably,
                 a dict carrying an image string and a text prompt -- the lines that
                 assign ``image_input`` and ``text_input`` are not visible in this
                 view; confirm against the full file.

         Returns:
             ``{"generated_text": <str>}`` on success, or
             ``{"error": "No image provided."}`` when ``image_input`` is ``None``.
         """
         # Prompt used when the request carries no text of its own (presumably; the use site is not visible here).
         default_prompt = "Describe this image."
         if isinstance(data, (bytes, bytearray)):
             if image_input is None:
                 return {"error": "No image provided."}
             # A string starting with 'http' is fetched as a URL; anything else is decoded as base64.
             if image_input.startswith('http'):
                 # NOTE(review): the request has no timeout and the HTTP status is not checked;
                 # the undecoded `.raw` stream is handed straight to PIL.
+                image = Image.open(requests.get(image_input, stream=True).raw).convert('RGB')
             else:
                 image_data = base64.b64decode(image_input)
                 image = Image.open(io.BytesIO(image_data)).convert('RGB')
             {
                 "role": "user",
                 "content": [
+                    {"type": "image", "image": image},
                     {"type": "text", "text": text_input},
                 ],
             }
         ]
         # Render the chat messages to a prompt string (not tokenized yet) with the generation prompt appended.
+        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         # Tokenize the prompt and preprocess the image together; tensors are moved to the model's device.
         inputs = self.processor(
             text=[text],
+            images=[image],
             padding=True,
             return_tensors="pt",
+        ).to(self.device)
         # NOTE(review): only `inputs.input_ids` is forwarded, so whatever else the processor
         # produced from `images=[image]` (attention mask, image tensors) never reaches
         # `generate` -- presumably `**inputs` is intended; confirm.
         # NOTE(review): `max_length=30` presumably bounds prompt + generated tokens in
         # transformers, and a prompt containing image tokens may already exceed it -- verify.
         # The previous revision of this handler used `max_new_tokens=128`.
+        generate_ids = self.model.generate(inputs.input_ids, max_length=30)
         # NOTE(review): the full `generate_ids` sequence is decoded; if it starts with the
         # prompt tokens (usual for decoder-only generation -- confirm), the returned text
         # includes the prompt as well as the answer.
         output_text = self.processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return {"generated_text": output_text}