add streamer and verbose params and io.BytesIO / PIL.Image.Image image_file support for the infer method
#23 by weege007 - opened

- modeling_deepseekocr.py +22 -14
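Usage sketch for the new parameters. The `infer` signature below matches this diff; the model/tokenizer loading pattern and the file names are assumptions for illustration, not part of the change:

```python
from io import BytesIO

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Assumed loading pattern; adjust model id / dtype / device to your setup.
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.eval().cuda().to(torch.bfloat16)

# image_file may now be a path, an io.BytesIO buffer, or a PIL.Image.Image.
pil_image = Image.open("sample.jpg")            # placeholder path
with open("sample.jpg", "rb") as f:
    byte_buffer = BytesIO(f.read())

# verbose=False silences the BASE/PATCHES shape prints in the vision path;
# streamer=None keeps the default NoEOSTextStreamer behaviour;
# output_path='' now skips directory creation thanks to the len(output_path) guard.
model.infer(
    tokenizer,
    prompt="<image>\nFree OCR. ",
    image_file=pil_image,   # or byte_buffer, or "sample.jpg"
    output_path="",
    verbose=False,
    streamer=None,
)
```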
    	
modeling_deepseekocr.py (CHANGED)

```diff
@@ -27,7 +27,9 @@ import time
 def load_image(image_path):
 
     try:
-        image = Image.open(image_path)
+        image = image_path
+        if not isinstance(image_path, Image.Image):
+            image = Image.open(image_path)
 
         corrected_image = ImageOps.exif_transpose(image)
 
@@ -353,6 +355,7 @@ class DeepseekOCRConfig(DeepseekV2Config):
 
 class DeepseekOCRModel(DeepseekV2Model):
     config_class = DeepseekOCRConfig
+    verbose = True
 
     def __init__(self, config: DeepseekV2Config):
         super(DeepseekOCRModel, self).__init__(config)
@@ -432,10 +435,11 @@ class DeepseekOCRModel(DeepseekV2Model):
                     global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                     global_features = self.projector(global_features)
 
-                    print('=====================')
-                    print('BASE: ', global_features.shape)
-                    print('PATCHES: ', local_features.shape)
-                    print('=====================')
+                    if self.verbose:
+                        print('=====================')
+                        print('BASE: ', global_features.shape)
+                        print('PATCHES: ', local_features.shape)
+                        print('=====================')
 
                     _, hw, n_dim = global_features.shape
                     h = w = int(hw ** 0.5)
@@ -475,10 +479,12 @@ class DeepseekOCRModel(DeepseekV2Model):
                     global_features_2 = vision_model(image_ori, global_features_1)
                     global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                     global_features = self.projector(global_features)
-                    print('=====================')
-                    print('BASE: ', global_features.shape)
-                    print('NO PATCHES')
-                    print('=====================')
+
+                    if self.verbose:
+                        print('=====================')
+                        print('BASE: ', global_features.shape)
+                        print('NO PATCHES')
+                        print('=====================')
                     _, hw, n_dim = global_features.shape
                     h = w = int(hw ** 0.5)
 
@@ -700,11 +706,13 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
 
 
-    def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
+    def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False, streamer=None, verbose=True):
         self.disable_torch_init()
+        self.model.verbose = verbose
 
-        os.makedirs(output_path, exist_ok=True)
-        os.makedirs(f'{output_path}/images', exist_ok=True)
+        if len(output_path) > 0 :
+            os.makedirs(output_path, exist_ok=True)
+            os.makedirs(f'{output_path}/images', exist_ok=True)
 
         if prompt and image_file:
             conversation = [
@@ -716,7 +724,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                     # "content": "<image>\nFree OCR. ",
                     # "content": "<image>\nParse the figure. ",
                     # "content": "<image>\nExtract the text in the image. ",
-                    "images": [f'{image_file}'],
+                    "images": [image_file] if isinstance(image_file, (BytesIO, Image.Image)) else [f'{image_file}'],
                 },
                 {"role": "<|Assistant|>", "content": ""},
             ]
@@ -910,7 +918,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
 
         if not eval_mode:
-            streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+            streamer = streamer or NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
             with torch.autocast("cuda", dtype=torch.bfloat16):
                 with torch.no_grad():
                     output_ids = self.generate(
```
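Because `streamer` is now injectable, a caller could pass a `transformers.TextIteratorStreamer` and consume decoded text incrementally. A minimal sketch, assuming `infer` forwards the streamer to `generate()` on the non-eval path and that `model`/`tokenizer` are loaded as in the earlier snippet; the worker-thread pattern and file name are illustrative only:

```python
import threading

from transformers import TextIteratorStreamer

# model / tokenizer loaded as in the earlier snippet (assumption).
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# infer() blocks while generating, so run it in a worker thread
# and read decoded chunks from the streamer in the main thread.
worker = threading.Thread(
    target=model.infer,
    kwargs=dict(
        tokenizer=tokenizer,
        prompt="<image>\nFree OCR. ",
        image_file="sample.jpg",   # placeholder path
        streamer=streamer,
        verbose=False,
    ),
)
worker.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
worker.join()
```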
