Add Apple Silicon (MPS) backend support
Enables DeepSeek-OCR to run on Apple Silicon (M1/M2/M3/M4) using the MPS backend while preserving OCR output quality.
Key changes:
- Replace masked_scatter_ with row-wise boolean assignment on MPS (fixes silent embedding injection failure)
- Use fp32 precision for image tensors and inference on MPS (bfloat16 causes numerical issues there)
- Disable autocast on the MPS path (the precision/autocast selection is sketched after this list)
- Make tensor placement device-agnostic (.to(self.device) instead of .cuda())
- Add NaN guards for vision tower outputs on MPS
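
The fp32/no-autocast choice boils down to one conditional per call site. Below is a minimal standalone sketch of that pattern, assuming only `torch` and the standard library; `pick_precision` is an illustrative helper name, not something introduced by this diff:

```python
import torch
from contextlib import nullcontext


def pick_precision(device: torch.device):
    """Illustrative helper mirroring the patch: fp32 and no autocast on MPS,
    bfloat16 autocast on CUDA. Not part of the diff itself."""
    if device.type == "mps":
        return torch.float32, nullcontext()
    return torch.bfloat16, torch.autocast("cuda", dtype=torch.bfloat16)


# Example: choose the image-tensor dtype and the inference context for the current device.
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda")
image_dtype, autocast_ctx = pick_precision(device)
with autocast_ctx, torch.no_grad():
    pass  # run generation here
```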
All changes are conditionally applied based on self.device.type == "mps".
CUDA code path remains completely unchanged for full backwards compatibility.
Tested on: macOS 26.0.1, Apple M4 Max, PyTorch 2.9.0, Transformers 4.46.3
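
For reviewers who want to exercise the MPS path locally, here is a minimal sketch of how this branch can be driven, assuming the usual `AutoModel`/`AutoTokenizer` loading with `trust_remote_code=True`. The `model.infer(...)` keywords follow the upstream README and are assumptions, not part of this diff:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Prefer MPS when present, otherwise fall back to CUDA/CPU.
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
model = AutoModel.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True, use_safetensors=True)

# Keep weights in fp32 on MPS; bfloat16 is what triggers the numerical issues this PR works around.
model = model.eval().to(device, dtype=torch.float32 if device == "mps" else torch.bfloat16)

# Inference call as in the upstream README; prompt/image_file/output_path values here are placeholders.
result = model.infer(
    tokenizer,
    prompt="<image>\n<|grounding|>Convert the document to markdown.",
    image_file="sample.png",
    output_path="./output",
    base_size=1024,
    image_size=640,
    crop_mode=True,
    save_results=True,
)
```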
modeling_deepseekocr.py (+42 -17)

--- a/modeling_deepseekocr.py
+++ b/modeling_deepseekocr.py
@@ -3,6 +3,7 @@ from .configuration_deepseek_v2 import DeepseekV2Config
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from typing import List, Optional, Tuple, Union
 from transformers.cache_utils import Cache
+from contextlib import nullcontext
 import requests
 from PIL import Image, ImageOps, ImageDraw, ImageFont
 from io import BytesIO
@@ -502,7 +503,23 @@ class DeepseekOCRModel(DeepseekV2Model):
                     images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
                     # exit()
 
-                    inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
+                    # MPS compatibility: use row-wise assignment; CUDA: keep original masked_scatter_
+                    if self.device.type == "mps":
+                        # MPS-safe: row-wise boolean assignment instead of broadcasted masked_scatter_
+                        mask = images_seq_mask[idx].to(self.device)
+                        feats = images_in_this_batch.to(dtype=inputs_embeds.dtype, device=self.device)
+                        # Basic sanity: number of rows must match
+                        if mask.sum().item() != feats.shape[0]:
+                            raise RuntimeError(
+                                f"image token count mismatch: mask={mask.sum().item()} vs feats={feats.shape[0]}"
+                            )
+                        # Guard against NaNs from upstream vision tower (seen on some MPS builds)
+                        feats = torch.nan_to_num(feats)
+                        # Deterministic row write
+                        inputs_embeds[idx][mask] = feats
+                    else:
+                        # Original CUDA path (unchanged)
+                        inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
 
                 idx += 1
 
@@ -799,7 +816,9 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
 
 
-                    images_list.append(image_transform(global_view).to(torch.bfloat16))
+                    # MPS needs fp32, CUDA can use bfloat16
+                    image_dtype = torch.float32 if self.device.type == "mps" else torch.bfloat16
+                    images_list.append(image_transform(global_view).to(image_dtype))
 
                     # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
 
@@ -810,9 +829,9 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
                 if width_crop_num > 1 or height_crop_num > 1:
                     """process the local views"""
-
+
                     for i in range(len(images_crop_raw)):
-                        images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
+                        images_crop_list.append(image_transform(images_crop_raw[i]).to(image_dtype))
 
                     if image_size == 640:
                         valid_img_tokens += len(images_crop_list) * 100
@@ -846,7 +865,9 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
             # else:
                 global_view = ImageOps.pad(image, (image_size, image_size),
                                            color=tuple(int(x * 255) for x in image_transform.mean))
-                images_list.append(image_transform(global_view).to(torch.bfloat16))
+                # MPS needs fp32, CUDA can use bfloat16
+                image_dtype = torch.float32 if self.device.type == "mps" else torch.bfloat16
+                images_list.append(image_transform(global_view).to(image_dtype))
 
                 if base_size == 1024:
                     valid_img_tokens += int(256 * ratio)
@@ -911,12 +932,14 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
         if not eval_mode:
            streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            # MPS: no autocast (pure fp32); CUDA: keep original bfloat16 autocast
+            autocast_ctx = nullcontext() if self.device.type == "mps" else torch.autocast("cuda", dtype=torch.bfloat16)
+            with autocast_ctx:
                 with torch.no_grad():
                     output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+                        input_ids.unsqueeze(0).to(self.device),
+                        images=[(images_crop.to(self.device), images_ori.to(self.device))],
+                        images_seq_mask = images_seq_mask.unsqueeze(0).to(self.device),
                         images_spatial_crop = images_spatial_crop,
                         # do_sample=False,
                         # num_beams = 1,
@@ -929,12 +952,14 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                         )
 
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            # MPS: no autocast (pure fp32); CUDA: keep original bfloat16 autocast
+            autocast_ctx = nullcontext() if self.device.type == "mps" else torch.autocast("cuda", dtype=torch.bfloat16)
+            with autocast_ctx:
                 with torch.no_grad():
                     output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+                        input_ids.unsqueeze(0).to(self.device),
+                        images=[(images_crop.to(self.device), images_ori.to(self.device))],
+                        images_seq_mask = images_seq_mask.unsqueeze(0).to(self.device),
                         images_spatial_crop = images_spatial_crop,
                         # do_sample=False,
                         # num_beams = 1,
@@ -944,10 +969,10 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                        no_repeat_ngram_size = 35,
                        use_cache = True
                        )
-
+
 
         if '<image>' in conversation[0]['content'] and eval_mode:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1]:])
            stop_str = '<|end▁of▁sentence|>'
            if outputs.endswith(stop_str):
                outputs = outputs[:-len(stop_str)]
@@ -957,7 +982,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
            return outputs
 
         if '<image>' in conversation[0]['content'] and test_compress:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1]:])
            pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
            print('='*50)
            print('image size: ', (w, h))
@@ -968,7 +993,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
 
         if '<image>' in conversation[0]['content'] and save_results:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(self.device).shape[1]:])
            stop_str = '<|end▁of▁sentence|>'
 
            print('='*15 + 'save results:' + '='*15)
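
For context on the `masked_scatter_` hunk: with a row-wise mask, boolean-index assignment writes exactly the rows that `masked_scatter_` would fill, so the MPS branch is a drop-in replacement whenever the masked-row count matches the feature-row count. A toy-shape sketch (not the model's real tensors) that checks the two paths agree:

```python
import torch

# Toy stand-ins for one batch element: 8 sequence positions, hidden size 4,
# with 3 of the positions reserved for image tokens.
torch.manual_seed(0)
inputs_embeds = torch.zeros(8, 4)
images_seq_mask = torch.tensor([0, 1, 1, 0, 0, 1, 0, 0], dtype=torch.bool)
image_feats = torch.randn(3, 4)  # one feature row per image token

# Original CUDA-path semantics: flatten feats into the masked positions.
scattered = inputs_embeds.clone()
scattered.masked_scatter_(images_seq_mask.unsqueeze(-1), image_feats)

# MPS-path semantics from this PR: plain boolean-index row write.
assigned = inputs_embeds.clone()
assigned[images_seq_mask] = image_feats

assert torch.equal(scattered, assigned)  # identical result when row counts match
```

The explicit row-count check and `torch.nan_to_num` guard in the patch only apply on MPS, which is why the CUDA branch keeps the original one-liner.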