Fix: AttributeError when `input_ids` is None during multimodal LLM training
#77
by lyulumos · opened
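Context for the fix: during multimodal training the language model is typically driven by pre-built embeddings (projected image features concatenated with token embeddings), so the forward pass receives `inputs_embeds` and an attention mask while `input_ids` is legitimately `None`. The unpatched code still evaluates `input_ids.shape`, which is the AttributeError in the title. Below is a minimal, self-contained sketch of the failure and of the fallback this patch applies; it uses toy tensors only and is not tied to any specific checkpoint.

```python
import torch

# Multimodal path: the caller supplies embeddings directly, so there are no token ids.
input_ids = None
inputs_embeds = torch.randn(2, 7, 4096)            # [batch, seq, hidden], toy sizes
attention_mask = torch.ones(2, 7, dtype=torch.long)

try:
    # What the unpatched code does unconditionally:
    batch_size, seq_length = input_ids.shape
except AttributeError as e:
    print(e)  # 'NoneType' object has no attribute 'shape'

# The patch falls back to the mask / embeddings instead:
batch_size, seq_length = input_ids.shape if input_ids is not None else attention_mask.shape
print(batch_size, seq_length)  # 2 7
```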
modeling_chatglm.py  CHANGED  (+5 -4)

@@ -771,15 +771,16 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
             if padding_mask is not None and not padding_mask.all():
                 return padding_mask
             return None
-        batch_size, seq_length = input_ids.shape
-        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
+        batch_size, seq_length = input_ids.shape if input_ids is not None else padding_mask.shape
+        device = input_ids.device if input_ids is not None else padding_mask.device
+        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=device)
         full_attention_mask.tril_()
         past_length = 0
         if past_key_values:
             past_length = past_key_values[0][0].shape[2]
         if past_length:
             full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
-                                                         device=input_ids.device), full_attention_mask), dim=-1)
+                                                         device=device), full_attention_mask), dim=-1)
         if padding_mask is not None:
             full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
         if not past_length and padding_mask is not None:
@@ -872,7 +873,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        batch_size, seq_length = input_ids.shape
+        batch_size, seq_length = (input_ids.shape if input_ids is not None else inputs_embeds.shape[:2] if inputs_embeds is not None else (None, None))

         if inputs_embeds is None:
             inputs_embeds = self.embedding(input_ids)