Update modeling_qwen.py
#5
by ctranslate2-4you
- opened
- modeling_qwen.py +55 -15
modeling_qwen.py
CHANGED
@@ -17,6 +17,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# includes edits by https://github.com/BBC-Esq to fix cache errors following transformers version post 4.53.3 major cache refactor
+
 """ PyTorch Qwen2 model."""
 from transformers import Qwen2Config
 import inspect
@@ -274,7 +277,9 @@ class Qwen2Attention(nn.Module):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
@@ -378,7 +383,9 @@ class Qwen2FlashAttention2(Qwen2Attention):
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
 
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
@@ -676,7 +683,9 @@ class Qwen2SdpaAttention(Qwen2Attention):
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
+            kv_seq_len += past_len
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
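All three attention hunks above make the same substitution: instead of `get_usable_length`, the already-cached length is read with `Cache.get_seq_length` and added to the length of the incoming keys. A standalone sketch of that arithmetic, not taken from the PR, assuming a recent transformers release and made-up tensor shapes:

import torch
from transformers import DynamicCache

layer_idx = 0
cache = DynamicCache()

# Pretend 5 tokens were already processed for this layer: shape [batch, heads, seq, head_dim]
cache.update(torch.zeros(1, 2, 5, 16), torch.zeros(1, 2, 5, 16), layer_idx)

key_states = torch.zeros(1, 2, 1, 16)  # one new decoding step
kv_seq_len = key_states.shape[-2]      # 1 new position
past_len = cache.get_seq_length(layer_idx) if cache is not None else 0
kv_seq_len += past_len                 # total positions fed to the rotary embedding
print(kv_seq_len)                      # -> 6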
@@ -972,7 +981,6 @@ class Qwen2Model(Qwen2PreTrainedModel):
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # retrieve input_ids and inputs_embeds
@@ -993,12 +1001,28 @@ class Qwen2Model(Qwen2PreTrainedModel):
                 use_cache = False
 
         past_key_values_length = 0
+        use_legacy_cache = False
 
         if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
+            # OLD behavior (removed in HF >= 4.55): treat anything not Cache as "legacy" but then
+            # directly used legacy methods on it (would crash if None or new API).
+            # use_legacy_cache = not isinstance(past_key_values, Cache)
+            # if use_legacy_cache:
+            #     # past_key_values_length = past_key_values.get_seq_length()
+            #     past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+            # NEW behavior: if a legacy tuple is passed, convert it to the new Cache API,
+            # compute length via .get_seq_length(), and remember to return legacy if that’s what came in.
+            if past_key_values is not None and not isinstance(past_key_values, Cache):
+                use_legacy_cache = True  # remember input format for return
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+            if isinstance(past_key_values, Cache):
+                # Layer-agnostic total length; cache_position is handled deeper if needed
+                past_key_values_length = past_key_values.get_seq_length()
+            else:
+                # No cache given on first forward, keep length at 0
+                past_key_values_length = 0
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1104,7 +1128,10 @@ class Qwen2Model(Qwen2PreTrainedModel):
 
         next_cache = None
         if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+            # If the caller passed legacy, return legacy. Otherwise return the Cache object.
+            next_cache = (
+                next_decoder_cache.to_legacy_cache() if (use_legacy_cache and next_decoder_cache is not None) else next_decoder_cache
+            )
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
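The two `Qwen2Model.forward` hunks above (cache setup and cache return) amount to a legacy round trip: convert an incoming tuple to `DynamicCache`, measure it with `get_seq_length()`, and convert back only if a tuple came in. A hedged, self-contained sketch of that flow, with an illustrative one-layer fake cache:

import torch
from transformers import Cache, DynamicCache

# A fake legacy cache: one layer of (key, value) tensors holding 5 cached tokens
legacy = ((torch.zeros(1, 2, 5, 16), torch.zeros(1, 2, 5, 16)),)

use_legacy_cache = False
past_key_values = legacy
if past_key_values is not None and not isinstance(past_key_values, Cache):
    use_legacy_cache = True  # remember the input format for the return value
    past_key_values = DynamicCache.from_legacy_cache(past_key_values)

past_key_values_length = past_key_values.get_seq_length() if isinstance(past_key_values, Cache) else 0
print(past_key_values_length)  # -> 5

# ... the attention layers would call past_key_values.update(...) here ...

# Return in the same format the caller used
next_decoder_cache = past_key_values
next_cache = (
    next_decoder_cache.to_legacy_cache()
    if (use_legacy_cache and next_decoder_cache is not None)
    else next_decoder_cache
)
print(type(next_cache))  # -> <class 'tuple'> because a legacy tuple was passed in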
@@ -1116,6 +1143,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
         )
 
 
+
 class Qwen2ForCausalLM(Qwen2PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
@@ -1243,21 +1271,32 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         # Omit tokens covered by past_key_values
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
+                # NEW API (HF >= 4.55): use Cache methods
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                past_length = cache_length  # `seen_tokens` removed; use total seq length instead
+                try:
+                    max_cache_length = past_key_values.get_max_cache_shape()
+                except Exception:
+                    max_cache_length = None
+
+                # OLD API (deprecated/removed):
+                # cache_length = past_key_values.get_seq_length()
+                # past_length = past_key_values.seen_tokens
+                # max_cache_length = past_key_values.get_max_length()
+
             else:
+                # Legacy tuple format: keep computing lengths directly from tensors
+                # (We keep it compatible without forcing a conversion here)
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
 
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            #     input)
+            #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
             if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            #     input_ids based on the past_length.
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens.
+            #     We can discard input_ids based on the past_length.
             elif past_length < input_ids.shape[1]:
                 input_ids = input_ids[:, past_length:]
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
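The `prepare_inputs_for_generation` hunk above can be exercised in isolation. A small sketch, assuming a recent transformers release; the try/except mirrors the patch because `get_max_cache_shape()` is not guaranteed to exist on every installed version:

import torch
from transformers import Cache, DynamicCache

past_key_values = DynamicCache.from_legacy_cache(
    ((torch.zeros(1, 2, 7, 16), torch.zeros(1, 2, 7, 16)),)  # 7 cached tokens, 1 layer
)
input_ids = torch.arange(10).unsqueeze(0)  # 10 token ids, 7 of them already cached

if isinstance(past_key_values, Cache):
    cache_length = past_key_values.get_seq_length()
    past_length = cache_length  # `seen_tokens` no longer exists on Cache
    try:
        max_cache_length = past_key_values.get_max_cache_shape()  # typically None for DynamicCache
    except Exception:
        max_cache_length = None
else:
    cache_length = past_length = past_key_values[0][0].shape[2]
    max_cache_length = None

# Keep only the unprocessed tokens (case 2 of the comments above)
if past_length < input_ids.shape[1]:
    input_ids = input_ids[:, past_length:]
print(cache_length, max_cache_length, input_ids.tolist())  # -> 7 None [[7, 8, 9]]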
@@ -1287,13 +1326,14 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
         model_inputs.update(
             {
                 "position_ids": position_ids,
-                "past_key_values": past_key_values,
+                "past_key_values": past_key_values,  # pass through unchanged (legacy or new Cache object)
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
             }
         )
         return model_inputs
 
+
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
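`_reorder_cache`, whose opening lines close the hunk above, is untouched by this PR. For reference only, a standalone sketch of what reordering a legacy tuple cache during beam search looks like, with made-up shapes and beam indices:

import torch

# 3 beams, 1 layer, 5 cached tokens per beam
past_key_values = ((torch.randn(3, 2, 5, 16), torch.randn(3, 2, 5, 16)),)
beam_idx = torch.tensor([2, 0, 0])  # which beam each new hypothesis continues from

reordered_past = ()
for layer_past in past_key_values:
    reordered_past += (
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
    )
print(reordered_past[0][0].shape)  # -> torch.Size([3, 2, 5, 16]); batch rows now follow beam_idx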