Update modeling_mpt.py
modeling_mpt.py  +26 -3
modeling_mpt.py
CHANGED
@@ -10,10 +10,28 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from .attention import is_flash_v1_installed, is_flash_v2_installed
+
+
+# Global variable to store the result
+is_flash_attn_ge_2_7_0 = None
 if is_flash_v2_installed():
     try:
         from flash_attn import bert_padding
         from flash_attn.layers.rotary import RotaryEmbedding as DAILRotaryEmbedding
+
+        import flash_attn
+        from packaging import version
+
+
+        # Function to check the version and set the global variable
+        def check_flash_attn_version_gte270():
+            global is_flash_attn_ge_2_7_0
+            installed_version = flash_attn.__version__
+            is_flash_attn_ge_2_7_0 = version.parse(installed_version) >= version.parse("2.7.0")
+
+        # Call the function to set the global variable
+        check_flash_attn_version_gte270()
+
     except Exception as e:
         raise e
 if is_flash_v1_installed():
@@ -140,9 +158,14 @@ def gen_flash_attn_padding_info(bsz: int, S: int, past_key_len: int, device: tor
         key_padding_mask = attention_mask_in_length
         query_padding_mask = attention_mask_in_length
         unpadding_function = bert_padding.unpad_input_for_concatenated_sequences
-    (_, indices_q, cu_seqlens_q, max_seqlen_q) = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
-    (_, indices_k, cu_seqlens_k, max_seqlen_k) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
-    (_, indices_v, _, _) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
+    if is_flash_attn_ge_2_7_0:
+        (_, indices_q, cu_seqlens_q, max_seqlen_q, _) = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
+        (_, indices_k, cu_seqlens_k, max_seqlen_k, _) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
+        (_, indices_v, _, _, _) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
+    else:
+        (_, indices_q, cu_seqlens_q, max_seqlen_q) = unpadding_function(torch.empty(bsz, S, 1, device=device), query_padding_mask)
+        (_, indices_k, cu_seqlens_k, max_seqlen_k) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
+        (_, indices_v, _, _) = unpadding_function(torch.empty(bsz, past_key_len + S, 1, device=device), key_padding_mask)
     flash_attn_padding_info['indices_q'] = indices_q
     flash_attn_padding_info['indices_k'] = indices_k
     flash_attn_padding_info['indices_v'] = indices_v
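
For context, a minimal, self-contained sketch of the pattern this commit applies: detect the installed flash-attn version once at import time, then branch on the tuple arity when unpadding. The only assumption here (the same one the diff's version gate encodes) is that the unpad helpers return four values before flash-attn 2.7.0 and five from 2.7.0 onward. The helper name unpad_indices and the dummy-tensor shape are illustrative only and are not part of the original file; the sketch uses bert_padding.unpad_input rather than the concatenated-sequences variant used in gen_flash_attn_padding_info.

# Sketch only; assumes flash-attn is installed.
import torch
import flash_attn
from flash_attn import bert_padding
from packaging import version

# Check the installed version once, at import time.
IS_FLASH_ATTN_GE_2_7_0 = version.parse(flash_attn.__version__) >= version.parse("2.7.0")

def unpad_indices(bsz: int, seqlen: int, padding_mask: torch.Tensor, device: torch.device):
    """Hypothetical helper: unpad a dummy tensor and return (indices, cu_seqlens, max_seqlen)."""
    dummy = torch.empty(bsz, seqlen, 1, device=device)
    if IS_FLASH_ATTN_GE_2_7_0:
        # flash-attn >= 2.7.0: unpad_input returns an extra trailing value.
        _, indices, cu_seqlens, max_seqlen, _ = bert_padding.unpad_input(dummy, padding_mask)
    else:
        # Older flash-attn: four return values.
        _, indices, cu_seqlens, max_seqlen = bert_padding.unpad_input(dummy, padding_mask)
    return indices, cu_seqlens, max_seqlen

Doing the version check once with a module-level flag, as the commit does, keeps the string parsing out of every call to gen_flash_attn_padding_info.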