lmassaron committed (verified)
Commit: df8a4c5 · 1 Parent(s): 06a3ee6

Upload model trained with Unsloth


Upload model trained with Unsloth 2x faster

Files changed (4)
  1. README.md +1 -0
  2. config.json +20 -10
  3. generation_config.json +2 -1
  4. model.safetensors +2 -2
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
 tags:
 - trl
 - grpo
+- unsloth
 ---
 
 # Model Card for Model ID
config.json CHANGED
@@ -7,14 +7,14 @@
   "attention_dropout": 0.0,
   "attn_logit_softcapping": null,
   "bos_token_id": 2,
-  "dtype": "bfloat16",
-  "eos_token_id": 1,
+  "cache_implementation": "hybrid",
+  "eos_token_id": 106,
   "final_logit_softcapping": null,
   "head_dim": 256,
   "hidden_activation": "gelu_pytorch_tanh",
-  "hidden_size": 640,
+  "hidden_size": 1152,
   "initializer_range": 0.02,
-  "intermediate_size": 2048,
+  "intermediate_size": 6912,
   "layer_types": [
     "sliding_attention",
     "sliding_attention",
@@ -33,22 +33,32 @@
     "sliding_attention",
     "sliding_attention",
     "sliding_attention",
-    "full_attention"
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention"
   ],
   "max_position_embeddings": 32768,
   "model_type": "gemma3_text",
   "num_attention_heads": 4,
-  "num_hidden_layers": 18,
+  "num_hidden_layers": 26,
   "num_key_value_heads": 1,
   "pad_token_id": 0,
   "query_pre_attn_scalar": 256,
   "rms_norm_eps": 1e-06,
-  "rope_local_base_freq": 10000.0,
+  "rope_local_base_freq": 10000,
   "rope_scaling": null,
-  "rope_theta": 1000000.0,
+  "rope_theta": 1000000,
   "sliding_window": 512,
-  "transformers_version": "4.56.1",
-  "use_bidirectional_attention": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.55.4",
+  "unsloth_fixed": true,
+  "unsloth_version": "2025.9.7",
   "use_cache": true,
   "vocab_size": 262144
 }
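The new config describes a larger gemma3_text variant (hidden_size 1152, 26 hidden layers, bfloat16 weights). A minimal loading sketch, assuming a placeholder repo id since the actual repository name is not part of this diff:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "lmassaron/<this-repo>"  # hypothetical placeholder, not taken from the diff

# These values come straight from the updated config.json above.
config = AutoConfig.from_pretrained(repo_id)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # gemma3_text 1152 26

# "torch_dtype": "bfloat16" in config.json, so load the weights in that dtype.
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)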
generation_config.json CHANGED
@@ -6,8 +6,9 @@
     1,
     106
   ],
+  "max_length": 32768,
   "pad_token_id": 0,
   "top_k": 64,
   "top_p": 0.95,
-  "transformers_version": "4.56.1"
+  "transformers_version": "4.55.4"
 }
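The sampling settings stored in generation_config.json (top_k 64, top_p 0.95, and the added max_length 32768) are picked up automatically by generate(). A short sketch, again with a placeholder repo id and an arbitrary prompt:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "lmassaron/<this-repo>"  # hypothetical placeholder

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

inputs = tokenizer("Explain GRPO in one sentence.", return_tensors="pt")
# do_sample=True applies the top_k/top_p values from generation_config.json
output = model.generate(**inputs, max_new_tokens=64, do_sample=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))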
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:412ca71266d94d6122ff4527cb50a1a536af21dbb23ed997e35038bd2cffb4eb
-size 536223056
+oid sha256:39a650d1cd96b76d14d905dbe78aa52a8123c37e78fdf9a9e8a34772c5c30efd
+size 2000078368