jasonacox committed on
Commit
2443821
·
verified ·
1 Parent(s): ec73b24

Upload nanochat-1.8B-midtrain model

Browse files
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
  *.pth filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
  *.safetensors filter=lfs diff=lfs merge=lfs -text
5
+ tokenizer/tokenizer.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: nanochat
4
+ tags:
5
+ - nanochat
6
+ - llm
7
+ - dgx-spark
8
+ - grace-blackwell
9
+ - from-scratch
10
+ language:
11
+ - en
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # nanochat-1.8B-midtrain
16
+
17
+ Midtrained model fine-tuned for conversational interactions. Trained on SmolTalk dataset with special tokens for multi-turn conversations.
18
+
19
+ ## Model Details
20
+
21
+ - **Model Type:** GPT-style transformer trained from scratch
22
+ - **Parameters:** ~1.9 billion
23
+ - **Training Phase:** midtrain
24
+ - **Architecture:** 20 layers, 1280 embedding dimension
25
+ - **Hardware:** NVIDIA DGX Spark (Grace Blackwell GB10)
26
+ - **Framework:** [NanoChat](https://github.com/karpathy/nanochat)
27
+ - **Training Precision:** BFloat16
28
+
29
+ ## Training Details
30
+
31
+ - **GPU:** NVIDIA Grace Blackwell GB10
32
+ - **Memory:** 128GB unified memory
33
+ - **CUDA:** 13.0
34
+ - **Optimization:** Muon optimizer for matrix parameters, AdamW for others
35
+ - **Checkpoint Step:** 000813
36
+
37
+ ## Usage
38
+
39
+ ### Prerequisites
40
+
41
+ ```bash
42
+ # Clone the NanoChat repository
43
+ git clone https://github.com/karpathy/nanochat.git
44
+ cd nanochat
45
+
46
+ # Install dependencies (requires CUDA)
47
+ uv sync --extra gpu
48
+
49
+ # Activate the virtual environment
50
+ source .venv/bin/activate
51
+ ```
52
+
53
+ ### Quick Test
54
+
55
+ Download and test this model from HuggingFace:
56
+
57
+ ```bash
58
+ # Clone the test script
59
+ wget https://raw.githubusercontent.com/jasonacox/dgx-spark/main/nanochat/hf_test.py
60
+
61
+ # Install dependencies
62
+ pip install huggingface_hub
63
+
64
+ # Run with this model
65
+ python hf_test.py --model jasonacox/nanochat-1.8B-midtrain
66
+ ```
67
+
68
+ ### Example Code
69
+
70
+ ```python
71
+ import sys
72
+ import os
73
+ import glob
74
+ from huggingface_hub import snapshot_download
75
+ import torch
76
+ from contextlib import nullcontext
77
+
78
+ # Download model from HuggingFace
79
+ print("Downloading model...")
80
+ model_path = snapshot_download(
81
+ repo_id="jasonacox/nanochat-1.8B-midtrain",
82
+ cache_dir=os.path.expanduser("~/.cache/nanochat/hf_downloads")
83
+ )
84
+
85
+ # Setup NanoChat (clone if needed)
86
+ nanochat_path = "nanochat"
87
+ if not os.path.exists(nanochat_path):
88
+ os.system("git clone https://github.com/karpathy/nanochat.git")
89
+ os.system("cd nanochat && uv sync --extra gpu")
90
+
91
+ sys.path.insert(0, nanochat_path)
92
+
93
+ from nanochat.checkpoint_manager import build_model
94
+ from nanochat.common import compute_init, autodetect_device_type
95
+ from nanochat.engine import Engine
96
+
97
+ # Initialize
98
+ device_type = autodetect_device_type()
99
+ _, _, _, _, device = compute_init(device_type)
100
+ ptdtype = torch.bfloat16
101
+ autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
102
+
103
+ # Load model
104
+ checkpoint_files = glob.glob(os.path.join(model_path, "model_*.pt"))
105
+ step = int(os.path.basename(checkpoint_files[0]).split("_")[-1].split(".")[0])
106
+ model, tokenizer, _ = build_model(model_path, step, device, phase="eval")
107
+ engine = Engine(model, tokenizer)
108
+
109
+ # Generate
110
+ prompt = "Hello, how are you?"
111
+ tokens = tokenizer.encode(prompt)
112
+ print(f"Prompt: {prompt}\nResponse: ", end="", flush=True)
113
+
114
+ with autocast_ctx:
115
+ for token_column, _ in engine.generate(tokens, num_samples=1, max_tokens=100, temperature=0.8, top_k=50):
116
+ print(tokenizer.decode([token_column[0]]), end="", flush=True)
117
+ print()
118
+ ```
119
+
120
+ ## Training Pipeline
121
+
122
+ This model was trained using the DGX Spark optimized training pipeline:
123
+
124
+ 1. **Pretraining:** Base language model on FineWeb-EDU dataset
125
+ 2. **Midtraining:** Fine-tuned on conversational data (SmolTalk)
126
+ 3. **SFT:** Supervised fine-tuning on curated conversations
127
+ 4. **RL:** Reinforcement learning with GRPO
128
+
129
+ ## Limitations
130
+
131
+ - This is a micro-model (1.9B parameters) - smaller than commercial LLMs
132
+ - May make factual errors or hallucinate
133
+ - Limited knowledge cutoff from training data
134
+ - Best suited for educational purposes and experimentation
135
+
136
+ ## Citation
137
+
138
+ ```bibtex
139
+ @misc{nanochat-1.8B,
140
+ author = {jasonacox},
141
+ title = {nanochat-1.8B-midtrain},
142
+ year = {2025},
143
+ publisher = {HuggingFace},
144
+ howpublished = {\url{https://huggingface.co/jasonacox/nanochat-1.8B-midtrain}}
145
+ }
146
+ ```
147
+
148
+ ## Acknowledgments
149
+
150
+ - Andrej Karpathy for [NanoChat](https://github.com/karpathy/nanochat)
151
+ - NVIDIA DGX Spark platform
152
+ - FineWeb-EDU and SmolTalk datasets
153
+
154
+ ## License
155
+
156
+ MIT License - Free to use for research and educational purposes
config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "nanochat",
3
+ "architecture": "gpt",
4
+ "n_layer": 20,
5
+ "n_head": 10,
6
+ "n_kv_head": 10,
7
+ "n_embd": 1280,
8
+ "vocab_size": 65536,
9
+ "sequence_len": 2048,
10
+ "phase": "midtrain",
11
+ "checkpoint_step": 813,
12
+ "torch_dtype": "bfloat16"
13
+ }
meta_000813.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 813,
3
+ "val_bpb": 0.3957736584182098,
4
+ "model_config": {
5
+ "sequence_len": 2048,
6
+ "vocab_size": 65536,
7
+ "n_layer": 20,
8
+ "n_head": 10,
9
+ "n_kv_head": 10,
10
+ "n_embd": 1280
11
+ },
12
+ "user_config": {
13
+ "run": "nanochat-midtrain",
14
+ "device_type": "",
15
+ "dtype": "bfloat16",
16
+ "num_iterations": -1,
17
+ "max_seq_len": 2048,
18
+ "device_batch_size": 32,
19
+ "unembedding_lr": 0.004,
20
+ "embedding_lr": 0.2,
21
+ "matrix_lr": 0.02,
22
+ "init_lr_frac": 1.0,
23
+ "weight_decay": 0.0,
24
+ "eval_every": 150,
25
+ "eval_tokens": 10485760,
26
+ "total_batch_size": 524288,
27
+ "dry_run": 0
28
+ }
29
+ }
model_000813.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:925e640763858501445eef1bfab5374efcb13d9858d7d0ff05caffe02854c3e7
3
+ size 2076230219
tokenizer/token_bytes.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae39c27aae519d14071efc95f9a558ba0b7ede47e7d83ad4f198422b44c5f70e
3
+ size 263721
tokenizer/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c060565a46fe83b49d99005acba796f2a630daa7970eb49f7513b89f9fb40e0
3
+ size 846208