KnutJaegersberg committed on Jan 3, 2024

Commit

b3c0032

1 Parent(s): ac691e9

Upload 91 files

Browse files

Files changed (44) hide show

quip-sharp/README.md +3 -4
quip-sharp/docs/index.html +51 -51
quip-sharp/docs/index.md +8 -8
quip-sharp/hfize_llama.py +27 -78
quip-sharp/lib/__pycache__/__init__.cpython-310.pyc +0 -0
quip-sharp/lib/codebook/__pycache__/__init__.cpython-310.pyc +0 -0
quip-sharp/lib/codebook/__pycache__/half_integer_4bit_1col.cpython-310.pyc +0 -0
quip-sharp/lib/codebook/__pycache__/latticed4.cpython-310.pyc +0 -0
quip-sharp/lib/codebook/__pycache__/latticee8_padded12.cpython-310.pyc +0 -0
quip-sharp/lib/codebook/latticee8_padded12.py +100 -129
quip-sharp/lib/linear/__pycache__/__init__.cpython-310.pyc +0 -0
quip-sharp/lib/linear/__pycache__/fused_quantized_linear.cpython-310.pyc +0 -0
quip-sharp/lib/linear/__pycache__/quantized_linear.cpython-310.pyc +0 -0
quip-sharp/lib/linear/fused_quantized_linear.py +22 -0
quip-sharp/lib/linear/quantized_linear.py +25 -16
quip-sharp/lib/utils/__pycache__/__init__.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/data_utils.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/lm_eval_adaptor.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/math_utils.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/matmul_had.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/matmul_kron.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/misc.cpython-310.pyc +0 -0
quip-sharp/lib/utils/__pycache__/unsafe_import.cpython-310.pyc +0 -0
quip-sharp/lib/utils/data_utils.py +1 -1
quip-sharp/lib/utils/unsafe_import.py +1 -3
quip-sharp/model/__pycache__/graph_wrapper.cpython-310.pyc +0 -0
quip-sharp/model/__pycache__/llama.cpython-310.pyc +0 -0
quip-sharp/model/__pycache__/mistral.cpython-310.pyc +0 -0
quip-sharp/model/__pycache__/version.cpython-310.pyc +0 -0
quip-sharp/model/llama.py +25 -55
quip-sharp/model/mistral.py +23 -54
quip-sharp/model/version.py +2 -2
quip-sharp/quantize_llama.py +3 -3
quip-sharp/quiptools/build/lib.linux-x86_64-cpython-310/quiptools_cuda.cpython-310-x86_64-linux-gnu.so +2 -2
quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_deps +0 -0
quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_log +3 -5
quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools.o +1 -1
quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_e8p_gemv.o +2 -2
quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o +2 -2
quip-sharp/quiptools/dist/quiptools_cuda-0.0.0-py3.10-linux-x86_64.egg +2 -2
quip-sharp/quiptools/quiptools_cuda.egg-info/SOURCES.txt +0 -5
quip-sharp/quiptools/quiptools_e8p_gemv.cu +501 -227
quip-sharp/quiptools/quiptools_wrapper.cpp +8 -3
quip-sharp/scripts/upload_hf.py +1 -0

quip-sharp/README.md CHANGED Viewed

@@ -10,7 +10,7 @@ We also provide a full codebase that allows users to quantize and deploy their o
 | OPTQ      | 3 bit     |   4.577   |   6.838   |   0.544   | **0.786** |
 | OPTQ      | 2 bit     |  109.820  |   62.692  |   0.253   |   0.505   |
 | QuIP      | 2 bit     |   5.574   |   8.268   |   0.544   |   0.751   |
-| **QuIP#** | **2 bit** | **4.156** | **6.545** | **0.595** |   0.785   |
 Quantization results on Llama 2 70B. QuIP# achieves near-native performance at 2 bits, outperforming all other presented baselines.
@@ -18,9 +18,8 @@ Quantization results on Llama 2 70B. QuIP# achieves near-native performance at 2
 ## News
-- We have "deprecated" the 2 bit D4 quantized models as they perform worse than 2 bit E8P models and are slower to run. The code to quantize and run D4 models is still in the codebase, but the D4 models have been removed from HF and we are no longer actively supporting them.
-- We recently added 2 and 4 bit quantized versions of [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B). See the Model Zoo section for more details.
-- **The 4 bit models have been replaced by new bit-packed models that end with the `-Packed` suffix. The old models have been deprecated, removed, and do not work with the current code (and vice versa). Make sure to pull the latest code to run the 4 bit models.**
 ## Installation

 | OPTQ      | 3 bit     |   4.577   |   6.838   |   0.544   | **0.786** |
 | OPTQ      | 2 bit     |  109.820  |   62.692  |   0.253   |   0.505   |
 | QuIP      | 2 bit     |   5.574   |   8.268   |   0.544   |   0.751   |
+| **QuIP#** | **2 bit** | **4.159** | **6.529** | **0.595** | **0.786** |
 Quantization results on Llama 2 70B. QuIP# achieves near-native performance at 2 bits, outperforming all other presented baselines.
 ## News
+- We merged in a faster E8P kernel that (with CUDA graphs) is around twice as fast as before. Make sure to pull the latest code and models and recompile `quiptools` to get the faster kernel. As a reminder, `hf.generate()` does not work with CUDA graphs so the generation speed in `interactive_gen.py` is not representative of reality.
+- We fixed a duplicated entry in the E8P codebook and updated the result tables.
 ## Installation

quip-sharp/docs/index.html CHANGED Viewed

@@ -283,10 +283,10 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="odd">
 <td style="text-align: center;"><strong>QuIP#</strong></td>
 <td style="text-align: center;"><strong>2 bit</strong></td>
-<td style="text-align: center;"><strong>4.156</strong></td>
-<td style="text-align: center;"><strong>6.545</strong></td>
 <td style="text-align: center;"><strong>0.595</strong></td>
-<td style="text-align: center;">0.785</td>
 </tr>
 </tbody>
 </table>
@@ -688,13 +688,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">2-70B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">6.535</td>
-<td style="text-align: center;">4.156</td>
-<td style="text-align: center;">0.469</td>
 <td style="text-align: center;">0.595</td>
-<td style="text-align: center;">0.795</td>
-<td style="text-align: center;">0.785</td>
-<td style="text-align: center;">0.740</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">2-13B</td>
@@ -710,13 +710,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">2-13B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">8.769</td>
-<td style="text-align: center;">6.003</td>
-<td style="text-align: center;">0.381</td>
-<td style="text-align: center;">0.502</td>
-<td style="text-align: center;">0.643</td>
-<td style="text-align: center;">0.751</td>
-<td style="text-align: center;">0.637</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">2-7B</td>
@@ -732,13 +732,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">2-7B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">12.208</td>
-<td style="text-align: center;">8.201</td>
-<td style="text-align: center;">0.346</td>
-<td style="text-align: center;">0.454</td>
-<td style="text-align: center;">0.647</td>
-<td style="text-align: center;">0.726</td>
-<td style="text-align: center;">0.618</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-65b</td>
@@ -754,13 +754,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">1-65b</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">6.749</td>
-<td style="text-align: center;">4.573</td>
-<td style="text-align: center;">0.435</td>
-<td style="text-align: center;">0.566</td>
-<td style="text-align: center;">0.831</td>
-<td style="text-align: center;">0.792</td>
-<td style="text-align: center;">0.756</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-30B</td>
@@ -776,13 +776,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">1-30B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">7.465</td>
-<td style="text-align: center;">5.311</td>
-<td style="text-align: center;">0.422</td>
-<td style="text-align: center;">0.537</td>
-<td style="text-align: center;">0.659</td>
-<td style="text-align: center;">0.776</td>
-<td style="text-align: center;">0.714</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-13B</td>
@@ -798,13 +798,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">1-13B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">8.426</td>
-<td style="text-align: center;">6.353</td>
-<td style="text-align: center;">0.382</td>
-<td style="text-align: center;">0.537</td>
-<td style="text-align: center;">0.665</td>
-<td style="text-align: center;">0.757</td>
-<td style="text-align: center;">0.687</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-7B</td>
@@ -820,13 +820,13 @@ class="math inline">\(\uparrow\)</span></th>
 <tr class="even">
 <td style="text-align: center;">1-7B</td>
 <td style="text-align: center;">QuIP#</td>
-<td style="text-align: center;">10.927</td>
-<td style="text-align: center;">8.146</td>
-<td style="text-align: center;">0.347</td>
-<td style="text-align: center;">0.471</td>
-<td style="text-align: center;">0.673</td>
-<td style="text-align: center;">0.724</td>
-<td style="text-align: center;">0.621</td>
 </tr>
 </tbody>
 </table>

 <tr class="odd">
 <td style="text-align: center;"><strong>QuIP#</strong></td>
 <td style="text-align: center;"><strong>2 bit</strong></td>
+<td style="text-align: center;"><strong>4.159</strong></td>
+<td style="text-align: center;"><strong>6.529</strong></td>
 <td style="text-align: center;"><strong>0.595</strong></td>
+<td style="text-align: center;">0.786</td>
 </tr>
 </tbody>
 </table>
 <tr class="even">
 <td style="text-align: center;">2-70B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">6.529</td>
+<td style="text-align: center;">4.158</td>
+<td style="text-align: center;">0.472</td>
 <td style="text-align: center;">0.595</td>
+<td style="text-align: center;">0.791</td>
+<td style="text-align: center;">0.786</td>
+<td style="text-align: center;">0.742</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">2-13B</td>
 <tr class="even">
 <td style="text-align: center;">2-13B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">8.755</td>
+<td style="text-align: center;">6.058</td>
+<td style="text-align: center;">0.371</td>
+<td style="text-align: center;">0.501</td>
+<td style="text-align: center;">0.665</td>
+<td style="text-align: center;">0.757</td>
+<td style="text-align: center;">0.636</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">2-7B</td>
 <tr class="even">
 <td style="text-align: center;">2-7B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">12.062</td>
+<td style="text-align: center;">8.224</td>
+<td style="text-align: center;">0.325</td>
+<td style="text-align: center;">0.428</td>
+<td style="text-align: center;">0.623</td>
+<td style="text-align: center;">0.712</td>
+<td style="text-align: center;">0.624</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-65b</td>
 <tr class="even">
 <td style="text-align: center;">1-65b</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">6.744</td>
+<td style="text-align: center;">4.566</td>
+<td style="text-align: center;">0.436</td>
+<td style="text-align: center;">0.569</td>
+<td style="text-align: center;">0.817</td>
+<td style="text-align: center;">0.805</td>
+<td style="text-align: center;">0.736</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-30B</td>
 <tr class="even">
 <td style="text-align: center;">1-30B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">7.471</td>
+<td style="text-align: center;">5.317</td>
+<td style="text-align: center;">0.429</td>
+<td style="text-align: center;">0.545</td>
+<td style="text-align: center;">0.669</td>
+<td style="text-align: center;">0.779</td>
+<td style="text-align: center;">0.718</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-13B</td>
 <tr class="even">
 <td style="text-align: center;">1-13B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">8.425</td>
+<td style="text-align: center;">6.381</td>
+<td style="text-align: center;">0.387</td>
+<td style="text-align: center;">0.536</td>
+<td style="text-align: center;">0.647</td>
+<td style="text-align: center;">0.750</td>
+<td style="text-align: center;">0.669</td>
 </tr>
 <tr class="odd">
 <td style="text-align: center;">1-7B</td>
 <tr class="even">
 <td style="text-align: center;">1-7B</td>
 <td style="text-align: center;">QuIP#</td>
+<td style="text-align: center;">10.970</td>
+<td style="text-align: center;">8.286</td>
+<td style="text-align: center;">0.352</td>
+<td style="text-align: center;">0.464</td>
+<td style="text-align: center;">0.647</td>
+<td style="text-align: center;">0.720</td>
+<td style="text-align: center;">0.624</td>
 </tr>
 </tbody>
 </table>

quip-sharp/docs/index.md CHANGED Viewed

@@ -52,7 +52,7 @@ These two methods allow QuIP# to significantly close the gap between 2 bit quant
 | OPTQ      | 3 bit     |   4.577   |   6.838   |   0.544   | **0.786** |
 | OPTQ      | 2 bit     |  109.820  |   62.692  |   0.253   |   0.505   |
 | QuIP      | 2 bit     |   5.574   |   8.268   |   0.544   |   0.751   |
-| **QuIP#** | **2 bit** | **4.156** | **6.545** | **0.595** |   0.785   |
 :Quantization results on Llama 2 70B. QuIP# achieves near-native performance at 2 bits, outperforming all other presented baselines.
@@ -237,18 +237,18 @@ Additional results are available [here](https://docs.google.com/spreadsheets/d/1
 |   Model   |   Method  | C4 $\downarrow$ | Wiki $\downarrow$ | ArcC $\uparrow$ | ArcE $\uparrow$ | BoolQ $\uparrow$  | PiQA $\uparrow$ | WinoGrande $\uparrow$ |
 |:---------:|:---------:|:---------------:|:-----------------:|:---------------:|:---------------:|:-------------------:|:---------------:|:-------------------------------:|
 |   2-70B   |    fp16   |      5.533      |       3.120       |      0.480      |      0.597      |       0.766       |      0.809      |         0.768         |
-| 2-70B | QuIP# |    6.535    |     4.156     |    0.469    |    0.595    |     0.795     |    0.785    |       0.740       |
 |   2-13B   |    fp16   |      6.520      |       4.574       |      0.443      |      0.580      |       0.690       |      0.790      |         0.699         |
-| 2-13B | QuIP# |    8.769    |     6.003     |    0.381    |    0.502    |     0.643     |    0.751    |       0.637       |
 |    2-7B   |    fp16   |      7.036      |       5.116       |      0.406      |      0.535      |       0.710       |      0.769      |         0.670         |
-|  2-7B | QuIP# |    12.208   |     8.201     |    0.346    |    0.454    |     0.647     |    0.726    |       0.618       |
 |   1-65b   |    fp16   |      5.811      |       3.532       |      0.463      |      0.588      |       0.823       |      0.809      |         0.771         |
-| 1-65b | QuIP# |    6.749    |     4.573     |    0.435    |    0.566    |     0.831     |    0.792    |       0.756       |
 |   1-30B   |    fp16   |      6.130      |       4.101       |      0.453      |      0.590      |       0.684       |      0.801      |         0.728         |
-| 1-30B | QuIP# |    7.465    |     5.311     |    0.422    |    0.537    |     0.659     |    0.776    |       0.714       |
 |   1-13B   |    fp16   |      6.798      |       5.091       |      0.444      |      0.599      |       0.684       |      0.792      |         0.701         |
-| 1-13B | QuIP# |    8.426    |     6.353     |    0.382    |    0.537    |     0.665     |    0.757    |       0.687       |
 |    1-7B   |    fp16   |      7.343      |       5.677       |      0.415      |      0.525      |       0.731       |      0.774      |         0.670         |
-|  1-7B | QuIP# |    10.927   |     8.146     |    0.347    |    0.471    |     0.673     |    0.724    |       0.621       |
 :QuIP# results across all Llama 1 and 2 models. QuIP# achieves near-native performance at 2 bits on language modeling (C4, Wiki) and zero shot (ArcC, ArcE, BoolQ, PiQA, WinoGrande) tasks.
 </div>

 | OPTQ      | 3 bit     |   4.577   |   6.838   |   0.544   | **0.786** |
 | OPTQ      | 2 bit     |  109.820  |   62.692  |   0.253   |   0.505   |
 | QuIP      | 2 bit     |   5.574   |   8.268   |   0.544   |   0.751   |
+| **QuIP#** | **2 bit** | **4.159** | **6.529** | **0.595** |   0.786   |
 :Quantization results on Llama 2 70B. QuIP# achieves near-native performance at 2 bits, outperforming all other presented baselines.
 |   Model   |   Method  | C4 $\downarrow$ | Wiki $\downarrow$ | ArcC $\uparrow$ | ArcE $\uparrow$ | BoolQ $\uparrow$  | PiQA $\uparrow$ | WinoGrande $\uparrow$ |
 |:---------:|:---------:|:---------------:|:-----------------:|:---------------:|:---------------:|:-------------------:|:---------------:|:-------------------------------:|
 |   2-70B   |    fp16   |      5.533      |       3.120       |      0.480      |      0.597      |       0.766       |      0.809      |         0.768         |
+| 2-70B | QuIP# |    6.529    |     4.158     |    0.472    |    0.595    |     0.791     |    0.786    |       0.742       |
 |   2-13B   |    fp16   |      6.520      |       4.574       |      0.443      |      0.580      |       0.690       |      0.790      |         0.699         |
+| 2-13B | QuIP# |    8.755   |     6.058     |    0.371    |    0.501    |     0.665     |    0.757    |       0.636       |
 |    2-7B   |    fp16   |      7.036      |       5.116       |      0.406      |      0.535      |       0.710       |      0.769      |         0.670         |
+|  2-7B | QuIP# |    12.062   |     8.224     |    0.325    |    0.428    |     0.623     |    0.712    |       0.624       |
 |   1-65b   |    fp16   |      5.811      |       3.532       |      0.463      |      0.588      |       0.823       |      0.809      |         0.771         |
+| 1-65b | QuIP# |    6.744    |     4.566     |    0.436    |    0.569    |     0.817     |    0.805    |       0.736       |
 |   1-30B   |    fp16   |      6.130      |       4.101       |      0.453      |      0.590      |       0.684       |      0.801      |         0.728         |
+| 1-30B | QuIP# |    7.471    |     5.317     |    0.429    |    0.545    |     0.669     |    0.779    |       0.718       |
 |   1-13B   |    fp16   |      6.798      |       5.091       |      0.444      |      0.599      |       0.684       |      0.792      |         0.701         |
+| 1-13B | QuIP# |    8.425    |     6.381     |    0.387    |    0.536   |     0.647     |    0.750    |       0.669       |
 |    1-7B   |    fp16   |      7.343      |       5.677       |      0.415      |      0.525      |       0.731       |      0.774      |         0.670         |
+|  1-7B | QuIP# |    10.970   |     8.286     |    0.352    |    0.464    |     0.647     |    0.720    |       0.624       |
 :QuIP# results across all Llama 1 and 2 models. QuIP# achieves near-native performance at 2 bits on language modeling (C4, Wiki) and zero shot (ArcC, ArcE, BoolQ, PiQA, WinoGrande) tasks.
 </div>

quip-sharp/hfize_llama.py CHANGED Viewed

@@ -5,7 +5,6 @@ import torch
 from transformers import AutoTokenizer
 from model.version import MODEL_VERSION
 from model.llama import LlamaForCausalLM as llama_fuse
-from model.llama_nofuse import LlamaForCausalLM as llama_nofuse
 from model.mistral import MistralForCausalLM
 from lib import codebook
 from lib.utils.unsafe_import import model_from_hf_path
@@ -32,7 +31,6 @@ def unpack_quip(module, saved_layer, codebook_id, codesz):
         module.B.copy_(saved_layer['B'])
     module.SU.copy_(saved_layer['SU'])
     module.SV.copy_(saved_layer['SV'])
-    module.Wscale.copy_(saved_layer['Wscale'])
     if module.rescale_WH:
         module.scaleWH.copy_(saved_layer['scaleWH'])
@@ -50,11 +48,10 @@ def main(args):
     tokenizer = AutoTokenizer.from_pretrained(model_config._name_or_path)
     model_type = model_config.model_type
-    fused = model_config.quip_params.get('fused', True)
     model_config.quip_params['model_version'] = MODEL_VERSION
     if model_type == 'llama':
-        model_cls = llama_fuse if fused else llama_nofuse
     elif model_type == 'mistral':
         model_cls = MistralForCausalLM
     else:
@@ -71,80 +68,32 @@ def main(args):
         layer = model.model.layers[ii]
         cpu = torch.device('cpu')
-        if fused:
-            glog.info(f'loading layer {ii} qkv')
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_qkv.pt', map_location=cpu)
-            layer.self_attn.q_scale.copy_(saved_layer['W_q_scale'])
-            layer.self_attn.k_scale.copy_(saved_layer['W_k_scale'])
-            layer.self_attn.v_scale.copy_(saved_layer['W_v_scale'])
-            unpack_quip(layer.self_attn.qkv_proj, saved_layer, codebook_id, codesz)
-            glog.info(f'loading layer {ii} up')
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_up.pt', map_location=cpu)
-            layer.mlp.up_scale.copy_(saved_layer['W_up_scale'])
-            layer.mlp.gate_scale.copy_(saved_layer['W_gate_scale'])
-            unpack_quip(layer.mlp.upgate_proj, saved_layer, codebook_id, codesz)
-            glog.info(f'loading layer {ii} o')
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_o.pt', map_location=cpu)
-            layer.self_attn.o_scale.copy_(saved_layer['W_o_scale'])
-            unpack_quip(layer.self_attn.o_proj, saved_layer, codebook_id, codesz)
-            glog.info(f'loading layer {ii} down')
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_down.pt', map_location=cpu)
-            layer.mlp.down_scale.copy_(saved_layer['W_down_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.mlp.down_proj.ocs_dupe_inds.copy_(torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.mlp.down_proj, saved_layer, codebook_id, codesz)
-        else:
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_q.pt', map_location=cpu)
-            layer.self_attn.q_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.self_attn.q_proj.ocs_dupe_inds.copy_(
-                    torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.self_attn.q_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_k.pt', map_location=cpu)
-            layer.self_attn.k_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.self_attn.k_proj.ocs_dupe_inds.copy_(
-                    torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.self_attn.k_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_v.pt', map_location=cpu)
-            layer.self_attn.v_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.self_attn.v_proj.ocs_dupe_inds.copy_(
-                    torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.self_attn.v_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_o.pt', map_location=cpu)
-            layer.self_attn.o_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.self_attn.o_proj.ocs_dupe_inds.copy_(
-                    torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.self_attn.o_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_up.pt', map_location=cpu)
-            layer.mlp.up_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.mlp.up_proj.ocs_dupe_inds.copy_(torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.mlp.up_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_gate.pt', map_location=cpu)
-            layer.mlp.gate_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.mlp.gate_proj.ocs_dupe_inds.copy_(torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.mlp.gate_proj, saved_layer, codebook_id, codesz)
-            saved_layer = torch.load(f'{args.quantized_path}/{ii}_down.pt', map_location=cpu)
-            layer.mlp.down_scale.copy_(saved_layer['W_scale'])
-            if model_config.quip_params['outlier_channel_split']:
-                layer.mlp.down_proj.ocs_dupe_inds.copy_(torch.tensor(saved_layer['ocs_dupe_inds']))
-            unpack_quip(layer.mlp.down_proj, saved_layer, codebook_id, codesz)
     glog.info(f'saving model...')
     model.save_pretrained(args.hf_output_path, safe_serialization=True)

 from transformers import AutoTokenizer
 from model.version import MODEL_VERSION
 from model.llama import LlamaForCausalLM as llama_fuse
 from model.mistral import MistralForCausalLM
 from lib import codebook
 from lib.utils.unsafe_import import model_from_hf_path
         module.B.copy_(saved_layer['B'])
     module.SU.copy_(saved_layer['SU'])
     module.SV.copy_(saved_layer['SV'])
     if module.rescale_WH:
         module.scaleWH.copy_(saved_layer['scaleWH'])
     tokenizer = AutoTokenizer.from_pretrained(model_config._name_or_path)
     model_type = model_config.model_type
     model_config.quip_params['model_version'] = MODEL_VERSION
     if model_type == 'llama':
+        model_cls = llama_fuse
     elif model_type == 'mistral':
         model_cls = MistralForCausalLM
     else:
         layer = model.model.layers[ii]
         cpu = torch.device('cpu')
+        glog.info(f'loading layer {ii} qkv')
+        saved_layer = torch.load(f'{args.quantized_path}/{ii}_qkv.pt', map_location=cpu)
+        layer.self_attn.qkv_proj.fuse_scales[0].copy_(saved_layer['W_q_scale'])
+        layer.self_attn.qkv_proj.fuse_scales[1].copy_(saved_layer['W_k_scale'])
+        layer.self_attn.qkv_proj.fuse_scales[2].copy_(saved_layer['W_v_scale'])
+        layer.self_attn.qkv_proj.Wscale.copy_(saved_layer['Wscale'])
+        unpack_quip(layer.self_attn.qkv_proj, saved_layer, codebook_id, codesz)
+        glog.info(f'loading layer {ii} up')
+        saved_layer = torch.load(f'{args.quantized_path}/{ii}_up.pt', map_location=cpu)
+        layer.mlp.upgate_proj.fuse_scales[0].copy_(saved_layer['W_up_scale'])
+        layer.mlp.upgate_proj.fuse_scales[1].copy_(saved_layer['W_gate_scale'])
+        layer.mlp.upgate_proj.Wscale.copy_(saved_layer['Wscale'])
+        unpack_quip(layer.mlp.upgate_proj, saved_layer, codebook_id, codesz)
+        glog.info(f'loading layer {ii} o')
+        saved_layer = torch.load(f'{args.quantized_path}/{ii}_o.pt', map_location=cpu)
+        layer.self_attn.o_proj.Wscale.copy_(saved_layer['W_o_scale'] * saved_layer['Wscale'])
+        unpack_quip(layer.self_attn.o_proj, saved_layer, codebook_id, codesz)
+        glog.info(f'loading layer {ii} down')
+        saved_layer = torch.load(f'{args.quantized_path}/{ii}_down.pt', map_location=cpu)
+        layer.mlp.down_proj.Wscale.copy_(saved_layer['W_down_scale'] * saved_layer['Wscale'])
+        if model_config.quip_params['outlier_channel_split']:
+            layer.mlp.down_proj.ocs_dupe_inds.copy_(torch.tensor(saved_layer['ocs_dupe_inds']))
+        unpack_quip(layer.mlp.down_proj, saved_layer, codebook_id, codesz)
     glog.info(f'saving model...')
     model.save_pretrained(args.hf_output_path, safe_serialization=True)

quip-sharp/lib/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/__pycache__/__init__.cpython-310.pyc and b/quip-sharp/lib/__pycache__/__init__.cpython-310.pyc differ

quip-sharp/lib/codebook/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/codebook/__pycache__/__init__.cpython-310.pyc and b/quip-sharp/lib/codebook/__pycache__/__init__.cpython-310.pyc differ

quip-sharp/lib/codebook/__pycache__/half_integer_4bit_1col.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/codebook/__pycache__/half_integer_4bit_1col.cpython-310.pyc and b/quip-sharp/lib/codebook/__pycache__/half_integer_4bit_1col.cpython-310.pyc differ

quip-sharp/lib/codebook/__pycache__/latticed4.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/codebook/__pycache__/latticed4.cpython-310.pyc and b/quip-sharp/lib/codebook/__pycache__/latticed4.cpython-310.pyc differ

quip-sharp/lib/codebook/__pycache__/latticee8_padded12.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/codebook/__pycache__/latticee8_padded12.cpython-310.pyc and b/quip-sharp/lib/codebook/__pycache__/latticee8_padded12.cpython-310.pyc differ

quip-sharp/lib/codebook/latticee8_padded12.py CHANGED Viewed

@@ -6,7 +6,6 @@ The total codebook is all 2^7 flips of these 256 entries (2^15) +- 1/4
 which makes 2^16 entries.
 This corresponds to a subset of E8 + 1/4
 """
 import torch
 import math
 from torch import nn
@@ -22,19 +21,12 @@ _INT_MAP = 2**(torch.arange(_E8P_CODESZ).flip(0))
 def int2mask(i, int_map):
     return ((i & int_map) > 0).int()
 def mask2int(mask, int_map):
     return (int_map.unsqueeze(0) * mask.int()).sum(dim=-1)
-def get_abs_grid():
-    intr = torch.arange(-4, 4)
-    d8 = torch.cartesian_prod(*[intr] * _E8P_CODESZ).float() + 1 / 2
-    d8m2 = (d8.sum(dim=-1) % 2 == 0)
-    d8n = d8.norm(dim=-1)**2 <= 10
-    d8abs = torch.unique(d8[sorted(torch.where(d8m2 * d8n)[0])].abs(), dim=0)
-    norm12 = torch.tensor([
         [3, 1, 1, 1, 3, 3, 3, 3],
         [1, 3, 1, 1, 3, 3, 3, 3],
         [1, 1, 3, 1, 3, 3, 3, 3],
@@ -62,82 +54,81 @@ def get_abs_grid():
         [1, 3, 3, 3, 1, 3, 3, 1],
         [1, 3, 3, 3, 3, 1, 1, 3],
         [1, 3, 3, 3, 1, 3, 1, 3],
-        [1, 3, 3, 3, 1, 1, 3, 3],
         [3, 3, 1, 1, 3, 3, 3, 1],
     ]) / 2
     return torch.concat([d8abs, norm12], dim=0)
-def get_full_grid(abs_grid):
-    """
-    idx format:
-        - first 8 bits = which of the 256 entries in the abs grid
-        - next 7 bits = which of the right 7 dims to negate (8th can be inferred)
-        - last bit = +1/4 if true else -1/4
-    """
-    is_even_flips = abs_grid.sum(dim=-1) % 2 == 0
-    abs_idxs = torch.arange(len(abs_grid)) << _E8P_CODESZ
-    entries = [[], []]
-    idxs = [[], []]
-    for i in range(2**(_E8P_CODESZ - 1)):
-        mask = int2mask(i, _INT_MAP)
-        mask_even = (mask.sum(dim=-1) % 2 == 0)
-        mask = mask.unsqueeze(0).repeat(len(abs_grid), 1)
-        mask[:, 0] = mask_even != is_even_flips
-        mask = 1 - 2 * mask
-        entries[0].append(abs_grid * mask + 1 / 4)
-        idxs[0].append(abs_idxs + (i << 1) + 1)
-        entries[1].append(abs_grid * mask - 1 / 4)
-        idxs[1].append(abs_idxs + (i << 1))
-    for i in range(2):
-        entries[i] = torch.concat(entries[i], dim=0)
-        idxs[i] = torch.concat(idxs[i], dim=0)
-    entries = torch.concat(entries, dim=0)
-    idxs = torch.concat(idxs, dim=0)
-    return entries, idxs
-_E8P_ABS_CACHED = get_abs_grid()
-_E8P_GRID, _E8P_GRID_IDX = get_full_grid(_E8P_ABS_CACHED)
 class E8P12_codebook(nn.Module):
     def __init__(self, inference=False):
         super(E8P12_codebook, self).__init__()
-        self.opt_scale = 1  #.03#/1.09
         self.codesz = _E8P_CODESZ
-        self.idx_dtype = torch.int16
-        self.idx_offset = -2**15
-        self.packsz = 1
         self.pack_out = False
-        self.version = 0
-        self.register_buffer('grid_abs', _E8P_ABS_CACHED)
-        self.register_buffer('grid_abs_even', self.grid_abs.sum(dim=-1) % 2 == 0)
         if not inference:
-            self.register_buffer('int_map', _INT_MAP)
             self.register_buffer('grid', _E8P_GRID)
-            self.register_buffer('grid_idx_map',
-                                 (_E8P_GRID_IDX + self.idx_offset).to(self.idx_dtype))
-            idx_lut = torch.zeros(_E8P_GRID_IDX.shape).int()
-            idx_lut[_E8P_GRID_IDX] = torch.arange(len(_E8P_GRID_IDX)).int()
-            self.register_buffer('grid_idx_inv', idx_lut)
-            self.register_buffer('grid_norm', torch.diag(self.grid @ self.grid.T))
-            grid_part = self.grid[:len(self.grid) // 2] - 1 / 4
-            idxs = torch.where(
-                ((grid_part[:, 1:] < 0).sum(dim=-1) <= 1) * \
-                (grid_part[:, 1:].min(dim=-1).values >= -0.5)
-            )[0]
-            grid_part = grid_part[idxs]
-            self.register_buffer('grid_part', grid_part)
-            self.register_buffer('grid_part_norm', torch.diag(grid_part @ grid_part.T))
-            allcombo_idx, idx_map = self.iterate_mask()
-            self.register_buffer('allcombo_idx', allcombo_idx)
-            self.register_buffer('idx_map', idx_map)
             '''
             self.to('cuda')
             samples = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(8), torch.eye(8)).rsample([2000000]).cuda()
@@ -146,60 +137,44 @@ class E8P12_codebook(nn.Module):
             exit()
             '''
-    def iterate_mask(self, device=0):
-        flips = torch.stack([((torch.tensor([i]) & self.int_map) > 0).int()
-                             for i in range(2**_E8P_CODESZ)]).to(device)
-        raw_idx = torch.where(flips.sum(dim=-1) % 2 == 0)[0]
-        flips = 1 - 2 * flips[raw_idx]
-        idx_map = torch.zeros(2**_E8P_CODESZ, dtype=torch.int32)
-        for i in range(len(raw_idx)):
-            idx_map[raw_idx[i]] = i
-        allcombo = flips.unsqueeze(1) * self.grid_part.unsqueeze(0).to(device)
-        allcombo_idx = torch.zeros(allcombo.shape[0:2]).int()
-        for i in range(len(allcombo)):
-            allcombo_idx[i] = self.round(allcombo[i], self.grid.to(device),
-                                         self.grid_norm.to(device))[1]
-        return allcombo_idx.cpu(), idx_map.cpu()
     def round(self, X, grid, grid_norm):
         """Select, for each row of ``X``, the best-scoring row of ``grid``.

         The score of a grid row ``g`` is ``2 * <x, g> - grid_norm[g]``. When
         ``grid_norm`` holds the squared norms of the grid rows, maximising this
         score is the same as minimising the Euclidean distance ``||x - g||``.

         Args:
             X: tensor whose last dimension must equal ``self.codesz``.
             grid: candidate points, indexed along dim 0.
             grid_norm: per-candidate offset subtracted from the score.

         Returns:
             ``(grid[idx], idx)`` where ``idx`` is the argmax over candidates.
         """
         assert X.shape[-1] == self.codesz
         # Same evaluation order as the usual one-liner: (2 * X) @ grid.T - grid_norm.
         scores = 2 * X @ grid.T - grid_norm
         nearest = scores.argmax(-1)
         return grid[nearest], nearest
-    def fast_quantize_part(self, X):
-        X_part = torch.abs(X)
-        X_odd = torch.where((X < 0).sum(dim=-1) % 2 != 0)[0]
-        X_part[X_odd, 0] = -X_part[X_odd, 0]
-        mask = 1 - 2 * (X < 0).to(torch.float32)
-        mask[X_odd, 0] = -mask[X_odd, 0]
-        roundout, Xqidx = self.round(X_part, self.grid_part, self.grid_part_norm)
-        vals = roundout * mask
-        real_idx = self.allcombo_idx[self.idx_map[mask2int((1 - mask) / 2, self.int_map)], Xqidx]
-        err = (X - vals).norm(dim=-1)
-        return vals, real_idx, err
     def quantize(self, X, return_idx=True):
-        X_plus = X + 1 / 4  # quantize X to D8^ - 1/4
-        X_minus = X - 1 / 4  # quantize X to D8^ + 1/4
-        plus_vals, plus_idx, plus_err = self.fast_quantize_part(X_plus)
-        minus_vals, minus_idx, minus_err = self.fast_quantize_part(X_minus)
-        plus_idx = plus_idx + 2**15
-        which = plus_err < minus_err
-        final_vals = torch.where(which.unsqueeze(-1), plus_vals - 1 / 4, minus_vals + 1 / 4)
         if return_idx:
-            final_idxs = self.grid_idx_map[torch.where(which, plus_idx, minus_idx)]
             return final_vals, final_idxs
         return final_vals
-    def maybe_pack_idxs(self, idxs):
-        return idxs
     def by_idxs(self, idxs, **kwargs):
-        return self.grid[self.grid_idx_inv[idxs.int() - self.idx_offset]]
 class QuantizedE8P12Linear(nn.Module):
@@ -207,10 +182,6 @@ class QuantizedE8P12Linear(nn.Module):
     def __init__(self, device):
         super().__init__()
         self.codebook = E8P12_codebook(inference=True).to(torch.float16).to(device)
-        self.codebook_matvec = torch.zeros((256, ), dtype=torch.int64, device=device)
-        for i in range(8):
-            chunk = (self.codebook.grid_abs[:, i] * 4).to(torch.int64)
-            self.codebook_matvec |= chunk << (i * 8)
     def forward(self,
                 input,
@@ -228,9 +199,9 @@ class QuantizedE8P12Linear(nn.Module):
                 rescale_WH=False,
                 scaleWH=None,
                 **kwargs):
-        (m, n) = Qidxs.shape
-        x = input.view(-1, n * _E8P_CODESZ).to(torch.float32)
         if rescale_WH:
             x /= scaleWH
         x = x * SU
@@ -240,17 +211,17 @@ class QuantizedE8P12Linear(nn.Module):
             Bx = x @ B.t().to(torch.float32)
             ABx = Bx @ A.t().to(torch.float32)
-        # TODO: find the optimal threshold
-        if x.size(0) < 6:
-            x = quiptools_cuda.decode_matmul_e8p(x, Qidxs - 0x8000,
-                                                 self.codebook_matvec).to(torch.float32)
         else:
-            W_decompressed = torch.zeros(m,
-                                         n * _E8P_CODESZ,
-                                         device=Qidxs.device,
-                                         dtype=torch.float16)
-            quiptools_cuda.decompress_e8p_origorder(Qidxs, self.codebook.grid_abs,
-                                                    self.codebook.grid_abs_even, W_decompressed)
             x = (x.to(torch.float16) @ W_decompressed.T).to(torch.float32)
         x *= Wscale

 which makes 2^16 entries.
 This corresponds to a subset of E8 + 1/4
 """
 import torch
 import math
 from torch import nn
 def int2mask(i, int_map):
     """Report which entries of ``int_map`` share a set bit with ``i``.

     Computes ``i & int_map`` elementwise and returns an int tensor holding 1
     where that result is positive and 0 elsewhere.

     NOTE(review): presumably every entry of ``int_map`` is a single-bit value,
     which would make the output the bit pattern of ``i`` - confirm against the
     place where the map is built.
     """
     shared_bits = i & int_map
     return (shared_bits > 0).int()
 def mask2int(mask, int_map):
     """Collapse a mask into one integer per row using ``int_map`` as weights.

     ``mask`` is cast with ``.int()``, multiplied elementwise by ``int_map``
     (given a leading axis so it broadcasts over rows) and summed along the
     last dimension.
     """
     weights = int_map.unsqueeze(0)
     weighted = weights * mask.int()
     return weighted.sum(dim=-1)
+def get_norm12():
+    # 29 elements of norm 12 in E8 + 1/4
+    return torch.tensor([
         [3, 1, 1, 1, 3, 3, 3, 3],
         [1, 3, 1, 1, 3, 3, 3, 3],
         [1, 1, 3, 1, 3, 3, 3, 3],
         [1, 3, 3, 3, 1, 3, 3, 1],
         [1, 3, 3, 3, 3, 1, 1, 3],
         [1, 3, 3, 3, 1, 3, 1, 3],
+        [1, 1, 3, 3, 1, 3, 3, 3],
         [3, 3, 1, 1, 3, 3, 3, 1],
     ]) / 2
+def get_packed_abs_grid():
+    intr = torch.arange(-4, 4)
+    d8 = torch.cartesian_prod(*[intr] * 8).float() + 1 / 2
+    d8m2 = (d8.sum(dim=-1) % 2 == 0)
+    d8n = d8.norm(dim=-1)**2 <= 10
+    d8abs = torch.unique(d8[sorted(torch.where(d8m2 * d8n)[0])].abs(), dim=0)
+    norm12 = get_norm12()
+    cba = torch.concat([d8abs, norm12], dim=0)
+    cba = cba[:, [0, 2, 4, 6, 1, 3, 5, 7]]
+    cba[:,7] *= (1 - 2 * (cba.sum(1) % 2))
+    cba = cba * 2 + 8
+    cba = cba.to(torch.int32)
+    acc = cba[:,0]
+    for i in range(7):
+        acc = acc | (cba[:,(i+1)] << ((i+1)*4))
+    return acc
+def get_abs_grid():
+    intr = torch.arange(-4, 4)
+    d8 = torch.cartesian_prod(*[intr] * _E8P_CODESZ).float() + 1 / 2
+    d8m2 = (d8.sum(dim=-1) % 2 == 0)
+    d8n = d8.norm(dim=-1)**2 <= 10
+    d8abs = torch.unique(d8[sorted(torch.where(d8m2 * d8n)[0])].abs(), dim=0)
+    norm12 = get_norm12()
     return torch.concat([d8abs, norm12], dim=0)
+def get_full_grid(packed_abs_grid):
+    synth_codebook = torch.zeros(1 << 16, 8)
+    shuffle_map = [0,4,1,5,2,6,3,7]
+    for c in range(1 << 16):
+        signs = c & 255
+        abs = c >> 8
+        parity = 0
+        for i in range(8):
+            parity = parity ^ ((signs >> i) & 1)
+        signs = signs ^ parity
+        abs_code = packed_abs_grid[abs].item()
+        for i in range(8):
+            ii = shuffle_map[i]
+            synth_codebook[c,i] = (((abs_code >> (4 * ii)) & 15) - 8) * 0.5
+            if ((signs >> ii) & 1):
+                synth_codebook[c,i] *= -1
+        if parity:
+            synth_codebook[c,:] -= 0.25
+        else:
+            synth_codebook[c,:] += 0.25
+    return synth_codebook, torch.arange(1 << 16)
+_E8P_PACKED_ABS_CACHED = get_packed_abs_grid()
+_E8P_GRID, _E8P_GRID_IDX = get_full_grid(_E8P_PACKED_ABS_CACHED)
 class E8P12_codebook(nn.Module):
     def __init__(self, inference=False):
         super(E8P12_codebook, self).__init__()
+        self.opt_scale = 1.03
         self.codesz = _E8P_CODESZ
+        self.idx_dtype = torch.int64
+        self.packsz = 4
         self.pack_out = False
+        self.version = 1
+        self.register_buffer('grid_packed_abs', _E8P_PACKED_ABS_CACHED)
         if not inference:
             self.register_buffer('grid', _E8P_GRID)
+            self.register_buffer('grid_norm', _E8P_GRID.norm(dim=-1)**2)
             '''
             self.to('cuda')
             samples = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(8), torch.eye(8)).rsample([2000000]).cuda()
             exit()
             '''
     def round(self, X, grid, grid_norm):
         """Select, for each row of ``X``, the best-scoring row of ``grid``.

         The score of a grid row ``g`` is ``2 * <x, g> - grid_norm[g]``. When
         ``grid_norm`` holds the squared norms of the grid rows, maximising this
         score is the same as minimising the Euclidean distance ``||x - g||``.

         Args:
             X: tensor whose last dimension must equal ``self.codesz``.
             grid: candidate points, indexed along dim 0.
             grid_norm: per-candidate offset subtracted from the score.

         Returns:
             ``(grid[idx], idx)`` where ``idx`` is the argmax over candidates.
         """
         assert X.shape[-1] == self.codesz
         # Same evaluation order as the usual one-liner: (2 * X) @ grid.T - grid_norm.
         scores = 2 * X @ grid.T - grid_norm
         nearest = scores.argmax(-1)
         return grid[nearest], nearest
     def quantize(self, X, return_idx=True):
+        final_vals, final_idxs = self.round(X, self.grid, self.grid_norm)
         if return_idx:
             return final_vals, final_idxs
         return final_vals
+    def maybe_pack_idxs(self, idxs):
+        m, n = idxs.shape
+        idxs = idxs.view(m//2, 2, (n*8)//16, 2).transpose(1, 2).contiguous()
+        abs32 = (idxs[:, :, 0, 0] >> 8) + \
+            ((idxs[:, :, 1, 0] >> 8) << 8) + \
+            ((idxs[:, :, 0, 1] >> 8) << 16) + \
+            ((idxs[:, :, 1, 1] >> 8) << 24)
+        sign32 = torch.zeros(abs32.shape, dtype=abs32.dtype, device=abs32.device)
+        for i in range(4):
+            wt = idxs[:, :, i % 2, i // 2]
+            for j in range(8):
+                sign32 += ((wt >> j) & 1) << (4*j + i)
+        output = (sign32 << 32) + abs32
+        output = output.reshape(m//16, 8, n//8, 4).transpose(1, 2).contiguous()
+        return output.view(m, n//4)
     def by_idxs(self, idxs, **kwargs):
+        m, n = idxs.shape
+        W_decompressed = quiptools_cuda.decompress_packed_e8p(
+            idxs.view(m//16, n//2, 8, 4),
+            self.grid_packed_abs
+        )
+        return W_decompressed
 class QuantizedE8P12Linear(nn.Module):
     def __init__(self, device):
         """Create the layer together with its inference-mode codebook.

         The codebook is built with ``inference=True``, cast to ``torch.float16``
         and moved to ``device`` before being stored as ``self.codebook``.
         """
         super().__init__()
         inference_codebook = E8P12_codebook(inference=True)
         self.codebook = inference_codebook.to(torch.float16).to(device)
     def forward(self,
                 input,
                 rescale_WH=False,
                 scaleWH=None,
                 **kwargs):
+        n, m = len(SU), len(SV)
+        x = input.view(-1, n).to(torch.float32)
         if rescale_WH:
             x /= scaleWH
         x = x * SU
             Bx = x @ B.t().to(torch.float32)
             ABx = Bx @ A.t().to(torch.float32)
+        if x.size(0) == 1:
+            x = quiptools_cuda.decode_matvec_e8p(
+                x[0].to(torch.float16),
+                Qidxs.view(m//16, n//64, 8, 4),
+                self.codebook.grid_packed_abs
+            ).to(torch.float32)
         else:
+            W_decompressed = quiptools_cuda.decompress_packed_e8p(
+                Qidxs.view(m//16, n//64, 8, 4),
+                self.codebook.grid_packed_abs
+            )
             x = (x.to(torch.float16) @ W_decompressed.T).to(torch.float32)
         x *= Wscale

quip-sharp/lib/linear/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/linear/__pycache__/__init__.cpython-310.pyc and b/quip-sharp/lib/linear/__pycache__/__init__.cpython-310.pyc differ

quip-sharp/lib/linear/__pycache__/fused_quantized_linear.cpython-310.pyc ADDED Viewed

Binary file (1.43 kB). View file

quip-sharp/lib/linear/__pycache__/quantized_linear.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/linear/__pycache__/quantized_linear.cpython-310.pyc and b/quip-sharp/lib/linear/__pycache__/quantized_linear.cpython-310.pyc differ

quip-sharp/lib/linear/fused_quantized_linear.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import torch.nn as nn
+import quiptools_cuda
+from lib.utils import dtype_from_str, get_hadK
+from lib import codebook
+from .quantized_linear import QuantizedLinear
+import time
+class FusedQuantizedLinear(QuantizedLinear):
+    def __init__(self, fuse_dim, fuse_sizes, *QL_args, **QL_kwargs):
+        super(FusedQuantizedLinear, self).__init__(*QL_args, **QL_kwargs)
+        self.fuse_dim = fuse_dim
+        self.fuse_sizes = fuse_sizes
+        self.register_buffer('fuse_scales', torch.ones(len(self.fuse_sizes)))
+        self.n = len(self.fuse_sizes)
+    def forward(self, input):
+        fused_output = super(FusedQuantizedLinear, self).forward(input)
+        split_outputs = torch.split(fused_output, self.fuse_sizes, self.fuse_dim)
+        return tuple(split_outputs[i] * self.fuse_scales[i] for i in range(self.n))

quip-sharp/lib/linear/quantized_linear.py CHANGED Viewed

@@ -18,7 +18,8 @@ class QuantizedLinear(nn.Module):
                  codebook_version,
                  outlier_channel_split=False,
                  rank=-1,
-                 rescale_WH=False):
         super().__init__()
         self.in_features = in_features
@@ -27,6 +28,10 @@ class QuantizedLinear(nn.Module):
         self.rank = rank
         self.rescale_WH = rescale_WH
         if self.outlier_channel_split:
             self.register_buffer('ocs_dupe_inds', torch.arange(in_features))
@@ -87,18 +92,22 @@ class QuantizedLinear(nn.Module):
         if self.outlier_channel_split:
             input = input[..., self.ocs_dupe_inds]
-        return self.codebook_class(input,
-                                   self.Qidxs,
-                                   self.SU,
-                                   self.SV,
-                                   self.Wscale,
-                                   self.had_left,
-                                   self.had_right,
-                                   self.K_left,
-                                   self.K_right,
-                                   rank=self.rank,
-                                   A=self.A,
-                                   B=self.B,
-                                   rescale_WH=self.rescale_WH,
-                                   scaleWH=self.scaleWH,
-                                   packed=self.packed)

                  codebook_version,
                  outlier_channel_split=False,
                  rank=-1,
+                 rescale_WH=False,
+                 bias=False):
         super().__init__()
         self.in_features = in_features
         self.rank = rank
         self.rescale_WH = rescale_WH
+        self.has_bias = bias
+        if self.has_bias:
+            self.register_buffer('bias', torch.ones(out_features))
         if self.outlier_channel_split:
             self.register_buffer('ocs_dupe_inds', torch.arange(in_features))
         if self.outlier_channel_split:
             input = input[..., self.ocs_dupe_inds]
+        result = self.codebook_class(input,
+                                     self.Qidxs,
+                                     self.SU,
+                                     self.SV,
+                                     self.Wscale,
+                                     self.had_left,
+                                     self.had_right,
+                                     self.K_left,
+                                     self.K_right,
+                                     rank=self.rank,
+                                     A=self.A,
+                                     B=self.B,
+                                     rescale_WH=self.rescale_WH,
+                                     scaleWH=self.scaleWH,
+                                     packed=self.packed)
+        if self.has_bias:
+            return result + self.bias
+        return result

quip-sharp/lib/utils/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/__init__.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/__init__.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/data_utils.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/data_utils.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/data_utils.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/lm_eval_adaptor.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/lm_eval_adaptor.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/lm_eval_adaptor.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/math_utils.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/math_utils.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/math_utils.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/matmul_had.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/matmul_had.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/matmul_had.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/matmul_kron.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/matmul_kron.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/matmul_kron.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/misc.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/misc.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/misc.cpython-310.pyc differ

quip-sharp/lib/utils/__pycache__/unsafe_import.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/lib/utils/__pycache__/unsafe_import.cpython-310.pyc and b/quip-sharp/lib/utils/__pycache__/unsafe_import.cpython-310.pyc differ

quip-sharp/lib/utils/data_utils.py CHANGED Viewed

@@ -58,7 +58,6 @@ def block_LDL(H, b):
 def wrap_tokenizer(tokenizer, x, ctx_size):
     """Call ``tokenizer`` on ``x`` with a fixed set of keyword options.

     The call asks for ``'pt'`` tensors, switches truncation and padding on,
     and passes ``ctx_size`` as ``max_length``. Whatever the tokenizer returns
     is handed back unchanged.
     """
     call_options = {
         'return_tensors': 'pt',
         'truncation': True,
         'padding': True,
         'max_length': ctx_size,
     }
     return tokenizer(x, **call_options)
 def sample_devset(dataset, tokenizer, size=128, ctx_size=2048, nproc=1):
     devset = torch.zeros((size, ctx_size), dtype=torch.int64)
     saved = 0
@@ -122,6 +121,7 @@ def load_quip(save_name, cb, args, device):
 def dtype_from_str(str):
     dtype_map = {
         'torch.int32': torch.int32,
         'torch.int16': torch.int16,
         'torch.uint8': torch.uint8,

 def wrap_tokenizer(tokenizer, x, ctx_size):
     """Call ``tokenizer`` on ``x`` with a fixed set of keyword options.

     The call asks for ``'pt'`` tensors, switches truncation and padding on,
     and passes ``ctx_size`` as ``max_length``. Whatever the tokenizer returns
     is handed back unchanged.
     """
     call_options = {
         'return_tensors': 'pt',
         'truncation': True,
         'padding': True,
         'max_length': ctx_size,
     }
     return tokenizer(x, **call_options)
 def sample_devset(dataset, tokenizer, size=128, ctx_size=2048, nproc=1):
     devset = torch.zeros((size, ctx_size), dtype=torch.int64)
     saved = 0
 def dtype_from_str(str):
     dtype_map = {
+        'torch.int64': torch.int64,
         'torch.int32': torch.int32,
         'torch.int16': torch.int16,
         'torch.uint8': torch.uint8,

quip-sharp/lib/utils/unsafe_import.py CHANGED Viewed

@@ -2,7 +2,6 @@
 from model.graph_wrapper import get_graph_wrapper
 from model.llama import LlamaForCausalLM as llama_fuse
-from model.llama_nofuse import LlamaForCausalLM as llama_nofuse
 from model.mistral import MistralForCausalLM
 import json
 import os
@@ -17,10 +16,9 @@ def model_from_hf_path(path, use_cuda_graph=True, use_flash_attn=True):
     is_quantized = hasattr(bad_config, 'quip_params')
     model_type = bad_config.model_type
     if is_quantized:
-        fused = bad_config.quip_params.get('fused', True)
         if model_type == 'llama':
             model_str = transformers.LlamaConfig.from_pretrained(path)._name_or_path
-            model_cls = llama_fuse if fused else llama_nofuse
         elif model_type == 'mistral':
             model_str = transformers.MistralConfig.from_pretrained(path)._name_or_path
             model_cls = MistralForCausalLM

 from model.graph_wrapper import get_graph_wrapper
 from model.llama import LlamaForCausalLM as llama_fuse
 from model.mistral import MistralForCausalLM
 import json
 import os
     is_quantized = hasattr(bad_config, 'quip_params')
     model_type = bad_config.model_type
     if is_quantized:
         if model_type == 'llama':
             model_str = transformers.LlamaConfig.from_pretrained(path)._name_or_path
+            model_cls = llama_fuse
         elif model_type == 'mistral':
             model_str = transformers.MistralConfig.from_pretrained(path)._name_or_path
             model_cls = MistralForCausalLM

quip-sharp/model/__pycache__/graph_wrapper.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/model/__pycache__/graph_wrapper.cpython-310.pyc and b/quip-sharp/model/__pycache__/graph_wrapper.cpython-310.pyc differ

quip-sharp/model/__pycache__/llama.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/model/__pycache__/llama.cpython-310.pyc and b/quip-sharp/model/__pycache__/llama.cpython-310.pyc differ

quip-sharp/model/__pycache__/mistral.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/model/__pycache__/mistral.cpython-310.pyc and b/quip-sharp/model/__pycache__/mistral.cpython-310.pyc differ

quip-sharp/model/__pycache__/version.cpython-310.pyc CHANGED Viewed

Binary files a/quip-sharp/model/__pycache__/version.cpython-310.pyc and b/quip-sharp/model/__pycache__/version.cpython-310.pyc differ

quip-sharp/model/llama.py CHANGED Viewed

@@ -48,6 +48,7 @@ if is_flash_attn_available():
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 from lib.linear.quantized_linear import QuantizedLinear
 from .version import check_model_version
 logger = logging.get_logger(__name__)
@@ -225,15 +226,17 @@ class LlamaMLP(nn.Module):
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
-        self.upgate_proj = QuantizedLinear(self.hidden_size,
-                                           self.intermediate_size * 2,
-                                           config.quip_params['codesz'],
-                                           config.quip_params.get('packsz', 1),
-                                           config.quip_params.get('pack_out', False),
-                                           config.quip_params['idx_dtype'],
-                                           config.quip_params.get('codebook_version', 0),
-                                           rank=config.quip_params['lora_rank'],
-                                           rescale_WH=config.quip_params['rescale_WH'])
         self.down_proj = QuantizedLinear(
             self.config.quip_params['ocs_down_size'] if \
             self.config.quip_params['outlier_channel_split'] else self.intermediate_size,
@@ -246,24 +249,14 @@ class LlamaMLP(nn.Module):
             outlier_channel_split=self.config.quip_params['outlier_channel_split'],
             rank=self.config.quip_params['lora_rank'],
             rescale_WH=self.config.quip_params['rescale_WH'])
-        self.register_buffer('up_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('gate_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('down_scale', nn.Parameter(torch.ones(())))
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
         if self.config.pretraining_tp > 1:
             raise Exception
-        else:
-            upgate_proj = self.upgate_proj(x.to(torch.float32))
-            up_proj = self.up_scale * upgate_proj[...,
-                                                  0:self.intermediate_size]
-            gate_proj = self.gate_scale * upgate_proj[
-                ..., self.intermediate_size:(self.intermediate_size * 2)]
-            down_proj = self.down_scale * self.down_proj(
-                self.act_fn(gate_proj) * up_proj)
-        return down_proj.half()
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -297,7 +290,12 @@ class LlamaAttention(nn.Module):
                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                 f" and `num_heads`: {self.num_heads})."
             )
-        self.qkv_proj = QuantizedLinear(
             self.hidden_size, (self.num_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim),
@@ -308,7 +306,7 @@ class LlamaAttention(nn.Module):
             config.quip_params.get('codebook_version', 0),
             rank=config.quip_params['lora_rank'],
             rescale_WH=config.quip_params['rescale_WH'])
         self.o_proj = QuantizedLinear(self.num_heads * self.head_dim,
                                       self.hidden_size,
                                       config.quip_params['codesz'],
@@ -319,10 +317,6 @@ class LlamaAttention(nn.Module):
                                       rank=config.quip_params['lora_rank'],
                                       rescale_WH=config.quip_params['rescale_WH'])
-        self.register_buffer('q_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('k_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('v_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('o_scale', nn.Parameter(torch.ones(())))
         self._init_rope()
     def _init_rope(self):
@@ -370,19 +364,7 @@ class LlamaAttention(nn.Module):
         if self.config.pretraining_tp > 1:
             assert (False)
         else:
-            qkv_states = self.qkv_proj(hidden_states.to(torch.float32))
-            query_states = self.q_scale * qkv_states[..., 0:(self.num_heads *
-                                                             self.head_dim)]
-            key_states = self.k_scale * qkv_states[..., (
-                self.num_heads * self.head_dim):(
-                    (self.num_heads * self.head_dim) +
-                    (self.num_key_value_heads * self.head_dim))]
-            value_states = self.v_scale * qkv_states[..., (
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim)):(
-                    (self.num_heads * self.head_dim) +
-                    (self.num_key_value_heads * self.head_dim) +
-                    (self.num_key_value_heads * self.head_dim))]
             query_states = query_states.half()
             key_states = key_states.half()
             value_states = value_states.half()
@@ -439,7 +421,7 @@ class LlamaAttention(nn.Module):
         if self.config.pretraining_tp > 1:
             assert (False)
         else:
-            attn_output = (self.o_scale * self.o_proj(attn_output)).half()
         if not output_attentions:
             attn_weights = None
@@ -468,19 +450,7 @@ class LlamaFlashAttention2(LlamaAttention):
         output_attentions = False
         bsz, q_len, _ = hidden_states.size()
-        qkv_states = self.qkv_proj(hidden_states.to(torch.float32))
-        query_states = self.q_scale * qkv_states[..., 0:(self.num_heads *
-                                                         self.head_dim)]
-        key_states = self.k_scale * qkv_states[..., (
-            self.num_heads * self.head_dim):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
-        value_states = self.v_scale * qkv_states[..., (
-            (self.num_heads * self.head_dim) +
-            (self.num_key_value_heads * self.head_dim)):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
@@ -538,7 +508,7 @@ class LlamaFlashAttention2(LlamaAttention):
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = (self.o_scale * self.o_proj(attn_output)).half()
         if not output_attentions:
             attn_weights = None

     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 from lib.linear.quantized_linear import QuantizedLinear
+from lib.linear.fused_quantized_linear import FusedQuantizedLinear
 from .version import check_model_version
 logger = logging.get_logger(__name__)
         self.config = config
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
+        self.upgate_proj = FusedQuantizedLinear(
+            -1, (self.intermediate_size, self.intermediate_size),
+            self.hidden_size,
+            self.intermediate_size * 2,
+            config.quip_params['codesz'],
+            config.quip_params.get('packsz', 1),
+            config.quip_params.get('pack_out', False),
+            config.quip_params['idx_dtype'],
+            config.quip_params.get('codebook_version', 0),
+            rank=config.quip_params['lora_rank'],
+            rescale_WH=config.quip_params['rescale_WH'])
         self.down_proj = QuantizedLinear(
             self.config.quip_params['ocs_down_size'] if \
             self.config.quip_params['outlier_channel_split'] else self.intermediate_size,
             outlier_channel_split=self.config.quip_params['outlier_channel_split'],
             rank=self.config.quip_params['lora_rank'],
             rescale_WH=self.config.quip_params['rescale_WH'])
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
         if self.config.pretraining_tp > 1:
             raise Exception
+        up_proj, gate_proj = self.upgate_proj(x.to(torch.float32))
+        return self.down_proj(self.act_fn(gate_proj) * up_proj).half()
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
                 f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                 f" and `num_heads`: {self.num_heads})."
             )
+        self.qkv_proj = FusedQuantizedLinear(
+            -1,
+            (self.num_heads*self.head_dim,
+             self.num_key_value_heads*self.head_dim,
+             self.num_key_value_heads*self.head_dim),
             self.hidden_size, (self.num_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim),
             config.quip_params.get('codebook_version', 0),
             rank=config.quip_params['lora_rank'],
             rescale_WH=config.quip_params['rescale_WH'])
         self.o_proj = QuantizedLinear(self.num_heads * self.head_dim,
                                       self.hidden_size,
                                       config.quip_params['codesz'],
                                       rank=config.quip_params['lora_rank'],
                                       rescale_WH=config.quip_params['rescale_WH'])
         self._init_rope()
     def _init_rope(self):
         if self.config.pretraining_tp > 1:
             assert (False)
         else:
+            query_states, key_states, value_states = self.qkv_proj(hidden_states.to(torch.float32))
             query_states = query_states.half()
             key_states = key_states.half()
             value_states = value_states.half()
         if self.config.pretraining_tp > 1:
             assert (False)
         else:
+            attn_output = self.o_proj(attn_output).half()
         if not output_attentions:
             attn_weights = None
         output_attentions = False
         bsz, q_len, _ = hidden_states.size()
+        query_states, key_states, value_states = self.qkv_proj(hidden_states.to(torch.float32))
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output).half()
         if not output_attentions:
             attn_weights = None

quip-sharp/model/mistral.py CHANGED Viewed

@@ -48,6 +48,7 @@ if is_flash_attn_available():
     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 from lib.linear.quantized_linear import QuantizedLinear
 from .version import check_model_version
 logger = logging.get_logger(__name__)
@@ -192,15 +193,17 @@ class MistralMLP(nn.Module):
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
-        self.upgate_proj = QuantizedLinear(self.hidden_size,
-                                           self.intermediate_size * 2,
-                                           config.quip_params['codesz'],
-                                           config.quip_params.get('packsz', 1),
-                                           config.quip_params.get('pack_out', False),
-                                           config.quip_params['idx_dtype'],
-                                           config.quip_params.get('codebook_version', 0),
-                                           rank=config.quip_params['lora_rank'],
-                                           rescale_WH=config.quip_params['rescale_WH'])
         self.down_proj = QuantizedLinear(
             self.config.quip_params['ocs_down_size'] if \
             self.config.quip_params['outlier_channel_split'] else self.intermediate_size,
@@ -213,20 +216,11 @@ class MistralMLP(nn.Module):
             outlier_channel_split=self.config.quip_params['outlier_channel_split'],
             rank=self.config.quip_params['lora_rank'],
             rescale_WH=self.config.quip_params['rescale_WH'])
-        self.register_buffer('up_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('gate_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('down_scale', nn.Parameter(torch.ones(())))
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
-        upgate_proj = self.upgate_proj(x.to(torch.float32))
-        up_proj = self.up_scale * upgate_proj[...,
-                                              0:self.intermediate_size]
-        gate_proj = self.gate_scale * upgate_proj[
-            ..., self.intermediate_size:(self.intermediate_size * 2)]
-        down_proj = self.down_scale * self.down_proj(
-            self.act_fn(gate_proj) * up_proj)
-        return down_proj.half()
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -264,7 +258,11 @@ class MistralAttention(nn.Module):
                 f" and `num_heads`: {self.num_heads})."
             )
-        self.qkv_proj = QuantizedLinear(
             self.hidden_size, (self.num_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim),
@@ -286,11 +284,6 @@ class MistralAttention(nn.Module):
                                       rank=config.quip_params['lora_rank'],
                                       rescale_WH=config.quip_params['rescale_WH'])
-        self.register_buffer('q_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('k_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('v_scale', nn.Parameter(torch.ones(())))
-        self.register_buffer('o_scale', nn.Parameter(torch.ones(())))
         self.rotary_emb = MistralRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
@@ -312,19 +305,7 @@ class MistralAttention(nn.Module):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
-        qkv_states = self.qkv_proj(hidden_states.to(torch.float32))
-        query_states = self.q_scale * qkv_states[..., 0:(self.num_heads *
-                                                         self.head_dim)]
-        key_states = self.k_scale * qkv_states[..., (
-            self.num_heads * self.head_dim):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
-        value_states = self.v_scale * qkv_states[..., (
-            (self.num_heads * self.head_dim) +
-            (self.num_key_value_heads * self.head_dim)):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
@@ -379,7 +360,7 @@ class MistralAttention(nn.Module):
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-        attn_output = (self.o_scale * self.o_proj(attn_output)).half()
         if not output_attentions:
             attn_weights = None
@@ -406,19 +387,7 @@ class MistralFlashAttention2(MistralAttention):
     ):
         bsz, q_len, _ = hidden_states.size()
-        qkv_states = self.qkv_proj(hidden_states.to(torch.float32))
-        query_states = self.q_scale * qkv_states[..., 0:(self.num_heads *
-                                                         self.head_dim)]
-        key_states = self.k_scale * qkv_states[..., (
-            self.num_heads * self.head_dim):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
-        value_states = self.v_scale * qkv_states[..., (
-            (self.num_heads * self.head_dim) +
-            (self.num_key_value_heads * self.head_dim)):(
-                (self.num_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim) +
-                (self.num_key_value_heads * self.head_dim))]
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
@@ -517,7 +486,7 @@ class MistralFlashAttention2(MistralAttention):
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = (self.o_scale * self.o_proj(attn_output)).half()
         if not output_attentions:
             attn_weights = None

     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 from lib.linear.quantized_linear import QuantizedLinear
+from lib.linear.fused_quantized_linear import FusedQuantizedLinear
 from .version import check_model_version
 logger = logging.get_logger(__name__)
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
+        self.upgate_proj = FusedQuantizedLinear(
+            -1, (self.intermediate_size, self.intermediate_size),
+            self.hidden_size,
+            self.intermediate_size * 2,
+            config.quip_params['codesz'],
+            config.quip_params.get('packsz', 1),
+            config.quip_params.get('pack_out', False),
+            config.quip_params['idx_dtype'],
+            config.quip_params.get('codebook_version', 0),
+            rank=config.quip_params['lora_rank'],
+            rescale_WH=config.quip_params['rescale_WH'])
         self.down_proj = QuantizedLinear(
             self.config.quip_params['ocs_down_size'] if \
             self.config.quip_params['outlier_channel_split'] else self.intermediate_size,
             outlier_channel_split=self.config.quip_params['outlier_channel_split'],
             rank=self.config.quip_params['lora_rank'],
             rescale_WH=self.config.quip_params['rescale_WH'])
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):  # gated MLP: down_proj(act_fn(gate) * up), computed through the fused up/gate projection
+        up_proj, gate_proj = self.upgate_proj(x.to(torch.float32))  # input is cast to float32; the fused layer returns the up and gate outputs as two tensors
+        return self.down_proj(self.act_fn(gate_proj) * up_proj).half()  # activation is applied to the gate branch only; the result is cast to float16
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
                 f" and `num_heads`: {self.num_heads})."
             )
+        self.qkv_proj = FusedQuantizedLinear(
+            -1,
+            (self.num_heads*self.head_dim,
+             self.num_key_value_heads*self.head_dim,
+             self.num_key_value_heads*self.head_dim),
             self.hidden_size, (self.num_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim) +
             (self.num_key_value_heads * self.head_dim),
                                       rank=config.quip_params['lora_rank'],
                                       rescale_WH=config.quip_params['rescale_WH'])
         self.rotary_emb = MistralRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
+        query_states, key_states, value_states = self.qkv_proj(hidden_states.to(torch.float32))
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output).half()
         if not output_attentions:
             attn_weights = None
     ):
         bsz, q_len, _ = hidden_states.size()
+        query_states, key_states, value_states = self.qkv_proj(hidden_states.to(torch.float32))
         query_states = query_states.half()
         key_states = key_states.half()
         value_states = value_states.half()
         )
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output).half()
         if not output_attentions:
             attn_weights = None

quip-sharp/model/version.py CHANGED Viewed

@@ -1,8 +1,8 @@
-MODEL_VERSION = 0
 def check_model_version(test):
     if test != MODEL_VERSION:
         raise Exception(
             f"Saved model version ({test}) does not match the "\
             f"source code model version ({MODEL_VERSION}). "\
-            "Please pull the latest code from git@github.com:Cornell-RelaxML/quip-sharp.git")

+MODEL_VERSION = 1
 def check_model_version(test):  # raises if `test` (the version stored with a saved model) differs from MODEL_VERSION; returns None otherwise
     if test != MODEL_VERSION:  # any mismatch is rejected, whether the checkpoint is older or newer than the code
         raise Exception(
             f"Saved model version ({test}) does not match the "\
             f"source code model version ({MODEL_VERSION}). "\
+            "Please pull the latest code or model checkpoints.")

quip-sharp/quantize_llama.py CHANGED Viewed

@@ -26,8 +26,8 @@ parser.add_argument('--num_cpu_threads', default=8, type=int)
 parser.add_argument('--batch_size', default=8, type=int)
 parser.add_argument('--devset_size', default=64, type=int)
 parser.add_argument('--ctx_size', default=2048, type=int)
-parser.add_argument('--save_path', default='checkpoints/quantized-hada-70b', type=str)
-parser.add_argument('--hessian_path', default='/share/desa/nfs01/quip_llama2/hessians', type=str)
 parser.add_argument('--base_model', default='meta-llama/Llama-2-70b-hf', type=str)
 parser.add_argument('--sigma_reg', default=1e-2, type=float)
 parser.add_argument('--sigma_reg2', default=1e-2, type=float)
@@ -286,7 +286,7 @@ def main(args):
         all_config['model_config'].quip_params['ocs_down_size'] = args.ocs_down_size
     torch.save(all_config, os.path.join(args.save_path, 'config.pt'))
-    tokenizer = AutoTokenizer.from_pretrained(args.base_model, use_fast=False)
     tokenizer.pad_token = tokenizer.eos_token
     glog.info('loaded model')

 parser.add_argument('--batch_size', default=8, type=int)
 parser.add_argument('--devset_size', default=64, type=int)
 parser.add_argument('--ctx_size', default=2048, type=int)
+parser.add_argument('--save_path', type=str)
+parser.add_argument('--hessian_path', type=str)
 parser.add_argument('--base_model', default='meta-llama/Llama-2-70b-hf', type=str)
 parser.add_argument('--sigma_reg', default=1e-2, type=float)
 parser.add_argument('--sigma_reg2', default=1e-2, type=float)
         all_config['model_config'].quip_params['ocs_down_size'] = args.ocs_down_size
     torch.save(all_config, os.path.join(args.save_path, 'config.pt'))
+    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
     tokenizer.pad_token = tokenizer.eos_token
     glog.info('loaded model')

quip-sharp/quiptools/build/lib.linux-x86_64-cpython-310/quiptools_cuda.cpython-310-x86_64-linux-gnu.so CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:899ab752e90b0cf2fbb9548cf4017ad053a8dda89b5cc78a765b72b3703bc11b
-size 13026208

 version https://git-lfs.github.com/spec/v1
+oid sha256:b869e88b6457109857b32ebb2ba424ebb6cdc53ecc3f856ef881709a4fbccf85
+size 12982144

quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_deps CHANGED Viewed

Binary files a/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_deps and b/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_deps differ

quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/.ninja_log CHANGED Viewed

@@ -1,6 +1,4 @@
 # ninja log v5
-0	18168	1703587582805727136	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o	1b1606004175d38f
-9	19979	1703587632810492590	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o	c55be518cf9b4c1e
-8	18153	1703587706965942532	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o	1b1606004175d38f
-8	43545	1703587732366665742	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools.o	f601b9f154f8bde0
-8	47187	1703587736006769314	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_e8p_gemv.o	9d441ce55de572ae

 # ninja log v5
+0	19060	1704135981196240899	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o	1b1606004175d38f
+0	43897	1704136006046927545	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools.o	f601b9f154f8bde0
+0	46532	1704136008677000213	/run/media/knut/HD/text-generation-webui/repositories/quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_e8p_gemv.o	9d441ce55de572ae

quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools.o CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57c7f4e07515bed1baa410f842db9703d80c82b25dd0d69588266772eebe746c
 size 2174288

 version https://git-lfs.github.com/spec/v1
+oid sha256:874d7a979e49a3dcecaec7c56b4072f8f055e60a4560f5225ad1c385ce38607c
 size 2174288

quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_e8p_gemv.o CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:822a1718cc8158b9b8311cdc714853d8adc22ce3ddaa5af7a85a8839fe3757a6
-size 5510384

 version https://git-lfs.github.com/spec/v1
+oid sha256:f237dcab2ca8896deac1088c1857f78235ba149f26b26b55a3761832fe08c1f6
+size 5448600

quip-sharp/quiptools/build/temp.linux-x86_64-cpython-310/quiptools_wrapper.o CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e9b169bc14cbe82bc31c5c9759c8cde3c6ee185919c42ef251d99dd3b8ca06ef
-size 6681584

 version https://git-lfs.github.com/spec/v1
+oid sha256:b7aa0a52748df5ca8b4e93a471bf7f3b491676c8c4a2be257855f08b1fa6c7f3
+size 6729784

quip-sharp/quiptools/dist/quiptools_cuda-0.0.0-py3.10-linux-x86_64.egg CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8393c6c6b07efe01541da956828ceca44efb79465dda3ea62b1e25b5c84297ed
-size 4193234

 version https://git-lfs.github.com/spec/v1
+oid sha256:52d0c3eab47c2b08bed9da2d4a6e26c6ebb373192077b27eb4162f01d82a4237
+size 4181938

quip-sharp/quiptools/quiptools_cuda.egg-info/SOURCES.txt CHANGED Viewed

@@ -1,12 +1,7 @@
-.quiptools.cu.swp
-benchmark_e8p.py
-error.txt
 quiptools.cu
 quiptools_e8p_gemv.cu
 quiptools_wrapper.cpp
 setup.py
-test_d4.py
-test_e8p.py
 quiptools_cuda.egg-info/PKG-INFO
 quiptools_cuda.egg-info/SOURCES.txt
 quiptools_cuda.egg-info/dependency_links.txt

 quiptools.cu
 quiptools_e8p_gemv.cu
 quiptools_wrapper.cpp
 setup.py
 quiptools_cuda.egg-info/PKG-INFO
 quiptools_cuda.egg-info/SOURCES.txt
 quiptools_cuda.egg-info/dependency_links.txt

quip-sharp/quiptools/quiptools_e8p_gemv.cu CHANGED Viewed

@@ -9,6 +9,8 @@
 #include <cuda_fp16.h>
 #include <mma.h>
 #include <ATen/ATen.h>
 #include <ATen/Context.h>
 #include <ATen/Dispatch.h>
@@ -40,222 +42,235 @@ __host__ static inline void gpuAssert(cudaError_t code, const char *file, int li
     }
 }
-#define BLOCK_SIZE 512
-#define WARP_SIZE 32
-__device__ static inline uint64_t decode8weights(
-    uint16_t weight_compressed,
-    const int64_t *__restrict__ codebook_abs
-) {
-    uint32_t bit_shift = (weight_compressed & 1)^1;
-    uint8_t bits_sign = (weight_compressed >> 1) & ((1 << 7) - 1);
-    uint8_t bits_abs = (weight_compressed >> 8) & ((1 << 9) - 1);
-    int64_t packed_ = codebook_abs[bits_abs];
-    uint32_t packed[2];
-    memcpy(packed, &packed_, sizeof(packed));
-    // TODO: optimize this by redefining the bit pattern
-    uint32_t parity = __popc(packed[0] & 0x04040404) ^ __popc(packed[1]&0x04040404);
-    uint8_t sign_vec = bits_sign | ((__popc(bits_sign) ^ parity) << 7);
-    uint32_t decoded_sign[2];
-    decoded_sign[0] = sign_vec * 0x08040201ll;
-    decoded_sign[1] = sign_vec * 0x80402010ll;
-    decoded_sign[0] &= 0x80808080;
-    decoded_sign[1] &= 0x80808080;
-    decoded_sign[0] >>= 7;
-    decoded_sign[1] >>= 7;
-    decoded_sign[0] *= 255 - 3;
-    decoded_sign[1] *= 255 - 3;
-    packed[0] ^= decoded_sign[0];
-    packed[1] ^= decoded_sign[1];
-    packed[0] |= 0x01010101;
-    packed[1] |= 0x01010101;
-    packed[0] -= bit_shift * 0x02020202;
-    packed[1] -= bit_shift * 0x02020202;
-    memcpy(&packed_, packed, sizeof(packed));
-    return packed_;
 }
-/*
-llama 2 70B:
-M N K
-1 8192 8192
-1 57344 8192
-1 8192 28672
-1 10240 8192
-*/
-template <typename scalar_t>
 __global__ static void
-__launch_bounds__(BLOCK_SIZE)
-decode_matmul_e8p_kernel(
-    scalar_t *__restrict__ output,
-    const scalar_t *__restrict__ x,
-    const int16_t *__restrict__ weights_compressed,
-    const int64_t *__restrict__ codebook_abs,
-    int64_t M,
-    int64_t N,
-    int64_t K
 ) {
-    __shared__ int64_t codebook_local[256];
-    if (threadIdx.x < 256) {
-    codebook_local[threadIdx.x] = codebook_abs[threadIdx.x];
-    }
-    __syncthreads();
-    int64_t warpId = threadIdx.x / WARP_SIZE;
-    int64_t laneId = threadIdx.x % WARP_SIZE;
-    // each thread adds 8 activation-weight products
-    const int64_t unroll_k = 2;
-    const int64_t pack = 8;
-    const int64_t elem_per_thread = pack * unroll_k;
-    int64_t warps_per_elem = K / WARP_SIZE / elem_per_thread;
-    const int64_t unroll_n = 16;
-    const int64_t local_k = 1; // in terms of warp size. 32 threads of elem_per_thread fma each, dont set below 1 because of __shfl_down_sync
-    int64_t local_n = BLOCK_SIZE / WARP_SIZE / local_k;
-    int64_t grid_N = N / unroll_n;
-    __shared__ scalar_t accum_scratch[BLOCK_SIZE / WARP_SIZE];
-    bool SHARED_REDUCE = false;
-    for (int64_t warpPos = blockIdx.x * BLOCK_SIZE/WARP_SIZE + warpId;
-            warpPos < M * grid_N * warps_per_elem;
-            warpPos += gridDim.x * BLOCK_SIZE/WARP_SIZE) {
-        int64_t local_n_i = (warpPos% (BLOCK_SIZE / WARP_SIZE)) / local_k;
-        int64_t local_k_i = (warpPos% (BLOCK_SIZE / WARP_SIZE)) % local_k;
-        int64_t m = (warpPos / warps_per_elem) / (grid_N);
-        int64_t k_ = warpPos % (warps_per_elem * local_n);
-        int64_t k = k_ / (local_k * local_n) * local_k + k_ % local_k;
-        scalar_t this_activations[elem_per_thread];
-#pragma unroll
-        for (int64_t unroll_k_i = 0; unroll_k_i < unroll_k; unroll_k_i++) {
-            const scalar_t *activations = x + m * K + (k * WARP_SIZE + laneId) * elem_per_thread + unroll_k_i * pack;
-            if constexpr (std::is_same<scalar_t, float>::value) {
-                const float4 *first_half = reinterpret_cast<const float4 *>(activations);
-                __builtin_assume_aligned(first_half, 16);
-                this_activations[unroll_k_i * pack + 0] = first_half->x;
-                this_activations[unroll_k_i * pack + 1] = first_half->y;
-                this_activations[unroll_k_i * pack + 2] = first_half->z;
-                this_activations[unroll_k_i * pack + 3] = first_half->w;
-                const float4 *second_half = reinterpret_cast<const float4 *>(activations + 4);
-                __builtin_assume_aligned(second_half, 16);
-                this_activations[unroll_k_i * pack + 4] = second_half->x;
-                this_activations[unroll_k_i * pack + 5] = second_half->y;
-                this_activations[unroll_k_i * pack + 6] = second_half->z;
-                this_activations[unroll_k_i * pack + 7] = second_half->w;
-            } else {
-                for (int64_t activation_i = 0; activation_i < pack; activation_i++) {
-                    this_activations[unroll_k_i * pack + activation_i] = activations[activation_i];
-                }
             }
-        }
-        for (int64_t unroll_n_i = 0; unroll_n_i < unroll_n; unroll_n_i++) {
-            scalar_t accumulator = 0;
-            int64_t n = ((warpPos/local_k) % local_n) + ((warpPos / warps_per_elem) % grid_N) / local_n * local_n;
-            __syncwarp();
-            uint16_t this_weights[unroll_k];
-            if (unroll_k % 2 == 0) {
-                for (int64_t unroll_k_i = 0; unroll_k_i < unroll_k; unroll_k_i+=2) {
-                    const ushort2 *loaded = (const ushort2 *) &weights_compressed[(n*unroll_n + unroll_n_i) * K/pack + (k * WARP_SIZE + laneId) * unroll_k + unroll_k_i];
-                    __builtin_assume_aligned(loaded, 4);
-                    this_weights[unroll_k_i] = loaded->x;
-                    this_weights[unroll_k_i + 1] = loaded->y;
-                }
-            } else {
-                for (int64_t unroll_k_i = 0; unroll_k_i < unroll_k; unroll_k_i++) {
-                    this_weights[unroll_k_i] = weights_compressed[(n*unroll_n + unroll_n_i) * K/pack + (k * WARP_SIZE + laneId) * unroll_k + unroll_k_i];
-                }
-            }
-#pragma unroll
-            for (int64_t unroll_k_i = 0; unroll_k_i < unroll_k; unroll_k_i++) {
-                // TODO: optimize access pattern by reordering weights
-                uint16_t encoded = this_weights[unroll_k_i];
-                uint64_t decoded = decode8weights(encoded, codebook_local);
-                #ifdef EMULATED_INT82FP16
-                // bit twiddling to convert int8 to fp16 from http://arxiv.org/abs/2211.10017
-                half2 unpacked[2][2];
-                uint64_t lower_half = decoded & 0x00ff00ff00ff00ff;
-                lower_half = (lower_half ^ 0x6480648064806480);
-                memcpy(unpacked[0], &lower_half, sizeof(uint64_t));
-                uint64_t upper_half = (decoded & 0xff00ff00ff00ff00) >> 8;
-                upper_half = (upper_half ^ 0x6480648064806480);
-                memcpy(unpacked[1], &upper_half, sizeof(uint64_t));
-                const half2 adjust = {__float2half(-1152.0f), __float2half(-1152.0f)};
-                unpacked[0][0] = __hadd2(unpacked[0][0], adjust);
-                unpacked[0][1] = __hadd2(unpacked[0][1], adjust);
-                unpacked[1][0] = __hadd2(unpacked[1][0], adjust);
-                unpacked[1][1] = __hadd2(unpacked[1][1], adjust);
-                float2 unpacked_f[2][2];
-                unpacked_f[0][0] = __half22float2(unpacked[0][0]);
-                unpacked_f[0][1] = __half22float2(unpacked[0][1]);
-                unpacked_f[1][0] = __half22float2(unpacked[1][0]);
-                unpacked_f[1][1] = __half22float2(unpacked[1][1]);
-                accumulator += this_activations[unroll_k_i * pack + 0] * (unpacked_f[0][0].x);
-                accumulator += this_activations[unroll_k_i * pack + 1] * (unpacked_f[1][0].x);
-                accumulator += this_activations[unroll_k_i * pack + 2] * (unpacked_f[0][0].y);
-                accumulator += this_activations[unroll_k_i * pack + 3] * (unpacked_f[1][0].y);
-                accumulator += this_activations[unroll_k_i * pack + 4] * (unpacked_f[0][1].x);
-                accumulator += this_activations[unroll_k_i * pack + 5] * (unpacked_f[1][1].x);
-                accumulator += this_activations[unroll_k_i * pack + 6] * (unpacked_f[0][1].y);
-                accumulator += this_activations[unroll_k_i * pack + 7] * (unpacked_f[1][1].y);
-                #else
-                for (int64_t i = 0; i < 8; i += 1) {
-                    int8_t weight = decoded >> (i * 8);
-                    accumulator += this_activations[unroll_k_i * pack + i] * (int8_t) weight;
-                }
-                #endif
-            }
-            accumulator *= 0.25;
-            for (int offset = WARP_SIZE/2; offset > 0; offset /= 2) {
-                // apparently c10::Half does arithmetic operations in float32?
-                // https://github.com/pytorch/pytorch/blob/0bd4d1f4ab38d3088de8aa5fbba35427b42d118e/c10/util/Half.h#L4C58-L6C80
-                if constexpr (std::is_same<scalar_t, c10::Half>::value) {
-                    accumulator += __shfl_down_sync(0xFFFFFFFF, __float2half(accumulator), offset);
-                } else {
-                    accumulator += __shfl_down_sync(0xFFFFFFFF, accumulator, offset);
-                }
             }
-            if (SHARED_REDUCE) {
-                if (laneId == 0) {
-                    accum_scratch[warpId] = accumulator;
-                    __syncthreads();
-                    if (warpId % local_k == 0) {
-                        scalar_t local_accum = 0;
-                        for (int64_t accum_i = 0; accum_i < local_k; accum_i++) {
-                            local_accum += accum_scratch[warpId / local_k * local_k + accum_i];
-                        }
-                        atomicAdd(output + m * N + n * unroll_n + unroll_n_i, local_accum);
-                    }
-                } else {
-                    __syncthreads();
-                }
-            } else {
-                if (laneId == 0) {
-                    atomicAdd(output + m * N + n * unroll_n + unroll_n_i, accumulator);
-                }
-            }
         }
     }
 }
-__host__ extern torch::Tensor decode_matmul_e8p(
     torch::Tensor x,
     torch::Tensor weights_compressed,
     torch::Tensor codebook_abs
@@ -265,47 +280,306 @@ __host__ extern torch::Tensor decode_matmul_e8p(
     CHECK_INPUT(weights_compressed);
     CHECK_INPUT(codebook_abs);
-    TORCH_CHECK(weights_compressed.scalar_type() == torch::kInt16);
-    TORCH_CHECK(codebook_abs.scalar_type() == torch::kInt64);
-    TORCH_CHECK(x.size(-1) == weights_compressed.size(-1) << 3);
     TORCH_CHECK(codebook_abs.size(-1) == 256);
-    int64_t M = x.size(-2);
-    int64_t N = weights_compressed.size(-2);
     int64_t K = x.size(-1);
-    //printf("%lld %lld %lld\n", M, N, K);
-    TORCH_CHECK(K % WARP_SIZE == 0, "K is not divisible by WARP_SIZE");
     at::DeviceGuard guard(x.device());
     torch::TensorOptions options = torch::TensorOptions()
-        .dtype(x.scalar_type())
         .layout(torch::kStrided)
         .device(torch::kCUDA)
         .requires_grad(false);
-    torch::Tensor output = torch::zeros(std::vector<int64_t>{M, N}, options);
     cudaDeviceProp deviceProp;
     cudaGetDeviceProperties(&deviceProp, x.get_device());
-    int64_t grid_size = static_cast<int64_t>(6 * deviceProp.multiProcessorCount);
     at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
-    AT_DISPATCH_FLOATING_TYPES_AND2(
-            at::ScalarType::Half,
-            at::ScalarType::BFloat16,
-            x.scalar_type(),
-            "decode_matmul_e8p",
-            [&] {
-        decode_matmul_e8p_kernel<<<grid_size, BLOCK_SIZE, 0, stream>>>(
-                output.data_ptr<scalar_t>(),
-                x.data_ptr<scalar_t>(),
-                weights_compressed.data_ptr<int16_t>(),
-                codebook_abs.data_ptr<int64_t>(),
-                M,
-                N,
-                K);
-        gpuErrchk(cudaPeekAtLastError());
-    });
     return output;
 }

 #include <cuda_fp16.h>
 #include <mma.h>
+#include <cuda_pipeline.h>
 #include <ATen/ATen.h>
 #include <ATen/Context.h>
 #include <ATen/Dispatch.h>
     }
 }
+__device__ static inline uint32_t add_as_half2(uint32_t x, uint32_t y) {  // adds x and y as packed pairs of fp16 values held in 32-bit registers
+    uint32_t z;  // receives the packed fp16x2 sum
+    asm("add.f16x2 %0,%1,%2;" : "=r"(z) : "r"(x), "r"(y));  // PTX packed-half add: each 16-bit half of z is the fp16 sum of the matching halves of x and y
+    return z;
+}
+__device__ static inline uint32_t mask_lop3(uint32_t x, uint32_t m0, uint32_t m1) {  // computes (x & m0) | m1 with a single three-input logic instruction
+    uint32_t y;  // receives the combined result
+    asm("lop3.b32 %0, %1, %2, %3, 0xEA;" : "=r"(y) : "r"(x), "r"(m0), "r"(m1));  // lookup table 0xEA = (0xF0 & 0xCC) | 0xAA, i.e. (a & b) | c
+    return y;
+    // Equivalent plain C++: return (x & m0) | m1;
 }
+#define BASE_OFFSET 0xd080d080  // two packed fp16 values of -36.0 (0xd080); the kernel ORs a per-codeword bit into bit 4 of each half before adding it to the masked weights
+#define XMASK 0x00f000f0  // keeps bits 4..7 of each 16-bit half (one 4-bit field per half) when passed as m0 to mask_lop3
+#define WMASK 0x50085008  // OR-ed into each half as m1 of mask_lop3; 0x5008 is 32.25 in fp16, so the kept 4-bit field lands in the fp16 mantissa in steps of 0.5
 __global__ static void
+// __launch_bounds__(1024, 1024)
+decode_matvec_e8p_kernel(
+    float *__restrict__ output,
+    const uint2 *__restrict__ input,
+    const uint2 *__restrict__ weights_compressed,
+    const uint32_t *__restrict__ codebook_abs,
+    int N,
+    int K
 ) {
+    int warpId = threadIdx.y;
+    int laneId = threadIdx.x;
+    // __shared__ float sum_scratch[16*32];
+    // __shared__ uint32_t codebook_local[256*32];
+    // for (int icb = warpId; icb < 256; icb += 32) {
+    //     codebook_local[icb*32 + laneId] = codebook_abs[icb];
+    // }
+    // __syncthreads();
+    __shared__ uint2 shared_weights[1024*2];
+    for (int iin = blockIdx.x; iin < (N >> 4); iin += gridDim.x) {
+        float z0 = 0.0;
+        float z1 = 0.0;
+        float z2 = 0.0;
+        float z3 = 0.0;
+        // int shwo = laneId + 32*warpId;
+        // __pipeline_memcpy_async(shared_weights + shwo, weights_compressed + laneId + 32*warpId + 1024*0 + (K >> 1)*iin, 8);
+        // __pipeline_commit();
+        for (int iik = warpId; iik < (K >> 6); iik += 32) {
+            // if (iik + 1 < (K >> 11)) {
+            //     __pipeline_memcpy_async(shared_weights + (shwo ^ 1024), weights_compressed + laneId + 32*iik + 1024 + (K >> 1)*iin, 8);
+            //     __pipeline_commit();
+            //     __pipeline_wait_prior(1);
+            //     shwo = shwo ^ 1024;
+            // }
+            // else {
+            //     __pipeline_wait_prior(0);
+            // }
+            // uint2 w_compr = shared_weights[shwo]; // weights_compressed[laneId + 32*warpId + 1024*iik + (K >> 1)*iin];
+            uint2 w_compr = weights_compressed[laneId + 32*iik + (K >> 1)*iin];
+            uint32_t a = w_compr.x;
+            uint32_t b = w_compr.y;
+            uint32_t s = b;
+            s = s ^ (s >> 4);
+            s = s ^ (s >> 8);
+            s = s ^ (s >> 16);
+            uint32_t sb = (s & 15);
+            s = b ^ sb;
+            sb = sb | (sb << 16);
+            uint32_t input_to_warp = ((const uint32_t*)(&input[16*iik]))[laneId];
+            uint32_t shifted_laneId = (laneId & 3) << 3;
+            /// BLOCK 01
+            {
+            uint32_t x = codebook_abs[(a >> 0) & 255];
+            x = x ^ ((s & 0x11111111) * 14);
+            uint32_t o = BASE_OFFSET | ((sb & 0x00010001) << 4);
+            uint32_t w00 = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+            uint32_t w01 = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+            uint32_t w02 = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+            uint32_t w03 = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+            x = codebook_abs[(a >> 8) & 255];
+            x = x ^ ((s & 0x22222222) * 7);
+            o = BASE_OFFSET | ((sb & 0x00020002) << 3);
+            uint32_t w10 = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+            uint32_t w11 = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+            uint32_t w12 = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+            uint32_t w13 = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+            // uint2 x_in = input[0 + (laneId & 3)*4 + 16*warpId + 16*32*iik];
+            // uint32_t x_in0 = x_in.x;
+            // uint32_t x_in1 = x_in.y;
+            uint32_t x_in0 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 0);
+            uint32_t x_in1 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 1);
+            asm(
+                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+                " { %0, %1, %2, %3 },"
+                " { %4, %5, %6, %7 },"
+                " { %8, %9 },"
+                " { %0, %1, %2, %3 };"
+                : "+f"(z0), "+f"(z1), "+f"(z2), "+f"(z3)
+                : "r"(w00), "r"(w10), "r"(w01),  "r"(w11),
+                  "r"(x_in0), "r"(x_in1)
+            );
+            // x_in = input[1 + (laneId & 3)*4 + 16*warpId + 16*32*iik];
+            // x_in0 = x_in.x;
+            // x_in1 = x_in.y;
+            x_in0 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 2);
+            x_in1 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 3);
+            asm(
+                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+                " { %0, %1, %2, %3 },"
+                " { %4, %5, %6, %7 },"
+                " { %8, %9 },"
+                " { %0, %1, %2, %3 };"
+                : "+f"(z0), "+f"(z1), "+f"(z2), "+f"(z3)
+                : "r"(w02), "r"(w12), "r"(w03), "r"(w13),
+                  "r"(x_in0), "r"(x_in1)
+            );
             }
+            /// BLOCK 23
+            {
+            uint32_t x = codebook_abs[(a >> 16) & 255];
+            s = s >> 2;
+            x = x ^ ((s & 0x11111111) * 14);
+            uint32_t o = BASE_OFFSET | ((sb & 0x00040004) << 2);
+            uint32_t w00 = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+            uint32_t w01 = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+            uint32_t w02 = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+            uint32_t w03 = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+            x = codebook_abs[(a >> 24) & 255];
+            x = x ^ ((s & 0x22222222) * 7);
+            o = BASE_OFFSET | ((sb & 0x00080008) << 1);
+            uint32_t w10 = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+            uint32_t w11 = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+            uint32_t w12 = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+            uint32_t w13 = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+            // uint2 x_in = input[2 + (laneId & 3)*4 + 16*warpId + 16*32*iik];
+            // uint32_t x_in0 = x_in.x;
+            // uint32_t x_in1 = x_in.y;
+            uint32_t x_in0 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 4);
+            uint32_t x_in1 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 5);
+            asm(
+                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+                " { %0, %1, %2, %3 },"
+                " { %4, %5, %6, %7 },"
+                " { %8, %9 },"
+                " { %0, %1, %2, %3 };"
+                : "+f"(z0), "+f"(z1), "+f"(z2), "+f"(z3)
+                : "r"(w00), "r"(w10), "r"(w01), "r"(w11),
+                  "r"(x_in0), "r"(x_in1)
+            );
+            // x_in = input[3 + (laneId & 3)*4 + 16*warpId + 16*32*iik];
+            // x_in0 = x_in.x;
+            // x_in1 = x_in.y;
+            x_in0 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 6);
+            x_in1 = __shfl_sync(FULL_MASK, input_to_warp, shifted_laneId | 7);
+            asm(
+                "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+                " { %0, %1, %2, %3 },"
+                " { %4, %5, %6, %7 },"
+                " { %8, %9 },"
+                " { %0, %1, %2, %3 };"
+                : "+f"(z0), "+f"(z1), "+f"(z2), "+f"(z3)
+                : "r"(w02), "r"(w12), "r"(w03), "r"(w13),
+                  "r"(x_in0), "r"(x_in1)
+            );
             }
+        }
+        // we produced 16 outputs, so only 16 threads
+        if ((laneId & 1) == 0) {
+            atomicAdd(output + (iin << 4) + (laneId >> 1), (laneId & 2) ? z2 : z0);
         }
+        // if ((laneId & 3) == 0) {
+        //     sum_scratch[warpId + ((laneId >> 1) + 0) * 32] = z0;
+        //     sum_scratch[warpId + ((laneId >> 1) + 1) * 32] = z2;
+        // }
+        // __syncthreads();
+        // // load and sum
+        // if (warpId < 16) {
+        //     float acc = sum_scratch[laneId + warpId*32];
+        //     for (int offset = 16; offset > 0; offset /= 2) {
+        //         acc += __shfl_down_sync(FULL_MASK, acc, offset);
+        //     }
+        //     if (laneId == 0) {
+        //         output[(iin << 4) + warpId] = acc;
+        //     }
+        // }
     }
 }
+// Host wrapper that launches decode_matvec_e8p_kernel.
+// As enforced by the checks below:
+//   x                  - 1-D float16 tensor of length K
+//   weights_compressed - 4-D int64 tensor of shape (N/16, K/64, 8, 4)
+//   codebook_abs       - 1-D int32 tensor with 256 entries
+// Returns a 1-D float32 CUDA tensor of length N. It is zero-initialised
+// because the kernel accumulates into it with atomicAdd.
+__host__ extern torch::Tensor decode_matvec_e8p(
     torch::Tensor x,
     torch::Tensor weights_compressed,
     torch::Tensor codebook_abs
     CHECK_INPUT(weights_compressed);
     CHECK_INPUT(codebook_abs);
+    TORCH_CHECK(x.dim() == 1);
+    TORCH_CHECK(weights_compressed.dim() == 4);
+    TORCH_CHECK(weights_compressed.size(3) == 4);
+    TORCH_CHECK(weights_compressed.size(2) == 8);
+    TORCH_CHECK(codebook_abs.dim() == 1);
+    TORCH_CHECK(x.scalar_type() == torch::kFloat16);
+    TORCH_CHECK(weights_compressed.scalar_type() == torch::kInt64);
+    TORCH_CHECK(codebook_abs.scalar_type() == torch::kInt32);
+    TORCH_CHECK(x.size(-1) == weights_compressed.size(1) << 6);
     TORCH_CHECK(codebook_abs.size(-1) == 256);
+    int64_t N = weights_compressed.size(0) * 16;
     int64_t K = x.size(-1);
+    TORCH_CHECK(K % 64 == 0, "K is not divisible by 64");
+    TORCH_CHECK(N % 16 == 0, "N is not divisible by 16");
+    // The message is shown when the condition FAILS, so it must describe the failure.
+    TORCH_CHECK(K < 65536, "K is too large");
+    TORCH_CHECK(N < 65536, "N is too large");
     at::DeviceGuard guard(x.device());
     torch::TensorOptions options = torch::TensorOptions()
+        .dtype(torch::kFloat32)
         .layout(torch::kStrided)
         .device(torch::kCUDA)
         .requires_grad(false);
+    torch::Tensor output = torch::zeros(std::vector<int64_t>{N}, options);
     cudaDeviceProp deviceProp;
     cudaGetDeviceProperties(&deviceProp, x.get_device());
+    // One block per multiprocessor; each block is launched as 32 x 32 threads.
+    int64_t grid_size = static_cast<int64_t>(deviceProp.multiProcessorCount);
     at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
+    const dim3 block_size(32,32);
+    decode_matvec_e8p_kernel<<<grid_size, block_size, 0, stream>>>(
+        output.data_ptr<float>(),
+        (const uint2*)x.data_ptr<c10::Half>(),
+        (const uint2*)weights_compressed.data_ptr<int64_t>(),
+        (const uint32_t*)codebook_abs.data_ptr<int32_t>(),
+        N,
+        K);
+    gpuErrchk(cudaPeekAtLastError());
     return output;
 }
+// Tensor-core smoke test: every thread of the launching warp issues one
+// m16n8k16 f16 x f16 -> f32 mma.sync and dumps its four accumulators.
+// Lane 0 feeds 0x3C003C00 into its first A-fragment register and its first
+// B-fragment register; all other fragment registers are zero.
+// output must provide 4 floats per thread: lane L writes output[4L .. 4L+3].
+__global__ static void
+test_tc_kernel(float *__restrict__ output) {
+    const int lane = threadIdx.x;
+    // 0x3C00 is 1.0 in IEEE fp16, so this packs the half2 pair (1.0, 1.0).
+    const uint32_t seed = (lane == 0) ? 0x3C003C00 : 0x00000000;
+    // A fragment (four 32-bit registers) and B fragment (two 32-bit registers).
+    uint32_t a0 = seed;
+    uint32_t a1 = 0x00000000;
+    uint32_t a2 = 0x00000000;
+    uint32_t a3 = 0x00000000;
+    uint32_t b0 = seed;
+    uint32_t b1 = 0x00000000;
+    // Accumulators double as the C input and the D output of the mma.
+    float acc0 = 0.0f;
+    float acc1 = 0.0f;
+    float acc2 = 0.0f;
+    float acc3 = 0.0f;
+    asm(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32"
+        " { %0, %1, %2, %3 },"
+        " { %4, %5, %6, %7 },"
+        " { %8, %9 },"
+        " { %0, %1, %2, %3 };"
+        : "+f"(acc0), "+f"(acc1), "+f"(acc2), "+f"(acc3)
+        : "r"(a0), "r"(a1), "r"(a2), "r"(a3),
+          "r"(b0), "r"(b1)
+    );
+    float *dst = output + lane*4;
+    dst[0] = acc0;
+    dst[1] = acc1;
+    dst[2] = acc2;
+    dst[3] = acc3;
+}
+// Runs test_tc_kernel on a single 32-thread block and returns the 32*4
+// float32 accumulator values (4 per lane) as a 1-D CUDA tensor.
+__host__ extern torch::Tensor test_tc() {
+    torch::TensorOptions options = torch::TensorOptions()
+        .dtype(torch::kFloat32)
+        .layout(torch::kStrided)
+        .device(torch::kCUDA)
+        .requires_grad(false);
+    torch::Tensor output = torch::zeros(std::vector<int64_t>{32*4}, options);
+    // Launch on PyTorch's current stream, like the other wrappers in this file,
+    // so the kernel is ordered after the zero-fill above and before any later
+    // PyTorch work that reads `output`.
+    at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
+    test_tc_kernel<<<1, 32, 0, stream>>>(output.data_ptr<float>());
+    gpuErrchk(cudaPeekAtLastError());
+    return output;
+}
+// Test kernel for the codebook expansion used by the e8p kernels.
+// threadIdx.x is used as the 8-bit codebook_abs index; the low 8 bits of
+// blockIdx.x are spread to bit 0 of each nibble of a 32-bit word and play the
+// role of the second packed word (`b`) of the real kernels.
+// Each thread writes 4 consecutive uint32_t values at
+// output[blockIdx.x*256*4 + threadIdx.x*4 ...].
+__global__ static void
+test_codebook_expand_kernel(uint32_t *__restrict__ output, const uint32_t *__restrict__ codebook_abs) {
+    const uint32_t code = threadIdx.x;
+    // Place bit i of blockIdx.x at bit position 4*i.
+    uint32_t spread = 0;
+    for (int bit = 0; bit < 8; bit++) {
+        if ((blockIdx.x >> bit) & 1) {
+            spread |= (1u << (4*bit));
+        }
+    }
+    // XOR-fold all eight nibbles down into the lowest nibble.
+    uint32_t fold = spread;
+    fold ^= (fold >> 4);
+    fold ^= (fold >> 8);
+    fold ^= (fold >> 16);
+    uint32_t sb = (fold & 15);
+    const uint32_t s = spread ^ sb;
+    // Duplicate into both 16-bit halves so it can be applied per half2 lane.
+    sb |= (sb << 16);
+    // Bit 0 of each nibble of s, times 14 (0b1110), flips bits 1..3 of that nibble.
+    const uint32_t x = codebook_abs[(code >> 0) & 255] ^ ((s & 0x11111111) * 14);
+    const uint32_t o = BASE_OFFSET | ((sb & 0x00010001) << 4);
+    const uint32_t w0 = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+    const uint32_t w1 = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+    const uint32_t w2 = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+    const uint32_t w3 = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+    uint32_t *dst = output + (blockIdx.x*256*4 + threadIdx.x*4);
+    dst[0] = w0;
+    dst[1] = w1;
+    dst[2] = w2;
+    dst[3] = w3;
+}
+// Runs test_codebook_expand_kernel over a 256 x 256 launch (256 blocks of 256
+// threads) and returns the result as a (256*256, 8) float16 CUDA tensor: each
+// thread contributes 4 uint32_t words, i.e. 8 halfs.
+// codebook_abs must be a 1-D int32 tensor with 256 entries, since the kernel
+// indexes it with an 8-bit value.
+__host__ extern torch::Tensor test_codebook_expand(torch::Tensor codebook_abs) {
+    // Same validation as decompress_packed_e8p: without it a CPU, strided or
+    // too-short tensor would be dereferenced as-is on the device.
+    CHECK_INPUT(codebook_abs);
+    TORCH_CHECK(codebook_abs.dim() == 1);
+    TORCH_CHECK(codebook_abs.scalar_type() == torch::kInt32);
+    TORCH_CHECK(codebook_abs.size(-1) == 256);
+    // Allocate and launch on the device that owns the codebook.
+    at::DeviceGuard guard(codebook_abs.device());
+    torch::TensorOptions options = torch::TensorOptions()
+        .dtype(torch::kFloat16)
+        .layout(torch::kStrided)
+        .device(torch::kCUDA)
+        .requires_grad(false);
+    torch::Tensor output = torch::zeros(std::vector<int64_t>{256*256,8}, options);
+    // Use PyTorch's current stream so the launch is ordered after the zero-fill.
+    at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
+    test_codebook_expand_kernel<<<256, 256, 0, stream>>>((uint32_t*)output.data_ptr<c10::Half>(), (const uint32_t*)codebook_abs.data_ptr<int32_t>());
+    gpuErrchk(cudaPeekAtLastError());
+    return output;
+}
+// Expands e8p-compressed weights into packed fp16 pairs (one uint32_t holds
+// two halfs).
+//
+//   output             - destination, indexed in uint32_t units
+//   weights_compressed - one uint2 per thread per (N-tile, K-tile); .x carries
+//                        four 8-bit codebook_abs indices, .y carries the bits
+//                        that are XOR-ed into / offset onto the looked-up entries
+//   codebook_abs       - table indexed with an 8-bit value
+//   N, K               - the tile loops run over N/16 and K/64
+//
+// threadIdx.x picks the uint2 inside a tile and threadIdx.y the first K-tile
+// (advancing by 32); blocks stride over the N-tiles by gridDim.x. Each thread
+// writes 16 words per tile: 8 at `base` and 8 at `base + K/2`.
+__global__ static void
+// __launch_bounds__(1024, 1024)
+decompress_packed_e8p_kernel(
+    uint32_t *__restrict__ output,
+    const uint2 *__restrict__ weights_compressed,
+    const uint32_t *__restrict__ codebook_abs,
+    int N,
+    int K
+) {
+    const int lane = threadIdx.x;
+    const int warp = threadIdx.y;
+    const int half_k = K >> 1;
+    for (int tile_n = blockIdx.x; tile_n < (N >> 4); tile_n += gridDim.x) {
+        for (int tile_k = warp; tile_k < (K >> 6); tile_k += 32) {
+            const uint2 packed = weights_compressed[lane + 32*tile_k + half_k*tile_n];
+            const uint32_t a = packed.x;
+            const uint32_t b = packed.y;
+            // XOR-fold the eight nibbles of b into the lowest nibble.
+            uint32_t fold = b;
+            fold ^= (fold >> 4);
+            fold ^= (fold >> 8);
+            fold ^= (fold >> 16);
+            uint32_t sb = (fold & 15);
+            const uint32_t s = b ^ sb;
+            // Replicate into both 16-bit halves so it applies to each half2 lane.
+            sb |= (sb << 16);
+            // First word this thread writes; the second destination row sits
+            // half_k words further on.
+            const int base = tile_n*8*K + (lane >> 2)*K + tile_k*32 + ((lane & 3) << 3);
+            // pair 0 decodes bytes 0/1 of a (the original "BLOCK 01"),
+            // pair 1 decodes bytes 2/3 with s shifted right by 2 ("BLOCK 23").
+            #pragma unroll
+            for (int pair = 0; pair < 2; pair++) {
+                const uint32_t s_cur = s >> (2*pair);
+                const int col = base + pair*4;
+                // Even byte: goes to the row at `base`.
+                uint32_t x = codebook_abs[(a >> (16*pair)) & 255];
+                x = x ^ ((s_cur & 0x11111111) * 14);
+                uint32_t o = BASE_OFFSET | ((sb & (0x00010001u << (2*pair))) << (4 - 2*pair));
+                output[col + 0] = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+                output[col + 1] = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+                output[col + 2] = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+                output[col + 3] = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+                // Odd byte: goes to the row at `base + half_k`.
+                x = codebook_abs[(a >> (16*pair + 8)) & 255];
+                x = x ^ ((s_cur & 0x22222222) * 7);
+                o = BASE_OFFSET | ((sb & (0x00020002u << (2*pair))) << (3 - 2*pair));
+                output[col + half_k + 0] = add_as_half2(mask_lop3(x << 4, XMASK, WMASK), o);
+                output[col + half_k + 1] = add_as_half2(mask_lop3(x << 0, XMASK, WMASK), o);
+                output[col + half_k + 2] = add_as_half2(mask_lop3(x >> 4, XMASK, WMASK), o);
+                output[col + half_k + 3] = add_as_half2(mask_lop3(x >> 8, XMASK, WMASK), o);
+            }
+        }
+    }
+}
+// Host wrapper that launches decompress_packed_e8p_kernel.
+// As enforced by the checks below:
+//   weights_compressed - 4-D int64 tensor of shape (N/16, K/64, 8, 4)
+//   codebook_abs       - 1-D int32 tensor with 256 entries
+// Returns an (N, K) float16 CUDA tensor holding the decompressed weights.
+__host__ extern torch::Tensor decompress_packed_e8p(
+    torch::Tensor weights_compressed,
+    torch::Tensor codebook_abs
+) {
+    CHECK_INPUT(weights_compressed);
+    CHECK_INPUT(codebook_abs);
+    TORCH_CHECK(weights_compressed.dim() == 4);
+    TORCH_CHECK(weights_compressed.size(3) == 4);
+    TORCH_CHECK(weights_compressed.size(2) == 8);
+    TORCH_CHECK(codebook_abs.dim() == 1);
+    TORCH_CHECK(weights_compressed.scalar_type() == torch::kInt64);
+    TORCH_CHECK(codebook_abs.scalar_type() == torch::kInt32);
+    TORCH_CHECK(codebook_abs.size(-1) == 256);
+    int64_t N = weights_compressed.size(0) * 16;
+    int64_t K = weights_compressed.size(1) << 6;
+    TORCH_CHECK(K % 64 == 0, "K is not divisible by 64");
+    TORCH_CHECK(N % 16 == 0, "N is not divisible by 16");
+    // The message is shown when the condition FAILS, so it must describe the
+    // failure. These bounds also keep the kernel's int index arithmetic in range.
+    TORCH_CHECK(K < 65536, "K is too large");
+    TORCH_CHECK(N < 65536, "N is too large");
+    at::DeviceGuard guard(codebook_abs.device());
+    torch::TensorOptions options = torch::TensorOptions()
+        .dtype(torch::kFloat16)
+        .layout(torch::kStrided)
+        .device(torch::kCUDA)
+        .requires_grad(false);
+    torch::Tensor output = torch::zeros(std::vector<int64_t>{N,K}, options);
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, weights_compressed.get_device());
+    // One block per multiprocessor; each block is launched as 32 x 32 threads.
+    int64_t grid_size = static_cast<int64_t>(deviceProp.multiProcessorCount);
+    at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
+    const dim3 block_size(32,32);
+    decompress_packed_e8p_kernel<<<grid_size, block_size, 0, stream>>>(
+        (uint32_t*)output.data_ptr<c10::Half>(),
+        (const uint2*)weights_compressed.data_ptr<int64_t>(),
+        (const uint32_t*)codebook_abs.data_ptr<int32_t>(),
+        N,
+        K);
+    gpuErrchk(cudaPeekAtLastError());
+    return output;
+}

quip-sharp/quiptools/quiptools_wrapper.cpp CHANGED Viewed

@@ -43,13 +43,17 @@ void decompress_e8p_origorder(
     torch::Tensor &Y         // m x n
 );
-torch::Tensor decode_matmul_e8p(
     torch::Tensor x,
     torch::Tensor weights_compressed,
     torch::Tensor codebook_abs
 );
 void decompress_hi4b1c_packed(
     torch::Tensor YIs,      // m x (n/8)
     torch::Tensor CB,       // 16 x 1
@@ -64,7 +68,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("decompress_d4", &decompress_d4, "decompress_d4");
   m.def("decompress_d4_origorder", &decompress_d4_origorder, "decompress_d4_origorder");
   m.def("decompress_e8p_origorder", &decompress_e8p_origorder, "decompress_e8p_origorder");
-  m.def("decode_matmul_e8p", &decode_matmul_e8p, "decode_matmul_e8p");
   m.def("decompress_hi4b1c_packed", &decompress_hi4b1c_packed, "decompress_hi4b1c_packed");
 }

     torch::Tensor &Y         // m x n
 );
+torch::Tensor decompress_packed_e8p(
+    torch::Tensor weights_compressed,      // (N/16) x (K/64) x 8 x 4, int64
+    torch::Tensor codebook_abs       // 1-D, 256 entries, int32
+);
+torch::Tensor decode_matvec_e8p(
     torch::Tensor x,                   // 1-D, length K, float16
     torch::Tensor weights_compressed,  // (N/16) x (K/64) x 8 x 4, int64
     torch::Tensor codebook_abs         // 1-D, 256 entries, int32
 );
 void decompress_hi4b1c_packed(
     torch::Tensor YIs,      // m x (n/8)
     torch::Tensor CB,       // 16 x 1
   m.def("decompress_d4", &decompress_d4, "decompress_d4");
   m.def("decompress_d4_origorder", &decompress_d4_origorder, "decompress_d4_origorder");
   m.def("decompress_e8p_origorder", &decompress_e8p_origorder, "decompress_e8p_origorder");
+  m.def("decompress_packed_e8p", &decompress_packed_e8p, "decompress_packed_e8p");
+  m.def("decode_matvec_e8p", &decode_matvec_e8p, "decode_matvec_e8p");
   m.def("decompress_hi4b1c_packed", &decompress_hi4b1c_packed, "decompress_hi4b1c_packed");
 }

quip-sharp/scripts/upload_hf.py CHANGED Viewed

@@ -29,4 +29,5 @@ if __name__ == "__main__":
         multi_commits=args.no_multi_commits,
         multi_commits_verbose=True,
         token=args.write_token,
     )

         multi_commits=args.no_multi_commits,
         multi_commits_verbose=True,
         token=args.write_token,
+        create_pr=True, # creates a PR. You must manually merge the PR in
     )