"""One-shot FP8-dynamic quantization of Kimi-VL-A3B-Thinking with llm-compressor.

Loads the model + processor, applies a data-free FP8_DYNAMIC PTQ recipe to all
Linear layers (except lm_head, the multimodal projector, and the vision tower),
and writes the quantized checkpoint to SAVE_DIR.
"""
from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "moonshotai/Kimi-VL-A3B-Thinking-2506"
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

# trust_remote_code is required: Kimi-VL ships custom modeling/processing code.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure the simple PTQ quantization. FP8_DYNAMIC is data-free, so no
# calibration dataset is needed. Skip quantization for:
#   - the LM head ("re:.*lm_head"),
#   - multimodal projector modules ("re:multi_.*" — note: "multi_*" would only
#     match a bare "multi" plus optional underscores under fullmatch semantics),
#   - the vision tower ("re:vision.*"), which is kept in higher precision.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:multi_.*", "re:vision.*"],
)

# Apply the quantization algorithm. Passing output_dir makes oneshot() save the
# compressed model (and config) itself, so no explicit save_pretrained calls
# are needed afterwards.
oneshot(
    model=model,
    recipe=recipe,
    tokenizer=processor,  # oneshot accepts the processor via the tokenizer arg
    output_dir=SAVE_DIR,
)