"""Apply FP8-dynamic post-training quantization to Kimi-VL-A3B-Thinking.

Loads the model and its processor, quantizes all Linear layers except the
LM head, the multimodal projector, and the vision tower, and writes the
compressed checkpoint to ``SAVE_DIR``.
"""

from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "moonshotai/Kimi-VL-A3B-Thinking-2506"
# e.g. "Kimi-VL-A3B-Thinking-2506-FP8-Dynamic"
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

# trust_remote_code is required: Kimi-VL ships custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Configure the simple PTQ quantization recipe.
# FP8_DYNAMIC needs no calibration data. Keep the LM head, the multimodal
# projector, and the vision tower in full precision:
#   - "re:.*lm_head"  : output head
#   - "re:multi.*"    : multimodal projector modules
#     (was "re:multi_*", where "_*" means "zero or more underscores" —
#      it only worked by accident via re.match's prefix semantics)
#   - "re:vision.*"   : vision encoder
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["re:.*lm_head", "re:multi.*", "re:vision.*"],
)

# Apply the quantization algorithm; output_dir makes oneshot() save the
# compressed model and processor, so no explicit save_pretrained() calls
# are needed afterwards.
oneshot(
    model=model,
    recipe=recipe,
    tokenizer=processor,  # pass the loaded processor here
    output_dir=SAVE_DIR,
)