```
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_path = 'efficient-llm/llama-2-13b-chat-gptq'
# All Llama-2 sizes share one tokenizer, so the 7B tokenizer also serves the 13B model.
tokenizer_path = 'meta-llama/Llama-2-7b-hf'

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False,  # alternative workaround: skip fused-attention injection
    disable_exllama=True,  # exllama kernels support only 4-bit weights, not this 3-bit checkpoint
    device_map='auto',
    revision='3bit_128g',  # repo branch holding the 3-bit, group-size-128 quantized weights
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
input_ids = tokenizer('How are you?', return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))
```
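Since the checkpoint is a chat-tuned Llama-2 model, the bare prompt above works but typically answers better when wrapped in Llama-2's `[INST] ... [/INST]` instruction format. A minimal sketch, reusing the `model` and `tokenizer` loaded above; `max_new_tokens` bounds only the generated continuation rather than the total sequence length:

```
# Wrap the prompt in Llama-2's chat instruction format.
prompt = '[INST] How are you? [/INST]'
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_new_tokens=128)
# Decode only the newly generated tokens, skipping the echoed prompt.
print(tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True))
```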