import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained("glm-4-voice-9b", trust_remote_code=True)
# A chat template must be a Jinja template that iterates over `messages`;
# the bare "{{role}}: {{content}}" form references undefined variables and
# renders nothing useful. This is a minimal working placeholder.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}assistant:{% endif %}"
)

query = "你好"  # "Hello" -- a short sample prompt in Chinese
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)
# The bare `load_in_4bit=True` kwarg is deprecated in recent transformers
# releases; passing a BitsAndBytesConfig is the supported way to request
# 4-bit bitsandbytes quantization.
model = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map=device,  # place the quantized weights on the target GPU
).eval()
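# Optional sanity check before saving (a minimal sketch; max_new_tokens=32 is
# an illustrative setting, not a recommended default for this model):
with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=32)
response = output_ids[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(response, skip_special_tokens=True))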
model.save_pretrained("glm-4-voice-9b-int4")
tokenizer.save_pretrained("glm-4-voice-9b-int4")
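# The 4-bit weights and quantization config are now serialized under
# "glm-4-voice-9b-int4" and can be reloaded without re-quantizing (a sketch
# that assumes a transformers/bitsandbytes pairing recent enough to support
# 4-bit serialization):
model_int4 = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b-int4",
    trust_remote_code=True,
    device_map="auto",
).eval()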