from transformers import AutoModelForCausalLM, AutoTokenizer
model_name_or_path = ""  # fill in a local path or Hugging Face Hub model id

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# 4-bit quantized load (requires bitsandbytes); alternatively load in bfloat16 and/or move the model to GPU
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", load_in_4bit=True)
messages = [
    {"role": "user", "content": "How to make pasta?"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    thinking_budget=0,  # control the thinking budget
)
outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=60)
output_text = tokenizer.decode(outputs[0])
print(output_text)
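
# Extra keyword arguments such as thinking_budget are forwarded by
# apply_chat_template to the chat template itself, so the budget can be
# raised the same way. A minimal sketch, assuming this model's template
# accepts a positive token budget for the reasoning phase (512 and the
# larger max_new_tokens are only illustrative values):
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    thinking_budget=512,  # assumption: positive values cap the thinking tokens
)
outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=1024)
# Decode only the newly generated tokens, skipping the echoed prompt
new_tokens = outputs[0][tokenized_chat.shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))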