from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch


class EndpointHandler:
    def __init__(self, path=""):
        # Load the tokenizer and seq2seq model from the repository path.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
        )
        # Run on GPU when one is available; bfloat16 inference on CPU is slow.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, data):
        # Inference Endpoints send a dict payload; fall back to the raw data.
        inputs = data.pop("inputs", data)
        # Wrap the prompt in the chat format the tokenizer's template expects.
        messages = [{"role": "user", "content": inputs}]
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                max_new_tokens=1024,
                temperature=0.1,
                do_sample=True,
            )
        # Seq2seq generation returns only the decoder output, so the whole
        # sequence can be decoded without stripping the prompt tokens.
        return {
            "generated_text": self.tokenizer.decode(
                outputs[0], skip_special_tokens=True
            )
        }
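

# A minimal local smoke test for the handler above, assuming a seq2seq
# checkpoint whose tokenizer defines a chat template. The "./model" path is
# a placeholder: in a deployed Inference Endpoint, the platform itself
# instantiates EndpointHandler with the repository path.
if __name__ == "__main__":
    handler = EndpointHandler(path="./model")
    result = handler({"inputs": "Summarize: transformers are sequence models."})
    print(result["generated_text"])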