Upload folder using huggingface_hub
handler.py +45 -21
handler.py
CHANGED
@@ -188,24 +188,46 @@ class EndpointHandler:
 
 
     def postprocess(self, outputs):
-        logger.info("Starting postprocess.")
-
+        logger.info("Starting postprocess for text generation (Greedy Decoding).")
+        # 'outputs' here is the output of the inference method (logits)
+        # This implements a basic greedy decoding strategy
+
+        if self.tokenizer is None:
+            logger.error("Tokenizer is not available for postprocessing.")
+            raise RuntimeError("Tokenizer is not available for postprocessing.")
+
         try:
-            #
-            #
-            #
-            # This
-
-            #
-
-            #
-            #
-
-
-
-
-
-
+            # Assuming outputs are logits of shape (batch_size, sequence_length, vocab_size)
+            # For greedy decoding, we take the argmax of the logits for the last token
+            # and append it to the input sequence.
+            # This basic handler will generate one token at a time in a loop.
+            # A real text generation handler would likely take the initial input_ids
+            # and loop until a stop condition is met.
+
+            # For a single forward pass output (like from the inference method),
+            # we can't generate a sequence directly here.
+            # The handle method would need to manage the generation loop.
+
+            # Let's adapt this postprocess to just decode the most probable token from the last position
+            # as a basic example, or return the input + the most probable next token.
+
+            # Assuming inputs were processed one by one (batch size 1 for simplicity in this example)
+            # And outputs are logits for the input sequence
+            if outputs.ndim == 3 and outputs.shape[0] == 1:  # Shape (1, seq_len, vocab_size)
+                last_token_logits = outputs[0, -1, :]  # Logits for the last token in the sequence
+                predicted_token_id = torch.argmax(last_token_logits).item()
+
+                # Decode the predicted token
+                predicted_text = self.tokenizer.decode([predicted_token_id])
+                logger.info(f"Predicted next token: {predicted_text} (ID: {predicted_token_id})")
+
+                # In a real generation loop, you would append this to the input and repeat.
+                # For this handler, let's just return the predicted token text.
+                return predicted_text
+            else:
+                logger.warning(f"Unexpected output shape for greedy decoding: {outputs.shape}. Returning raw logits list.")
+                return outputs.cpu().tolist()
+
 
         except Exception as e:
             logger.error(f"Error during postprocess: {e}", exc_info=True)
@@ -221,14 +243,16 @@ class EndpointHandler:
             model_input = self.preprocess(data)
             logger.info("Preprocessing successful.")
 
-            # 2. Inference
+            # 2. Inference (single forward pass to get logits)
             logger.info("Calling inference...")
-
+            model_output_logits = self.inference(model_input)
             logger.info("Inference successful.")
 
-            # 3. Postprocess
+            # 3. Postprocess (basic greedy decoding of the next token)
+            # Note: This postprocess only generates the *next* token.
+            # For full text generation, you would need a loop here or modify inference.
             logger.info("Calling postprocess...")
-            response = self.postprocess(
+            response = self.postprocess(model_output_logits)
             logger.info("Postprocessing successful.")
 
             logger.info("Handle method complete.")
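
The comments added to postprocess and handle both note that this version only decodes the single most probable next token, and that full text generation would need a loop in the handle method (or a modified inference). A minimal sketch of that greedy loop, written directly against a causal LM rather than this handler's methods; the model name, prompt, and max_new_tokens below are placeholders, not values from this repository:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder; the model actually served by this handler is not shown in the diff
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

input_ids = tokenizer("The quick brown fox", return_tensors="pt").input_ids

max_new_tokens = 20  # placeholder stop condition
with torch.no_grad():
    for _ in range(max_new_tokens):
        logits = model(input_ids).logits                       # (1, seq_len, vocab_size)
        next_token_id = torch.argmax(logits[0, -1, :]).item()  # greedy: most probable next token
        if next_token_id == tokenizer.eos_token_id:
            break
        # Append the chosen token and run the model again, as the postprocess comments describe.
        input_ids = torch.cat([input_ids, torch.tensor([[next_token_id]])], dim=-1)

print(tokenizer.decode(input_ids[0], skip_special_tokens=True))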
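
The handle comments also mention modifying inference as an alternative to looping in handle. In that case the whole greedy loop can be delegated to transformers' built-in generate, since do_sample=False with the default num_beams=1 performs the same argmax-per-step (greedy) search; again a sketch with placeholder names rather than this handler's actual code:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
# generate() handles the append-and-rerun loop internally and stops at EOS or max_new_tokens.
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))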