frugal-ai-submission-template

Sleeping

App Files Files Community

Tonic committed on Feb 10

Commit

4477f42

verified ·

1 Parent(s): 71340db

fix transformers

Browse files

Files changed (1) hide show

tasks/text.py +47 -60

tasks/text.py CHANGED Viewed

@@ -1,14 +1,13 @@
 from fastapi import APIRouter
 from datetime import datetime
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
-import random
-from transformers import pipeline, AutoConfig
 import os
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
-import numpy as np
 import torch
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
@@ -18,23 +17,23 @@ os.environ["TORCH_COMPILE_DISABLE"] = "1"
 router = APIRouter()
-DESCRIPTION = "Random Baseline"
 ROUTE = "/text"
 class TextClassifier:
     def __init__(self):
-        # Add retry mechanism for model initialization
         max_retries = 3
         for attempt in range(max_retries):
             try:
-                self.config = AutoConfig.from_pretrained("Tonic/climate-guard-toxic-agent")
-                self.label2id = self.config.label2id
-                self.classifier = pipeline(
-                    "text-classification",
-                    "Tonic/climate-guard-toxic-agent",
-                    device="cpu",
-                    batch_size=16
                 )
                 print("Model initialized successfully")
                 break
             except Exception as e:
@@ -43,21 +42,37 @@ class TextClassifier:
                 print(f"Attempt {attempt + 1} failed, retrying...")
                 time.sleep(1)
     def process_batch(self, batch: List[str], batch_idx: int) -> Tuple[List[int], int]:
         """Process a batch of texts and return their predictions"""
         max_retries = 3
         for attempt in range(max_retries):
             try:
                 print(f"Processing batch {batch_idx} with {len(batch)} items (attempt {attempt + 1})")
-                # Process texts one by one in case of errors
                 predictions = []
                 for text in batch:
-                    try:
-                        pred = self.classifier(text)
-                        pred_label = self.label2id[pred[0]["label"]]
-                        predictions.append(pred_label)
-                    except Exception as e:
-                        print(f"Error processing text in batch {batch_idx}: {str(e)}")
                 if not predictions:
                     raise Exception("No predictions generated for batch")
@@ -68,21 +83,14 @@ class TextClassifier:
             except Exception as e:
                 if attempt == max_retries - 1:
                     print(f"Final error in batch {batch_idx}: {str(e)}")
-                    return [0] * len(batch), batch_idx  # Return default predictions instead of empty list
                 print(f"Error in batch {batch_idx} (attempt {attempt + 1}): {str(e)}")
                 time.sleep(1)
-@router.post(ROUTE, tags=["Text Task"],
-             description=DESCRIPTION)
 async def evaluate_text(request: TextEvaluationRequest):
-    """
-    Evaluate text classification for climate disinformation detection.
-    Current Model: Random Baseline
-    - Makes random predictions from the label space (0-7)
-    - Used as a baseline for comparison
-    """
     # Get space info
     username, space_url = get_space_info()
@@ -100,30 +108,20 @@ async def evaluate_text(request: TextEvaluationRequest):
     # Load and prepare the dataset
     dataset = load_dataset(request.dataset_name)
-    # Convert string labels to integers
     dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
-    # Split dataset
-    train_test = dataset["train"]
     test_dataset = dataset["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
-    #--------------------------------------------------------------------------------------------
-    # YOUR MODEL INFERENCE CODE HERE
-    # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
-    #--------------------------------------------------------------------------------------------
     true_labels = test_dataset["label"]
     # Initialize the model once
     classifier = TextClassifier()
     # Prepare batches
-    batch_size = 32
     quotes = test_dataset["quote"]
     num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
     batches = [
@@ -131,54 +129,44 @@ async def evaluate_text(request: TextEvaluationRequest):
         for i in range(num_batches)
     ]
-    # Initialize batch_results before parallel processing
     batch_results = [[] for _ in range(num_batches)]
     # Process batches in parallel
-    max_workers = min(os.cpu_count(), 4)  # Limit to 4 workers or CPU count
     print(f"Processing with {max_workers} workers")
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        # Submit all batches for processing
         future_to_batch = {
-            executor.submit(
-                classifier.process_batch,
-                batch,
-                idx
-            ): idx for idx, batch in enumerate(batches)
         }
-        # Collect results in order
         for future in future_to_batch:
             batch_idx = future_to_batch[future]
             try:
                 predictions, idx = future.result()
-                if predictions:  # Only store non-empty predictions
                     batch_results[idx] = predictions
                     print(f"Stored results for batch {idx} ({len(predictions)} predictions)")
             except Exception as e:
                 print(f"Failed to get results for batch {batch_idx}: {e}")
-                # Use default predictions instead of empty list
                 batch_results[batch_idx] = [0] * len(batches[batch_idx])
-    # Flatten predictions while maintaining order
     predictions = []
     for batch_preds in batch_results:
         if batch_preds is not None:
             predictions.extend(batch_preds)
-    #--------------------------------------------------------------------------------------------
-    # YOUR MODEL INFERENCE STOPS HERE
-    #--------------------------------------------------------------------------------------------
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
-    print("accuracy : ", accuracy)
-    # Prepare results dictionary
     results = {
         "username": username,
         "space_url": space_url,
@@ -196,6 +184,5 @@ async def evaluate_text(request: TextEvaluationRequest):
         }
     }
-    print("results : ", results)
     return results

 from fastapi import APIRouter
 from datetime import datetime
+import time
 from datasets import load_dataset
 from sklearn.metrics import accuracy_score
 import os
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Tuple
 import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from .utils.evaluation import TextEvaluationRequest
 from .utils.emissions import tracker, clean_emissions_data, get_space_info
 router = APIRouter()
+DESCRIPTION = "Climate Guard Toxic Agent Classifier"
 ROUTE = "/text"
 class TextClassifier:
     def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         max_retries = 3
         for attempt in range(max_retries):
             try:
+                # Load model and tokenizer directly instead of using pipeline
+                self.model = AutoModelForSequenceClassification.from_pretrained(
+                    "Tonic/climate-guard-toxic-agent"
+                ).to(self.device)
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    "Tonic/climate-guard-toxic-agent"
                 )
+                self.model.eval()  # Set to evaluation mode
                 print("Model initialized successfully")
                 break
             except Exception as e:
                 print(f"Attempt {attempt + 1} failed, retrying...")
                 time.sleep(1)
+    def predict_single(self, text: str) -> int:
+        """Predict single text instance"""
+        try:
+            inputs = self.tokenizer(
+                text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512,
+                padding=True
+            ).to(self.device)
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+                predictions = outputs.logits.argmax(-1)
+                return predictions.item()
+        except Exception as e:
+            print(f"Error in single prediction: {str(e)}")
+            return 0  # Return default prediction on error
     def process_batch(self, batch: List[str], batch_idx: int) -> Tuple[List[int], int]:
         """Process a batch of texts and return their predictions"""
         max_retries = 3
         for attempt in range(max_retries):
             try:
                 print(f"Processing batch {batch_idx} with {len(batch)} items (attempt {attempt + 1})")
                 predictions = []
+                # Process texts one by one for better error handling
                 for text in batch:
+                    pred = self.predict_single(text)
+                    predictions.append(pred)
                 if not predictions:
                     raise Exception("No predictions generated for batch")
             except Exception as e:
                 if attempt == max_retries - 1:
                     print(f"Final error in batch {batch_idx}: {str(e)}")
+                    return [0] * len(batch), batch_idx
                 print(f"Error in batch {batch_idx} (attempt {attempt + 1}): {str(e)}")
                 time.sleep(1)
+@router.post(ROUTE, tags=["Text Task"], description=DESCRIPTION)
 async def evaluate_text(request: TextEvaluationRequest):
+    """Evaluate text classification for climate disinformation detection."""
     # Get space info
     username, space_url = get_space_info()
     # Load and prepare the dataset
     dataset = load_dataset(request.dataset_name)
     dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
     test_dataset = dataset["test"]
     # Start tracking emissions
     tracker.start()
     tracker.start_task("inference")
     true_labels = test_dataset["label"]
     # Initialize the model once
     classifier = TextClassifier()
     # Prepare batches
+    batch_size = 16  # Reduced batch size for better memory management
     quotes = test_dataset["quote"]
     num_batches = len(quotes) // batch_size + (1 if len(quotes) % batch_size != 0 else 0)
     batches = [
         for i in range(num_batches)
     ]
+    # Initialize batch_results
     batch_results = [[] for _ in range(num_batches)]
     # Process batches in parallel
+    max_workers = min(os.cpu_count(), 4)
     print(f"Processing with {max_workers} workers")
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_batch = {
+            executor.submit(classifier.process_batch, batch, idx): idx
+            for idx, batch in enumerate(batches)
         }
         for future in future_to_batch:
             batch_idx = future_to_batch[future]
             try:
                 predictions, idx = future.result()
+                if predictions:
                     batch_results[idx] = predictions
                     print(f"Stored results for batch {idx} ({len(predictions)} predictions)")
             except Exception as e:
                 print(f"Failed to get results for batch {batch_idx}: {e}")
                 batch_results[batch_idx] = [0] * len(batches[batch_idx])
+    # Flatten predictions
     predictions = []
     for batch_preds in batch_results:
         if batch_preds is not None:
             predictions.extend(batch_preds)
     # Stop tracking emissions
     emissions_data = tracker.stop_task()
     # Calculate accuracy
     accuracy = accuracy_score(true_labels, predictions)
+    print("accuracy:", accuracy)
+    # Prepare results
     results = {
         "username": username,
         "space_url": space_url,
         }
     }
+    print("results:", results)
     return results