darisdzakwanhoesien2 committed on
Commit 59dfe0f · 1 Parent(s): 962d966
Files changed (3):
  1. evaluate_esg_coverage.py +2 -4
  2. sanitize_csv.py +1 -1
  3. train_finetune.py +13 -104
evaluate_esg_coverage.py CHANGED
@@ -10,13 +10,11 @@ def evaluate_models(corpus_path="data/esg_corpus_processed.csv"):
     Evaluates and benchmarks different models on ESG domain coverage.
     """
     if not os.path.exists(corpus_path):
-        print(f"Error: Corpus file not found at {corpus_path}")
-        sys.exit(1)
+        raise FileNotFoundError(f"Error: Corpus file not found at {corpus_path}")

     df = pd.read_csv(corpus_path)
     if 'text' not in df.columns:
-        print("Error: CSV must have a 'text' column.")
-        sys.exit(1)
+        raise ValueError("Error: CSV must have a 'text' column.")

     df = df.dropna(subset=["text"])
     print(f"Loaded {len(df)} ESG sentences for evaluation.")
sanitize_csv.py CHANGED
@@ -64,7 +64,7 @@ def sanitize_csv(input_path, output_path):
         print(f"Sanitized data saved to {output_path}")

     except FileNotFoundError:
-        print(f"Error: File not found at {input_path}. Make sure 'data/esg_corpus.csv' is in your repository.")
+        print(f"Error: File not found at {input_path}. Make sure '{input_file_path}' is in your repository.")
     except Exception as e:
         print(f"An error occurred during sanitization: {e}")

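Note on the change above: the message now interpolates input_file_path instead of hard-coding 'data/esg_corpus.csv'. Because sanitize_csv() only receives input_path and output_path, the f-string resolves only if a name input_file_path exists in an enclosing scope; otherwise the except branch itself raises NameError. A minimal sketch of that assumption (the module-level constant is illustrative, not shown in this hunk):

# Assumed module-level name in sanitize_csv.py; without it, the except
# branch would fail with NameError when the f-string is evaluated.
input_file_path = "data/esg_corpus.csv"

def sanitize_csv(input_path, output_path):
    try:
        ...  # read, clean, and rewrite the CSV
        print(f"Sanitized data saved to {output_path}")
    except FileNotFoundError:
        # input_file_path is looked up in module scope, not the arguments.
        print(f"Error: File not found at {input_path}. "
              f"Make sure '{input_file_path}' is in your repository.")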
train_finetune.py CHANGED
@@ -1,17 +1,11 @@
 # train_finetune.py
-# Integrated script to sanitize data, create triplets, and fine-tune a model.
+# Script to fine-tune a SentenceTransformer model on pre-generated ESG triplets.

 import pandas as pd
-from sentence_transformers import SentenceTransformer, InputExample, losses, util
+from sentence_transformers import SentenceTransformer, InputExample, losses
 from torch.utils.data import DataLoader
-import torch
-import csv
-import re
-import os

 # --- File Paths ---
-RAW_CORPUS_PATH = "data/esg_corpus.csv"
-SANITIZED_CORPUS_PATH = "data/esg_corpus_sanitized.csv"
 TRIPLETS_PATH = "data/esg_triplets.csv"
 OUTPUT_MODEL_PATH = "./fine_tuned_esg_model"

@@ -20,105 +14,14 @@ BASE_MODEL = "all-MiniLM-L6-v2"
 TRAIN_BATCH_SIZE = 16
 NUM_EPOCHS = 4
 LEARNING_RATE = 2e-5
-NUM_TRIPLETS = 1000 # Increased for better training

-def sanitize_csv(input_path, output_path):
+def fine_tune_model():
     """
-    Reads a malformed CSV, cleans it, and writes a valid CSV file.
+    Main function to fine-tune the model using pre-generated triplets.
     """
-    print(f"Sanitizing {input_path}...")
-    try:
-        with open(input_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        header, body = content.split('\n', 1)
-        records_raw = re.split(r'\n(?=\d+,)', body)
-
-        output_dir = os.path.dirname(output_path)
-        if output_dir:
-            os.makedirs(output_dir, exist_ok=True)
-
-        with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
-            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
-            writer.writerow([h.strip() for h in header.strip().split(',')])
-
-            for raw_record in records_raw:
-                if not raw_record.strip():
-                    continue
-                try:
-                    parts = raw_record.split(',', 2)
-                    if len(parts) < 3:
-                        continue
-                    index, filename, markdown = parts[0].strip(), parts[1].strip(), parts[2].strip('" \n')
-                    writer.writerow([index, filename, markdown])
-                except IndexError:
-                    continue
-        print(f"Sanitized data saved to {output_path}")
-    except FileNotFoundError:
-        print(f"Error: Raw corpus file not found at {input_path}. Aborting.")
-        exit(1)
-    except Exception as e:
-        print(f"An error occurred during sanitization: {e}")
-        exit(1)
-
-def create_triplets(corpus_path, output_path, num_triplets):
-    """
-    Generates training triplets (anchor, positive, negative) from the sanitized corpus.
-    """
-    print(f"Generating triplets from {corpus_path}...")
-    try:
-        df = pd.read_csv(corpus_path)
-        if 'markdown' not in df.columns:
-            print(f"Error: Sanitized corpus at {corpus_path} is missing 'markdown' column.")
-            exit(1)
-    except FileNotFoundError:
-        print(f"Error: Sanitized corpus file not found at {corpus_path}. Aborting.")
-        exit(1)
-
-    sentences = df['markdown'].dropna().unique().tolist()
-    if len(sentences) < 3:
-        print("Error: Not enough unique sentences to generate triplets.")
-        exit(1)
+    print("--- Starting ESG Fine-Tuning ---")

-    model = SentenceTransformer(BASE_MODEL)
-    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)
-
-    triplets = []
-    for i in range(len(sentences)):
-        anchor_embedding = embeddings[i].unsqueeze(0)
-        cos_scores = util.cos_sim(anchor_embedding, embeddings)[0]
-        cos_scores[i] = -1
-
-        positive_idx = torch.topk(cos_scores, k=1).indices[0].item()
-        negative_idx = torch.randint(0, len(sentences), (1,)).item()
-        while negative_idx == i or negative_idx == positive_idx:
-            negative_idx = torch.randint(0, len(sentences), (1,)).item()
-
-        triplets.append({
-            "anchor": sentences[i],
-            "positive": sentences[positive_idx],
-            "negative": sentences[negative_idx]
-        })
-        if len(triplets) >= num_triplets:
-            break
-
-    pd.DataFrame(triplets).to_csv(output_path, index=False)
-    print(f"Triplet generation complete. Saved {len(triplets)} triplets to {output_path}")
-
-def run_training_pipeline():
-    """
-    Main function to run the full pipeline: sanitize, create triplets, and fine-tune.
-    """
-    print("--- Starting Full ESG Fine-Tuning Pipeline ---")
-
-    # Step 1: Sanitize the raw CSV data
-    sanitize_csv(RAW_CORPUS_PATH, SANITIZED_CORPUS_PATH)
-
-    # Step 2: Generate triplets from the sanitized data
-    create_triplets(SANITIZED_CORPUS_PATH, TRIPLETS_PATH, num_triplets=NUM_TRIPLETS)
-
-    # Step 3: Fine-tune the model using the generated triplets
-    print("\n--- Step 3: Fine-Tuning the Model ---")
+    # Load a pre-trained SentenceTransformer model
     model = SentenceTransformer(BASE_MODEL)

     try:
@@ -126,15 +29,20 @@ def run_training_pipeline():
         train_examples = [InputExample(texts=[row['anchor'], row['positive'], row['negative']]) for _, row in triplets_df.iterrows()]
     except FileNotFoundError:
         print(f"Error: Triplets file not found at {TRIPLETS_PATH}. Aborting.")
+        print("Please run 'create_triplets.py' first to generate the training data.")
         return

     if not train_examples:
         print("No training examples found. Aborting fine-tuning.")
         return

+    # Create a DataLoader
     train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
+
+    # Define the loss function. MultipleNegativesRankingLoss is great for triplets.
     train_loss = losses.MultipleNegativesRankingLoss(model)

+    # Fine-tune the model
     print(f"Starting training for {NUM_EPOCHS} epochs...")
     model.fit(
         train_objectives=[(train_dataloader, train_loss)],
@@ -146,6 +54,7 @@ def run_training_pipeline():
     )

     print(f"--- Fine-tuning complete. Model saved to {OUTPUT_MODEL_PATH} ---")
+    print(f"To use this model, update FINE_TUNED_MODEL_PATH in app.py to '{OUTPUT_MODEL_PATH}'.")

 if __name__ == "__main__":
-    run_training_pipeline()
+    fine_tune_model()
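Note on the change above: train_finetune.py no longer sanitizes data or mines triplets; the new error path points to a separate create_triplets.py, which is not part of this commit. A sketch of what that script could look like, reconstructed from the create_triplets() function removed here (the file name, default arguments, and ValueError style are assumptions):

# create_triplets.py -- hypothetical companion script, reconstructed from
# the create_triplets() function this commit removes.
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

BASE_MODEL = "all-MiniLM-L6-v2"

def create_triplets(corpus_path, output_path, num_triplets=1000):
    """Mine (anchor, positive, negative) triplets from the sanitized corpus."""
    df = pd.read_csv(corpus_path)
    sentences = df["markdown"].dropna().unique().tolist()
    if len(sentences) < 3:
        raise ValueError("Not enough unique sentences to generate triplets.")

    model = SentenceTransformer(BASE_MODEL)
    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)

    triplets = []
    for i in range(len(sentences)):
        # Hard positive: the most similar other sentence by cosine similarity.
        cos_scores = util.cos_sim(embeddings[i].unsqueeze(0), embeddings)[0]
        cos_scores[i] = -1
        positive_idx = torch.topk(cos_scores, k=1).indices[0].item()

        # Negative: a random sentence distinct from anchor and positive.
        negative_idx = torch.randint(0, len(sentences), (1,)).item()
        while negative_idx in (i, positive_idx):
            negative_idx = torch.randint(0, len(sentences), (1,)).item()

        triplets.append({
            "anchor": sentences[i],
            "positive": sentences[positive_idx],
            "negative": sentences[negative_idx],
        })
        if len(triplets) >= num_triplets:
            break

    pd.DataFrame(triplets).to_csv(output_path, index=False)
    print(f"Saved {len(triplets)} triplets to {output_path}")

if __name__ == "__main__":
    create_triplets("data/esg_corpus_sanitized.csv", "data/esg_triplets.csv")

After training, app.py would load the result with SentenceTransformer("./fine_tuned_esg_model"), matching the FINE_TUNED_MODEL_PATH hint printed at the end of the run.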