darisdzakwanhoesien2 committed · 59dfe0f
1 Parent(s): 962d966
Commit message: label new

Files changed:
- evaluate_esg_coverage.py (+2 -4)
- sanitize_csv.py (+1 -1)
- train_finetune.py (+13 -104)
evaluate_esg_coverage.py
CHANGED

@@ -10,13 +10,11 @@ def evaluate_models(corpus_path="data/esg_corpus_processed.csv"):
     Evaluates and benchmarks different models on ESG domain coverage.
     """
     if not os.path.exists(corpus_path):
-        …
-        sys.exit(1)
+        raise FileNotFoundError(f"Error: Corpus file not found at {corpus_path}")
 
     df = pd.read_csv(corpus_path)
     if 'text' not in df.columns:
-        …
-        sys.exit(1)
+        raise ValueError("Error: CSV must have a 'text' column.")
 
     df = df.dropna(subset=["text"])
     print(f"Loaded {len(df)} ESG sentences for evaluation.")
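
The functional change here: evaluate_models() now raises exceptions instead of terminating the process with sys.exit(1), so callers can catch failures and keep running. A minimal caller sketch; the exception handling shown is an illustration, not part of this commit:

# Hypothetical caller of the updated evaluate_models().
from evaluate_esg_coverage import evaluate_models

try:
    evaluate_models(corpus_path="data/esg_corpus_processed.csv")
except FileNotFoundError as e:
    print(e)  # corpus file missing; previously this killed the process
except ValueError as e:
    print(e)  # corpus lacks the required 'text' column
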
sanitize_csv.py
CHANGED

@@ -64,7 +64,7 @@ def sanitize_csv(input_path, output_path):
         print(f"Sanitized data saved to {output_path}")
 
     except FileNotFoundError:
-        print(f"Error: File not found at {input_path}. Make sure '…
+        print(f"Error: File not found at {input_path}. Make sure '{input_file_path}' is in your repository.")
     except Exception as e:
         print(f"An error occurred during sanitization: {e}")
 
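One caveat with the new line: it interpolates input_file_path, but the function's parameter (per the hunk header) is input_path. Unless input_file_path happens to exist at module scope, this except branch would itself raise a NameError exactly when a missing file should be reported. A sketch of the safer form, reusing the function's own parameter (the os.path.basename hint is an illustrative choice, not the author's):

# Sketch: derive the hint from input_path rather than the
# out-of-scope name input_file_path.
import os

def sanitize_csv(input_path, output_path):
    try:
        ...  # sanitization logic unchanged
    except FileNotFoundError:
        print(f"Error: File not found at {input_path}. "
              f"Make sure '{os.path.basename(input_path)}' is in your repository.")
    except Exception as e:
        print(f"An error occurred during sanitization: {e}")
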
train_finetune.py
CHANGED

@@ -1,17 +1,11 @@
 # train_finetune.py
-#
+# Script to fine-tune a SentenceTransformer model on pre-generated ESG triplets.
 
 import pandas as pd
-from sentence_transformers import SentenceTransformer, InputExample, losses
+from sentence_transformers import SentenceTransformer, InputExample, losses
 from torch.utils.data import DataLoader
-import torch
-import csv
-import re
-import os
 
 # --- File Paths ---
-RAW_CORPUS_PATH = "data/esg_corpus.csv"
-SANITIZED_CORPUS_PATH = "data/esg_corpus_sanitized.csv"
 TRIPLETS_PATH = "data/esg_triplets.csv"
 OUTPUT_MODEL_PATH = "./fine_tuned_esg_model"
 

@@ -20,105 +14,14 @@ BASE_MODEL = "all-MiniLM-L6-v2"
 TRAIN_BATCH_SIZE = 16
 NUM_EPOCHS = 4
 LEARNING_RATE = 2e-5
-NUM_TRIPLETS = 1000 # Increased for better training
 
-def sanitize_csv(input_path, output_path):
+def fine_tune_model():
     """
-    …
+    Main function to fine-tune the model using pre-generated triplets.
     """
-    print(…
+    print("--- Starting ESG Fine-Tuning ---")
-    try:
-        with open(input_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        header, body = content.split('\n', 1)
-        records_raw = re.split(r'\n(?=\d+,)', body)
-
-        output_dir = os.path.dirname(output_path)
-        if output_dir:
-            os.makedirs(output_dir, exist_ok=True)
-
-        with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
-            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
-            writer.writerow([h.strip() for h in header.strip().split(',')])
-
-            for raw_record in records_raw:
-                if not raw_record.strip():
-                    continue
-                try:
-                    parts = raw_record.split(',', 2)
-                    if len(parts) < 3:
-                        continue
-                    index, filename, markdown = parts[0].strip(), parts[1].strip(), parts[2].strip('" \n')
-                    writer.writerow([index, filename, markdown])
-                except IndexError:
-                    continue
-        print(f"Sanitized data saved to {output_path}")
-    except FileNotFoundError:
-        print(f"Error: Raw corpus file not found at {input_path}. Aborting.")
-        exit(1)
-    except Exception as e:
-        print(f"An error occurred during sanitization: {e}")
-        exit(1)
-
-def create_triplets(corpus_path, output_path, num_triplets):
-    """
-    Generates training triplets (anchor, positive, negative) from the sanitized corpus.
-    """
-    print(f"Generating triplets from {corpus_path}...")
-    try:
-        df = pd.read_csv(corpus_path)
-        if 'markdown' not in df.columns:
-            print(f"Error: Sanitized corpus at {corpus_path} is missing 'markdown' column.")
-            exit(1)
-    except FileNotFoundError:
-        print(f"Error: Sanitized corpus file not found at {corpus_path}. Aborting.")
-        exit(1)
-
-    sentences = df['markdown'].dropna().unique().tolist()
-    if len(sentences) < 3:
-        print("Error: Not enough unique sentences to generate triplets.")
-        exit(1)
 
-    …
-    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)
-
-    triplets = []
-    for i in range(len(sentences)):
-        anchor_embedding = embeddings[i].unsqueeze(0)
-        cos_scores = util.cos_sim(anchor_embedding, embeddings)[0]
-        cos_scores[i] = -1
-
-        positive_idx = torch.topk(cos_scores, k=1).indices[0].item()
-        negative_idx = torch.randint(0, len(sentences), (1,)).item()
-        while negative_idx == i or negative_idx == positive_idx:
-            negative_idx = torch.randint(0, len(sentences), (1,)).item()
-
-        triplets.append({
-            "anchor": sentences[i],
-            "positive": sentences[positive_idx],
-            "negative": sentences[negative_idx]
-        })
-        if len(triplets) >= num_triplets:
-            break
-
-    pd.DataFrame(triplets).to_csv(output_path, index=False)
-    print(f"Triplet generation complete. Saved {len(triplets)} triplets to {output_path}")
-
-def run_training_pipeline():
-    """
-    Main function to run the full pipeline: sanitize, create triplets, and fine-tune.
-    """
-    print("--- Starting Full ESG Fine-Tuning Pipeline ---")
-
-    # Step 1: Sanitize the raw CSV data
-    sanitize_csv(RAW_CORPUS_PATH, SANITIZED_CORPUS_PATH)
-
-    # Step 2: Generate triplets from the sanitized data
-    create_triplets(SANITIZED_CORPUS_PATH, TRIPLETS_PATH, num_triplets=NUM_TRIPLETS)
-
-    # Step 3: Fine-tune the model using the generated triplets
-    print("\n--- Step 3: Fine-Tuning the Model ---")
+    # Load a pre-trained SentenceTransformer model
     model = SentenceTransformer(BASE_MODEL)
 
     try:

@@ -126,15 +29,20 @@ def run_training_pipeline():
         train_examples = [InputExample(texts=[row['anchor'], row['positive'], row['negative']]) for _, row in triplets_df.iterrows()]
     except FileNotFoundError:
         print(f"Error: Triplets file not found at {TRIPLETS_PATH}. Aborting.")
+        print("Please run 'create_triplets.py' first to generate the training data.")
         return
 
     if not train_examples:
         print("No training examples found. Aborting fine-tuning.")
         return
 
+    # Create a DataLoader
     train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
+
+    # Define the loss function. MultipleNegativesRankingLoss is great for triplets.
     train_loss = losses.MultipleNegativesRankingLoss(model)
 
+    # Fine-tune the model
    print(f"Starting training for {NUM_EPOCHS} epochs...")
     model.fit(
         train_objectives=[(train_dataloader, train_loss)],

@@ -146,6 +54,7 @@ def run_training_pipeline():
     )
 
     print(f"--- Fine-tuning complete. Model saved to {OUTPUT_MODEL_PATH} ---")
+    print(f"To use this model, update FINE_TUNED_MODEL_PATH in app.py to '{OUTPUT_MODEL_PATH}'.")
 
 if __name__ == "__main__":
-    run_training_pipeline()
+    fine_tune_model()
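
The net effect of this commit: train_finetune.py no longer sanitizes the corpus or mines triplets itself; it only consumes data/esg_triplets.csv and defers generation to create_triplets.py. For reference, a self-contained sketch of the mining strategy that was deleted here (nearest neighbour by cosine similarity as the positive, a random other sentence as the negative); it assumes create_triplets.py implements something equivalent, which this page does not show:

# Sketch of the removed triplet-mining logic; create_triplets.py is
# assumed, not confirmed, to work the same way.
import torch
from sentence_transformers import SentenceTransformer, util

def mine_triplets(sentences, num_triplets=1000, base_model="all-MiniLM-L6-v2"):
    model = SentenceTransformer(base_model)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    triplets = []
    for i in range(len(sentences)):
        cos_scores = util.cos_sim(embeddings[i].unsqueeze(0), embeddings)[0]
        cos_scores[i] = -1  # exclude the anchor from its own positives
        positive_idx = int(torch.argmax(cos_scores))  # most similar sentence as positive
        negative_idx = int(torch.randint(0, len(sentences), (1,)))
        while negative_idx in (i, positive_idx):  # random negative, distinct from anchor/positive
            negative_idx = int(torch.randint(0, len(sentences), (1,)))
        triplets.append({"anchor": sentences[i],
                         "positive": sentences[positive_idx],
                         "negative": sentences[negative_idx]})
        if len(triplets) >= num_triplets:
            break
    return triplets

On the training side, MultipleNegativesRankingLoss also treats every other positive in the batch as an additional negative for each anchor, which is why a single mined negative per example is sufficient here.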