darisdzakwanhoesien2 committed on
Commit 59dfe0f · 1 Parent(s): 962d966
Files changed (3):
  1. evaluate_esg_coverage.py +2 -4
  2. sanitize_csv.py +1 -1
  3. train_finetune.py +13 -104
evaluate_esg_coverage.py CHANGED
@@ -10,13 +10,11 @@ def evaluate_models(corpus_path="data/esg_corpus_processed.csv"):
     Evaluates and benchmarks different models on ESG domain coverage.
     """
     if not os.path.exists(corpus_path):
-        print(f"Error: Corpus file not found at {corpus_path}")
-        sys.exit(1)
+        raise FileNotFoundError(f"Error: Corpus file not found at {corpus_path}")

     df = pd.read_csv(corpus_path)
     if 'text' not in df.columns:
-        print("Error: CSV must have a 'text' column.")
-        sys.exit(1)
+        raise ValueError("Error: CSV must have a 'text' column.")

     df = df.dropna(subset=["text"])
     print(f"Loaded {len(df)} ESG sentences for evaluation.")
sanitize_csv.py CHANGED
@@ -64,7 +64,7 @@ def sanitize_csv(input_path, output_path):
         print(f"Sanitized data saved to {output_path}")

     except FileNotFoundError:
-        print(f"Error: File not found at {input_path}. Make sure 'data/esg_corpus.csv' is in your repository.")
+        print(f"Error: File not found at {input_path}. Make sure '{input_file_path}' is in your repository.")
     except Exception as e:
         print(f"An error occurred during sanitization: {e}")

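Note on the change above: the message now interpolates input_file_path instead of hard-coding 'data/esg_corpus.csv'. Because sanitize_csv() only receives input_path and output_path, the f-string resolves only if a name input_file_path exists in an enclosing scope; otherwise the except branch itself raises NameError. A minimal sketch of that assumption (the module-level constant is illustrative, not shown in this hunk):

# Assumed module-level name in sanitize_csv.py; without it, the except
# branch would fail with NameError when the f-string is evaluated.
input_file_path = "data/esg_corpus.csv"

def sanitize_csv(input_path, output_path):
    try:
        ...  # read, clean, and rewrite the CSV
        print(f"Sanitized data saved to {output_path}")
    except FileNotFoundError:
        # input_file_path is looked up in module scope, not the arguments.
        print(f"Error: File not found at {input_path}. "
              f"Make sure '{input_file_path}' is in your repository.")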
train_finetune.py CHANGED
@@ -1,17 +1,11 @@
 # train_finetune.py
-# Integrated script to sanitize data, create triplets, and fine-tune a model.
+# Script to fine-tune a SentenceTransformer model on pre-generated ESG triplets.

 import pandas as pd
-from sentence_transformers import SentenceTransformer, InputExample, losses, util
+from sentence_transformers import SentenceTransformer, InputExample, losses
 from torch.utils.data import DataLoader
-import torch
-import csv
-import re
-import os

 # --- File Paths ---
-RAW_CORPUS_PATH = "data/esg_corpus.csv"
-SANITIZED_CORPUS_PATH = "data/esg_corpus_sanitized.csv"
 TRIPLETS_PATH = "data/esg_triplets.csv"
 OUTPUT_MODEL_PATH = "./fine_tuned_esg_model"

@@ -20,105 +14,14 @@ BASE_MODEL = "all-MiniLM-L6-v2"
 TRAIN_BATCH_SIZE = 16
 NUM_EPOCHS = 4
 LEARNING_RATE = 2e-5
-NUM_TRIPLETS = 1000 # Increased for better training

-def sanitize_csv(input_path, output_path):
+def fine_tune_model():
     """
-    Reads a malformed CSV, cleans it, and writes a valid CSV file.
+    Main function to fine-tune the model using pre-generated triplets.
     """
-    print(f"Sanitizing {input_path}...")
-    try:
-        with open(input_path, 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        header, body = content.split('\n', 1)
-        records_raw = re.split(r'\n(?=\d+,)', body)
-
-        output_dir = os.path.dirname(output_path)
-        if output_dir:
-            os.makedirs(output_dir, exist_ok=True)
-
-        with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
-            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
-            writer.writerow([h.strip() for h in header.strip().split(',')])
-
-            for raw_record in records_raw:
-                if not raw_record.strip():
-                    continue
-                try:
-                    parts = raw_record.split(',', 2)
-                    if len(parts) < 3:
-                        continue
-                    index, filename, markdown = parts[0].strip(), parts[1].strip(), parts[2].strip('" \n')
-                    writer.writerow([index, filename, markdown])
-                except IndexError:
-                    continue
-        print(f"Sanitized data saved to {output_path}")
-    except FileNotFoundError:
-        print(f"Error: Raw corpus file not found at {input_path}. Aborting.")
-        exit(1)
-    except Exception as e:
-        print(f"An error occurred during sanitization: {e}")
-        exit(1)
-
-def create_triplets(corpus_path, output_path, num_triplets):
-    """
-    Generates training triplets (anchor, positive, negative) from the sanitized corpus.
-    """
-    print(f"Generating triplets from {corpus_path}...")
-    try:
-        df = pd.read_csv(corpus_path)
-        if 'markdown' not in df.columns:
-            print(f"Error: Sanitized corpus at {corpus_path} is missing 'markdown' column.")
-            exit(1)
-    except FileNotFoundError:
-        print(f"Error: Sanitized corpus file not found at {corpus_path}. Aborting.")
-        exit(1)
-
-    sentences = df['markdown'].dropna().unique().tolist()
-    if len(sentences) < 3:
-        print("Error: Not enough unique sentences to generate triplets.")
-        exit(1)
+    print("--- Starting ESG Fine-Tuning ---")

-    model = SentenceTransformer(BASE_MODEL)
-    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)
-
-    triplets = []
-    for i in range(len(sentences)):
-        anchor_embedding = embeddings[i].unsqueeze(0)
-        cos_scores = util.cos_sim(anchor_embedding, embeddings)[0]
-        cos_scores[i] = -1
-
-        positive_idx = torch.topk(cos_scores, k=1).indices[0].item()
-        negative_idx = torch.randint(0, len(sentences), (1,)).item()
-        while negative_idx == i or negative_idx == positive_idx:
-            negative_idx = torch.randint(0, len(sentences), (1,)).item()
-
-        triplets.append({
-            "anchor": sentences[i],
-            "positive": sentences[positive_idx],
-            "negative": sentences[negative_idx]
-        })
-        if len(triplets) >= num_triplets:
-            break
-
-    pd.DataFrame(triplets).to_csv(output_path, index=False)
-    print(f"Triplet generation complete. Saved {len(triplets)} triplets to {output_path}")
-
-def run_training_pipeline():
-    """
-    Main function to run the full pipeline: sanitize, create triplets, and fine-tune.
-    """
-    print("--- Starting Full ESG Fine-Tuning Pipeline ---")
-
-    # Step 1: Sanitize the raw CSV data
-    sanitize_csv(RAW_CORPUS_PATH, SANITIZED_CORPUS_PATH)
-
-    # Step 2: Generate triplets from the sanitized data
-    create_triplets(SANITIZED_CORPUS_PATH, TRIPLETS_PATH, num_triplets=NUM_TRIPLETS)
-
-    # Step 3: Fine-tune the model using the generated triplets
-    print("\n--- Step 3: Fine-Tuning the Model ---")
+    # Load a pre-trained SentenceTransformer model
     model = SentenceTransformer(BASE_MODEL)

     try:
@@ -126,15 +29,20 @@ def run_training_pipeline():
         train_examples = [InputExample(texts=[row['anchor'], row['positive'], row['negative']]) for _, row in triplets_df.iterrows()]
     except FileNotFoundError:
         print(f"Error: Triplets file not found at {TRIPLETS_PATH}. Aborting.")
+        print("Please run 'create_triplets.py' first to generate the training data.")
         return

     if not train_examples:
         print("No training examples found. Aborting fine-tuning.")
         return

+    # Create a DataLoader
     train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
+
+    # Define the loss function. MultipleNegativesRankingLoss is great for triplets.
     train_loss = losses.MultipleNegativesRankingLoss(model)

+    # Fine-tune the model
     print(f"Starting training for {NUM_EPOCHS} epochs...")
     model.fit(
         train_objectives=[(train_dataloader, train_loss)],
@@ -146,6 +54,7 @@ def run_training_pipeline():
     )

     print(f"--- Fine-tuning complete. Model saved to {OUTPUT_MODEL_PATH} ---")
+    print(f"To use this model, update FINE_TUNED_MODEL_PATH in app.py to '{OUTPUT_MODEL_PATH}'.")

 if __name__ == "__main__":
-    run_training_pipeline()
+    fine_tune_model()
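Note on the change above: train_finetune.py no longer sanitizes data or mines triplets; the new error path points to a separate create_triplets.py, which is not part of this commit. A sketch of what that script could look like, reconstructed from the create_triplets() function removed here (the file name, default arguments, and ValueError style are assumptions):

# create_triplets.py -- hypothetical companion script, reconstructed from
# the create_triplets() function this commit removes.
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

BASE_MODEL = "all-MiniLM-L6-v2"

def create_triplets(corpus_path, output_path, num_triplets=1000):
    """Mine (anchor, positive, negative) triplets from the sanitized corpus."""
    df = pd.read_csv(corpus_path)
    sentences = df["markdown"].dropna().unique().tolist()
    if len(sentences) < 3:
        raise ValueError("Not enough unique sentences to generate triplets.")

    model = SentenceTransformer(BASE_MODEL)
    embeddings = model.encode(sentences, convert_to_tensor=True, show_progress_bar=True)

    triplets = []
    for i in range(len(sentences)):
        # Hard positive: the most similar other sentence by cosine similarity.
        cos_scores = util.cos_sim(embeddings[i].unsqueeze(0), embeddings)[0]
        cos_scores[i] = -1
        positive_idx = torch.topk(cos_scores, k=1).indices[0].item()

        # Negative: a random sentence distinct from anchor and positive.
        negative_idx = torch.randint(0, len(sentences), (1,)).item()
        while negative_idx in (i, positive_idx):
            negative_idx = torch.randint(0, len(sentences), (1,)).item()

        triplets.append({
            "anchor": sentences[i],
            "positive": sentences[positive_idx],
            "negative": sentences[negative_idx],
        })
        if len(triplets) >= num_triplets:
            break

    pd.DataFrame(triplets).to_csv(output_path, index=False)
    print(f"Saved {len(triplets)} triplets to {output_path}")

if __name__ == "__main__":
    create_triplets("data/esg_corpus_sanitized.csv", "data/esg_triplets.csv")

After training, app.py would load the result with SentenceTransformer("./fine_tuned_esg_model"), matching the FINE_TUNED_MODEL_PATH hint printed at the end of the run.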