Stepan committed
Commit da83cd6 · 1 Parent(s): 2614015

Test dataset improvements

.gitignore CHANGED
@@ -2,3 +2,4 @@
  __pycache__
  .streamlit
  qdrant_data/.lock
+ .env
README.md CHANGED
@@ -10,3 +10,95 @@ pinned: false
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # LTU Chat RAG Evaluation
+
+ This repository contains a RAG (Retrieval-Augmented Generation) pipeline for the LTU (Luleå University of Technology) programme data, along with evaluation tools using Ragas.
+
+ ## Overview
+
+ The system uses:
+ - **Qdrant**: Vector database for storing and retrieving embeddings
+ - **Haystack**: Framework for building the RAG pipeline
+ - **Ragas**: Framework for evaluating RAG systems
+
+ ## Files
+
+ - `rag_pipeline.py`: Main RAG pipeline implementation
+ - `ragas_eval.py`: Script to evaluate the RAG pipeline using Ragas
+ - `testset.jsonl`: JSONL file containing test questions, reference answers, and contexts
+ - `testset_generation.py`: Script used to generate the test set
+
+ ## Requirements
+
+ ```
+ streamlit==1.42.2
+ haystack-ai==2.10.3
+ qdrant-client==1.13.2
+ python-dotenv==1.0.1
+ beautifulsoup4==4.13.3
+ qdrant-haystack==8.0.0
+ ragas-haystack==2.1.0
+ rapidfuzz==3.12.2
+ pandas==2.2.2
+ ```
+
+ ## Setup
+
+ 1. Make sure you have all the required packages installed:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ 2. Set up your environment variables:
+ ```
+ export NEBIUS_API_KEY="your_api_key_here"
+ ```
+ The scripts read `NEBIUS_API_KEY` from the environment; a local `.env` file is also loaded via `python-dotenv`, so set the key one way or the other before running them.
+
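Since `.env` is listed in `.gitignore` and `python-dotenv` is among the dependencies, the key can equivalently live in a local `.env` file next to the scripts. A minimal sketch (the value is a placeholder):

```
NEBIUS_API_KEY=your_api_key_here
```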
+ ## Running the Evaluation
+
+ To evaluate the RAG pipeline using Ragas, run the script below (a Python equivalent is sketched after the step list):
+
+ ```bash
+ python ragas_eval.py
+ ```
+
+ This will:
+ 1. Load the Qdrant document store from the local directory
+ 2. Load the test set from `testset.jsonl`
+ 3. Run the RAG pipeline on each test question
+ 4. Evaluate the results using Ragas metrics
+ 5. Log and print the evaluation scores (writing them to `ragas_evaluation_results.json` is currently commented out in `ragas_eval.py`)
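The same run can be driven from Python. A minimal sketch using the `RAGEvaluator` class from `ragas_eval.py` (defaults as shown under "Customization" below):

```python
from ragas_eval import RAGEvaluator

# uses the default models, Qdrant path and collection configured in ragas_eval.py
evaluator = RAGEvaluator()
results = evaluator.run_evaluation("testset.jsonl")
print(results)
```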
+
+ ## Ragas Metrics
+
+ The evaluation uses the following Ragas metrics (they can also be passed to `evaluate()` explicitly, as sketched below the list):
+
+ - **Faithfulness**: Measures whether the generated answer is factually consistent with the retrieved contexts
+ - **Answer Relevancy**: Measures whether the answer is relevant to the question
+ - **Context Precision**: Measures the proportion of retrieved contexts that are relevant
+ - **Context Recall**: Measures whether the retrieved contexts contain the information needed to answer the question
+ - **Context Relevancy**: Measures the relevance of retrieved contexts to the question (currently commented out in `ragas_eval.py`)
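`ragas_eval.py` currently leaves the metric list to the Ragas defaults; its commented-out code shows how to pass metrics explicitly. A minimal sketch along those lines, reusing the dataset and wrappers built in the script (`eval_ds`, `llm_wrapper` and `embedding_wrapper` are assumed to be constructed the same way as in `RAGEvaluator`):

```python
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# pass an explicit metric list instead of relying on the Ragas defaults
evaluation_result = evaluate(
    eval_ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm_wrapper,
    embeddings=embedding_wrapper,
)
print(evaluation_result)
```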
+
+ ## Customization
+
+ You can customize the evaluation by modifying the `RAGEvaluator` class parameters:
+
+ ```python
+ evaluator = RAGEvaluator(
+     embedding_model_name="BAAI/bge-en-icl",
+     llm_model_name="meta-llama/Llama-3.3-70B-Instruct",
+     qdrant_path="./qdrant_data",
+     api_base_url="https://api.studio.nebius.com/v1/",
+     collection_name="ltu_programmes"
+ )
+ ```
+
+ ## Test Set Format
+
+ The test set is a JSONL file where each line contains (an example record is sketched below the list):
+ - `user_input`: The question
+ - `reference`: The reference answer
+ - `reference_contexts`: List of reference contexts that should be retrieved
+ - `synthesizer_name`: Name of the synthesizer used to generate the sample
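A minimal sketch of one `testset.jsonl` line (field values are illustrative placeholders, not taken from the real data set; the synthesizer name follows the Ragas naming convention):

```json
{"user_input": "Which master's programmes does LTU offer in computer science?", "reference": "LTU offers ...", "reference_contexts": ["Programme page text that supports the answer ..."], "synthesizer_name": "single_hop_specific_query_synthesizer"}
```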
rag_pipeline.py CHANGED
@@ -24,11 +24,11 @@ load_dotenv()
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
 
- logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
- logging.getLogger("haystack").setLevel(logging.DEBUG)
+ # logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
+ # logging.getLogger("haystack").setLevel(logging.DEBUG)
 
- tracing.tracer.is_content_tracing_enabled = True # to enable tracing/logging content (inputs/outputs)
- tracing.enable_tracing(LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m", "haystack.component.name": "\x1b[1;34m"}))
+ # tracing.tracer.is_content_tracing_enabled = True # to enable tracing/logging content (inputs/outputs)
+ # tracing.enable_tracing(LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m", "haystack.component.name": "\x1b[1;34m"}))
 
  class RAGPipeline:
  def __init__(
@@ -207,15 +207,16 @@ class RAGPipeline:
  "text_embedder": {"text": question},
  # "bm25_retriever": {"query": question},
  "prompt_builder": {"question": question}
- })
+ }, {'embedding_retriever'})
 
  # Extract answer and documents
  answer = result["llm"]["replies"][0]
- # documents = result["embedding_retriever"]["documents"]
+ print(result.keys())
+ documents = result["embedding_retriever"]["documents"]
 
  return {
  "answer": answer,
- "documents": [], #documents,
+ "documents": documents, #documents,
  "question": question
  }
  except Exception as e:
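For context on the second positional argument added to `pipeline.run()` above: this appears to be Haystack 2.x's `include_outputs_from` parameter, a set of component names whose outputs are kept in the result dict, which is why `result["embedding_retriever"]["documents"]` becomes available. A minimal sketch with the keyword spelled out, assuming the component names used in this pipeline:

```python
from haystack import Pipeline

def query(pipeline: Pipeline, question: str) -> dict:
    # Request the retriever's output explicitly so it appears in `result`
    # alongside the LLM reply.
    result = pipeline.run(
        {
            "text_embedder": {"text": question},
            "prompt_builder": {"question": question},
        },
        include_outputs_from={"embedding_retriever"},
    )
    answer = result["llm"]["replies"][0]
    documents = result["embedding_retriever"]["documents"]
    return {"answer": answer, "documents": documents, "question": question}
```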
ragas_eval.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import json
+ import logging
+ from typing import List, Dict, Any
+
+ # Haystack imports
+ from haystack.utils import Secret
+ from haystack.components.generators.openai import OpenAIGenerator
+ from haystack.components.embedders import OpenAITextEmbedder
+ from ragas import EvaluationDataset, SingleTurnSample
+
+
+ # Ragas imports
+ from ragas.metrics import (
+     faithfulness,
+     answer_relevancy,
+     context_precision,
+     context_recall,
+     # context_relevancy
+ )
+ from ragas.llms.haystack_wrapper import HaystackLLMWrapper
+ from ragas.embeddings.haystack_wrapper import HaystackEmbeddingsWrapper
+ from ragas import evaluate
+ import pandas as pd
+
+ # Import the existing RAG pipeline
+ from rag_pipeline import RAGPipeline
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ class RAGEvaluator:
+     def __init__(
+         self,
+         embedding_model_name: str = "BAAI/bge-en-icl",
+         llm_model_name: str = "meta-llama/Llama-3.3-70B-Instruct",
+         qdrant_path: str = "./qdrant_data",
+         api_base_url: str = "https://api.studio.nebius.com/v1/",
+         collection_name: str = "ltu_documents"
+     ):
+         self.embedding_model_name = embedding_model_name
+         self.llm_model_name = llm_model_name
+         self.qdrant_path = qdrant_path
+         self.api_base_url = api_base_url
+         self.collection_name = collection_name
+
+         # Load the API key from the environment (NEBIUS_API_KEY)
+         self.api_key = Secret.from_token(os.getenv("NEBIUS_API_KEY"))
+
+         # Initialize the existing RAG pipeline
+         self.init_components()
+
+     def init_components(self):
+         """Initialize the existing RAG pipeline and Ragas components"""
+         logger.info("Initializing components...")
+
+         # Initialize the existing RAG pipeline
+         self.rag_pipeline = RAGPipeline(
+             embedding_model_name=self.embedding_model_name,
+             llm_model_name=self.llm_model_name,
+             qdrant_path=self.qdrant_path
+         )
+
+         # Initialize Ragas wrappers
+         self.llm_wrapper = HaystackLLMWrapper(
+             OpenAIGenerator(
+                 api_base_url="https://api.studio.nebius.com/v1/",
+                 model=self.llm_model_name,
+                 api_key=self.api_key,
+                 generation_kwargs={
+                     "max_tokens": 1024,
+                     "temperature": 0.1,
+                     "top_p": 0.95,
+                 }
+             )
+         )
+
+         self.embedding_wrapper = HaystackEmbeddingsWrapper(
+             OpenAITextEmbedder(
+                 api_base_url="https://api.studio.nebius.com/v1/",
+                 model=self.embedding_model_name,
+                 api_key=self.api_key,
+             )
+         )
+
+         logger.info("Components initialized successfully")
+
+     def load_testset(self, testset_path: str) -> List[Dict[str, Any]]:
+         """Load test set from a JSONL file"""
+         logger.info(f"Loading test set from {testset_path}...")
+
+         test_data = []
+         with open(testset_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 test_data.append(json.loads(line))
+
+         logger.info(f"Loaded {len(test_data)} test samples")
+         return test_data
+
+     def prepare_ragas_dataframe(self, test_data: List[Dict[str, Any]], results: List[Dict[str, Any]]) -> EvaluationDataset:
+         """Prepare an EvaluationDataset for Ragas evaluation"""
+         logger.info("Preparing data for Ragas evaluation...")
+
+         eval_data = []
+
+         for _, (test_sample, result) in enumerate(zip(test_data, results)):
+             question = test_sample["user_input"]
+             reference_answer = test_sample["reference"]
+
+             # Get generated answer and contexts from pipeline result
+             generated_answer = result["answer"]
+             contexts = [doc.content for doc in result["documents"]]
+
+             # Get reference contexts
+             reference_contexts = test_sample.get("reference_contexts", [])
+
+             eval_data.append(SingleTurnSample(
+                 user_input=question,
+                 response=generated_answer,
+                 retrieved_contexts=contexts,
+                 reference=reference_answer,
+                 reference_contexts=reference_contexts
+             ))
+             # print(eval_data[0])
+
+         return EvaluationDataset(eval_data)
+
+     def run_evaluation(self, testset_path: str = "testset.jsonl") -> Dict[str, float]:
+         """Run the full evaluation process"""
+         logger.info("Starting RAG pipeline evaluation...")
+
+         # Load test set
+         test_data = self.load_testset(testset_path)
+
+         # Run pipeline for each test sample
+         results = []
+         for i, test_sample in enumerate(test_data):
+             logger.info(f"Processing test sample {i+1}/{len(test_data)}")
+             question = test_sample["user_input"]
+
+             # Run the existing RAG pipeline
+             result = self.rag_pipeline.query(question)
+             results.append(result)
+
+         # Prepare data for Ragas
+         eval_ds = self.prepare_ragas_dataframe(test_data, results)
+
+         # Run Ragas evaluation
+         logger.info("Running Ragas evaluation...")
+         evaluation_result = evaluate(
+             eval_ds,
+             # metrics=[
+             #     faithfulness,
+             #     answer_relevancy,
+             #     context_precision,
+             #     context_recall,
+             #     # context_relevancy
+             # ],
+             llm=self.llm_wrapper,
+             embeddings=self.embedding_wrapper,
+             # reference_answers=eval_df["reference_answer"].tolist(),
+             # reference_contexts=eval_df["reference_contexts"].tolist()
+         )
+
+         # Print and return results
+         logger.info("Evaluation complete!")
+         logger.info(f"Results: {evaluation_result}")
+
+         return evaluation_result
+
+ if __name__ == "__main__":
+     # Create and run evaluator
+     evaluator = RAGEvaluator()
+     results = evaluator.run_evaluation()
+     print(repr(results))
+     # Save results to file
+     # with open("ragas_evaluation_results.json", "w") as f:
+     #     json.dump(results.to_dict(), f, indent=2)
+
+     # print("\nEvaluation results saved to ragas_evaluation_results.json")
+     # INFO:__main__:Results: {
+     #     'answer_relevancy': 0.8558,
+     #     'context_precision': 0.9033,
+     #     'faithfulness': 0.8000,
+     #     'context_recall': 0.9417
+     # }
+     # {'answer_relevancy': 0.8558, 'context_precision': 0.9033, 'faithfulness': 0.8000, 'context_recall': 0.9417}
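The JSON export in the `__main__` block above is left commented out, which is why `ragas_evaluation_results.json` is added empty below. A minimal sketch of enabling it, assuming the Ragas result object exposes `to_dict()` as the commented lines suggest:

```python
import json

# after: results = evaluator.run_evaluation()
with open("ragas_evaluation_results.json", "w") as f:
    json.dump(results.to_dict(), f, indent=2)
```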
ragas_evaluation_results.json ADDED
File without changes
requirements.txt CHANGED
@@ -5,4 +5,5 @@ python-dotenv==1.0.1
  beautifulsoup4==4.13.3
  qdrant-haystack==8.0.0
  ragas-haystack==2.1.0
- rapidfuzz==3.12.2
+ rapidfuzz==3.12.2
+ pandas==2.2.2
testset.json → testset.jsonl RENAMED
File without changes
testset_generation.py CHANGED
@@ -4,9 +4,12 @@ from haystack.components.generators.openai import OpenAIGenerator
  from haystack.components.embedders import OpenAITextEmbedder
  from haystack.utils import Secret
  import json
+ import os
+ from dotenv import load_dotenv
 
+ load_dotenv() # This loads variables from .env into the environment
+ api_key = Secret.from_token(os.getenv("NEBIUS_API_KEY"))
 
- api_key = Secret.from_token("eyJhbGciOiJIUzI1NiIsImtpZCI6IlV6SXJWd1h0dnprLVRvdzlLZWstc0M1akptWXBvX1VaVkxUZlpnMDRlOFUiLCJ0eXAiOiJKV1QifQ.eyJzdWIiOiJnaXRodWJ8MzM4NTU5OCIsInNjb3BlIjoib3BlbmlkIG9mZmxpbmVfYWNjZXNzIiwiaXNzIjoiYXBpX2tleV9pc3N1ZXIiLCJhdWQiOlsiaHR0cHM6Ly9uZWJpdXMtaW5mZXJlbmNlLmV1LmF1dGgwLmNvbS9hcGkvdjIvIl0sImV4cCI6MTg5ODY1NzA4NSwidXVpZCI6IjkwYWY2MmQ5LTQ1M2ItNDZjNi05N2ZkLTg3ZTQ2YWEzMTg0NyIsIm5hbWUiOiJsdHUtdGhlc2lzICIsImV4cGlyZXNfYXQiOiIyMDMwLTAzLTAyVDA0OjQ0OjQ1KzAwMDAifQ.f31st8MhisxGfLxXeLEsSPGIoCKGy1Py3_-qn2Cw2Tw")
  llm = HaystackLLMWrapper(OpenAIGenerator(
  api_base_url="https://api.studio.nebius.com/v1/",
  model="meta-llama/Llama-3.3-70B-Instruct",
@@ -36,6 +39,6 @@ if not lcdocs:
  generator = TestsetGenerator(llm=llm, embedding_model=embedding)
  dataset = generator.generate_with_langchain_docs(lcdocs, testset_size=10)
  # Save the generated test samples to a JSON file
- dataset.to_jsonl("testset.json")
+ dataset.to_jsonl("testset.jsonl")
 
- print(f"Saved {len(dataset)} test samples to testset.json")
+ print(f"Saved {len(dataset)} test samples to testset.jsonl")