Stepan committed
Commit da83cd6 · 1 Parent(s): 2614015

Test dataset improvements

.gitignore CHANGED
@@ -2,3 +2,4 @@
  __pycache__
  .streamlit
  qdrant_data/.lock
+ .env
README.md CHANGED
@@ -10,3 +10,95 @@ pinned: false
  ---
 
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # LTU Chat RAG Evaluation
+
+ This repository contains a RAG (Retrieval-Augmented Generation) pipeline for the LTU (Luleå University of Technology) programme data, along with evaluation tools using Ragas.
+
+ ## Overview
+
+ The system uses:
+ - **Qdrant**: Vector database for storing and retrieving embeddings
+ - **Haystack**: Framework for building the RAG pipeline
+ - **Ragas**: Framework for evaluating RAG systems
+
+ ## Files
+
+ - `rag_pipeline.py`: Main RAG pipeline implementation
+ - `ragas_eval.py`: Script to evaluate the RAG pipeline using Ragas
+ - `testset.jsonl`: JSONL file containing test questions, reference answers, and contexts
+ - `testset_generation.py`: Script used to generate the test set
+
+ ## Requirements
+
+ ```
+ streamlit==1.42.2
+ haystack-ai==2.10.3
+ qdrant-client==1.13.2
+ python-dotenv==1.0.1
+ beautifulsoup4==4.13.3
+ qdrant-haystack==8.0.0
+ ragas-haystack==2.1.0
+ rapidfuzz==3.12.2
+ pandas==2.2.2
+ ```
+
+ ## Setup
+
+ 1. Make sure you have all the required packages installed:
+ ```
+ pip install -r requirements.txt
+ ```
+
+ 2. Set up your environment variables:
+ ```
+ export NEBIUS_API_KEY="your_api_key_here"
+ ```
+ The scripts read `NEBIUS_API_KEY` from the environment; a local `.env` file is also loaded via `python-dotenv`, so set the key one way or the other before running them.
+
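Since `.env` is listed in `.gitignore` and `python-dotenv` is among the dependencies, the key can equivalently live in a local `.env` file next to the scripts. A minimal sketch (the value is a placeholder):

```
NEBIUS_API_KEY=your_api_key_here
```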
+ ## Running the Evaluation
+
+ To evaluate the RAG pipeline using Ragas, run the script below (a Python equivalent is sketched after the step list):
+
+ ```bash
+ python ragas_eval.py
+ ```
+
+ This will:
+ 1. Load the Qdrant document store from the local directory
+ 2. Load the test set from `testset.jsonl`
+ 3. Run the RAG pipeline on each test question
+ 4. Evaluate the results using Ragas metrics
+ 5. Log and print the evaluation scores (writing them to `ragas_evaluation_results.json` is currently commented out in `ragas_eval.py`)
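The same run can be driven from Python. A minimal sketch using the `RAGEvaluator` class from `ragas_eval.py` (defaults as shown under "Customization" below):

```python
from ragas_eval import RAGEvaluator

# uses the default models, Qdrant path and collection configured in ragas_eval.py
evaluator = RAGEvaluator()
results = evaluator.run_evaluation("testset.jsonl")
print(results)
```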
+
+ ## Ragas Metrics
+
+ The evaluation uses the following Ragas metrics (they can also be passed to `evaluate()` explicitly, as sketched below the list):
+
+ - **Faithfulness**: Measures whether the generated answer is factually consistent with the retrieved contexts
+ - **Answer Relevancy**: Measures whether the answer is relevant to the question
+ - **Context Precision**: Measures the proportion of retrieved contexts that are relevant
+ - **Context Recall**: Measures whether the retrieved contexts contain the information needed to answer the question
+ - **Context Relevancy**: Measures the relevance of retrieved contexts to the question (currently commented out in `ragas_eval.py`)
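`ragas_eval.py` currently leaves the metric list to the Ragas defaults; its commented-out code shows how to pass metrics explicitly. A minimal sketch along those lines, reusing the dataset and wrappers built in the script (`eval_ds`, `llm_wrapper` and `embedding_wrapper` are assumed to be constructed the same way as in `RAGEvaluator`):

```python
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# pass an explicit metric list instead of relying on the Ragas defaults
evaluation_result = evaluate(
    eval_ds,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm_wrapper,
    embeddings=embedding_wrapper,
)
print(evaluation_result)
```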
+
+ ## Customization
+
+ You can customize the evaluation by modifying the `RAGEvaluator` class parameters:
+
+ ```python
+ evaluator = RAGEvaluator(
+     embedding_model_name="BAAI/bge-en-icl",
+     llm_model_name="meta-llama/Llama-3.3-70B-Instruct",
+     qdrant_path="./qdrant_data",
+     api_base_url="https://api.studio.nebius.com/v1/",
+     collection_name="ltu_programmes"
+ )
+ ```
+
+ ## Test Set Format
+
+ The test set is a JSONL file where each line contains (an example record is sketched below the list):
+ - `user_input`: The question
+ - `reference`: The reference answer
+ - `reference_contexts`: List of reference contexts that should be retrieved
+ - `synthesizer_name`: Name of the synthesizer used to generate the sample
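A minimal sketch of one `testset.jsonl` line (field values are illustrative placeholders, not taken from the real data set; the synthesizer name follows the Ragas naming convention):

```json
{"user_input": "Which master's programmes does LTU offer in computer science?", "reference": "LTU offers ...", "reference_contexts": ["Programme page text that supports the answer ..."], "synthesizer_name": "single_hop_specific_query_synthesizer"}
```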
rag_pipeline.py CHANGED
@@ -24,11 +24,11 @@ load_dotenv()
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
 
- logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
- logging.getLogger("haystack").setLevel(logging.DEBUG)
+ # logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
+ # logging.getLogger("haystack").setLevel(logging.DEBUG)
 
- tracing.tracer.is_content_tracing_enabled = True # to enable tracing/logging content (inputs/outputs)
- tracing.enable_tracing(LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m", "haystack.component.name": "\x1b[1;34m"}))
+ # tracing.tracer.is_content_tracing_enabled = True # to enable tracing/logging content (inputs/outputs)
+ # tracing.enable_tracing(LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m", "haystack.component.name": "\x1b[1;34m"}))
 
  class RAGPipeline:
  def __init__(
@@ -207,15 +207,16 @@ class RAGPipeline:
  "text_embedder": {"text": question},
  # "bm25_retriever": {"query": question},
  "prompt_builder": {"question": question}
- })
+ }, {'embedding_retriever'})
 
  # Extract answer and documents
  answer = result["llm"]["replies"][0]
- # documents = result["embedding_retriever"]["documents"]
+ print(result.keys())
+ documents = result["embedding_retriever"]["documents"]
 
  return {
  "answer": answer,
- "documents": [], #documents,
+ "documents": documents, #documents,
  "question": question
  }
  except Exception as e:
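For context on the second positional argument added to `pipeline.run()` above: this appears to be Haystack 2.x's `include_outputs_from` parameter, a set of component names whose outputs are kept in the result dict, which is why `result["embedding_retriever"]["documents"]` becomes available. A minimal sketch with the keyword spelled out, assuming the component names used in this pipeline:

```python
from haystack import Pipeline

def query(pipeline: Pipeline, question: str) -> dict:
    # Request the retriever's output explicitly so it appears in `result`
    # alongside the LLM reply.
    result = pipeline.run(
        {
            "text_embedder": {"text": question},
            "prompt_builder": {"question": question},
        },
        include_outputs_from={"embedding_retriever"},
    )
    answer = result["llm"]["replies"][0]
    documents = result["embedding_retriever"]["documents"]
    return {"answer": answer, "documents": documents, "question": question}
```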
ragas_eval.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import json
+ import logging
+ from typing import List, Dict, Any
+
+ # Haystack imports
+ from haystack.utils import Secret
+ from haystack.components.generators.openai import OpenAIGenerator
+ from haystack.components.embedders import OpenAITextEmbedder
+ from ragas import EvaluationDataset, SingleTurnSample
+
+
+ # Ragas imports
+ from ragas.metrics import (
+     faithfulness,
+     answer_relevancy,
+     context_precision,
+     context_recall,
+     # context_relevancy
+ )
+ from ragas.llms.haystack_wrapper import HaystackLLMWrapper
+ from ragas.embeddings.haystack_wrapper import HaystackEmbeddingsWrapper
+ from ragas import evaluate
+ import pandas as pd
+
+ # Import the existing RAG pipeline
+ from rag_pipeline import RAGPipeline
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ class RAGEvaluator:
+     def __init__(
+         self,
+         embedding_model_name: str = "BAAI/bge-en-icl",
+         llm_model_name: str = "meta-llama/Llama-3.3-70B-Instruct",
+         qdrant_path: str = "./qdrant_data",
+         api_base_url: str = "https://api.studio.nebius.com/v1/",
+         collection_name: str = "ltu_documents"
+     ):
+         self.embedding_model_name = embedding_model_name
+         self.llm_model_name = llm_model_name
+         self.qdrant_path = qdrant_path
+         self.api_base_url = api_base_url
+         self.collection_name = collection_name
+
+         # Load the API key from the environment (NEBIUS_API_KEY)
+         self.api_key = Secret.from_token(os.getenv("NEBIUS_API_KEY"))
+
+         # Initialize the existing RAG pipeline
+         self.init_components()
+
+     def init_components(self):
+         """Initialize the existing RAG pipeline and Ragas components"""
+         logger.info("Initializing components...")
+
+         # Initialize the existing RAG pipeline
+         self.rag_pipeline = RAGPipeline(
+             embedding_model_name=self.embedding_model_name,
+             llm_model_name=self.llm_model_name,
+             qdrant_path=self.qdrant_path
+         )
+
+         # Initialize Ragas wrappers
+         self.llm_wrapper = HaystackLLMWrapper(
+             OpenAIGenerator(
+                 api_base_url="https://api.studio.nebius.com/v1/",
+                 model=self.llm_model_name,
+                 api_key=self.api_key,
+                 generation_kwargs={
+                     "max_tokens": 1024,
+                     "temperature": 0.1,
+                     "top_p": 0.95,
+                 }
+             )
+         )
+
+         self.embedding_wrapper = HaystackEmbeddingsWrapper(
+             OpenAITextEmbedder(
+                 api_base_url="https://api.studio.nebius.com/v1/",
+                 model=self.embedding_model_name,
+                 api_key=self.api_key,
+             )
+         )
+
+         logger.info("Components initialized successfully")
+
+     def load_testset(self, testset_path: str) -> List[Dict[str, Any]]:
+         """Load test set from a JSONL file"""
+         logger.info(f"Loading test set from {testset_path}...")
+
+         test_data = []
+         with open(testset_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 test_data.append(json.loads(line))
+
+         logger.info(f"Loaded {len(test_data)} test samples")
+         return test_data
+
+     def prepare_ragas_dataframe(self, test_data: List[Dict[str, Any]], results: List[Dict[str, Any]]) -> EvaluationDataset:
+         """Prepare an EvaluationDataset for Ragas evaluation"""
+         logger.info("Preparing data for Ragas evaluation...")
+
+         eval_data = []
+
+         for _, (test_sample, result) in enumerate(zip(test_data, results)):
+             question = test_sample["user_input"]
+             reference_answer = test_sample["reference"]
+
+             # Get generated answer and contexts from pipeline result
+             generated_answer = result["answer"]
+             contexts = [doc.content for doc in result["documents"]]
+
+             # Get reference contexts
+             reference_contexts = test_sample.get("reference_contexts", [])
+
+             eval_data.append(SingleTurnSample(
+                 user_input=question,
+                 response=generated_answer,
+                 retrieved_contexts=contexts,
+                 reference=reference_answer,
+                 reference_contexts=reference_contexts
+             ))
+             # print(eval_data[0])
+
+         return EvaluationDataset(eval_data)
+
+     def run_evaluation(self, testset_path: str = "testset.jsonl") -> Dict[str, float]:
+         """Run the full evaluation process"""
+         logger.info("Starting RAG pipeline evaluation...")
+
+         # Load test set
+         test_data = self.load_testset(testset_path)
+
+         # Run pipeline for each test sample
+         results = []
+         for i, test_sample in enumerate(test_data):
+             logger.info(f"Processing test sample {i+1}/{len(test_data)}")
+             question = test_sample["user_input"]
+
+             # Run the existing RAG pipeline
+             result = self.rag_pipeline.query(question)
+             results.append(result)
+
+         # Prepare data for Ragas
+         eval_ds = self.prepare_ragas_dataframe(test_data, results)
+
+         # Run Ragas evaluation
+         logger.info("Running Ragas evaluation...")
+         evaluation_result = evaluate(
+             eval_ds,
+             # metrics=[
+             #     faithfulness,
+             #     answer_relevancy,
+             #     context_precision,
+             #     context_recall,
+             #     # context_relevancy
+             # ],
+             llm=self.llm_wrapper,
+             embeddings=self.embedding_wrapper,
+             # reference_answers=eval_df["reference_answer"].tolist(),
+             # reference_contexts=eval_df["reference_contexts"].tolist()
+         )
+
+         # Print and return results
+         logger.info("Evaluation complete!")
+         logger.info(f"Results: {evaluation_result}")
+
+         return evaluation_result
+
+ if __name__ == "__main__":
+     # Create and run evaluator
+     evaluator = RAGEvaluator()
+     results = evaluator.run_evaluation()
+     print(repr(results))
+     # Save results to file
+     # with open("ragas_evaluation_results.json", "w") as f:
+     #     json.dump(results.to_dict(), f, indent=2)
+
+     # print("\nEvaluation results saved to ragas_evaluation_results.json")
+     # INFO:__main__:Results: {
+     #     'answer_relevancy': 0.8558,
+     #     'context_precision': 0.9033,
+     #     'faithfulness': 0.8000,
+     #     'context_recall': 0.9417
+     # }
+     # {'answer_relevancy': 0.8558, 'context_precision': 0.9033, 'faithfulness': 0.8000, 'context_recall': 0.9417}
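The JSON export in the `__main__` block above is left commented out, which is why `ragas_evaluation_results.json` is added empty below. A minimal sketch of enabling it, assuming the Ragas result object exposes `to_dict()` as the commented lines suggest:

```python
import json

# after: results = evaluator.run_evaluation()
with open("ragas_evaluation_results.json", "w") as f:
    json.dump(results.to_dict(), f, indent=2)
```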
ragas_evaluation_results.json ADDED
File without changes
requirements.txt CHANGED
@@ -5,4 +5,5 @@ python-dotenv==1.0.1
  beautifulsoup4==4.13.3
  qdrant-haystack==8.0.0
  ragas-haystack==2.1.0
- rapidfuzz==3.12.2
+ rapidfuzz==3.12.2
+ pandas==2.2.2
testset.json → testset.jsonl RENAMED
File without changes
testset_generation.py CHANGED
@@ -4,9 +4,12 @@ from haystack.components.generators.openai import OpenAIGenerator
  from haystack.components.embedders import OpenAITextEmbedder
  from haystack.utils import Secret
  import json
+ import os
+ from dotenv import load_dotenv
 
+ load_dotenv() # This loads variables from .env into the environment
+ api_key = Secret.from_token(os.getenv("NEBIUS_API_KEY"))
 
- api_key = Secret.from_token("eyJhbGciOiJIUzI1NiIsImtpZCI6IlV6SXJWd1h0dnprLVRvdzlLZWstc0M1akptWXBvX1VaVkxUZlpnMDRlOFUiLCJ0eXAiOiJKV1QifQ.eyJzdWIiOiJnaXRodWJ8MzM4NTU5OCIsInNjb3BlIjoib3BlbmlkIG9mZmxpbmVfYWNjZXNzIiwiaXNzIjoiYXBpX2tleV9pc3N1ZXIiLCJhdWQiOlsiaHR0cHM6Ly9uZWJpdXMtaW5mZXJlbmNlLmV1LmF1dGgwLmNvbS9hcGkvdjIvIl0sImV4cCI6MTg5ODY1NzA4NSwidXVpZCI6IjkwYWY2MmQ5LTQ1M2ItNDZjNi05N2ZkLTg3ZTQ2YWEzMTg0NyIsIm5hbWUiOiJsdHUtdGhlc2lzICIsImV4cGlyZXNfYXQiOiIyMDMwLTAzLTAyVDA0OjQ0OjQ1KzAwMDAifQ.f31st8MhisxGfLxXeLEsSPGIoCKGy1Py3_-qn2Cw2Tw")
  llm = HaystackLLMWrapper(OpenAIGenerator(
  api_base_url="https://api.studio.nebius.com/v1/",
  model="meta-llama/Llama-3.3-70B-Instruct",
@@ -36,6 +39,6 @@ if not lcdocs:
  generator = TestsetGenerator(llm=llm, embedding_model=embedding)
  dataset = generator.generate_with_langchain_docs(lcdocs, testset_size=10)
  # Save the generated test samples to a JSON file
- dataset.to_jsonl("testset.json")
+ dataset.to_jsonl("testset.jsonl")
 
- print(f"Saved {len(dataset)} test samples to testset.json")
+ print(f"Saved {len(dataset)} test samples to testset.jsonl")