# download dataset (the script reads it as "metinrag.csv"):
# wget https://huggingface.co/datasets/Metin/WikiRAG-TR/resolve/main/data/train.csv
from collections import defaultdict

import pandas as pd
import pytrec_eval
import tqdm

from pylate import models, rank

evalResultsDf = None
question2answer = False  # True: rank answers per question; False: rank contexts
shortContext = True  # only affects question2context retrieval

# Answers are short, so a small document budget suffices; contexts need a much
# larger one to avoid truncation.
if question2answer:
    document_length = 256
else:
    document_length = 8190

model_name_or_paths = [
    "Y-J-Ju/ModernBERT-base-ColBERT",
    "99eren99/ColBERT-ModernBERT-base-Turkish-uncased",
    "jinaai/jina-colbert-v2",
]

for model_name_or_path in model_name_or_paths:
    if "jina" in model_name_or_path:
        # jina-colbert-v2 uses its own marker tokens instead of the default
        # query/document prefixes and attends to its query expansion tokens.
        model = models.ColBERT(
            model_name_or_path=model_name_or_path,
            query_prefix="[QueryMarker]",
            document_prefix="[DocumentMarker]",
            attend_to_expansion_tokens=True,
            trust_remote_code=True,
            document_length=document_length,
        )
    else:
        model = models.ColBERT(
            model_name_or_path=model_name_or_path,
            document_length=document_length,
        )
    # Query length is kept at its default because queries are padded to it;
    # document_length is set high to avoid truncation (documents are truncated
    # to document_length but never padded).
    model.eval()
    model.to("cuda")

    df = pd.read_csv("metinrag.csv")

    if "eren" in model_name_or_path:
        # This tokenizer takes no token_type_ids.
        try:
            model.tokenizer.model_input_names.remove("token_type_ids")
        except ValueError:
            print(model_name_or_path)

        # The Turkish uncased model expects locale-aware lowercasing: map
        # dotted İ -> i and dotless I -> ı before calling lower().
        df.question = df.question.apply(
            lambda x: x.replace("İ", "i").replace("I", "ı").lower()
        )
        df.context = df.context.apply(
            lambda x: x.replace("İ", "i").replace("I", "ı").lower()
        )
        df.answer = df.answer.apply(
            lambda x: x.replace("İ", "i").replace("I", "ı").lower()
        )

    if not question2answer:
        if shortContext:
            # keep only short-context rows (fewer than 400 words)
            df = df[df.context.apply(lambda x: len(x.split()) < 400).values]
        else:
            # keep only long-context rows (more than 400 words)
            df = df[df.context.apply(lambda x: len(x.split()) > 400).values]

    # evaluate only the first 1000 pairs
    df = df.values[:1000]

    # Build the test queries and relevance judgments. Query ids are offset by
    # 10000 so they cannot collide with document ids; each query's only
    # relevant document is the row it came from.
    queries = {}
    relevant_qid = []
    relevant_docs = defaultdict(lambda: defaultdict(int))

    candidateIds = [[i for i in range(len(df))]]

    for i, row in enumerate(df):
        queries[str(i + 10000)] = row[1]
        relevant_qid.append(str(i + 10000))
        for z in range(len(df)):
            relevant_docs[str(i + 10000)][str(z)] = 0
        relevant_docs[str(i + 10000)][str(i)] = 1

    run = {}

    # Column 2 holds answers (question2answer retrieval), column 3 holds
    # contexts (question2context retrieval).
    documents_embeddings = model.encode(
        [list(df[:, 2]) if question2answer else list(df[:, 3])],
        is_query=False,
    )

    for qid in tqdm.tqdm(relevant_qid):
        query = queries[qid]
        queries_embeddings = model.encode(
            [query],
            is_query=True,
        )
        reranked_documents = rank.rerank(
            documents_ids=candidateIds,
            queries_embeddings=queries_embeddings,
            documents_embeddings=documents_embeddings,
        )
        run[qid] = {}
        for resDict in reranked_documents[0]:
            run[qid][str(resDict["id"])] = float(resDict["score"])

    evaluator = pytrec_eval.RelevanceEvaluator(
        relevant_docs, pytrec_eval.supported_measures
    )
    scores = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print("{:25s}{:8s}{:.4f}".format(measure, scope, value))

    # Per-query printing (uncomment to inspect individual queries):
    # for query_id, query_measures in sorted(scores.items()):
    #     for measure, value in sorted(query_measures.items()):
    #         print_line(measure, query_id, value)
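    # `scores` maps each query id to a dict of measure name -> value, roughly
    # like this (illustrative values only):
    #
    #     {"10000": {"map": 0.93, "recip_rank": 1.00, ...},
    #      "10001": {"map": 0.50, "recip_rank": 0.50, ...}}
    #
    # Every per-query dict carries the same measure names, which is what the
    # aggregation below relies on.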
    # All queries share the same measure names, so one per-query dict is
    # enough to enumerate them.
    query_measures = next(iter(scores.values()))

    # Aggregate every measure over all queries into one results row per model.
    resultsColumns = ["model name"]
    resultsRow = [model_name_or_path]
    for measure in sorted(query_measures.keys()):
        resultsColumns.append(measure)
        resultsRow.append(
            pytrec_eval.compute_aggregated_measure(
                measure, [qm[measure] for qm in scores.values()]
            )
        )

    if evalResultsDf is None:
        evalResultsDf = pd.DataFrame(columns=resultsColumns)
    # Prepend the row: write it at index -1, then shift all indices up by one.
    evalResultsDf.loc[-1] = resultsRow
    evalResultsDf.index = evalResultsDf.index + 1

evalResultsDf.to_csv("testResults.csv", encoding="utf-8")
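# To compare the models afterwards, the CSV can be read back in, e.g. (the
# exact measure columns depend on pytrec_eval.supported_measures):
#
#     results = pd.read_csv("testResults.csv", index_col=0)
#     print(results[["model name", "recip_rank", "map"]])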