DeAR-Reranking Collection
DeAR (Deep Agent Rank): Dual-Stage Document Reranking with Reasoning Agents. Accepted at EMNLP Findings 2025.
DeAR-3B-Reranker-RankNet-LoRA-v1 is a LoRA adapter for the most efficient reranker in the DeAR family. This ultra-lightweight adapter (~40MB) achieves fast inference speeds while maintaining competitive accuracy, making it ideal for resource-constrained production environments.
- Ultra Lightweight: only ~40MB of storage (see the size-check sketch below)
- Fastest Inference: 1.5s for 100 documents
- Memory Efficient: 10GB GPU for inference
- Easy Deployment: quick adapter loading
- Cost Effective: minimal compute requirements
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig

# Load LoRA adapter configuration
adapter_path = "abdoelsayed/dear-3b-reranker-ranknet-lora-v1"
config = PeftConfig.from_pretrained(adapter_path)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    num_labels=1,
    torch_dtype=torch.bfloat16,
)

# Load and merge LoRA weights
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.merge_and_unload()
model.eval().cuda()

# Use the model to score a query-document pair
query = "What is machine learning?"
document = "Machine learning is a subset of artificial intelligence..."

inputs = tokenizer(
    f"query: {query}",
    f"document: {document}",
    return_tensors="pt",
    truncation=True,
    max_length=228,
    padding="max_length",
)
inputs = {k: v.cuda() for k, v in inputs.items()}

with torch.no_grad():
    score = model(**inputs).logits.squeeze().item()
print(f"Relevance score: {score}")
import torch
from typing import List, Tuple
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig

def load_3b_lora_ranker(adapter_path: str):
    """Load the 3B LoRA adapter and merge it into the base model."""
    config = PeftConfig.from_pretrained(adapter_path)
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    base = AutoModelForSequenceClassification.from_pretrained(
        config.base_model_name_or_path,
        num_labels=1,
        torch_dtype=torch.bfloat16,
    )
    model = PeftModel.from_pretrained(base, adapter_path)
    model = model.merge_and_unload()
    model.eval().cuda()
    return tokenizer, model

# Load once
tokenizer, model = load_3b_lora_ranker("abdoelsayed/dear-3b-reranker-ranknet-lora-v1")

# Rerank function: returns (document index, score) pairs sorted by descending relevance
@torch.inference_mode()
def rerank(tokenizer, model, query: str, docs: List[Tuple[str, str]], batch_size=128):
    scores = []
    device = next(model.parameters()).device
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i + batch_size]
        queries = [f"query: {query}"] * len(batch)
        documents = [f"document: {t} {p}" for t, p in batch]
        inputs = tokenizer(queries, documents, return_tensors="pt",
                           truncation=True, max_length=228, padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        logits = model(**inputs).logits.squeeze(-1)
        scores.extend(logits.cpu().tolist())
    return sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
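
As a usage illustration (the query and documents here are made-up placeholders, not evaluation data), the helper above returns (index, score) pairs sorted from most to least relevant:

# Hypothetical usage of the rerank helper defined above
query = "What is machine learning?"
docs = [
    ("AI Overview", "Machine learning is a subset of artificial intelligence..."),
    ("Cooking", "A recipe for tomato soup with basil and garlic."),
    ("Deep Learning", "Neural networks learn hierarchical representations from data."),
]
ranking = rerank(tokenizer, model, query, docs)
for rank, (doc_idx, score) in enumerate(ranking, start=1):
    print(f"{rank}. {docs[doc_idx][0]}  (score: {score:.3f})")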
LoRA adapter configuration:

{
  "r": 16,
  "lora_alpha": 32,
  "target_modules": [
    "q_proj", "v_proj", "k_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
  ],
  "lora_dropout": 0.05,
  "bias": "none",
  "task_type": "SEQ_CLS"
}
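
For readers who want to train a comparable adapter, the same hyperparameters can be expressed with peft's LoraConfig. The snippet below is a sketch, not the released training code; it assumes the base model is meta-llama/Llama-3.2-3B with a single-label classification head.

# Sketch: recreate the adapter configuration with peft (assumption: not the original training script)
import torch
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

base = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-3.2-3B", num_labels=1, torch_dtype=torch.bfloat16
)
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()  # reports how small a fraction of the 3B parameters the adapter trains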
Storage:
- LoRA Adapter: 40MB
- Full 3B Model: 6GB
- Full 8B Model: 16GB
- Ratio: 0.67% of the 3B model, 0.25% of the 8B model

Inference speed (100 documents):
- 3B LoRA: 1.5s
- 8B Full: 2.2s
- Speedup: 1.47x faster than 8B

GPU memory:
- 3B LoRA: 10GB
- 3B Full: 12GB
- 8B Full: 18GB
Best for: resource-constrained production deployments where storage, memory, and cost matter most.
Use full 3B for: slightly higher accuracy (71.2 vs 70.9 NDCG@10) without the adapter-loading step.
Use 8B for: maximum accuracy when the larger memory and compute footprint is acceptable.
# Minimal-memory deployment
import torch
from transformers import AutoModelForSequenceClassification
from peft import PeftModel

adapter_path = "abdoelsayed/dear-3b-reranker-ranknet-lora-v1"

# Load the base model with memory optimizations
model = AutoModelForSequenceClassification.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    num_labels=1,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
)

# Load and merge the adapter
model = PeftModel.from_pretrained(model, adapter_path)
model = model.merge_and_unload()
model.eval()

# Optional: compile for extra speedup (PyTorch 2.x)
if hasattr(torch, 'compile'):
    model = torch.compile(model, mode="max-autotune")
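
To sanity-check the memory figures on your own hardware, one option is to score a small batch and read PyTorch's peak-memory counter. This is an illustrative sketch that assumes a single CUDA GPU and the merged model from the deployment snippet above.

# Rough peak-GPU-memory check (reuses `model` from the deployment snippet; batch contents are placeholders)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

torch.cuda.reset_peak_memory_stats()
batch = tokenizer(["query: test"] * 8, ["document: test passage"] * 8,
                  return_tensors="pt", truncation=True, max_length=228, padding=True)
batch = {k: v.to(model.device) for k, v in batch.items()}
with torch.no_grad():
    _ = model(**batch).logits
print(f"Peak GPU memory: {torch.cuda.max_memory_allocated() / 1e9:.1f} GB")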
Model Size vs NDCG@10 (TREC DL19):
- Teacher-13B: 73.8 (26GB)
- DeAR-8B-Full: 74.5 (16GB)
- DeAR-8B-LoRA: 74.2 (100MB + base)
- DeAR-3B-Full: 71.2 (6GB)
- DeAR-3B-LoRA: 70.9 (40MB + base) (this model)
Best efficiency: about 95% of the 8B full model's NDCG@10 (70.9 vs 74.5) from an adapter that is roughly 0.25% of its 16GB checkpoint size.
Related models and resources: the full (non-LoRA) 3B version, other same-size (3B) variants, and the larger (8B) variants are listed in the DeAR-Reranking collection.
Citation:

@article{abdallah2025dear,
  title={DeAR: Dual-Stage Document Reranking with Reasoning Agents via LLM Distillation},
  author={Abdallah, Abdelrahman and Mozafari, Jamshid and Piryani, Bhawna and Jatowt, Adam},
  journal={arXiv preprint arXiv:2508.16998},
  year={2025}
}
License: MIT
Base model: meta-llama/Llama-3.2-3B