"""Retrieve foundations whose purpose aligns with a free-text perspective.

At import time this module downloads a parquet table, a FAISS index and a
metadata JSON file from a private Hugging Face dataset repo, loads them, and
initialises a BGE-ICL embedding model used to encode queries.
"""

import json
import os

import faiss
import numpy as np
import pandas as pd
from FlagEmbedding import FlagICLModel
from huggingface_hub import hf_hub_download, InferenceClient

# Define HF tokens
HF_TOKEN_read = os.environ.get("HF_TOKEN_read")
#HF_TOKEN_inference = os.environ.get("HF_TOKEN_inf")

# Dataset repo (private)
DATASET_REPO = "luciagomez/MrPhil_vector"

# -------------------------------------------------------------------
# 1. Download files from Hugging Face dataset
# -------------------------------------------------------------------
parquet_path = hf_hub_download(
    repo_id=DATASET_REPO,
    filename="bgem3/foundations.parquet",
    repo_type="dataset",
    token=HF_TOKEN_read,
    cache_dir="/tmp/huggingface",
)

faiss_path = hf_hub_download(
    repo_id=DATASET_REPO,
    filename="bgem3/faiss.index",
    repo_type="dataset",
    token=HF_TOKEN_read,
    cache_dir="/tmp/huggingface",
)

meta_path = hf_hub_download(
    repo_id=DATASET_REPO,
    filename="bgem3/meta.json",
    repo_type="dataset",
    token=HF_TOKEN_read,
    cache_dir="/tmp/huggingface",
)

# -------------------------------------------------------------------
# 2. Load data
# -------------------------------------------------------------------
df = pd.read_parquet(parquet_path, engine="pyarrow")
index = faiss.read_index(faiss_path)

# Explicit encoding so the result does not depend on the platform default.
with open(meta_path, "r", encoding="utf-8") as f:
    meta = json.load(f)

dim = index.d
n = index.ntotal
print(f"Loaded FAISS index with {n} vectors of dimension {dim}")

# -------------------------------------------------------------------
# 3. Initialize BGE-ICL model for queries
# -------------------------------------------------------------------
# In-context examples handed to the model alongside each query.
examples = [
    {
        "instruct": "Retrieve foundations whose mission aligns with the given perspective.",
        "query": "Protect marine life while educating children about ocean conservation",
        "response": "Foundations working on marine conservation and youth education.",
    },
    {
        "instruct": "Retrieve foundations whose mission aligns with the given perspective.",
        "query": "Promote renewable energy education and community awareness",
        "response": "Foundations focused on clean energy advocacy and public education.",
    },
]

# NOTE(review): the index files live under "bgem3/", which may mean the stored
# vectors were produced by a different model than the one loaded here.
# Confirm both sides use the same embedding space.
model = FlagICLModel(
    "BAAI/bge-en-icl",
    query_instruction_for_retrieval="Given a mission statement, retrieve foundations with aligned purposes.",
    examples_for_task=examples,
    use_fp16=False,  # set True if GPU with enough memory
)


# -------------------------------
# Helper to encode queries
# -------------------------------
def encode_query(query: str) -> np.ndarray:
    """Encode a user query using BGE-EN-ICL.

    Args:
        query: Free-text query to embed.

    Returns:
        The embedding of ``query`` cast to ``float32``.
    """
    return model.encode_queries([query])[0].astype("float32")


# -------------------------------------------------------------------
# 4. Retrieval function
# -------------------------------------------------------------------
def find_similar_foundations(perspective: str, top_k: int = 5) -> list:
    """Given a user perspective, retrieve top-k foundations aligned with it.

    Args:
        perspective: Free-text description to match foundations against.
        top_k: Maximum number of foundations to return.

    Returns:
        A list of dicts with keys ``"Title"``, ``"Purpose"`` and ``"Score"``,
        in the order returned by the FAISS search. ``"Score"`` is the raw
        value reported by the index. The list may hold fewer than ``top_k``
        entries, and is empty when ``top_k`` is not positive.

    Raises:
        ValueError: If the query embedding size differs from the index
            dimension.
    """
    if top_k <= 0:
        return []

    # FAISS expects a 2D (n_queries, dim) array.
    q_emb = encode_query(perspective).reshape(1, -1)
    if q_emb.shape[1] != dim:
        raise ValueError(
            f"Query embedding has dimension {q_emb.shape[1]} but the FAISS "
            f"index expects {dim}; the index and the query model do not match."
        )

    scores, idxs = index.search(q_emb, top_k)

    results = []
    for score, idx in zip(scores[0], idxs[0]):
        # FAISS pads with -1 when it finds fewer than top_k neighbours;
        # iloc[-1] would otherwise return the last row as a bogus hit.
        if idx < 0:
            continue
        row = df.iloc[int(idx)]
        results.append(
            {
                "Title": row["Title"],
                "Purpose": row["Purpose"],
                "Score": float(score),
            }
        )
    return results