Commit d89580e
Parent: d231d5c

faiss

Files changed:
- app.py (+41 -10)
- scifact/faiss_index.bin (+3 -0)

app.py CHANGED
@@ -15,6 +15,7 @@ import spaces
 import ir_datasets
 import pytrec_eval
 from huggingface_hub import login
+import faiss
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -72,21 +73,51 @@ def load_model():
     model = model.merge_and_unload()
     model.eval()
 
+def save_faiss_index(index, dataset_name):
+    index_path = f"{dataset_name}/faiss_index.bin"
+    faiss.write_index(index, index_path)
+    logger.info(f"Saved FAISS index for {dataset_name} to {index_path}")
+
+def load_faiss_index(dataset_name):
+    index_path = f"{dataset_name}/faiss_index.bin"
+    if os.path.exists(index_path):
+        logger.info(f"Loading existing FAISS index for {dataset_name} from {index_path}")
+        return faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
+    return None
+
 def load_corpus_embeddings(dataset_name):
     global retrievers, corpus_lookups
     corpus_path = f"{dataset_name}/corpus_emb.*.pkl"
     index_files = glob.glob(corpus_path)
     logger.info(f'Loading {len(index_files)} files into index for {dataset_name}.')
 
-    p_reps_0, p_lookup_0 = pickle_load(index_files[0])
-    retrievers[dataset_name] = FaissFlatSearcher(p_reps_0)
-
-    shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
-    corpus_lookups[dataset_name] = []
+    # Try to load existing FAISS index
+    faiss_index = load_faiss_index(dataset_name)
 
-    for p_reps, p_lookup in tqdm.tqdm(shards, desc=f'Loading shards into index for {dataset_name}', total=len(index_files)):
-        retrievers[dataset_name].add(p_reps)
-        corpus_lookups[dataset_name] += p_lookup
+    if faiss_index is None:
+        # If no existing index, create a new one
+        p_reps_0, p_lookup_0 = pickle_load(index_files[0])
+        retrievers[dataset_name] = FaissFlatSearcher(p_reps_0)
+
+        shards = [(p_reps_0, p_lookup_0)] + [pickle_load(f) for f in index_files[1:]]
+        corpus_lookups[dataset_name] = []
+
+        for p_reps, p_lookup in tqdm.tqdm(shards, desc=f'Loading shards into index for {dataset_name}', total=len(index_files)):
+            retrievers[dataset_name].add(p_reps)
+            corpus_lookups[dataset_name] += p_lookup
+
+        # Save the newly created index
+        save_faiss_index(retrievers[dataset_name].index, dataset_name)
+    else:
+        # Use the loaded index
+        retrievers[dataset_name] = FaissFlatSearcher(faiss_index)
+
+        # Load corpus lookups
+        corpus_lookups[dataset_name] = []
+        for file in index_files:
+            _, p_lookup = pickle_load(file)
+            corpus_lookups[dataset_name] += p_lookup
 
 def pickle_load(path):
     with open(path, 'rb') as f:
@@ -187,7 +218,6 @@ def gradio_interface(dataset, postfix):
     return run_evaluation(dataset, postfix)
 
 
-
 # Create Gradio interface
 iface = gr.Interface(
     fn=gradio_interface,
@@ -201,7 +231,8 @@ iface = gr.Interface(
     examples=[
         ["scifact", ""],
         ["scifact", "When judging the relevance of a document, focus on the pragmatics of the query and consider irrelevant any documents for which the user would have used a different query."]
-    ]
+    ],
+    cache_examples=True,
 )
 
 # Launch the interface
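The change above amounts to a build-once, mmap-later cache for the flat FAISS index: on the first run the index is assembled from the embedding shards and written to disk, and on later runs faiss.read_index with IO_FLAG_MMAP maps the stored file instead of re-adding every shard. A minimal self-contained sketch of the same round trip (the path mirrors the commit's layout; the 768-dim embedding width and random vectors are illustrative, not taken from the Space):

import os
import numpy as np
import faiss

INDEX_PATH = "scifact/faiss_index.bin"  # same layout the commit uses
DIM = 768                               # assumed embedding width

def get_index(embeddings):
    if os.path.exists(INDEX_PATH):
        # Memory-map the stored index rather than reading it into RAM.
        return faiss.read_index(INDEX_PATH, faiss.IO_FLAG_MMAP)
    index = faiss.IndexFlatIP(DIM)            # flat inner-product index
    index.add(embeddings.astype(np.float32))  # FAISS expects float32
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    faiss.write_index(index, INDEX_PATH)      # persist for later runs
    return index

reps = np.random.rand(1000, DIM).astype(np.float32)
index = get_index(reps)
scores, ids = index.search(reps[:1], 5)  # row ids map into corpus_lookups
print(ids)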
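The other behavioral change is cache_examples=True: Gradio runs each entry in examples once up front and replays the stored outputs when a visitor clicks one, so the slow retrieval-and-evaluation pipeline is not re-run per click. A minimal illustration with a stand-in function (the two-textbox signature mirrors gradio_interface(dataset, postfix); everything else here is hypothetical):

import gradio as gr

def echo(dataset, postfix):
    return f"dataset={dataset!r}, postfix={postfix!r}"

iface = gr.Interface(
    fn=echo,
    inputs=[gr.Textbox(label="dataset"), gr.Textbox(label="postfix")],
    outputs="text",
    examples=[["scifact", ""]],
    cache_examples=True,  # precompute outputs for the examples above
)

iface.launch()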
scifact/faiss_index.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d04b686b0c2f04a4fdeabb58c840eacf9471ac3f4625395f7664419b3c51cf57
+size 84918317
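The index binary is committed as a Git LFS pointer: the repository stores only this three-line stub, while the roughly 85 MB payload is fetched through LFS at checkout, keyed by the sha256 oid and byte size above. A hedged sketch (not part of the commit) for checking that a downloaded file matches its pointer:

import hashlib
import os

def verify_lfs_object(path, oid, size):
    # An LFS object matches its pointer iff both size and sha256 agree.
    if os.path.getsize(path) != size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid

print(verify_lfs_object(
    "scifact/faiss_index.bin",
    "d04b686b0c2f04a4fdeabb58c840eacf9471ac3f4625395f7664419b3c51cf57",
    84918317,
))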