Spaces:
Running
Running
| # vectorstore.py | |
| import os | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_experimental.text_splitter import SemanticChunker | |
| from langchain_community.vectorstores import FAISS | |
def load_or_build_vectorstore(local_file: str, index_folder: str, embeddings) -> "FAISS":
    """Load a saved FAISS index from disk, or build one from a PDF file.

    If ``index_folder`` already holds a saved index, it is loaded with the
    given ``embeddings``. Otherwise the PDF at ``local_file`` is loaded,
    split into chunks with ``SemanticChunker``, embedded, indexed, and the
    resulting index is written to ``index_folder`` for later runs.

    Args:
        local_file: Path of the PDF to index. Only read when no saved
            index is found.
        index_folder: Directory the FAISS index is loaded from / saved to.
        embeddings: Embedding model object passed to ``SemanticChunker``,
            ``FAISS.from_documents`` and ``FAISS.load_local``.

    Returns:
        The loaded or newly built FAISS vector store.
    """
    # A bare "does the folder exist" test is not enough: an empty or
    # half-written folder would send us into load_local(), which then fails
    # instead of rebuilding. Look for the index files themselves.
    # "index.faiss" / "index.pkl" are the names save_local() writes when no
    # custom index_name is given, as is the case below.
    index_files = [
        os.path.join(index_folder, "index" + extension)
        for extension in (".faiss", ".pkl")
    ]
    if all(os.path.isfile(path) for path in index_files):
        print("Loading existing FAISS index from disk...")
        # NOTE(security): allow_dangerous_deserialization=True lets FAISS
        # unpickle the stored docstore. Only point index_folder at
        # directories this application wrote itself, never at untrusted data.
        return FAISS.load_local(
            index_folder,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    print("Building a new FAISS index...")
    loader = PyPDFLoader(local_file)
    documents = loader.load()

    # Split at semantic breakpoints: the 'percentile' strategy with a
    # threshold amount of 90.
    text_splitter = SemanticChunker(
        embeddings=embeddings,
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )
    chunked_docs = text_splitter.split_documents(documents)
    print(f"Document split into {len(chunked_docs)} chunks.")

    vectorstore = FAISS.from_documents(chunked_docs, embeddings)
    # Persist the index so the next call can take the fast loading path.
    vectorstore.save_local(index_folder)
    return vectorstore