Spaces:
Runtime error
Runtime error
Petzys
committed on
Commit
·
531b3e4
1
Parent(s):
0d690e2
Feat: index build is now part of docker build
Browse files- .gitignore +4 -1
- Dockerfile +7 -1
- app.py +3 -36
- build_index.py +46 -0
.gitignore
CHANGED
|
@@ -1 +1,4 @@
|
|
| 1 |
-
__pycache__
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
venv/
|
| 3 |
+
meta.pkl
|
| 4 |
+
xkcd.index
|
Dockerfile
CHANGED
|
@@ -4,7 +4,6 @@ WORKDIR /app
|
|
| 4 |
COPY requirements.txt ./
|
| 5 |
RUN pip install --no-cache-dir --upgrade pip \
|
| 6 |
&& if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; else pip install --no-cache-dir gradio; fi
|
| 7 |
-
COPY . .
|
| 8 |
|
| 9 |
ENV DEBIAN_FRONTEND noninteractive
|
| 10 |
RUN apt-get update && \
|
|
@@ -12,6 +11,13 @@ RUN apt-get update && \
|
|
| 12 |
apt-get install -yq --no-install-recommends \
|
| 13 |
prometheus-node-exporter
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
EXPOSE 7860
|
| 16 |
EXPOSE 8000
|
| 17 |
EXPOSE 9100
|
|
|
|
| 4 |
COPY requirements.txt ./
|
| 5 |
RUN pip install --no-cache-dir --upgrade pip \
|
| 6 |
&& if [ -f requirements.txt ]; then pip install --no-cache-dir -r requirements.txt; else pip install --no-cache-dir gradio; fi
|
|
|
|
| 7 |
|
| 8 |
ENV DEBIAN_FRONTEND noninteractive
|
| 9 |
RUN apt-get update && \
|
|
|
|
| 11 |
apt-get install -yq --no-install-recommends \
|
| 12 |
prometheus-node-exporter
|
| 13 |
|
| 14 |
+
# Build index
|
| 15 |
+
COPY build_index.py ./
|
| 16 |
+
RUN python build_index.py
|
| 17 |
+
|
| 18 |
+
COPY prometheus_helper.py ./
|
| 19 |
+
COPY app.py ./
|
| 20 |
+
|
| 21 |
EXPOSE 7860
|
| 22 |
EXPOSE 8000
|
| 23 |
EXPOSE 9100
|
app.py
CHANGED
|
@@ -18,48 +18,15 @@ META_FILE = "meta.pkl"
|
|
| 18 |
CHAT_MODEL = os.getenv("CHAT_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
|
| 19 |
prometheus_helper = PrometheusHelper()
|
| 20 |
|
| 21 |
-
# ---
|
| 22 |
-
def build_index():
|
| 23 |
-
prometheus_helper.start_index_build_timer()
|
| 24 |
-
print("Building FAISS index...")
|
| 25 |
-
ds = load_dataset("olivierdehaene/xkcd", split="train")
|
| 26 |
-
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 27 |
-
texts = []
|
| 28 |
-
for ex in ds:
|
| 29 |
-
title = ex["title"] if ex["title"] else ""
|
| 30 |
-
transcript = ex["transcript"] if ex["transcript"] else ""
|
| 31 |
-
explanation = ex["explanation"] if "explanation" in ex and ex["explanation"] else ""
|
| 32 |
-
texts.append(f"{title} {transcript} {explanation}")
|
| 33 |
-
|
| 34 |
-
embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
|
| 35 |
-
dim = embeddings.shape[1]
|
| 36 |
-
index = faiss.IndexFlatL2(dim)
|
| 37 |
-
index.add(embeddings)
|
| 38 |
-
faiss.write_index(index, INDEX_FILE)
|
| 39 |
-
|
| 40 |
-
# Store just the metadata we need (pickle-friendly)
|
| 41 |
-
meta = [
|
| 42 |
-
{
|
| 43 |
-
"id": ex["id"],
|
| 44 |
-
"title": ex["title"],
|
| 45 |
-
"transcript": ex["transcript"],
|
| 46 |
-
"explanation": ex["explanation"] if "explanation" in ex else "",
|
| 47 |
-
}
|
| 48 |
-
for ex in ds
|
| 49 |
-
]
|
| 50 |
-
with open(META_FILE, "wb") as f:
|
| 51 |
-
pickle.dump(meta, f)
|
| 52 |
-
|
| 53 |
-
prometheus_helper.stop_index_build_timer()
|
| 54 |
-
return index, meta
|
| 55 |
-
|
| 56 |
def get_index():
|
| 57 |
if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
|
| 58 |
print("Loading cached index...")
|
| 59 |
with open(META_FILE, "rb") as f:
|
| 60 |
return faiss.read_index(INDEX_FILE), pickle.load(f)
|
| 61 |
else:
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
def get_id_from_string(str:str) -> str:
|
| 65 |
id_start = str.index("[") +1
|
|
|
|
| 18 |
CHAT_MODEL = os.getenv("CHAT_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
|
| 19 |
prometheus_helper = PrometheusHelper()
|
| 20 |
|
| 21 |
+
# --- Load index ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def get_index():
    """Load the prebuilt FAISS index and its metadata from disk.

    Both ``INDEX_FILE`` and ``META_FILE`` must already exist; this function
    does not build them.

    Returns:
        A ``(index, meta)`` tuple: the FAISS index read from ``INDEX_FILE``
        and the object unpickled from ``META_FILE``.

    Raises:
        SystemExit: with status 1 when either file is missing.
    """
    if not (os.path.exists(INDEX_FILE) and os.path.exists(META_FILE)):
        print("Index files not found, please run build_index.py first.")
        # Raise SystemExit directly instead of calling the bare ``exit``
        # helper: that name is injected by the ``site`` module for interactive
        # sessions and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    print("Loading cached index...")
    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # META_FILE must only ever come from a trusted build step.
    with open(META_FILE, "rb") as f:
        return faiss.read_index(INDEX_FILE), pickle.load(f)
|
| 30 |
|
| 31 |
def get_id_from_string(str:str) -> str:
|
| 32 |
id_start = str.index("[") +1
|
build_index.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import pickle
|
| 4 |
+
import faiss
|
| 5 |
+
from datasets import load_dataset
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
INDEX_FILE = "xkcd.index"
|
| 9 |
+
META_FILE = "meta.pkl"
|
| 10 |
+
|
| 11 |
+
# --- Build index ---
|
| 12 |
+
def build_index():
    """Build the FAISS index for the xkcd dataset and persist it to disk.

    Loads the ``train`` split of ``olivierdehaene/xkcd``, encodes one text per
    comic (title, transcript and explanation joined by spaces) with the
    ``all-MiniLM-L6-v2`` sentence-transformer, adds the vectors to a flat L2
    index written to ``INDEX_FILE``, and pickles the per-comic metadata to
    ``META_FILE``.

    Returns:
        A ``(index, meta)`` tuple: the populated FAISS index and the list of
        metadata dicts, in dataset order.
    """
    print("Building FAISS index...")
    dataset = load_dataset("olivierdehaene/xkcd", split="train")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    # One text per comic; missing or empty fields collapse to "".
    corpus = [
        "{} {} {}".format(
            row["title"] or "",
            row["transcript"] or "",
            (row["explanation"] or "") if "explanation" in row else "",
        )
        for row in dataset
    ]

    vectors = encoder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, INDEX_FILE)

    # Keep only the fields we need so the metadata stays pickle-friendly.
    meta = []
    for row in dataset:
        meta.append(
            {
                "id": row["id"],
                "title": row["title"],
                "transcript": row["transcript"],
                "explanation": row["explanation"] if "explanation" in row else "",
            }
        )

    with open(META_FILE, "wb") as handle:
        pickle.dump(meta, handle)

    return index, meta
|
| 45 |
+
|
| 46 |
+
# Called at module level, so the index is built as soon as this file is
# executed (or imported).
build_index()
|