Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -1,30 +1,22 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import chromadb
|
| 3 |
import pandas as pd
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
-
import nltk
|
| 6 |
|
| 7 |
# --- 1. SETUP MODELS AND DATABASE ---
|
| 8 |
-
# This setup runs once when the app starts.
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
nltk.data.find('tokenizers/punkt')
|
| 14 |
-
except nltk.downloader.DownloadError:
|
| 15 |
-
print("Downloading NLTK's 'punkt' model...")
|
| 16 |
-
nltk.download('punkt')
|
| 17 |
|
| 18 |
print("Loading embedding model...")
|
| 19 |
-
|
| 20 |
-
#SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
|
| 21 |
-
|
| 22 |
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')
|
| 23 |
|
| 24 |
client = chromadb.Client()
|
| 25 |
-
|
| 26 |
collection = client.get_or_create_collection(
|
| 27 |
-
name="
|
| 28 |
metadata={"hnsw:space": "cosine"}
|
| 29 |
)
|
| 30 |
print("ChromaDB collection ready.")
|
|
@@ -32,15 +24,18 @@ print("ChromaDB collection ready.")
|
|
| 32 |
# --- 2. CORE FUNCTIONS ---
|
| 33 |
def index_transcript(transcript_text):
|
| 34 |
"""Chunks and indexes a full transcript into ChromaDB."""
|
|
|
|
|
|
|
|
|
|
| 35 |
if not transcript_text.strip():
|
| 36 |
return "Please paste a transcript before indexing.", pd.DataFrame()
|
| 37 |
|
| 38 |
-
# --- FIX: Use NLTK to split by sentence for more robust chunking ---
|
| 39 |
-
# The language parameter improves accuracy for Portuguese.
|
| 40 |
chunks = nltk.sent_tokenize(transcript_text, language='portuguese')
|
| 41 |
-
|
| 42 |
-
# Filter out any very short, likely empty chunks
|
| 43 |
chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
ids = [f"chunk_{i}" for i in range(len(chunks))]
|
| 46 |
|
|
@@ -48,7 +43,6 @@ def index_transcript(transcript_text):
|
|
| 48 |
collection.delete(ids=collection.get()['ids'])
|
| 49 |
|
| 50 |
collection.add(documents=chunks, ids=ids)
|
| 51 |
-
|
| 52 |
indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
|
| 53 |
return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
|
| 54 |
|
|
@@ -56,28 +50,27 @@ def search_transcript(query):
|
|
| 56 |
"""Searches the indexed transcript for a given query."""
|
| 57 |
if not query.strip():
|
| 58 |
return pd.DataFrame(), "Please enter a query."
|
| 59 |
-
|
| 60 |
results = collection.query(query_texts=[query], n_results=3)
|
| 61 |
-
|
| 62 |
if not results or not results['documents'][0]:
|
| 63 |
return pd.DataFrame(), "No similar chunks found."
|
| 64 |
-
|
| 65 |
documents = results['documents'][0]
|
| 66 |
distances = results['distances'][0]
|
| 67 |
similarities = [f"{1 - dist:.2f}" for dist in distances]
|
| 68 |
-
|
| 69 |
df = pd.DataFrame({
|
| 70 |
"Similarity Score": similarities,
|
| 71 |
"Matching Chunk": documents
|
| 72 |
})
|
| 73 |
return df, "Search complete."
|
| 74 |
|
| 75 |
-
# --- 3. GRADIO INTERFACE
|
| 76 |
-
sample_transcript = """
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
|
| 79 |
gr.Markdown("# 🤖 Guideline Compliance Prototype")
|
| 80 |
-
gr.Markdown("An interactive demo to test semantic search on call transcripts using ChromaDB.")
|
| 81 |
with gr.Row():
|
| 82 |
with gr.Column(scale=1):
|
| 83 |
gr.Markdown("### 1. Index a Transcript")
|
|
@@ -87,10 +80,10 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as d
|
|
| 87 |
indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview", interactive=False)
|
| 88 |
with gr.Column(scale=1):
|
| 89 |
gr.Markdown("### 2. Search for Compliance")
|
| 90 |
-
query_input = gr.Textbox(label="Guideline Query", placeholder="Ex: O operador ofereceu duas opções?"
|
| 91 |
search_button = gr.Button("Search", variant="primary")
|
| 92 |
search_status = gr.Label(value="Status: Waiting for query.")
|
| 93 |
-
results_output = gr.DataFrame(headers=["Similarity Score", "Matching Chunk"], label="Search Results (Top 3)"
|
| 94 |
index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
|
| 95 |
search_button.click(fn=search_transcript, inputs=[query_input], outputs=[results_output, search_status])
|
| 96 |
|
|
|
|
import gradio as gr
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk

# --- 1. SETUP MODELS AND DATABASE ---
# This setup runs once, when the app process starts.

# Ensure NLTK's sentence-tokenizer data is available. Probe the local data
# path first so we only hit the network on the very first boot instead of
# re-downloading on every restart. nltk.data.find raises LookupError (not
# the nonexistent nltk.downloader.DownloadError) when the resource is absent.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK's 'punkt' model...")
    nltk.download('punkt')

print("Loading embedding model...")
# NOTE(review): this model is loaded but never passed to the ChromaDB
# collection below, so Chroma embeds documents with its own default model
# instead of this Portuguese one — confirm whether that is intended.
embedding_model = SentenceTransformer('rufimelo/bert-large-portuguese-cased-sts')

# In-memory Chroma client: the index lives only for this process and is
# rebuilt each time a transcript is indexed.
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="transcript_demo_br_model_final",
    # Cosine distance, so similarity is reported as 1 - distance downstream.
    metadata={"hnsw:space": "cosine"}
)
print("ChromaDB collection ready.")
|
|
|
|
# --- 2. CORE FUNCTIONS ---
def index_transcript(transcript_text):
    """Chunk a full transcript by sentence and (re)index it into ChromaDB.

    Args:
        transcript_text: Raw transcript text pasted by the user.

    Returns:
        Tuple of (status message, DataFrame previewing the indexed chunks).
    """
    if not transcript_text.strip():
        return "Please paste a transcript before indexing.", pd.DataFrame()

    # Sentence-level chunking; the language parameter improves boundary
    # detection for Portuguese text.
    chunks = nltk.sent_tokenize(transcript_text, language='portuguese')
    # Drop fragments too short to carry meaning (stray punctuation, etc.).
    chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]

    if not chunks:
        # Guard: collection.add with empty document/id lists raises in Chroma.
        return "Please paste a transcript before indexing.", pd.DataFrame()

    ids = [f"chunk_{i}" for i in range(len(chunks))]

    # Clear any previously indexed transcript. Chroma rejects an empty id
    # list, so only delete when something is actually stored (first run
    # starts with an empty collection).
    existing_ids = collection.get()['ids']
    if existing_ids:
        collection.delete(ids=existing_ids)

    collection.add(documents=chunks, ids=ids)
    indexed_df = pd.DataFrame({"Indexed Chunks": chunks})
    return f"✅ Indexed {len(chunks)} chunks successfully!", indexed_df
|
| 48 |
|
|
|
|
def search_transcript(query):
    """Search the indexed transcript for chunks semantically similar to the query.

    Args:
        query: Guideline question typed by the user.

    Returns:
        Tuple of (results DataFrame, status message).
    """
    if not query.strip():
        return pd.DataFrame(), "Please enter a query."

    results = collection.query(query_texts=[query], n_results=3)

    # Chroma returns one row per query text; guard both a missing 'documents'
    # key and a fully empty outer list, which would make [0] raise IndexError.
    document_rows = (results or {}).get('documents') or [[]]
    if not document_rows[0]:
        return pd.DataFrame(), "No similar chunks found."

    documents = document_rows[0]
    distances = results['distances'][0]
    # Collection uses cosine distance, so similarity = 1 - distance.
    similarities = [f"{1 - dist:.2f}" for dist in distances]

    df = pd.DataFrame({
        "Similarity Score": similarities,
        "Matching Chunk": documents
    })
    return df, "Search complete."
|
| 64 |
|
| 65 |
+
# --- 3. GRADIO INTERFACE ---

# Demo transcript (pt-BR) preloaded so users can try indexing immediately.
sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra?
Cliente: Claro, o número é 11223344. Estou bem decepcionado, usei a cafeteira só duas vezes.
"""
|
| 71 |
|
| 72 |
with gr.Blocks(theme=gr.themes.Soft(), title="Guideline Compliance Tester") as demo:
|
| 73 |
gr.Markdown("# 🤖 Guideline Compliance Prototype")
|
|
|
|
| 74 |
with gr.Row():
|
| 75 |
with gr.Column(scale=1):
|
| 76 |
gr.Markdown("### 1. Index a Transcript")
|
|
|
|
| 80 |
indexed_preview = gr.DataFrame(headers=["Indexed Chunks"], label="Indexed Data Preview", interactive=False)
|
| 81 |
with gr.Column(scale=1):
|
| 82 |
gr.Markdown("### 2. Search for Compliance")
|
| 83 |
+
query_input = gr.Textbox(label="Guideline Query", placeholder="Ex: O operador ofereceu duas opções?")
|
| 84 |
search_button = gr.Button("Search", variant="primary")
|
| 85 |
search_status = gr.Label(value="Status: Waiting for query.")
|
| 86 |
+
results_output = gr.DataFrame(headers=["Similarity Score", "Matching Chunk"], label="Search Results (Top 3)")
|
| 87 |
index_button.click(fn=index_transcript, inputs=[transcript_input], outputs=[index_status, indexed_preview])
|
| 88 |
search_button.click(fn=search_transcript, inputs=[query_input], outputs=[results_output, search_status])
|
| 89 |
|