Spaces:

ibombonato
/

Semantic-search-br

Sleeping

App Files Files Community

ibombonato committed on Jul 9

Commit

a5c77b7

verified ·

1 Parent(s): a793867

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +14 -6

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import nltk
 # --- 1. SETUP MODELS AND DATABASE ---
-# FIX: Directly download the 'punkt' model. This is the most robust method for deployment.
 print("Downloading NLTK's 'punkt' model...")
 nltk.download('punkt')
@@ -24,15 +24,23 @@ print("ChromaDB collection ready.")
 # --- 2. CORE FUNCTIONS ---
 def index_transcript(transcript_text):
     """Chunks and indexes a full transcript into ChromaDB."""
-    print("--- DEBUGGING ---")
-    print(f"Raw transcript received: '{transcript_text}'")
     if not transcript_text.strip():
         return "Please paste a transcript before indexing.", pd.DataFrame()
-    chunks = nltk.sent_tokenize(transcript_text, language='portuguese')
     chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
     print(f"Number of chunks created: {len(chunks)}")
     print(f"Chunks found: {chunks}")
     print("--- END DEBUGGING ---")
@@ -62,7 +70,7 @@ def search_transcript(query):
     })
     return df, "Search complete."
-# --- 3. GRADIO INTERFACE ---
 sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
 Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
 Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra?

 # --- 1. SETUP MODELS AND DATABASE ---
+# This single download is all we need.
 print("Downloading NLTK's 'punkt' model...")
 nltk.download('punkt')
 # --- 2. CORE FUNCTIONS ---
 def index_transcript(transcript_text):
     """Chunks and indexes a full transcript into ChromaDB."""
     if not transcript_text.strip():
         return "Please paste a transcript before indexing.", pd.DataFrame()
+    # --- FIX: Explicitly load the Portuguese tokenizer to avoid lookup errors ---
+    # This file is included in the 'punkt' download.
+    try:
+        pt_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
+        chunks = pt_tokenizer.tokenize(transcript_text)
+    except Exception as e:
+        # Fall back to NLTK's default (English) tokenizer if the Portuguese one fails for any reason
+        print(f"Could not load Portuguese tokenizer, falling back to default. Error: {e}")
+        chunks = nltk.sent_tokenize(transcript_text)
     chunks = [chunk.strip() for chunk in chunks if len(chunk.strip()) > 5]
+    # Debugging logs to confirm the chunking
+    print("--- CHUNKING DEBUG ---")
     print(f"Number of chunks created: {len(chunks)}")
     print(f"Chunks found: {chunks}")
     print("--- END DEBUGGING ---")
     })
     return df, "Search complete."
+# --- 3. GRADIO INTERFACE (No changes) ---
 sample_transcript = """Atendente: Olá, bem-vindo à EletroMax. Meu nome é Sofia, em que posso ajudar?
 Cliente: Oi, Sofia. Eu comprei uma cafeteira no site de vocês na semana passada, e ela simplesmente parou de funcionar.
 Atendente: Puxa, que chato isso. Sinto muito pelo transtorno. Pode me informar o número do pedido para eu localizar sua compra?