Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,14 +1,12 @@
|
|
| 1 |
-
from langchain_community.llms import LlamaCpp
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import os
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
# Load
|
| 7 |
load_dotenv()
|
| 8 |
-
MODEL_PATH = os.getenv("MODEL_PATH", "TheBloke/gemma-2b-it-GGUF/gemma-2b-it.Q4_K_M.gguf")
|
| 9 |
|
| 10 |
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
|
| 11 |
-
"""Summarize a dataframe to avoid overloading the context window."""
|
| 12 |
summary = f"Columns: {', '.join(df.columns)}\n\n"
|
| 13 |
if len(df) > max_rows:
|
| 14 |
sample = df.sample(max_rows, random_state=42)
|
|
@@ -20,8 +18,6 @@ def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
|
|
| 20 |
return summary
|
| 21 |
|
| 22 |
def query_agent(df: pd.DataFrame, query: str) -> str:
|
| 23 |
-
"""Query a CSV/DataFrame using your local Gemma model with context-aware limits."""
|
| 24 |
-
# Attempt to handle simple analytical questions directly with pandas
|
| 25 |
query_lower = query.lower()
|
| 26 |
try:
|
| 27 |
if "most common" in query_lower or "most frequent" in query_lower:
|
|
@@ -32,12 +28,10 @@ def query_agent(df: pd.DataFrame, query: str) -> str:
|
|
| 32 |
except Exception as e:
|
| 33 |
print("Direct analysis failed:", e)
|
| 34 |
|
| 35 |
-
# Otherwise summarize dataset for LLM
|
| 36 |
data_text = summarize_dataframe(df)
|
| 37 |
|
| 38 |
prompt = f"""
|
| 39 |
You are a data analysis assistant with expertise in statistics and data interpretation.
|
| 40 |
-
|
| 41 |
Analyze the dataset sample below and answer the user's question in a **clear, detailed, and well-explained way**.
|
| 42 |
Include both the **direct answer** and a short **explanation or reasoning** behind it.
|
| 43 |
|
|
@@ -50,12 +44,13 @@ def query_agent(df: pd.DataFrame, query: str) -> str:
|
|
| 50 |
Answer (with explanation):
|
| 51 |
"""
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
| 59 |
)
|
| 60 |
|
| 61 |
answer = llm(prompt)
|
|
|
|
| 1 |
+
from langchain_community.llms import HuggingFaceHub
|
| 2 |
from dotenv import load_dotenv
|
| 3 |
import os
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
+
# Load .env variables (still useful for local dev)
|
| 7 |
load_dotenv()
|
|
|
|
| 8 |
|
| 9 |
def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
|
|
|
|
| 10 |
summary = f"Columns: {', '.join(df.columns)}\n\n"
|
| 11 |
if len(df) > max_rows:
|
| 12 |
sample = df.sample(max_rows, random_state=42)
|
|
|
|
| 18 |
return summary
|
| 19 |
|
| 20 |
def query_agent(df: pd.DataFrame, query: str) -> str:
|
|
|
|
|
|
|
| 21 |
query_lower = query.lower()
|
| 22 |
try:
|
| 23 |
if "most common" in query_lower or "most frequent" in query_lower:
|
|
|
|
| 28 |
except Exception as e:
|
| 29 |
print("Direct analysis failed:", e)
|
| 30 |
|
|
|
|
| 31 |
data_text = summarize_dataframe(df)
|
| 32 |
|
| 33 |
prompt = f"""
|
| 34 |
You are a data analysis assistant with expertise in statistics and data interpretation.
|
|
|
|
| 35 |
Analyze the dataset sample below and answer the user's question in a **clear, detailed, and well-explained way**.
|
| 36 |
Include both the **direct answer** and a short **explanation or reasoning** behind it.
|
| 37 |
|
|
|
|
| 44 |
Answer (with explanation):
|
| 45 |
"""
|
| 46 |
|
| 47 |
+
# Use a hosted model instead of local LlamaCpp
|
| 48 |
+
llm = HuggingFaceHub(
|
| 49 |
+
repo_id="google/gemma-2b-it", # or another model like mistralai/Mistral-7B-Instruct-v0.2
|
| 50 |
+
model_kwargs={
|
| 51 |
+
"temperature": 0.7,
|
| 52 |
+
"max_new_tokens": 1024,
|
| 53 |
+
}
|
| 54 |
)
|
| 55 |
|
| 56 |
answer = llm(prompt)
|