import os
import re

import pandas as pd


def summarize_dataframe(df: pd.DataFrame, max_rows: int = 30) -> str:
    """Build a plain-text summary of *df* suitable for an LLM prompt.

    Lists the column names, then either every row (small frames) or a
    deterministic random sample of ``max_rows`` rows (``random_state=42``
    so repeated calls over the same frame produce the same sample).
    """
    summary = f"Columns: {', '.join(df.columns)}\n\n"
    if len(df) > max_rows:
        sample = df.sample(max_rows, random_state=42)
        summary += "Showing a random sample of rows:\n"
    else:
        sample = df
        summary += "Showing all rows:\n"
    summary += sample.to_string(index=False)
    return summary


def query_agent(df: pd.DataFrame, query: str) -> str:
    """Answer a natural-language *query* about *df*.

    "Most common"/"most frequent" questions that name one or more columns
    are answered directly with pandas. Everything else — or any failure in
    the direct path — falls back to a hosted LLM via the Hugging Face
    Inference API (reads HUGGINGFACE_API_TOKEN from the environment).
    """
    query_lower = query.lower()

    # ----------------- Direct Analysis for Most Common -----------------
    try:
        if "most common" in query_lower or "most frequent" in query_lower:
            # Match column names as whole words so a short name (e.g. "id")
            # is not found inside an unrelated word of the question.
            cols_in_query = [
                col for col in df.columns
                if re.search(rf"\b{re.escape(col.lower())}\b", query_lower)
            ]
            if len(cols_in_query) == 1:
                col = cols_in_query[0]
                modes = df[col].mode()
                # mode() is empty for an empty/all-NaN column; fall through
                # to the LLM instead of raising IndexError.
                if not modes.empty:
                    value = modes[0]
                    return f"The most common value in column '{col}' is '{value}'."
            elif len(cols_in_query) > 1:
                # Most common *combination* of values across the named columns.
                combo_series = df[cols_in_query].apply(lambda row: tuple(row), axis=1)
                combo_modes = combo_series.mode()
                if not combo_modes.empty:
                    most_common_combo = combo_modes[0]
                    combo_str = ", ".join(
                        f"{col}={val}"
                        for col, val in zip(cols_in_query, most_common_combo)
                    )
                    return f"The most common combination of values is: {combo_str}"
    except Exception as e:
        # Best-effort: any direct-analysis failure falls back to the LLM.
        print("Direct analysis failed:", e)

    # ----------------- Use LLM if direct analysis fails -----------------
    data_text = summarize_dataframe(df)
    prompt = f"""
You are a data analysis assistant with expertise in statistics and data interpretation. Analyze the dataset sample below and answer the user's question in a clear, detailed, and well-explained way. Include both the direct answer and a short explanation or reasoning behind it.

Dataset Summary:
{data_text}

Question: {query}

Answer (with explanation):
"""

    # Imported lazily so the direct-analysis path above works even when
    # the huggingface_hub package is not installed.
    from huggingface_hub import InferenceClient

    # Initialize client with explicit provider
    client = InferenceClient(
        model="google/gemma-2b-it",
        provider="hf-inference",
        token=os.environ.get("HUGGINGFACE_API_TOKEN"),
    )

    try:
        response = client.text_generation(
            prompt,
            max_new_tokens=1024,
            temperature=0.7,
        )
    except Exception as e:
        print("Model call failed:", e)
        return "⚠️ Sorry, the model could not generate an answer. Please try again."

    # text_generation normally returns a plain string, but be defensive
    # about dict/list payload shapes from other providers.
    if isinstance(response, str):
        return response
    if isinstance(response, dict) and "generated_text" in response:
        return response["generated_text"]
    if isinstance(response, list) and len(response) > 0 and "generated_text" in response[0]:
        return response[0]["generated_text"]
    return str(response)