# pip install smolagents python-chess stockfish pandas numpy requests markdownify

"""Agent definitions for solving GAIA benchmark tasks.

Provides two smolagents `CodeAgent` wrappers:

- `BasicAgent`   — Hugging Face Inference endpoint (Qwen3) with tool-based
                   file handling for every file type.
- `GeminiAgent`  — Gemini model via LiteLLM; optionally relies on Gemini's
                   native vision/audio capabilities instead of OCR /
                   transcription tools.
"""

# Generic agent
import os
from typing import Optional

import pandas as pd

# Genai imports
from google import genai
from google.genai import types

# Smolagents imports
from smolagents import (
    CodeAgent,
    InferenceClientModel,
    TransformersModel,
    LiteLLMModel,
    DuckDuckGoSearchTool,
    VisitWebpageTool,
    PythonInterpreterTool,
    FinalAnswerTool,
)

# Import your custom tools (to be used in app, not in local notebook)
from tools.gemini_native_tools import analyze_video, analyze_image, analyze_audio
from tools.download_file import download_file_from_url
from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string
from tools.audio_tools import youtube_to_text, transcribe_audio

# Tools shared by every agent flavour.
AGENT_TOOLS = [
    # Default Tools
    DuckDuckGoSearchTool(),   # Internet search
    VisitWebpageTool(),       # Retrieve webpage content
    PythonInterpreterTool(),  # Executes agent-generated Python code
    FinalAnswerTool(),        # Ends agent reasoning and returns final answer
    # Custom Tools
    download_file_from_url,   # file downloader
    text_file_to_string,      # .txt, .md, .json, etc.
    pdf_to_text,              # PyMuPDF-based safe PDF parser
    image_to_text,            # OCR for images
    youtube_to_text,          # Youtube audio to text
    transcribe_audio,         # Audio file to text
]

# Gemini-only tools (wrap Gemini's native multimodal understanding)
NATIVE_TOOLS = [analyze_video, analyze_image, analyze_audio]

# Imports the sandboxed Python interpreter may execute.
AUTHORIZED_IMPORTS = [
    'numpy', 're', 'pandas', 'json', 'datetime',
    'tempfile', 'requests', 'markdownify', 'chess.*',
]

# --- SYSTEM PROMPT TEMPLATE ---
# The {file_handling_instructions} placeholder is filled differently for
# tool-based agents vs. native-multimodal Gemini.
SYSTEM_PROMPT_TEMPLATE = """
You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.

### 1. Reason-Act-Observe
Follow a **PLAN → ACT → OBSERVE** loop:
- **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
- **ACT:** Write and run one self-contained Python block per step.
- **OBSERVE:** Examine outputs or errors before proceeding.

### 2. File Handling
{file_handling_instructions}

**Important rules:**
- Whenever you are given a file path (or url), you **must ABSOLUTELY store it in a variable first** (e.g. `filepath`) and pass that variable directly to the next tool. **NEVER** try to write the path yourself in the function.
- You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).

### 3. Data Analysis & Answer
- Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
- Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.

### 4. Additional instructions for the following tasks provided by GAIA team
- You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
- Do NOT include "FINAL ANSWER:" in your final answer text. For example: if the question is "What is the capital of Spain?", respond with "Madrid". It is exact and expected answer.

### 5. To provide the final answer, you MUST call the final_answer tool inside a block.
- Example of how to end the task:
Question: "What is the capital of France?"
Thought: I have found the answer. I will now provide it.

final_answer("Paris")
"""

# Instruction for Tool-Based Agents (BasicAgent and Gemini-Standard)
TOOL_BASED_INSTRUCTIONS = """
You must select the reading or transcription method **strictly** based on the file type:

| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a` | `transcribe_audio(filepath)` |
"""

# Instruction for Native Gemini (No OCR/Transcribe tools for media)
NATIVE_MEDIA_INSTRUCTIONS = """
You have **native vision and audio capabilities**.
- For **Images (.png, .jpg) and Audio/Video**: Do NOT use external tools like `image_to_text`. You can see and hear these files directly. Analyze them using your internal multimodal capabilities.
- For **Data/Text files**: Continue using tools like `pd.read_csv(filepath)` or `text_file_to_string(filepath)`.
"""


class BasicAgent:
    """Smolagents CodeAgent backed by a Hugging Face Inference endpoint.

    Uses the tool-based file-handling instructions (OCR, transcription, etc.)
    since the underlying model has no native multimodal input.
    """

    def __init__(self):
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            file_handling_instructions=TOOL_BASED_INSTRUCTIONS
        )
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): likely intended 8192 — confirm
        )
        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=AGENT_TOOLS,
            add_base_tools=True,  # probably redundant, but it does not hurt
            max_steps=5,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print("✅ Basic agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question*, optionally pointing it at a local file.

        Returns the agent's final answer string.
        """
        prompt = f"{self.system_prompt}\n\nQuestion: {question}"
        if file_path:
            prompt += f"\nFile path: {file_path}"
        return self.basic_agent.run(prompt)


class GeminiAgent:
    """Smolagents CodeAgent driven by a Gemini model through LiteLLM.

    When ``native_multimodal`` is True, the prompt tells the model to use its
    built-in vision/audio understanding and the OCR/transcription tools are
    removed from the toolbox in favour of the Gemini-native analyze_* tools.
    """

    # def __init__(self, native_multimodal: bool = True, model_id: str = "gemini/gemini-3-flash-preview"):
    def __init__(self, native_multimodal: bool = True, model_id: str = "gemini/gemini-2.5-flash-lite"):
        self.native_multimodal = native_multimodal
        self.model_id = model_id

        # Fail fast on missing credentials. (Previously the genai client was
        # built *before* this check, and the env var was read twice.)
        GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
        if not GOOGLE_API_KEY:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )

        if self.native_multimodal:
            # Kept on the instance instead of being a discarded local.
            # NOTE(review): nothing in this file uses it directly — presumably
            # the native analyze_* tools build their own client; confirm.
            self.client = genai.Client(api_key=GOOGLE_API_KEY)

        # Switch prompt based on the native_multimodal flag
        INSTRUCTIONS = NATIVE_MEDIA_INSTRUCTIONS if native_multimodal else TOOL_BASED_INSTRUCTIONS
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            file_handling_instructions=INSTRUCTIONS
        )

        self.model = LiteLLMModel(
            model_id=model_id,
            api_key=GOOGLE_API_KEY,
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,  # NOTE(review): likely intended 8192 — confirm
            timeout=120,      # Add timeout to prevent hanging
        )

        # If native, remove the OCR/transcription tools so the agent is not
        # tempted to use them instead of its built-in multimodal abilities.
        if self.native_multimodal:
            self.tools = NATIVE_TOOLS + [
                t for t in AGENT_TOOLS
                if t not in (image_to_text, youtube_to_text, transcribe_audio)
            ]
        else:
            self.tools = AGENT_TOOLS

        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description=f"Gemini CodeAgent ({model_id})",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,  # probably redundant, but it does not hurt
            # NOTE(review): 2 steps is very tight for the PLAN→ACT→OBSERVE
            # loop the system prompt mandates — confirm this is intentional.
            max_steps=2,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )
        print(f"✅ Gemini agent initialized with model: {model_id}")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question*, optionally pointing it at a local file.

        Returns the agent's final answer string.
        """
        prompt = f"{self.system_prompt}\n\nQuestion: {question}"
        if file_path:
            prompt += f"\n\nThere is a file at: {file_path}. Use your tools to process it."
        return self.gemini_agent.run(prompt)