|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
from typing import Optional |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
from google import genai |
|
|
from google.genai import types |
|
|
|
|
|
|
|
|
from smolagents import ( |
|
|
CodeAgent, |
|
|
InferenceClientModel, |
|
|
TransformersModel, |
|
|
LiteLLMModel, |
|
|
DuckDuckGoSearchTool, |
|
|
VisitWebpageTool, |
|
|
PythonInterpreterTool, |
|
|
FinalAnswerTool, |
|
|
) |
|
|
|
|
|
|
|
|
from tools.gemini_native_tools import analyze_video, analyze_image, analyze_audio |
|
|
from tools.download_file import download_file_from_url |
|
|
from tools.files_to_text import image_to_text, pdf_to_text, text_file_to_string |
|
|
from tools.audio_tools import youtube_to_text, transcribe_audio |
|
|
|
|
|
|
|
|
# Shared toolset for tool-based agents: web search + page fetching, a Python
# REPL, explicit final-answer emission, and the project file helpers
# (download, text/PDF/image readers, YouTube/audio transcription).
AGENT_TOOLS = [
    DuckDuckGoSearchTool(),
    VisitWebpageTool(),
    PythonInterpreterTool(),
    FinalAnswerTool(),

    # Project-local file/media tools (imported from tools.*).
    download_file_from_url,
    text_file_to_string,
    pdf_to_text,
    image_to_text,
    youtube_to_text,
    transcribe_audio,
]
|
|
|
|
|
|
|
|
# Gemini-native multimodal tools; GeminiAgent uses these instead of the
# external media tools when native_multimodal=True.
NATIVE_TOOLS = [
    analyze_video,
    analyze_image,
    analyze_audio
]
|
|
|
|
|
|
|
|
# Extra modules the CodeAgent sandbox is allowed to import on top of the
# smolagents defaults. NOTE(review): 'chess.*' presumably whitelists all
# chess submodules via smolagents' wildcard syntax — confirm.
AUTHORIZED_IMPORTS = [
    'numpy','re', 'pandas', 'json', 'datetime',
    'tempfile','requests', 'markdownify', 'chess.*',
]
|
|
|
|
|
|
|
|
|
|
|
# System-prompt skeleton shared by all agents. The single
# {file_handling_instructions} placeholder is filled with either
# TOOL_BASED_INSTRUCTIONS or NATIVE_MEDIA_INSTRUCTIONS at agent init.
# Fix: "(e.g. filepath`)" was missing its opening backtick, rendering the
# inline-code markup broken in the prompt — now "(e.g. `filepath`)".
SYSTEM_PROMPT_TEMPLATE = """
You are an expert **General AI Assistant** and **Python Programmer** tasked with solving complex GAIA benchmark problems.

### 1. Reason-Act-Observe
Follow a **PLAN → ACT → OBSERVE** loop:
- **PLAN:** Break the task into 1–3 logical steps. Identify tools for each step.
- **ACT:** Write and run one self-contained Python block per step.
- **OBSERVE:** Examine outputs or errors before proceeding.

### 2. File Handling
{file_handling_instructions}

**Important rules:**
- Whenever you are given a file path (or url), you **must ABSOLUTELY store it in a variable first** (e.g. `filepath`) and pass that variable directly to the next tool. **NEVER** try to write the path yourself in the function.
- You must **not** mix methods across file types (e.g. do not use Whisper for CSVs or pandas for audio).

### 3. Data Analysis & Answer
- Inspect loaded datasets first (`.head()`, `.info()`, `.describe()`) before analysis.
- Write clean, idiomatic Python code. Before that, check if there is any pre-made tool that would work for the task.

### 4. Additional instructions for the following tasks provided by GAIA team
- You are a general AI assistant. I will ask you a question. Do not reveal your internal reasoning. Only the content inside FinalAnswerTool will be evaluated.
- Finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.

### 5. To provide the final answer, you MUST call the final_answer tool inside a <code> block.

- Example of how to end the task:

Thought: I have found the answer. I will now provide it.
<code>
final_answer("FINAL ANSWER: The capital of France is Paris")
</code>

\n\n
"""
|
|
|
|
|
|
|
|
# File-handling section of the system prompt for agents WITHOUT native
# multimodal support: routes every file type to an explicit tool.
TOOL_BASED_INSTRUCTIONS = """
You must select the reading or transcription method **strictly** based on the file type:
| File Type / Source | Tool / Method to Use |
| :--- | :--- |
| `.csv` | `pd.read_csv(filepath)` |
| `.xlsx`, `.xls` | `pd.read_excel(filepath)` |
| `.pdf` | `pdf_to_text(filepath)` |
| `.txt`, `.md`, `.json` | `text_file_to_string(filepath)` |
| `.png`, `.jpg`, `.jpeg` | `image_to_text(filepath)` |
| **YouTube URL** | `youtube_to_text(url)` |
| `.mp3`, `.wav`, `.m4a` | `transcribe_audio(filepath)` |
"""
|
|
|
|
|
|
|
|
# File-handling section of the system prompt for natively multimodal agents
# (e.g. Gemini): media is consumed directly, tools only for data/text files.
NATIVE_MEDIA_INSTRUCTIONS = """
You have **native vision and audio capabilities**.
- For **Images (.png, .jpg) and Audio/Video**: Do NOT use external tools like `image_to_text`. You can see and hear these files directly. Analyze them using your internal multimodal capabilities.
- For **Data/Text files**: Continue using tools like `pd.read_csv(filepath)` or `text_file_to_string(filepath)`.
"""
|
|
|
|
|
class BasicAgent:
    """Tool-based GAIA agent backed by a Hugging Face inference endpoint.

    Wraps a smolagents ``CodeAgent`` configured with the shared AGENT_TOOLS
    and a system prompt that routes every file type through an explicit tool.
    """

    def __init__(self):
        # Tool-based agents always receive the tool-routing file instructions.
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(
            file_handling_instructions=TOOL_BASED_INSTRUCTIONS
        )

        # Deterministic decoding: temperature 0, full nucleus.
        self.model = InferenceClientModel(
            model_id="Qwen/Qwen3-Next-80B-A3B-Thinking",
            temperature=0.0,
            top_p=1.0,
            max_tokens=8196,
        )

        self.basic_agent = CodeAgent(
            name="basic_agent",
            description="Basic smolagents CodeAgent",
            model=self.model,
            tools=AGENT_TOOLS,
            add_base_tools=True,
            max_steps=5,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Basic agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question*; optionally point it at a local file."""
        sections = [self.system_prompt, f"Question: {question}"]
        prompt = "\n\n".join(sections)
        if file_path:
            prompt += f"\nFile path: {file_path}"
        return self.basic_agent.run(prompt)
|
|
|
|
|
class GeminiAgent:
    """GAIA agent backed by a Gemini model via LiteLLM.

    Args:
        native_multimodal: When True, use the prompt section for built-in
            vision/audio and swap the external media tools (image_to_text,
            youtube_to_text, transcribe_audio) for the native analyze_* tools.
        model_id: LiteLLM model identifier.

    Raises:
        RuntimeError: If the GOOGLE_API_KEY environment variable is missing.
    """

    def __init__(self, native_multimodal: bool = True, model_id: str = "gemini/gemini-2.5-flash-lite"):
        self.native_multimodal = native_multimodal

        # Fail fast: every path below needs the key. Previously a genai.Client
        # was built *before* this check (with a possibly-None key) and bound to
        # a local that was never used — that dead client creation is removed.
        # NOTE(review): the analyze_* native tools appear to manage their own
        # Gemini client; confirm nothing relied on a client being created here.
        google_api_key = os.environ.get("GOOGLE_API_KEY")
        if not google_api_key:
            raise RuntimeError(
                "GOOGLE_API_KEY not found."
            )

        # Pick the file-handling section of the prompt to match capabilities.
        instructions = NATIVE_MEDIA_INSTRUCTIONS if native_multimodal else TOOL_BASED_INSTRUCTIONS
        self.system_prompt = SYSTEM_PROMPT_TEMPLATE.format(file_handling_instructions=instructions)

        self.model = LiteLLMModel(
            model_id=model_id,
            api_key=google_api_key,
            temperature=0.0,
            top_p=1.0,
            # NOTE(review): 8196 looks like a typo for 8192 — confirm intent.
            max_tokens=8196,
            timeout=120,
        )

        if self.native_multimodal:
            # Native tools supersede the external media tools; keep the rest.
            superseded = {image_to_text, youtube_to_text, transcribe_audio}
            self.tools = NATIVE_TOOLS + [t for t in AGENT_TOOLS if t not in superseded]
        else:
            self.tools = AGENT_TOOLS

        self.gemini_agent = CodeAgent(
            name="gemini_agent",
            description="Gemini CodeAgent",
            model=self.model,
            tools=self.tools,
            add_base_tools=True,
            max_steps=8,
            additional_authorized_imports=AUTHORIZED_IMPORTS,
            verbosity_level=1,
            max_print_outputs_length=1_000_000,
        )

        print("✅ Gemini agent initialized")

    def __call__(self, question: str, file_path: Optional[str] = None) -> str:
        """Run the agent on *question*; optionally point it at a local file."""
        prompt = f"{self.system_prompt}\n\nQuestion: {question}"
        if file_path:
            prompt += f"\n\nThere is a file at: {file_path}. Use your tools to process it."

        return self.gemini_agent.run(prompt)