HeTalksInMaths committed on
Commit
67549a7
·
1 Parent(s): 5fd9547

Add app.py with progressive database expansion (5K batches)

Browse files
Files changed (1) hide show
  1. app.py +695 -0
app.py ADDED
@@ -0,0 +1,695 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ToGMAL Combined Demo - Difficulty Analyzer + Chat Interface
4
+ ===========================================================
5
+
6
+ Tabbed interface combining:
7
+ 1. Difficulty Analyzer - Direct vector DB analysis
8
+ 2. Chat Interface - LLM with MCP tool calling
9
+
10
+ Perfect for demos and VC pitches!
11
+ """
12
+
13
+ import gradio as gr
14
+ import json
15
+ import os
16
+ import re
17
+ from pathlib import Path
18
+ from typing import List, Dict, Tuple, Optional
19
+ from benchmark_vector_db import BenchmarkVectorDB
20
+ import logging
21
+
22
# Log at INFO level so database initialization and API fallbacks are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the vector database (shared by both tabs).
# `db` stays None until get_db() builds the BenchmarkVectorDB instance.
db_path = Path("./data/benchmark_vector_db")
db = None
28
+
29
def get_db():
    """Return the shared BenchmarkVectorDB, creating it on first use.

    The instance is cached in the module-level ``db`` variable, so later
    calls return the same object without constructing it again.

    Returns:
        The initialized ``BenchmarkVectorDB`` instance.

    Raises:
        Exception: Whatever ``BenchmarkVectorDB`` raised during construction.
            The failure is logged with its traceback and re-raised unchanged.
    """
    global db
    if db is not None:
        return db

    try:
        logger.info("Initializing BenchmarkVectorDB...")
        db = BenchmarkVectorDB(
            db_path=db_path,
            embedding_model="all-MiniLM-L6-v2",
        )
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error log drops.
        logger.exception("Failed to initialize BenchmarkVectorDB: %s", e)
        raise
    logger.info("✓ BenchmarkVectorDB initialized successfully")
    return db
44
+
45
# Build database if needed (first launch).
# NOTE: the initial-build branch is deliberately switched off via
# ``if False and ...``; at startup this block therefore only opens the
# database and logs how many questions it already holds. Any failure is
# downgraded to a warning and ``db`` is reset to None.
try:
    db = get_db()
    current_count = db.collection.count()

    if False and current_count == 0:
        logger.info("Database is empty - building initial 5K sample...")
        from datasets import load_dataset
        from benchmark_vector_db import BenchmarkQuestion
        import random

        test_dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test")
        total_questions = 0  # disabled in demo

        if total_questions > 5000:
            indices = random.sample(range(total_questions), 5000)
            pass  # selection disabled in demo

        all_questions = []
        for idx, item in enumerate(test_dataset):
            question = BenchmarkQuestion(
                question_id=f"mmlu_pro_test_{idx}",
                source_benchmark="MMLU_Pro",
                domain=item.get('category', 'unknown').lower(),
                question_text=item['question'],
                correct_answer=item['answer'],
                choices=item.get('options', []),
                # Fixed placeholder difficulty values; no per-question stats
                # are read from the dataset here.
                success_rate=0.45,
                difficulty_score=0.55,
                difficulty_label="Hard",
                num_models_tested=0
            )
            all_questions.append(question)

        # Index in slices of 1000 questions per index_questions() call.
        batch_size = 1000
        for i in range(0, len(all_questions), batch_size):
            batch = all_questions[i:i + batch_size]
            db.index_questions(batch)

        logger.info(f"✓ Database build complete! Indexed {len(all_questions)} questions")
    else:
        logger.info(f"✓ Loaded existing database with {current_count:,} questions")
except Exception as e:
    logger.warning(f"Database initialization deferred: {e}")
    db = None
90
+
91
+ # ============================================================================
92
+ # TAB 1: DIFFICULTY ANALYZER
93
+ # ============================================================================
94
+
95
def analyze_prompt_difficulty(prompt: str, k: int = 5) -> str:
    """Analyze a prompt and return a Markdown difficulty assessment.

    Args:
        prompt: Text to assess. ``None``, empty and whitespace-only values
            produce a short "please enter a prompt" message.
        k: Number of similar benchmark questions to request from the
            vector database.

    Returns:
        Markdown with the risk level, weighted success rate, average
        similarity, recommendation and the matched questions, or an
        ``"Error analyzing prompt: ..."`` string if anything fails.
    """
    if not prompt or not prompt.strip():
        return "Please enter a prompt to analyze."

    try:
        # presumably the UI slider can deliver a float; coerce so the count
        # is an int for the query and the footer -- TODO confirm
        k = int(k)
        db = get_db()
        result = db.query_similar_questions(prompt, k=k)
        similar = result['similar_questions']

        output = [
            "## 🎯 Difficulty Assessment\n",
            f"**Risk Level**: {result['risk_level']}",
            f"**Success Rate**: {result['weighted_success_rate']:.1%}",
            f"**Avg Similarity**: {result['avg_similarity']:.3f}",
            "",
            f"**Recommendation**: {result['recommendation']}",
            "",
            "## 🔍 Similar Benchmark Questions\n",
        ]

        for i, q in enumerate(similar, 1):
            text = q['question_text']
            # Only mark the preview as truncated when it actually was.
            preview = f"{text[:100]}..." if len(text) > 100 else text
            output.append(f"{i}. **{preview}**")
            output.append(f"   - Source: {q['source']} ({q['domain']})")
            output.append(f"   - Success Rate: {q['success_rate']:.1%}")
            output.append(f"   - Similarity: {q['similarity']:.3f}")
            output.append("")

        total_questions = db.collection.count()
        # Report the number of matches shown rather than the number requested.
        output.append(
            f"*Analyzed using {len(similar)} most similar questions "
            f"from {total_questions:,} benchmark questions*"
        )

        return "\n".join(output)
    except Exception as e:
        logger.exception("Prompt difficulty analysis failed")
        return f"Error analyzing prompt: {str(e)}"
127
+
128
+ # ==========================================================================
129
+ # Database status and expansion helpers
130
+ # ==========================================================================
131
+
132
def get_database_info() -> str:
    """Return a Markdown summary of how many questions are indexed.

    When the module-level ``db`` is still ``None`` a "not initialized"
    notice is returned and no initialization is attempted here.

    Returns:
        Markdown with current size, total available, progress and the
        remaining count, or an ``"Error getting database info: ..."``
        string if the database cannot be queried.
    """
    global db
    if db is None:
        return (
            "### ⚠️ Database Not Initialized\n"
            "\n"
            "**Status:** Waiting for initialization\n"
            "\n"
            "The vector database is not yet ready. It will initialize on first use.\n"
        )
    try:
        db = get_db()
        current_count = db.collection.count()
        # NOTE(review): hard-coded corpus size; expand_database() only indexes
        # the MMLU-Pro validation split and reports len(dataset) as the total,
        # so these two figures may disagree -- confirm the intended target.
        total_available = 32719
        remaining = max(0, total_available - current_count)
        progress_pct = (current_count / total_available * 100) if total_available > 0 else 0

        lines = [
            "### 📊 Database Status\n",
            f"**Current Size:** {current_count:,} questions",
            f"**Total Available:** {total_available:,} questions",
            f"**Progress:** {progress_pct:.1f}% complete",
            f"**Remaining:** {remaining:,} questions\n",
        ]
        if remaining > 0:
            # Ceiling division: number of 5,000-question batches still needed.
            clicks_needed = (remaining + 4999) // 5000
            lines.append("💡 Click 'Expand Database' to add 5,000 more questions")
            lines.append(f"📈 ~{clicks_needed} more clicks to reach full 32K+ dataset")
        else:
            lines.append("🎉 Database is complete with all available questions!")
        return "\n".join(lines)
    except Exception as e:
        return f"Error getting database info: {str(e)}"
161
+
162
+
163
def expand_database(batch_size: int = 5000) -> str:
    """Index up to ``batch_size`` more MMLU-Pro validation questions.

    Args:
        batch_size: Maximum number of questions to add in this call. Must be
            positive; other values are rejected with an error message.

    Returns:
        A Markdown status message describing what was added and what
        remains, or an error message (with a truncated traceback) on failure.
    """
    global db
    if batch_size <= 0:
        # Without this guard a non-positive size falls through to the
        # misleading "already indexed" message below.
        return "❌ Error expanding database: batch_size must be a positive integer."

    try:
        db = get_db()
        from datasets import load_dataset
        from benchmark_vector_db import BenchmarkQuestion

        current_count = db.collection.count()
        total_available = 32719
        if current_count >= total_available:
            return f"✅ Database complete at {current_count:,}/{total_available:,}."

        # Load MMLU-Pro validation set (not test, to avoid overlap)
        logger.info(f"Expanding database by up to {batch_size} questions...")
        dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="validation")
        dataset_size = len(dataset)  # type: ignore

        # NOTE(review): the collection size is used as the offset into the
        # validation split. That only lines up if every indexed question came
        # from this split in order -- confirm against how the DB was seeded.
        start_idx = current_count
        actual_batch_size = min(batch_size, dataset_size - start_idx)

        if actual_batch_size <= 0:
            return "✅ All MMLU-Pro validation questions already indexed."

        new_questions = []
        for idx in range(start_idx, start_idx + actual_batch_size):
            item = dataset[idx]  # type: ignore
            is_record = isinstance(item, dict)
            q = BenchmarkQuestion(
                question_id=f"mmlu_pro_val_{idx}",
                source_benchmark="MMLU_Pro",
                domain=str(item.get('category', 'unknown')).lower() if is_record else 'unknown',
                question_text=str(item['question']) if is_record else str(item),
                correct_answer=str(item['answer']) if is_record else '',
                choices=item.get('options', []) if is_record else [],
                success_rate=0.45,  # MMLU-Pro average
                difficulty_score=0.55,
                difficulty_label="Hard",
                num_models_tested=0
            )
            new_questions.append(q)

        # Index the batch
        if new_questions:
            db.index_questions(new_questions)

        new_count = db.collection.count()
        remaining = max(0, dataset_size - new_count)

        result = f"✅ Added {len(new_questions)} questions.\n\n"
        result += f"**Total:** {new_count:,}/{dataset_size:,} (MMLU-Pro validation)\n"
        result += f"**Remaining:** {remaining:,}\n"
        if remaining > 0:
            result += f"💡 Click again to add up to {min(batch_size, remaining):,} more."
        else:
            result += "🎉 All MMLU-Pro validation questions indexed!"
        return result

    except Exception as e:
        logger.exception("Expansion failed: %s", e)
        import traceback
        # Cap the traceback so the UI message stays readable.
        error_details = traceback.format_exc()[:500]
        return f"❌ Error expanding database: {str(e)}\n\nDetails:\n{error_details}"
224
+
225
+ # ============================================================================
226
+ # TAB 2: CHAT INTERFACE WITH MCP TOOLS
227
+ # ============================================================================
228
+
229
def tool_check_prompt_difficulty(prompt: str, k: int = 5) -> Dict:
    """MCP Tool: summarize how difficult a prompt looks.

    Queries the vector database for the ``k`` most similar benchmark
    questions and returns a dictionary of pre-formatted strings plus up to
    three of the matches. Any failure is reported as ``{"error": ...}``.
    """
    try:
        analysis = get_db().query_similar_questions(prompt, k=k)

        summary = {
            "risk_level": analysis['risk_level'],
            "success_rate": f"{analysis['weighted_success_rate']:.1%}",
            "avg_similarity": f"{analysis['avg_similarity']:.3f}",
            "recommendation": analysis['recommendation'],
        }

        # Keep only the three closest matches, with the question text capped
        # at 150 characters.
        top_matches = []
        for match in analysis['similar_questions'][:3]:
            top_matches.append({
                "question": match['question_text'][:150],
                "source": match['source'],
                "domain": match['domain'],
                "success_rate": f"{match['success_rate']:.1%}",
                "similarity": f"{match['similarity']:.3f}",
            })
        summary["similar_questions"] = top_matches
        return summary
    except Exception as exc:
        return {"error": f"Analysis failed: {str(exc)}"}
253
+
254
# Destructive commands/requests; one hit is enough to rate the prompt "high".
_DANGEROUS_OPERATION_RE = re.compile(
    r'\brm\s+-rf\b'
    r'|\bdelete\s+all\b'
    r'|\bformat\s+.*drive\b'
    r'|\bdrop\s+database\b',
    re.IGNORECASE,
)

# Medical terms anchored at a word start, so "secure" or "procure" no longer
# trigger the "cure" keyword while "cured" or "treatments" still do.
_MEDICAL_TERM_RE = re.compile(
    r'\b(?:diagnose|treatment|medication|symptoms|cure|disease)',
    re.IGNORECASE,
)

# "build/create/write ... <3+ digit number> lines/functions/classes".
_LARGE_CODING_RE = re.compile(
    r'\b(build|create|write)\s+.*\b(\d{3,})\s+(lines|functions|classes)',
    re.IGNORECASE,
)


def tool_analyze_prompt_safety(prompt: str) -> Dict:
    """MCP Tool: Analyze prompt for safety issues.

    Runs three heuristic checks: destructive operations (high risk), medical
    advice requests (moderate) and very large coding requests (moderate).
    A later check never lowers a risk level set by an earlier one.

    Args:
        prompt: The text to inspect.

    Returns:
        A dict with ``risk_level`` ("low", "moderate" or "high"),
        ``issues_found`` (count), ``issues`` (messages, or a single
        "no concerns" entry) and ``recommendation``.
    """
    issues = []
    risk_level = "low"

    if _DANGEROUS_OPERATION_RE.search(prompt):
        issues.append("Detected potentially dangerous file operation")
        risk_level = "high"

    if _MEDICAL_TERM_RE.search(prompt):
        issues.append("Medical advice request detected - requires professional consultation")
        if risk_level == "low":
            risk_level = "moderate"

    if _LARGE_CODING_RE.search(prompt):
        issues.append("Large-scale coding request - may exceed LLM capabilities")
        if risk_level == "low":
            risk_level = "moderate"

    return {
        "risk_level": risk_level,
        "issues_found": len(issues),
        "issues": issues if issues else ["No significant safety concerns detected"],
        "recommendation": "Proceed with caution" if issues else "Prompt appears safe",
    }
287
+
288
_LLM_SYSTEM_PROMPT = """You are ToGMAL Assistant, an AI that helps analyze prompts for difficulty and safety.

You have access to these tools:
1. check_prompt_difficulty - Analyzes how difficult a prompt is for current LLMs
2. analyze_prompt_safety - Checks for safety issues in prompts

When a user asks about prompt difficulty, safety, or capabilities, use the appropriate tool.
To call a tool, respond with: TOOL_CALL: tool_name(arg1="value1", arg2="value2")

After a tool is called, you will receive: TOOL_RESULT: name=<tool_name> data=<json>
Use TOOL_RESULT to provide a helpful, comprehensive response to the user."""

# Quote-aware match: parentheses inside quoted argument values do not end the
# call. Falls back to the lenient pattern when quotes are unbalanced.
_TOOL_CALL_STRICT_RE = re.compile(
    r'TOOL_CALL:\s*(\w+)\(((?:"[^"]*"|\'[^\']*\'|[^)"\'])*)\)', re.DOTALL
)
_TOOL_CALL_LENIENT_RE = re.compile(r'TOOL_CALL:\s*(\w+)\((.*?)\)')
_TOOL_ARG_RE = re.compile(r'(\w+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^,]*))')

_ROLE_LABELS = {'user': 'User', 'assistant': 'Assistant', 'system': 'System'}

_DIFFICULTY_EXPLANATIONS = {
    "CRITICAL": "this is extremely challenging - you'll likely need to break it into smaller steps or use specialized tools.",
    "HIGH": "this is quite difficult - consider using multi-step reasoning and verification.",
    "MODERATE": "this is moderately challenging - chain-of-thought prompting should help.",
}
_DIFFICULTY_DEFAULT_EXPLANATION = (
    "this is within normal LLM capabilities - a standard response should work well."
)


def _parse_tool_arguments(args_str: str) -> Dict[str, str]:
    """Parse ``key="value", key2=value2`` into a dict, honoring quotes."""
    args = {}
    for found in _TOOL_ARG_RE.finditer(args_str):
        key, double_quoted, single_quoted, bare = found.groups()
        if double_quoted is not None:
            args[key] = double_quoted
        elif single_quoted is not None:
            args[key] = single_quoted
        else:
            args[key] = bare.strip().strip('"\'')
    return args


def _extract_tool_call(response_text: str) -> Tuple[str, Optional[Dict]]:
    """Split an LLM reply into its text and an optional parsed TOOL_CALL."""
    if "TOOL_CALL:" not in response_text:
        return response_text, None
    found = (_TOOL_CALL_STRICT_RE.search(response_text)
             or _TOOL_CALL_LENIENT_RE.search(response_text))
    if not found:
        return response_text, None
    tool_call = {
        "name": found.group(1),
        "arguments": _parse_tool_arguments(found.group(2)),
    }
    remaining = response_text[:found.start()] + response_text[found.end():]
    # Drop any further TOOL_CALL directives so they never reach the user.
    remaining = re.sub(r'TOOL_CALL:.*?\)', '', remaining).strip()
    return remaining, tool_call


def _synthesize_tool_result(tool_result_str: str) -> Optional[str]:
    """Turn a ``TOOL_RESULT: name=X data={json}`` message into prose.

    Returns None when the payload is missing or is neither a safety nor a
    difficulty result (for example an ``{"error": ...}`` payload).
    """
    found = re.search(r'data=(.+)$', tool_result_str, re.DOTALL)
    if not found:
        return None
    result_data = json.loads(found.group(1))

    # Safety results also carry "risk_level", so they must be recognized
    # first; testing "risk_level" first would route every safety result into
    # the difficulty wording.
    if 'issues_found' in result_data:
        risk = result_data['risk_level']
        issues = result_data.get('issues', [])
        response = (
            "I've checked this prompt for safety concerns.\n"
            "\n"
            f"**Safety Assessment:** {risk.upper()} risk\n"
            "\n"
        )
        if issues and issues[0] != "No significant safety concerns detected":
            response += "**Concerns identified:**\n"
            response += "".join(f"- {issue}\n" for issue in issues)
            response += "\nPlease proceed carefully with this request."
        else:
            response += "No significant safety concerns detected. The prompt appears safe to process."
        return response

    if 'risk_level' in result_data:
        risk = result_data['risk_level']
        success = result_data.get('success_rate', 'unknown')
        rec = result_data.get('recommendation', '')
        return (
            "I've analyzed this prompt's difficulty. Here's what I found:\n"
            "\n"
            f"**Difficulty Assessment:** {risk}\n"
            "\n"
            f"Based on similarity to benchmark questions, LLMs have about a {success} success rate on similar tasks.\n"
            "\n"
            f"{rec}\n"
            "\n"
            "This means "
        ) + _DIFFICULTY_EXPLANATIONS.get(risk, _DIFFICULTY_DEFAULT_EXPLANATION)

    return None


def call_llm_with_tools(
    messages: List[Dict[str, str]],
    available_tools: List[Dict],
    model: str = "mistralai/Mistral-7B-Instruct-v0.2"
) -> Tuple[str, Optional[Dict]]:
    """Call LLM with tool calling capability.

    If the last message is a system ``TOOL_RESULT:`` message, a reply is
    synthesized locally from its JSON payload. Otherwise (or when synthesis
    is not possible) the conversation is sent to the HuggingFace Inference
    API, and any failure there hands over to ``fallback_llm``.

    Args:
        messages: Conversation as dicts with ``role`` and ``content``.
        available_tools: Tool descriptors, passed on to ``fallback_llm``.
        model: Model identifier passed to the text-generation call.

    Returns:
        ``(response_text, tool_call)`` where ``tool_call`` is ``None`` or
        ``{"name": ..., "arguments": {...}}`` parsed from a ``TOOL_CALL:``
        directive in the model output.
    """
    # Check if this is a TOOL_RESULT message - if so, synthesize response
    last_msg = messages[-1] if messages else {}
    last_content = last_msg.get('content', '')
    if last_msg.get('role') == 'system' and 'TOOL_RESULT:' in last_content:
        try:
            synthesized = _synthesize_tool_result(last_content)
        except (ValueError, KeyError, TypeError, AttributeError) as e:
            logger.warning(f"Failed to synthesize tool result: {e}")
            synthesized = None
        if synthesized is not None:
            return synthesized, None
        # Fall through to HuggingFace API attempt

    # Try HuggingFace API for initial responses
    try:
        from huggingface_hub import InferenceClient
        client = InferenceClient()

        turns = [
            f"{_ROLE_LABELS[msg['role']]}: {msg['content']}\n"
            for msg in messages
            if msg['role'] in _ROLE_LABELS  # other roles are not forwarded
        ]
        conversation = _LLM_SYSTEM_PROMPT + "\n\n" + "".join(turns) + "Assistant: "

        response = client.text_generation(
            conversation,
            model=model,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True
        )

        response_text, tool_call = _extract_tool_call(response.strip())

        logger.info("✓ HuggingFace API call successful")
        return response_text, tool_call

    except Exception as e:
        logger.warning(f"HuggingFace API unavailable ({str(e)[:100]}), using fallback")
        return fallback_llm(messages, available_tools)
421
+
422
def fallback_llm(messages: List[Dict[str, str]], available_tools: List[Dict]) -> Tuple[str, Optional[Dict]]:
    """Keyword-based stand-in used when the HF API cannot be reached.

    Looks only at the most recent message and returns an empty response
    together with the tool call to run, or a help text (and no tool call)
    when there is nothing to analyze.
    """
    latest = messages[-1]['content'] if messages else ""
    lowered = latest.lower()

    # Nothing to analyze: introduce the assistant instead of calling a tool.
    if not lowered.strip():
        return """I'm ToGMAL Assistant. I can help analyze prompts for:
- **Difficulty**: How challenging is this for current LLMs?
- **Safety**: Are there any safety concerns?

Try asking me to analyze a prompt!""", None

    # Safety intent takes priority over everything else.
    safety_words = ('safe', 'safety', 'dangerous', 'risk')
    for word in safety_words:
        if word in lowered:
            return "", {"name": "analyze_prompt_safety", "arguments": {"prompt": latest}}

    # Every other non-blank message - including explicit difficulty
    # questions - is routed to the difficulty checker.
    return "", {"name": "check_prompt_difficulty", "arguments": {"prompt": latest, "k": 5}}
443
+
444
# Descriptors for the tools the chat assistant may call. Each entry carries
# the tool name handled by execute_tool(), a description, and a mapping of
# parameter name -> human-readable explanation.
AVAILABLE_TOOLS = [
    {
        "name": "check_prompt_difficulty",
        "description": "Analyzes how difficult a prompt is for current LLMs",
        "parameters": {"prompt": "The prompt to analyze", "k": "Number of similar questions"}
    },
    {
        "name": "analyze_prompt_safety",
        "description": "Checks for safety issues in prompts",
        "parameters": {"prompt": "The prompt to analyze"}
    }
]
456
+
457
def execute_tool(tool_name: str, arguments: Dict) -> Dict:
    """Dispatch a parsed tool call to its implementation.

    Unknown tool names yield ``{"error": "Unknown tool: <name>"}``. For the
    difficulty tool, ``k`` is coerced to an int (defaulting to 5 when that
    fails) and clamped to the range 1-100.
    """
    if tool_name == "analyze_prompt_safety":
        return tool_analyze_prompt_safety(arguments.get("prompt", ""))

    if tool_name != "check_prompt_difficulty":
        return {"error": f"Unknown tool: {tool_name}"}

    prompt_text = arguments.get("prompt", "")
    try:
        neighbours = int(arguments.get("k", 5))
    except Exception:
        # Arguments parsed from LLM output are strings and may not be numeric.
        neighbours = 5
    neighbours = min(100, max(1, neighbours))
    return tool_check_prompt_difficulty(prompt_text, neighbours)
471
+
472
def format_tool_result(tool_name: str, result: Dict) -> str:
    """Format tool result as natural language.

    Args:
        tool_name: ``"check_prompt_difficulty"`` or
            ``"analyze_prompt_safety"``; any other name falls back to a JSON
            dump of ``result``.
        result: The dictionary returned by the tool. An ``"error"`` key
            produces a short apology instead of the full report.

    Returns:
        A Markdown string suitable for display in the chat window.
    """
    if tool_name == "check_prompt_difficulty":
        if "error" in result:
            return f"Sorry, I couldn't analyze the difficulty: {result['error']}"

        def preview(text: str) -> str:
            # Only add an ellipsis when the question was actually cut.
            return f"{text[:100]}..." if len(text) > 100 else text

        similar = "\n".join(
            f"• {preview(q['question'])} (Success: {q['success_rate']})"
            for q in result['similar_questions'][:2]
        )
        return "\n".join([
            "Based on my analysis of similar benchmark questions:",
            "",
            f"**Difficulty Level:** {result['risk_level'].upper()}",
            f"**Success Rate:** {result['success_rate']}",
            f"**Similarity:** {result['avg_similarity']}",
            "",
            f"**Recommendation:** {result['recommendation']}",
            "",
            "**Similar questions:**",
            similar,
            "",
        ])
    elif tool_name == "analyze_prompt_safety":
        if "error" in result:
            return f"Sorry, I couldn't analyze safety: {result['error']}"
        issues = "\n".join(f"• {issue}" for issue in result['issues'])
        return "\n".join([
            "**Safety Analysis:**",
            "",
            f"**Risk Level:** {result['risk_level'].upper()}",
            f"**Issues Found:** {result['issues_found']}",
            "",
            issues,
            "",
            f"**Recommendation:** {result['recommendation']}",
            "",
        ])
    return json.dumps(result, indent=2)
502
+
503
def chat(message: str, history: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str]], str]:
    """Process chat message with tool calling.

    Rebuilds the conversation from ``history``, asks the LLM (or fallback)
    for a reply, executes any tool call it requests and then asks again so
    the tool result can be turned into a natural-language answer.

    Args:
        message: The new user message.
        history: Previous ``(user, assistant)`` pairs; ``None`` is treated as
            an empty conversation. The list is extended in place.

    Returns:
        ``(history, tool_status)`` where ``tool_status`` is a Markdown trace
        of the tool call (empty when no tool was used).
    """
    if history is None:
        history = []

    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    # Step 1: Get LLM response (may include tool call)
    response_text, tool_call = call_llm_with_tools(messages, AVAILABLE_TOOLS)

    tool_status = ""

    if tool_call:
        tool_name = tool_call['name']
        tool_args = tool_call['arguments']

        tool_status = f"🛠️ **Calling tool:** `{tool_name}`\n**Arguments:** {json.dumps(tool_args, indent=2)}\n\n"

        # Execute the tool
        tool_result = execute_tool(tool_name, tool_args)
        tool_status += f"**Result:**\n```json\n{json.dumps(tool_result, indent=2)}\n```\n\n"

        # Step 2: Add tool result and get final LLM response
        messages.append({
            "role": "system",
            "content": f"TOOL_RESULT: name={tool_name} data={json.dumps(tool_result)}"
        })

        # Try to get LLM to synthesize the result; a tool call requested at
        # this stage is ignored so the exchange ends after one tool run.
        final_response, _ = call_llm_with_tools(messages, AVAILABLE_TOOLS)

        # If LLM provided a response, use it; otherwise format the tool result nicely
        if final_response and final_response.strip():
            response_text = final_response
        else:
            # Format tool result as a natural language response
            response_text = format_tool_result(tool_name, tool_result)
            tool_status += "\n_Note: LLM did not provide synthesis, using formatted tool result_\n"

    # If still no response text, provide default message
    if not response_text or not response_text.strip():
        response_text = """I'm ToGMAL Assistant. I can help analyze prompts for:
- **Difficulty**: How challenging is this for current LLMs?
- **Safety**: Are there any safety concerns?

Try asking me to analyze a prompt!"""

    history.append((message, response_text))
    return history, tool_status
555
+
556
+ # ============================================================================
557
+ # GRADIO INTERFACE - TABBED LAYOUT
558
+ # ============================================================================
559
+
560
# Two-tab Gradio UI: a direct difficulty analyzer and a tool-calling chat.
with gr.Blocks(title="ToGMAL - Difficulty Analyzer + Chat", css="""
.tab-nav button { font-size: 16px !important; padding: 12px 24px !important; }
.gradio-container { max-width: 1200px !important; }
""") as demo:

    gr.Markdown("# 🧠 ToGMAL - Intelligent LLM Analysis Platform")
    gr.Markdown("""
    **Taxonomy of Generative Model Apparent Limitations**

    Choose your interface:
    - **Difficulty Analyzer** - Direct analysis of prompt difficulty using 32K+ benchmarks
    - **Chat Assistant** - Interactive chat where AI can call MCP tools dynamically
    """)

    with gr.Tabs():
        # TAB 1: DIFFICULTY ANALYZER
        with gr.Tab("📊 Difficulty Analyzer"):
            gr.Markdown("### Analyze Prompt Difficulty")
            gr.Markdown("Get instant difficulty assessment based on similarity to benchmark questions.")
            with gr.Accordion("📚 Database Management", open=False):
                db_info = gr.Markdown(get_database_info())
                with gr.Row():
                    expand_btn = gr.Button("🚀 Expand Database (+5K)")
                    refresh_btn = gr.Button("🔄 Refresh Stats")
                expand_output = gr.Markdown()
                # Refresh the stats panel once expansion finishes so the
                # displayed counts do not go stale.
                expand_btn.click(
                    fn=expand_database, inputs=[], outputs=expand_output
                ).then(fn=get_database_info, inputs=[], outputs=db_info)
                refresh_btn.click(fn=get_database_info, inputs=[], outputs=db_info)

            with gr.Row():
                with gr.Column():
                    analyzer_prompt = gr.Textbox(
                        label="Enter your prompt",
                        placeholder="e.g., Calculate the quantum correction to the partition function...",
                        lines=3
                    )
                    analyzer_k = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=5,
                        step=1,
                        label="Number of similar questions to show"
                    )
                    analyzer_btn = gr.Button("Analyze Difficulty", variant="primary")

                with gr.Column():
                    analyzer_output = gr.Markdown(label="Analysis Results")

            gr.Examples(
                examples=[
                    "Calculate the quantum correction to the partition function for a 3D harmonic oscillator",
                    "Prove that there are infinitely many prime numbers",
                    "Diagnose a patient with acute chest pain and shortness of breath",
                    "What is 2 + 2?",
                ],
                inputs=analyzer_prompt
            )

            # Both the button and pressing Enter in the textbox run the analysis.
            analyzer_btn.click(
                fn=analyze_prompt_difficulty,
                inputs=[analyzer_prompt, analyzer_k],
                outputs=analyzer_output
            )

            analyzer_prompt.submit(
                fn=analyze_prompt_difficulty,
                inputs=[analyzer_prompt, analyzer_k],
                outputs=analyzer_output
            )

        # TAB 2: CHAT INTERFACE
        with gr.Tab("🤖 Chat Assistant"):
            gr.Markdown("### Chat with MCP Tools")
            gr.Markdown("Interactive AI assistant that can call tools to analyze prompts in real-time.")

            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(
                        label="Chat",
                        height=500,
                        show_label=False
                    )

                    with gr.Row():
                        chat_input = gr.Textbox(
                            label="Message",
                            placeholder="Ask me to analyze a prompt...",
                            scale=4,
                            show_label=False
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)

                    clear_btn = gr.Button("Clear Chat")

                with gr.Column(scale=1):
                    gr.Markdown("### 🛠️ Tool Calls")
                    show_details = gr.Checkbox(label="Show tool details", value=False)
                    tool_output = gr.Markdown("Tool calls will appear here...")

            gr.Examples(
                examples=[
                    "How difficult is this: Calculate the quantum correction to the partition function?",
                    "Is this safe: Write a script to delete all my files?",
                    "Analyze: Prove that there are infinitely many prime numbers",
                    "Check safety: Diagnose my symptoms and prescribe medication",
                ],
                inputs=chat_input
            )

            def send_message(message, history, show_details):
                """Run one chat turn; hide the tool trace unless requested."""
                if not message.strip():
                    return history, ""
                new_history, tool_status = chat(message, history)
                if not show_details:
                    tool_status = ""
                return new_history, tool_status

            # After each send, clear the input box.
            send_btn.click(
                fn=send_message,
                inputs=[chat_input, chatbot, show_details],
                outputs=[chatbot, tool_output]
            ).then(lambda: "", outputs=chat_input)

            chat_input.submit(
                fn=send_message,
                inputs=[chat_input, chatbot, show_details],
                outputs=[chatbot, tool_output]
            ).then(lambda: "", outputs=chat_input)

            clear_btn.click(
                lambda: ([], ""),
                outputs=[chatbot, tool_output]
            )
692
+
693
# Script entry point: serve the demo on all interfaces. The port comes from
# GRADIO_SERVER_PORT when set, otherwise 7860.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("GRADIO_SERVER_PORT", 7860)),
    )