mgbam commited on
Commit
8b75c9f
·
verified ·
1 Parent(s): c112240

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +432 -0
app.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GENESIS-AI MCP Studio — Hugging Face Space
3
+ ==========================================
4
+ A one-file, production-leaning prototype that fuses:
5
+ • MCP-style tool adapters (RCSB PDB, medRxiv, Raindrop, QuickChart, MeasureSpace)
6
+ • Hugging Face Transformers (summarization, keyphrase extraction, NER, Q&A)
7
+ • Agentic orchestration (tool-using graph with spec-like permissions)
8
+ • Gradio UI for instant deployment on Hugging Face Spaces
9
+
10
+ Run locally:
11
+ pip install -U transformers accelerate torch gradio httpx pydantic python-dotenv rich
12
+ HF_HOME=.hf_cache # optional local cache
13
+ python app.py
14
+
15
+ Deploy on Hugging Face Spaces:
16
+ • Space type: Gradio
17
+ • Add secrets in the Space Settings as environment variables (see .env keys below)
18
+
19
+ .env (optional, set as secrets in HF Space):
20
+ RAINDROP_TOKEN=... # for Raindrop.io adapter
21
+ MEASURESPACE_API_KEY=... # weather/geocode adapter
22
+ QUICKCHART_BASE=https://quickchart.io/chart
23
+
24
+ Notes:
25
+ - External adapters are permission-gated at call-time and can be expanded.
26
+ - The medRxiv adapter uses a public JSON endpoint via crossref for robust search; switch to official APIs where available.
27
+ - This is a wow-piece: clean architecture + real utility out-of-the-box.
28
+ """
29
+
30
+ from __future__ import annotations
31
+ import os
32
+ import re
33
+ import json
34
+ import time
35
+ from dataclasses import dataclass, field
36
+ from typing import Any, Dict, List, Optional, Tuple
37
+
38
+ import httpx
39
+ import gradio as gr
40
+ from pydantic import BaseModel
41
+ from rich import print as rprint
42
+
43
+ # ----------------------------
44
+ # Hugging Face model helpers
45
+ # ----------------------------
46
+ from transformers import pipeline
47
+
48
# Module-level slots for the Hugging Face pipelines. They start empty and are
# filled by the matching get_*() helper the first time it is called.
_SUMMARIZER = _QA = _NER = _KEYPHRASE = None
52
+
53
+
54
def get_summarizer():
    """Return the shared summarization pipeline, creating it on first use.

    The pipeline (``facebook/bart-large-cnn``) is stored in the module-level
    ``_SUMMARIZER`` slot, so later calls return the same object.
    """
    global _SUMMARIZER
    if _SUMMARIZER is not None:
        return _SUMMARIZER
    _SUMMARIZER = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device_map="auto",
    )
    return _SUMMARIZER
61
+
62
+
63
def get_qa():
    """Return the shared question-answering pipeline, creating it on first use.

    The pipeline (``deepset/roberta-base-squad2``) is stored in the
    module-level ``_QA`` slot, so later calls return the same object.
    """
    global _QA
    if _QA is not None:
        return _QA
    _QA = pipeline(
        "question-answering",
        model="deepset/roberta-base-squad2",
        device_map="auto",
    )
    return _QA
68
+
69
+
70
def get_ner():
    """Return the shared NER pipeline, creating it on first use.

    The pipeline (``dslim/bert-base-NER`` with ``aggregation_strategy="simple"``)
    is stored in the module-level ``_NER`` slot, so later calls return the
    same object.
    """
    global _NER
    if _NER is not None:
        return _NER
    _NER = pipeline(
        "ner",
        model="dslim/bert-base-NER",
        aggregation_strategy="simple",
        device_map="auto",
    )
    return _NER
75
+
76
+
77
def get_keyphrase():
    """Return the keyphrase extractor, which is currently the NER pipeline.

    No dedicated keyphrase model is loaded: the first call stores the object
    returned by ``get_ner()`` in ``_KEYPHRASE`` and later calls return it.
    Swap in a dedicated model here if desired.
    """
    global _KEYPHRASE
    if _KEYPHRASE is not None:
        return _KEYPHRASE
    # Named entities stand in for key phrases.
    _KEYPHRASE = get_ner()
    return _KEYPHRASE
84
+
85
+
86
+ # ----------------------------
87
+ # Minimal MCP-style abstractions
88
+ # ----------------------------
89
class Permission(BaseModel):
    """One access grant: a server, the kind of access, and the resource."""

    # Backing service the grant applies to (the file uses e.g. "crossref").
    server: str
    # Access level, e.g., "read", "write".
    scope: str
    # Resource on that server, e.g., "medrxiv", "raindrop".
    resource: str
93
+
94
+
95
class ToolResult(BaseModel):
    """Uniform envelope returned by a tool call."""

    # True when the call succeeded.
    ok: bool
    # Payload; the tools in this file set it on success.
    data: Any = None
    # Failure message; the tools in this file set it when ok is False.
    error: Optional[str] = None
99
+
100
+
101
class Tool:
    """Base interface for the MCP-style tool adapters.

    Subclasses provide ``name``, ``description`` and ``requires`` as class
    attributes and override :meth:`call`.
    """

    name: str
    description: str
    requires: List[Permission]

    async def call(self, **kwargs) -> ToolResult:
        """Run the tool; the base class has no implementation and raises."""
        raise NotImplementedError
108
+
109
+
110
+ # ----------------------------
111
+ # Adapters (MCP-like Servers)
112
+ # ----------------------------
113
class MedRxivTool(Tool):
    """Literature search adapter backed by the Crossref ``/works`` endpoint."""

    name = "medrxiv.search"
    description = "Search medRxiv / bioRxiv via Crossref for recent preprints."
    requires = [Permission(server="crossref", scope="read", resource="literature")]

    @staticmethod
    def _clean_abstract(raw: Optional[str]) -> str:
        """Turn a Crossref abstract (may contain markup) into plain text."""
        import html  # local import keeps this block self-contained

        text = re.sub(r"<[^>]+>", " ", raw or "")
        # Unescape only after the tags are gone, so an escaped "&lt;" in the
        # prose (e.g. "p &lt; 0.05") is never mistaken for the start of a tag.
        text = html.unescape(text)
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _issued_year(item: Dict[str, Any]) -> Optional[int]:
        """Return the first component of ``issued.date-parts`` or ``None``."""
        parts = (item.get("issued") or {}).get("date-parts") or []
        first = parts[0] if parts else None
        return first[0] if first else None

    @classmethod
    def _to_record(cls, item: Dict[str, Any]) -> Dict[str, Any]:
        """Map one Crossref work item to the flat dict the pipeline consumes."""
        return {
            "title": (item.get("title") or [""])[0],
            # Organisational authors carry "name" instead of "family".
            "authors": [
                a.get("family") or a.get("name") or ""
                for a in item.get("author") or []
            ],
            "url": item.get("URL"),
            "venue": (item.get("container-title") or [""])[0],
            "date": cls._issued_year(item),
            "abstract": cls._clean_abstract(item.get("abstract")),
        }

    async def call(self, query: str, max_results: int = 5) -> ToolResult:
        """Search Crossref for works with abstracts published since 2023.

        Args:
            query: Free-text search string passed as Crossref's ``query``.
            max_results: Number of rows requested from Crossref.

        Returns:
            ``ToolResult(ok=True, data=[...])`` where each entry has the keys
            ``title``, ``authors``, ``url``, ``venue``, ``date`` and
            ``abstract``; or ``ToolResult(ok=False, error=...)`` if the
            request or the parsing fails.
        """
        url = "https://api.crossref.org/works"
        # NOTE(review): the filter does not restrict results to preprints or
        # to medRxiv/bioRxiv, although the description says so - confirm
        # whether a type/prefix filter should be added.
        params = {
            "query": query,
            "filter": "from-pub-date:2023-01-01,has-abstract:true",
            "rows": max_results,
            "select": "title,author,URL,abstract,issued,container-title",
        }
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.get(url, params=params)
                resp.raise_for_status()
            items = (resp.json().get("message") or {}).get("items") or []
            return ToolResult(ok=True, data=[self._to_record(it) for it in items])
        except Exception as e:
            # Adapter boundary: failures are reported through ToolResult.
            return ToolResult(ok=False, error=str(e))
148
+
149
+
150
class RCSBPDBTool(Tool):
    """Adapter for the RCSB PDB Search API (v2) and Data API."""

    name = "rcsb.structure"
    description = "Lookup PDB structures by query and return metadata."
    requires = [Permission(server="rcsb", scope="read", resource="pdb")]

    @staticmethod
    def _build_payload(query: str, max_results: int) -> Dict[str, Any]:
        """Build the Search API request body for an id or free-text query."""
        if re.fullmatch(r"[0-9][A-Za-z0-9]{3}", query):
            # Looks like a 4-character PDB id: exact match on the entry id.
            # Upper-cased because the attribute is matched exactly.
            node = {
                "type": "terminal",
                "service": "text",
                "parameters": {
                    "attribute": "rcsb_entry_container_identifiers.entry_id",
                    "operator": "exact_match",
                    "value": query.upper(),
                },
            }
        else:
            # Unstructured queries go to the dedicated full-text service in
            # the v2 API; the attribute-based "text" service needs an attribute.
            node = {
                "type": "terminal",
                "service": "full_text",
                "parameters": {"value": query},
            }
        return {
            "query": node,
            "return_type": "entry",
            # The v2 API names the paging option "paginate" ("pager" was v1).
            "request_options": {"paginate": {"start": 0, "rows": max_results}},
        }

    async def call(self, query: str, max_results: int = 5) -> ToolResult:
        """Search RCSB and fetch core entry metadata for each hit.

        Args:
            query: A 4-character PDB id or a free-text search string.
            max_results: Maximum number of entries requested from the search.

        Returns:
            ``ToolResult(ok=True, data=[...])`` where each item is the entry
            JSON from the Data API plus a ``pdb_id`` key (empty list when the
            search has no hits); ``ToolResult(ok=False, error=...)`` on an
            empty query or any request failure.
        """
        # See: https://search.rcsb.org/#search-api
        endpoint = "https://search.rcsb.org/rcsbsearch/v2/query"
        query = (query or "").strip()
        if not query:
            return ToolResult(ok=False, error="query is empty")
        payload = self._build_payload(query, max_results)
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.post(endpoint, json=payload)
                # A search with no hits answers 204 with an empty body, which
                # would otherwise fail in resp.json().
                if resp.status_code == 204:
                    return ToolResult(ok=True, data=[])
                resp.raise_for_status()
                ids = [x.get("identifier") for x in resp.json().get("result_set", [])]
                out = []
                for pdb_id in ids:
                    if not pdb_id:
                        continue
                    info = await client.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}")
                    # Entries whose metadata cannot be fetched are skipped.
                    if info.status_code == 200:
                        out.append({"pdb_id": pdb_id, **info.json()})
                return ToolResult(ok=True, data=out)
        except Exception as e:
            # Adapter boundary: failures are reported through ToolResult.
            return ToolResult(ok=False, error=str(e))
188
+
189
+
190
class RaindropTool(Tool):
    """Bookmark adapter that posts a link to the Raindrop.io REST API."""

    name = "raindrop.save"
    description = "Save a URL to Raindrop.io (bookmarks)."
    requires = [Permission(server="raindrop", scope="write", resource="bookmarks")]

    async def call(self, url: str, title: Optional[str] = None, tags: Optional[List[str]] = None) -> ToolResult:
        """Create one bookmark.

        Args:
            url: Link to save.
            title: Optional bookmark title; omitted from the request if falsy.
            tags: Optional tags; omitted from the request if falsy.

        Returns:
            ``ToolResult(ok=True, data=<response JSON>)`` on success, or
            ``ToolResult(ok=False, error=...)`` when ``RAINDROP_TOKEN`` is
            unset or the request fails.
        """
        token = os.getenv("RAINDROP_TOKEN")
        if not token:
            return ToolResult(ok=False, error="RAINDROP_TOKEN not set")

        body = {"link": url}
        if title:
            body["title"] = title
        if tags:
            body["tags"] = tags
        auth = {"Authorization": f"Bearer {token}"}

        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.post(
                    "https://api.raindrop.io/rest/v1/raindrop",
                    json=body,
                    headers=auth,
                )
                resp.raise_for_status()
                return ToolResult(ok=True, data=resp.json())
        except Exception as e:
            return ToolResult(ok=False, error=str(e))
212
+
213
+
214
class MeasureSpaceTool(Tool):
    """Demo weather adapter; returns a canned response, no network call."""

    name = "measure.weather"
    description = "Weather/geocode lookup via MeasureSpace (demo)."
    requires = [Permission(server="measurespace", scope="read", resource="weather")]

    async def call(self, location: str) -> ToolResult:
        """Return a stub weather record for ``location``.

        Placeholder showing how a hosted MCP would be wired; replace with the
        actual endpoint/key. Fails with ``MEASURESPACE_API_KEY not set`` when
        that environment variable is missing or empty.
        """
        if os.getenv("MEASURESPACE_API_KEY"):
            # Example stub response
            demo_payload = {"location": location, "summary": "Sunny demo", "tempC": 28}
            return ToolResult(ok=True, data=demo_payload)
        return ToolResult(ok=False, error="MEASURESPACE_API_KEY not set")
226
+
227
+
228
class QuickChartTool(Tool):
    """Builds a QuickChart image URL for a simple bar chart."""

    name = "quickchart.render"
    description = "Render a chart via QuickChart and return image URL."
    requires = [Permission(server="quickchart", scope="write", resource="chart")]

    async def call(self, labels: List[str], values: List[float], title: str = "Keyphrases") -> ToolResult:
        """Return a URL that renders ``values`` against ``labels`` as bars.

        No request is made here; the URL is only constructed.

        Args:
            labels: Category labels for the x axis.
            values: One numeric value per label.
            title: Chart title, also used as the dataset label.

        Returns:
            ``ToolResult(ok=True, data={"url": ..., "config": ...})``.
        """
        from urllib.parse import quote  # local import keeps the block self-contained

        base = os.getenv("QUICKCHART_BASE", "https://quickchart.io/chart")
        cfg = {
            "type": "bar",
            "data": {"labels": labels, "datasets": [{"label": title, "data": values}]},
            "options": {"plugins": {"legend": {"display": False}, "title": {"display": True, "text": title}}},
        }
        # The config must be percent-encoded: labels are free text and may
        # contain "&", "#", "+" or spaces, which would cut off or corrupt the
        # query string.
        encoded = quote(json.dumps(cfg, separators=(",", ":")), safe="")
        # Tolerate a QUICKCHART_BASE that already carries query parameters.
        joiner = "&" if "?" in base else "?"
        url = f"{base}{joiner}c={encoded}"
        return ToolResult(ok=True, data={"url": url, "config": cfg})
242
+
243
+
244
+ # ----------------------------
245
+ # Agent Orchestrator
246
+ # ----------------------------
247
@dataclass
class AgentContext:
    """Per-run input for the agent: the query plus optional goals and grants."""

    # Research query text.
    query: str
    # Free-form goal labels; a fresh empty list per instance by default.
    goals: List[str] = field(default_factory=list)
    # Grants held for this run; a fresh empty list per instance by default.
    permissions: List[Permission] = field(default_factory=list)
252
+
253
+
254
class GenesisAgent:
    """Orchestrates the tool adapters and HF pipelines for one research query."""

    def __init__(self):
        """Instantiate one adapter of each kind."""
        self.medrxiv = MedRxivTool()
        self.rcsb = RCSBPDBTool()
        self.raindrop = RaindropTool()
        self.weather = MeasureSpaceTool()
        self.chart = QuickChartTool()

    async def run_pipeline(self, ctx: AgentContext) -> Dict[str, Any]:
        """Main pipeline:
        1) Literature search (medRxiv via Crossref)
        2) Summarize abstracts with HF
        3) Extract key entities/phrases
        4) Optional: save links to Raindrop
        5) Build a bar chart of salient keyphrases

        Args:
            ctx: Query and permissions for this run. Bookmarking only happens
                when ``ctx.permissions`` holds a raindrop/write grant.

        Returns:
            ``{"error": ...}`` if the literature search fails; otherwise a
            dict with ``query``, ``articles``, ``summaries``, ``keyphrases``
            (list of ``(phrase, count)``), ``chart`` and ``bookmarks``.
        """
        from collections import Counter  # local import keeps the block self-contained

        # 1) Literature
        lit = await self.medrxiv.call(query=ctx.query, max_results=6)
        if not lit.ok:
            return {"error": f"Literature search failed: {lit.error}"}

        # ToolResult.data defaults to None; treat that as "no articles".
        articles = lit.data or []
        texts = [
            f"Title: {art['title']}\nVenue: {art['venue']} ({art['date']})\nAbstract: {art['abstract']}"
            for art in articles
        ]

        # 2) Summarize
        summarizer = get_summarizer()
        summaries = []
        for text in texts:
            # The character clip keeps inputs small, but only truncation=True
            # guarantees the tokenised input fits the model.
            result = summarizer(
                text[:3000],
                max_length=200,
                min_length=80,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]["summary_text"])

        # 3) Keyphrase via NER
        ner = get_keyphrase()
        phrase_counts: Counter = Counter()
        for summary in summaries:
            for ent in ner(summary):
                phrase = ent.get("word")
                if not phrase:
                    continue
                # Drop "##" word-piece markers left in the entity text.
                phrase_counts[phrase.strip().replace("##", "")] += 1

        # Top phrases; placeholders keep the chart renderable when empty.
        top = phrase_counts.most_common(10)
        labels = [k for k, _ in top] or ["No phrases"]
        values = [v for _, v in top] or [1]

        # 4) Optional bookmark first three
        saved = []
        if any(p.server == "raindrop" and p.scope == "write" for p in ctx.permissions):
            for art in articles[:3]:
                if not art.get("url"):
                    # Nothing to bookmark; record the miss instead of
                    # posting a null link.
                    saved.append({"title": art["title"], "ok": False})
                    continue
                res = await self.raindrop.call(url=art["url"], title=art["title"], tags=["genesis-ai", "medrxiv"])
                saved.append({"title": art["title"], "ok": res.ok})

        # 5) Chart
        chart = await self.chart.call(labels=labels, values=values, title="Key Entities Across Summaries")

        return {
            "query": ctx.query,
            "articles": articles,
            "summaries": summaries,
            "keyphrases": top,
            "chart": chart.data if chart.ok else {"error": chart.error},
            "bookmarks": saved,
        }
328
+
329
+
330
+ # ----------------------------
331
+ # Gradio UI
332
+ # ----------------------------
333
# Stylesheet handed to gr.Blocks(css=...): boxes and cards, headings, muted
# subtitles and pill-shaped badges.
CSS = """
:root { --radius: 16px; }
.gradio-container { font-family: ui-sans-serif, system-ui; }
.box { border: 1px solid #e5e7eb; border-radius: var(--radius); padding: 16px; }
.heading { font-size: 22px; font-weight: 700; margin-bottom: 8px; }
.subtle { color: #6b7280; }
.badge { display:inline-block; padding: 2px 8px; border-radius: 999px; background: #eef2ff; margin-right:6px; }
.card { border: 1px solid #e5e7eb; border-radius: var(--radius); padding: 12px; }
"""
342
+
343
+
344
def render_articles(arts: List[Dict[str, Any]]) -> str:
    """Render article records as a block of HTML cards.

    Args:
        arts: Dicts with optional ``title``, ``url``, ``venue`` and ``date``
            keys; missing or ``None`` values render as empty text.

    Returns:
        One ``<div class='card'>`` per article joined by newlines, or
        ``<i>No results</i>`` when ``arts`` is empty.
    """
    from html import escape  # local import keeps the block self-contained

    def _safe(value: Any) -> str:
        # The fields come from an external API: escape them (quotes too,
        # since the URL lands inside an attribute) and show None as blank.
        return escape("" if value is None else str(value), quote=True)

    rows = []
    for a in arts:
        t = _safe(a.get("title", ""))
        u = _safe(a.get("url", ""))
        v = _safe(a.get("venue", ""))
        d = _safe(a.get("date", ""))
        rows.append(f"<div class='card'><div class='heading'>{t}</div><div class='subtle'>{v} · {d}</div><div><a href='{u}' target='_blank'>{u}</a></div></div>")
    return "\n".join(rows) or "<i>No results</i>"
353
+
354
+
355
def render_keyphrases(kp: List[Tuple[str, int]]) -> str:
    """Render ``(phrase, count)`` pairs as space-separated HTML badges.

    Phrases are HTML-escaped because they are extracted from external text.
    Returns ``<i>None</i>`` when ``kp`` is empty.
    """
    from html import escape  # local import keeps the block self-contained

    badges = [f"<span class='badge'>{escape(str(k))} × {v}</span>" for k, v in kp]
    return " ".join(badges) or "<i>None</i>"
357
+
358
+
359
async def generate(query: str, save_to_raindrop: bool):
    """Run the agent for one query and format its output for the UI.

    Args:
        query: Research query typed by the user.
        save_to_raindrop: When true, a raindrop/write permission is granted
            so the pipeline bookmarks the top results.

    Returns:
        A 4-tuple ``(articles_html_update, chart_url, summaries_text,
        keyphrases_html_update)``. On failure the first element carries an
        error box and the remaining three are empty strings.
    """
    from html import escape  # local import keeps the block self-contained

    query = (query or "").strip()
    if not query:
        # Do not hit the search API and the models with an empty query.
        return gr.update(value="<div class='box'><b>Error:</b> Please enter a research query.</div>"), "", "", ""

    perms = [
        Permission(server="crossref", scope="read", resource="literature"),
        Permission(server="quickchart", scope="write", resource="chart"),
    ]
    if save_to_raindrop:
        perms.append(Permission(server="raindrop", scope="write", resource="bookmarks"))

    agent = GenesisAgent()
    ctx = AgentContext(query=query, goals=["Literature review", "Key entity map"], permissions=perms)
    out = await agent.run_pipeline(ctx)
    if "error" in out:
        # The message may contain text from a remote service: escape it.
        # gr.update is used because the per-component .update() classmethods
        # were removed in Gradio 4.
        return gr.update(value=f"<div class='box'><b>Error:</b> {escape(str(out['error']))}</div>"), "", "", ""

    arts_html = render_articles(out["articles"]) \
        + ("<div class='subtle' style='margin-top:6px'>(Showing up to 6)</div>")

    chart_url = out.get("chart", {}).get("url") or ""

    summary_blob = "\n\n".join([f"— {s}" for s in out["summaries"]])

    keyphrase_html = render_keyphrases(out["keyphrases"]) \
        + ("<div class='subtle' style='margin-top:6px'>(Top 10)</div>")

    return gr.update(value=arts_html), chart_url, summary_blob, gr.update(value=keyphrase_html)
382
+
383
+
384
# Top-level UI definition: inputs, result panes, and the click handler that
# connects them to generate().
with gr.Blocks(css=CSS, title="GENESIS-AI MCP Studio") as demo:
    gr.Markdown("""
    # GENESIS-AI MCP Studio
    A Hugging Face + MCP-inspired research agent that:
    - searches recent preprints (Crossref → med/bioRxiv),
    - summarizes with **BART**,
    - maps key entities/phrases (NER),
    - renders an instant chart (QuickChart),
    - optionally saves top links to **Raindrop**.

    > Swap/expand adapters to add RCSB PDB, Kube, GitHub Actions, Open Library, etc.
    """)
    with gr.Row():
        query = gr.Textbox(label="Research query", placeholder="e.g., CRISPR base editing off-target detection")
    with gr.Row():
        save = gr.Checkbox(label="Bookmark top results to Raindrop.io", value=False)
        go = gr.Button("Run Agent ▶", variant="primary")

    gr.Markdown("### Results")
    with gr.Row():
        arts = gr.HTML()
    with gr.Row():
        chart = gr.Image(label="Key Entities Chart (auto-generated)", type="filepath")
    with gr.Row():
        summaries = gr.Textbox(label="Summaries", lines=12)
    with gr.Row():
        phrases = gr.HTML()

    async def _run(q, s):
        """Click handler: run the agent, then fetch the chart image to a file."""
        import tempfile  # local import keeps the block self-contained

        html, chart_url, summ, kp = await generate(q, s)
        # None (not "") tells the Image component there is nothing to show.
        img_path = None
        if chart_url:
            # Download chart to show inline in Spaces
            try:
                # Async client so the download does not block the event loop
                # while this coroutine runs.
                async with httpx.AsyncClient(timeout=20) as client:
                    resp = await client.get(chart_url)
                if resp.status_code == 200:
                    # A unique temp file avoids the name clash that a
                    # per-second timestamp causes for concurrent requests.
                    with tempfile.NamedTemporaryFile(prefix="chart_", suffix=".png", delete=False) as f:
                        f.write(resp.content)
                    img_path = f.name
            except Exception as e:
                # Best effort: a missing chart must not fail the whole run.
                rprint("[red]Chart download failed:", e)
        return html, img_path, summ, kp

    go.click(_run, inputs=[query, save], outputs=[arts, chart, summaries, phrases])
430
+
431
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)