Spaces:

Agents-MCP-Hackathon
/

Intelligent_Content_Organizer

Running

App Files Files Community

Nihal2000 committed on Jun 2

Commit

3e772ec

1 Parent(s): eb60db5

First version

Browse files

Files changed (18) hide show

app.py +157 -0
config.py +7 -0
core/__init__.py +0 -0
core/agent.py +17 -0
core/ai_enrichment.py +41 -0
core/components.py +23 -0
core/components.pyi +29 -0
core/database.py +81 -0
core/parser.py +30 -0
core/processing.py +42 -0
core/storage.py +58 -0
core/summarizer.py +25 -0
core/utils.py +23 -0
data/article_url.txt +0 -0
data/document1.pdf +0 -0
data/sample_note.txt +0 -0
mcp_tools.py +122 -0
requirements.txt +12 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import os
+import uuid
+import gradio as gr
+from gradio import components
+from fastmcp import FastMCP
+# from core.parser import parse_document, parse_url
+from core.parser import parse_document, parse_url
+from core.summarizer import summarize_content, tag_content
+from core.storage import add_document, search_documents
+from core.agent import answer_question
+# from core.components import DocumentViewer
+import plotly.graph_objects as go
+# Initialize the FastMCP server (for agentic tools).
+# NOTE(review): in the code shown here no tool is registered on this instance
+# and it is never run; the app is started with Gradio's `mcp_server=True`
+# option instead. Confirm whether this object is still needed.
+mcp = FastMCP("IntelligentContentOrganizer")
+# Gradio UI functions
+def process_content(file_obj, url, tags_input):
+    """
+    Handle file upload or URL input: parse content, summarize, tag, store.
+    """
+    content_text = ""
+    source = ""
+    if file_obj is not None:
+        # Save uploaded file to temp path
+        file_path = file_obj.name
+        content_text = parse_document(file_path)
+        source = file_obj.name
+    elif url:
+        content_text = parse_url(url)
+        source = url
+    else:
+        return "No document provided.", "", "", ""
+    # Summarize and tag (simulated)
+    summary = summarize_content(content_text)
+    tags = tag_content(content_text)
+    # Allow user to override or confirm tags via input
+    if tags_input:
+        # If user entered new tags, split by comma
+        tags = [t.strip() for t in tags_input.split(",") if t.strip() != ""]
+    # Store in ChromaDB with a unique ID
+    doc_id = str(uuid.uuid4())
+    metadata = {"source": source, "tags": tags}
+    add_document(doc_id, content_text, metadata)
+    return content_text, summary, ", ".join(tags), f"Document stored with ID: {doc_id}"
+def generate_graph():
+    """
+    Create a simple Plotly graph of documents.
+    Nodes = documents, edges = shared tags.
+    """
+    # Fetch all documents from ChromaDB
+    from core.storage import get_all_documents
+    docs = get_all_documents()
+    if not docs:
+        return go.Figure()  # empty
+    # Build graph connections: if two docs share a tag, connect them
+    nodes = {doc["id"]: doc for doc in docs}
+    edges = []
+    for i, doc1 in enumerate(docs):
+        for doc2 in docs[i+1:]:
+            shared_tags = set(doc1["metadata"]["tags"]) & set(doc2["metadata"]["tags"])
+            if shared_tags:
+                edges.append((doc1["id"], doc2["id"]))
+    # Use networkx to compute layout (or simple fixed positions)
+    import networkx as nx
+    G = nx.Graph()
+    G.add_nodes_from(nodes.keys())
+    G.add_edges_from(edges)
+    pos = nx.spring_layout(G, seed=42)
+    # Create Plotly traces
+    edge_x = []
+    edge_y = []
+    for (src, dst) in edges:
+        x0, y0 = pos[src]
+        x1, y1 = pos[dst]
+        edge_x += [x0, x1, None]
+        edge_y += [y0, y1, None]
+    edge_trace = go.Scatter(
+        x=edge_x, y=edge_y,
+        line=dict(width=1, color='#888'),
+        hoverinfo='none',
+        mode='lines')
+    node_x = []
+    node_y = []
+    node_text = []
+    for node_id in G.nodes():
+        x, y = pos[node_id]
+        node_x.append(x)
+        node_y.append(y)
+        text = nodes[node_id]["metadata"].get("source", "")
+        node_text.append(f"{text}\nTags: {nodes[node_id]['metadata']['tags']}")
+    node_trace = go.Scatter(
+        x=node_x, y=node_y,
+        mode='markers+text',
+        marker=dict(size=10, color='skyblue'),
+        text=node_text, hoverinfo='text', textposition="bottom center")
+    fig = go.Figure(data=[edge_trace, node_trace],
+                    layout=go.Layout(title="Document Knowledge Graph",
+                                     showlegend=False,
+                                     margin=dict(l=20, r=20, b=20, t=30)))
+    return fig
+def handle_query(question):
+    """
+    Answer a user question by retrieving relevant documents and summarizing them.
+    """
+    if not question:
+        return "Please enter a question."
+    answer = answer_question(question)
+    return answer
+# Build Gradio interface with Blocks
+# Three tabs: document ingestion, tag-based knowledge graph, and Q&A.
+with gr.Blocks(title="Intelligent Content Organizer") as demo:
+    gr.Markdown("# Intelligent Content Organizer")
+    # Tab 1: ingest a document from an upload or a URL.
+    with gr.Tab("Upload / Fetch Content"):
+        gr.Markdown("**Add a document:** Upload a file or enter a URL.")
+        with gr.Row():
+            file_in = gr.File(label="Upload Document (PDF, TXT, etc.)")
+            url_in = gr.Textbox(label="Document URL", placeholder="https://example.com/article")
+        tags_in = gr.Textbox(label="Tags (comma-separated)", placeholder="Enter tags or leave blank")
+        process_btn = gr.Button("Parse & Add Document")
+        # Read-only outputs, in the same order as process_content's return tuple.
+        doc_view = gr.Textbox(label="Document Preview", lines=10, interactive=False)
+        summary_out = gr.Textbox(label="Summary", interactive=False)
+        tags_out = gr.Textbox(label="Detected Tags", interactive=False)
+        status_out = gr.Textbox(label="Status/Info", interactive=False)
+        process_btn.click(fn=process_content, inputs=[file_in, url_in, tags_in],
+                          outputs=[doc_view, summary_out, tags_out, status_out])
+    # Tab 2: graph of stored documents, rebuilt only when the button is clicked.
+    with gr.Tab("Knowledge Graph"):
+        gr.Markdown("**Document relationships:** Shared tags indicate edges.")
+        graph_plot = gr.Plot(label="Knowledge Graph")
+        refresh_btn = gr.Button("Refresh Graph")
+        refresh_btn.click(fn=generate_graph, inputs=None, outputs=graph_plot)
+    # Tab 3: free-form question answered by handle_query.
+    with gr.Tab("Ask a Question"):
+        gr.Markdown("**AI Q&A:** Ask a question about your documents.")
+        question_in = gr.Textbox(label="Your Question")
+        answer_out = gr.Textbox(label="Answer", interactive=False)
+        ask_btn = gr.Button("Get Answer")
+        ask_btn.click(fn=handle_query, inputs=question_in, outputs=answer_out)
+if __name__ == "__main__":
+    # Launch Gradio app (Hugging Face Spaces will auto-launch this)
+    # demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
+    # mcp_server=True asks Gradio to also expose the app as an MCP server.
+    demo.launch(mcp_server=True)

config.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# config.py
+# Central place for API keys, read from the process environment at import time.
+import os
+from dotenv import load_dotenv
+load_dotenv()  # loads from .env if present
+# Each key is None when the corresponding environment variable is not set.
+MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
+CLAUDE_API_KEY  = os.environ.get("CLAUDE_API_KEY")
+BRAVE_API_KEY   = os.environ.get("BRAVE_API_KEY")

core/__init__.py ADDED Viewed

File without changes

core/agent.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import json
+from core.storage import search_documents
+# For Q&A we can use a simple retrieval + QA pipeline (stubbed here)
+# In a real app, you might use LangChain or a HuggingFace question-answering model.
+def answer_question(question: str) -> str:
+    """
+    Agent: retrieve relevant docs and answer the question.
+    """
+    # Retrieve top documents
+    results = search_documents(question, top_k=3)
+    doc_texts = results.get("documents", [[]])[0]
+    combined = " ".join(doc_texts)
+    # Stub: just echo the question and number of docs
+    if not combined.strip():
+        return "No relevant documents found."
+    return f"Answered question: '{question}' (based on {len(doc_texts)} documents)."

core/ai_enrichment.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# core/ai_enrichment.py
+from mistralai import Mistral
+import config
+def generate_tags(text: str) -> list[str]:
+    """
+    Use Mistral AI to generate 5-7 relevant tags for the text.
+    """
+    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
+        response = client.chat.complete(
+            model="mistral-small-latest",
+            messages=[{
+                "role": "user",
+                "content": f"Generate 5-7 relevant tags (comma-separated) for the following text:\n\n{text}"
+            }]
+        )
+    try:
+        content = response["choices"][0]["message"]["content"]
+    except (KeyError, IndexError):
+        return []
+    tags = [tag.strip() for tag in content.split(",") if tag.strip()]
+    return tags
+def summarize_text(text: str) -> str:
+    """
+    Use Mistral AI to generate a concise summary of the text.
+    """
+    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
+        response = client.chat.complete(
+            model="mistral-small-latest",
+            messages=[{
+                "role": "user",
+                "content": f"Summarize the following text in a concise manner:\n\n{text}"
+            }]
+        )
+    try:
+        summary = response["choices"][0]["message"]["content"].strip()
+    except (KeyError, IndexError):
+        return ""
+    return summary

core/components.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import gradio as gr
+class DocumentViewer(gr.components.Component):
+    """
+    Custom Gradio component for document preview and tag editing.
+    (Stub implementation)
+    """
+    def __init__(self, label=None):
+        super().__init__(label=label, value=None)
+        self.visible = True
+        self.interactive = False
+    def preprocess(self, x):
+        # Input is a file path (or object); just return as-is
+        return x
+    def postprocess(self, x):
+        # x is the raw document text; display first few lines as preview
+        if not x:
+            return ""
+        lines = x.splitlines()
+        preview = "\n".join(lines[:10])
+        return preview

core/components.pyi ADDED Viewed

	@@ -0,0 +1,29 @@

+import gradio as gr
+from gradio.events import Dependency
+class DocumentViewer(gr.components.Component):
+    """
+    Custom Gradio component for document preview and tag editing.
+    (Stub implementation)
+    """
+    def __init__(self, label=None):
+        super().__init__(label=label, value=None)
+        self.visible = True
+        self.interactive = False
+    def preprocess(self, x):
+        # Input is a file path (or object); just return as-is
+        return x
+    def postprocess(self, x):
+        # x is the raw document text; display first few lines as preview
+        if not x:
+            return ""
+        lines = x.splitlines()
+        # Only the first ten lines are kept for the preview.
+        preview = "\n".join(lines[:10])
+        return preview
+    # NOTE(review): the imports below sit inside the class body and this stub
+    # file repeats the full implementation; this looks tool-generated --
+    # confirm before editing by hand.
+    from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
+    from gradio.blocks import Block
+    if TYPE_CHECKING:
+        from gradio.components import Timer
+        from gradio.components.base import Component

core/database.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# core/database.py
+import chromadb
+from chromadb.config import Settings
+from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
+import config
+def init_chroma():
+    """
+    Initialize a ChromaDB client and collection with an embedding function.
+    Uses OpenAI embeddings if API key is available, otherwise a dummy embedding.
+    """
+    # Initialize Chroma client (in-memory by default)
+    client = chromadb.Client(Settings())
+    # Determine embedding function
+    embedding_fn = None
+    try:
+        openai_key = config.OPENAI_API_KEY
+    except AttributeError:
+        openai_key = None
+    if openai_key:
+        embedding_fn = OpenAIEmbeddingFunction(
+            api_key=openai_key,
+            model_name="text-embedding-ada-002"
+        )
+    else:
+        # Dummy embedding: one-dimensional embedding based on text length
+        class DummyEmbedding:
+            def __call__(self, texts):
+                return [[float(len(text))] for text in texts]
+        embedding_fn = DummyEmbedding()
+    # Create or get collection named "documents"
+    collection = client.get_or_create_collection(
+        name="documents",
+        embedding_function=embedding_fn
+    )
+    return collection
+def add_document(collection, doc_id: str, text: str, tags: list[str], summary: str, source: str):
+    """
+    Add a document to the ChromaDB collection with metadata.
+    """
+    metadata = {"tags": tags, "summary": summary, "source": source}
+    # Add document (Chroma will generate embeddings using the collection's embedding function)
+    collection.add(
+        ids=[doc_id],
+        documents=[text],
+        metadatas=[metadata]
+    )
+def search_documents(collection, query: str, top_n: int = 5) -> list[dict]:
+    """
+    Search for semantically similar documents in the collection.
+    Returns top N results with their metadata.
+    """
+    results = collection.query(
+        query_texts=[query],
+        n_results=top_n,
+        include=["metadatas", "documents", "distances"]
+    )
+    hits = []
+    # Extract the results from the Chroma query response
+    ids = results.get("ids", [[]])[0]
+    documents = results.get("documents", [[]])[0]
+    metadatas = results.get("metadatas", [[]])[0]
+    distances = results.get("distances", [[]])[0]
+    for i, doc_id in enumerate(ids):
+        hit = {
+            "id": doc_id,
+            "score": distances[i] if i < len(distances) else None,
+            "source": metadatas[i].get("source") if i < len(metadatas) else None,
+            "tags": metadatas[i].get("tags") if i < len(metadatas) else None,
+            "summary": metadatas[i].get("summary") if i < len(metadatas) else None,
+            "document": documents[i] if i < len(documents) else None
+        }
+        hits.append(hit)
+    return hits

core/parser.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import requests
+from bs4 import BeautifulSoup
+from unstructured.partition.auto import partition
+def parse_document(file_path: str) -> str:
+    """
+    Parse a document file (PDF, DOCX, TXT, etc.) into text using Unstructured.
+    """
+    try:
+        elements = partition(file_path)
+        # Combine text elements into a single string
+        text = "\n".join([elem.text for elem in elements if elem.text])
+        return text
+    except Exception as e:
+        return f"Error parsing document: {e}"
+def parse_url(url: str) -> str:
+    """
+    Fetch and parse webpage content at the given URL.
+    """
+    try:
+        headers = {"User-Agent": "Mozilla/5.0"}
+        response = requests.get(url, headers=headers, timeout=10)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Extract visible text from paragraphs
+        paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
+        text = "\n".join([p.get_text() for p in paragraphs])
+        return text
+    except Exception as e:
+        return f"Error fetching URL: {e}"

core/processing.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# core/processing.py
+import requests
+from unstructured.partition.html import partition_html
+from unstructured.partition.auto import partition
+import config
+def fetch_web_content(url: str) -> str:
+    """
+    Fetch and parse web content from the given URL into structured text.
+    """
+    try:
+        # Use Unstructured to fetch and parse HTML content directly from the URL
+        elements = partition_html(url=url)
+        text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
+        return text
+    except Exception:
+        # If Unstructured parsing fails, attempt a simple HTTP GET as a fallback
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            html_text = response.text
+            # Attempt parsing the fetched HTML text
+            elements = partition(filename=None, file=html_text)
+            text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
+            return text
+        except Exception:
+            # On failure, return empty string
+            return ""
+def parse_local_file(file_path: str) -> str:
+    """
+    Parse a local file into structured text using the Unstructured library.
+    Supports various file formats (e.g., PDF, DOCX, TXT).
+    """
+    try:
+        elements = partition(filename=file_path)
+        text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
+        return text
+    except Exception:
+        # Return empty string on failure
+        return ""

core/storage.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import chromadb
+import os
+from mistralai import Mistral
+import config
+# Initialize ChromaDB client (persistent directory can be set via CHROMA_DB_DIR)
+# NOTE(review): chroma_db_path is read here but never passed to
+# chromadb.Client() below, so the directory setting has no effect in the code
+# shown. Confirm whether a persistent client was intended.
+chroma_db_path = os.getenv("CHROMA_DB_DIR", "db/")
+client = chromadb.Client()
+# Single shared collection used by every function in this module.
+collection = client.get_or_create_collection("documents")
+# Use Mistral API for embeddings
+def get_mistral_embedding(text: str) -> list[float]:
+    """
+    Get embedding for the given text using Mistral API.
+    """
+    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
+        response = client.embeddings.create(
+            model="mistral-embed",
+            input=text
+        )
+        # The API returns a list of embeddings (one per input)
+        return response['data'][0]['embedding']
+def add_document(doc_id: str, text: str, metadata: dict):
+    """
+    Add a document's text and metadata to the ChromaDB collection.
+    """
+    embedding = get_mistral_embedding(text)
+    collection.add(ids=[doc_id], embeddings=[embedding], documents=[text], metadatas=[metadata])
+    # Persist to disk
+    client.persist()
+    return True
+def search_documents(query: str, top_k: int = 5) -> dict:
+    """
+    Search for documents semantically similar to the query.
+    Returns a dictionary of top results.
+    """
+    query_vec = get_mistral_embedding(query)
+    results = collection.query(query_embeddings=[query_vec], n_results=top_k,
+                               include=['ids','distances','documents','metadatas'])
+    return results
+def get_all_documents() -> list:
+    """
+    Retrieve metadata for all documents in the collection.
+    """
+    all_ids = collection.get()['ids']
+    docs = []
+    for doc_id in all_ids:
+        res = collection.get(ids=[doc_id])
+        if res and res['metadatas']:
+            docs.append({"id": doc_id, "metadata": res['metadatas'][0]})
+    return docs

core/summarizer.py ADDED Viewed

	@@ -0,0 +1,25 @@

+def summarize_content(text: str) -> str:
+    """
+    Generate a summary of the text. (This is a stub simulating a Claude 3 Haiku call.)
+    """
+    # In a real app, you might call the Anthropic Claude 3 API here.
+    # We'll return the first 100 characters as a "summary".
+    summary = text.strip().replace("\n", " ")
+    summary = summary[:100] + ("..." if len(summary) > 100 else "")
+    return f"Summary: {summary}"
+def tag_content(text: str) -> list:
+    """
+    Generate tags for the text. (This is a stub simulating a Mistral 7B call.)
+    """
+    # In a real app, you might call a tag-generation model or use embeddings.
+    # We'll simulate by picking some keywords.
+    common_words = ["data", "analysis", "python", "research", "AI"]
+    tags = []
+    lower = text.lower()
+    for word in common_words:
+        if word in lower:
+            tags.append(word)
+    if not tags:
+        tags = ["general"]
+    return tags

core/utils.py ADDED Viewed

	@@ -0,0 +1,23 @@

+# core/utils.py
+import re
+from datetime import datetime
+import hashlib
+def clean_text(text: str) -> str:
+    """
+    Clean and normalize text by removing extra whitespace.
+    """
+    if not text:
+        return ""
+    # Collapse multiple whitespace into single spaces and strip ends
+    cleaned = re.sub(r'\s+', ' ', text)
+    return cleaned.strip()
+def generate_doc_id(source: str) -> str:
+    """
+    Generate a unique document ID based on source identifier and timestamp.
+    """
+    timestamp = datetime.now().isoformat()
+    raw_id = f"{source}-{timestamp}"
+    return hashlib.md5(raw_id.encode()).hexdigest()

data/article_url.txt ADDED Viewed

File without changes

data/document1.pdf ADDED Viewed

File without changes

data/sample_note.txt ADDED Viewed

File without changes

mcp_tools.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# # mcp_tools.py
+# from fastmcp import FastMCP
+# import core.processing as processing
+# import core.ai_enrichment as ai_enrichment
+# import core.database as db
+# import core.utils as utils
+# # Initialize the FastMCP server instance
+# mcp = FastMCP(name="IntelligentContentOrganizer")
+# # Initialize the ChromaDB collection (shared for all tools)
+# collection = db.init_chroma()
+# @mcp.tool()
+# def process_content(url: str) -> dict:
+#     """
+#     Process content from a web URL: fetch, enrich, and store.
+#     Returns document ID, tags, summary, and source.
+#     """
+#     content = processing.fetch_web_content(url)
+#     text = utils.clean_text(content)
+#     tags = ai_enrichment.generate_tags(text) if text else []
+#     summary = ai_enrichment.summarize_text(text) if text else ""
+#     doc_id = utils.generate_doc_id(url)
+#     # Add the document to the database collection
+#     db.add_document(collection, doc_id, text, tags, summary, source=url)
+#     return {"id": doc_id, "tags": tags, "summary": summary, "source": url}
+# @mcp.tool()
+# def upload_local_file(file_path: str) -> dict:
+#     """
+#     Process a local file: parse, enrich, and store.
+#     Returns document ID, tags, summary, and source.
+#     """
+#     content = processing.parse_local_file(file_path)
+#     text = utils.clean_text(content)
+#     tags = ai_enrichment.generate_tags(text) if text else []
+#     summary = ai_enrichment.summarize_text(text) if text else ""
+#     doc_id = utils.generate_doc_id(file_path)
+#     db.add_document(collection, doc_id, text, tags, summary, source=file_path)
+#     return {"id": doc_id, "tags": tags, "summary": summary, "source": file_path}
+# @mcp.tool()
+# def semantic_search(query: str, top_n: int = 5) -> list:
+#     """
+#     Search for documents semantically similar to the query.
+#     Returns top N results as a list of dictionaries.
+#     """
+#     results = db.search_documents(collection, query, top_n)
+#     return results
+from fastmcp import FastMCP
+from core.parser import parse_document, parse_url
+from core.summarizer import summarize_content, tag_content
+from core.storage import add_document, search_documents
+from core.agent import answer_question
+import json
+# FastMCP server instance; the @mcp.tool functions below register on it.
+mcp = FastMCP("IntelligentContentOrganizer_MCP")
+@mcp.tool(name="parse_document")
+def mcp_parse_document(file_path: str) -> str:
+    """
+    MCP tool: Parse a document file and return extracted text.
+    """
+    text = parse_document(file_path)
+    return text
+@mcp.tool(name="parse_url")
+def mcp_parse_url(url: str) -> str:
+    """
+    MCP tool: Fetch and parse webpage content from a URL.
+    """
+    text = parse_url(url)
+    return text
+@mcp.tool(name="summarize")
+def mcp_summarize(text: str) -> str:
+    """
+    MCP tool: Generate a summary of the provided text.
+    """
+    return summarize_content(text)
+@mcp.tool(name="tag")
+def mcp_tag(text: str) -> str:
+    """
+    MCP tool: Generate tags for the provided text (JSON list).
+    """
+    tags = tag_content(text)
+    return json.dumps(tags)
+@mcp.tool(name="add_to_db")
+def mcp_add_to_db(doc_id: str, text: str, metadata_json: str) -> str:
+    """
+    MCP tool: Add a document to ChromaDB with given ID and metadata (JSON).
+    """
+    metadata = json.loads(metadata_json)
+    add_document(doc_id, text, metadata)
+    return "Document added with ID: " + doc_id
+@mcp.tool(name="search_db")
+def mcp_search_db(query: str, top_k: int = 5) -> str:
+    """
+    MCP tool: Search documents using a query (semantic search). Returns JSON results.
+    """
+    results = search_documents(query, top_k=top_k)
+    return json.dumps(results)
+@mcp.tool(name="answer_question")
+def mcp_answer_question(question: str) -> str:
+    """
+    MCP tool: Answer a question using the agentic workflow.
+    """
+    answer = answer_question(question)
+    return answer
+if __name__ == "__main__":
+    # Run the MCP server over streamable HTTP (for web integration),
+    # bound to all interfaces on port 7861 under the /mcp path.
+    mcp.run(transport="streamable-http", host="0.0.0.0", port=7861, path="/mcp")

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+mistralai
+python-dotenv
+gradio>=4.0
+fastmcp>=2.0
+chromadb
+sentence-transformers
+unstructured
+requests
+beautifulsoup4
+plotly
+networkx