Server initialization
Files changed:

- .gitignore +33 -0
- app.py +250 -153
- config.py +122 -5
- core/__init__.py +0 -0
- core/agent.py +0 -17
- core/ai_enrichment.py +0 -41
- core/components.py +0 -23
- core/components.pyi +0 -29
- core/database.py +0 -81
- core/parser.py +0 -30
- core/processing.py +0 -42
- core/summarizer.py +0 -25
- core/utils.py +0 -23
- data/article_url.txt +0 -0
- data/document1.pdf +0 -0
- data/sample_note.txt +0 -0
- mcp_server.py +219 -0
- mcp_tools.py +589 -119
- requirements.txt +20 -9
.gitignore
CHANGED
@@ -0,0 +1,33 @@
+# Ignore environment variables
+.env
+
+# Ignore Python cache files
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+
+# Ignore Jupyter notebook checkpoints
+.ipynb_checkpoints/
+
+# Ignore virtual environment folders
+env/
+venv/
+ENV/
+VENV/
+
+# Ignore VSCode-specific files
+.vscode/
+
+# Ignore OS-specific files
+.DS_Store
+Thumbs.db
+
+# Ignore database or app data
+db/
+*.sqlite3
+
+# Ignore Gradio temp files
+gradio_cached_examples/
+tmp/
+*.log
app.py
CHANGED
@@ -1,157 +1,254 @@

Removed (the old standalone Gradio app):

-import os
-import uuid
 import gradio as gr
-… (old lines 4-17, the remaining imports, were not recovered from the diff view; the process_content signature below is inferred from its click handler)
-def process_content(file_obj, url, tags_input):
-    """
-    Handle file upload or URL input: parse content, summarize, tag, store.
-    """
-    content_text = ""
-    source = ""
-    if file_obj is not None:
-        # Save uploaded file to temp path
-        file_path = file_obj.name
-        content_text = parse_document(file_path)
-        source = file_obj.name
-    elif url:
-        content_text = parse_url(url)
-        source = url
-    else:
-        return "No document provided.", "", "", ""
-
-    # Summarize and tag (simulated)
-    summary = summarize_content(content_text)
-    tags = tag_content(content_text)
-
-    # Allow user to override or confirm tags via input
-    if tags_input:
-        # If user entered new tags, split by comma
-        tags = [t.strip() for t in tags_input.split(",") if t.strip() != ""]
-
-    # Store in ChromaDB with a unique ID
-    doc_id = str(uuid.uuid4())
-    metadata = {"source": source, "tags": tags}
-    add_document(doc_id, content_text, metadata)
-
-    return content_text, summary, ", ".join(tags), f"Document stored with ID: {doc_id}"
-
-def generate_graph():
-    """
-    Create a simple Plotly graph of documents.
-    Nodes = documents, edges = shared tags.
-    """
-    # Fetch all documents from ChromaDB
-    from core.storage import get_all_documents
-    docs = get_all_documents()
-    if not docs:
-        return go.Figure()  # empty
-
-    # Build graph connections: if two docs share a tag, connect them
-    nodes = {doc["id"]: doc for doc in docs}
-    edges = []
-    for i, doc1 in enumerate(docs):
-        for doc2 in docs[i+1:]:
-            shared_tags = set(doc1["metadata"]["tags"]) & set(doc2["metadata"]["tags"])
-            if shared_tags:
-                edges.append((doc1["id"], doc2["id"]))
-
-    # Use networkx to compute layout (or simple fixed positions)
-    import networkx as nx
-    G = nx.Graph()
-    G.add_nodes_from(nodes.keys())
-    G.add_edges_from(edges)
-    pos = nx.spring_layout(G, seed=42)
-
-    # Create Plotly traces
-    edge_x = []
-    edge_y = []
-    for (src, dst) in edges:
-        x0, y0 = pos[src]
-        x1, y1 = pos[dst]
-        edge_x += [x0, x1, None]
-        edge_y += [y0, y1, None]
-    edge_trace = go.Scatter(
-        x=edge_x, y=edge_y,
-        line=dict(width=1, color='#888'),
-        hoverinfo='none',
-        mode='lines')
-
-    node_x = []
-    node_y = []
-    node_text = []
-    for node_id in G.nodes():
-        x, y = pos[node_id]
-        node_x.append(x)
-        node_y.append(y)
-        text = nodes[node_id]["metadata"].get("source", "")
-        node_text.append(f"{text}\nTags: {nodes[node_id]['metadata']['tags']}")
-
-    node_trace = go.Scatter(
-        x=node_x, y=node_y,
-        mode='markers+text',
-        marker=dict(size=10, color='skyblue'),
-        text=node_text, hoverinfo='text', textposition="bottom center")
-
-    fig = go.Figure(data=[edge_trace, node_trace],
-                    layout=go.Layout(title="Document Knowledge Graph",
-                                     showlegend=False,
-                                     margin=dict(l=20, r=20, b=20, t=30)))
-    return fig
-
-def handle_query(question):
-    """
-    Answer a user question by retrieving relevant documents and summarizing them.
-    """
-    if not question:
-        return "Please enter a question."
-
-    answer = answer_question(question)
-    return answer
-
-# Build Gradio interface with Blocks
-with gr.Blocks(title="Intelligent Content Organizer") as demo:
-    gr.Markdown("# Intelligent Content Organizer")
-    with gr.Tab("Upload / Fetch Content"):
-        gr.Markdown("**Add a document:** Upload a file or enter a URL.")
-        with gr.Row():
-            file_in = gr.File(label="Upload Document (PDF, TXT, etc.)")
-            url_in = gr.Textbox(label="Document URL", placeholder="https://example.com/article")
-        tags_in = gr.Textbox(label="Tags (comma-separated)", placeholder="Enter tags or leave blank")
-        process_btn = gr.Button("Parse & Add Document")
-        doc_view = gr.Textbox(label="Document Preview", lines=10, interactive=False)
-        summary_out = gr.Textbox(label="Summary", interactive=False)
-        tags_out = gr.Textbox(label="Detected Tags", interactive=False)
-        status_out = gr.Textbox(label="Status/Info", interactive=False)
-        process_btn.click(fn=process_content, inputs=[file_in, url_in, tags_in],
-                          outputs=[doc_view, summary_out, tags_out, status_out])
-
-    with gr.Tab("Knowledge Graph"):
-        gr.Markdown("**Document relationships:** Shared tags indicate edges.")
-        graph_plot = gr.Plot(label="Knowledge Graph")
-        refresh_btn = gr.Button("Refresh Graph")
-        refresh_btn.click(fn=generate_graph, inputs=None, outputs=graph_plot)
-
-    with gr.Tab("Ask a Question"):
-        gr.Markdown("**AI Q&A:** Ask a question about your documents.")
-        question_in = gr.Textbox(label="Your Question")
-        answer_out = gr.Textbox(label="Answer", interactive=False)
-        ask_btn = gr.Button("Get Answer")
-        ask_btn.click(fn=handle_query, inputs=question_in, outputs=answer_out)
-
 if __name__ == "__main__":
-    # … (old lines 155-157, the original launch call, were not recovered from the diff view)
Added (the new MCP-enabled app):

 import gradio as gr
+import asyncio
+from pathlib import Path
+import tempfile
+import json
+from typing import List, Dict, Any
+import logging
+
+from config import Config
+# Handle imports based on how the app is run
+try:
+    from mcp_server import mcp
+    MCP_AVAILABLE = True
+except ImportError:
+    MCP_AVAILABLE = False
+    print("⚠️ MCP server not available, running in standalone mode")
+
+import mcp_tools
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Validate configuration on startup
+try:
+    Config.validate()
+except ValueError as e:
+    logger.error(f"Configuration error: {e}")
+    print(f"⚠️ Configuration error: {e}")
+    print("Please set the required API keys in your environment variables or .env file")
+
+# Global state for search results
+current_results = []
+
+async def process_file_handler(file):
+    """Handle file upload and processing"""
+    if file is None:
+        return "Please upload a file", "", "", None
+
+    try:
+        # Process the file
+        result = await mcp_tools.process_local_file(file.name)
+
+        if result.get("success"):
+            tags_display = ", ".join(result["tags"])
+            return (
+                f"✅ Successfully processed: {result['file_name']}",
+                result["summary"],
+                tags_display,
+                gr.update(visible=True, value=create_result_card(result))
+            )
+        else:
+            return f"❌ Error: {result.get('error', 'Unknown error')}", "", "", None
+
+    except Exception as e:
+        logger.error(f"Error in file handler: {str(e)}")
+        return f"❌ Error: {str(e)}", "", "", None
+
+async def process_url_handler(url):
+    """Handle URL processing"""
+    if not url:
+        return "Please enter a URL", "", "", None
+
+    try:
+        # Process the URL
+        result = await mcp_tools.process_web_content(url)
+
+        if result.get("success"):
+            tags_display = ", ".join(result["tags"])
+            return (
+                f"✅ Successfully processed: {url}",
+                result["summary"],
+                tags_display,
+                gr.update(visible=True, value=create_result_card(result))
+            )
+        else:
+            return f"❌ Error: {result.get('error', 'Unknown error')}", "", "", None
+
+    except Exception as e:
+        logger.error(f"Error in URL handler: {str(e)}")
+        return f"❌ Error: {str(e)}", "", "", None
+
+async def search_handler(query):
+    """Handle semantic search"""
+    if not query:
+        return "", "Please enter a search query"
+
+    try:
+        # Perform search
+        results = await mcp_tools.search_knowledge_base(query, limit=10)
+
+        if results:
+            # Create display cards for each result
+            result_cards = []
+            for result in results:
+                card = f"""
+### 📄 {result.get('source', 'Unknown Source')}
+**Tags:** {', '.join(result.get('tags', []))}
+
+**Summary:** {result.get('summary', 'No summary available')}
+
+**Relevance:** {result.get('relevance_score', 0):.2%}
+
+---
+"""
+                result_cards.append(card)
+
+            global current_results
+            current_results = results
+
+            # Join the cards into one string: the output is a single gr.Markdown component
+            return "\n".join(result_cards), f"Found {len(results)} results"
+        else:
+            return "", "No results found"
+
+    except Exception as e:
+        logger.error(f"Error in search: {str(e)}")
+        return "", f"Error: {str(e)}"
+
+def create_result_card(result: Dict[str, Any]) -> str:
+    """Create a formatted result card"""
+    return f"""
+### 📋 Processing Complete
+
+**Document ID:** {result.get('doc_id', 'N/A')}
+
+**Source:** {result.get('file_name', result.get('url', 'Unknown'))}
+
+**Tags:** {', '.join(result.get('tags', []))}
+
+**Summary:** {result.get('summary', 'No summary available')}
+
+**Chunks Processed:** {result.get('chunks_processed', 0)}
+"""
+
+# Create Gradio interface
+with gr.Blocks(title="Intelligent Content Organizer - MCP Agent") as demo:
+    gr.Markdown("""
+    # 🧠 Intelligent Content Organizer
+    ### MCP-Powered Knowledge Management System
+
+    This AI-driven system automatically organizes, enriches, and retrieves your digital content.
+    Upload files or provide URLs to build your personal knowledge base with automatic tagging and semantic search.
+
+    ---
+    """)
+
+    with gr.Tabs():
+        # File Processing Tab
+        with gr.TabItem("📁 Process Files"):
+            with gr.Row():
+                with gr.Column():
+                    file_input = gr.File(
+                        label="Upload Document",
+                        file_types=[".pdf", ".txt", ".docx", ".doc", ".html", ".md", ".csv", ".json"]
+                    )
+                    file_process_btn = gr.Button("Process File", variant="primary")
+
+                with gr.Column():
+                    file_status = gr.Textbox(label="Status", lines=1)
+                    file_summary = gr.Textbox(label="Generated Summary", lines=3)
+                    file_tags = gr.Textbox(label="Generated Tags", lines=1)
+
+            file_result = gr.Markdown(visible=False)
+
+        # URL Processing Tab
+        with gr.TabItem("🌐 Process URLs"):
+            with gr.Row():
+                with gr.Column():
+                    url_input = gr.Textbox(
+                        label="Enter URL",
+                        placeholder="https://example.com/article"
+                    )
+                    url_process_btn = gr.Button("Process URL", variant="primary")
+
+                with gr.Column():
+                    url_status = gr.Textbox(label="Status", lines=1)
+                    url_summary = gr.Textbox(label="Generated Summary", lines=3)
+                    url_tags = gr.Textbox(label="Generated Tags", lines=1)
+
+            url_result = gr.Markdown(visible=False)
+
+        # Search Tab
+        with gr.TabItem("🔍 Semantic Search"):
+            search_input = gr.Textbox(
+                label="Search Query",
+                placeholder="Enter your search query...",
+                lines=1
+            )
+            search_btn = gr.Button("Search", variant="primary")
+            search_status = gr.Textbox(label="Status", lines=1)
+
+            search_results = gr.Markdown(label="Search Results")
+
+        # MCP Server Info Tab
+        with gr.TabItem("ℹ️ MCP Server Info"):
+            gr.Markdown("""
+            ### MCP Server Configuration
+
+            This Gradio app also functions as an MCP (Model Context Protocol) server, allowing integration with:
+            - Claude Desktop
+            - Cursor
+            - Other MCP-compatible clients
+
+            **Server Name:** intelligent-content-organizer
+
+            **Available Tools:**
+            - `process_file`: Process local files and extract content
+            - `process_url`: Fetch and process web content
+            - `semantic_search`: Search across stored documents
+            - `get_document_summary`: Get detailed document information
+
+            **To use as MCP server:**
+            1. Add this server to your MCP client configuration
+            2. Use the tools listed above to interact with your knowledge base
+            3. All processed content is automatically indexed for semantic search
+
+            **Tags:** mcp-server-track
+            """)
+
+    # Event handlers
+    file_process_btn.click(
+        fn=lambda x: asyncio.run(process_file_handler(x)),
+        inputs=[file_input],
+        outputs=[file_status, file_summary, file_tags, file_result]
+    )
+
+    url_process_btn.click(
+        fn=lambda x: asyncio.run(process_url_handler(x)),
+        inputs=[url_input],
+        outputs=[url_status, url_summary, url_tags, url_result]
+    )
+
+    search_btn.click(
+        fn=lambda x: asyncio.run(search_handler(x)),
+        inputs=[search_input],
+        outputs=[search_results, search_status]
+    )
+
+# Launch configuration
 if __name__ == "__main__":
+    # Check if running as MCP server
+    import sys
+    if "--mcp" in sys.argv:
+        # Run as MCP server (FastMCP's run() is synchronous, so no asyncio.run wrapper)
+        mcp.run()
+    else:
+        # Run as Gradio app
+        demo.launch(
+            server_name="0.0.0.0",
+            share=False,
+            show_error=True
+        )
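A note on the event wiring above: each `.click` wraps its coroutine in `asyncio.run`, which works because Gradio invokes handlers off the main thread, but Gradio also accepts `async def` callbacks directly and awaits them itself, avoiding a fresh event loop per click. A minimal sketch of that alternative, using the same components and handlers defined above:

    # Gradio awaits async callbacks natively; no per-click event loop needed
    file_process_btn.click(
        fn=process_file_handler,   # the async def above, passed directly
        inputs=[file_input],
        outputs=[file_status, file_summary, file_tags, file_result]
    )
    search_btn.click(
        fn=search_handler,
        inputs=[search_input],
        outputs=[search_results, search_status]
    )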
config.py
CHANGED
@@ -1,7 +1,124 @@
-#
-… (old lines 4-7 were not recovered from the diff view)
+# import os
+# from dotenv import load_dotenv
+
+# # Load environment variables
+# load_dotenv()
+
+# class Config:
+#     """Configuration management for API keys and settings"""
+
+#     # API Keys (from environment variables)
+#     MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "")
+#     BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
+#     UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY", "")
+#     ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
+
+#     # ChromaDB Settings
+#     CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
+#     CHROMA_COLLECTION_NAME = "knowledge_base"
+
+#     # MCP Server Settings
+#     MCP_SERVER_NAME = "intelligent-content-organizer"
+#     MCP_SERVER_VERSION = "1.0.0"
+
+#     # Processing Settings
+#     MAX_FILE_SIZE_MB = 50
+#     SUPPORTED_FILE_TYPES = [
+#         ".pdf", ".txt", ".docx", ".doc", ".html", ".md",
+#         ".csv", ".json", ".xml", ".epub", ".rtf"
+#     ]
+
+#     # Model Settings
+#     MISTRAL_MODEL = "mistral-small-latest"
+#     CLAUDE_MODEL = "claude-3-haiku-20240307"
+#     EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+#     @classmethod
+#     def validate(cls):
+#         """Validate that all required API keys are set"""
+#         missing_keys = []
+#         if not cls.MISTRAL_API_KEY:
+#             missing_keys.append("MISTRAL_API_KEY")
+#         if not cls.BRAVE_API_KEY:
+#             missing_keys.append("BRAVE_API_KEY")
+#         if not cls.UNSTRUCTURED_API_KEY:
+#             missing_keys.append("UNSTRUCTURED_API_KEY")
+
+#         if missing_keys:
+#             raise ValueError(f"Missing required API keys: {', '.join(missing_keys)}")
+
+#         return True
+
+
 import os
 from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+class Config:
+    """Configuration management for API keys and settings"""
+
+    # API Keys - Only 2 needed, both with free tiers!
+    MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY", "")
+    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
+
+    # ChromaDB Settings (completely free local storage)
+    CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
+    CHROMA_COLLECTION_NAME = "knowledge_base"
+
+    # MCP Server Settings
+    MCP_SERVER_NAME = "intelligent-content-organizer"
+    MCP_SERVER_VERSION = "1.0.0"
+    MCP_SERVER_DESCRIPTION = "AI-powered knowledge management with automatic tagging and semantic search"
+
+    # Processing Settings
+    MAX_FILE_SIZE_MB = 50
+    SUPPORTED_FILE_TYPES = [
+        ".pdf", ".txt", ".docx", ".doc", ".html", ".htm",
+        ".md", ".csv", ".json", ".xml", ".rtf"
+    ]
+
+    # Model Settings
+    MISTRAL_MODEL = "mistral-small-latest"  # Free tier available
+    CLAUDE_MODEL = "claude-3-haiku-20240307"  # Free tier available
+    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Completely free
+
+    # Feature Flags - Enable/disable based on API availability
+    USE_MISTRAL_FOR_TAGS = bool(MISTRAL_API_KEY)
+    USE_CLAUDE_FOR_SUMMARY = bool(ANTHROPIC_API_KEY)
+
+    # Free alternatives settings
+    ENABLE_FREE_FALLBACKS = True  # Always use free methods when APIs fail
+
+    @classmethod
+    def validate(cls):
+        """Validate configuration - now more flexible"""
+        warnings = []
+
+        if not cls.MISTRAL_API_KEY:
+            warnings.append("MISTRAL_API_KEY not set - will use free tag generation")
+
+        if not cls.ANTHROPIC_API_KEY:
+            warnings.append("ANTHROPIC_API_KEY not set - will use free summarization")
+
+        if warnings:
+            print("⚠️ Configuration warnings:")
+            for warning in warnings:
+                print(f"  - {warning}")
+            print("\n✅ The app will still work using free alternatives!")
+        else:
+            print("✅ All API keys configured")
+
+        return True
+
+    @classmethod
+    def get_status(cls):
+        """Get configuration status for display"""
+        return {
+            "mistral_configured": bool(cls.MISTRAL_API_KEY),
+            "anthropic_configured": bool(cls.ANTHROPIC_API_KEY),
+            "free_fallbacks_enabled": cls.ENABLE_FREE_FALLBACKS,
+            "supported_formats": cls.SUPPORTED_FILE_TYPES,
+            "embedding_model": cls.EMBEDDING_MODEL
+        }
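Since config.py loads everything through `load_dotenv()`, local setup is just a `.env` file next to it. A minimal sketch (key names taken from the Config class above, values hypothetical), plus how the status helpers are meant to be called:

    # .env (read by load_dotenv; both keys are optional thanks to the free fallbacks)
    #   MISTRAL_API_KEY=...your key...
    #   ANTHROPIC_API_KEY=...your key...
    #   CHROMA_DB_PATH=./chroma_db

    from config import Config

    Config.validate()           # prints warnings rather than raising when keys are absent
    print(Config.get_status())  # e.g. {'mistral_configured': False, 'free_fallbacks_enabled': True, ...}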
core/__init__.py
DELETED
File without changes
core/agent.py
DELETED
@@ -1,17 +0,0 @@
-import json
-from core.storage import search_documents
-# For Q&A we can use a simple retrieval + QA pipeline (stubbed here)
-# In a real app, you might use LangChain or a HuggingFace question-answering model.
-
-def answer_question(question: str) -> str:
-    """
-    Agent: retrieve relevant docs and answer the question.
-    """
-    # Retrieve top documents
-    results = search_documents(question, top_k=3)
-    doc_texts = results.get("documents", [[]])[0]
-    combined = " ".join(doc_texts)
-    # Stub: just echo the question and number of docs
-    if not combined.strip():
-        return "No relevant documents found."
-    return f"Answered question: '{question}' (based on {len(doc_texts)} documents)."
core/ai_enrichment.py
DELETED
@@ -1,41 +0,0 @@
-# core/ai_enrichment.py
-
-from mistralai import Mistral
-import config
-
-def generate_tags(text: str) -> list[str]:
-    """
-    Use Mistral AI to generate 5-7 relevant tags for the text.
-    """
-    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
-        response = client.chat.complete(
-            model="mistral-small-latest",
-            messages=[{
-                "role": "user",
-                "content": f"Generate 5-7 relevant tags (comma-separated) for the following text:\n\n{text}"
-            }]
-        )
-    try:
-        content = response["choices"][0]["message"]["content"]
-    except (KeyError, IndexError):
-        return []
-    tags = [tag.strip() for tag in content.split(",") if tag.strip()]
-    return tags
-
-def summarize_text(text: str) -> str:
-    """
-    Use Mistral AI to generate a concise summary of the text.
-    """
-    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
-        response = client.chat.complete(
-            model="mistral-small-latest",
-            messages=[{
-                "role": "user",
-                "content": f"Summarize the following text in a concise manner:\n\n{text}"
-            }]
-        )
-    try:
-        summary = response["choices"][0]["message"]["content"].strip()
-    except (KeyError, IndexError):
-        return ""
-    return summary
core/components.py
DELETED
@@ -1,23 +0,0 @@
-import gradio as gr
-
-class DocumentViewer(gr.components.Component):
-    """
-    Custom Gradio component for document preview and tag editing.
-    (Stub implementation)
-    """
-    def __init__(self, label=None):
-        super().__init__(label=label, value=None)
-        self.visible = True
-        self.interactive = False
-
-    def preprocess(self, x):
-        # Input is a file path (or object); just return as-is
-        return x
-
-    def postprocess(self, x):
-        # x is the raw document text; display first few lines as preview
-        if not x:
-            return ""
-        lines = x.splitlines()
-        preview = "\n".join(lines[:10])
-        return preview
core/components.pyi
DELETED
@@ -1,29 +0,0 @@
-import gradio as gr
-from gradio.events import Dependency
-
-class DocumentViewer(gr.components.Component):
-    """
-    Custom Gradio component for document preview and tag editing.
-    (Stub implementation)
-    """
-    def __init__(self, label=None):
-        super().__init__(label=label, value=None)
-        self.visible = True
-        self.interactive = False
-
-    def preprocess(self, x):
-        # Input is a file path (or object); just return as-is
-        return x
-
-    def postprocess(self, x):
-        # x is the raw document text; display first few lines as preview
-        if not x:
-            return ""
-        lines = x.splitlines()
-        preview = "\n".join(lines[:10])
-        return preview
-from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
-from gradio.blocks import Block
-if TYPE_CHECKING:
-    from gradio.components import Timer
-    from gradio.components.base import Component
core/database.py
DELETED
@@ -1,81 +0,0 @@
-# core/database.py
-
-import chromadb
-from chromadb.config import Settings
-from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
-import config
-
-def init_chroma():
-    """
-    Initialize a ChromaDB client and collection with an embedding function.
-    Uses OpenAI embeddings if API key is available, otherwise a dummy embedding.
-    """
-    # Initialize Chroma client (in-memory by default)
-    client = chromadb.Client(Settings())
-
-    # Determine embedding function
-    embedding_fn = None
-    try:
-        openai_key = config.OPENAI_API_KEY
-    except AttributeError:
-        openai_key = None
-
-    if openai_key:
-        embedding_fn = OpenAIEmbeddingFunction(
-            api_key=openai_key,
-            model_name="text-embedding-ada-002"
-        )
-    else:
-        # Dummy embedding: one-dimensional embedding based on text length
-        class DummyEmbedding:
-            def __call__(self, texts):
-                return [[float(len(text))] for text in texts]
-        embedding_fn = DummyEmbedding()
-
-    # Create or get collection named "documents"
-    collection = client.get_or_create_collection(
-        name="documents",
-        embedding_function=embedding_fn
-    )
-    return collection
-
-def add_document(collection, doc_id: str, text: str, tags: list[str], summary: str, source: str):
-    """
-    Add a document to the ChromaDB collection with metadata.
-    """
-    metadata = {"tags": tags, "summary": summary, "source": source}
-    # Add document (Chroma will generate embeddings using the collection's embedding function)
-    collection.add(
-        ids=[doc_id],
-        documents=[text],
-        metadatas=[metadata]
-    )
-
-def search_documents(collection, query: str, top_n: int = 5) -> list[dict]:
-    """
-    Search for semantically similar documents in the collection.
-    Returns top N results with their metadata.
-    """
-    results = collection.query(
-        query_texts=[query],
-        n_results=top_n,
-        include=["metadatas", "documents", "distances"]
-    )
-    hits = []
-    # Extract the results from the Chroma query response
-    ids = results.get("ids", [[]])[0]
-    documents = results.get("documents", [[]])[0]
-    metadatas = results.get("metadatas", [[]])[0]
-    distances = results.get("distances", [[]])[0]
-
-    for i, doc_id in enumerate(ids):
-        hit = {
-            "id": doc_id,
-            "score": distances[i] if i < len(distances) else None,
-            "source": metadatas[i].get("source") if i < len(metadatas) else None,
-            "tags": metadatas[i].get("tags") if i < len(metadatas) else None,
-            "summary": metadatas[i].get("summary") if i < len(metadatas) else None,
-            "document": documents[i] if i < len(documents) else None
-        }
-        hits.append(hit)
-    return hits
core/parser.py
DELETED
@@ -1,30 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-from unstructured.partition.auto import partition
-
-def parse_document(file_path: str) -> str:
-    """
-    Parse a document file (PDF, DOCX, TXT, etc.) into text using Unstructured.
-    """
-    try:
-        elements = partition(file_path)
-        # Combine text elements into a single string
-        text = "\n".join([elem.text for elem in elements if elem.text])
-        return text
-    except Exception as e:
-        return f"Error parsing document: {e}"
-
-def parse_url(url: str) -> str:
-    """
-    Fetch and parse webpage content at the given URL.
-    """
-    try:
-        headers = {"User-Agent": "Mozilla/5.0"}
-        response = requests.get(url, headers=headers, timeout=10)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Extract visible text from paragraphs
-        paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
-        text = "\n".join([p.get_text() for p in paragraphs])
-        return text
-    except Exception as e:
-        return f"Error fetching URL: {e}"
core/processing.py
DELETED
@@ -1,42 +0,0 @@
-# core/processing.py
-
-import requests
-from unstructured.partition.html import partition_html
-from unstructured.partition.auto import partition
-import config
-
-def fetch_web_content(url: str) -> str:
-    """
-    Fetch and parse web content from the given URL into structured text.
-    """
-    try:
-        # Use Unstructured to fetch and parse HTML content directly from the URL
-        elements = partition_html(url=url)
-        text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
-        return text
-    except Exception:
-        # If Unstructured parsing fails, attempt a simple HTTP GET as a fallback
-        try:
-            response = requests.get(url)
-            response.raise_for_status()
-            html_text = response.text
-            # Attempt parsing the fetched HTML text
-            elements = partition(filename=None, file=html_text)
-            text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
-            return text
-        except Exception:
-            # On failure, return empty string
-            return ""
-
-def parse_local_file(file_path: str) -> str:
-    """
-    Parse a local file into structured text using the Unstructured library.
-    Supports various file formats (e.g., PDF, DOCX, TXT).
-    """
-    try:
-        elements = partition(filename=file_path)
-        text = "\n\n".join([elem.text for elem in elements if hasattr(elem, 'text') and elem.text])
-        return text
-    except Exception:
-        # Return empty string on failure
-        return ""
core/summarizer.py
DELETED
@@ -1,25 +0,0 @@
-def summarize_content(text: str) -> str:
-    """
-    Generate a summary of the text. (This is a stub simulating a Claude 3 Haiku call.)
-    """
-    # In a real app, you might call the Anthropic Claude 3 API here.
-    # We'll return the first 100 characters as a "summary".
-    summary = text.strip().replace("\n", " ")
-    summary = summary[:100] + ("..." if len(summary) > 100 else "")
-    return f"Summary: {summary}"
-
-def tag_content(text: str) -> list:
-    """
-    Generate tags for the text. (This is a stub simulating a Mistral 7B call.)
-    """
-    # In a real app, you might call a tag-generation model or use embeddings.
-    # We'll simulate by picking some keywords.
-    common_words = ["data", "analysis", "python", "research", "AI"]
-    tags = []
-    lower = text.lower()
-    for word in common_words:
-        if word in lower:
-            tags.append(word)
-    if not tags:
-        tags = ["general"]
-    return tags
core/utils.py
DELETED
@@ -1,23 +0,0 @@
-# core/utils.py
-
-import re
-from datetime import datetime
-import hashlib
-
-def clean_text(text: str) -> str:
-    """
-    Clean and normalize text by removing extra whitespace.
-    """
-    if not text:
-        return ""
-    # Collapse multiple whitespace into single spaces and strip ends
-    cleaned = re.sub(r'\s+', ' ', text)
-    return cleaned.strip()
-
-def generate_doc_id(source: str) -> str:
-    """
-    Generate a unique document ID based on source identifier and timestamp.
-    """
-    timestamp = datetime.now().isoformat()
-    raw_id = f"{source}-{timestamp}"
-    return hashlib.md5(raw_id.encode()).hexdigest()
data/article_url.txt
DELETED
File without changes

data/document1.pdf
DELETED
File without changes

data/sample_note.txt
DELETED
File without changes
mcp_server.py
ADDED
@@ -0,0 +1,219 @@
+# from mcp.server.fastmcp import FastMCP
+# import json
+# from typing import Dict, List, Any
+# import logging
+
+# # Set up logging
+# logging.basicConfig(level=logging.INFO)
+# logger = logging.getLogger(__name__)
+
+# # Initialize MCP server
+# mcp = FastMCP("intelligent-content-organizer")
+
+# @mcp.tool()
+# async def process_file(file_path: str) -> Dict[str, Any]:
+#     """
+#     Process a local file and extract content, generate tags, and create embeddings
+
+#     Args:
+#         file_path: Path to the file to process
+
+#     Returns:
+#         Dictionary containing processed content, tags, and metadata
+#     """
+#     try:
+#         from mcp_tools import process_local_file
+#         result = await process_local_file(file_path)
+#         return result
+#     except Exception as e:
+#         logger.error(f"Error processing file: {str(e)}")
+#         return {"error": str(e)}
+
+# @mcp.tool()
+# async def process_url(url: str) -> Dict[str, Any]:
+#     """
+#     Fetch and process content from a URL
+
+#     Args:
+#         url: URL to fetch and process
+
+#     Returns:
+#         Dictionary containing processed content, tags, and metadata
+#     """
+#     try:
+#         from mcp_tools import process_web_content
+#         result = await process_web_content(url)
+#         return result
+#     except Exception as e:
+#         logger.error(f"Error processing URL: {str(e)}")
+#         return {"error": str(e)}
+
+# @mcp.tool()
+# async def semantic_search(query: str, limit: int = 5) -> List[Dict[str, Any]]:
+#     """
+#     Perform semantic search across stored documents
+
+#     Args:
+#         query: Search query
+#         limit: Maximum number of results to return
+
+#     Returns:
+#         List of relevant documents with metadata
+#     """
+#     try:
+#         from mcp_tools import search_knowledge_base
+#         results = await search_knowledge_base(query, limit)
+#         return results
+#     except Exception as e:
+#         logger.error(f"Error performing search: {str(e)}")
+#         return [{"error": str(e)}]
+
+# @mcp.tool()
+# async def get_document_summary(doc_id: str) -> Dict[str, Any]:
+#     """
+#     Get summary and metadata for a specific document
+
+#     Args:
+#         doc_id: Document ID in the knowledge base
+
+#     Returns:
+#         Document summary and metadata
+#     """
+#     try:
+#         from mcp_tools import get_document_details
+#         result = await get_document_details(doc_id)
+#         return result
+#     except Exception as e:
+#         logger.error(f"Error getting document summary: {str(e)}")
+#         return {"error": str(e)}
+
+# # Server metadata
+# @mcp.resource("server_info")
+# async def get_server_info() -> Dict[str, Any]:
+#     """Get information about this MCP server"""
+#     return {
+#         "name": "Intelligent Content Organizer",
+#         "version": "1.0.0",
+#         "description": "AI-powered knowledge management system with automatic tagging and semantic search",
+#         "capabilities": [
+#             "File processing (20+ formats)",
+#             "Web content extraction",
+#             "Automatic tagging",
+#             "Semantic search",
+#             "Document summarization"
+#         ]
+#     }
+
+# if __name__ == "__main__":
+#     # Run the MCP server
+#     import asyncio
+#     asyncio.run(mcp.run())
+
+from mcp.server.fastmcp import FastMCP
+import json
+from typing import Dict, List, Any
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Initialize MCP server
+mcp = FastMCP("intelligent-content-organizer")
+
+@mcp.tool()
+async def process_file(file_path: str) -> Dict[str, Any]:
+    """
+    Process a local file and extract content, generate tags, and create embeddings
+    """
+    try:
+        from mcp_tools import process_local_file
+        result = await process_local_file(file_path)
+        return result
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        return {"error": str(e)}
+
+@mcp.tool()
+async def process_url(url: str) -> Dict[str, Any]:
+    """
+    Fetch and process content from a URL
+    """
+    try:
+        from mcp_tools import process_web_content
+        result = await process_web_content(url)
+        return result
+    except Exception as e:
+        logger.error(f"Error processing URL: {str(e)}")
+        return {"error": str(e)}
+
+@mcp.tool()
+async def semantic_search(query: str, limit: int = 5) -> List[Dict[str, Any]]:
+    """
+    Perform semantic search across stored documents
+    """
+    try:
+        from mcp_tools import search_knowledge_base
+        results = await search_knowledge_base(query, limit)
+        return results
+    except Exception as e:
+        logger.error(f"Error performing search: {str(e)}")
+        return [{"error": str(e)}]
+
+@mcp.tool()
+async def get_document_summary(doc_id: str) -> Dict[str, Any]:
+    """
+    Get summary and metadata for a specific document
+    """
+    try:
+        from mcp_tools import get_document_details
+        result = await get_document_details(doc_id)
+        return result
+    except Exception as e:
+        logger.error(f"Error getting document summary: {str(e)}")
+        return {"error": str(e)}
+
+@mcp.tool()
+async def get_server_info() -> Dict[str, Any]:
+    """
+    Get information about this MCP server
+    """
+    return {
+        "name": "Intelligent Content Organizer",
+        "version": "1.0.0",
+        "description": "AI-powered knowledge management system with automatic tagging and semantic search",
+        "capabilities": [
+            "File processing (20+ formats)",
+            "Web content extraction",
+            "Automatic tagging",
+            "Semantic search",
+            "Document summarization"
+        ],
+        "tools": [
+            {"name": "process_file", "description": "Process local files and extract content"},
+            {"name": "process_url", "description": "Fetch and process web content"},
+            {"name": "semantic_search", "description": "Search across stored documents"},
+            {"name": "get_document_summary", "description": "Get document details"},
+            {"name": "get_server_info", "description": "Get server information"}
+        ]
+    }
+
+if __name__ == "__main__":
+    # Run the MCP server (FastMCP.run() starts its own event loop)
+    mcp.run()
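To actually attach this server to an MCP client, the client needs a stdio entry pointing at the script; a sketch assuming Claude Desktop's `mcpServers` configuration format (the path is hypothetical), using the `--mcp` flag defined in app.py above:

    {
      "mcpServers": {
        "intelligent-content-organizer": {
          "command": "python",
          "args": ["/path/to/app.py", "--mcp"]
        }
      }
    }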
mcp_tools.py
CHANGED
|
@@ -1,122 +1,592 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
# import core.ai_enrichment as ai_enrichment
|
| 6 |
-
# import core.database as db
|
| 7 |
-
# import core.utils as utils
|
| 8 |
-
|
| 9 |
-
# # Initialize the FastMCP server instance
|
| 10 |
-
# mcp = FastMCP(name="IntelligentContentOrganizer")
|
| 11 |
-
|
| 12 |
-
# # Initialize the ChromaDB collection (shared for all tools)
|
| 13 |
-
# collection = db.init_chroma()
|
| 14 |
-
|
| 15 |
-
# @mcp.tool()
|
| 16 |
-
# def process_content(url: str) -> dict:
|
| 17 |
-
# """
|
| 18 |
-
# Process content from a web URL: fetch, enrich, and store.
|
| 19 |
-
# Returns document ID, tags, summary, and source.
|
| 20 |
-
# """
|
| 21 |
-
# content = processing.fetch_web_content(url)
|
| 22 |
-
# text = utils.clean_text(content)
|
| 23 |
-
# tags = ai_enrichment.generate_tags(text) if text else []
|
| 24 |
-
# summary = ai_enrichment.summarize_text(text) if text else ""
|
| 25 |
-
# doc_id = utils.generate_doc_id(url)
|
| 26 |
-
# # Add the document to the database collection
|
| 27 |
-
# db.add_document(collection, doc_id, text, tags, summary, source=url)
|
| 28 |
-
# return {"id": doc_id, "tags": tags, "summary": summary, "source": url}
|
| 29 |
-
|
| 30 |
-
# @mcp.tool()
|
| 31 |
-
# def upload_local_file(file_path: str) -> dict:
|
| 32 |
-
# """
|
| 33 |
-
# Process a local file: parse, enrich, and store.
|
| 34 |
-
# Returns document ID, tags, summary, and source.
|
| 35 |
-
# """
|
| 36 |
-
# content = processing.parse_local_file(file_path)
|
| 37 |
-
# text = utils.clean_text(content)
|
| 38 |
-
# tags = ai_enrichment.generate_tags(text) if text else []
|
| 39 |
-
# summary = ai_enrichment.summarize_text(text) if text else ""
|
| 40 |
-
# doc_id = utils.generate_doc_id(file_path)
|
| 41 |
-
# db.add_document(collection, doc_id, text, tags, summary, source=file_path)
|
| 42 |
-
# return {"id": doc_id, "tags": tags, "summary": summary, "source": file_path}
|
| 43 |
-
|
| 44 |
-
# @mcp.tool()
|
| 45 |
-
# def semantic_search(query: str, top_n: int = 5) -> list:
|
| 46 |
-
# """
|
| 47 |
-
# Search for documents semantically similar to the query.
|
| 48 |
-
# Returns top N results as a list of dictionaries.
|
| 49 |
-
# """
|
| 50 |
-
# results = db.search_documents(collection, query, top_n)
|
| 51 |
-
# return results
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
from fastmcp import FastMCP
|
| 55 |
-
from core.parser import parse_document, parse_url
|
| 56 |
-
from core.summarizer import summarize_content, tag_content
|
| 57 |
-
from core.storage import add_document, search_documents
|
| 58 |
-
from core.agent import answer_question
|
| 59 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
@mcp.tool(name="parse_url")
|
| 72 |
-
def mcp_parse_url(url: str) -> str:
|
| 73 |
-
"""
|
| 74 |
-
MCP tool: Fetch and parse webpage content from a URL.
|
| 75 |
-
"""
|
| 76 |
-
text = parse_url(url)
|
| 77 |
-
return text
|
| 78 |
-
|
| 79 |
-
@mcp.tool(name="summarize")
|
| 80 |
-
def mcp_summarize(text: str) -> str:
|
| 81 |
-
"""
|
| 82 |
-
MCP tool: Generate a summary of the provided text.
|
| 83 |
-
"""
|
| 84 |
-
return summarize_content(text)
|
| 85 |
-
|
| 86 |
-
@mcp.tool(name="tag")
|
| 87 |
-
def mcp_tag(text: str) -> str:
|
| 88 |
-
"""
|
| 89 |
-
MCP tool: Generate tags for the provided text (JSON list).
|
| 90 |
-
"""
|
| 91 |
-
tags = tag_content(text)
|
| 92 |
-
return json.dumps(tags)
|
| 93 |
-
|
| 94 |
-
@mcp.tool(name="add_to_db")
|
| 95 |
-
def mcp_add_to_db(doc_id: str, text: str, metadata_json: str) -> str:
|
| 96 |
-
"""
|
| 97 |
-
MCP tool: Add a document to ChromaDB with given ID and metadata (JSON).
|
| 98 |
-
"""
|
| 99 |
-
metadata = json.loads(metadata_json)
|
| 100 |
-
add_document(doc_id, text, metadata)
|
| 101 |
-
return "Document added with ID: " + doc_id
|
| 102 |
-
|
| 103 |
-
@mcp.tool(name="search_db")
|
| 104 |
-
def mcp_search_db(query: str, top_k: int = 5) -> str:
|
| 105 |
-
"""
|
| 106 |
-
MCP tool: Search documents using a query (semantic search). Returns JSON results.
|
| 107 |
-
"""
|
| 108 |
-
results = search_documents(query, top_k=top_k)
|
| 109 |
-
return json.dumps(results)
|
| 110 |
-
|
| 111 |
-
@mcp.tool(name="answer_question")
|
| 112 |
-
def mcp_answer_question(question: str) -> str:
|
| 113 |
-
"""
|
| 114 |
-
MCP tool: Answer a question using the agentic workflow.
|
| 115 |
-
"""
|
| 116 |
-
answer = answer_question(question)
|
| 117 |
-
return answer
|
| 118 |
-
|
| 119 |
-
if __name__ == "__main__":
|
| 120 |
-
# Run the MCP server (streamable HTTP for web integration:contentReference[oaicite:6]{index=6})
|
| 121 |
-
mcp.run(transport="streamable-http", host="0.0.0.0", port=7861, path="/mcp")
|
| 122 |
+import asyncio
+import aiohttp
+import chromadb
+from chromadb.utils import embedding_functions
+import json
+import logging
+from typing import Dict, List, Any, Optional
+from datetime import datetime
+import hashlib
+from pathlib import Path
+import requests
+
+# Document processing libraries (all free)
+import PyPDF2
+import docx
+from bs4 import BeautifulSoup
+import pandas as pd
+import markdown
+import xml.etree.ElementTree as ET
+from newspaper import Article
+import trafilatura
+from duckduckgo_search import DDGS
+
+# AI libraries
+from config import Config
+from mistralai.client import MistralClient
+import anthropic
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+# Initialize AI clients (either may be None when its API key is not configured)
+mistral_client = MistralClient(api_key=Config.MISTRAL_API_KEY) if Config.MISTRAL_API_KEY else None
+anthropic_client = anthropic.Anthropic(api_key=Config.ANTHROPIC_API_KEY) if Config.ANTHROPIC_API_KEY else None
+
+# Initialize ChromaDB
+chroma_client = chromadb.PersistentClient(path=Config.CHROMA_DB_PATH)
+embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
+    model_name=Config.EMBEDDING_MODEL
+)
+
+# Get or create collection
+try:
+    collection = chroma_client.get_collection(
+        name=Config.CHROMA_COLLECTION_NAME,
+        embedding_function=embedding_function
+    )
+except Exception:  # collection does not exist yet
+    collection = chroma_client.create_collection(
+        name=Config.CHROMA_COLLECTION_NAME,
+        embedding_function=embedding_function
+    )
+
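Nothing here depends on the try/except shape: chromadb's 0.4.x client (the version pinned in requirements.txt below) also exposes get_or_create_collection, which expresses the same intent in one call. A minimal equivalent sketch:

    # Equivalent one-call form on chromadb 0.4.x
    collection = chroma_client.get_or_create_collection(
        name=Config.CHROMA_COLLECTION_NAME,
        embedding_function=embedding_function
    )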
+class DocumentProcessor:
+    """Free document processing without Unstructured API"""
+
+    @staticmethod
+    def extract_text_from_pdf(file_path: str) -> str:
+        """Extract text from PDF files"""
+        text = ""
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                for page_num in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[page_num]
+                    text += page.extract_text() + "\n"
+        except Exception as e:
+            logger.error(f"Error reading PDF: {e}")
+        return text
+
+    @staticmethod
+    def extract_text_from_docx(file_path: str) -> str:
+        """Extract text from DOCX files"""
+        try:
+            doc = docx.Document(file_path)
+            text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+            return text
+        except Exception as e:
+            logger.error(f"Error reading DOCX: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_html(file_path: str) -> str:
+        """Extract text from HTML files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                soup = BeautifulSoup(file.read(), 'html.parser')
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.extract()
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = '\n'.join(chunk for chunk in chunks if chunk)
+            return text
+        except Exception as e:
+            logger.error(f"Error reading HTML: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_txt(file_path: str) -> str:
+        """Extract text from TXT files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                return file.read()
+        except Exception as e:
+            logger.error(f"Error reading TXT: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_csv(file_path: str) -> str:
+        """Extract text from CSV files"""
+        try:
+            df = pd.read_csv(file_path)
+            return df.to_string()
+        except Exception as e:
+            logger.error(f"Error reading CSV: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_json(file_path: str) -> str:
+        """Extract text from JSON files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+            return json.dumps(data, indent=2)
+        except Exception as e:
+            logger.error(f"Error reading JSON: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_markdown(file_path: str) -> str:
+        """Extract text from Markdown files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                md_text = file.read()
+            html = markdown.markdown(md_text)
+            soup = BeautifulSoup(html, 'html.parser')
+            return soup.get_text()
+        except Exception as e:
+            logger.error(f"Error reading Markdown: {e}")
+            return ""
+
+    @staticmethod
+    def extract_text_from_xml(file_path: str) -> str:
+        """Extract text from XML files"""
+        try:
+            tree = ET.parse(file_path)
+            root = tree.getroot()
+
+            def extract_text(element):
+                text = element.text or ""
+                for child in element:
+                    text += " " + extract_text(child)
+                return text.strip()
+
+            return extract_text(root)
+        except Exception as e:
+            logger.error(f"Error reading XML: {e}")
+            return ""
+
+    @classmethod
+    def extract_text(cls, file_path: str) -> str:
+        """Extract text from any supported file type"""
+        path = Path(file_path)
+        extension = path.suffix.lower()
+
+        extractors = {
+            '.pdf': cls.extract_text_from_pdf,
+            '.docx': cls.extract_text_from_docx,
+            '.doc': cls.extract_text_from_docx,
+            '.html': cls.extract_text_from_html,
+            '.htm': cls.extract_text_from_html,
+            '.txt': cls.extract_text_from_txt,
+            '.csv': cls.extract_text_from_csv,
+            '.json': cls.extract_text_from_json,
+            '.md': cls.extract_text_from_markdown,
+            '.xml': cls.extract_text_from_xml,
+        }
+
+        extractor = extractors.get(extension, cls.extract_text_from_txt)
+        return extractor(file_path)
+
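All of the per-format readers funnel through the extract_text classmethod, which dispatches on file extension and falls back to the plain-text reader for unknown suffixes. A usage sketch (the path is illustrative):

    # Extension picks the extractor; unknown suffixes use the TXT reader
    text = DocumentProcessor.extract_text("data/document1.pdf")
    print(text[:200])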
+def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
+    """Split text into chunks with overlap"""
+    chunks = []
+    start = 0
+    text_length = len(text)
+
+    while start < text_length:
+        end = start + chunk_size
+        chunk = text[start:end]
+
+        # Try to find a sentence boundary
+        if end < text_length:
+            last_period = chunk.rfind('.')
+            last_newline = chunk.rfind('\n')
+            boundary = max(last_period, last_newline)
+
+            if boundary > chunk_size // 2:
+                chunk = text[start:start + boundary + 1]
+                end = start + boundary + 1
+
+        chunks.append(chunk.strip())
+        start = end - overlap
+
+    return chunks
+
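chunk_text cuts each window at the last period or newline when that boundary lies past the window's midpoint, then backs the next start up by overlap characters so neighboring chunks share context. A quick check of the behavior:

    sample = "This is a sentence. " * 200   # roughly 4,000 characters
    pieces = chunk_text(sample, chunk_size=1000, overlap=100)
    print(len(pieces), max(len(p) for p in pieces))   # several chunks, each <= 1000 chars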
+async def fetch_web_content_free(url: str) -> Optional[str]:
+    """Fetch content from URL using multiple free methods"""
+
+    # Method 1: Try newspaper3k (best for articles)
+    try:
+        article = Article(url)
+        article.download()
+        article.parse()
+
+        content = f"{article.title}\n\n{article.text}"
+        if len(content) > 100:  # Valid content
+            return content
+    except Exception as e:
+        logger.debug(f"Newspaper failed: {e}")
+
+    # Method 2: Try trafilatura (great for web scraping)
+    try:
+        downloaded = trafilatura.fetch_url(url)
+        content = trafilatura.extract(downloaded)
+        if content and len(content) > 100:
+            return content
+    except Exception as e:
+        logger.debug(f"Trafilatura failed: {e}")
+
+    # Method 3: Basic BeautifulSoup scraping
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style', 'nav', 'footer', 'header']):
+                element.decompose()
+
+            # Try to find main content
+            main_content = None
+
+            # Common content selectors
+            content_selectors = [
+                'main', 'article', '[role="main"]',
+                '.content', '#content', '.post', '.entry-content',
+                '.article-body', '.story-body'
+            ]
+
+            for selector in content_selectors:
+                main_content = soup.select_one(selector)
+                if main_content:
+                    break
+
+            if not main_content:
+                main_content = soup.find('body')
+
+            if main_content:
+                text = main_content.get_text(separator='\n', strip=True)
+
+                # Get title
+                title = soup.find('title')
+                title_text = title.get_text() if title else "No title"
+
+                return f"{title_text}\n\n{text}"
+
+    except Exception as e:
+        logger.error(f"BeautifulSoup failed: {e}")
+
+    return None
+
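Despite the async signature, all three fallbacks in the code shown here do synchronous I/O (newspaper3k, trafilatura, and requests; the imported aiohttp is not used), so the coroutine will hold up the event loop while it fetches. Calling it standalone:

    content = asyncio.run(fetch_web_content_free("https://example.com"))
    if content:
        print(content[:300])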
+async def search_web_free(query: str, num_results: int = 5) -> List[Dict[str, str]]:
+    """Search the web using free methods (DuckDuckGo)"""
+    try:
+        results = []
+        with DDGS() as ddgs:
+            for r in ddgs.text(query, max_results=num_results):
+                results.append({
+                    'title': r.get('title', ''),
+                    # duckduckgo_search returns the URL under 'href' in recent
+                    # releases; fall back to the older 'link' key just in case
+                    'url': r.get('href', r.get('link', '')),
+                    'snippet': r.get('body', '')
+                })
+
+        return results
+
+    except Exception as e:
+        logger.error(f"Search failed: {e}")
+        return []
+
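Search results come back as plain dicts, which makes the function easy to exercise on its own:

    hits = asyncio.run(search_web_free("chromadb persistent client", num_results=3))
    for hit in hits:
        print(hit["title"], "->", hit["url"])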
+async def generate_tags(content: str) -> List[str]:
+    """Generate tags using Mistral AI or fallback to free method"""
+    try:
+        if mistral_client:  # MistralClient from mistralai.client
+            prompt = f"""Analyze this content and generate 5-7 relevant tags.
+Return only the tags as a comma-separated list.
+
+Content: {content[:2000]}...
+
+Tags:"""
+
+            # For mistralai==0.4.2, pass messages as a list of dicts
+            response = mistral_client.chat(
+                model=Config.MISTRAL_MODEL,
+                messages=[{"role": "user", "content": prompt}]
+            )
+
+            tags_text = response.choices[0].message.content.strip()
+            tags = [tag.strip() for tag in tags_text.split(",")]
+            return tags[:7]
+        else:
+            # Free fallback: extract keywords using frequency analysis
+            return generate_tags_free(content)
+
+    except Exception as e:
+        logger.error(f"Error generating tags: {str(e)}")
+        return generate_tags_free(content)
+
+def generate_tags_free(content: str) -> List[str]:
+    """Free tag generation using keyword extraction"""
+    from collections import Counter
+    import re
+
+    # Simple keyword extraction
+    words = re.findall(r'\b[a-z]{4,}\b', content.lower())
+
+    # Common stop words
+    stop_words = {
+        'this', 'that', 'these', 'those', 'what', 'which', 'when', 'where',
+        'who', 'whom', 'whose', 'why', 'how', 'with', 'about', 'against',
+        'between', 'into', 'through', 'during', 'before', 'after', 'above',
+        'below', 'from', 'down', 'out', 'off', 'over', 'under', 'again',
+        'further', 'then', 'once', 'here', 'there', 'all', 'both', 'each',
+        'few', 'more', 'most', 'other', 'some', 'such', 'only', 'same',
+        'than', 'have', 'has', 'had', 'been', 'being', 'does', 'doing',
+        'will', 'would', 'could', 'should'
+    }
+
+    # Filter and count words
+    filtered_words = [w for w in words if w not in stop_words and len(w) > 4]
+    word_counts = Counter(filtered_words)
+
+    # Get top keywords
+    top_keywords = [word for word, _ in word_counts.most_common(7)]
+
+    return top_keywords if top_keywords else ["untagged"]
+
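The fallback tagger is a bare frequency count: lowercase words of five or more letters (the regex admits four, but the len(w) > 4 filter tightens it) minus stop words, ranked by occurrence. For example:

    text = ("ChromaDB stores document embeddings. "
            "Embeddings make document search semantic.")
    print(generate_tags_free(text))
    # likely: ['document', 'embeddings', 'chromadb', 'stores', 'search', 'semantic']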
+async def generate_summary(content: str) -> str:
+    """Generate summary using Claude or fallback to free method"""
+    try:
+        if anthropic_client:
+            message = anthropic_client.messages.create(
+                model=Config.CLAUDE_MODEL,
+                max_tokens=300,
+                messages=[{
+                    "role": "user",
+                    "content": f"Summarize this content in 2-3 sentences:\n\n{content[:4000]}..."
+                }]
+            )
+
+            return message.content[0].text.strip()
+        else:
+            # Free fallback
+            return generate_summary_free(content)
+
+    except Exception as e:
+        logger.error(f"Error generating summary: {str(e)}")
+        return generate_summary_free(content)
+
+def generate_summary_free(content: str) -> str:
+    """Free summary generation using simple extraction"""
+    sentences = content.split('.')
+    # Take first 3 sentences
+    summary_sentences = sentences[:3]
+    summary = '. '.join(s.strip() for s in summary_sentences if s.strip())
+
+    if len(summary) > 300:
+        summary = summary[:297] + "..."
+
+    return summary if summary else "Content preview: " + content[:200] + "..."
+
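The free summarizer is purely extractive, returning the first three period-separated fragments capped at 300 characters:

    print(generate_summary_free("One. Two. Three. Four."))   # -> "One. Two. Three"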
+async def process_local_file(file_path: str) -> Dict[str, Any]:
+    """Process a local file and store it in the knowledge base"""
+    try:
+        # Validate file
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        if path.suffix.lower() not in Config.SUPPORTED_FILE_TYPES:
+            raise ValueError(f"Unsupported file type: {path.suffix}")
+
+        # Extract text using free methods
+        full_text = DocumentProcessor.extract_text(file_path)
+
+        if not full_text:
+            raise ValueError("No text could be extracted from the file")
+
+        # Generate document ID
+        doc_id = hashlib.md5(f"{path.name}_{datetime.now().isoformat()}".encode()).hexdigest()
+
+        # Generate tags
+        tags = await generate_tags(full_text[:3000])
+
+        # Generate summary
+        summary = await generate_summary(full_text[:5000])
+
+        # Chunk the text
+        chunks = chunk_text(full_text, chunk_size=1000, overlap=100)
+        chunks = chunks[:10]  # Limit chunks for demo
+
+        # Store in ChromaDB
+        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
+
+        metadata = {
+            "source": str(path),
+            "file_name": path.name,
+            "file_type": path.suffix,
+            "processed_at": datetime.now().isoformat(),
+            "tags": ", ".join(tags),
+            "summary": summary,
+            "doc_id": doc_id
+        }
+
+        collection.add(
+            documents=chunks,
+            ids=chunk_ids,
+            metadatas=[metadata for _ in chunks]
+        )
+
+        return {
+            "success": True,
+            "doc_id": doc_id,
+            "file_name": path.name,
+            "tags": tags,
+            "summary": summary,
+            "chunks_processed": len(chunks),
+            "metadata": metadata
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e)
+        }
+
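Ingestion end to end is a single awaited call; a sketch with an illustrative path:

    result = asyncio.run(process_local_file("data/sample_note.txt"))
    if result["success"]:
        print(result["doc_id"], result["tags"], result["chunks_processed"])
    else:
        print("failed:", result["error"])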
+async def process_web_content(url_or_query: str) -> Dict[str, Any]:
+    """Process web content from URL or search query"""
+    try:
+        # Check if it's a URL or search query
+        is_url = url_or_query.startswith(('http://', 'https://'))
+
+        if is_url:
+            content = await fetch_web_content_free(url_or_query)
+            source = url_or_query
+        else:
+            # It's a search query
+            search_results = await search_web_free(url_or_query, num_results=3)
+            if not search_results:
+                raise ValueError("No search results found")
+
+            # Process the first result
+            first_result = search_results[0]
+            content = await fetch_web_content_free(first_result['url'])
+            source = first_result['url']
+
+            # Add search context
+            content = f"Search Query: {url_or_query}\n\n{first_result['title']}\n\n{content}"
+
+        if not content:
+            raise ValueError("Failed to fetch content")
+
+        # Generate document ID
+        doc_id = hashlib.md5(f"{source}_{datetime.now().isoformat()}".encode()).hexdigest()
+
+        # Generate tags
+        tags = await generate_tags(content[:3000])
+
+        # Generate summary
+        summary = await generate_summary(content[:5000])
+
+        # Chunk the content
+        chunks = chunk_text(content, chunk_size=1000, overlap=100)
+        chunks = chunks[:10]  # Limit for demo
+
+        # Store in ChromaDB
+        chunk_ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
+
+        metadata = {
+            "source": source,
+            "url": source if is_url else f"Search: {url_or_query}",
+            "content_type": "web",
+            "processed_at": datetime.now().isoformat(),
+            "tags": ", ".join(tags),
+            "summary": summary,
+            "doc_id": doc_id
+        }
+
+        collection.add(
+            documents=chunks,
+            ids=chunk_ids,
+            metadatas=[metadata for _ in chunks]
+        )
+
+        return {
+            "success": True,
+            "doc_id": doc_id,
+            "url": source,
+            "tags": tags,
+            "summary": summary,
+            "chunks_processed": len(chunks),
+            "metadata": metadata,
+            "search_query": url_or_query if not is_url else None
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing web content: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e)
+        }
+
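The same entry point covers both input styles: anything that does not start with http:// or https:// is treated as a search query, and the top DuckDuckGo hit is fetched and ingested instead:

    asyncio.run(process_web_content("https://example.com/post"))    # direct URL
    asyncio.run(process_web_content("vector database tutorials"))   # search query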
+async def search_knowledge_base(query: str, limit: int = 5) -> List[Dict[str, Any]]:
+    """Perform semantic search in the knowledge base"""
+    try:
+        results = collection.query(
+            query_texts=[query],
+            n_results=limit
+        )
+
+        if not results["ids"][0]:
+            return []
+
+        # Format results
+        formatted_results = []
+        seen_docs = set()
+
+        for i, doc_id in enumerate(results["ids"][0]):
+            metadata = results["metadatas"][0][i]
+
+            # Deduplicate by document
+            if metadata["doc_id"] not in seen_docs:
+                seen_docs.add(metadata["doc_id"])
+                formatted_results.append({
+                    "doc_id": metadata["doc_id"],
+                    "source": metadata.get("source", "Unknown"),
+                    "tags": metadata.get("tags", "").split(", "),
+                    "summary": metadata.get("summary", ""),
+                    "relevance_score": 1 - results["distances"][0][i],
+                    "processed_at": metadata.get("processed_at", "")
+                })
+
+        return formatted_results
+
+    except Exception as e:
+        logger.error(f"Error searching knowledge base: {str(e)}")
+        return []
+
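Query results are deduplicated per source document, and relevance is reported as one minus the embedding distance returned by ChromaDB:

    for doc in asyncio.run(search_knowledge_base("meeting notes", limit=5)):
        print(f"{doc['relevance_score']:.2f}", doc["source"], doc["tags"])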
+async def get_document_details(doc_id: str) -> Dict[str, Any]:
+    """Get detailed information about a document"""
+    try:
+        results = collection.get(
+            where={"doc_id": doc_id},
+            limit=1
+        )
+
+        if not results["ids"]:
+            return {"error": "Document not found"}
+
+        metadata = results["metadatas"][0]
+        return {
+            "doc_id": doc_id,
+            "source": metadata.get("source", "Unknown"),
+            "tags": metadata.get("tags", "").split(", "),
+            "summary": metadata.get("summary", ""),
+            "processed_at": metadata.get("processed_at", ""),
+            "file_type": metadata.get("file_type", ""),
+            "content_preview": results["documents"][0][:500] + "..."
+        }
+
+    except Exception as e:
+        logger.error(f"Error getting document details: {str(e)}")
+        return {"error": str(e)}
requirements.txt
CHANGED
@@ -1,12 +1,23 @@
+# Requirements for the project
 
+gradio==4.44.1
+mcp==1.0.0
+fastmcp==0.1.0
+chromadb==0.4.24
+mistralai==0.4.2
+anthropic
+aiohttp
 python-dotenv
+sentence-transformers==2.7.0
+plotly==5.22.0
+pandas==2.2.2
+numpy==1.26.4
+PyPDF2
+python-docx
 beautifulsoup4
+markdown
+ebooklib
+newspaper3k
+trafilatura
+duckduckgo-search
+requests
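The pins line up with the code above: mistralai==0.4.2 matches the MistralClient.chat(messages=[...]) call style used in mcp_tools.py (the later 1.x SDK renamed the client), and chromadb==0.4.24 matches the PersistentClient and get_collection/create_collection API. Everything installs with the usual pip install -r requirements.txt.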