Spaces:

Keeby-smilyai
/

LLM-kitchen

Running

App Files Files Community

Keeby-smilyai committed on Sep 18

Commit

e0790fc

verified ·

1 Parent(s): 06078e5

Create backend.py

Browse files

Files changed (1) hide show

backend.py +234 -0

backend.py ADDED Viewed

	@@ -0,0 +1,234 @@

+# backend.py
+import sqlite3
+import threading
+import time
+import torch
+from huggingface_hub import whoami
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+import os
+# Path of the SQLite database file that stores users and training runs.
+DB_PATH = "llm_kitchen.db"
+# In-memory FIFO of pending jobs; each entry is a dict holding run_id, user_id
+# and the run's config values (see queue_training_run).
+training_queue = []
+# Guards reads/writes of active_run_id when a job is started or finished.
+active_run_lock = threading.Lock()
+# run_id of the job currently training, or None when no job is active.
+active_run_id = None
+# ------------------------------ DATABASE ------------------------------
+def init_db():
+    if os.path.exists(DB_PATH):
+        return
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.executescript("""
+        CREATE TABLE IF NOT EXISTS users (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            hf_token TEXT UNIQUE NOT NULL,
+            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+        );
+        CREATE TABLE IF NOT EXISTS training_runs (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            user_id INTEGER NOT NULL,
+            arch_type TEXT NOT NULL,
+            num_layers INTEGER NOT NULL,
+            learning_rate REAL NOT NULL,
+            epochs INTEGER NOT NULL,
+            batch_size INTEGER NOT NULL,
+            status TEXT DEFAULT 'queued',
+            logs TEXT DEFAULT '',
+            started_at DATETIME,
+            completed_at DATETIME,
+            FOREIGN KEY (user_id) REFERENCES users(id)
+        );
+    """)
+    conn.commit()
+    conn.close()
+init_db()
+def get_user_by_token(hf_token):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT id FROM users WHERE hf_token = ?", (hf_token,))
+    row = cursor.fetchone()
+    conn.close()
+    return row[0] if row else None
+def create_user(hf_token):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("INSERT INTO users (hf_token) VALUES (?)", (hf_token,))
+    user_id = cursor.lastrowid
+    conn.commit()
+    conn.close()
+    return user_id
+def create_training_run(user_id, config):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        INSERT INTO training_runs
+        (user_id, arch_type, num_layers, learning_rate, epochs, batch_size)
+        VALUES (?, ?, ?, ?, ?, ?)
+    """, (
+        user_id,
+        config['arch_type'],
+        config['num_layers'],
+        config['learning_rate'],
+        config['epochs'],
+        config['batch_size']
+    ))
+    run_id = cursor.lastrowid
+    conn.commit()
+    conn.close()
+    return run_id
+def get_user_runs(user_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT id, arch_type, num_layers, status, started_at
+        FROM training_runs
+        WHERE user_id = ?
+        ORDER BY started_at DESC
+    """, (user_id,))
+    runs = cursor.fetchall()
+    conn.close()
+    return runs
+def get_run_logs(run_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT logs, status FROM training_runs WHERE id = ?", (run_id,))
+    row = cursor.fetchone()
+    conn.close()
+    return row if row else ("", "unknown")
+def update_run_status(run_id, status, logs=""):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    if status == 'running':
+        cursor.execute("UPDATE training_runs SET status = ?, started_at = CURRENT_TIMESTAMP WHERE id = ?", (status, run_id))
+    elif status in ['completed', 'failed', 'timeout']:
+        cursor.execute("UPDATE training_runs SET status = ?, completed_at = CURRENT_TIMESTAMP WHERE id = ?", (status, run_id))
+    if logs:
+        current_logs = get_run_logs(run_id)[0]
+        cursor.execute("UPDATE training_runs SET logs = ? WHERE id = ?", (current_logs + "\n" + logs, run_id))
+    conn.commit()
+    conn.close()
+# ------------------------------ AUTH ------------------------------
+def verify_hf_token(token):
+    try:
+        whoami(token=token)
+        user_id = get_user_by_token(token)
+        if not user_id:
+            user_id = create_user(token)
+            return user_id, "Welcome to the LLM Kitchen, Chef! 🍳 Your apron is ready."
+        else:
+            return user_id, "Welcome back, Chef! 👨‍🍳 Your last dish is still warm."
+    except Exception as e:
+        return None, f"Invalid token. Please try again. ({str(e)})"
+# ------------------------------ TRAINING QUEUE ------------------------------
+def queue_training_run(user_id, config):
+    run_id = create_training_run(user_id, config)
+    training_queue.append({
+        "run_id": run_id,
+        "user_id": user_id,
+        **config
+    })
+    return run_id
+def ram_check_mock():
+    # Mock: Allow 1 run at a time, 1.5GB per run
+    global active_run_id
+    return active_run_id is None
+def start_training_if_free():
+    global active_run_id
+    with active_run_lock:
+        if active_run_id is not None:
+            return False
+        if not training_queue:
+            return False
+        if not ram_check_mock():
+            return False
+        job = training_queue.pop(0)
+        active_run_id = job["run_id"]
+        update_run_status(active_run_id, "running", "🍳 Starting kitchen process...")
+        thread = threading.Thread(target=run_training_job, args=(job,))
+        thread.start()
+        return True
+def run_training_job(job):
+    global active_run_id
+    run_id = job["run_id"]
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        log_update(f"Run {run_id}: Device = {device}", run_id)
+        # Load tiny model for demo (replace with custom later)
+        model_name = "distilgpt2"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
+        log_update(f"Run {run_id}: Model loaded", run_id)
+        # Load dataset
+        dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:1%]")  # Tiny slice for demo
+        def tokenize_function(examples):
+            texts = [q + " " + a for q, a in zip(examples["message"], examples["answer"])]
+            return tokenizer(texts, truncation=True, padding="max_length", max_length=128)
+        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["message", "answer"])
+        log_update(f"Run {run_id}: Dataset tokenized", run_id)
+        # Training args
+        training_args = TrainingArguments(
+            output_dir=f"./runs/{run_id}",
+            num_train_epochs=job["epochs"],
+            per_device_train_batch_size=job["batch_size"],
+            learning_rate=job["learning_rate"],
+            save_strategy="no",
+            logging_steps=1,
+            report_to="none",
+            fp16=False,
+            no_cuda=(device == "cpu")
+        )
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_dataset,
+            data_collator=data_collator,
+        )
+        log_update(f"Run {run_id}: Starting training...", run_id)
+        trainer.train()
+        # Simulate 48h timeout with short sleep for demo
+        time.sleep(10)  # Replace with real training
+        eval_results = trainer.evaluate()
+        log_update(f"Run {run_id}: Training complete. Loss = {eval_results.get('eval_loss', 'N/A')}", run_id)
+        update_run_status(run_id, "completed")
+    except Exception as e:
+        log_update(f"Run {run_id}: FAILED - {str(e)}", run_id)
+        update_run_status(run_id, "failed")
+    finally:
+        with active_run_lock:
+            active_run_id = None
+        # Try starting next queued job
+        start_training_if_free()
+def log_update(message, run_id):
+    print(f"[LOG] {message}")  # Also print to Spaces logs
+    update_run_status(run_id, "running", message)