Spaces:

Keeby-smilyai
/

LLM-kitchen

Running

App Files Files Community

Keeby-smilyai committed on Sep 18

Commit

ba941c2

verified ·

1 Parent(s): 547c56f

Update backend.py

Browse files

Files changed (1) hide show

backend.py +90 -21

backend.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# backend.py — REAL VERSION
 import sqlite3
 import threading
 import time
@@ -53,17 +53,96 @@ def init_db():
 init_db()
-# ... [KEEP ALL DB HELPER FUNCTIONS: get_user_by_token, create_user, etc. — NO CHANGES] ...
 # ------------------------------ AUTH ------------------------------
-# ... [KEEP verify_hf_token — NO CHANGES] ...
 # ------------------------------ TRAINING QUEUE ------------------------------
 def ram_available():
-    """Check if we can start a new run (1.5GB per run)"""
-    total_ram = psutil.virtual_memory().total / (1024**3)  # GB
-    used_ram = psutil.virtual_memory().used / (1024**3)    # GB
     available_gb = total_ram - used_ram
     return available_gb >= MAX_RAM_PER_RUN_GB
@@ -94,7 +173,6 @@ def start_training_if_free():
         thread = threading.Thread(target=run_training_job, args=(job,))
         thread.start()
-        # Start 48h timeout killer
         timer = threading.Timer(RUN_TIMEOUT, kill_run_timeout, args=[job["run_id"]])
         timer.start()
@@ -107,7 +185,7 @@ def kill_run_timeout(run_id):
             log_update(f"Run {run_id}: 💥 48-HOUR TIMEOUT REACHED. Terminating.", run_id)
             update_run_status(run_id, "timeout")
             active_run_id = None
-            start_training_if_free()  # try next
 # ------------------------------ CUSTOM MODELS FROM SCRATCH ------------------------------
@@ -125,8 +203,8 @@ class CNNLanguageModel(nn.Module):
         self.fc = nn.Linear(in_ch, vocab_size)
     def forward(self, x, labels=None):
-        x = self.embedding(x).transpose(1, 2)  # (B, E, L)
-        x = self.convs(x).transpose(1, 2)     # (B, L, E*2^N)
         logits = self.fc(x)
         loss = None
         if labels is not None:
@@ -184,7 +262,7 @@ def get_model(arch_type, vocab_size, num_layers):
 class TextDataset(Dataset):
     def __init__(self, tokenized_data):
         self.input_ids = tokenized_data["input_ids"]
-        self.labels = tokenized_data["input_ids"]  # causal LM
     def __len__(self):
         return len(self.input_ids)
@@ -204,18 +282,14 @@ def run_training_job(job):
         device = "cuda" if torch.cuda.is_available() else "cpu"
         log_update(f"Run {run_id}: 🚀 Device = {device} | RAM available: {psutil.virtual_memory().available / (1024**3):.2f} GB", run_id)
-        # Load tokenizer (shared for all models)
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         vocab_size = len(tokenizer)
-        # Build model from scratch
         model = get_model(job["arch_type"], vocab_size, job["num_layers"]).to(device)
         log_update(f"Run {run_id}: 🧱 Model initialized: {job['arch_type']} x{job['num_layers']} layers", run_id)
-        # Load dataset — full training set (or 100K for speed)
         dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:100000]")
         def tokenize_function(examples):
             texts = [q + " " + a for q, a in zip(examples["message"], examples["answer"])]
@@ -224,11 +298,8 @@ def run_training_job(job):
         train_dataset = TextDataset(tokenized_dataset)
         train_loader = DataLoader(train_dataset, batch_size=job["batch_size"], shuffle=True)
-        # Optimizer
         optimizer = torch.optim.AdamW(model.parameters(), lr=job["learning_rate"])
-        # Training loop
         model.train()
         log_update(f"Run {run_id}: ▶️ Starting training for {job['epochs']} epochs...", run_id)
@@ -252,7 +323,6 @@ def run_training_job(job):
             avg_loss = total_loss / len(train_loader)
             log_update(f"Run {run_id}: ✅ Epoch {epoch+1} completed | Avg Loss: {avg_loss:.4f}", run_id)
-        # Save model
         model_path = f"./runs/{run_id}"
         os.makedirs(model_path, exist_ok=True)
         torch.save(model.state_dict(), f"{model_path}/model.pth")
@@ -271,11 +341,10 @@ def run_training_job(job):
 def log_update(message, run_id):
     timestamp = time.strftime("%H:%M:%S")
     full_msg = f"[{timestamp}] {message}"
-    print(full_msg)  # Also shows in HF Spaces logs
     if run_id > 0:
         update_run_status(run_id, "running", full_msg)
 # ------------------------------ PUBLIC API ------------------------------
 __all__ = [

+# backend.py — REAL, FULL, WORKING VERSION
 import sqlite3
 import threading
 import time
 init_db()
+def get_user_by_token(hf_token):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT id FROM users WHERE hf_token = ?", (hf_token,))
+    row = cursor.fetchone()
+    conn.close()
+    return row[0] if row else None
+def create_user(hf_token):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("INSERT INTO users (hf_token) VALUES (?)", (hf_token,))
+    user_id = cursor.lastrowid
+    conn.commit()
+    conn.close()
+    return user_id
+def create_training_run(user_id, config):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        INSERT INTO training_runs
+        (user_id, arch_type, num_layers, learning_rate, epochs, batch_size)
+        VALUES (?, ?, ?, ?, ?, ?)
+    """, (
+        user_id,
+        config['arch_type'],
+        config['num_layers'],
+        config['learning_rate'],
+        config['epochs'],
+        config['batch_size']
+    ))
+    run_id = cursor.lastrowid
+    conn.commit()
+    conn.close()
+    return run_id
+def get_user_runs(user_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("""
+        SELECT id, arch_type, num_layers, status, started_at
+        FROM training_runs
+        WHERE user_id = ?
+        ORDER BY started_at DESC
+    """, (user_id,))
+    runs = cursor.fetchall()
+    conn.close()
+    return runs
+def get_run_logs(run_id):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT logs, status FROM training_runs WHERE id = ?", (run_id,))
+    row = cursor.fetchone()
+    conn.close()
+    return row if row else ("", "unknown")
+def update_run_status(run_id, status, logs=""):
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    if status == 'running':
+        cursor.execute("UPDATE training_runs SET status = ?, started_at = CURRENT_TIMESTAMP WHERE id = ?", (status, run_id))
+    elif status in ['completed', 'failed', 'timeout']:
+        cursor.execute("UPDATE training_runs SET status = ?, completed_at = CURRENT_TIMESTAMP WHERE id = ?", (status, run_id))
+    if logs:
+        current_logs = get_run_logs(run_id)[0]
+        cursor.execute("UPDATE training_runs SET logs = ? WHERE id = ?", (current_logs + "\n" + logs, run_id))
+    conn.commit()
+    conn.close()
 # ------------------------------ AUTH ------------------------------
+def verify_hf_token(token):
+    try:
+        whoami(token=token)
+        user_id = get_user_by_token(token)
+        if not user_id:
+            user_id = create_user(token)
+            return user_id, "Welcome to the LLM Kitchen, Chef! 🍳 Your apron is ready."
+        else:
+            return user_id, "Welcome back, Chef! 👨‍🍳 Your last dish is still warm."
+    except Exception as e:
+        return None, f"Invalid token. Please try again. ({str(e)})"
 # ------------------------------ TRAINING QUEUE ------------------------------
 def ram_available():
+    total_ram = psutil.virtual_memory().total / (1024**3)
+    used_ram = psutil.virtual_memory().used / (1024**3)
     available_gb = total_ram - used_ram
     return available_gb >= MAX_RAM_PER_RUN_GB
         thread = threading.Thread(target=run_training_job, args=(job,))
         thread.start()
         timer = threading.Timer(RUN_TIMEOUT, kill_run_timeout, args=[job["run_id"]])
         timer.start()
             log_update(f"Run {run_id}: 💥 48-HOUR TIMEOUT REACHED. Terminating.", run_id)
             update_run_status(run_id, "timeout")
             active_run_id = None
+            start_training_if_free()
 # ------------------------------ CUSTOM MODELS FROM SCRATCH ------------------------------
         self.fc = nn.Linear(in_ch, vocab_size)
     def forward(self, x, labels=None):
+        x = self.embedding(x).transpose(1, 2)
+        x = self.convs(x).transpose(1, 2)
         logits = self.fc(x)
         loss = None
         if labels is not None:
 class TextDataset(Dataset):
     def __init__(self, tokenized_data):
         self.input_ids = tokenized_data["input_ids"]
+        self.labels = tokenized_data["input_ids"]
     def __len__(self):
         return len(self.input_ids)
         device = "cuda" if torch.cuda.is_available() else "cpu"
         log_update(f"Run {run_id}: 🚀 Device = {device} | RAM available: {psutil.virtual_memory().available / (1024**3):.2f} GB", run_id)
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         vocab_size = len(tokenizer)
         model = get_model(job["arch_type"], vocab_size, job["num_layers"]).to(device)
         log_update(f"Run {run_id}: 🧱 Model initialized: {job['arch_type']} x{job['num_layers']} layers", run_id)
         dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:100000]")
         def tokenize_function(examples):
             texts = [q + " " + a for q, a in zip(examples["message"], examples["answer"])]
         train_dataset = TextDataset(tokenized_dataset)
         train_loader = DataLoader(train_dataset, batch_size=job["batch_size"], shuffle=True)
         optimizer = torch.optim.AdamW(model.parameters(), lr=job["learning_rate"])
         model.train()
         log_update(f"Run {run_id}: ▶️ Starting training for {job['epochs']} epochs...", run_id)
             avg_loss = total_loss / len(train_loader)
             log_update(f"Run {run_id}: ✅ Epoch {epoch+1} completed | Avg Loss: {avg_loss:.4f}", run_id)
         model_path = f"./runs/{run_id}"
         os.makedirs(model_path, exist_ok=True)
         torch.save(model.state_dict(), f"{model_path}/model.pth")
 def log_update(message, run_id):
     timestamp = time.strftime("%H:%M:%S")
     full_msg = f"[{timestamp}] {message}"
+    print(full_msg)
     if run_id > 0:
         update_run_status(run_id, "running", full_msg)
 # ------------------------------ PUBLIC API ------------------------------
 __all__ = [