Keeby-smilyai committed on
Commit 4aa4aa6 · verified · 1 Parent(s): fc6f2c0

Update backend.py

Files changed (1): backend.py (+109 -6)
backend.py CHANGED
@@ -1,16 +1,17 @@
-# backend.py — REAL, FULL, WORKING VERSION
+# backend.py — REAL, FULL, WORKING VERSION — USER WRITES README, NO AUTO-GENERATED
 import sqlite3
 import threading
 import time
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader, Dataset
-from huggingface_hub import whoami
+from huggingface_hub import whoami, HfApi, create_repo
 from datasets import load_dataset
 from transformers import AutoTokenizer
 import psutil
 import os
 import signal
+import shutil
 
 DB_PATH = "llm_kitchen.db"
 training_queue = []
@@ -282,15 +283,22 @@ def run_training_job(job):
         device = "cuda" if torch.cuda.is_available() else "cpu"
         log_update(f"Run {run_id}: 🚀 Device = {device} | RAM available: {psutil.virtual_memory().available / (1024**3):.2f} GB", run_id)
 
+        # Load and save tokenizer
         tokenizer = AutoTokenizer.from_pretrained("gpt2")
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
+        tokenizer_save_path = f"./runs/{run_id}/tokenizer"
+        os.makedirs(tokenizer_save_path, exist_ok=True)
+        tokenizer.save_pretrained(tokenizer_save_path)
+        log_update(f"Run {run_id}: 💾 Tokenizer saved to {tokenizer_save_path}", run_id)
+
         vocab_size = len(tokenizer)
         model = get_model(job["arch_type"], vocab_size, job["num_layers"]).to(device)
         log_update(f"Run {run_id}: 🧱 Model initialized: {job['arch_type']} x{job['num_layers']} layers", run_id)
 
-        dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:100000]")
+        # Load dataset — reduce for testing
+        dataset = load_dataset("voidful/reasoning_gemini_300k", split="train[:5000]")
         def tokenize_function(examples):
             texts = [q + " " + a for q, a in zip(examples["message"], examples["answer"])]
             return tokenizer(texts, truncation=True, padding="max_length", max_length=128)
@@ -318,16 +326,19 @@ def run_training_job(job):
                 total_loss += loss.item()
                 if step % 50 == 0:
                     ram_gb = psutil.virtual_memory().used / (1024**3)
-                    log_update(f"Run {run_id}: Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f} | RAM: {ram_gb:.2f}GB", run_id)
+                    cpu_pct = psutil.cpu_percent(interval=0.1)
+                    log_update(f"Run {run_id}: Epoch {epoch+1} | Step {step} | Loss: {loss.item():.4f} | RAM: {ram_gb:.2f}GB | CPU: {cpu_pct}%", run_id)
 
             avg_loss = total_loss / len(train_loader)
             log_update(f"Run {run_id}: ✅ Epoch {epoch+1} completed | Avg Loss: {avg_loss:.4f}", run_id)
 
+        # Save model
         model_path = f"./runs/{run_id}"
         os.makedirs(model_path, exist_ok=True)
         torch.save(model.state_dict(), f"{model_path}/model.pth")
+        torch.save(model.state_dict(), f"{model_path}/pytorch_model.bin")  # HF standard
         update_run_status(run_id, "completed", f"Model saved to {model_path}")
-        log_update(f"Run {run_id}: 💾 Model checkpoint saved.", run_id)
+        log_update(f"Run {run_id}: 💾 Model checkpoint saved as .pth and .bin", run_id)
 
     except Exception as e:
         log_update(f"Run {run_id}: 💥 FAILED - {str(e)}", run_id)
@@ -345,6 +356,96 @@ def log_update(message, run_id):
     if run_id > 0:
         update_run_status(run_id, "running", full_msg)
 
+# ------------------------------ INFERENCE ------------------------------
+
+def run_inference(run_id, prompt):
+    """Load model + tokenizer and generate answer"""
+    model_path = f"./runs/{run_id}/model.pth"
+    tokenizer_path = f"./runs/{run_id}/tokenizer"
+
+    if not os.path.exists(model_path):
+        return "ModelError: Model not found. Did training complete?"
+
+    if not os.path.exists(tokenizer_path):
+        return "ModelError: Tokenizer not found."
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Fetch arch_type and num_layers from DB
+    conn = sqlite3.connect(DB_PATH)
+    cursor = conn.cursor()
+    cursor.execute("SELECT arch_type, num_layers FROM training_runs WHERE id = ?", (run_id,))
+    row = cursor.fetchone()
+    conn.close()
+
+    if not row:
+        return "ModelError: Run not found."
+
+    arch_type, num_layers = row
+    vocab_size = len(tokenizer)
+    model = get_model(arch_type, vocab_size, num_layers)
+
+    # Load weights
+    model.load_state_dict(torch.load(model_path, map_location="cpu"))
+    model.eval()
+
+    # Encode prompt
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128)
+    input_ids = inputs.input_ids
+
+    # Generate (simple greedy)
+    with torch.no_grad():
+        outputs = model(input_ids)
+        logits = outputs["logits"]
+        predicted_token_ids = torch.argmax(logits, dim=-1)
+        generated_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)
+
+    return f"🧑‍🍳 Model says:\n{generated_text}"
+
+# ------------------------------ PUBLISH TO HUB (USER WRITES README) ------------------------------
+
+def publish_run_to_hub(run_id, hf_token, repo_name, user_description=""):
+    """Save model + tokenizer and push to HF Hub. User provides description for README.md."""
+    local_dir = f"./runs/{run_id}/hub_upload"
+    os.makedirs(local_dir, exist_ok=True)
+
+    # Copy model as pytorch_model.bin
+    model_src = f"./runs/{run_id}/pytorch_model.bin"
+    if not os.path.exists(model_src):
+        raise FileNotFoundError("Model .bin not found for publishing.")
+    shutil.copy(model_src, f"{local_dir}/pytorch_model.bin")
+
+    # Copy tokenizer
+    tokenizer_path = f"./runs/{run_id}/tokenizer"
+    if not os.path.exists(tokenizer_path):
+        raise FileNotFoundError("Tokenizer not found.")
+    shutil.copytree(tokenizer_path, f"{local_dir}/tokenizer", dirs_exist_ok=True)
+
+    # Save user-provided description as README.md (or minimal fallback)
+    readme_content = user_description.strip()
+    if not readme_content:
+        readme_content = f"# Model from LLM Kitchen - Run #{run_id}\n\nTrained using custom architecture."
+
+    with open(f"{local_dir}/README.md", "w") as f:
+        f.write(readme_content)
+
+    # Create repo on user's account
+    api = HfApi()
+    repo_id = repo_name
+    url = create_repo(repo_id, token=hf_token, private=False, exist_ok=True)
+
+    # Push folder
+    api.upload_folder(
+        folder_path=local_dir,
+        repo_id=repo_id,
+        token=hf_token
+    )
+
+    return f"https://huggingface.co/{repo_id}"
+
 # ------------------------------ PUBLIC API ------------------------------
 
 __all__ = [
@@ -355,5 +456,7 @@ __all__ = [
     "start_training_if_free",
     "update_run_status",
     "log_update",
-    "init_db"
+    "init_db",
+    "run_inference",
+    "publish_run_to_hub"
 ]
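For reference, a minimal sketch of how the new run_inference entry point might be called once a run finishes. The run id and prompt are hypothetical, and the import assumes this file is used as a module named backend:

# Hypothetical driver script, not part of this commit.
# Assumes run 1 completed, so ./runs/1/model.pth and ./runs/1/tokenizer/ exist
# alongside the llm_kitchen.db row storing arch_type and num_layers.
from backend import run_inference

answer = run_inference(1, "What is 2 + 2?")
print(answer)  # "🧑‍🍳 Model says: ..." on success, "ModelError: ..." otherwise

Note that run_inference does a single forward pass and takes the argmax at every position, so it returns a per-position next-token readout rather than an autoregressively sampled continuation.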
 
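Likewise, a sketch of the new publishing flow, with placeholder token and repo name (both hypothetical). publish_run_to_hub raises FileNotFoundError if pytorch_model.bin or the tokenizer folder is missing, and writes a minimal fallback README when no description is supplied:

# Hypothetical publish call, not part of this commit; token and repo id are placeholders.
from backend import publish_run_to_hub

url = publish_run_to_hub(
    run_id=1,                                     # a completed run (placeholder)
    hf_token="hf_xxx",                            # user's write token (placeholder)
    repo_name="your-username/llm-kitchen-run-1",  # target repo id (placeholder)
    user_description="# My LLM Kitchen model\n\nTrained on voidful/reasoning_gemini_300k.",
)
print(url)  # e.g. https://huggingface.co/your-username/llm-kitchen-run-1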