euIaxs22 committed on
Commit
88420b3
·
verified ·
1 Parent(s): 20d4e13

Update start.sh

Browse files
Files changed (1) hide show
  1. start.sh +83 -67
start.sh CHANGED
@@ -3,81 +3,96 @@ set -euo pipefail
3
 
4
  : "${APP_DIR:=/app}"
5
 
6
- export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True,max_split_size_mb:128"
7
- export CUDA_MODULE_LOADING="LAZY"
8
-
9
  echo "======================================================="
10
  echo " VINCIE - Start (VINCIE-3B, 8x L40S)"
11
  echo "======================================================="
12
 
13
- # 1) Builder (garante Apex/Flash e deps CUDA)
14
- echo "🛠️ Iniciando o builder.sh para compilar/instalar dependências CUDA..."
15
  if [ -f "/app/builder.sh" ]; then
16
  /bin/bash /app/builder.sh
17
  echo "✅ Builder finalizado."
18
  else
19
- echo "⚠️ Aviso: builder.sh não encontrado. Pulando etapa de compilação de dependências."
20
- fi
21
 
22
  # 2) Env da UI
23
  export GRADIO_SERVER_NAME="0.0.0.0"
24
  export GRADIO_SERVER_PORT="${PORT:-7860}"
25
- export GRADIO_ENABLE_QUEUE="True"
26
 
27
- # 3) Preflight: repo + snapshot + symlinks idempotentes
 
 
 
 
 
 
 
 
 
28
  python3 - <<'PY'
29
- import os, sys, subprocess
30
  from pathlib import Path
31
- from services.vincie import VincieService
32
-
33
- repo_dir = Path("/app/VINCIE")
34
- ckpt_repo = repo_dir / "ckpt" / "VINCIE-3B"
35
- ckpt_app = Path("/app/ckpt") / "VINCIE-3B"
36
-
37
- def ensure_symlink(link: Path, target: Path):
38
- if link.is_symlink():
39
- # relinka se o alvo mudou
40
- if link.resolve() != target:
41
- link.unlink()
42
- link.symlink_to(target, target_is_directory=True)
43
- elif link.exists():
44
- # se for pasta/arquivo, remove para padronizar como symlink
45
- subprocess.run(["rm","-rf",str(link)], check=True)
46
- link.symlink_to(target, target_is_directory=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  else:
48
- link.parent.mkdir(parents=True, exist_ok=True)
49
- link.symlink_to(target, target_is_directory=True)
50
-
51
- # 3.1 baixa repo + snapshot
52
- svc = VincieService()
53
- svc.ensure_repo()
54
- svc.ensure_model()
55
- snapshot = Path(str(svc.ckpt_dir))
56
-
57
- # 3.2 cria symlink dentro do repo e também em /app/ckpt
58
- ensure_symlink(ckpt_repo, snapshot)
59
- ensure_symlink(ckpt_app, snapshot)
60
-
61
- # 3.3 valida itens essenciais
62
- need = [snapshot/"dit.pth", snapshot/"vae.pth", snapshot/"llm14b"]
63
- missing = [str(p) for p in need if not p.exists()]
64
- if missing:
65
- raise SystemExit(f"[preflight] faltam itens no snapshot: {missing}")
66
-
67
- # 3.4 symlink opcional de models (heranças YAML)
68
- models_link = Path("/app/models")
69
- models_src = repo_dir / "models"
70
- if models_src.exists() and (not models_link.exists()):
71
  try:
72
- models_link.symlink_to(models_src, target_is_directory=True)
73
- print(f"[preflight] linked {models_link} -> {models_src}")
74
- except Exception as e:
75
- print("[preflight] warn: link models failed:", e)
76
-
77
- print(f"[preflight] OK: repo={repo_dir}, ckpt(link)={ckpt_repo} -> {snapshot}")
78
- PY
79
-
80
- # 4) PYTHONPATH (imports upstream)
 
 
81
  export VINCIE_DIR="${VINCIE_DIR:-/app/VINCIE}"
82
  : "${PYTHONPATH:=}"
83
  if [ -n "${PYTHONPATH}" ]; then
@@ -85,16 +100,17 @@ if [ -n "${PYTHONPATH}" ]; then
85
  else
86
  export PYTHONPATH="${VINCIE_DIR}"
87
  fi
 
 
 
 
 
88
 
89
- # 5) Diagnóstico mínimo
90
- echo "🔎 ckpt (repo):"
91
- ls -la /app/VINCIE/ckpt || true
92
- ls -la /app/VINCIE/ckpt/VINCIE-3B | head -n 20 || true
93
- echo "🔎 ckpt (/app):"
94
- ls -la /app/ckpt || true
95
- ls -la /app/ckpt/VINCIE-3B | head -n 20 || true
96
 
97
- # 6) Sobe a UI
98
  echo "🚀 Iniciando a interface web VINCIE (app_vince.py)..."
99
  cd "$APP_DIR"
100
- python3 /app/app_vince.py
 
3
 
# Application root; an APP_DIR already present in the environment wins,
# otherwise fall back to /app.
APP_DIR="${APP_DIR:-/app}"

# Startup banner.
printf '%s\n' \
  "=======================================================" \
  " VINCIE - Start (VINCIE-3B, 8x L40S)" \
  "======================================================="
9
 
# 1) Builder (Apex/Flash/CUDA dependencies).
# Runs /app/builder.sh when it exists; a missing builder only produces a
# warning and startup continues.
echo "🛠️ Iniciando o builder.sh..."
if [ ! -f "/app/builder.sh" ]; then
  echo "⚠️ Aviso: builder.sh não encontrado. Pulando compilação."
else
  /bin/bash /app/builder.sh
  echo "✅ Builder finalizado."
fi
18
 
# 2) UI environment: listen on every interface; the port is taken from PORT
# and defaults to 7860 when PORT is unset or empty.
GRADIO_SERVER_NAME="0.0.0.0"
GRADIO_SERVER_PORT="${PORT:-7860}"
GRADIO_ENABLE_QUEUE="True"
export GRADIO_SERVER_NAME GRADIO_SERVER_PORT GRADIO_ENABLE_QUEUE
23
 
# 3) Checkpoint root kept outside the repo checkout (meant for mounted volumes).
# When /data exists it hosts both the checkpoint and the Hugging Face hub
# cache; otherwise everything lives under /app. A pre-set HF_HUB_CACHE is
# preserved, while CKPT_ROOT is always (re)assigned here.
if [ -d "/data" ]; then
  storage_base="/data"
else
  storage_base="/app"
fi
export CKPT_ROOT="${storage_base}/ckpt/VINCIE-3B"
export HF_HUB_CACHE="${HF_HUB_CACHE:-${storage_base}/.cache/huggingface/hub}"
32
+
# 4) Preflight: make sure a COMPLETE model snapshot exists under CKPT_ROOT.
# The embedded Python validates the VINCIE checkout, validates (or downloads)
# the snapshot, and exits non-zero on definitive failure.
# NOTE: the closing PY delimiter must stand alone on its line. Any trailing
# text (even a shell comment) stops bash from recognising it, and the rest of
# the script would then be fed to Python as part of the here-document.
python3 - <<'PY'
import os, sys, shutil
from pathlib import Path
from huggingface_hub import snapshot_download

# Destination of the snapshot; exported by the shell before this runs.
CKPT_ROOT = Path(os.environ["CKPT_ROOT"])
# Hub repository to download; overridable through MODEL_REPO.
MODEL_REPO = os.environ.get("MODEL_REPO", "ByteDance-Seed/VINCIE-3B")
repo_dir = Path("/app/VINCIE")
# Files whose presence is used as the "repo checkout is usable" signal.
gy = repo_dir / "configs" / "generate.yaml"
mp = repo_dir / "main.py"


def download_snapshot():
    """Download MODEL_REPO into a sibling '<name>.tmp' dir, then swap it in.

    Downloading to a temporary directory first means CKPT_ROOT is only
    replaced after snapshot_download returned without raising.
    """
    CKPT_ROOT.parent.mkdir(parents=True, exist_ok=True)
    tmp = CKPT_ROOT.with_name(CKPT_ROOT.name + ".tmp")
    if tmp.exists():
        # Leftover from an earlier attempt; start from a clean directory.
        shutil.rmtree(tmp, ignore_errors=True)
    snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=str(tmp),
        local_dir_use_symlinks=False,
        resume_download=True,
    )
    if CKPT_ROOT.exists():
        shutil.rmtree(CKPT_ROOT, ignore_errors=True)
    tmp.rename(CKPT_ROOT)


def validate():
    """Raise RuntimeError when the snapshot or the repo checkout is incomplete."""
    need = [CKPT_ROOT / "dit.pth", CKPT_ROOT / "vae.pth", CKPT_ROOT / "llm14b"]
    missing = [str(p) for p in need if not p.exists()]
    if missing:
        raise RuntimeError(f"Snapshot incompleto: {missing}")
    if not gy.exists() or not mp.exists():
        raise RuntimeError("VINCIE repo inválido (faltando generate.yaml/main.py)")


# Validate the repo checkout first: it must already be cloned before start.
if not gy.exists() or not mp.exists():
    raise SystemExit("[preflight] Repo ausente/incompleto; clone esperado antes do start.")

try:
    if CKPT_ROOT.exists():
        try:
            validate()
            print(f"[preflight] CKPT_ROOT OK: {CKPT_ROOT}")
        except Exception as e:
            # Existing snapshot failed validation: discard it and retry once.
            print("[preflight] validação falhou; refazendo snapshot:", e)
            shutil.rmtree(CKPT_ROOT, ignore_errors=True)
            download_snapshot()
            validate()
    else:
        download_snapshot()
        validate()
except Exception as e:
    print("[preflight] falha definitiva:", e)
    # Best-effort listing of the checkpoint parent directory to help debugging;
    # errors while listing are deliberately ignored so the exit code stays 1.
    try:
        for p in (CKPT_ROOT.parent, ):
            print(f">>> {p}")
            if p.exists():
                for x in sorted(p.iterdir(), key=lambda z: z.name.lower()):
                    print(" -", x.name + ("/" if x.is_dir() else ""))
    except Exception:
        pass
    sys.exit(1)
PY
94
+
95
+ # 5) PYTHONPATH e includes relativos
96
  export VINCIE_DIR="${VINCIE_DIR:-/app/VINCIE}"
97
  : "${PYTHONPATH:=}"
98
  if [ -n "${PYTHONPATH}" ]; then
 
100
  else
101
  export PYTHONPATH="${VINCIE_DIR}"
102
  fi
# Expose the repo's models/ directory as /app/models unless something already
# occupies that path. Best-effort: a failing ln is ignored on purpose.
if [ -d "/app/VINCIE/models" ] && [ ! -e "/app/models" ]; then
  ln -s /app/VINCIE/models /app/models || true
fi
104
+
# 6) CUDA allocator settings (meant to mitigate fragmentation at cold start).
# Values already present in the environment are kept; otherwise the defaults
# below are assigned. Both variables are exported for child processes.
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True,max_split_size_mb:128}"
: "${CUDA_MODULE_LOADING:=LAZY}"
export PYTORCH_CUDA_ALLOC_CONF CUDA_MODULE_LOADING
108
 
# 7) Lightweight diagnostics: show at most the first 50 lines of the
# CKPT_ROOT listing. A failing listing must never abort startup.
printf '%s\n' "🔎 Conteúdo de CKPT_ROOT:"
{ ls -la "${CKPT_ROOT}" | head -n 50; } || true
 
 
 
 
112
 
# 8) Launch the UI: announce, switch to the application directory, then run
# the app in the foreground.
printf '%s\n' "🚀 Iniciando a interface web VINCIE (app_vince.py)..."
cd "$APP_DIR"
python3 /app/app_vince.py