Spaces:
Paused
Paused
File size: 3,827 Bytes
c00ea12 93626d8 c00ea12 93626d8 c00ea12 6188fa0 16ced43 c00ea12 6188fa0 16ced43 47d5744 c00ea12 47d5744 c00ea12 47d5744 c00ea12 fbe9d9f 4c802e6 ac90475 16ced43 22f9c33 9b230d1 e997f7e 87eb4d4 fbe9d9f 87eb4d4 e997f7e 687a9ef 9c00877 687a9ef 9c00877 687a9ef 9c00877 687a9ef 9c00877 687a9ef 9c00877 687a9ef 9c00877 687a9ef 9c00877 b5c6744 bd40b35 1ebc5ad 22f9c33 8c19f21 7f655ac 22f9c33 1ebc5ad 16ced43 9b230d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/usr/bin/env bash
# Strict mode: stop on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Startup banner: three lines, one printf argument per line.
printf '%s\n' \
  "=======================================================" \
  " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)" \
  "======================================================="
# ---------------------- Base environment ----------------------
# Overridable settings use ${VAR:=default}: a non-empty value inherited from
# the caller's environment is kept, otherwise the default is assigned. Every
# variable is exported so child processes see it.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCH_DTYPE:=bfloat16}"
export CUDA_VISIBLE_DEVICES TORCH_DTYPE

# SDPA / FlashAttention / xFormers toggles (all overridable).
: "${ENABLE_FLASH_SDP:=1}"
: "${ENABLE_MEMORY_EFFICIENT_SDP:=1}"
: "${ENABLE_MATH_SDP:=0}"
: "${FLASH_ATTENTION_DISABLE:=0}"
: "${XFORMERS_FORCE_DISABLE:=1}"
export ENABLE_FLASH_SDP ENABLE_MEMORY_EFFICIENT_SDP ENABLE_MATH_SDP
export FLASH_ATTENTION_DISABLE XFORMERS_FORCE_DISABLE

# CUDA / allocator baseline. Values assigned with a plain `=` are fixed and
# replace whatever the caller had set.
CUDA_MODULE_LOADING="LAZY"
: "${CUDA_DEVICE_MAX_CONNECTIONS:=32}"
CUDA_DEVICE_ORDER="PCI_BUS_ID"
PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
export CUDA_MODULE_LOADING CUDA_DEVICE_MAX_CONNECTIONS CUDA_DEVICE_ORDER
export PYTORCH_CUDA_ALLOC_CONF

# CPU thread pools (overridable).
: "${OMP_NUM_THREADS:=8}"
: "${MKL_NUM_THREADS:=8}"
export OMP_NUM_THREADS MKL_NUM_THREADS

# NCCL baseline (fixed values).
NCCL_DEBUG="INFO"
NCCL_ASYNC_ERROR_HANDLING=1
NCCL_P2P_DISABLE=0
NCCL_IB_DISABLE=1
NCCL_SOCKET_IFNAME="lo"
NCCL_BLOCKING_WAIT=1
TORCH_NCCL_BLOCKING_WAIT=1
export NCCL_DEBUG NCCL_ASYNC_ERROR_HANDLING NCCL_P2P_DISABLE NCCL_IB_DISABLE
export NCCL_SOCKET_IFNAME NCCL_BLOCKING_WAIT TORCH_NCCL_BLOCKING_WAIT
#export NCCL_TIMEOUT=600
# ---------------------- HF/torch persistence ----------------------
# Choose the cache root: /data when that directory exists, otherwise the
# container-local /app tree. HF_HOME and TORCH_HOME are always overwritten.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
# The hub cache may be overridden by the caller; default is $HF_HOME/hub.
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

# Make the cache reachable at /app/.cache/huggingface.
hf_link="/app/.cache/huggingface"

# A plain `ln -sf` onto an existing directory (or a link to one) creates the
# new link *inside* it, i.e. "$HF_HOME/huggingface" pointing back at $HF_HOME.
# Remove such a leftover, but only when it really is that self-reference.
stray_link="$HF_HOME/huggingface"
if [[ -L "$stray_link" && "$(readlink -- "$stray_link")" == "$HF_HOME" ]]; then
  rm -f -- "$stray_link"
fi

if [[ "$HF_HOME" != "$hf_link" ]]; then
  # A real (non-link) directory cannot be replaced by `ln`; drop it only when
  # it is empty so no cached data is ever deleted.
  if [[ -d "$hf_link" && ! -L "$hf_link" ]]; then
    rmdir -- "$hf_link" 2>/dev/null || true
  fi
  if [[ -d "$hf_link" && ! -L "$hf_link" ]]; then
    printf 'warn: %s is a non-empty directory; not linking it to %s\n' \
      "$hf_link" "$HF_HOME" >&2
  else
    # -n: replace an existing link instead of following it into its target.
    ln -sfn -- "$HF_HOME" "$hf_link"
  fi
fi
# When HF_HOME already *is* the link path there is nothing to link.

unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=6000

# Plain shell variables (not exported); child processes do not see them.
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"
# ---------------------- Structured HF cache (persistent) ----------------------
# Places the cache on the persistent /data volume (1TB) when it is available.
# Both variables are only filled in when they are unset or empty; a value that
# is already present is kept and simply (re-)exported.
if [[ -z "${HF_HOME:-}" ]]; then
  if [[ -d /data ]]; then
    HF_HOME="/data/.cache/huggingface"
  else
    HF_HOME="/app/.cache/huggingface"
  fi
fi
export HF_HOME

if [[ -z "${HF_HUB_CACHE:-}" ]]; then
  HF_HUB_CACHE="$HF_HOME/hub"
fi
export HF_HUB_CACHE

mkdir -p "$HF_HUB_CACHE"
printf '%s\n' "📦 Cache HF: $HF_HUB_CACHE"
# Download through the structured HF cache (files are not duplicated), then
# expose the snapshot at /app/ckpt/VINCIE-3B for legacy code paths.
# Best effort: a failed download is reported but the interpreter still exits
# with status 0. A missing huggingface_hub package is NOT caught (the import
# sits outside the try block) and makes python3 exit non-zero.
python3 - <<'PY'
import inspect
import os
import traceback

from huggingface_hub import snapshot_download

REPO_ID = 'ByteDance-Seed/VINCIE-3B'
CKPT_LINK = '/app/ckpt/VINCIE-3B'

cache_dir = os.environ.get('HF_HUB_CACHE')
print(f'📥 Baixando VINCIE-3B para cache: {cache_dir}')
try:
    download_kwargs = {
        'repo_id': REPO_ID,
        'cache_dir': cache_dir,  # use the structured cache
        'max_workers': 8,        # parallel file downloads
        # no local_dir: everything stays inside the HF cache
    }
    # `resume_download` is deprecated in huggingface_hub. Pass it only while
    # the installed version still accepts it, so a release that drops the
    # argument does not turn into a swallowed TypeError below.
    if 'resume_download' in inspect.signature(snapshot_download).parameters:
        download_kwargs['resume_download'] = True

    model_path = snapshot_download(**download_kwargs)
    print(f'✅ Modelo em cache: {model_path}')

    # Compatibility symlink for legacy code.
    os.makedirs(os.path.dirname(CKPT_LINK), exist_ok=True)
    if os.path.islink(CKPT_LINK):
        # Stale or dangling link from an earlier run: always refresh it.
        os.unlink(CKPT_LINK)
    elif os.path.isdir(CKPT_LINK) and not os.listdir(CKPT_LINK):
        # An empty placeholder directory would otherwise block the symlink
        # forever; removing it loses nothing.
        os.rmdir(CKPT_LINK)

    if os.path.lexists(CKPT_LINK):
        # A real file or non-empty directory is left untouched.
        print(f'⚠️ {CKPT_LINK} já existe e não é um symlink; mantido como está')
    else:
        os.symlink(model_path, CKPT_LINK)
        print(f'🔗 Symlink: {CKPT_LINK} -> {model_path}')
except Exception as e:
    print(f'⚠️ Download falhou: {e}')
    traceback.print_exc()
PY
# Print a warning on stderr. Never fails, so it is safe to use on the
# right-hand side of `||` while `set -e` is active.
warn() {
  printf 'warn: %s\n' "$*" >&2 || true
}

# ---------------------- Apex/Q8 builder (best effort) ----------------------
echo "Executando builder Apex/Q8..."
# Every builder step is optional: a failure (including the chmod) is reported
# and startup continues.
chmod +x /app/builder.sh \
  || warn "could not mark /app/builder.sh as executable"
/app/builder.sh \
  || warn "/app/builder.sh exited with status $?; continuing"
# ---------------------- Diagnostics ----------------------
/app/info.sh \
  || warn "/app/info.sh exited with status $?; continuing"
#ls -la /app || true
#ls -R /app | head -n 2000 || true
# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported for the internal fallback
# exec replaces this shell with the application, so signals sent to the
# script's PID are delivered to the app itself and its exit status becomes
# the script's. This must remain the last command of the script.
exec python /app/app_vince.py
|