File size: 3,827 Bytes
c00ea12
 
93626d8
c00ea12
 
 
93626d8
c00ea12
 
 
6188fa0
16ced43
c00ea12
 
 
 
 
6188fa0
16ced43
47d5744
c00ea12
47d5744
 
c00ea12
 
47d5744
 
 
 
 
c00ea12
 
fbe9d9f
4c802e6
ac90475
 
 
 
16ced43
22f9c33
 
 
 
 
 
 
 
9b230d1
 
 
 
 
e997f7e
87eb4d4
fbe9d9f
87eb4d4
 
e997f7e
 
 
687a9ef
9c00877
687a9ef
 
 
 
 
 
 
 
 
 
 
 
 
9c00877
 
 
687a9ef
 
 
 
9c00877
687a9ef
9c00877
687a9ef
 
 
 
9c00877
687a9ef
 
 
 
 
 
 
 
 
 
 
9c00877
 
687a9ef
 
9c00877
 
 
b5c6744
 
 
 
bd40b35
 
1ebc5ad
22f9c33
8c19f21
 
7f655ac
22f9c33
1ebc5ad
16ced43
9b230d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Startup banner (printed verbatim; the quoted delimiter disables expansion).
cat <<'BANNER'
=======================================================
 ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)
=======================================================
BANNER

# ---------------------- Base environment ----------------------
# Variables initialised with ${VAR:=default} keep a non-empty value inherited
# from the caller; variables assigned directly in `export` always overwrite it.

# GPU selection and tensor dtype (caller-overridable).
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCH_DTYPE:=bfloat16}"
export CUDA_VISIBLE_DEVICES TORCH_DTYPE

# SDPA / FlashAttention / xFormers toggles (caller-overridable).
# NOTE(review): presumably consumed by the Python application — confirm there.
: "${ENABLE_FLASH_SDP:=1}"
: "${ENABLE_MEMORY_EFFICIENT_SDP:=1}"
: "${ENABLE_MATH_SDP:=0}"
: "${FLASH_ATTENTION_DISABLE:=0}"
: "${XFORMERS_FORCE_DISABLE:=1}"
export ENABLE_FLASH_SDP ENABLE_MEMORY_EFFICIENT_SDP ENABLE_MATH_SDP
export FLASH_ATTENTION_DISABLE XFORMERS_FORCE_DISABLE

# CUDA runtime / allocator settings (fixed, except the connection count).
: "${CUDA_DEVICE_MAX_CONNECTIONS:=32}"
export CUDA_DEVICE_MAX_CONNECTIONS
export \
  CUDA_MODULE_LOADING="LAZY" \
  CUDA_DEVICE_ORDER="PCI_BUS_ID" \
  PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"

# CPU threading for OpenMP / MKL (caller-overridable).
: "${OMP_NUM_THREADS:=8}"
: "${MKL_NUM_THREADS:=8}"
export OMP_NUM_THREADS MKL_NUM_THREADS

# NCCL baseline (fixed values; anything inherited from the caller is replaced).
# NOTE(review): both async error handling and blocking wait are switched on
# here — confirm against the PyTorch distributed docs that this is intended.
export \
  NCCL_DEBUG="INFO" \
  NCCL_ASYNC_ERROR_HANDLING=1 \
  NCCL_P2P_DISABLE=0 \
  NCCL_IB_DISABLE=1 \
  NCCL_SOCKET_IFNAME="lo" \
  NCCL_BLOCKING_WAIT=1 \
  TORCH_NCCL_BLOCKING_WAIT=1
# Left disabled on purpose:
#export NCCL_TIMEOUT=600



    

# ---------------------- HF/torch cache persistence ----------------------
# Put the Hugging Face and torch caches under /data when that directory
# exists, otherwise under /app.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
# An HF_HUB_CACHE inherited from the caller wins over the derived default.
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"

# Expose the cache at the fixed /app path as well.
# - Skip when HF_HOME already *is* that path: linking a directory onto itself
#   would drop a self-referential "huggingface/huggingface" link inside it.
# - Use -n so an existing symlink is replaced rather than followed (which
#   would create a nested link inside the target directory on every restart).
# - Never clobber a real directory sitting at the link path.
mkdir -p /app/.cache
app_hf_link="/app/.cache/huggingface"
if [[ "$HF_HOME" != "$app_hf_link" ]]; then
  if [[ -d "$app_hf_link" && ! -L "$app_hf_link" ]]; then
    echo "⚠️ $app_hf_link é um diretório real; symlink para $HF_HOME não criado" >&2
  else
    ln -sfn "$HF_HOME" "$app_hf_link"
  fi
fi
# Drop any inherited TRANSFORMERS_CACHE so it cannot point somewhere other
# than the cache configured above.
unset TRANSFORMERS_CACHE

# NOTE(review): HF_HUB_ENABLE_HF_TRANSFER=1 presumably requires the
# `hf_transfer` package to be installed — confirm in the image.
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=6000

# ---------------------- Model checkpoint (structured HF cache) ----------------------
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
# Only the parent directory is created here. CKPT_DIR itself must stay free so
# the download step can turn it into a symlink to the cached snapshot; creating
# it as a real directory would make the "does not exist yet" check fail and the
# symlink would never be created.
mkdir -p "${CKPT_DIR%/*}"

# Cache location defaults; they only apply when HF_HOME / HF_HUB_CACHE are not
# already set in the environment.
if [[ -d /data ]]; then
  export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
else
  export HF_HOME="${HF_HOME:-/app/.cache/huggingface}"
fi

export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE"

echo "📦 Cache HF: $HF_HUB_CACHE"

# Download into the structured HF cache (no duplicated files), then expose the
# snapshot at CKPT_DIR through a symlink. Repo id and link path are passed via
# the environment so they are defined in exactly one place. The step is
# best-effort: a failed download is reported but does not abort the startup.
MODEL_REPO="$MODEL_REPO" CKPT_DIR="$CKPT_DIR" python3 - <<'PY'
import os
import traceback

from huggingface_hub import snapshot_download

repo_id = os.environ['MODEL_REPO']
ckpt_link = os.environ['CKPT_DIR']
cache_dir = os.environ.get('HF_HUB_CACHE')
print(f'📥 Baixando VINCIE-3B para cache: {cache_dir}')

try:
    model_path = snapshot_download(
        repo_id=repo_id,
        cache_dir=cache_dir,           # use the structured cache
        resume_download=True,          # resume interrupted downloads
        max_workers=8,                 # parallel file downloads
        # no local_dir: everything stays inside the HF cache
    )
    print(f'✅ Modelo em cache: {model_path}')

    # Symlink for code that still reads the legacy checkpoint path.
    os.makedirs(os.path.dirname(ckpt_link), exist_ok=True)
    if os.path.islink(ckpt_link):
        # Stale (possibly dangling) link from a previous run.
        os.unlink(ckpt_link)
    elif os.path.isdir(ckpt_link) and not os.listdir(ckpt_link):
        # Empty placeholder directory: safe to replace with the link.
        os.rmdir(ckpt_link)

    if not os.path.lexists(ckpt_link):
        os.symlink(model_path, ckpt_link)
        print(f'🔗 Symlink: {ckpt_link} -> {model_path}')
    else:
        # A real, non-empty path is there; leave it untouched.
        print(f'ℹ️ {ckpt_link} já existe e não está vazio; symlink não criado')

except Exception as e:
    print(f'⚠️ Download falhou: {e}')
    traceback.print_exc()
PY

# Keep the guarantee that CKPT_DIR exists afterwards, even when the download
# step failed before it could create the symlink.
if [[ ! -e "$CKPT_DIR" ]]; then
  rm -f -- "$CKPT_DIR"   # removes a dangling symlink, if one is in the way
  mkdir -p "$CKPT_DIR"
fi


# ---------------------- Apex/Q8 builder (best-effort) ----------------------
echo "Executando builder Apex/Q8..."
# The builder must never block the startup: a missing script, a failing chmod
# (e.g. read-only filesystem) or a non-zero exit are reported and skipped.
if [[ -f /app/builder.sh ]]; then
  chmod +x /app/builder.sh || true
  /app/builder.sh || echo "⚠️ builder.sh falhou (rc=$?); continuando" >&2
else
  echo "⚠️ /app/builder.sh não encontrado; pulando builder" >&2
fi


# ---------------------- Diagnostics ----------------------
# Best-effort as well: a diagnostics failure is deliberately ignored.
/app/info.sh || true
#ls -la /app || true
#ls -R /app | head -n 2000 || true

# ---------------------- Start the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported to use the internal fallback.
# exec replaces this shell with the Python process, so signals sent to the
# script's PID reach the application directly and its exit status becomes the
# script's exit status. Nothing runs after this line.
exec python /app/app_vince.py