#!/usr/bin/env bash
#
# ADUC-SDR container start script (VINCIE/SeedVR).
#
# Steps, in order:
#   1. Export the CUDA / NCCL / attention-backend environment.
#   2. Point the Hugging Face and torch caches at /data when that directory
#      exists, otherwise at /app/.cache.
#   3. Download the VINCIE-3B snapshot into the HF hub cache and expose it at
#      /app/ckpt/VINCIE-3B through a symlink.
#   4. Run the best-effort builder and diagnostics scripts.
#   5. Replace this shell with the application process.
set -euo pipefail

echo "======================================================="
echo " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)"
echo "======================================================="

# ---------------------- Base env ----------------------
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
export TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"

# SDPA / FlashAttention toggles (each can be overridden by the caller)
export ENABLE_FLASH_SDP="${ENABLE_FLASH_SDP:-1}"
export ENABLE_MEMORY_EFFICIENT_SDP="${ENABLE_MEMORY_EFFICIENT_SDP:-1}"
export ENABLE_MATH_SDP="${ENABLE_MATH_SDP:-0}"
export FLASH_ATTENTION_DISABLE="${FLASH_ATTENTION_DISABLE:-0}"
export XFORMERS_FORCE_DISABLE="${XFORMERS_FORCE_DISABLE:-1}"

# CUDA / NCCL baseline
export CUDA_MODULE_LOADING="LAZY"
export CUDA_DEVICE_MAX_CONNECTIONS="${CUDA_DEVICE_MAX_CONNECTIONS:-32}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-8}"
export NCCL_DEBUG="INFO"
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME="lo"
export NCCL_BLOCKING_WAIT=1
export TORCH_NCCL_BLOCKING_WAIT=1
#export NCCL_TIMEOUT=600

# ---------------------- HF / torch cache persistence ----------------------
# Use the /data volume when it is present so downloads are kept there;
# otherwise fall back to a cache inside /app.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

# Expose the cache at /app/.cache/huggingface as well. Skip the link when
# HF_HOME already is that path (it would become a link to itself inside the
# cache directory). -n replaces an existing symlink instead of following it
# and nesting a new link inside its target.
if [[ "$HF_HOME" != "/app/.cache/huggingface" ]]; then
  ln -sfn "$HF_HOME" /app/.cache/huggingface
fi

unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=6000

MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"

# Create only the parent directory: CKPT_DIR itself is meant to become a
# symlink into the HF cache, and a pre-created real directory would block it.
mkdir -p "${CKPT_DIR%/*}"

echo "📦 Cache HF: $HF_HUB_CACHE"

# ---------------------- Model download (structured HF cache) ----------------------
# Download into the structured HF cache (files are not duplicated) and link
# the resulting snapshot at CKPT_DIR. A failed download is reported but does
# not abort the script.
MODEL_REPO="$MODEL_REPO" CKPT_DIR="$CKPT_DIR" python3 - <<'PY'
import os
import traceback

from huggingface_hub import snapshot_download

repo_id = os.environ.get('MODEL_REPO', 'ByteDance-Seed/VINCIE-3B')
ckpt_link = os.environ.get('CKPT_DIR', '/app/ckpt/VINCIE-3B')
cache_dir = os.environ.get('HF_HUB_CACHE')
print(f'📥 Baixando VINCIE-3B para cache: {cache_dir}')

try:
    model_path = snapshot_download(
        repo_id=repo_id,
        cache_dir=cache_dir,   # use the structured cache
        # Resume interrupted downloads.
        # NOTE(review): this flag is deprecated in recent huggingface_hub
        # releases, where downloads always resume; confirm against the
        # installed version.
        resume_download=True,
        max_workers=8,         # parallel downloads
        # local_dir is intentionally not used: everything stays in the HF cache
    )
    print(f'✅ Modelo em cache: {model_path}')

    # Create a symlink for compatibility with legacy code that reads CKPT_DIR.
    os.makedirs(os.path.dirname(ckpt_link), exist_ok=True)
    if os.path.islink(ckpt_link):
        # Drop a stale (possibly dangling) link from a previous run.
        os.unlink(ckpt_link)
    elif os.path.isdir(ckpt_link) and not os.listdir(ckpt_link):
        # Drop an empty placeholder directory so the link can take its place.
        os.rmdir(ckpt_link)
    # A non-empty real directory is left untouched.
    if not os.path.lexists(ckpt_link):
        os.symlink(model_path, ckpt_link)
        print(f'🔗 Symlink: {ckpt_link} -> {model_path}')
except Exception as e:
    print(f'⚠️ Download falhou: {e}')
    traceback.print_exc()
PY

# If nothing exists at CKPT_DIR after the download step (it failed before
# linking), create the empty directory so the path still exists afterwards.
if [[ ! -e "$CKPT_DIR" && ! -L "$CKPT_DIR" ]]; then
  mkdir -p "$CKPT_DIR"
fi

echo "Executando builder Apex/Q8..."
chmod +x /app/builder.sh
# Best-effort: a builder failure must not prevent the service from starting.
/app/builder.sh || true

# ---------------------- Diagnostics ----------------------
# Best-effort as well.
/app/info.sh || true
#ls -la /app || true
#ls -R /app | head -n 2000 || true

# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported for the internal fallback.
# exec replaces this shell with the app so it receives signals directly.
exec python /app/app_vince.py