euIaxs22 committed on
Commit
d377cd1
·
verified ·
1 Parent(s): b0e51aa

Update builder.sh

Browse files
Files changed (1) hide show
  1. builder.sh +54 -144
builder.sh CHANGED
@@ -1,62 +1,45 @@
#!/usr/bin/env bash
# Runtime builder, setup stage: configures cache locations and work
# directories, installs the build toolchain, and derives the Python / Torch /
# CUDA tags that later steps use to pick a matching wheel.
set -euo pipefail

echo "🚀 Builder (Apex + Q8) — runtime, GPU visível, cache persistente"

# ===== Persistence and caches =====
# Prefer /data (HF Spaces) and stay compatible with /app. Values already set in
# the environment always win over these defaults.
if [ -d /data ]; then
  export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
  export TORCH_HOME="${TORCH_HOME:-/data/.cache/torch}"
else
  export HF_HOME="${HF_HOME:-/app/.cache/huggingface}"
  export TORCH_HOME="${TORCH_HOME:-/app/.cache/torch}"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME"

# Compatibility symlink (for scripts that expect /app/.cache/huggingface).
# Only (re)point the link when the path is free or already a symlink: running
# `ln -sf` against an existing directory would create a nested, possibly
# self-referential link inside it. `-n` keeps a re-run from following the old
# link into its target.
mkdir -p /app/.cache
COMPAT_HF_LINK="/app/.cache/huggingface"
if [ "$HF_HOME" != "$COMPAT_HF_LINK" ]; then
  if [ -L "$COMPAT_HF_LINK" ] || [ ! -e "$COMPAT_HF_LINK" ]; then
    ln -sfn "$HF_HOME" "$COMPAT_HF_LINK"
  else
    echo "[env] ${COMPAT_HF_LINK} already exists and is not a symlink; leaving it untouched" >&2
  fi
fi

# ===== Wheel repository on the Hub =====
export SELF_HF_REPO_ID="${SELF_HF_REPO_ID:-euIaxs22/Aduc-sdr}"

# ===== Transfer acceleration =====
export HF_HUB_ENABLE_HF_TRANSFER="${HF_HUB_ENABLE_HF_TRANSFER:-1}"
export HF_HUB_DOWNLOAD_TIMEOUT="${HF_HUB_DOWNLOAD_TIMEOUT:-60}"

# ===== Work directories =====
mkdir -p /app/wheels /app/cuda_cache /app/wheels/src
# Deliberately world-writable; failure to chmod is not fatal.
chmod -R 777 /app/wheels || true
export CUDA_CACHE_PATH="/app/cuda_cache"

# License (NVIDIA NGC), copied next to the wheels when present.
if [ -f "/NGC-DL-CONTAINER-LICENSE" ]; then
  cp -f /NGC-DL-CONTAINER-LICENSE /app/wheels/NGC-DL-CONTAINER-LICENSE || true
fi

# ===== Minimal build dependencies =====
# Best-effort: a failed upgrade must not stop the builder.
python -m pip install -v -U pip build setuptools wheel hatchling hatch-vcs scikit-build-core cmake ninja packaging "huggingface_hub[hf_transfer]" || true

# ===== Environment tags (Python/CUDA/Torch) =====
# PY_TAG: "cp<major><minor>"; falls back to cp310 if the probe fails.
PY_TAG="$(python -c 'import sys; print(f"cp{sys.version_info[0]}{sys.version_info[1]}")' 2>/dev/null || echo cp310)"
# TORCH_VER: torch version without the local "+..." suffix, or "unknown".
TORCH_VER="$(python - <<'PY'
try:
    import re
    import torch
    print(re.sub(r'\+.*$', '', torch.__version__))
except Exception:
    print("unknown")
PY
)"
# CU_TAG: "cu" + CUDA version without dots, or empty when torch reports none.
CU_TAG="$(python - <<'PY'
try:
    import torch
    cu = getattr(torch.version, "cuda", None)
    print("cu" + cu.replace(".", "") if cu else "")
except Exception:
    print("")
PY
)"
# Exported because the layer_norm Hub lookup reads these from os.environ.
export PY_TAG TORCH_VER CU_TAG
echo "[env] PY_TAG=${PY_TAG} TORCH_VER=${TORCH_VER} CU_TAG=${CU_TAG}"
@@ -65,11 +48,11 @@ echo "[env] PY_TAG=${PY_TAG} TORCH_VER=${TORCH_VER} CU_TAG=${CU_TAG}"
#######################################
# Report whether a usable Apex build is importable.
# Runs a short Python probe that imports the fused norm layers from
# apex.normalization and the compiled `fused_layer_norm_cuda` extension.
# Arguments: none
# Returns:   0 when both imports succeed, 1 when either raises.
#######################################
check_apex() {
  python - <<'PY'
import importlib

try:
    from apex.normalization import FusedLayerNorm, FusedRMSNorm  # noqa: F401
    # The Python package can import without the compiled extension, so the
    # extension module is probed explicitly.
    importlib.import_module("fused_layer_norm_cuda")
except Exception:
    raise SystemExit(1)
raise SystemExit(0)
PY
}
@@ -82,10 +65,10 @@ raise SystemExit(0 if spec else 1)
82
  PY
83
  }
84
 
85
- # ===== Download de wheels do Hub =====
86
  install_from_hf () {
87
- local PKG="$1" # 'apex' ou 'q8_kernels'
88
- echo "[hub] Buscando wheel de ${PKG} em ${SELF_HF_REPO_ID} (py=${PY_TAG}, cu=${CU_TAG})"
89
  python - "$PKG" "$PY_TAG" "$CU_TAG" <<'PY' || exit 0
90
  import os, sys
91
  from huggingface_hub import HfApi, hf_hub_download, HfFolder
@@ -93,16 +76,16 @@ from huggingface_hub import HfApi, hf_hub_download, HfFolder
93
  pkg, py_tag, cu_tag = sys.argv[1], sys.argv[2], sys.argv[3]
94
  repo = os.environ.get("SELF_HF_REPO_ID","euIaxs22/Aduc-sdr")
95
  api = HfApi(token=os.getenv("HF_TOKEN") or HfFolder.get_token())
96
-
97
  try:
98
- files = api.list_repo_files(repo_id=repo, repo_type="model")
99
  except Exception:
100
- raise SystemExit(0)
101
 
102
  cands = [f for f in files if f.endswith(".whl") and f.rsplit("/",1)[-1].startswith(pkg+"-") and py_tag in f]
103
- pref = [f for f in cands if cu_tag and cu_tag in f] or cands
104
  if not pref:
105
- raise SystemExit(0)
 
106
  target = sorted(pref, reverse=True)[0]
107
  print(target)
108
  path = hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
@@ -113,7 +96,7 @@ PY
113
  # ===== Builders =====
114
  build_apex () {
115
  local SRC="/app/wheels/src/apex"
116
- echo "[build] Fonte Apex em ${SRC}"
117
  if [ -d "$SRC/.git" ]; then
118
  git -C "$SRC" fetch --all -p || true
119
  git -C "$SRC" reset --hard HEAD || true
@@ -122,18 +105,16 @@ build_apex () {
122
  rm -rf "$SRC"
123
  git clone --depth 1 https://github.com/NVIDIA/apex "$SRC"
124
  fi
125
-
126
  echo "[build] Compilando Apex -> wheel"
127
  export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
128
- python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
129
-
130
  local W="$(ls -t /app/wheels/apex-*.whl 2>/dev/null | head -n1 || true)"
131
  if [ -n "${W}" ]; then
132
- python -m pip install -U --no-deps "${W}" || true
133
- echo "[build] Apex instalado da wheel: ${W}"
134
  else
135
- echo "[build] Nenhuma wheel Apex gerada; instalando do source"
136
- python -m pip install --no-build-isolation "$SRC" || true
137
  fi
138
  }
139
 
@@ -145,27 +126,25 @@ build_q8 () {
145
  git clone --filter=blob:none "$Q8_REPO" "$SRC"
146
  git -C "$SRC" checkout "$Q8_COMMIT"
147
  git -C "$SRC" submodule update --init --recursive
148
-
149
  echo "[build] Compilando Q8 Kernels -> wheel"
150
- python -m pip wheel --no-build-isolation "$SRC" -w /app/wheels || true
151
-
152
  local W="$(ls -t /app/wheels/q8_kernels-*.whl 2>/dev/null | head -n1 || true)"
153
  if [ -n "${W}" ]; then
154
- python -m pip install -U --no-deps "${W}" || true
155
- echo "[build] Q8 instalado da wheel: ${W}"
156
  else
157
- echo "[build] Nenhuma wheel q8_kernels gerada; instalando do source"
158
- python -m pip install --no-build-isolation "$SRC" || true
159
  fi
160
  }
161
 
162
  # ===== Pipeline genérico =====
163
  ensure_pkg () {
164
- local PKG="$1" # apex | q8_kernels
165
- local CHECK_FN="$2" # check_apex | check_q8
166
- local BUILD_FN="$3" # build_apex | build_q8
167
- echo "[flow] === ${PKG} ==="
168
 
 
169
  if ${CHECK_FN}; then
170
  echo "[flow] ${PKG}: já instalado (import OK)"
171
  return 0
@@ -176,7 +155,7 @@ ensure_pkg () {
176
  if [ -n "${HF_OUT:-}" ]; then
177
  WHEEL_PATH="$(printf "%s\n" "${HF_OUT}" | tail -n1)"
178
  echo "[hub] Baixado: ${WHEEL_PATH}"
179
- python -m pip install -U --no-build-isolation "${WHEEL_PATH}" || true
180
  if ${CHECK_FN}; then
181
  echo "[flow] ${PKG}: sucesso via Hub (${WHEEL_PATH})"
182
  return 0
@@ -194,102 +173,33 @@ ensure_pkg () {
194
  return 0
195
  fi
196
 
197
- echo "[flow] ${PKG}: falhou após build; seguindo adiante"
198
  return 1
199
  }
200
 
201
-
# ===== layer_norm (separate wheel) =====

# Emit one message for the layer_norm flow. Uses the script's `log` function
# when one is defined (none is visible in this part of the file), otherwise
# prints the message to stdout like the rest of the script does with echo.
_ln_log () {
  if declare -F log >/dev/null 2>&1; then
    log "$@"
  else
    printf '%s\n' "$*"
  fi
}

#######################################
# Report whether the compiled `layer_norm_cuda` module is importable.
# Returns: 0 when the import succeeds, 1 otherwise.
#######################################
check_layernorm () {
  python - <<'PY'
import importlib
import sys

try:
    importlib.import_module("layer_norm_cuda")
except Exception:
    sys.exit(1)
sys.exit(0)
PY
}

#######################################
# Try to download a prebuilt layer_norm wheel from the Hub repository.
# Globals:  SELF_HF_REPO_ID, PY_TAG, CU_TAG, TORCH_VER, HF_TOKEN (all read)
# Outputs:  the local path of the downloaded wheel on stdout, or nothing when
#           no matching wheel exists or the repository cannot be listed.
#           Diagnostics go to stderr so callers can capture stdout safely.
# Returns:  always 0 (a miss is not an error; the caller falls back to a build).
#######################################
install_layernorm_from_hf () {
  _ln_log "[hub] tentando wheel 'layer_norm' no ${SELF_HF_REPO_ID:-} (py=${PY_TAG:-} cu=${CU_TAG:-} torch=${TORCH_VER:-})" >&2
  # Tags are passed as arguments: they are plain shell variables and may not be
  # exported, in which case os.environ would not contain them.
  python - "${PY_TAG:-}" "${CU_TAG:-}" "${TORCH_VER:-}" <<'PY' || return 0
import os
import re
import sys

from huggingface_hub import HfApi

try:
    from huggingface_hub import get_token
except ImportError:  # releases that only ship the older HfFolder helper
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

py, cu, torchv = sys.argv[1:4]
repo = os.environ.get("SELF_HF_REPO_ID", "euIaxs22/Aduc-sdr")
api = HfApi(token=os.getenv("HF_TOKEN") or get_token())
try:
    files = api.list_repo_files(repo_id=repo, repo_type="model")
except Exception:
    sys.exit(0)

# p1: wheel name carries python tag, CUDA tag and torch version (in that order).
# p2: looser match without the torch version, optional flash_attn prefix.
p1 = re.compile(rf"(?:^|/)layer[_-]?norm-.*{re.escape(py)}.*{re.escape(cu)}.*{re.escape(torchv)}.*\.whl$")
p2 = re.compile(rf"(?:^|/)(?:flash[_-]?attn[_-])?layer[_-]?norm-.*{re.escape(py)}.*{re.escape(cu)}.*\.whl$")
exact = sorted((f for f in files if p1.search(f)), reverse=True)
loose = sorted((f for f in files if p2.search(f)), reverse=True)
cands = exact or loose  # prefer a torch-version match when one exists
if not cands:
    sys.exit(0)
path = api.hf_hub_download(repo_id=repo, filename=cands[0], repo_type="model", local_dir="/app/wheels")
print(path)
PY
}

#######################################
# Build the layer_norm extension from the flash-attention sources and install
# it, preferring a freshly built wheel and falling back to a source install.
# Globals:  FLASHATTN_REPO, FLASHATTN_REF (read, optional), MAX_JOBS (exported)
# Returns:  1 when the sources cannot be cloned; 0 otherwise (build/install
#           failures are tolerated — the caller re-checks the import).
#######################################
build_layernorm () {
  local SRC="/app/wheels/src/flash-attention"
  # NOTE(review): FLASHATTN_REPO / FLASHATTN_REF are not defined in the visible
  # part of the file; the upstream URL below is an assumed default — confirm.
  local repo="${FLASHATTN_REPO:-https://github.com/Dao-AILab/flash-attention}"
  local ref="${FLASHATTN_REF:-}"

  if [ ! -d "$SRC/.git" ]; then
    rm -rf "$SRC"
    git clone --filter=blob:none "$repo" "$SRC" || return 1
  fi
  git -C "$SRC" fetch --all -p || true
  if [ -n "$ref" ]; then
    git -C "$SRC" checkout "$ref" || true
  fi
  git -C "$SRC" submodule update --init --recursive || true

  export MAX_JOBS="${MAX_JOBS:-4}"
  _ln_log "[build] layer_norm — MAX_JOBS=$MAX_JOBS"
  # Guarded so a failed wheel build still reaches the source-install fallback.
  (cd "$SRC/csrc/layer_norm" && python -m pip wheel --no-build-isolation --no-deps . -w /app/wheels) || true

  # Pick the most recently modified matching wheel without parsing `ls`.
  local W="" candidate
  for candidate in /app/wheels/*layer*norm*-*.whl; do
    [ -e "$candidate" ] || continue
    if [ -z "$W" ] || [ "$candidate" -nt "$W" ]; then
      W="$candidate"
    fi
  done

  if [ -n "$W" ]; then
    python -m pip install -U --no-deps --no-build-isolation "$W" || true
    _ln_log "[build] layer_norm instalado da wheel: $W"
  else
    _ln_log "[build] layer_norm: wheel não encontrada; instalando do source"
    (cd "$SRC/csrc/layer_norm" && python -m pip install . --no-build-isolation) || true
  fi
}

#######################################
# Make sure layer_norm is importable: already installed -> Hub wheel -> build.
# Returns: 0 as soon as the import check passes, 1 when every route failed.
#######################################
ensure_layernorm () {
  _ln_log "[flow] === layer_norm (FlashAttention) ==="
  if check_layernorm; then
    _ln_log "[flow] layer_norm já instalado (import OK)"
    return 0
  fi

  # Declaration is separate from the assignment so `local` cannot mask status.
  local hub_out hub_wheel
  hub_out="$(install_layernorm_from_hf || true)"
  # Only the last stdout line is the wheel path.
  hub_wheel="${hub_out##*$'\n'}"
  if [ -n "$hub_wheel" ]; then
    python -m pip install -U --no-deps --no-build-isolation "$hub_wheel" || true
    if check_layernorm; then
      _ln_log "[flow] layer_norm via Hub OK"
      return 0
    fi
  fi

  _ln_log "[flow] compilando layer_norm (fallback)"
  build_layernorm || true
  if check_layernorm; then
    _ln_log "[flow] layer_norm build OK"
    return 0
  fi
  _ln_log "[flow] layer_norm falhou"
  return 1
}
266
-
267
-
268
-
# ===== Run: Apex, Q8 and layer_norm =====
# Each package is best-effort: a failure is reported by the helper and the
# builder moves on to the next one.
ensure_pkg "apex" check_apex build_apex || true
ensure_pkg "q8_kernels" check_q8 build_q8 || true
ensure_layernorm || true

# ===== Upload of the generated wheels (optional) =====
# The upload is optional, so neither a missing token nor a failed upload may
# abort the script under `set -e`; the final chmod and banner must still run.
python - <<'PY' || echo "[upload] Falha no upload das wheels; seguindo adiante" >&2
import os
import sys

from huggingface_hub import HfApi

try:
    from huggingface_hub import get_token
except ImportError:  # releases that only ship the older HfFolder helper
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

repo = os.environ.get("SELF_HF_REPO_ID", "euIaxs22/Aduc-sdr")
token = os.getenv("HF_TOKEN") or get_token()
if not token:
    # No credentials: skip quietly with exit status 0.
    print("HF_TOKEN ausente; upload desabilitado", file=sys.stderr)
    raise SystemExit(0)

api = HfApi(token=token)
api.upload_folder(
    folder_path="/app/wheels",
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl", "NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**", "**/*.log", "**/logs/**", ".git/**"],
)
print("Upload concluído (wheels + licença).")
PY

chmod -R 777 /app/wheels || true
echo "✅ Builder finalizado."
 
#!/usr/bin/env bash
# Runtime builder, setup stage: configures cache locations and work
# directories, installs the build toolchain, and derives the Python / Torch /
# CUDA tags that later steps use to pick a matching wheel.
set -euo pipefail

echo "🚀 Builder (Apex + Q8) — roda em runtime com GPU visível"

# ===== Config and directories =====
# Values already set in the environment always win over these defaults.
export SELF_HF_REPO_ID="${SELF_HF_REPO_ID:-euIaxs22/Aduc-sdr}" # HF model repo that hosts the wheels
export HF_HOME="${HF_HOME:-/app/model_cache}"
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
export TORCH_HOME="${TORCH_HOME:-$HF_HOME/torch}"
export HF_HUB_ENABLE_HF_TRANSFER="${HF_HUB_ENABLE_HF_TRANSFER:-1}"
export PATH="$HOME/.local/bin:$PATH"

# HF_HUB_CACHE is created as well: it can be overridden to a path outside
# HF_HOME, in which case creating HF_HOME alone would not cover it.
mkdir -p /app/wheels /app/cuda_cache "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME" /app/wheels/src
# Deliberately world-writable; failure to chmod is not fatal.
chmod -R 777 /app/wheels || true
export CUDA_CACHE_PATH="/app/cuda_cache"

# Copy the NGC container license next to the wheels when it is present.
if [ -f "/NGC-DL-CONTAINER-LICENSE" ]; then
  cp -f /NGC-DL-CONTAINER-LICENSE /app/wheels/NGC-DL-CONTAINER-LICENSE || true
fi

# ===== Minimal dependencies =====
# Best-effort: a failed upgrade must not stop the builder.
python -m pip install -v -U pip build setuptools wheel hatchling hatch-vcs scikit-build-core cmake ninja packaging "huggingface_hub[hf_transfer]" || true

# ===== Environment tags (Python/CUDA/Torch) =====
# PY_TAG: "cp<major><minor>"; falls back to cp310 if the probe fails.
PY_TAG="$(python -c 'import sys; print(f"cp{sys.version_info[0]}{sys.version_info[1]}")' 2>/dev/null || echo cp310)"
# TORCH_VER: torch version without the local "+..." suffix, or "unknown".
TORCH_VER="$(python - <<'PY'
try:
    import re
    import torch
    print(re.sub(r'\+.*$', '', torch.__version__))
except Exception:
    print("unknown")
PY
)"
# CU_TAG: "cu" + CUDA version without dots, or empty when torch reports none.
CU_TAG="$(python - <<'PY'
try:
    import torch
    cu = getattr(torch.version, "cuda", None)
    print("cu" + cu.replace(".", "") if cu else "")
except Exception:
    print("")
PY
)"
echo "[env] PY_TAG=${PY_TAG} TORCH_VER=${TORCH_VER} CU_TAG=${CU_TAG}"
 
#######################################
# Report whether a usable Apex build is importable.
# Runs a short Python probe that imports the fused norm layers from
# apex.normalization and the compiled `fused_layer_norm_cuda` extension.
# Arguments: none
# Returns:   0 when both imports succeed, 1 when either raises.
#######################################
check_apex() {
  python - <<'PY'
import importlib

try:
    from apex.normalization import FusedLayerNorm, FusedRMSNorm  # noqa: F401
    # The Python package can import without the compiled extension, so the
    # extension module is probed explicitly.
    importlib.import_module("fused_layer_norm_cuda")
except Exception:
    raise SystemExit(1)
raise SystemExit(0)
PY
}
 
65
  PY
66
  }
67
 
68
+ # ===== Download do Hub =====
69
  install_from_hf () {
70
+ local PKG="$1" # 'apex' ou 'q8_kernels'
71
+ echo "[hub] Verificando wheel de ${PKG} no repositório ${SELF_HF_REPO_ID}"
72
  python - "$PKG" "$PY_TAG" "$CU_TAG" <<'PY' || exit 0
73
  import os, sys
74
  from huggingface_hub import HfApi, hf_hub_download, HfFolder
 
76
  pkg, py_tag, cu_tag = sys.argv[1], sys.argv[2], sys.argv[3]
77
  repo = os.environ.get("SELF_HF_REPO_ID","euIaxs22/Aduc-sdr")
78
  api = HfApi(token=os.getenv("HF_TOKEN") or HfFolder.get_token())
 
79
  try:
80
+ files = api.list_repo_files(repo_id=repo, repo_type="model")
81
  except Exception:
82
+ raise SystemExit(0)
83
 
84
  cands = [f for f in files if f.endswith(".whl") and f.rsplit("/",1)[-1].startswith(pkg+"-") and py_tag in f]
85
+ pref = [f for f in cands if cu_tag and cu_tag in f] or cands
86
  if not pref:
87
+ raise SystemExit(0)
88
+
89
  target = sorted(pref, reverse=True)[0]
90
  print(target)
91
  path = hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
 
96
  # ===== Builders =====
97
  build_apex () {
98
  local SRC="/app/wheels/src/apex"
99
+ echo "[build] Preparando fonte Apex em ${SRC}"
100
  if [ -d "$SRC/.git" ]; then
101
  git -C "$SRC" fetch --all -p || true
102
  git -C "$SRC" reset --hard HEAD || true
 
105
  rm -rf "$SRC"
106
  git clone --depth 1 https://github.com/NVIDIA/apex "$SRC"
107
  fi
 
108
  echo "[build] Compilando Apex -> wheel"
109
  export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
110
+ python -m pip wheel -v --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
 
111
  local W="$(ls -t /app/wheels/apex-*.whl 2>/dev/null | head -n1 || true)"
112
  if [ -n "${W}" ]; then
113
+ python -m pip install -v -U --no-deps "${W}" || true
114
+ echo "[build] Apex instalado da wheel recém-compilada: ${W}"
115
  else
116
+ echo "[build] Nenhuma wheel Apex gerada; instalando do source (pode falhar)"
117
+ python -m pip install -v --no-build-isolation "$SRC" || true
118
  fi
119
  }
120
 
 
126
  git clone --filter=blob:none "$Q8_REPO" "$SRC"
127
  git -C "$SRC" checkout "$Q8_COMMIT"
128
  git -C "$SRC" submodule update --init --recursive
 
129
  echo "[build] Compilando Q8 Kernels -> wheel"
130
+ python -m pip wheel -v --no-build-isolation "$SRC" -w /app/wheels || true
 
131
  local W="$(ls -t /app/wheels/q8_kernels-*.whl 2>/dev/null | head -n1 || true)"
132
  if [ -n "${W}" ]; then
133
+ python -m pip install -v -U --no-deps "${W}" || true
134
+ echo "[build] Q8 instalado da wheel recém-compilada: ${W}"
135
  else
136
+ echo "[build] Nenhuma wheel q8_kernels gerada; instalando do source (pode falhar)"
137
+ python -m pip install -v --no-build-isolation "$SRC" || true
138
  fi
139
  }
140
 
141
  # ===== Pipeline genérico =====
142
  ensure_pkg () {
143
+ local PKG="$1" # apex | q8_kernels
144
+ local CHECK_FN="$2" # check_apex | check_q8
145
+ local BUILD_FN="$3" # build_apex | build_q8
 
146
 
147
+ echo "[flow] === ${PKG} ==="
148
  if ${CHECK_FN}; then
149
  echo "[flow] ${PKG}: já instalado (import OK)"
150
  return 0
 
155
  if [ -n "${HF_OUT:-}" ]; then
156
  WHEEL_PATH="$(printf "%s\n" "${HF_OUT}" | tail -n1)"
157
  echo "[hub] Baixado: ${WHEEL_PATH}"
158
+ python -m pip install -v -U --no-build-isolation "${WHEEL_PATH}" || true
159
  if ${CHECK_FN}; then
160
  echo "[flow] ${PKG}: sucesso via Hub (${WHEEL_PATH})"
161
  return 0
 
173
  return 0
174
  fi
175
 
176
+ echo "[flow] ${PKG}: falhou após build; registrando logs e seguindo"
177
  return 1
178
  }
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ===== Run: Apex and Q8 =====
# Best-effort: a failure is reported by the helper and the builder moves on.
ensure_pkg "apex" check_apex build_apex || true
# The Q8 kernels step is currently disabled.
#ensure_pkg "q8_kernels" check_q8 build_q8 || true

# Upload the generated wheels (and the license file) to the Hub repository.
# A missing token or a failed upload must not abort the script under `set -e`;
# the final chmod and banner still have to run.
python - <<'PY' || echo "[upload] Falha no upload das wheels; seguindo adiante" >&2
import os
import sys

from huggingface_hub import HfApi

try:
    from huggingface_hub import get_token
except ImportError:  # releases that only ship the older HfFolder helper
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

repo = os.environ.get("SELF_HF_REPO_ID", "euIaxs22/Aduc-sdr")
token = os.getenv("HF_TOKEN") or get_token()
if not token:
    # No credentials: skip quietly with exit status 0.
    print("HF_TOKEN ausente; upload desabilitado", file=sys.stderr)
    raise SystemExit(0)

api = HfApi(token=token)
api.upload_folder(
    folder_path="/app/wheels",
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl", "NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**", "**/*.log", "**/logs/**", ".git/**"],
)
print("Upload concluído (wheels + licença).")
PY

chmod -R 777 /app/wheels || true
echo "✅ Builder finalizado."