"""Modal GPU runner for MiniCPM-V 4.6 vision LoRA via LLaMA-Factory.""" from __future__ import annotations import json import os import re import subprocess import sys from pathlib import Path import modal APP_NAME = "vivamais-vision-train" # Inlined (not imported from a sibling module) so the remote container, which # imports this file by itself, does not hit ModuleNotFoundError: vivamais_profile. MODAL_WORKSPACE = "marinaleitecabrera" def assert_modal_workspace() -> None: active = os.environ.get("MODAL_PROFILE", MODAL_WORKSPACE) if active != MODAL_WORKSPACE: raise RuntimeError( f"Modal profile {active!r} is active; expected {MODAL_WORKSPACE!r}. " f"Run: modal profile activate {MODAL_WORKSPACE}" ) MODAL_PROFILE = MODAL_WORKSPACE VOLUME_NAME = "vivamais-vision-checkpoints" DATASET_VOLUME_NAME = "vivamais-vision-dataset" FACTORY_ROOT = Path("/opt/LLaMA-Factory") VIVAMAIS_REMOTE = "/opt/vivamais" FORBIDDEN_CONFIG_MARKERS = ("quantization_bit:",) def config_remote(version: str) -> str: return f"{VIVAMAIS_REMOTE}/minicpmv4_6_lora_sft_{version}.yaml" def train_json_name(version: str) -> str: return f"vivamais_vision_train_{version}.json" def output_dir_for(version: str) -> Path: return Path(f"/checkpoints/minicpmv4_6/lora/sft_{version}") def local_config_path(version: str) -> Path: return Path(f"finetune/vision/configs/minicpmv4_6_lora_sft_{version}.yaml") app = modal.App(APP_NAME) checkpoints = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True) dataset = modal.Volume.from_name(DATASET_VOLUME_NAME, create_if_missing=True) base_image = ( modal.Image.debian_slim(python_version="3.11") .apt_install("git") .pip_install( "torch", "torchvision", "datasets", "accelerate", "peft", "sentencepiece", "protobuf", "einops", "tiktoken", "av", "librosa", "soundfile", "huggingface_hub", ) .run_commands( "git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git /opt/LLaMA-Factory", "pip install -e '/opt/LLaMA-Factory[minicpm_v]'", "pip install 
'transformers>=5.7.0,<6.0.0'", ) .add_local_file( local_path="finetune/vision/configs/minicpmv4_6_lora_sft_v2.yaml", remote_path=config_remote("v2"), copy=True, ) .add_local_file( local_path="finetune/vision/configs/minicpmv4_6_lora_sft_v3.yaml", remote_path=config_remote("v3"), copy=True, ) .add_local_file( local_path="finetune/vision/configs/minicpmv4_6_lora_sft_v4.yaml", remote_path=config_remote("v4"), copy=True, ) .add_local_file( local_path="finetune/vision/configs/minicpmv4_6_lora_sft_v5.yaml", remote_path=config_remote("v5"), copy=True, ) .add_local_file( local_path="finetune/vision/minicpm_processor.py", remote_path=f"{VIVAMAIS_REMOTE}/minicpm_processor.py", copy=True, ) .add_local_file( local_path="finetune/vision/patches/apply_llamafactory_loader_patch.py", remote_path=f"{VIVAMAIS_REMOTE}/apply_llamafactory_loader_patch.py", copy=True, ) .run_commands(f"python {VIVAMAIS_REMOTE}/apply_llamafactory_loader_patch.py") ) def _validate_train_config(config_path: Path) -> None: text = config_path.read_text(encoding="utf-8") for marker in FORBIDDEN_CONFIG_MARKERS: if marker in text: raise RuntimeError( f"refusing to train: {config_path} contains {marker!r}. " "Vision LoRA must run without QLoRA." ) if re.search(r"preprocessing_num_workers:", text): raise RuntimeError( f"refusing to train: {config_path} sets preprocessing_num_workers. " "Omit it so dataset.map runs in-process." 
) def _preflight_processor(model_name: str) -> None: if VIVAMAIS_REMOTE not in sys.path: sys.path.insert(0, VIVAMAIS_REMOTE) from minicpm_processor import load_minicpm_processor processor = load_minicpm_processor(model_name, trust_remote_code=True) print(f"processor preflight ok: {processor.__class__.__name__}") def _merge_dataset_info(factory_data_dir: Path, patch_path: Path) -> None: target = factory_data_dir / "dataset_info.json" base = json.loads(target.read_text(encoding="utf-8")) patch = json.loads(patch_path.read_text(encoding="utf-8")) base.update(patch) target.write_text(json.dumps(base, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") def _install_package(package_dir: Path) -> None: data_src = package_dir / "data" data_dst = FACTORY_ROOT / "data" for path in data_src.rglob("*"): if path.is_file(): rel = path.relative_to(data_src) dest = data_dst / rel dest.parent.mkdir(parents=True, exist_ok=True) dest.write_bytes(path.read_bytes()) _merge_dataset_info(data_dst, package_dir / "dataset_info.json") def _upload_lora(lora_dir: Path, hf_repo: str) -> None: token = os.environ.get("HF_TOKEN") if not token: print("HF_TOKEN not set; skipping Hub upload") return from huggingface_hub import HfApi api = HfApi(token=token) api.upload_folder( folder_path=str(lora_dir), repo_id=hf_repo, repo_type="model", private=True, ) # GPU is resolved at app-build time from the launch environment so the decorated # value is the real requested GPU (with_options(gpu=...) proved unreliable). Set # VIVAMAIS_TRAIN_GPU=H100 before `modal run` to train on H100. 
TRAIN_GPU = os.environ.get("VIVAMAIS_TRAIN_GPU", "A10G")


@app.function(
    image=base_image,
    gpu=TRAIN_GPU,
    timeout=60 * 60 * 6,  # six hours
    volumes={"/package": dataset, "/checkpoints": checkpoints},
)
def train_lora(
    *,
    version: str = "v2",
    hf_repo: str | None = None,
    requested_gpu: str | None = None,
) -> str:
    """Run LLaMA-Factory LoRA training for one config version on a Modal GPU.

    Args:
        version: config/dataset version suffix; selects the baked config via
            ``config_remote`` and the output directory via ``output_dir_for``.
        hf_repo: when given, the output directory is passed to ``_upload_lora``
            after training. NOTE(review): no Modal secret is attached in this
            decorator, so confirm how ``HF_TOKEN`` reaches the container;
            without it the upload is skipped.
        requested_gpu: GPU name chosen in the launch environment, used only for
            the startup log line. Falls back to this module's ``TRAIN_GPU``.

    Returns:
        The output directory path (inside the checkpoints volume) as a string.

    Raises:
        RuntimeError: if the baked config fails ``_validate_train_config``.
        FileNotFoundError: if no training JSON is present on the dataset volume.
        subprocess.CalledProcessError: if ``llamafactory-cli train`` exits
            with a non-zero status.
    """
    import torch

    # TRAIN_GPU is re-evaluated when the container imports this module, where
    # VIVAMAIS_TRAIN_GPU is normally not set, so it would report the default
    # rather than what the launcher asked for. Prefer the forwarded value.
    gpu_label = requested_gpu or TRAIN_GPU
    print(
        f"GPU requested={gpu_label} actual={torch.cuda.get_device_name(0)}",
        flush=True,
    )

    # Make the most recently committed dataset upload visible in this container.
    dataset.reload()
    package_dir = Path("/package")

    config_path = Path(config_remote(version))
    _validate_train_config(config_path)
    config_text = config_path.read_text(encoding="utf-8")
    print(f"using baked config from {config_path}:\n{config_text}")

    # Existence check only: accept the versioned file name, else the
    # unversioned one. The config decides which dataset is actually read.
    train_json = package_dir / "data" / train_json_name(version)
    if not train_json.is_file():
        train_json = package_dir / "data" / "vivamais_vision_train.json"
    if not train_json.is_file():
        raise FileNotFoundError(f"missing dataset on volume: {package_dir / 'data'}")

    _preflight_processor("openbmb/MiniCPM-V-4.6")
    _install_package(package_dir)

    env = {**os.environ, "DISABLE_VERSION_CHECK": "1"}
    subprocess.run(
        ["llamafactory-cli", "train", str(config_path)],
        cwd=FACTORY_ROOT,
        check=True,
        env=env,
    )
    checkpoints.commit()

    # NOTE(review): assumes the YAML's output_dir matches output_dir_for(version).
    output_dir = output_dir_for(version)
    if hf_repo:
        _upload_lora(output_dir, hf_repo)
    return str(output_dir)


def _assert_modal_profile() -> None:
    """Check that the expected Modal workspace is active (see ``assert_modal_workspace``)."""
    assert_modal_workspace()


def _upload_package(local_package: Path) -> None:
    """Upload *local_package* to the root of the dataset volume, overwriting existing files."""
    with dataset.batch_upload(force=True) as batch:
        batch.put_directory(local_package, "/")


@app.local_entrypoint()
def main(
    package_dir: str = "finetune/modal_package",
    hf_repo: str | None = None,
    detach: bool = False,
    version: str = "v2",
    skip_upload: bool = False,
) -> None:
    """Local entry point: validate, optionally upload the dataset, then train.

    Args:
        package_dir: local dataset package directory to upload.
        hf_repo: optional Hub repository id forwarded to ``train_lora``.
        detach: spawn the training call and return without waiting for it.
        version: config/dataset version suffix forwarded to ``train_lora``.
        skip_upload: reuse the dataset already on the volume instead of
            uploading ``package_dir``.

    Raises:
        RuntimeError: if the wrong Modal profile is active or the local config
            fails ``_validate_train_config``.
        FileNotFoundError: if an upload is requested but ``package_dir`` is not
            a directory.
    """
    _assert_modal_profile()

    local_package = Path(package_dir)
    if not skip_upload and not local_package.is_dir():
        raise FileNotFoundError(f"package dir not found: {local_package}")

    # Validate the local copy of the config before spending upload/GPU time.
    _validate_train_config(local_config_path(version))
    print(f"workspace: {MODAL_PROFILE} (version {version})")

    if skip_upload:
        print("skip_upload set; using dataset already on the volume")
    else:
        print(f"uploading dataset images+json to volume {DATASET_VOLUME_NAME}")
        _upload_package(local_package)

    print(f"starting train_lora on {TRAIN_GPU} (set VIVAMAIS_TRAIN_GPU to change)")
    print("monitor: modal app list && modal app logs --follow")

    # Forward the launch-side GPU name so the remote log reports it accurately.
    call_kwargs = {
        "version": version,
        "hf_repo": hf_repo,
        "requested_gpu": TRAIN_GPU,
    }
    if detach:
        train_lora.spawn(**call_kwargs)
        print("spawned train_lora; follow logs with modal app logs --follow")
        return

    output = train_lora.remote(**call_kwargs)
    print(f"training complete: {output}")