|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Diagnose script for checking OS/hardware/python/pip/verl/network. |
|
|
The output of this script can be a very good hint when diagnosing an issue or problem. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import platform |
|
|
import socket |
|
|
import subprocess |
|
|
import sys |
|
|
import time |
|
|
|
|
|
import psutil |
|
|
|
|
|
try: |
|
|
from urllib.parse import urlparse |
|
|
from urllib.request import urlopen |
|
|
except ImportError: |
|
|
from urllib2 import urlopen |
|
|
from urlparse import urlparse |
|
|
import argparse |
|
|
import importlib.metadata |
|
|
|
|
|
import torch |
|
|
|
|
|
# Sites that the network check always probes, keyed by a display name.
URLS = {"PYPI": "https://pypi.python.org/pypi/pip"}

# Additional sites probed only when the matching region code is requested
# (region code -> {display name -> URL}).
REGIONAL_URLS = {
    "cn": {
        "PYPI(douban)": "https://pypi.douban.com/",
        "Conda(tsinghua)": "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
    },
}
|
|
|
|
|
|
|
|
def test_connection(name, url, timeout=10): |
|
|
"""Simple connection test""" |
|
|
urlinfo = urlparse(url) |
|
|
start = time.time() |
|
|
try: |
|
|
socket.gethostbyname(urlinfo.netloc) |
|
|
except Exception as e: |
|
|
print("Error resolving DNS for {}: {}, {}".format(name, url, e)) |
|
|
return |
|
|
dns_elapsed = time.time() - start |
|
|
start = time.time() |
|
|
try: |
|
|
_ = urlopen(url, timeout=timeout) |
|
|
except Exception as e: |
|
|
print("Error open {}: {}, {}, DNS finished in {} sec.".format(name, url, e, dns_elapsed)) |
|
|
return |
|
|
load_elapsed = time.time() - start |
|
|
print("Timing for {}: {}, DNS: {:.4f} sec, LOAD: {:.4f} sec.".format(name, url, dns_elapsed, load_elapsed)) |
|
|
|
|
|
|
|
|
def check_python():
    """Print the interpreter's version, compiler, build and architecture."""
    print("----------Python Info----------")
    # Each row pairs the printed label with the ``platform`` probe that
    # produces its value; rows are printed in this order.
    rows = (
        ("Version :", platform.python_version),
        ("Compiler :", platform.python_compiler),
        ("Build :", platform.python_build),
        ("Arch :", platform.architecture),
    )
    for label, probe in rows:
        print(label, probe())
|
|
|
|
|
|
|
|
def check_pip():
    """Print the version and install directory of pip, if it can be imported."""
    print("------------Pip Info-----------")
    try:
        import pip
    except ImportError:
        print("No corresponding pip install for current python.")
    else:
        pip_dir = os.path.dirname(pip.__file__)
        print(f"Version : {pip.__version__}")
        print(f"Directory : {pip_dir}")
|
|
|
|
|
|
|
|
def _get_current_git_commit(): |
|
|
try: |
|
|
result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) |
|
|
return result.stdout.strip() |
|
|
except subprocess.CalledProcessError as e: |
|
|
print(f"Error running git command: {e.stderr.strip()}") |
|
|
return None |
|
|
except FileNotFoundError: |
|
|
print("Did not find command: git") |
|
|
return None |
|
|
|
|
|
|
|
|
def check_verl():
    """Print verl's version, package directory and the current git commit.

    Import problems are reported on stdout instead of being raised: a plain
    ``ImportError`` is reported as "not installed", any other exception is
    reported together with its traceback.
    """
    print("----------verl Info-----------")
    try:
        # The current directory is put first on sys.path before importing, so
        # a ``verl`` package located there takes precedence.
        sys.path.insert(0, os.getcwd())
        import verl

        print(f"Version : {verl.__version__}")
        package_dir = os.path.dirname(verl.__file__)
        print(f"Directory : {package_dir}")
        try:
            print("Commit Hash :", _get_current_git_commit())
        except AttributeError:
            print("Commit hash not found. ")
    except ImportError as err:
        print(f"No verl installed: {err}")
    except Exception as err:
        import traceback

        # IOError is an alias of OSError on Python 3.
        io_failure = isinstance(err, OSError)
        if not io_failure:
            print("An error occurred trying to import verl.")
            print("This is very likely due to missing or incompatible library files.")
        print(traceback.format_exc())
|
|
|
|
|
|
|
|
def check_os():
    """Print the platform string, system, node name, release and version."""
    print("----------Platform Info----------")
    # (label, probe) pairs, printed in this order.
    rows = (
        ("Platform :", platform.platform),
        ("system :", platform.system),
        ("node :", platform.node),
        ("release :", platform.release),
        ("version :", platform.version),
    )
    for label, probe in rows:
        print(label, probe())
|
|
|
|
|
|
|
|
def _run_cpu_info_command(command):
    """Run ``command`` with inherited stdout, reporting (not raising) if it is missing."""
    # Flush our own buffered output first so the child's output, which is
    # written straight to the inherited stdout, appears after the header lines.
    sys.stdout.flush()
    try:
        subprocess.call(command)
    except FileNotFoundError:
        print("Did not find command: {}".format(command[0]))


def check_hardware():
    """Print the machine type, processor name and platform-specific CPU details.

    The CPU details come from an external tool chosen by ``sys.platform``:
    ``sysctl -a`` (filtered to brand/feature lines) on darwin, ``lscpu`` on
    linux and ``wmic cpu get name`` on win32. A missing tool is reported on
    stdout instead of aborting the whole diagnosis. Other platforms print only
    the machine and processor lines.
    """
    print("----------Hardware Info----------")
    print("machine :", platform.machine())
    print("processor :", platform.processor())
    if sys.platform.startswith("darwin"):
        try:
            completed = subprocess.run(("sysctl", "-a"), stdout=subprocess.PIPE)
        except FileNotFoundError:
            print("Did not find command: sysctl")
            return
        # Decode before printing; printing the raw bytes would show b'...' reprs.
        text = completed.stdout.decode("utf-8", errors="replace")
        for line in text.splitlines():
            if "brand_string" in line or "features" in line:
                print(line.strip())
    elif sys.platform.startswith("linux"):
        _run_cpu_info_command(["lscpu"])
    elif sys.platform.startswith("win32"):
        _run_cpu_info_command(["wmic", "cpu", "get", "name"])
|
|
|
|
|
|
|
|
def check_network(args):
    """Run the connection test against the global sites plus any requested regional sites.

    Args:
        args: Parsed command-line namespace. ``args.timeout`` is the timeout in
            seconds (``0`` or less disables it) and ``args.region`` is a
            comma-separated list of region codes looked up in ``REGIONAL_URLS``.

    Returns:
        None. Results are printed by ``test_connection``; unknown region codes
        trigger a warning.
    """
    import warnings

    print("----------Network Test----------")
    # A non-positive timeout means "disabled"; ``None`` is what the socket and
    # urllib APIs understand as "no timeout" (``0`` would mean non-blocking).
    timeout = args.timeout if args.timeout > 0 else None
    if timeout is not None:
        print("Setting timeout: {}".format(timeout))
        # Apply the requested value rather than a fixed one, so the printed
        # setting matches what is actually in effect.
        socket.setdefaulttimeout(timeout)

    # Work on a copy so repeated calls do not accumulate regional entries in
    # the module-level URLS table.
    urls = dict(URLS)
    for region in args.region.strip().split(","):
        r = region.strip().lower()
        if not r:
            continue
        if r in REGIONAL_URLS:
            urls.update(REGIONAL_URLS[r])
        else:
            warnings.warn("Region {} do not need specific test, please refer to global sites.".format(r), stacklevel=2)

    for name, url in urls.items():
        test_connection(name, url, timeout)
|
|
|
|
|
|
|
|
def check_environment():
    """Print the environment variables relevant to the diagnosis.

    A variable is printed as ``NAME="value"`` when its name starts with
    ``VERL_``, ``OMP_`` or ``KMP_``, or is exactly ``CC`` or ``CXX``.
    """
    print("----------Environment----------")
    wanted_prefixes = ("VERL_", "OMP_", "KMP_")
    wanted_names = ("CC", "CXX")
    for key, value in os.environ.items():
        if not (key.startswith(wanted_prefixes) or key in wanted_names):
            continue
        print(f'{key}="{value}"')
|
|
|
|
|
|
|
|
def check_pip_package_versions():
    """Print the installed version of vllm, sglang, ray and torch, or "not found."."""
    for dist_name in ("vllm", "sglang", "ray", "torch"):
        try:
            status = importlib.metadata.version(dist_name)
        except importlib.metadata.PackageNotFoundError:
            status = "not found."
        print(f"{dist_name}\t : {status}")
|
|
|
|
|
|
|
|
def check_cuda_versions():
    """Print the CUDA version torch was built with and the local nvcc release line.

    Nothing beyond "CUDA is not available." is printed when
    ``torch.cuda.is_available()`` is false. A missing ``nvcc`` or any other
    failure is reported on stdout rather than raised.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    try:
        print(f"CUDA Runtime : {torch.version.cuda}")

        nvcc_text = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
        # The first line of the nvcc output that mentions "release" is used
        # as the compiler version.
        release_line = None
        for candidate in nvcc_text.splitlines():
            if "release" in candidate:
                release_line = candidate
                break

        if release_line:
            print(f"CUDA Compiler : {release_line.strip()}")
        else:
            print("Could not determine CUDA compiler version.")
    except FileNotFoundError as e:
        print(f"CUDA compiler : Not found: {e}")
    except Exception as e:
        print(f"An error occurred while checking CUDA versions: {e}")
|
|
|
|
|
|
|
|
def _get_cpu_memory():
    """
    Return the total memory reported by ``psutil.virtual_memory()``, in GB
    (total bytes divided by 1024**3).
    """
    bytes_per_gb = 1024**3
    total_bytes = psutil.virtual_memory().total
    return total_bytes / bytes_per_gb
|
|
|
|
|
|
|
|
def _get_gpu_info(): |
|
|
""" |
|
|
Get GPU type, GPU memory, and GPU count using nvidia-smi command. |
|
|
""" |
|
|
try: |
|
|
result = subprocess.run( |
|
|
["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader,nounits"], |
|
|
capture_output=True, |
|
|
text=True, |
|
|
check=True, |
|
|
) |
|
|
gpu_lines = result.stdout.strip().split("\n") |
|
|
gpu_count = len(gpu_lines) |
|
|
gpu_info = [] |
|
|
for line in gpu_lines: |
|
|
gpu_name, gpu_memory = line.split(", ") |
|
|
gpu_info.append( |
|
|
{ |
|
|
"type": gpu_name, |
|
|
"memory": float(gpu_memory) / 1024, |
|
|
} |
|
|
) |
|
|
return gpu_count, gpu_info |
|
|
except subprocess.CalledProcessError: |
|
|
print("Failed to execute nvidia-smi command.") |
|
|
return 0, [] |
|
|
|
|
|
|
|
|
def _get_system_info():
    """
    Collect CPU memory capacity together with the GPU count and per-GPU details.

    Returns a dict with the keys ``cpu_memory``, ``gpu_count`` and ``gpu_info``.
    """
    total_memory = _get_cpu_memory()
    device_count, devices = _get_gpu_info()
    return dict(cpu_memory=total_memory, gpu_count=device_count, gpu_info=devices)
|
|
|
|
|
|
|
|
def check_system_info():
    """Print CPU memory, GPU count and the type/memory of each GPU (numbered from 1)."""
    print("----------System Info----------")
    info = _get_system_info()
    print(f"CPU Memory\t: {info['cpu_memory']:.2f} GB")
    print(f"GPU Count\t: {info['gpu_count']}")
    for ordinal, device in enumerate(info["gpu_info"], start=1):
        print(f"GPU {ordinal}\tType : {device['type']}")
        print(f"GPU {ordinal}\tMemory : {device['memory']:.2f} GB")
|
|
|
|
|
|
|
|
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Every check is toggled by an integer flag (non-zero enables it); the
    network and hardware checks are off by default, the others are on.
    """
    parser = argparse.ArgumentParser(
        description="Diagnose script for checking the current system.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # (flag name, default) in the order the options are registered.
    toggles = (
        ("python", 1),
        ("pip", 1),
        ("verl", 1),
        ("system", 1),
        ("os", 1),
        ("environment", 1),
        ("network", 0),
        ("hardware", 0),
    )
    for flag, default in toggles:
        parser.add_argument(f"--{flag}", default=default, type=int, help=f"Diagnose {flag}.")

    parser.add_argument(
        "--region",
        type=str,
        default="",
        help="Additional sites in which region(s) to test. "
        "Specify 'cn' for example to test mirror sites in China.",
    )
    parser.add_argument("--timeout", type=int, default=10, help="Connection test timeout threshold, 0 to disable.")
    return parser.parse_args()
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    args = parse_args()

    # Each entry pairs a command-line toggle with the checks it enables; the
    # entries (and the checks inside an entry) run in the order listed.
    plan = (
        (args.python, (check_python,)),
        (args.pip, (check_pip, check_pip_package_versions)),
        (args.verl, (check_verl,)),
        (args.os, (check_os,)),
        (args.hardware, (check_hardware,)),
        (args.network, (lambda: check_network(args),)),
        (args.environment, (check_environment, check_cuda_versions)),
        (args.system, (check_system_info,)),
    )
    for enabled, checks in plan:
        if not enabled:
            continue
        for run_check in checks:
            run_check()
|
|
|