diff --git "a/slurm.out" "b/slurm.out" new file mode 100644--- /dev/null +++ "b/slurm.out" @@ -0,0 +1,3005 @@ +1: W1123 14:35:36.265000 2700626 torch/distributed/run.py:792] +1: W1123 14:35:36.265000 2700626 torch/distributed/run.py:792] ***************************************** +1: W1123 14:35:36.265000 2700626 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +1: W1123 14:35:36.265000 2700626 torch/distributed/run.py:792] ***************************************** +2: W1123 14:35:36.265000 1476038 torch/distributed/run.py:792] +2: W1123 14:35:36.265000 1476038 torch/distributed/run.py:792] ***************************************** +2: W1123 14:35:36.265000 1476038 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +2: W1123 14:35:36.265000 1476038 torch/distributed/run.py:792] ***************************************** +0: W1123 14:35:36.265000 3958233 torch/distributed/run.py:792] +0: W1123 14:35:36.265000 3958233 torch/distributed/run.py:792] ***************************************** +0: W1123 14:35:36.265000 3958233 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +0: W1123 14:35:36.265000 3958233 torch/distributed/run.py:792] ***************************************** +3: W1123 14:35:36.265000 1471917 torch/distributed/run.py:792] +3: W1123 14:35:36.265000 1471917 torch/distributed/run.py:792] ***************************************** +3: W1123 14:35:36.265000 1471917 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +3: W1123 14:35:36.265000 1471917 torch/distributed/run.py:792] ***************************************** +0: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3958317] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +0: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3958317] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +2: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:1476125] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +1: [2025-11-23 14:40:32,705] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2700705] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +3: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:1472009] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing` +2: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:1476125] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +1: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2700705] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +3: [2025-11-23 14:40:32,706] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:1472009] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing +0: [2025-11-23 14:41:03,102] [WARNING] [axolotl.utils.config.normalize_config:139] [PID:3958317] [RANK:0] Invalid value for save_steps (1.6666666666666667) from saves_per_epoch and/or num_epochs. Saving at training end only. +0: [2025-11-23 14:41:03,272] [INFO] [axolotl.cli.config.load_cfg:245] [PID:3958317] [RANK:0] config: +0: { +0: "activation_offloading": false, +0: "auto_resume_from_checkpoints": true, +0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1763904854646355858.yaml", +0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "batch_size": 16, +0: "bf16": true, +0: "capabilities": { +0: "bf16": true, +0: "compute_capability": "sm_90", +0: "fp8": false, +0: "n_gpu": 16, +0: "n_node": 1 +0: }, +0: "chat_template": "gemma3", +0: "context_parallel_size": 1, +0: "curriculum_sampling": true, +0: "dataloader_num_workers": 2, +0: "dataset_prepared_path": "/lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0.5", +0: "dataset_processes": 32, +0: "datasets": [ +0: { +0: "chat_template": "tokenizer_default", +0: "data_files": [ +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl" +0: ], +0: "ds_type": "json", +0: "field_messages": "conversations", +0: "message_property_mappings": { +0: "content": "content", +0: "role": "role" +0: }, +0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking", +0: "trust_remote_code": false, +0: "type": "chat_template" +0: }, +0: { +0: "chat_template": "tokenizer_default", +0: "data_files": [ +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0007.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0009.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0005.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0006.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0014.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0010.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0012.jsonl", +0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking/0008.jsonl" +0: ], +0: "ds_type": "json", +0: "field_messages": "conversations", +0: "message_property_mappings": { +0: "content": "content", +0: "role": "role" +0: }, +0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/thinking", +0: "trust_remote_code": false, +0: "type": "chat_template" +0: } +0: ], +0: "ddp": true, +0: "deepspeed": { +0: "bf16": { +0: "enabled": true +0: }, +0: "gradient_accumulation_steps": "auto", +0: "gradient_clipping": "auto", +0: "train_batch_size": "auto", +0: "train_micro_batch_size_per_gpu": "auto", +0: "wall_clock_breakdown": false, +0: "zero_optimization": { +0: "contiguous_gradients": true, +0: "overlap_comm": true, +0: "reduce_bucket_size": "auto", +0: "stage": 3, +0: "stage3_gather_16bit_weights_on_model_save": true, +0: "stage3_param_persistence_threshold": "auto", +0: "stage3_prefetch_bucket_size": "auto", +0: "sub_group_size": 0 +0: } +0: }, +0: "device": "cuda:0", +0: "device_map": { +0: "": 0 +0: }, +0: "dion_rank_fraction": 1.0, +0: "dion_rank_multiple_of": 1, +0: "env_capabilities": { +0: "torch_version": "2.6.0" +0: }, +0: "eot_tokens": [ +0: "" +0: ], +0: "eval_batch_size": 1, +0: "eval_causal_lm_metrics": [ +0: "sacrebleu", +0: "comet", +0: "ter", +0: "chrf" +0: ], +0: "eval_max_new_tokens": 128, +0: "eval_sample_packing": true, +0: "eval_table_size": 0, +0: "evals_per_epoch": 0, +0: "flash_attention": true, +0: "fp16": false, +0: "gradient_accumulation_steps": 1, +0: "gradient_checkpointing": true, +0: "gradient_checkpointing_kwargs": { +0: "use_reentrant": true +0: }, +0: "is_multimodal": true, +0: "learning_rate": 5e-06, +0: "lisa_layers_attribute": "model.layers", +0: "load_best_model_at_end": false, +0: "load_in_4bit": false, +0: "load_in_8bit": false, +0: "local_rank": 0, +0: "logging_steps": 10, +0: "lora_dropout": 0.0, +0: "loraplus_lr_embedding": 1e-06, +0: "lr_scheduler": "warmup_stable_decay", +0: "lr_scheduler_kwargs": { +0: "min_lr_ratio": 0.1, +0: "num_decay_steps": 200 +0: }, +0: "max_prompt_len": 512, +0: "mean_resizing_embeddings": false, +0: "micro_batch_size": 1, +0: "model_config_type": "gemma3", +0: "num_epochs": 0.6, +0: "optimizer": "adamw_torch_fused", +0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-4b/0.5", +0: "pad_to_sequence_len": true, +0: "pretrain_multipack_attn": true, +0: "pretrain_multipack_buffer_size": 10000, +0: "processor_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-4b", +0: "profiler_steps_start": 0, +0: "qlora_sharded_model_loading": false, +0: "ray_num_workers": 1, +0: "resources_per_worker": { +0: "GPU": 1 +0: }, +0: "sample_packing": true, +0: "sample_packing_bin_size": 200, +0: "sample_packing_group_size": 100000, +0: "sample_packing_sequentially": true, +0: "save_only_model": true, +0: "save_safetensors": true, +0: "save_total_limit": 20, +0: "saves_per_epoch": 1, +0: "sequence_len": 16384, +0: "shuffle_before_merging_datasets": true, +0: "shuffle_merged_datasets": false, +0: "skip_prepare_dataset": false, +0: "strict": false, +0: "tensor_parallel_size": 1, +0: "tf32": false, +0: "tiled_mlp_use_original_mlp": true, +0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b", +0: "torch_dtype": "torch.bfloat16", +0: "train_on_inputs": false, +0: "trl": { +0: "log_completions": false, +0: "mask_truncated_completions": false, +0: "ref_model_mixup_alpha": 0.9, +0: "ref_model_sync_steps": 64, +0: "scale_rewards": true, +0: "sync_ref_model": false, +0: "use_vllm": false, +0: "vllm_server_host": "0.0.0.0", +0: "vllm_server_port": 8000 +0: }, +0: "use_ray": false, +0: "use_tensorboard": true, +0: "val_set_size": 0.0, +0: "vllm": { +0: "device": "auto", +0: "dtype": "auto", +0: "gpu_memory_utilization": 0.9, +0: "host": "0.0.0.0", +0: "port": 8000 +0: }, +0: "warmup_steps": 100, +0: "weight_decay": 0.0, +0: "world_size": 16 +0: } +0: [2025-11-23 14:41:03,273] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:3958317] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used. +2: [2025-11-23 14:41:05,147] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:1476128] [RANK:3] Loading raw datasets... +2: Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 4638 examples [00:00, 12849.35 examples/s] Generating train split: 9310 examples [00:00, 21307.04 examples/s] Generating train split: 28012 examples [00:00, 60097.01 examples/s] Generating train split: 39449 examples [00:00, 64467.45 examples/s] Generating train split: 53392 examples [00:00, 69259.07 examples/s] Generating train split: 69660 examples [00:01, 85958.64 examples/s] Generating train split: 88447 examples [00:01, 80181.70 examples/s] Generating train split: 109184 examples [00:01, 85361.85 examples/s] Generating train split: 123087 examples [00:01, 86829.67 examples/s] Generating train split: 139318 examples [00:01, 97144.19 examples/s] Generating train split: 157992 examples [00:02, 92776.98 examples/s] Generating train split: 178803 examples [00:02, 99095.05 examples/s] Generating train split: 192677 examples [00:02, 98292.92 examples/s] Generating train split: 208978 examples [00:02, 106980.59 examples/s] Generati +2: ng train split: 227918 examples [00:02, 91560.59 examples/s] Generating train split: 241725 examples [00:02, 95407.49 examples/s] Generating train split: 253114 examples [00:03, 81285.99 examples/s] Generating train split: 271907 examples [00:03, 93881.27 examples/s] Generating train split: 278638 examples [00:03, 84392.86 examples/s] +2: [2025-11-23 14:41:10,911] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:1476128] [RANK:3] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking with base_type: chat_template and prompt_style: None +2: Tokenizing Prompts (num_proc=32): 0%| | 0/278638 [00:0016384) (num_proc=32): 0%| | 0/557277 [00:0016384) (num_proc=32): 0%| | 1000/557277 [00:00<05:10, 1789.00 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 2%|▏ | 9000/557277 [00:00<00:31, 17564.32 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 3%|▎ | 17000/557277 [00:00<00:17, 30485.42 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 5%|▍ | 26000/557277 [00:00<00:12, 42560.60 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 6%|▋ | 36000/557277 [00:01<00:09, 56255.84 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 8%|▊ | 44000/557277 [00:01<00:08, 57586.87 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 9%|▉ | 52000/557277 [00:01<00:13, 37965.69 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 11%|█▏ | 64000/557277 [00:01<00:09, 49547.77 examples/s] Dropping +2: Long Sequences (>16384) (num_proc=32): 15%|█▍ | 81000/557277 [00:01<00:06, 72009.60 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 17%|█▋ | 94000/557277 [00:01<00:05, 82752.52 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 19%|█▉ | 105000/557277 [00:01<00:05, 88071.49 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 21%|██ | 116000/557277 [00:02<00:05, 86115.20 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 23%|██▎ | 126000/557277 [00:02<00:05, 79984.76 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 25%|██▍ | 137000/557277 [00:02<00:04, 86364.39 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 27%|██▋ | 148000/557277 [00:02<00:04, 91588.40 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 29%|██▊ | 160000/557277 [00:02<00:04, 98327.37 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 31%|███ | 174000/5572 +2: 77 [00:02<00:03, 109147.47 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 34%|███▎ | 187000/557277 [00:02<00:03, 107049.50 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 36%|███▌ | 199000/557277 [00:02<00:03, 102808.68 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 38%|███▊ | 210000/557277 [00:03<00:03, 96100.64 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 39%|███▉ | 220000/557277 [00:03<00:03, 91287.20 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 41%|████▏ | 230000/557277 [00:03<00:03, 92769.75 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 43%|████▎ | 242000/557277 [00:03<00:03, 99103.41 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 46%|████▌ | 256000/557277 [00:03<00:02, 105534.12 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 49%|████▊ | 271000/557277 [00:03<00:02, 114584.42 examples +2: /s] Dropping Long Sequences (>16384) (num_proc=32): 51%|█████ | 283000/557277 [00:03<00:02, 101016.54 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 53%|█████▎ | 294000/557277 [00:03<00:02, 101394.75 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 55%|█████▍ | 305000/557277 [00:03<00:02, 96302.89 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 57%|█████▋ | 315075/557277 [00:04<00:02, 91299.86 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 58%|█████▊ | 324395/557277 [00:04<00:02, 85868.99 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 60%|█████▉ | 333640/557277 [00:04<00:04, 55552.64 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 61%|██████ | 340640/557277 [00:04<00:04, 49709.14 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 62%|██████▏ | 346640/557277 [00:05<00:08, 26019.59 examples/s] Dropping +2: Long Sequences (>16384) (num_proc=32): 63%|██████▎ | 352640/557277 [00:05<00:07, 28687.90 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 64%|██████▍ | 357640/557277 [00:05<00:06, 30991.24 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 65%|██████▌ | 362640/557277 [00:06<00:09, 19577.30 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 66%|██████▌ | 366640/557277 [00:06<00:08, 21472.69 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 67%|██████▋ | 372640/557277 [00:06<00:07, 24961.69 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 68%|██████▊ | 376640/557277 [00:06<00:09, 18496.03 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 68%|██████▊ | 379640/557277 [00:07<00:09, 17790.28 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 69%|██████▉ | 383640/557277 [00:07<00:08, 20499.73 examples/s] Dropping +2: Long Sequences (>16384) (num_proc=32): 70%|██████▉ | 388640/557277 [00:07<00:07, 23658.68 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 70%|███████ | 391640/557277 [00:07<00:08, 19359.88 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 71%|███████ | 394640/557277 [00:07<00:10, 15691.44 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 71%|███████▏ | 397640/557277 [00:07<00:08, 17819.58 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 73%|███████▎ | 404640/557277 [00:08<00:06, 23883.45 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 73%|███████▎ | 407640/557277 [00:08<00:07, 20838.06 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 74%|███████▎ | 410640/557277 [00:08<00:09, 15836.38 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 74%|███████▍ | 413640/557277 [00:08<00:08, 17791.18 examples/s] +2: Dropping Long Sequences (>16384) (num_proc=32): 75%|███████▌ | 419640/557277 [00:08<00:05, 25250.13 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 76%|███████▌ | 423640/557277 [00:09<00:06, 19899.78 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 77%|███████▋ | 426640/557277 [00:09<00:07, 16481.45 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 77%|███████▋ | 429640/557277 [00:09<00:07, 17802.33 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 78%|███████▊ | 435640/557277 [00:09<00:05, 23702.81 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 79%|███████▊ | 438640/557277 [00:10<00:06, 18544.97 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 79%|███████▉ | 441640/557277 [00:10<00:07, 16402.49 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 80%|███████▉ | 445640/557277 [00:10<00:06, 1788 +2: 3.40 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 81%|████████ | 451640/557277 [00:10<00:04, 23941.46 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 82%|████████▏ | 454640/557277 [00:10<00:05, 18890.17 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 82%|████████▏ | 457640/557277 [00:11<00:06, 15984.80 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 83%|████████▎ | 461640/557277 [00:11<00:05, 19090.74 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 83%|████████▎ | 464640/557277 [00:11<00:04, 20939.59 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 84%|████████▍ | 468640/557277 [00:11<00:03, 23009.39 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 85%|████████▍ | 471640/557277 [00:11<00:04, 18121.65 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 85%|████████▌ | 47 +2: 4640/557277 [00:12<00:05, 16217.02 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 86%|████████▌ | 477640/557277 [00:12<00:04, 17747.63 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 86%|████████▋ | 481640/557277 [00:12<00:03, 21616.92 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 87%|████████▋ | 484640/557277 [00:12<00:03, 23369.88 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 88%|████████▊ | 487640/557277 [00:12<00:04, 16781.47 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 88%|████████▊ | 490640/557277 [00:12<00:04, 16650.16 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 89%|████████▊ | 493640/557277 [00:12<00:03, 18552.48 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 89%|████████▉ | 496640/557277 [00:13<00:03, 19331.92 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 90%| +2: ████████▉ | 500640/557277 [00:13<00:02, 22921.87 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 90%|█████████ | 503640/557277 [00:13<00:03, 17168.86 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 91%|█████████ | 506640/557277 [00:13<00:02, 17101.93 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 91%|█████████▏| 508640/557277 [00:13<00:02, 17331.03 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 92%|█████████▏| 510640/557277 [00:13<00:02, 17529.32 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 93%|█████████▎| 515640/557277 [00:14<00:01, 24375.40 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 93%|█████████▎| 518640/557277 [00:14<00:02, 17698.47 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 94%|█████████▎| 521640/557277 [00:14<00:01, 18166.50 examples/s] Dropping Lo +2: ng Sequences (>16384) (num_proc=32): 94%|█████████▍| 524640/557277 [00:14<00:01, 17556.95 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 95%|█████████▍| 526640/557277 [00:14<00:01, 17588.01 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 96%|█████████▌| 532640/557277 [00:14<00:00, 25624.74 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 96%|█████████▌| 535640/557277 [00:15<00:01, 19777.39 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 97%|█████████▋| 538640/557277 [00:15<00:00, 19240.06 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 97%|█████████▋| 541055/557277 [00:15<00:00, 17734.85 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 98%|█████████▊| 544470/557277 [00:15<00:00, 19553.35 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 99%|█████████▊| 549714/55727 +2: 7 [00:15<00:00, 23559.53 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 99%|█████████▉| 552958/557277 [00:15<00:00, 23149.39 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|█████████▉| 556033/557277 [00:16<00:00, 24613.07 examples/s] Dropping Long Sequences (>16384) (num_proc=32): 100%|██████████| 557277/557277 [00:16<00:00, 33865.07 examples/s] +2: Drop Samples with Zero Trainable Tokens (num_proc=32): 0%| | 0/553055 [00:00 +0: jzxh251:3958317:3958317 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh251:3958317:3958317 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh251:3958317:3958317 [0] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh251:3958317:3958317 [0] NCCL INFO cudaDriverVersion 12080 +0: NCCL version 2.21.5+cuda12.4 +0: jzxh251:3958317:3958317 [0] NCCL INFO Comm config Blocking set to 1 +0: jzxh251:3958318:3958318 [1] NCCL INFO cudaDriverVersion 12080 +0: jzxh251:3958318:3958318 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.233<0> +0: jzxh251:3958318:3958318 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh251:3958318:3958318 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh251:3958318:3958318 [1] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh251:3958318:3958318 [1] NCCL INFO Comm config Blocking set to 1 +0: jzxh251:3958319:3958319 [2] NCCL INFO cudaDriverVersion 12080 +0: jzxh251:3958320:3958320 [3] NCCL INFO cudaDriverVersion 12080 +2: jzxh253:1476125:1476125 [0] NCCL INFO cudaDriverVersion 12080 +0: jzxh251:3958320:3958320 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.233<0> +0: jzxh251:3958319:3958319 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.233<0> +0: jzxh251:3958320:3958320 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh251:3958320:3958320 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh251:3958320:3958320 [3] NCCL INFO NET/Plugin: Using internal network plugin. +0: jzxh251:3958319:3958319 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +0: jzxh251:3958319:3958319 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +0: jzxh251:3958319:3958319 [2] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh252:2700705:2700705 [0] NCCL INFO cudaDriverVersion 12080 +1: jzxh252:2700705:2700705 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.237<0> +0: jzxh251:3958320:3958320 [3] NCCL INFO Comm config Blocking set to 1 +0: jzxh251:3958319:3958319 [2] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472009:1472009 [0] NCCL INFO cudaDriverVersion 12080 +3: jzxh254:1472009:1472009 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.245<0> +2: jzxh253:1476125:1476125 [0] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.241<0> +2: jzxh253:1476127:1476127 [2] NCCL INFO cudaDriverVersion 12080 +1: jzxh252:2700705:2700705 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh252:2700705:2700705 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh252:2700705:2700705 [0] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh252:2700706:2700706 [1] NCCL INFO cudaDriverVersion 12080 +3: jzxh254:1472009:1472009 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh254:1472009:1472009 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh254:1472009:1472009 [0] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476125:1476125 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh254:1472011:1472011 [2] NCCL INFO cudaDriverVersion 12080 +2: jzxh253:1476125:1476125 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476125:1476125 [0] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476127:1476127 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.241<0> +2: jzxh253:1476127:1476127 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh253:1476127:1476127 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476127:1476127 [2] NCCL INFO NET/Plugin: Using internal network plugin. +1: jzxh252:2700708:2700708 [3] NCCL INFO cudaDriverVersion 12080 +1: jzxh252:2700706:2700706 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.237<0> +1: jzxh252:2700705:2700705 [0] NCCL INFO Comm config Blocking set to 1 +1: jzxh252:2700706:2700706 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh252:2700706:2700706 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh252:2700706:2700706 [1] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh254:1472010:1472010 [1] NCCL INFO cudaDriverVersion 12080 +1: jzxh252:2700708:2700708 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.237<0> +1: jzxh252:2700707:2700707 [2] NCCL INFO cudaDriverVersion 12080 +3: jzxh254:1472009:1472009 [0] NCCL INFO Comm config Blocking set to 1 +1: jzxh252:2700708:2700708 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh252:2700708:2700708 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476128:1476128 [3] NCCL INFO cudaDriverVersion 12080 +1: jzxh252:2700708:2700708 [3] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476125:1476125 [0] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472011:1472011 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.245<0> +3: jzxh254:1472010:1472010 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.245<0> +1: jzxh252:2700707:2700707 [2] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.237<0> +2: jzxh253:1476126:1476126 [1] NCCL INFO cudaDriverVersion 12080 +2: jzxh253:1476127:1476127 [2] NCCL INFO Comm config Blocking set to 1 +1: jzxh252:2700706:2700706 [1] NCCL INFO Comm config Blocking set to 1 +1: jzxh252:2700707:2700707 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +1: jzxh252:2700707:2700707 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +1: jzxh252:2700707:2700707 [2] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh254:1472012:1472012 [3] NCCL INFO cudaDriverVersion 12080 +3: jzxh254:1472011:1472011 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh254:1472011:1472011 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh254:1472011:1472011 [2] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh254:1472010:1472010 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh254:1472010:1472010 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +3: jzxh254:1472010:1472010 [1] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476126:1476126 [1] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.241<0> +2: jzxh253:1476128:1476128 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.241<0> +1: jzxh252:2700708:2700708 [3] NCCL INFO Comm config Blocking set to 1 +2: jzxh253:1476126:1476126 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh253:1476126:1476126 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476126:1476126 [1] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476128:1476128 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +2: jzxh253:1476128:1476128 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476128:1476128 [3] NCCL INFO NET/Plugin: Using internal network plugin. +3: jzxh254:1472011:1472011 [2] NCCL INFO Comm config Blocking set to 1 +1: jzxh252:2700707:2700707 [2] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472010:1472010 [1] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472012:1472012 [3] NCCL INFO Bootstrap : Using ibp24s0:10.100.7.245<0> +3: jzxh254:1472012:1472012 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) +3: jzxh254:1472012:1472012 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so +2: jzxh253:1476126:1476126 [1] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472012:1472012 [3] NCCL INFO NET/Plugin: Using internal network plugin. +2: jzxh253:1476128:1476128 [3] NCCL INFO Comm config Blocking set to 1 +3: jzxh254:1472012:1472012 [3] NCCL INFO Comm config Blocking set to 1 +0: jzxh251:3958318:3959113 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.233<0> +0: jzxh251:3958318:3959113 [1] NCCL INFO Using non-device net plugin version 0 +0: jzxh251:3958318:3959113 [1] NCCL INFO Using network IB +0: jzxh251:3958317:3959112 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.233<0> +0: jzxh251:3958320:3959114 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.233<0> +0: jzxh251:3958317:3959112 [0] NCCL INFO Using non-device net plugin version 0 +0: jzxh251:3958317:3959112 [0] NCCL INFO Using network IB +0: jzxh251:3958319:3959115 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.233<0> +0: jzxh251:3958320:3959114 [3] NCCL INFO Using non-device net plugin version 0 +0: jzxh251:3958320:3959114 [3] NCCL INFO Using network IB +0: jzxh251:3958319:3959115 [2] NCCL INFO Using non-device net plugin version 0 +0: jzxh251:3958319:3959115 [2] NCCL INFO Using network IB +2: jzxh253:1476127:1478099 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.241<0> +2: jzxh253:1476125:1478098 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.241<0> +2: jzxh253:1476128:1478101 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.241<0> +2: jzxh253:1476126:1478100 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.241<0> +2: jzxh253:1476127:1478099 [2] NCCL INFO Using non-device net plugin version 0 +2: jzxh253:1476125:1478098 [0] NCCL INFO Using non-device net plugin version 0 +2: jzxh253:1476127:1478099 [2] NCCL INFO Using network IB +2: jzxh253:1476125:1478098 [0] NCCL INFO Using network IB +2: jzxh253:1476128:1478101 [3] NCCL INFO Using non-device net plugin version 0 +2: jzxh253:1476128:1478101 [3] NCCL INFO Using network IB +2: jzxh253:1476126:1478100 [1] NCCL INFO Using non-device net plugin version 0 +2: jzxh253:1476126:1478100 [1] NCCL INFO Using network IB +1: jzxh252:2700707:2701505 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.237<0> +1: jzxh252:2700705:2701502 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.237<0> +1: jzxh252:2700706:2701503 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.237<0> +1: jzxh252:2700707:2701505 [2] NCCL INFO Using non-device net plugin version 0 +1: jzxh252:2700707:2701505 [2] NCCL INFO Using network IB +1: jzxh252:2700705:2701502 [0] NCCL INFO Using non-device net plugin version 0 +1: jzxh252:2700705:2701502 [0] NCCL INFO Using network IB +1: jzxh252:2700708:2701504 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.237<0> +1: jzxh252:2700706:2701503 [1] NCCL INFO Using non-device net plugin version 0 +1: jzxh252:2700706:2701503 [1] NCCL INFO Using network IB +1: jzxh252:2700708:2701504 [3] NCCL INFO Using non-device net plugin version 0 +1: jzxh252:2700708:2701504 [3] NCCL INFO Using network IB +3: jzxh254:1472009:1472810 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.245<0> +3: jzxh254:1472010:1472812 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.245<0> +3: jzxh254:1472011:1472811 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.245<0> +3: jzxh254:1472009:1472810 [0] NCCL INFO Using non-device net plugin version 0 +3: jzxh254:1472009:1472810 [0] NCCL INFO Using network IB +3: jzxh254:1472010:1472812 [1] NCCL INFO Using non-device net plugin version 0 +3: jzxh254:1472010:1472812 [1] NCCL INFO Using network IB +3: jzxh254:1472012:1472813 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp24s0:10.100.7.245<0> +3: jzxh254:1472011:1472811 [2] NCCL INFO Using non-device net plugin version 0 +3: jzxh254:1472011:1472811 [2] NCCL INFO Using network IB +3: jzxh254:1472012:1472813 [3] NCCL INFO Using non-device net plugin version 0 +3: jzxh254:1472012:1472813 [3] NCCL INFO Using network IB +0: jzxh251:3958317:3959112 [0] NCCL INFO ncclCommInitRank comm 0x55ad9c344130 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init START +0: jzxh251:3958319:3959115 [2] NCCL INFO ncclCommInitRank comm 0x557a67ebf3e0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init START +0: jzxh251:3958320:3959114 [3] NCCL INFO ncclCommInitRank comm 0x5627d69ed2e0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init START +0: jzxh251:3958318:3959113 [1] NCCL INFO ncclCommInitRank comm 0x561703831fa0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init START +3: jzxh254:1472011:1472811 [2] NCCL INFO ncclCommInitRank comm 0x55ee68f2ae50 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init START +3: jzxh254:1472012:1472813 [3] NCCL INFO ncclCommInitRank comm 0x55e3cd6150e0 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init START +1: jzxh252:2700705:2701502 [0] NCCL INFO ncclCommInitRank comm 0x55691e321820 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init START +1: jzxh252:2700706:2701503 [1] NCCL INFO ncclCommInitRank comm 0x55d9b6dce4a0 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init START +3: jzxh254:1472010:1472812 [1] NCCL INFO ncclCommInitRank comm 0x558d1e103f20 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init START +2: jzxh253:1476125:1478098 [0] NCCL INFO ncclCommInitRank comm 0x55705cd0ee30 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init START +2: jzxh253:1476127:1478099 [2] NCCL INFO ncclCommInitRank comm 0x555b4b77d2b0 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init START +2: jzxh253:1476128:1478101 [3] NCCL INFO ncclCommInitRank comm 0x55c13cf8e7e0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init START +2: jzxh253:1476126:1478100 [1] NCCL INFO ncclCommInitRank comm 0x559d74834b90 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init START +3: jzxh254:1472009:1472810 [0] NCCL INFO ncclCommInitRank comm 0x5603bb4b7f00 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init START +1: jzxh252:2700707:2701505 [2] NCCL INFO ncclCommInitRank comm 0x563d58040fd0 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init START +1: jzxh252:2700708:2701504 [3] NCCL INFO ncclCommInitRank comm 0x55e552323b50 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init START +3: jzxh254:1472009:1472810 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +3: jzxh254:1472009:1472810 [0] NCCL INFO NVLS multicast support is not available on dev 0 +1: jzxh252:2700705:2701502 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +1: jzxh252:2700705:2701502 [0] NCCL INFO NVLS multicast support is not available on dev 0 +0: jzxh251:3958317:3959112 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +0: jzxh251:3958317:3959112 [0] NCCL INFO NVLS multicast support is not available on dev 0 +2: jzxh253:1476128:1478101 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +2: jzxh253:1476128:1478101 [3] NCCL INFO NVLS multicast support is not available on dev 3 +3: jzxh254:1472011:1472811 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +3: jzxh254:1472011:1472811 [2] NCCL INFO NVLS multicast support is not available on dev 2 +1: jzxh252:2700707:2701505 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +1: jzxh252:2700707:2701505 [2] NCCL INFO NVLS multicast support is not available on dev 2 +3: jzxh254:1472012:1472813 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +3: jzxh254:1472012:1472813 [3] NCCL INFO NVLS multicast support is not available on dev 3 +0: jzxh251:3958319:3959115 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +1: jzxh252:2700708:2701504 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +1: jzxh252:2700708:2701504 [3] NCCL INFO NVLS multicast support is not available on dev 3 +0: jzxh251:3958319:3959115 [2] NCCL INFO NVLS multicast support is not available on dev 2 +3: jzxh254:1472010:1472812 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +3: jzxh254:1472010:1472812 [1] NCCL INFO NVLS multicast support is not available on dev 1 +1: jzxh252:2700706:2701503 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +1: jzxh252:2700706:2701503 [1] NCCL INFO NVLS multicast support is not available on dev 1 +0: jzxh251:3958318:3959113 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +0: jzxh251:3958318:3959113 [1] NCCL INFO NVLS multicast support is not available on dev 1 +0: jzxh251:3958320:3959114 [3] NCCL INFO Setting affinity for GPU 3 to ffffff00,00000000,00000000,ffffff00,00000000,00000000 +0: jzxh251:3958320:3959114 [3] NCCL INFO NVLS multicast support is not available on dev 3 +2: jzxh253:1476127:1478099 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffff0000,00000000,000000ff,ffff0000,00000000 +2: jzxh253:1476127:1478099 [2] NCCL INFO NVLS multicast support is not available on dev 2 +2: jzxh253:1476125:1478098 [0] NCCL INFO Setting affinity for GPU 0 to ffffff,00000000,00000000,00ffffff +2: jzxh253:1476125:1478098 [0] NCCL INFO NVLS multicast support is not available on dev 0 +2: jzxh253:1476126:1478100 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000,00000000,0000ffff,ff000000 +2: jzxh253:1476126:1478100 [1] NCCL INFO NVLS multicast support is not available on dev 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO comm 0x55ad9c344130 rank 0 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +0: jzxh251:3958318:3959113 [1] NCCL INFO comm 0x561703831fa0 rank 1 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 01/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 02/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 03/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 05/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 06/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 07/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958320:3959114 [3] NCCL INFO comm 0x5627d69ed2e0 rank 3 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +0: jzxh251:3958319:3959115 [2] NCCL INFO comm 0x557a67ebf3e0 rank 2 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +0: jzxh251:3958318:3959113 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] 3/-1/-1->1->0 [3] 0/-1/-1->1->3 [4] -1/-1/-1->1->2 [5] 3/9/-1->1->-1 [6] -1/-1/-1->1->3 [7] 0/-1/-1->1->2 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->5 [10] 3/-1/-1->1->0 [11] 0/-1/-1->1->3 [12] -1/-1/-1->1->2 [13] 3/-1/-1->1->5 [14] -1/-1/-1->1->3 [15] 0/-1/-1->1->2 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958318:3959113 [1] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 09/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 10/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 11/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 13/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 14/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958320:3959114 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] 0/-1/-1->3->2 [2] -1/-1/-1->3->1 [3] 1/11/-1->3->-1 [4] 2/-1/-1->3->0 [5] 0/-1/-1->3->1 [6] 1/-1/-1->3->0 [7] 2/11/-1->3->-1 [8] -1/-1/-1->3->2 [9] 0/-1/-1->3->2 [10] -1/-1/-1->3->1 [11] 1/-1/-1->3->7 [12] 2/-1/-1->3->0 [13] 0/-1/-1->3->1 [14] 1/-1/-1->3->0 [15] 2/-1/-1->3->7 +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 15/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958319:3959115 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 0/10/-1->2->-1 [3] -1/-1/-1->2->0 [4] 1/-1/-1->2->3 [5] -1/-1/-1->2->0 [6] 0/10/-1->2->-1 [7] 1/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 0/-1/-1->2->6 [11] -1/-1/-1->2->0 [12] 1/-1/-1->2->3 [13] -1/-1/-1->2->0 [14] 0/-1/-1->2->6 [15] 1/-1/-1->2->3 +0: jzxh251:3958320:3959114 [3] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958319:3959115 [2] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472012:1472813 [3] NCCL INFO comm 0x55e3cd6150e0 rank 15 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +0: jzxh251:3958317:3959112 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->3 [2] 1/-1/-1->0->2 [3] 2/-1/-1->0->1 [4] 3/8/-1->0->-1 [5] 2/-1/-1->0->3 [6] 3/-1/-1->0->2 [7] -1/-1/-1->0->1 [8] 1/-1/-1->0->4 [9] -1/-1/-1->0->3 [10] 1/-1/-1->0->2 [11] 2/-1/-1->0->1 [12] 3/-1/-1->0->4 [13] 2/-1/-1->0->3 [14] 3/-1/-1->0->2 [15] -1/-1/-1->0->1 +0: jzxh251:3958317:3959112 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472012:1472813 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 12/-1/-1->15->14 [2] -1/-1/-1->15->13 [3] 13/-1/-1->15->11 [4] 14/-1/-1->15->12 [5] 12/-1/-1->15->13 [6] 13/-1/-1->15->12 [7] 14/-1/-1->15->11 [8] -1/-1/-1->15->14 [9] 12/-1/-1->15->14 [10] -1/-1/-1->15->13 [11] 13/7/-1->15->-1 [12] 14/-1/-1->15->12 [13] 12/-1/-1->15->13 [14] 13/-1/-1->15->12 [15] 14/7/-1->15->-1 +3: jzxh254:1472012:1472813 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472011:1472811 [2] NCCL INFO comm 0x55ee68f2ae50 rank 14 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +1: jzxh252:2700705:2701502 [0] NCCL INFO comm 0x55691e321820 rank 4 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +1: jzxh252:2700706:2701503 [1] NCCL INFO comm 0x55d9b6dce4a0 rank 5 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +1: jzxh252:2700707:2701505 [2] NCCL INFO comm 0x563d58040fd0 rank 6 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +1: jzxh252:2700708:2701504 [3] NCCL INFO comm 0x55e552323b50 rank 7 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +3: jzxh254:1472011:1472811 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 12/-1/-1->14->10 [3] -1/-1/-1->14->12 [4] 13/-1/-1->14->15 [5] -1/-1/-1->14->12 [6] 12/-1/-1->14->10 [7] 13/-1/-1->14->15 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 12/6/-1->14->-1 [11] -1/-1/-1->14->12 [12] 13/-1/-1->14->15 [13] -1/-1/-1->14->12 [14] 12/6/-1->14->-1 [15] 13/-1/-1->14->15 +3: jzxh254:1472011:1472811 [2] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472010:1472812 [1] NCCL INFO comm 0x558d1e103f20 rank 13 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +2: jzxh253:1476125:1478098 [0] NCCL INFO comm 0x55705cd0ee30 rank 8 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +2: jzxh253:1476128:1478101 [3] NCCL INFO comm 0x55c13cf8e7e0 rank 11 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +2: jzxh253:1476127:1478099 [2] NCCL INFO comm 0x555b4b77d2b0 rank 10 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +2: jzxh253:1476126:1478100 [1] NCCL INFO comm 0x559d74834b90 rank 9 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +2: jzxh253:1476125:1478098 [0] NCCL INFO Trees [0] 9/4/12->8->0 [1] -1/-1/-1->8->11 [2] 9/-1/-1->8->10 [3] 10/-1/-1->8->9 [4] 11/4/12->8->0 [5] 10/-1/-1->8->11 [6] 11/-1/-1->8->10 [7] -1/-1/-1->8->9 [8] 9/-1/-1->8->4 [9] -1/-1/-1->8->11 [10] 9/-1/-1->8->10 [11] 10/-1/-1->8->9 [12] 11/-1/-1->8->4 [13] 10/-1/-1->8->11 [14] 11/-1/-1->8->10 [15] -1/-1/-1->8->9 +2: jzxh253:1476125:1478098 [0] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476127:1478099 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 8/6/14->10->2 [3] -1/-1/-1->10->8 [4] 9/-1/-1->10->11 [5] -1/-1/-1->10->8 [6] 8/6/14->10->2 [7] 9/-1/-1->10->11 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 8/-1/-1->10->6 [11] -1/-1/-1->10->8 [12] 9/-1/-1->10->11 [13] -1/-1/-1->10->8 [14] 8/-1/-1->10->6 [15] 9/-1/-1->10->11 +2: jzxh253:1476128:1478101 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] 8/-1/-1->11->10 [2] -1/-1/-1->11->9 [3] 9/7/15->11->3 [4] 10/-1/-1->11->8 [5] 8/-1/-1->11->9 [6] 9/-1/-1->11->8 [7] 10/7/15->11->3 [8] -1/-1/-1->11->10 [9] 8/-1/-1->11->10 [10] -1/-1/-1->11->9 [11] 9/-1/-1->11->7 [12] 10/-1/-1->11->8 [13] 8/-1/-1->11->9 [14] 9/-1/-1->11->8 [15] 10/-1/-1->11->7 +3: jzxh254:1472010:1472812 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->9 [2] 15/-1/-1->13->12 [3] 12/-1/-1->13->15 [4] -1/-1/-1->13->14 [5] 15/-1/-1->13->9 [6] -1/-1/-1->13->15 [7] 12/-1/-1->13->14 [8] 14/-1/-1->13->12 [9] 14/5/-1->13->-1 [10] 15/-1/-1->13->12 [11] 12/-1/-1->13->15 [12] -1/-1/-1->13->14 [13] 15/5/-1->13->-1 [14] -1/-1/-1->13->15 [15] 12/-1/-1->13->14 +3: jzxh254:1472010:1472812 [1] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472009:1472810 [0] NCCL INFO comm 0x5603bb4b7f00 rank 12 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +3: jzxh254:1472009:1472810 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] -1/-1/-1->12->15 [2] 13/-1/-1->12->14 [3] 14/-1/-1->12->13 [4] 15/-1/-1->12->8 [5] 14/-1/-1->12->15 [6] 15/-1/-1->12->14 [7] -1/-1/-1->12->13 [8] 13/4/-1->12->-1 [9] -1/-1/-1->12->15 [10] 13/-1/-1->12->14 [11] 14/-1/-1->12->13 [12] 15/4/-1->12->-1 [13] 14/-1/-1->12->15 [14] 15/-1/-1->12->14 [15] -1/-1/-1->12->13 +3: jzxh254:1472009:1472810 [0] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476126:1478100 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/5/13->9->1 [2] 11/-1/-1->9->8 [3] 8/-1/-1->9->11 [4] -1/-1/-1->9->10 [5] 11/5/13->9->1 [6] -1/-1/-1->9->11 [7] 8/-1/-1->9->10 [8] 10/-1/-1->9->8 [9] 10/-1/-1->9->5 [10] 11/-1/-1->9->8 [11] 8/-1/-1->9->11 [12] -1/-1/-1->9->10 [13] 11/-1/-1->9->5 [14] -1/-1/-1->9->11 [15] 8/-1/-1->9->10 +2: jzxh253:1476127:1478099 [2] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476128:1478101 [3] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476126:1478100 [1] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700705:2701502 [0] NCCL INFO Trees [0] 5/-1/-1->4->8 [1] -1/-1/-1->4->7 [2] 5/-1/-1->4->6 [3] 6/-1/-1->4->5 [4] 7/-1/-1->4->8 [5] 6/-1/-1->4->7 [6] 7/-1/-1->4->6 [7] -1/-1/-1->4->5 [8] 5/8/0->4->12 [9] -1/-1/-1->4->7 [10] 5/-1/-1->4->6 [11] 6/-1/-1->4->5 [12] 7/8/0->4->12 [13] 6/-1/-1->4->7 [14] 7/-1/-1->4->6 [15] -1/-1/-1->4->5 +1: jzxh252:2700705:2701502 [0] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700706:2701503 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->9 [2] 7/-1/-1->5->4 [3] 4/-1/-1->5->7 [4] -1/-1/-1->5->6 [5] 7/-1/-1->5->9 [6] -1/-1/-1->5->7 [7] 4/-1/-1->5->6 [8] 6/-1/-1->5->4 [9] 6/9/1->5->13 [10] 7/-1/-1->5->4 [11] 4/-1/-1->5->7 [12] -1/-1/-1->5->6 [13] 7/9/1->5->13 [14] -1/-1/-1->5->7 [15] 4/-1/-1->5->6 +1: jzxh252:2700707:2701505 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 4/-1/-1->6->10 [3] -1/-1/-1->6->4 [4] 5/-1/-1->6->7 [5] -1/-1/-1->6->4 [6] 4/-1/-1->6->10 [7] 5/-1/-1->6->7 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 4/10/2->6->14 [11] -1/-1/-1->6->4 [12] 5/-1/-1->6->7 [13] -1/-1/-1->6->4 [14] 4/10/2->6->14 [15] 5/-1/-1->6->7 +1: jzxh252:2700706:2701503 [1] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700708:2701504 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 4/-1/-1->7->6 [2] -1/-1/-1->7->5 [3] 5/-1/-1->7->11 [4] 6/-1/-1->7->4 [5] 4/-1/-1->7->5 [6] 5/-1/-1->7->4 [7] 6/-1/-1->7->11 [8] -1/-1/-1->7->6 [9] 4/-1/-1->7->6 [10] -1/-1/-1->7->5 [11] 5/11/3->7->15 [12] 6/-1/-1->7->4 [13] 4/-1/-1->7->5 [14] 5/-1/-1->7->4 [15] 6/11/3->7->15 +1: jzxh252:2700707:2701505 [2] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700708:2701504 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 00/0 : 14[2] -> 15[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 04/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 00/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 08/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 12/0 : 14[2] -> 15[3] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 04/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 08/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 00/0 : 5[1] -> 6[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 12/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 03/0 : 5[1] -> 6[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 04/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 00/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 07/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 03/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 08/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 04/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 11/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 07/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 12/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 08/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 15/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 11/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 12/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 15/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 00/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 03/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 04/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 07/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 08/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 11/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 12/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 15/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 00/0 : 4[0] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 04/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 01/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 02/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 11/0 : 4[0] -> 5[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 05/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 15/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 06/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 09/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 10/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 13/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 02/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 02/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 14/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 06/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 06/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 09/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 10/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 09/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 13/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 14/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 10/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 02/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 13/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 06/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 14/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 09/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 10/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 13/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 14/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 01/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 02/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 05/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 06/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 10/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 01/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 02/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 14/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 09/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 05/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 06/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 09/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 10/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 13/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 14/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 03/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 07/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 11/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 11/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 03/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 07/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 15/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 15/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 02/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 01/0 : 15[3] -> 14[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 06/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 10/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 01/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 05/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 05/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 09/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 14/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 02/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 09/0 : 7[3] -> 6[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 10/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 13/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 13/0 : 7[3] -> 6[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 06/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Connected all rings +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Connected all rings +2: jzxh253:1476126:1478100 [1] NCCL INFO Connected all rings +0: jzxh251:3958318:3959113 [1] NCCL INFO Connected all rings +0: jzxh251:3958320:3959114 [3] NCCL INFO Connected all rings +0: jzxh251:3958319:3959115 [2] NCCL INFO Connected all rings +1: jzxh252:2700707:2701505 [2] NCCL INFO Connected all rings +1: jzxh252:2700705:2701502 [0] NCCL INFO Connected all rings +1: jzxh252:2700706:2701503 [1] NCCL INFO Connected all rings +1: jzxh252:2700708:2701504 [3] NCCL INFO Connected all rings +2: jzxh253:1476128:1478101 [3] NCCL INFO Connected all rings +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Connected all rings +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 02/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 10/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 01/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 01/0 : 6[2] -> 7[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Connected all rings +3: jzxh254:1472011:1472811 [2] NCCL INFO Connected all rings +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Connected all rings +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 02/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 07/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 02/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Connected all rings +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 02/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 02/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 03/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 09/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 02/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 15/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 03/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 03/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 10/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 01/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 02/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 05/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 05/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 05/0 : 4[0] -> 6[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 02/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 03/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 07/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 06/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 06/0 : 0[0] -> 2[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 03/0 : 1[1] -> 3[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 06/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 03/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 05/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 01/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 10/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 10/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 10/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 05/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 06/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 09/0 : 14[2] -> 15[3] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 15/0 : 14[2] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 06/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 09/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 11/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 11/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 11/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 02/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 06/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 10/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 02/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 13/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 03/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 13/0 : 8[0] -> 10[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 03/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 11/0 : 9[1] -> 11[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 13/0 : 0[0] -> 2[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 10/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 11/0 : 1[1] -> 3[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 13/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 14/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 14/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 05/0 : 13[1] -> 15[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 05/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 14/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 13/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 06/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 14/0 : 9[1] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 06/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 14/0 : 1[1] -> 3[3] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 11/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 10/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 12/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 11/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 10/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 14/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 11/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 13/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 13/0 : 12[0] -> 14[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 14/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 14/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 04/0 : 12[0] -> 15[3] via P2P/CUMEM +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 12/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958317:3959112 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472009:1472810 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 02/0 : 2[2] -> 0[0] via P2P/CUMEM +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2701502 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478098 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 03/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 01/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 02/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 01/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 02/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 02/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 01/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 03/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 04/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 03/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 04/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 04/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 05/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 03/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 06/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 05/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 05/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 06/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 05/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 05/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 06/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 05/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 06/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 06/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 10/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 06/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 09/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 06/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 10/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 12/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 11/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 10/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 09/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 09/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 13/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 11/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 12/0 : 11[3] -> 8[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 13/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 11/0 : 6[2] -> 4[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 12/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 14/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 13/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 14/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 13/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 13/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 13/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 13/0 : 14[2] -> 12[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 14/0 : 15[3] -> 12[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 14/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 02/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 14/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 14/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 14/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 02/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 03/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 02/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 03/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 02/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 05/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 03/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 03/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 06/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 05/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 06/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 05/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 10/0 : 15[3] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 11/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 06/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 13/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 10/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 14/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 11/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 13/0 : 11[3] -> 9[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 14/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 13/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 00/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 06/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 14/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 04/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 10/0 : 7[3] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 11/0 : 7[3] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 13/0 : 7[3] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 14/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 07/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 00/0 : 7[3] -> 6[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 04/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 08/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 07/0 : 7[3] -> 6[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 08/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 12/0 : 15[3] -> 14[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 12/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2701504 [3] NCCL INFO Channel 15/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1472813 [3] NCCL INFO Channel 15/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 00/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 01/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 04/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 07/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 00/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 08/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 00/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 03/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 09/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 03/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 12/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 07/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh254:1472011:1472811 [2] NCCL INFO Channel 15/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 07/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 08/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478101 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 11/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 08/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958318:3959113 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 00/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 11/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 01/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476127:1478099 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700706:2701503 [1] NCCL INFO Channel 15/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 04/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 07/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 08/0 : 6[2] -> 5[1] via P2P/CUMEM +3: jzxh254:1472010:1472812 [1] NCCL INFO Channel 15/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh251:3958319:3959115 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 09/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476126:1478100 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 12/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700707:2701505 [2] NCCL INFO Channel 15/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3959114 [3] NCCL INFO Connected all trees +0: jzxh251:3958317:3959112 [0] NCCL INFO Connected all trees +0: jzxh251:3958320:3959114 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958320:3959114 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958319:3959115 [2] NCCL INFO Connected all trees +2: jzxh253:1476125:1478098 [0] NCCL INFO Connected all trees +1: jzxh252:2700708:2701504 [3] NCCL INFO Connected all trees +0: jzxh251:3958318:3959113 [1] NCCL INFO Connected all trees +0: jzxh251:3958317:3959112 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958317:3959112 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958319:3959115 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958319:3959115 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958318:3959113 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958318:3959113 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476128:1478101 [3] NCCL INFO Connected all trees +1: jzxh252:2700705:2701502 [0] NCCL INFO Connected all trees +1: jzxh252:2700708:2701504 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476125:1478098 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476126:1478100 [1] NCCL INFO Connected all trees +2: jzxh253:1476125:1478098 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700708:2701504 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476128:1478101 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476128:1478101 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700705:2701502 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700705:2701502 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476127:1478099 [2] NCCL INFO Connected all trees +2: jzxh253:1476126:1478100 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476126:1478100 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700707:2701505 [2] NCCL INFO Connected all trees +1: jzxh252:2700706:2701503 [1] NCCL INFO Connected all trees +2: jzxh253:1476127:1478099 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476127:1478099 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700707:2701505 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700706:2701503 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700707:2701505 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700706:2701503 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472009:1472810 [0] NCCL INFO Connected all trees +3: jzxh254:1472009:1472810 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472009:1472810 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472010:1472812 [1] NCCL INFO Connected all trees +3: jzxh254:1472012:1472813 [3] NCCL INFO Connected all trees +3: jzxh254:1472010:1472812 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472011:1472811 [2] NCCL INFO Connected all trees +3: jzxh254:1472012:1472813 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472010:1472812 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472012:1472813 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472011:1472811 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472011:1472811 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476125:1478098 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh253:1476125:1478098 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh253:1476127:1478099 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh253:1476125:1478098 [0] NCCL INFO ncclCommInitRank comm 0x55705cd0ee30 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +2: jzxh253:1476126:1478100 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh253:1476128:1478101 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +2: jzxh253:1476127:1478099 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh253:1476126:1478100 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh253:1476128:1478101 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +2: jzxh253:1476127:1478099 [2] NCCL INFO ncclCommInitRank comm 0x555b4b77d2b0 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +2: jzxh253:1476128:1478101 [3] NCCL INFO ncclCommInitRank comm 0x55c13cf8e7e0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +2: jzxh253:1476126:1478100 [1] NCCL INFO ncclCommInitRank comm 0x559d74834b90 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +1: jzxh252:2700707:2701505 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh252:2700707:2701505 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh252:2700707:2701505 [2] NCCL INFO ncclCommInitRank comm 0x563d58040fd0 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +1: jzxh252:2700706:2701503 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh252:2700705:2701502 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh252:2700706:2701503 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh252:2700705:2701502 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh252:2700706:2701503 [1] NCCL INFO ncclCommInitRank comm 0x55d9b6dce4a0 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +1: jzxh252:2700705:2701502 [0] NCCL INFO ncclCommInitRank comm 0x55691e321820 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +1: jzxh252:2700708:2701504 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +1: jzxh252:2700708:2701504 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +1: jzxh252:2700708:2701504 [3] NCCL INFO ncclCommInitRank comm 0x55e552323b50 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +0: jzxh251:3958318:3959113 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh251:3958318:3959113 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh251:3958318:3959113 [1] NCCL INFO ncclCommInitRank comm 0x561703831fa0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +0: jzxh251:3958320:3959114 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh251:3958320:3959114 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh251:3958320:3959114 [3] NCCL INFO ncclCommInitRank comm 0x5627d69ed2e0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +0: jzxh251:3958317:3959112 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh251:3958317:3959112 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh251:3958317:3959112 [0] NCCL INFO ncclCommInitRank comm 0x55ad9c344130 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +0: jzxh251:3958319:3959115 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +0: jzxh251:3958319:3959115 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +0: jzxh251:3958319:3959115 [2] NCCL INFO ncclCommInitRank comm 0x557a67ebf3e0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +3: jzxh254:1472011:1472811 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh254:1472011:1472811 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh254:1472011:1472811 [2] NCCL INFO ncclCommInitRank comm 0x55ee68f2ae50 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +3: jzxh254:1472012:1472813 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh254:1472012:1472813 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh254:1472012:1472813 [3] NCCL INFO ncclCommInitRank comm 0x55e3cd6150e0 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +3: jzxh254:1472010:1472812 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh254:1472009:1472810 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so +3: jzxh254:1472010:1472812 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh254:1472009:1472810 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. +3: jzxh254:1472010:1472812 [1] NCCL INFO ncclCommInitRank comm 0x558d1e103f20 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +3: jzxh254:1472009:1472810 [0] NCCL INFO ncclCommInitRank comm 0x5603bb4b7f00 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xfb9b4631e7b54d2e - Init COMPLETE +2: jzxh253:1476126:1478130 [1] NCCL INFO Channel 04/1 : 9[1] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh253:1476126:1478130 [1] NCCL INFO Channel 05/1 : 9[1] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh253:1476125:1478131 [0] NCCL INFO Channel 04/1 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +2: jzxh253:1476125:1478131 [0] NCCL INFO Channel 05/1 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +2: jzxh253:1476128:1478132 [3] NCCL INFO Channel 04/1 : 11[3] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh253:1476128:1478132 [3] NCCL INFO Channel 05/1 : 11[3] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh253:1476127:1478133 [2] NCCL INFO Channel 04/1 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +2: jzxh253:1476127:1478133 [2] NCCL INFO Channel 05/1 : 10[2] -> 0[0] [send] via NET/IB/0(8)/GDRDMA/Shared +1: jzxh252:2700707:2701534 [2] NCCL INFO Channel 12/1 : 6[2] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh252:2700707:2701534 [2] NCCL INFO Channel 13/1 : 6[2] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh252:2700705:2701535 [0] NCCL INFO Channel 12/1 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +1: jzxh252:2700706:2701536 [1] NCCL INFO Channel 12/1 : 5[1] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh252:2700705:2701535 [0] NCCL INFO Channel 13/1 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +1: jzxh252:2700706:2701536 [1] NCCL INFO Channel 13/1 : 5[1] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +0: jzxh251:3958319:3959146 [2] NCCL INFO Channel 00/1 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3959147 [1] NCCL INFO Channel 00/1 : 1[1] -> 0[0] via P2P/CUMEM +1: jzxh252:2700708:2701537 [3] NCCL INFO Channel 12/1 : 7[3] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +1: jzxh252:2700708:2701537 [3] NCCL INFO Channel 13/1 : 7[3] -> 0[0] [send] via NET/IB/0(4)/GDRDMA/Shared +0: jzxh251:3958320:3959145 [3] NCCL INFO Channel 00/1 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 08/1 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 09/1 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958319:3959146 [2] NCCL INFO Channel 01/1 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3959147 [1] NCCL INFO Channel 01/1 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3959145 [3] NCCL INFO Channel 01/1 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1472842 [2] NCCL INFO Channel 08/1 : 14[2] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh254:1472010:1472843 [1] NCCL INFO Channel 08/1 : 13[1] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh254:1472011:1472842 [2] NCCL INFO Channel 09/1 : 14[2] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh254:1472010:1472843 [1] NCCL INFO Channel 09/1 : 13[1] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh254:1472012:1472844 [3] NCCL INFO Channel 08/1 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +3: jzxh254:1472009:1472845 [0] NCCL INFO Channel 08/1 : 12[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +3: jzxh254:1472009:1472845 [0] NCCL INFO Channel 09/1 : 12[0] -> 0[0] [send] via NET/IB/0/GDRDMA/Shared +3: jzxh254:1472012:1472844 [3] NCCL INFO Channel 09/1 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 08/1 : 14[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 09/1 : 14[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 08/1 : 13[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 09/1 : 13[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 08/1 : 12[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 09/1 : 12[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 04/1 : 11[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 05/1 : 11[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 04/1 : 10[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 05/1 : 10[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 04/1 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 05/1 : 9[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 04/1 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 05/1 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 12/1 : 7[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 13/1 : 7[3] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 12/1 : 6[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 13/1 : 6[2] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 12/1 : 5[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 13/1 : 5[1] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 12/1 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: jzxh251:3958317:3959148 [0] NCCL INFO Channel 13/1 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA/Shared +0: [2025-11-23 14:46:15,578] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:3958317] [RANK:0] gather_len_batches: [59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622, 59622] +0: [2025-11-23 14:46:15,682] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:3958317] [RANK:0] sample_packing_eff_est across ranks: [0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812, 0.8715326189994812] +0: [2025-11-23 14:46:15,699] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:3958317] [RANK:0] Maximum number of steps set at 2235 +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +1: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +3: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +2: Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +0: [2025-11-23 14:46:22,541] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:3958317] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation +0: [2025-11-23 14:46:22,542] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:3958317] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +3: Loading checkpoint shards: 0%| | 0/2 [00:003->2 [1] 0/-1/-1->3->2 [2] -1/-1/-1->3->1 [3] 1/11/-1->3->-1 [4] 2/-1/-1->3->0 [5] 0/-1/-1->3->1 [6] 1/-1/-1->3->0 [7] 2/11/-1->3->-1 [8] -1/-1/-1->3->2 [9] 0/-1/-1->3->2 [10] -1/-1/-1->3->1 [11] 1/-1/-1->3->7 [12] 2/-1/-1->3->0 [13] 0/-1/-1->3->1 [14] 1/-1/-1->3->0 [15] 2/-1/-1->3->7 +0: jzxh251:3958319:3960015 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 0/10/-1->2->-1 [3] -1/-1/-1->2->0 [4] 1/-1/-1->2->3 [5] -1/-1/-1->2->0 [6] 0/10/-1->2->-1 [7] 1/-1/-1->2->3 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 0/-1/-1->2->6 [11] -1/-1/-1->2->0 [12] 1/-1/-1->2->3 [13] -1/-1/-1->2->0 [14] 0/-1/-1->2->6 [15] 1/-1/-1->2->3 +0: jzxh251:3958320:3960016 [3] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476126:1478987 [1] NCCL INFO comm 0x1463a413d300 rank 9 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +2: jzxh253:1476128:1478986 [3] NCCL INFO comm 0x151034113bc0 rank 11 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +2: jzxh253:1476127:1478985 [2] NCCL INFO comm 0x148618125040 rank 10 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +2: jzxh253:1476125:1478988 [0] NCCL INFO Trees [0] 9/4/12->8->0 [1] -1/-1/-1->8->11 [2] 9/-1/-1->8->10 [3] 10/-1/-1->8->9 [4] 11/4/12->8->0 [5] 10/-1/-1->8->11 [6] 11/-1/-1->8->10 [7] -1/-1/-1->8->9 [8] 9/-1/-1->8->4 [9] -1/-1/-1->8->11 [10] 9/-1/-1->8->10 [11] 10/-1/-1->8->9 [12] 11/-1/-1->8->4 [13] 10/-1/-1->8->11 [14] 11/-1/-1->8->10 [15] -1/-1/-1->8->9 +2: jzxh253:1476125:1478988 [0] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700705:2702397 [0] NCCL INFO Trees [0] 5/-1/-1->4->8 [1] -1/-1/-1->4->7 [2] 5/-1/-1->4->6 [3] 6/-1/-1->4->5 [4] 7/-1/-1->4->8 [5] 6/-1/-1->4->7 [6] 7/-1/-1->4->6 [7] -1/-1/-1->4->5 [8] 5/8/0->4->12 [9] -1/-1/-1->4->7 [10] 5/-1/-1->4->6 [11] 6/-1/-1->4->5 [12] 7/8/0->4->12 [13] 6/-1/-1->4->7 [14] 7/-1/-1->4->6 [15] -1/-1/-1->4->5 +1: jzxh252:2700707:2702398 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 4/-1/-1->6->10 [3] -1/-1/-1->6->4 [4] 5/-1/-1->6->7 [5] -1/-1/-1->6->4 [6] 4/-1/-1->6->10 [7] 5/-1/-1->6->7 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 4/10/2->6->14 [11] -1/-1/-1->6->4 [12] 5/-1/-1->6->7 [13] -1/-1/-1->6->4 [14] 4/10/2->6->14 [15] 5/-1/-1->6->7 +0: jzxh251:3958318:3960017 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] 3/-1/-1->1->0 [3] 0/-1/-1->1->3 [4] -1/-1/-1->1->2 [5] 3/9/-1->1->-1 [6] -1/-1/-1->1->3 [7] 0/-1/-1->1->2 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->5 [10] 3/-1/-1->1->0 [11] 0/-1/-1->1->3 [12] -1/-1/-1->1->2 [13] 3/-1/-1->1->5 [14] -1/-1/-1->1->3 [15] 0/-1/-1->1->2 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 00/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958319:3960015 [2] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958318:3960017 [1] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 01/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 02/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 03/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +2: jzxh253:1476126:1478987 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/5/13->9->1 [2] 11/-1/-1->9->8 [3] 8/-1/-1->9->11 [4] -1/-1/-1->9->10 [5] 11/5/13->9->1 [6] -1/-1/-1->9->11 [7] 8/-1/-1->9->10 [8] 10/-1/-1->9->8 [9] 10/-1/-1->9->5 [10] 11/-1/-1->9->8 [11] 8/-1/-1->9->11 [12] -1/-1/-1->9->10 [13] 11/-1/-1->9->5 [14] -1/-1/-1->9->11 [15] 8/-1/-1->9->10 +2: jzxh253:1476126:1478987 [1] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476128:1478986 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] 8/-1/-1->11->10 [2] -1/-1/-1->11->9 [3] 9/7/15->11->3 [4] 10/-1/-1->11->8 [5] 8/-1/-1->11->9 [6] 9/-1/-1->11->8 [7] 10/7/15->11->3 [8] -1/-1/-1->11->10 [9] 8/-1/-1->11->10 [10] -1/-1/-1->11->9 [11] 9/-1/-1->11->7 [12] 10/-1/-1->11->8 [13] 8/-1/-1->11->9 [14] 9/-1/-1->11->8 [15] 10/-1/-1->11->7 +1: jzxh252:2700708:2702399 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 4/-1/-1->7->6 [2] -1/-1/-1->7->5 [3] 5/-1/-1->7->11 [4] 6/-1/-1->7->4 [5] 4/-1/-1->7->5 [6] 5/-1/-1->7->4 [7] 6/-1/-1->7->11 [8] -1/-1/-1->7->6 [9] 4/-1/-1->7->6 [10] -1/-1/-1->7->5 [11] 5/11/3->7->15 [12] 6/-1/-1->7->4 [13] 4/-1/-1->7->5 [14] 5/-1/-1->7->4 [15] 6/11/3->7->15 +1: jzxh252:2700706:2702400 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->9 [2] 7/-1/-1->5->4 [3] 4/-1/-1->5->7 [4] -1/-1/-1->5->6 [5] 7/-1/-1->5->9 [6] -1/-1/-1->5->7 [7] 4/-1/-1->5->6 [8] 6/-1/-1->5->4 [9] 6/9/1->5->13 [10] 7/-1/-1->5->4 [11] 4/-1/-1->5->7 [12] -1/-1/-1->5->6 [13] 7/9/1->5->13 [14] -1/-1/-1->5->7 [15] 4/-1/-1->5->6 +1: jzxh252:2700707:2702398 [2] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700705:2702397 [0] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700708:2702399 [3] NCCL INFO P2P Chunksize set to 131072 +1: jzxh252:2700706:2702400 [1] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +2: jzxh253:1476127:1478985 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 8/6/14->10->2 [3] -1/-1/-1->10->8 [4] 9/-1/-1->10->11 [5] -1/-1/-1->10->8 [6] 8/6/14->10->2 [7] 9/-1/-1->10->11 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 8/-1/-1->10->6 [11] -1/-1/-1->10->8 [12] 9/-1/-1->10->11 [13] -1/-1/-1->10->8 [14] 8/-1/-1->10->6 [15] 9/-1/-1->10->11 +2: jzxh253:1476128:1478986 [3] NCCL INFO P2P Chunksize set to 131072 +2: jzxh253:1476127:1478985 [2] NCCL INFO P2P Chunksize set to 131072 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 05/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 06/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 07/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 08/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 09/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 10/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 11/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/16 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 13/16 : 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 14/16 : 0 3 6 5 4 7 10 9 8 11 14 13 12 15 2 1 +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 15/16 : 0 1 2 7 4 5 6 11 8 9 10 15 12 13 14 3 +0: jzxh251:3958317:3960014 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->3 [2] 1/-1/-1->0->2 [3] 2/-1/-1->0->1 [4] 3/8/-1->0->-1 [5] 2/-1/-1->0->3 [6] 3/-1/-1->0->2 [7] -1/-1/-1->0->1 [8] 1/-1/-1->0->4 [9] -1/-1/-1->0->3 [10] 1/-1/-1->0->2 [11] 2/-1/-1->0->1 [12] 3/-1/-1->0->4 [13] 2/-1/-1->0->3 [14] 3/-1/-1->0->2 [15] -1/-1/-1->0->1 +0: jzxh251:3958317:3960014 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472012:1473683 [3] NCCL INFO comm 0x14f628121240 rank 15 nRanks 16 nNodes 4 localRanks 4 localRank 3 MNNVL 0 +3: jzxh254:1472010:1473684 [1] NCCL INFO comm 0x15171c121dc0 rank 13 nRanks 16 nNodes 4 localRanks 4 localRank 1 MNNVL 0 +3: jzxh254:1472011:1473682 [2] NCCL INFO comm 0x145a0c1133c0 rank 14 nRanks 16 nNodes 4 localRanks 4 localRank 2 MNNVL 0 +3: jzxh254:1472012:1473683 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 12/-1/-1->15->14 [2] -1/-1/-1->15->13 [3] 13/-1/-1->15->11 [4] 14/-1/-1->15->12 [5] 12/-1/-1->15->13 [6] 13/-1/-1->15->12 [7] 14/-1/-1->15->11 [8] -1/-1/-1->15->14 [9] 12/-1/-1->15->14 [10] -1/-1/-1->15->13 [11] 13/7/-1->15->-1 [12] 14/-1/-1->15->12 [13] 12/-1/-1->15->13 [14] 13/-1/-1->15->12 [15] 14/7/-1->15->-1 +3: jzxh254:1472011:1473682 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 12/-1/-1->14->10 [3] -1/-1/-1->14->12 [4] 13/-1/-1->14->15 [5] -1/-1/-1->14->12 [6] 12/-1/-1->14->10 [7] 13/-1/-1->14->15 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 12/6/-1->14->-1 [11] -1/-1/-1->14->12 [12] 13/-1/-1->14->15 [13] -1/-1/-1->14->12 [14] 12/6/-1->14->-1 [15] 13/-1/-1->14->15 +3: jzxh254:1472012:1473683 [3] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472010:1473684 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->9 [2] 15/-1/-1->13->12 [3] 12/-1/-1->13->15 [4] -1/-1/-1->13->14 [5] 15/-1/-1->13->9 [6] -1/-1/-1->13->15 [7] 12/-1/-1->13->14 [8] 14/-1/-1->13->12 [9] 14/5/-1->13->-1 [10] 15/-1/-1->13->12 [11] 12/-1/-1->13->15 [12] -1/-1/-1->13->14 [13] 15/5/-1->13->-1 [14] -1/-1/-1->13->15 [15] 12/-1/-1->13->14 +3: jzxh254:1472011:1473682 [2] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472010:1473684 [1] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472009:1473685 [0] NCCL INFO comm 0x15272813ee40 rank 12 nRanks 16 nNodes 4 localRanks 4 localRank 0 MNNVL 0 +3: jzxh254:1472009:1473685 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] -1/-1/-1->12->15 [2] 13/-1/-1->12->14 [3] 14/-1/-1->12->13 [4] 15/-1/-1->12->8 [5] 14/-1/-1->12->15 [6] 15/-1/-1->12->14 [7] -1/-1/-1->12->13 [8] 13/4/-1->12->-1 [9] -1/-1/-1->12->15 [10] 13/-1/-1->12->14 [11] 14/-1/-1->12->13 [12] 15/4/-1->12->-1 [13] 14/-1/-1->12->15 [14] 15/-1/-1->12->14 [15] -1/-1/-1->12->13 +3: jzxh254:1472009:1473685 [0] NCCL INFO P2P Chunksize set to 131072 +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 00/0 : 14[2] -> 15[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 00/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 00/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 03/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 04/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 04/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 04/0 : 13[1] -> 14[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 08/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 08/0 : 6[2] -> 7[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 07/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 12/0 : 14[2] -> 15[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 08/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 12/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 00/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 11/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 03/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 12/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 04/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 15/0 : 13[1] -> 14[2] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 07/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 08/0 : 5[1] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 11/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 12/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 15/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [send] via NET/IB/0(8)/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 00/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 04/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 08/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 12/0 : 11[3] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 00/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [send] via NET/IB/0(12)/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 03/0 : 12[0] -> 13[1] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 04/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 07/0 : 12[0] -> 13[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 08/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 00/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 11/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 12/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 15/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 04/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 08/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 12/0 : 7[3] -> 8[0] [send] via NET/IB/0(4)/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 00/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 01/0 : 8[0] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 02/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 01/0 : 12[0] -> 15[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 02/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 05/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 04/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 05/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 06/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 06/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 09/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 09/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 4[0] -> 5[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 10/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 10/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 13/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 11/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 13/0 : 8[0] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 14/0 : 12[0] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 00/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 14/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 08/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 15/0 : 4[0] -> 5[1] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/0 : 15[3] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[0] [send] via NET/IB/0(0)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 02/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [send] via NET/IB/2(10)/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 06/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 02/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 06/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 09/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 10/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 14/0 : 11[3] -> 14[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 10/0 : 4[0] -> 7[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 13/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 14/0 : 4[0] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 02/0 : 0[0] -> 3[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [send] via NET/IB/1(9)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 02/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 06/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 06/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 10/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 14/0 : 7[3] -> 10[2] [send] via NET/IB/2(6)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 09/0 : 0[0] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 10/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 01/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 13/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 05/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 09/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 14/0 : 0[0] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 13/0 : 10[2] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [send] via NET/IB/1(13)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [send] via NET/IB/2(14)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 01/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 02/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 02/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 02/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 06/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 06/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 10/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 10/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 14/0 : 15[3] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 14/0 : 3[3] -> 6[2] [send] via NET/IB/2(2)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 05/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 06/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 09/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 01/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 10/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 05/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 13/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 14/0 : 13[1] -> 12[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 09/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 13/0 : 6[2] -> 9[1] [send] via NET/IB/1(5)/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 01/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 01/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 05/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 05/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 09/0 : 9[1] -> 8[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 09/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 09/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 13/0 : 14[2] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 13/0 : 2[2] -> 5[1] [send] via NET/IB/1(1)/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [send] via NET/IB/3(7)/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 11/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [send] via NET/IB/3(11)/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 15/0 : 6[2] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 01/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 11/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 15/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 02/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 05/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 06/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 02/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 06/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 10/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 5[1] -> 4[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 14/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 03/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 07/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 11/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 15/0 : 2[2] -> 7[3] [send] via NET/IB/3(3)/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 03/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 07/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 01/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 03/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 05/0 : 7[3] -> 6[2] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 09/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 03/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 07/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 07/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 11/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 11/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 15/0 : 14[2] -> 3[3] [send] via NET/IB/3(15)/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 15/0 : 10[2] -> 15[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 13/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 03/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 07/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 11/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 15/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 02/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 01/0 : 15[3] -> 14[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 06/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 10/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 05/0 : 15[3] -> 14[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 14/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 09/0 : 15[3] -> 14[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 13/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 10/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Connected all rings +0: jzxh251:3958320:3960016 [3] NCCL INFO Connected all rings +0: jzxh251:3958317:3960014 [0] NCCL INFO Connected all rings +0: jzxh251:3958318:3960017 [1] NCCL INFO Connected all rings +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Connected all rings +3: jzxh254:1472010:1473684 [1] NCCL INFO Connected all rings +2: jzxh253:1476128:1478986 [3] NCCL INFO Connected all rings +2: jzxh253:1476127:1478985 [2] NCCL INFO Connected all rings +2: jzxh253:1476125:1478988 [0] NCCL INFO Connected all rings +1: jzxh252:2700705:2702397 [0] NCCL INFO Connected all rings +1: jzxh252:2700706:2702400 [1] NCCL INFO Connected all rings +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 02/0 : 4[0] -> 5[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Connected all rings +3: jzxh254:1472011:1473682 [2] NCCL INFO Connected all rings +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 02/0 : 12[0] -> 13[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Connected all rings +1: jzxh252:2700707:2702398 [2] NCCL INFO Connected all rings +2: jzxh253:1476126:1478987 [1] NCCL INFO Connected all rings +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 10/0 : 12[0] -> 13[1] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 10/0 : 4[0] -> 5[1] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 01/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 01/0 : 6[2] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 01/0 : 13[1] -> 14[2] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 07/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 09/0 : 13[1] -> 14[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 09/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 02/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 07/0 : 14[2] -> 15[3] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 15/0 : 6[2] -> 7[3] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 09/0 : 14[2] -> 15[3] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 02/0 : 12[0] -> 14[2] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 03/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 15/0 : 14[2] -> 15[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 02/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 02/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 05/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 03/0 : 8[0] -> 10[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 03/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 01/0 : 5[1] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 02/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 03/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 05/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 05/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 5[1] -> 6[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 02/0 : 9[1] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 06/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 05/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 06/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 03/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 02/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 06/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 10/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 02/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 06/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 10/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 10/0 : 12[0] -> 14[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 03/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 03/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 05/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 03/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 11/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 11/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 05/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 10/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 11/0 : 12[0] -> 14[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 06/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 06/0 : 5[1] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 05/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 06/0 : 9[1] -> 11[3] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 13/0 : 8[0] -> 10[2] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 13/0 : 0[0] -> 2[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 10/0 : 13[1] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 11/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 13/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 10/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 13/0 : 1[1] -> 3[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 14/0 : 0[0] -> 2[2] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 14/0 : 8[0] -> 10[2] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 06/0 : 4[0] -> 6[2] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 11/0 : 13[1] -> 15[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 10/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 14/0 : 12[0] -> 14[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 11/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 14/0 : 1[1] -> 3[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 13/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 10/0 : 4[0] -> 6[2] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[3] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 14/0 : 13[1] -> 15[3] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 5[1] -> 7[3] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 8[0] -> 11[3] via P2P/CUMEM +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/0 : 0[0] -> 3[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 11/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 12/0 : 8[0] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 13/0 : 4[0] -> 6[2] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 14/0 : 5[1] -> 7[3] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 04/0 : 12[0] -> 15[3] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 06/0 : 10[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 12/0 : 12[0] -> 15[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 11/0 : 9[1] -> 11[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 13/0 : 9[1] -> 11[3] via P2P/CUMEM +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 14/0 : 4[0] -> 6[2] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 14/0 : 9[1] -> 11[3] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 1[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 01/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 05/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 9[1] -> 13[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 5[1] -> 9[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 09/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[3] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 8[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 13[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 11[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 5[1] -> 13[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 4[0] -> 7[3] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 2[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 02/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 06/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [receive] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 3[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 0[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 6[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 03/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [send] via NET/IB/1/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 00/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 07/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 04/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 7[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 4[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 14[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 6[2] -> 14[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 15[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 12[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 7[3] -> 15[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472009:1473685 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [send] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 4[0] -> 12[0] [send] via NET/IB/0/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [receive] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +1: jzxh252:2700705:2702397 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [send] via NET/IB/0/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/1/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 08/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 10/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +0: jzxh251:3958317:3960014 [0] NCCL INFO Channel 12/0 : 4[0] -> 0[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 09/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 14/0 : 6[2] -> 2[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 13/0 : 5[1] -> 1[1] [receive] via NET/IB/1/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 02/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 12[0] -> 8[0] [receive] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 00/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 04/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 14[2] -> 10[2] [receive] via NET/IB/2/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 08/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 03/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 13[1] -> 9[1] [receive] via NET/IB/1/GDRDMA +2: jzxh253:1476125:1478988 [0] NCCL INFO Channel 12/0 : 8[0] -> 4[0] [send] via NET/IB/0/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 02/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 01/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 10/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 02/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 03/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 14/0 : 10[2] -> 6[2] [send] via NET/IB/2/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 05/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 09/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 13/0 : 9[1] -> 5[1] [send] via NET/IB/1/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 02/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 05/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 03/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 03/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 05/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 06/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 05/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 06/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 05/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 10/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 06/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 10/0 : 14[2] -> 12[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 06/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 10/0 : 10[2] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 11/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 11/0 : 2[2] -> 0[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 15/0 : 7[3] -> 3[3] [receive] via NET/IB/3/GDRDMA +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 11/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 11/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 10/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 15[3] -> 11[3] [receive] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 13/0 : 14[2] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 11/0 : 6[2] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 11/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 15/0 : 11[3] -> 7[3] [send] via NET/IB/3/GDRDMA +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 13/0 : 10[2] -> 8[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 01/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 01/0 : 7[3] -> 4[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 01/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 14/0 : 14[2] -> 12[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 14/0 : 10[2] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 13/0 : 6[2] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 04/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 13/0 : 2[2] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 04/0 : 15[3] -> 12[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 04/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 14/0 : 2[2] -> 0[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 05/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 14/0 : 6[2] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 05/0 : 15[3] -> 12[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 05/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 06/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 06/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 09/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 09/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 02/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 06/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 12/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 12/0 : 11[3] -> 8[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 13/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 13/0 : 11[3] -> 8[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 09/0 : 7[3] -> 4[0] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 12/0 : 7[3] -> 4[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 03/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 13/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 14/0 : 15[3] -> 12[0] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 14/0 : 11[3] -> 8[0] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 05/0 : 3[3] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 02/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 14/0 : 7[3] -> 4[0] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 03/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 02/0 : 7[3] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 06/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 03/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 05/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 02/0 : 11[3] -> 9[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 10/0 : 3[3] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 05/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 06/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 03/0 : 11[3] -> 9[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 06/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 10/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 05/0 : 11[3] -> 9[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 10/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 11/0 : 15[3] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 06/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 13/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 11/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 10/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 14/0 : 15[3] -> 13[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 13/0 : 7[3] -> 5[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 11/0 : 11[3] -> 9[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 14/0 : 7[3] -> 5[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 00/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 13/0 : 11[3] -> 9[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 04/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 14/0 : 11[3] -> 9[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 00/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 07/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 04/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 08/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 07/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 12/0 : 15[3] -> 14[2] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 11/0 : 3[3] -> 1[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 13/0 : 3[3] -> 1[1] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 08/0 : 7[3] -> 6[2] via P2P/CUMEM +3: jzxh254:1472012:1473683 [3] NCCL INFO Channel 15/0 : 15[3] -> 14[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 14/0 : 3[3] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 12/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 00/0 : 14[2] -> 13[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM +1: jzxh252:2700708:2702399 [3] NCCL INFO Channel 15/0 : 7[3] -> 6[2] via P2P/CUMEM +0: jzxh251:3958319:3960015 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 01/0 : 14[2] -> 13[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 04/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh253:1476128:1478986 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 07/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 00/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 08/0 : 14[2] -> 13[1] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 01/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 09/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 04/0 : 6[2] -> 5[1] via P2P/CUMEM +0: jzxh251:3958320:3960016 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 12/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 00/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM +3: jzxh254:1472011:1473682 [2] NCCL INFO Channel 15/0 : 14[2] -> 13[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 07/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 03/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 07/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 08/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 09/0 : 6[2] -> 5[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 08/0 : 5[1] -> 4[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 00/0 : 13[1] -> 12[0] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 11/0 : 5[1] -> 4[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 12/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM +1: jzxh252:2700707:2702398 [2] NCCL INFO Channel 15/0 : 6[2] -> 5[1] via P2P/CUMEM +2: jzxh253:1476126:1478987 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM +2: jzxh253:1476127:1478985 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM +1: jzxh252:2700706:2702400 [1] NCCL INFO Channel 15/0 : 5[1] -> 4[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 03/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 07/0 : 13[1] -> 12[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 08/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 11/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472010:1473684 [1] NCCL INFO Channel 15/0 : 13[1] -> 12[0] via P2P/CUMEM +0: jzxh251:3958318:3960017 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM +3: jzxh254:1472009:1473685 [0] NCCL INFO Connected all trees +3: jzxh254:1472009:1473685 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472012:1473683 [3] NCCL INFO Connected all trees +3: jzxh254:1472009:1473685 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958317:3960014 [0] NCCL INFO Connected all trees +3: jzxh254:1472012:1473683 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472012:1473683 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958320:3960016 [3] NCCL INFO Connected all trees +3: jzxh254:1472010:1473684 [1] NCCL INFO Connected all trees +3: jzxh254:1472011:1473682 [2] NCCL INFO Connected all trees +0: jzxh251:3958320:3960016 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958320:3960016 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958318:3960017 [1] NCCL INFO Connected all trees +0: jzxh251:3958319:3960015 [2] NCCL INFO Connected all trees +3: jzxh254:1472010:1473684 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476125:1478988 [0] NCCL INFO Connected all trees +0: jzxh251:3958318:3960017 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958319:3960015 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958318:3960017 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +0: jzxh251:3958319:3960015 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472010:1473684 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +3: jzxh254:1472011:1473682 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +3: jzxh254:1472011:1473682 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700705:2702397 [0] NCCL INFO Connected all trees +1: jzxh252:2700708:2702399 [3] NCCL INFO Connected all trees +0: jzxh251:3958317:3960014 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +0: jzxh251:3958317:3960014 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476125:1478988 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476125:1478988 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476128:1478986 [3] NCCL INFO Connected all trees +2: jzxh253:1476128:1478986 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476128:1478986 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476127:1478985 [2] NCCL INFO Connected all trees +1: jzxh252:2700708:2702399 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700705:2702397 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700708:2702399 [3] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700705:2702397 [0] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476126:1478987 [1] NCCL INFO Connected all trees +1: jzxh252:2700707:2702398 [2] NCCL INFO Connected all trees +1: jzxh252:2700706:2702400 [1] NCCL INFO Connected all trees +1: jzxh252:2700707:2702398 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700707:2702398 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476127:1478985 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476126:1478987 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +2: jzxh253:1476127:1478985 [2] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +1: jzxh252:2700706:2702400 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 +1: jzxh252:2700706:2702400 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476126:1478987 [1] NCCL INFO 16 coll channels, 16 collnet channels, 0 nvls channels, 16 p2p channels, 2 p2p channels per peer +2: jzxh253:1476127:1478985 [2] NCCL INFO ncclCommInitRank comm 0x148618125040 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x8447d14ea06660bf - Init COMPLETE +2: jzxh253:1476125:1478988 [0] NCCL INFO ncclCommInitRank comm 0x14687c132d80 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x8447d14ea06660bf - Init COMPLETE +3: jzxh254:1472012:1473683 [3] NCCL INFO ncclCommInitRank comm 0x14f628121240 rank 15 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x8447d14ea06660bf - Init COMPLETE +3: jzxh254:1472010:1473684 [1] NCCL INFO ncclCommInitRank comm 0x15171c121dc0 rank 13 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x8447d14ea06660bf - Init COMPLETE +3: jzxh254:1472011:1473682 [2] NCCL INFO ncclCommInitRank comm 0x145a0c1133c0 rank 14 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x8447d14ea06660bf - Init COMPLETE +3: jzxh254:1472009:1473685 [0] NCCL INFO ncclCommInitRank comm 0x15272813ee40 rank 12 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x8447d14ea06660bf - Init COMPLETE +2: jzxh253:1476128:1478986 [3] NCCL INFO ncclCommInitRank comm 0x151034113bc0 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x8447d14ea06660bf - Init COMPLETE +2: jzxh253:1476126:1478987 [1] NCCL INFO ncclCommInitRank comm 0x1463a413d300 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x8447d14ea06660bf - Init COMPLETE +1: jzxh252:2700707:2702398 [2] NCCL INFO ncclCommInitRank comm 0x1528a013f400 rank 6 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x8447d14ea06660bf - Init COMPLETE +1: jzxh252:2700705:2702397 [0] NCCL INFO ncclCommInitRank comm 0x150c30122f80 rank 4 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x8447d14ea06660bf - Init COMPLETE +1: jzxh252:2700706:2702400 [1] NCCL INFO ncclCommInitRank comm 0x145ab8119b40 rank 5 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x8447d14ea06660bf - Init COMPLETE +1: jzxh252:2700708:2702399 [3] NCCL INFO ncclCommInitRank comm 0x1460a413fc10 rank 7 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x8447d14ea06660bf - Init COMPLETE +0: jzxh251:3958318:3960017 [1] NCCL INFO ncclCommInitRank comm 0x14ceec116f00 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 2c000 commId 0x8447d14ea06660bf - Init COMPLETE +0: jzxh251:3958320:3960016 [3] NCCL INFO ncclCommInitRank comm 0x14ef70130550 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId ad000 commId 0x8447d14ea06660bf - Init COMPLETE +0: jzxh251:3958317:3960014 [0] NCCL INFO ncclCommInitRank comm 0x147c70132940 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 1b000 commId 0x8447d14ea06660bf - Init COMPLETE +0: jzxh251:3958319:3960015 [2] NCCL INFO ncclCommInitRank comm 0x145828122fa0 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 9d000 commId 0x8447d14ea06660bf - Init COMPLETE +0: {'loss': 0.7349, 'grad_norm': 2.5536512675043928, 'learning_rate': 9.05e-07, 'memory/max_mem_active(gib)': 57.09, 'memory/max_mem_allocated(gib)': 57.09, 'memory/device_mem_reserved(gib)': 65.68, 'epoch': 0.0} +0: 0%| | 0/2235 [00:00