# Intelligent Tokenizer v6.2.0 Configuration
# Progressive Splitting with GPT-5 Improvements

model:
  name: "IntelligentTokenizerV62"
  version: "6.2.0"
  description: "Progressive splitting tokenizer with multi-level cross-attention"

# Architecture parameters
architecture:
  # Tokenizer settings
  tokenizer:
    content_size: 46        # Actual content bytes
    max_seq_len: 48         # Total with BOS/EOS
    chunk_overlap: 8        # Overlap for sliding window
    vocab_size: 260         # 256 bytes + 4 special tokens

  # Encoder settings (4 layers)
  encoder:
    hidden_dim: 1280        # Unified dimension
    num_heads: 16           # Query heads
    kv_heads: 2             # Key-Value heads (MQA - 8x reduction)
    num_layers: 4           # Total encoder layers
    dropout: 0.1

    # TRUE adaptive splitting (fully learning-based, no hard-coding)
    adaptive_splitting:
      min_tokens: 1         # At least 1 token (48:1 compression)
      max_tokens: 4         # At most 4 tokens (12:1 compression, still 3x BPE)
      # The compression ratio is learned automatically by the model
      # 1 token = 48:1, 2 tokens = 24:1, 3 tokens = 16:1, 4 tokens = 12:1
      learning_based: true  # Fully learning-based
      use_importance: true  # Importance-based asymmetric splitting
      use_gumbel: true      # Differentiable selection via Gumbel-Softmax

    # Gate warmup (GPT suggestion)
    warmup:
      enabled: true
      steps: 1000           # Warmup steps for gates

    # Language clustering
    language:
      clusters: 128         # Reduced from 512 (GPT suggestion)
      embedding_dim: 256

  # Decoder settings (6 layers)
  decoder:
    hidden_dim: 1280        # Match encoder
    num_heads: 16           # Query heads
    kv_heads: 2             # Key-Value heads (MQA)
    num_layers: 6           # 6 layers (reduced from 8)
    dropout: 0.1

    # Memory optimization
    kv_cache:
      enabled: true
      max_cache_size: 512   # Maximum cached tokens

    # Cross-attention levels
    cross_attention:
      levels: [0, 1, 2, 3]    # Which encoder layers to attend to
      fusion: "weighted_sum"  # weighted_sum or concatenate

  # Generation settings
  generation:
    max_length: 512
    temperature: 1.0
    top_k: 50
    top_p: 0.95

# Training configuration
training:
  # Adaptive loss weighting (fully dynamic adjustment)
  adaptive_weights:
    # Initial values only; adjusted automatically during training
    reconstruction: 1.0     # Reconstruction quality (default)
    compression: 2.0        # Compression ratio (maintain 16:1)
    boundary: 1.0           # Boundary learning (weight raised)

  # Dynamic adjustment
  dynamic_loss_scaling: true
  scale_by_performance: true

  # Optimizer settings
  optimizer:
    type: "AdamW"
    learning_rate: 0.00003  # Lowered further for batch 128
    betas: [0.9, 0.95]      # beta2 lowered further (stability)
    eps: 0.000001           # 1e-6 (increased further)
    weight_decay: 0.0005    # Lowered further

  # Scheduler settings
  scheduler:
    type: "CosineAnnealingLR"
    T_max: 100
    eta_min: 0.000005       # Lower minimum value
    warmup_steps: 2000      # Warmup lengthened (1000 -> 2000)

  # Training parameters
  batch_size: 64            # GPU allows it (user running 128)
  gradient_accumulation_steps: 4
  max_grad_norm: 0.3        # Stronger clipping (1.0 -> 0.3)
  fp16: true
  gradient_checkpointing: true

# Logging
logging:
  log_interval: 100
  eval_interval: 500
  save_interval: 1
  wandb:
    enabled: false
    project: "intelligent-tokenizer-v62"

# Dataset configuration
dataset:
  train_path: "data/"
  val_path: "data/"
  test_path: "data/"

  # Data processing
  preprocessing:
    max_length: 2048        # Maximum input length
    stride: 1536            # Stride for long sequences
    min_length: 48          # Minimum sequence length

  # Language distribution (for balanced sampling)
  languages:
    - code: "en"
      weight: 0.3
    - code: "ko"
      weight: 0.2
    - code: "zh"
      weight: 0.15
    - code: "ja"
      weight: 0.1
    - code: "es"
      weight: 0.05
    - code: "fr"
      weight: 0.05
    - code: "de"
      weight: 0.05
    - code: "ru"
      weight: 0.05
    - code: "ar"
      weight: 0.05

# Evaluation metrics
evaluation:
  metrics:
    - compression_ratio          # Target: 8-20x
    - reconstruction_accuracy    # Target: >95%
    - boundary_precision         # Target: >90%
    - language_accuracy          # Target: >95%

  targets:
    compression_ratio:
      min: 12.0       # Even the worst case is 3x BPE (4 tokens)
      optimal: 24.0   # Average target (2 tokens)
      max: 48.0       # Best case (1 token)
    reconstruction_accuracy: 0.90
    boundary_precision: 0.90
    language_accuracy: 0.90

# Hardware settings
hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

# Checkpoint settings
checkpoint:
  save_dir: "checkpoints/v62/"
  resume_from: null
  save_best: true
  save_last: true
  max_checkpoints: 5

# Special tokens (must match tokenizer.py)
special_tokens:
  PAD: 256
  BOS: 257
  EOS: 258
  MASK: 259

# Experimental features
experimental:
  # Gumbel-Softmax temperature annealing
  gumbel_annealing:
    enabled: true
    initial_temp: 1.0
    final_temp: 0.3       # 0.1 → 0.3 (unstable if too low)
    anneal_rate: 0.9999   # 0.99995 → 0.9999 (slightly faster annealing)

  # Dynamic token allocation (truly dynamic)
  dynamic_tokens:
    enabled: true
    min_tokens: 1         # At least 1 (48:1)
    max_tokens: 4         # At most 4 (12:1, 3x BPE)
    # quality_threshold removed - the model learns this on its own

  # Boundary learning enhancements
  boundary_learning:
    utf8_aware: true
    word_aware: true
    phrase_aware: true

# Memory optimization
memory:
  gradient_checkpointing: true
  mixed_precision: true
  optimize_cuda: true
  clear_cache_interval: 100
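# ---------------------------------------------------------------------------
# Reference sketch (comments only, not parsed by the trainer): one common way
# the gumbel_annealing values above map to a per-step temperature schedule,
# assuming exponential decay clamped at final_temp. The helper name
# `gumbel_temperature` is hypothetical and not part of this repo.
#
#   def gumbel_temperature(step: int,
#                          initial_temp: float = 1.0,
#                          final_temp: float = 0.3,
#                          anneal_rate: float = 0.9999) -> float:
#       # Exponentially decay the temperature, never dropping below final_temp.
#       return max(final_temp, initial_temp * (anneal_rate ** step))
#
#   # Example: step 0 -> 1.0, step 10_000 -> ~0.37, step 20_000 -> 0.30 (clamped)
# ---------------------------------------------------------------------------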