# Intelligent Tokenizer v6.2.0 Configuration
# Progressive Splitting with GPT-5 Improvements

model:
  name: "IntelligentTokenizerV62"
  version: "6.2.0"
  description: "Progressive splitting tokenizer with multi-level cross-attention"

# Architecture parameters
architecture:
  # Tokenizer settings
  tokenizer:
    content_size: 46        # Actual content bytes
    max_seq_len: 48         # Total with BOS/EOS
    chunk_overlap: 8        # Overlap for sliding window
    vocab_size: 260         # 256 bytes + 4 special tokens

  # Encoder settings (4 layers)
  encoder:
    hidden_dim: 1280        # Unified dimension
    num_heads: 16           # Query heads
    kv_heads: 2             # Key-Value heads (MQA - 8x reduction)
    num_layers: 4           # Total encoder layers
    dropout: 0.1

    # TRUE adaptive splitting (fully learning-based, no hard-coding)
    adaptive_splitting:
      min_tokens: 1         # At least 1 token (48:1 compression)
      max_tokens: 4         # At most 4 tokens (12:1 compression, still 3x BPE)
      # The compression ratio is learned automatically by the model
      # 1 token = 48:1, 2 tokens = 24:1, 3 tokens = 16:1, 4 tokens = 12:1
      learning_based: true  # Fully learning-based
      use_importance: true  # Importance-based asymmetric splitting
      use_gumbel: true      # Differentiable selection via Gumbel-Softmax

    # Gate warmup (GPT suggestion)
    warmup:
      enabled: true
      steps: 1000           # Warmup steps for gates

    # Language clustering
    language:
      clusters: 128         # Reduced from 512 (GPT suggestion)
      embedding_dim: 256

  # Decoder settings (6 layers)
  decoder:
    hidden_dim: 1280        # Match encoder
    num_heads: 16           # Query heads
    kv_heads: 2             # Key-Value heads (MQA)
    num_layers: 6           # 6 layers (reduced from 8)
    dropout: 0.1

    # Memory optimization
    kv_cache:
      enabled: true
      max_cache_size: 512   # Maximum cached tokens

    # Cross-attention levels
    cross_attention:
      levels: [0, 1, 2, 3]    # Which encoder layers to attend to
      fusion: "weighted_sum"  # weighted_sum or concatenate

  # Generation settings
  generation:
    max_length: 512
    temperature: 1.0
    top_k: 50
    top_p: 0.95

# Training configuration
training:
  # Adaptive loss weighting (fully dynamic adjustment)
  adaptive_weights:
    # Initial values only; adjusted automatically during training
    reconstruction: 1.0     # Reconstruction quality (default)
    compression: 2.0        # Compression ratio (maintain 16:1)
    boundary: 1.0           # Boundary learning (weight raised)

  # Dynamic adjustment
  dynamic_loss_scaling: true
  scale_by_performance: true

  # Optimizer settings
  optimizer:
    type: "AdamW"
    learning_rate: 0.00003  # Lowered further for batch 128
    betas: [0.9, 0.95]      # beta2 lowered further (stability)
    eps: 0.000001           # 1e-6 (increased further)
    weight_decay: 0.0005    # Lowered further

  # Scheduler settings
  scheduler:
    type: "CosineAnnealingLR"
    T_max: 100
    eta_min: 0.000005       # Lower minimum value
    warmup_steps: 2000      # Warmup lengthened (1000 -> 2000)

  # Training parameters
  batch_size: 64            # GPU allows it (user running 128)
  gradient_accumulation_steps: 4
  max_grad_norm: 0.3        # Stronger clipping (1.0 -> 0.3)
  fp16: true
  gradient_checkpointing: true

# Logging
logging:
  log_interval: 100
  eval_interval: 500
  save_interval: 1
  wandb:
    enabled: false
    project: "intelligent-tokenizer-v62"

# Dataset configuration
dataset:
  train_path: "data/"
  val_path: "data/"
  test_path: "data/"

  # Data processing
  preprocessing:
    max_length: 2048        # Maximum input length
    stride: 1536            # Stride for long sequences
    min_length: 48          # Minimum sequence length

  # Language distribution (for balanced sampling)
  languages:
    - code: "en"
      weight: 0.3
    - code: "ko"
      weight: 0.2
    - code: "zh"
      weight: 0.15
    - code: "ja"
      weight: 0.1
    - code: "es"
      weight: 0.05
    - code: "fr"
      weight: 0.05
    - code: "de"
      weight: 0.05
    - code: "ru"
      weight: 0.05
    - code: "ar"
      weight: 0.05

# Evaluation metrics
evaluation:
  metrics:
    - compression_ratio          # Target: 8-20x
    - reconstruction_accuracy    # Target: >95%
    - boundary_precision         # Target: >90%
    - language_accuracy          # Target: >95%

  targets:
    compression_ratio:
      min: 12.0       # Even the worst case is 3x BPE (4 tokens)
      optimal: 24.0   # Average target (2 tokens)
      max: 48.0       # Best case (1 token)
    reconstruction_accuracy: 0.90
    boundary_precision: 0.90
    language_accuracy: 0.90

# Hardware settings
hardware:
  device: "cuda"
  num_workers: 4
  pin_memory: true

# Checkpoint settings
checkpoint:
  save_dir: "checkpoints/v62/"
  resume_from: null
  save_best: true
  save_last: true
  max_checkpoints: 5

# Special tokens (must match tokenizer.py)
special_tokens:
  PAD: 256
  BOS: 257
  EOS: 258
  MASK: 259

# Experimental features
experimental:
  # Gumbel-Softmax temperature annealing
  gumbel_annealing:
    enabled: true
    initial_temp: 1.0
    final_temp: 0.3       # 0.1 → 0.3 (unstable if too low)
    anneal_rate: 0.9999   # 0.99995 → 0.9999 (slightly faster annealing)

  # Dynamic token allocation (truly dynamic)
  dynamic_tokens:
    enabled: true
    min_tokens: 1         # At least 1 (48:1)
    max_tokens: 4         # At most 4 (12:1, 3x BPE)
    # quality_threshold removed - the model learns this on its own

  # Boundary learning enhancements
  boundary_learning:
    utf8_aware: true
    word_aware: true
    phrase_aware: true

# Memory optimization
memory:
  gradient_checkpointing: true
  mixed_precision: true
  optimize_cuda: true
  clear_cache_interval: 100
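# ---------------------------------------------------------------------------
# Reference sketch (comments only, not parsed by the trainer): one common way
# the gumbel_annealing values above map to a per-step temperature schedule,
# assuming exponential decay clamped at final_temp. The helper name
# `gumbel_temperature` is hypothetical and not part of this repo.
#
#   def gumbel_temperature(step: int,
#                          initial_temp: float = 1.0,
#                          final_temp: float = 0.3,
#                          anneal_rate: float = 0.9999) -> float:
#       # Exponentially decay the temperature, never dropping below final_temp.
#       return max(final_temp, initial_temp * (anneal_rate ** step))
#
#   # Example: step 0 -> 1.0, step 10_000 -> ~0.37, step 20_000 -> 0.30 (clamped)
# ---------------------------------------------------------------------------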