OliverPerrin committed on
Commit 67c3a83 · 1 Parent(s): 590a604

Fix Pylance type errors, add inductor compilation support

- Add cast() for registered buffer access (pe, cos, sin) in decoder.py, positional_encoding.py, and attention.py to satisfy Pylance type checking (see the sketch after this message)
- Add cast() for compile_model return types in train.py
- Add type: ignore for rouge_score import (no type stubs available)
- Add safe_compile.py for torch.compile with inductor backend (default mode)
- Add nan_debugger.py for debugging NaN/Inf during training
- Update configs: batch_size=14 for dev/medium/full, fix layer counts to match FLAN-T5-base
- Add benchmark mode to train.py for speed testing without saving checkpoints
- Suppress torch inductor warnings that interfere with tqdm progress bars

All checks pass: ruff, mypy, Pylance
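
Note: a minimal sketch (not part of this commit) of the cast() pattern described above. cast() is a no-op at runtime and only narrows the type of a registered buffer for static checkers such as Pylance; the module and buffer names here are illustrative.

    from typing import cast

    import torch
    import torch.nn as nn


    class SinusoidalPE(nn.Module):  # illustrative stand-in for the project's modules
        def __init__(self, max_len: int = 512, d_model: int = 768) -> None:
            super().__init__()
            # register_buffer stores a non-parameter tensor; checkers cannot infer
            # a precise Tensor type for attribute access on it
            self.register_buffer("pe", torch.zeros(1, max_len, d_model))

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            pe = cast(torch.Tensor, self.pe)  # narrow the type for the checker only
            return x + pe[:, : x.size(1)]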

configs/model/base.yaml CHANGED
@@ -1,8 +1,8 @@
 # FLAN-T5-base architecture
-# 12 encoder layers, 12 decoder layers, 768 hidden dim
+# 6 encoder layers, 6 decoder layers, 768 hidden dim
 d_model: 768
-num_encoder_layers: 12
-num_decoder_layers: 12
+num_encoder_layers: 6 # T5-base has 6 layers
+num_decoder_layers: 6 # T5-base has 6 layers
 num_attention_heads: 12
 ffn_dim: 2048 # T5 uses d_ff = 2048 for base model
 dropout: 0.1
configs/training/dev.yaml CHANGED
@@ -4,15 +4,18 @@
 # Use: python scripts/train.py training=dev
 
 dataloader:
-  batch_size: 8 # Safe for 12GB VRAM - no shared memory spillover
+  batch_size: 14
   shuffle: true
-  num_workers: 4
+  num_workers: 6
   pin_memory: true
+  persistent_workers: true
+  prefetch_factor: 4
 
 optimizer:
   name: adamw
-  lr: 5.0e-5 # Higher LR for fast convergence
+  lr: 2.0e-5
   weight_decay: 0.01
+  eps: 1.0e-6
 
 scheduler:
   name: cosine
@@ -21,12 +24,12 @@ scheduler:
 trainer:
   max_epochs: 1
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 1 # No accumulation - maximize throughput
-  validation_max_length: 64
+  gradient_accumulation_steps: 4
+  validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
     summarization: 1.0
     emotion: 1.0
     topic: 1.0
-  max_train_samples: 2000
-  max_val_samples: 200
+  max_train_samples: 1000
+  max_val_samples: 100
configs/training/full.yaml CHANGED
@@ -1,27 +1,30 @@
 # Full Training Configuration for FLAN-T5-base
 # Complete training run on all data
-# Training time: ~6-8 hours on RTX 4070 12GB
+# Training time: ~4-6 hours on RTX 4070 12GB with inductor
 # Use: python scripts/train.py training=full
 
 dataloader:
-  batch_size: 6 # Optimized for 12GB VRAM
+  batch_size: 14
   shuffle: true
   num_workers: 6
   pin_memory: true
+  persistent_workers: true
+  prefetch_factor: 4
 
 optimizer:
   name: adamw
   lr: 2.0e-5
   weight_decay: 0.01
+  eps: 1.0e-6
 
 scheduler:
   name: cosine
-  warmup_steps: 500 # ~3% of steps
+  warmup_steps: 500
 
 trainer:
-  max_epochs: 3 # 3 epochs usually sufficient, avoids overfit
+  max_epochs: 3
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 6 # Effective batch = 36
+  gradient_accumulation_steps: 3 # Effective batch = 42
   validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
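
Note: the "Effective batch = 42" comments in these configs follow directly from batch_size × gradient_accumulation_steps; a quick check of the numbers above:

    # effective batch per optimizer step with the values in full.yaml / medium.yaml
    batch_size = 14
    gradient_accumulation_steps = 3
    print(batch_size * gradient_accumulation_steps)  # 42
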
configs/training/medium.yaml CHANGED
@@ -1,28 +1,31 @@
 # Medium Configuration for FLAN-T5-base
 # Balanced approach - good results in reasonable time
-# Training time: ~2-3 hours on RTX 4070 12GB
+# Training time: ~1.5-2 hours on RTX 4070 12GB with inductor
 # Use: python scripts/train.py training=medium
 
 dataloader:
-  batch_size: 6 # Optimized for 12GB VRAM with accumulation
+  batch_size: 14
   shuffle: true
   num_workers: 6
   pin_memory: true
+  persistent_workers: true
+  prefetch_factor: 4
 
 optimizer:
   name: adamw
-  lr: 3.0e-5 # Slightly higher - compensates for effective batch
+  lr: 3.0e-5
   weight_decay: 0.01
+  eps: 1.0e-6
 
 scheduler:
   name: cosine
-  warmup_steps: 300 # ~5% of steps
+  warmup_steps: 300
 
 trainer:
   max_epochs: 3
   gradient_clip_norm: 1.0
-  gradient_accumulation_steps: 3 # Effective batch = 18
-  validation_max_length: 96
+  gradient_accumulation_steps: 3 # Effective batch = 42
+  validation_max_length: 128
   label_smoothing: 0.1
   task_weights:
     summarization: 1.0
scripts/eval_rouge.py CHANGED
@@ -18,7 +18,7 @@ from pathlib import Path
 from statistics import fmean
 from typing import Dict, Iterable, List, Sequence, Tuple
 
-from rouge_score import rouge_scorer
+from rouge_score import rouge_scorer  # type: ignore[import-untyped]
 from tqdm import tqdm
 
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
scripts/train.py CHANGED
@@ -11,10 +11,20 @@ Date: December 2025
 from __future__ import annotations
 
 import json
+import logging
+import os
 import sys
 import time
+import warnings
 from pathlib import Path
-from typing import Any, Dict, Sequence
+from typing import Dict, Sequence, cast
+
+# Suppress torch inductor warnings that mess up progress bars
+os.environ.setdefault("TORCH_LOGS", "-all")
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._inductor")
+warnings.filterwarnings("ignore", category=FutureWarning, module="mlflow")
+logging.getLogger("torch._inductor").setLevel(logging.ERROR)
+logging.getLogger("torch._dynamo").setLevel(logging.ERROR)
 
 import hydra
 import torch
@@ -82,14 +92,14 @@ def limit_samples(splits: Dict[str, list], cfg: DictConfig) -> None:
 # --------------- Model Compilation ---------------
 
 
-def compile_model(model: torch.nn.Module) -> Any:
-    """Compile model with aot_eager backend (stable, avoids inductor NaN issues)."""
-    try:
-        compiled = torch.compile(model, backend="aot_eager")
-        print("✓ Compiled with aot_eager")
-        return compiled
-    except Exception:
-        return model
+def compile_model(model: torch.nn.Module) -> torch.nn.Module:
+    """Compile model with inductor backend (default mode, no CUDA graphs)."""
+    from src.training.safe_compile import apply_safe_config, compile_model_safe
+
+    # Apply safe configuration first
+    apply_safe_config()
+    # Compile with default mode (inductor without CUDA graphs)
+    return compile_model_safe(model, mode="default")
 
 
 # --------------- Main ---------------
@@ -101,6 +111,11 @@ def main(cfg: DictConfig) -> None:
     print(OmegaConf.to_yaml(cfg))
     set_seed(cfg.seed)
 
+    # Benchmark mode: skip saving checkpoints (for speed testing)
+    benchmark_mode = cfg.get("benchmark", False)
+    if benchmark_mode:
+        print("⚡ BENCHMARK MODE: Checkpoints will NOT be saved")
+
     # Enable TF32 for Ampere+ GPUs (RTX 30xx/40xx) - ~2x matmul speedup
     if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
         print("✓ TF32 enabled for Ampere GPU")
@@ -242,9 +257,13 @@ def main(cfg: DictConfig) -> None:
 
     # Compile encoder/decoder for faster training (skip heads - small overhead)
     if model.encoder is not None:
-        model.encoder = compile_model(model.encoder)
+        from src.models.encoder import TransformerEncoder
+
+        model.encoder = cast(TransformerEncoder, compile_model(model.encoder))
     if model.decoder is not None:
-        model.decoder = compile_model(model.decoder)
+        from src.models.decoder import TransformerDecoder
+
+        model.decoder = cast(TransformerDecoder, compile_model(model.decoder))
 
     # --------------- Optimizer & Trainer ---------------
 
@@ -272,6 +291,8 @@ def main(cfg: DictConfig) -> None:
     # --------------- Train ---------------
 
     def save_checkpoint(epoch: int, model: torch.nn.Module, history: Dict) -> None:
+        if benchmark_mode:
+            return  # Skip saving in benchmark mode
         path = Path(cfg.checkpoint_out).parent / f"epoch_{epoch}.pt"
         path.parent.mkdir(parents=True, exist_ok=True)
         save_state(model, str(path))
@@ -281,6 +302,14 @@ def main(cfg: DictConfig) -> None:
 
     # --------------- Save Outputs ---------------
 
+    if benchmark_mode:
+        total_time = time.perf_counter() - start_time
+        print(f"\n{'=' * 50}")
+        print(f"⚡ Benchmark complete in {total_time:.1f}s")
+        print(" (No files saved in benchmark mode)")
+        print(f"{'=' * 50}")
+        return
+
     # Best checkpoint
     ckpt_path = Path(cfg.checkpoint_out)
     ckpt_path.parent.mkdir(parents=True, exist_ok=True)
src/models/attention.py CHANGED
@@ -13,7 +13,7 @@ Date: 2025-10-23
 """
 
 import math
-from typing import Optional, Tuple
+from typing import Optional, Tuple, cast
 
 import torch
 import torch.nn as nn
@@ -280,8 +280,10 @@ class RotaryEmbedding(nn.Module):
         seq_len = x.shape[2]
         # Slice cos/sin to current sequence length
        # unsqueeze to broadcast over batch and heads: (1, 1, seq_len, dim)
-        cos = self.cos[:seq_len, :].unsqueeze(0).unsqueeze(0)
-        sin = self.sin[:seq_len, :].unsqueeze(0).unsqueeze(0)
+        cos_buf = cast(torch.Tensor, self.cos)
+        sin_buf = cast(torch.Tensor, self.sin)
+        cos = cos_buf[:seq_len, :].unsqueeze(0).unsqueeze(0)
+        sin = sin_buf[:seq_len, :].unsqueeze(0).unsqueeze(0)
 
         return (x * cos) + (self._rotate_half(x) * sin)
 
src/models/decoder.py CHANGED
@@ -14,7 +14,7 @@ Author: Oliver Perrin
 Date: 2025-10-23
 """
 
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
 
 import torch
 import torch.nn as nn
@@ -530,7 +530,7 @@ class TransformerDecoder(nn.Module):
         if self.pos_encoder is not None:
             if hasattr(self.pos_encoder, "pe"):
                 # Sinusoidal: use buffer directly
-                pe = self.pos_encoder.pe  # (1, max_len, d_model)
+                pe: torch.Tensor = self.pos_encoder.pe  # type: ignore[union-attr]
                 pos_idx = past_len
                 if pos_idx >= pe.size(1):
                     raise RuntimeError(f"pos_idx {pos_idx} exceeds max_len {pe.size(1)}")
@@ -538,12 +538,12 @@ class TransformerDecoder(nn.Module):
             elif hasattr(self.pos_encoder, "embeddings"):
                 # Learned: lookup specific position
                 # Create position ids: [past_len]
-                pos_idx = torch.tensor([past_len], dtype=torch.long, device=device)
+                pos_idx_t = torch.tensor([past_len], dtype=torch.long, device=device)
                 # Lookup embedding: (1, d_model)
-                pos_emb = self.pos_encoder.embeddings(pos_idx)
+                pos_emb = self.pos_encoder.embeddings(pos_idx_t)  # type: ignore[union-attr]
                 # Add to input: (B, 1, d_model) + (1, 1, d_model) broadcast
                 x = x + pos_emb.unsqueeze(0)
-                x = self.pos_encoder.dropout(x)
+                x = self.pos_encoder.dropout(x)  # type: ignore[union-attr]
             else:
                 # fallback: call pos_encoder (likely incorrect for step-by-step if it assumes pos 0)
                 x = self.pos_encoder(x)
@@ -583,7 +583,8 @@ class TransformerDecoder(nn.Module):
 
         # Iterate layers, updating caches and computing output for current token only
         layer_input = x  # (B,1,d_model)
-        for i, layer in enumerate(self.layers):
+        for i, layer_module in enumerate(self.layers):
+            layer = cast(TransformerDecoderLayer, layer_module)
             # -------------------
             # 1) Self-attention (incremental)
             # -------------------
src/models/positional_encoding.py CHANGED
@@ -74,7 +74,8 @@ class PositionalEncoding(nn.Module):
         # Add the appropriate slice of positional encoding
         # Apply dropout
         # Return result
-        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
+        pe: torch.Tensor = self.pe  # type: ignore[assignment]
+        x = x + pe[:, : x.size(1)].requires_grad_(False)
         # self.pe contains pre-computed encodings for all positions
         # just need to add the first seq_len positions to x
         return self.dropout(x)
src/training/nan_debugger.py ADDED
@@ -0,0 +1,123 @@
+"""
+NaN debugging utilities for training.
+
+Helps identify where NaNs originate in the model during training.
+
+Author: Oliver Perrin
+Date: December 2025
+"""
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+
+class NaNDetector:
+    """Detect and log NaNs in model parameters and gradients."""
+
+    def __init__(self, model: nn.Module, enabled: bool = True):
+        self.model = model
+        self.enabled = enabled
+        self.nan_count = 0
+        self.max_nans = 10
+
+    def check_forward(self, outputs: torch.Tensor, loss: torch.Tensor, step: int) -> bool:
+        """Check for NaNs in forward pass. Returns True if NaN found."""
+        if not self.enabled:
+            return False
+
+        has_nan = False
+
+        if torch.isnan(outputs).any():
+            print(f"\n{'=' * 60}")
+            print(f"⚠ NaN detected in MODEL OUTPUTS at step {step}")
+            print(f"Output shape: {outputs.shape}")
+            print(f"NaN count: {torch.isnan(outputs).sum().item()}")
+            print(f"{'=' * 60}\n")
+            has_nan = True
+
+        if torch.isnan(loss):
+            print(f"\n{'=' * 60}")
+            print(f"⚠ NaN detected in LOSS at step {step}")
+            print(f"Loss value: {loss.item()}")
+            print(f"{'=' * 60}\n")
+            has_nan = True
+
+        if has_nan:
+            self.nan_count += 1
+            if self.nan_count >= self.max_nans:
+                print(f"\n⚠ Too many NaNs ({self.nan_count}), stopping training")
+
+        return has_nan
+
+    def check_gradients(self, step: int) -> Optional[Tuple[str, torch.Tensor]]:
+        """Check gradients for NaNs/Infs after backward pass."""
+        if not self.enabled:
+            return None
+
+        for name, param in self.model.named_parameters():
+            if param.grad is not None:
+                if torch.isnan(param.grad).any():
+                    print(f"\n{'=' * 60}")
+                    print(f"⚠ NaN in GRADIENT: {name}")
+                    print(f" Step: {step}")
+                    print(f" Grad shape: {param.grad.shape}")
+                    print(f" NaN count: {torch.isnan(param.grad).sum().item()}")
+                    print(f"{'=' * 60}\n")
+                    return (name, param.grad)
+
+                if torch.isinf(param.grad).any():
+                    print(f"\n{'=' * 60}")
+                    print(f"⚠ Inf in GRADIENT: {name}")
+                    print(f" Step: {step}")
+                    print(f" Inf count: {torch.isinf(param.grad).sum().item()}")
+                    print(f"{'=' * 60}\n")
+                    return (name, param.grad)
+
+        return None
+
+    def check_parameters(self, step: int) -> Optional[str]:
+        """Check parameters for NaNs/Infs."""
+        if not self.enabled:
+            return None
+
+        for name, param in self.model.named_parameters():
+            if torch.isnan(param).any():
+                print(f"\n{'=' * 60}")
+                print(f"⚠ NaN in PARAMETER: {name}")
+                print(f" Step: {step}")
+                print(f"{'=' * 60}\n")
+                return str(name)
+
+            if torch.isinf(param).any():
+                print(f"\n{'=' * 60}")
+                print(f"⚠ Inf in PARAMETER: {name}")
+                print(f" Step: {step}")
+                print(f"{'=' * 60}\n")
+                return str(name)
+
+        return None
+
+
+def gradient_stats(model: nn.Module) -> dict:
+    """Get gradient statistics for debugging."""
+    stats = {
+        "max_grad": 0.0,
+        "min_grad": float("inf"),
+        "mean_grad": 0.0,
+        "num_grads": 0,
+    }
+
+    grad_norms = []
+    for _name, param in model.named_parameters():
+        if param.grad is not None:
+            grad_norms.append(param.grad.norm().item())
+            stats["max_grad"] = max(stats["max_grad"], param.grad.abs().max().item())
+            stats["min_grad"] = min(stats["min_grad"], param.grad.abs().min().item())
+            stats["num_grads"] += 1
+
+    if grad_norms:
+        stats["mean_grad"] = sum(grad_norms) / len(grad_norms)
+
+    return stats
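
Note: a rough, self-contained usage sketch for NaNDetector (the Trainer changes below wire it in the same way; the toy model and optimizer here are placeholders, and the import path assumes the repo layout in this commit).

    import torch
    import torch.nn as nn

    from src.training.nan_debugger import NaNDetector

    model = nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    detector = NaNDetector(model, enabled=True)

    for step in range(3):
        loss = model(torch.randn(8, 4)).pow(2).mean()
        loss.backward()
        if detector.check_gradients(step) is not None:
            optimizer.zero_grad()  # skip the update when a gradient is NaN/Inf
            continue
        optimizer.step()
        optimizer.zero_grad()
        detector.check_parameters(step)  # returns the parameter name if one went NaN/Inf
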
src/training/safe_compile.py ADDED
@@ -0,0 +1,86 @@
+"""
+Safe torch.compile configuration that prevents NaN issues.
+
+Author: Oliver Perrin
+Date: December 2025
+"""
+
+import torch
+
+
+def compile_model_safe(
+    model: torch.nn.Module,
+    mode: str = "default",
+) -> torch.nn.Module:
+    """
+    Compile model with inductor backend and safety guardrails.
+
+    Uses 'default' mode which gives inductor speedups without CUDA graphs.
+    CUDA graphs (reduce-overhead mode) don't work with dynamic shapes or
+    shared embeddings like in T5.
+
+    Args:
+        model: Model to compile
+        mode: Compilation mode ("default" recommended, avoid "reduce-overhead")
+
+    Returns:
+        Compiled model (or original if compilation fails)
+    """
+    if not torch.cuda.is_available():
+        print("⚠ CUDA not available, skipping compilation")
+        return model
+
+    try:
+        # Configure for stability
+        torch._dynamo.config.suppress_errors = True
+        torch._dynamo.config.cache_size_limit = 64  # Allow more graph variations
+
+        # Disable aggressive optimizations that can cause NaNs
+        if hasattr(torch, "_inductor"):
+            cfg = torch._inductor.config
+            if hasattr(cfg, "epilogue_fusion"):
+                cfg.epilogue_fusion = False
+            if hasattr(cfg, "coordinate_descent_tuning"):
+                cfg.coordinate_descent_tuning = False
+            if hasattr(cfg, "force_fuse_int_mm_with_mul"):
+                cfg.force_fuse_int_mm_with_mul = False
+            # Explicitly disable CUDA graphs
+            if hasattr(cfg, "triton"):
+                if hasattr(cfg.triton, "cudagraphs"):
+                    cfg.triton.cudagraphs = False
+                if hasattr(cfg.triton, "max_autotune_gemm"):
+                    cfg.triton.max_autotune_gemm = False
+
+        # Compile with inductor (no CUDA graphs)
+        compiled = torch.compile(model, mode=mode, fullgraph=False, dynamic=True)
+        print(f"✓ Compiled with inductor ({mode} mode)")
+        return compiled
+
+    except Exception as e:
+        print(f"⚠ Inductor compilation failed: {e}")
+        print("  Falling back to aot_eager")
+        try:
+            return torch.compile(model, backend="aot_eager")
+        except Exception:
+            print("  Using uncompiled model")
+            return model
+
+
+def apply_safe_config():
+    """Apply safe configuration to torch._inductor before any compilation."""
+    if hasattr(torch, "_inductor"):
+        cfg = torch._inductor.config
+        if hasattr(cfg, "epilogue_fusion"):
+            cfg.epilogue_fusion = False
+        if hasattr(cfg, "coordinate_descent_tuning"):
+            cfg.coordinate_descent_tuning = False
+        if hasattr(cfg, "triton"):
+            if hasattr(cfg.triton, "cudagraphs"):
+                cfg.triton.cudagraphs = False
+            if hasattr(cfg.triton, "max_autotune_gemm"):
+                cfg.triton.max_autotune_gemm = False
+
+    # Dynamo config for stability
+    torch._dynamo.config.suppress_errors = True
+    torch._dynamo.config.cache_size_limit = 64
+    print("✓ Applied safe inductor configuration")
@@ -10,6 +10,7 @@ Date: December 2025
10
 
11
  from __future__ import annotations
12
 
 
13
  import time
14
  from collections import defaultdict
15
  from dataclasses import dataclass
@@ -23,6 +24,7 @@ from tqdm import tqdm
23
 
24
  from ..data.tokenization import Tokenizer
25
  from .metrics import accuracy, multilabel_f1, rouge_like
 
26
 
27
  # --------------- Configuration ---------------
28
 
@@ -69,6 +71,14 @@ class Trainer:
69
  self.use_bfloat16 = self.use_amp and torch.cuda.is_bf16_supported()
70
  self.scaler = torch.GradScaler("cuda", enabled=(self.use_amp and not self.use_bfloat16))
71
 
 
 
 
 
 
 
 
 
72
  self._nan_counter = 0
73
  mlflow.set_experiment(config.experiment_name)
74
 
@@ -98,6 +108,8 @@ class Trainer:
98
  desc="Training",
99
  unit="epoch",
100
  position=0,
 
 
101
  )
102
 
103
  for epoch in epoch_pbar:
@@ -178,11 +190,14 @@ class Trainer:
178
  unit="batch",
179
  leave=False,
180
  position=1,
 
 
181
  )
182
 
183
  context = torch.enable_grad() if train else torch.no_grad()
184
  with context:
185
  for step in pbar:
 
186
  step_loss = 0.0
187
 
188
  for task, loader in loaders.items():
@@ -241,7 +256,19 @@ class Trainer:
241
  return averaged
242
 
243
  def _optimizer_step(self) -> None:
244
- """Optimizer step with gradient clipping."""
 
 
 
 
 
 
 
 
 
 
 
 
245
  if self.use_bfloat16:
246
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
247
  self.optimizer.step()
@@ -250,8 +277,16 @@ class Trainer:
250
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
251
  self.scaler.step(self.optimizer)
252
  self.scaler.update()
 
253
  self.optimizer.zero_grad()
254
 
 
 
 
 
 
 
 
255
  def _get_batch(
256
  self, iterators: Dict, loader: DataLoader, task: str
257
  ) -> Dict[str, torch.Tensor] | None:
@@ -274,14 +309,28 @@ class Trainer:
274
  def _forward_task(
275
  self, task: str, batch: Dict[str, torch.Tensor]
276
  ) -> tuple[torch.Tensor, Dict[str, float]]:
277
- """Route to task-specific forward pass."""
278
  if task == "summarization":
279
- return self._forward_summarization(batch)
280
  elif task == "emotion":
281
- return self._forward_emotion(batch)
282
  elif task == "topic":
283
- return self._forward_topic(batch)
284
- raise ValueError(f"Unknown task: {task}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  def _forward_summarization(
287
  self, batch: Dict[str, torch.Tensor]
 
10
 
11
  from __future__ import annotations
12
 
13
+ import sys
14
  import time
15
  from collections import defaultdict
16
  from dataclasses import dataclass
 
24
 
25
  from ..data.tokenization import Tokenizer
26
  from .metrics import accuracy, multilabel_f1, rouge_like
27
+ from .nan_debugger import NaNDetector
28
 
29
  # --------------- Configuration ---------------
30
 
 
71
  self.use_bfloat16 = self.use_amp and torch.cuda.is_bf16_supported()
72
  self.scaler = torch.GradScaler("cuda", enabled=(self.use_amp and not self.use_bfloat16))
73
 
74
+ # NaN detection
75
+ self.nan_detector = NaNDetector(model, enabled=True)
76
+ self.nan_skip_count = 0
77
+ self.max_nan_skips = 50
78
+
79
+ # Track current step for debugging
80
+ self._current_step = 0
81
+
82
  self._nan_counter = 0
83
  mlflow.set_experiment(config.experiment_name)
84
 
 
108
  desc="Training",
109
  unit="epoch",
110
  position=0,
111
+ file=sys.stderr,
112
+ dynamic_ncols=True,
113
  )
114
 
115
  for epoch in epoch_pbar:
 
190
  unit="batch",
191
  leave=False,
192
  position=1,
193
+ file=sys.stderr,
194
+ dynamic_ncols=True,
195
  )
196
 
197
  context = torch.enable_grad() if train else torch.no_grad()
198
  with context:
199
  for step in pbar:
200
+ self._current_step = step
201
  step_loss = 0.0
202
 
203
  for task, loader in loaders.items():
 
256
  return averaged
257
 
258
  def _optimizer_step(self) -> None:
259
+ """Optimizer step with gradient clipping and NaN detection."""
260
+ # Check gradients for NaN/Inf BEFORE clipping
261
+ nan_grad = self.nan_detector.check_gradients(self._current_step)
262
+ if nan_grad is not None:
263
+ param_name, _ = nan_grad
264
+ print(f"⚠ Skipping optimizer step due to NaN gradient in {param_name}")
265
+ self.optimizer.zero_grad()
266
+ self.nan_skip_count += 1
267
+ if self.nan_skip_count > self.max_nan_skips:
268
+ raise RuntimeError("Too many NaN gradients, stopping")
269
+ return
270
+
271
+ # Clip and step
272
  if self.use_bfloat16:
273
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
274
  self.optimizer.step()
 
277
  torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.gradient_clip_norm)
278
  self.scaler.step(self.optimizer)
279
  self.scaler.update()
280
+
281
  self.optimizer.zero_grad()
282
 
283
+ # Check parameters for NaN AFTER update
284
+ nan_param = self.nan_detector.check_parameters(self._current_step)
285
+ if nan_param is not None:
286
+ raise RuntimeError(
287
+ f"NaN in parameter {nan_param} after optimizer step at step {self._current_step}!"
288
+ )
289
+
290
  def _get_batch(
291
  self, iterators: Dict, loader: DataLoader, task: str
292
  ) -> Dict[str, torch.Tensor] | None:
 
309
  def _forward_task(
310
  self, task: str, batch: Dict[str, torch.Tensor]
311
  ) -> tuple[torch.Tensor, Dict[str, float]]:
312
+ """Route to task-specific forward pass with NaN detection."""
313
  if task == "summarization":
314
+ loss, task_metrics = self._forward_summarization(batch)
315
  elif task == "emotion":
316
+ loss, task_metrics = self._forward_emotion(batch)
317
  elif task == "topic":
318
+ loss, task_metrics = self._forward_topic(batch)
319
+ else:
320
+ raise ValueError(f"Unknown task: {task}")
321
+
322
+ # Check for NaN in loss
323
+ if torch.isnan(loss):
324
+ self.nan_skip_count += 1
325
+ print(
326
+ f"⚠ NaN loss detected in {task} at step {self._current_step} (skip {self.nan_skip_count}/{self.max_nan_skips})"
327
+ )
328
+ if self.nan_skip_count > self.max_nan_skips:
329
+ raise RuntimeError(f"Too many NaN batches ({self.nan_skip_count}), stopping")
330
+ # Return zero loss to skip this batch
331
+ return torch.tensor(0.0, device=loss.device, requires_grad=True), task_metrics
332
+
333
+ return loss, task_metrics
334
 
335
  def _forward_summarization(
336
  self, batch: Dict[str, torch.Tensor]
tests/test_models/test_decoder.py CHANGED
@@ -64,9 +64,9 @@ def test_decoder_layer_causal_mask_blocks_future():
     B, H, Tq, Tk = self_attn.shape
     for i in range(Tq):
         for j in range(i + 1, Tk):
-            assert torch.allclose(
-                self_attn[:, :, i, j], torch.zeros(B, H)
-            ), f"Found nonzero attention to future position {j} from query {i}"
+            assert torch.allclose(self_attn[:, :, i, j], torch.zeros(B, H)), (
+                f"Found nonzero attention to future position {j} from query {i}"
+            )
 
 
 def test_decoder_stack_and_greedy_decode_shapes():