Step 53110: 27.1B tokens (Stage 2 in progress), loss=1.463, ppl=4.3

Browse files

Files changed (9) hide show

config.json +45 -0
configuration_saber.py +252 -0
generation_config.json +7 -0
meta.json +7 -0
model.safetensors +3 -0
modeling_saber.py +948 -0
optimizer.pt +3 -0
tokenizer.json +0 -0
tokenizer_config.json +12 -0

config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "architectures": [
+    "SABERForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_saber.SABERConfig",
+    "AutoModelForCausalLM": "modeling_saber.SABERForCausalLM"
+  },
+  "curiosity_coeff": 0.01,
+  "d_anchor": 96,
+  "d_exp": 192,
+  "d_ff": 2164,
+  "d_model": 1536,
+  "dtype": "float32",
+  "enable_anchors": true,
+  "enable_experience": true,
+  "gradient_checkpointing": false,
+  "head_dim": 128,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 2048,
+  "model_type": "saber",
+  "n_anchors": 64,
+  "n_heads": 12,
+  "n_layers": 20,
+  "predictability_mode": false,
+  "resonant_alpha_init": 3.0,
+  "resonant_layers": [
+    0,
+    2,
+    4,
+    6,
+    8,
+    10,
+    12,
+    14,
+    16,
+    18
+  ],
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.3.0",
+  "use_cache": true,
+  "vocab_size": 50257
+}

configuration_saber.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""
+configuration_saber.py — HuggingFace-compatible configuration for Eve-3-SABER-1B.
+Usage:
+    from configuration_saber import SABERConfig
+    config = SABERConfig()                 # default 1B spec
+    config.save_pretrained("./eve-3-saber-1b")
+    config = SABERConfig.from_pretrained("./eve-3-saber-1b")
+"""
+from __future__ import annotations
+from typing import List, Optional
+from transformers import PretrainedConfig
+class SABERConfig(PretrainedConfig):
+    r"""
+    Configuration class for Eve-3-SABER-1B.
+    SABER (Semantic Anchor-Biased Experience-Resonant) is a dense decoder-only
+    transformer with three novel components:
+    1. **Slip-Anchors** — a per-layer learnable codebook that biases K and V
+       *after* RoPE, preserving FlashAttention compatibility.
+    2. **Experience Stream** — a low-dimensional per-token state that flows
+       *layer-to-layer* (not token-to-token), with a curiosity auxiliary loss.
+    3. **Resonant FFN** — even-numbered layers augment SwiGLU with a learned
+       sinusoidal modulation, blended via a trainable alpha.
+    Args:
+        vocab_size (int):
+            Vocabulary size. Defaults to ``50257`` (GPT-2 tokenizer).
+        d_model (int):
+            Hidden/residual dimension. Defaults to ``2048``.
+        n_heads (int):
+            Number of attention heads. Defaults to ``16``.
+        head_dim (int):
+            Per-head dimension; must satisfy ``d_model == n_heads * head_dim``.
+            Defaults to ``128``.
+        n_layers (int):
+            Number of transformer blocks. Defaults to ``24``.
+        d_ff (int):
+            SwiGLU inner dimension. Defaults to ``2855``, the value tuned via
+            ``param_counter.py --tune-dff`` to hit ~1.0B parameters. The
+            original spec value ``5461`` yields ~1.38B params and must be
+            passed explicitly if wanted.
+        max_position_embeddings (int):
+            Maximum sequence length for RoPE. Defaults to ``2048``.
+        rope_theta (float):
+            Base for RoPE frequency computation. Defaults to ``10000.0``.
+        rms_norm_eps (float):
+            Epsilon for RMSNorm numerical stability. Defaults to ``1e-6``.
+        initializer_range (float):
+            Std-dev for weight initialization (Normal). Defaults to ``0.02``.
+        tie_word_embeddings (bool):
+            Whether to tie the LM head weights to the input embedding table.
+            Defaults to ``True``.
+        --- Slip-Anchor hyperparameters ---
+        n_anchors (int):
+            Codebook size. Defaults to ``64``.
+        d_anchor (int):
+            Anchor bottleneck dimension. Defaults to ``128``.
+        --- Experience-stream hyperparameters ---
+        d_exp (int):
+            Experience stream dimension. Defaults to ``256``.
+        curiosity_coeff (float):
+            Weight of curiosity auxiliary loss. Defaults to ``0.01``.
+        --- Resonant-FFN hyperparameters ---
+        resonant_layers (Optional[List[int]]):
+            Which layer indices use the resonant FFN.  ``None`` means "all even
+            layers (0, 2, 4, …)".  Pass an explicit list to override.  The
+            value is copied, so later mutation of the caller's list does not
+            affect the config.  Ignored when ``predictability_mode`` is set.
+        resonant_alpha_init (float):
+            Initial value of ``alpha_raw`` before sigmoid; ``sigmoid(3.0)≈0.95``
+            starts training near pure SwiGLU. Defaults to ``3.0``.
+        --- Predictability mode (GPT-5.2 Thinking) ---
+        predictability_mode (bool):
+            When ``True`` the following overrides are intended:
+            * Anchor gate bias → ``-3`` (anchors nearly silent).
+            * ``U_e`` scale → ``0.05`` (tiny experience updates).
+            * ``resonant_layers`` → last 8 layers only (all layers when the
+              model has fewer than 8).
+            Only the ``resonant_layers`` override is performed by this class;
+            the other two are expected to happen at model construction time.
+            Defaults to ``False``.
+        --- Inference / training toggles ---
+        use_cache (bool):
+            Whether past KV states are returned (not used during training).
+            Defaults to ``True``.
+        gradient_checkpointing (bool):
+            Enable activation checkpointing.  Set via
+            ``model.gradient_checkpointing_enable()`` rather than here in most
+            cases. Defaults to ``False``.
+        --- Ablation flags ---
+        enable_anchors (bool):
+            Stored for the model code to switch slip-anchor modulation on or
+            off. Defaults to ``True``.
+        enable_experience (bool):
+            Stored for the model code to switch the experience-stream update
+            on or off. Defaults to ``True``.
+    Raises:
+        ValueError: if ``d_model != n_heads * head_dim``.
+    """
+    # Required by HuggingFace AutoModel registry
+    model_type: str = "saber"
+    # Map canonical HF attribute names to SABER field names so that
+    # generic HF utilities (e.g. model.config.hidden_size) work transparently.
+    # ``max_position_embeddings`` already uses the canonical name, so it needs
+    # no alias entry.
+    attribute_map = {
+        "hidden_size":              "d_model",
+        "num_hidden_layers":        "n_layers",
+        "num_attention_heads":      "n_heads",
+        "intermediate_size":        "d_ff",
+    }
+    def __init__(
+        self,
+        # Core architecture
+        vocab_size: int = 50257,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        head_dim: int = 128,
+        n_layers: int = 24,
+        d_ff: int = 2855,
+        max_position_embeddings: int = 2048,
+        rope_theta: float = 10_000.0,
+        rms_norm_eps: float = 1e-6,
+        initializer_range: float = 0.02,
+        tie_word_embeddings: bool = True,
+        # Slip-anchor
+        n_anchors: int = 64,
+        d_anchor: int = 128,
+        # Experience stream
+        d_exp: int = 256,
+        curiosity_coeff: float = 0.01,
+        # Resonant FFN
+        resonant_layers: Optional[List[int]] = None,
+        resonant_alpha_init: float = 3.0,
+        # Predictability mode
+        predictability_mode: bool = False,
+        # Inference / training toggles
+        use_cache: bool = True,
+        gradient_checkpointing: bool = False,
+        # Ablation flags (component enable/disable)
+        enable_anchors: bool = True,
+        enable_experience: bool = True,
+        **kwargs,
+    ) -> None:
+        # ------------------------------------------------------------------ #
+        # Validate key relationships
+        # ------------------------------------------------------------------ #
+        if d_model != n_heads * head_dim:
+            raise ValueError(
+                f"d_model ({d_model}) must equal n_heads ({n_heads}) × "
+                f"head_dim ({head_dim}) = {n_heads * head_dim}."
+            )
+        # ------------------------------------------------------------------ #
+        # Core
+        # ------------------------------------------------------------------ #
+        self.vocab_size             = vocab_size
+        self.d_model                = d_model
+        self.n_heads                = n_heads
+        self.head_dim               = head_dim
+        self.n_layers               = n_layers
+        self.d_ff                   = d_ff
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta             = rope_theta
+        self.rms_norm_eps           = rms_norm_eps
+        self.initializer_range      = initializer_range
+        # ------------------------------------------------------------------ #
+        # Slip-anchor
+        # ------------------------------------------------------------------ #
+        self.n_anchors   = n_anchors
+        self.d_anchor    = d_anchor
+        # ------------------------------------------------------------------ #
+        # Experience stream
+        # ------------------------------------------------------------------ #
+        self.d_exp          = d_exp
+        self.curiosity_coeff = curiosity_coeff
+        # ------------------------------------------------------------------ #
+        # Resonant FFN / predictability mode
+        # ------------------------------------------------------------------ #
+        self.predictability_mode = predictability_mode
+        if predictability_mode:
+            # Last 8 layers only.  Clamp at 0 so that a model with fewer than
+            # 8 layers does not end up with negative layer indices.
+            chosen_layers = list(range(max(0, n_layers - 8), n_layers))
+        elif resonant_layers is None:
+            # Default: every even-indexed layer.
+            chosen_layers = list(range(0, n_layers, 2))
+        else:
+            # Copy so the config never aliases a list owned by the caller.
+            chosen_layers = list(resonant_layers)
+        self.resonant_layers     = chosen_layers
+        self.resonant_alpha_init = resonant_alpha_init
+        # ------------------------------------------------------------------ #
+        # Inference / training
+        # ------------------------------------------------------------------ #
+        self.use_cache              = use_cache
+        self.gradient_checkpointing = gradient_checkpointing
+        # Ablation flags — allow disabling novel components
+        self.enable_anchors    = enable_anchors
+        self.enable_experience = enable_experience
+        # ------------------------------------------------------------------ #
+        # Pass through to PretrainedConfig (handles tie_word_embeddings, etc.)
+        # ------------------------------------------------------------------ #
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    # ---------------------------------------------------------------------- #
+    # Derived helpers (read-only properties, not serialized)
+    # ---------------------------------------------------------------------- #
+    @property
+    def num_key_value_heads(self) -> int:
+        """Alias for n_heads (SABER uses MHA, not GQA)."""
+        return self.n_heads
+    @property
+    def n_resonant_layers(self) -> int:
+        """Number of layers that use the resonant FFN."""
+        return len(self.resonant_layers)
+    def __repr__(self) -> str:  # noqa: D401
+        """Return a compact multi-line summary of the main hyperparameters."""
+        all_even = list(range(0, self.n_layers, 2))
+        if self.resonant_layers == all_even:
+            resonant_str = f"all-even (n={self.n_resonant_layers})"
+        else:
+            resonant_str = str(self.resonant_layers)
+        return (
+            f"SABERConfig(\n"
+            f"  d_model={self.d_model}, n_heads={self.n_heads}, "
+            f"head_dim={self.head_dim}, n_layers={self.n_layers},\n"
+            f"  d_ff={self.d_ff}, vocab_size={self.vocab_size}, "
+            f"max_seq={self.max_position_embeddings},\n"
+            f"  n_anchors={self.n_anchors}, d_anchor={self.d_anchor}, "
+            f"d_exp={self.d_exp},\n"
+            f"  curiosity_coeff={self.curiosity_coeff}, "
+            f"resonant_layers={resonant_str},\n"
+            f"  resonant_alpha_init={self.resonant_alpha_init}, "
+            f"predictability_mode={self.predictability_mode},\n"
+            f"  tie_word_embeddings={self.tie_word_embeddings}, "
+            f"use_cache={self.use_cache}\n"
+            f")"
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.3.0",
+  "use_cache": true
+}

meta.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "step": 53110,
+  "tokens_seen": 27108895744,
+  "stage_idx": 1,
+  "wandb_run_id": null,
+  "total_target": 50000000000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c91927e9f167f59f63c90c285590e3fd3c4ead4eb6474bd6ddeba597c6806ea4
+size 1999952456

modeling_saber.py ADDED Viewed

	@@ -0,0 +1,948 @@

+"""
+modeling_saber.py — Full PyTorch implementation of Eve-3-SABER-1B.
+Architecture highlights
+-----------------------
+* Dense decoder-only transformer with pre-RMSNorm.
+* RoPE (rotary position embeddings) applied to Q and K after head reshape.
+* **Slip-Anchors**: learnable codebook biases K/V *after* RoPE, fully
+  compatible with FlashAttention / F.scaled_dot_product_attention.
+* **Experience Stream**: a per-token, layer-traversing state with a curiosity
+  auxiliary loss (prediction-error on a stop-gradient summary).
+* **Resonant FFN**: even-indexed layers augment SwiGLU with a learned
+  sinusoidal modulation blended by a trainable scalar alpha.
+* Weight-tied LM head.
+* Gradient-checkpointing support.
+Intended usage (HuggingFace Trainer / SFTTrainer compatible):
+    from configuration_saber import SABERConfig
+    from modeling_saber import SABERForCausalLM
+    config = SABERConfig()
+    model  = SABERForCausalLM(config)
+"""
+from __future__ import annotations
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from transformers import PreTrainedModel
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
+from transformers.utils import logging
+from configuration_saber import SABERConfig
+logger = logging.get_logger(__name__)
+# ---------------------------------------------------------------------------
+# 1. RMSNorm
+# ---------------------------------------------------------------------------
+class SABERRMSNorm(nn.Module):
+    """RMS normalisation over the last dimension with a learnable gain and no bias."""
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        # Per-channel gain, initialised to one.
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.eps    = eps
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        """Divide ``x`` (..., hidden_size) by the RMS of its last dimension."""
+        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_square + self.eps)
+        return x * inv_rms
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Normalise in float32, apply the gain, and return in the input dtype."""
+        in_dtype = x.dtype
+        normalised = self._norm(x.float())
+        scaled = normalised * self.weight.float()
+        return scaled.to(in_dtype)
+# ---------------------------------------------------------------------------
+# 2. Rotary Position Embeddings (RoPE)
+# ---------------------------------------------------------------------------
+class SABERRotaryEmbedding(nn.Module):
+    """
+    Rotary position embeddings in the real-valued "rotate-half" formulation
+    (Llama / GPT-NeoX style).
+    cos/sin tables are cached up to ``max_seq_len`` and rebuilt on the fly if
+    a longer sequence is encountered.
+    """
+    def __init__(
+        self,
+        head_dim: int,
+        max_seq_len: int = 2048,
+        theta: float = 10_000.0,
+        device: Optional[torch.device] = None,
+    ) -> None:
+        super().__init__()
+        self.head_dim    = head_dim
+        self.max_seq_len = max_seq_len
+        self.theta       = theta
+        # Precompute inverse frequencies (half of head_dim)
+        inv_freq = 1.0 / (
+            theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+                      / head_dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._build_cache(max_seq_len, device=device)
+    def _build_cache(self, seq_len: int, device: Optional[torch.device] = None) -> None:
+        """Build (or rebuild) the cos/sin cache for positions ``0 .. seq_len-1``."""
+        target = self.inv_freq.device if device is None else device
+        t       = torch.arange(seq_len, dtype=torch.float32, device=target)
+        # Move inv_freq next to ``t`` so the outer product never mixes devices.
+        freqs   = torch.outer(t, self.inv_freq.to(target))   # (seq_len, head_dim/2)
+        emb     = torch.cat([freqs, freqs], dim=-1)           # (seq_len, head_dim)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+        self.max_seq_len = seq_len
+    @staticmethod
+    def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+        """Split the last dimension into halves ``(x1, x2)`` and return ``(-x2, x1)``."""
+        half = x.shape[-1] // 2
+        x1, x2 = x[..., :half], x[..., half:]
+        return torch.cat([-x2, x1], dim=-1)
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        seq_len: int,
+        position_ids: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply RoPE to q and k.
+        q, k: (batch, n_heads, q_len, head_dim)
+        seq_len: total sequence length including any cached prefix.
+        position_ids: (batch, q_len) or None.  When ``None`` the tokens in
+            q/k are taken to be the *last* ``q_len`` positions of a sequence
+            of length ``seq_len``; with ``q_len == seq_len`` that is simply
+            positions ``0 .. seq_len-1``.
+        Returns the rotated ``(q, k)`` with unchanged shapes.
+        """
+        q_len = q.shape[-2]
+        total_len = max(seq_len, q_len)
+        if total_len > self.max_seq_len:
+            self._build_cache(total_len, device=q.device)
+        if position_ids is not None:
+            # Gather cos/sin for the specific positions in this batch.
+            # cos_cached: (1, 1, max_seq, head_dim) → flatten to (max_seq, head_dim)
+            # then index with position_ids (B, L) → (B, L, head_dim)
+            # and unsqueeze head axis → (B, 1, L, head_dim)
+            # NOTE(review): position ids beyond the cached length are not
+            # checked here and would fail at the indexing step.
+            cos_2d = self.cos_cached.squeeze(0).squeeze(0).to(q.dtype)  # (max_seq, head_dim)
+            sin_2d = self.sin_cached.squeeze(0).squeeze(0).to(q.dtype)
+            cos = cos_2d[position_ids].unsqueeze(1)   # (B, 1, L, head_dim)
+            sin = sin_2d[position_ids].unsqueeze(1)
+        else:
+            # Slice the trailing q_len positions.  Slicing [:seq_len] here
+            # would hand a (1, 1, seq_len, head_dim) table to a q of length
+            # q_len, which mis-broadcasts during cached decoding.
+            start = total_len - q_len
+            cos = self.cos_cached[:, :, start:total_len, :].to(q.dtype)   # (1, 1, q_len, head_dim)
+            sin = self.sin_cached[:, :, start:total_len, :].to(q.dtype)
+        q_rot = q * cos + self._rotate_half(q) * sin
+        k_rot = k * cos + self._rotate_half(k) * sin
+        return q_rot, k_rot
+# ---------------------------------------------------------------------------
+# 3. Slip-Anchors
+# ---------------------------------------------------------------------------
+class SlipAnchors(nn.Module):
+    """
+    Codebook-driven additive bias for attention keys and values.
+    The hidden state is projected into a small anchor space, softly matched
+    against a learnable codebook, and the resulting context vector is mapped
+    to one K bias and one V bias per token, shared by every head.
+    Parameters
+    ----------
+    d_model   : residual hidden dimension
+    n_anchors : number of codebook entries
+    d_anchor  : width of the anchor space
+    head_dim  : per-head dimension (width of the produced biases)
+    n_heads   : number of attention heads (stored, not used in the maths)
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_anchors: int,
+        d_anchor: int,
+        head_dim: int,
+        n_heads: int,
+    ) -> None:
+        super().__init__()
+        self.n_anchors = n_anchors
+        self.d_anchor  = d_anchor
+        self.n_heads   = n_heads
+        self.head_dim  = head_dim
+        # Codebook of shape (n_anchors, d_anchor); filled in by _init_weights.
+        self.anchors       = nn.Parameter(torch.empty(n_anchors, d_anchor))
+        # Hidden state → anchor space.
+        self.W_anchor_down = nn.Linear(d_model, d_anchor, bias=False)
+        # Anchor context → additive bias for K and for V.
+        self.U_k = nn.Linear(d_anchor, head_dim, bias=False)
+        self.U_v = nn.Linear(d_anchor, head_dim, bias=False)
+        self._init_weights()
+    def _init_weights(self) -> None:
+        """Draw the codebook and all projections from N(0, 0.02²)."""
+        for tensor in (
+            self.anchors,
+            self.W_anchor_down.weight,
+            self.U_k.weight,
+            self.U_v.weight,
+        ):
+            nn.init.normal_(tensor, std=0.02)
+    def forward(
+        self,
+        h: torch.Tensor,                  # (B, L, d_model) — pre-attention hidden state
+        K: torch.Tensor,                  # (B, n_heads, L, head_dim) — post-RoPE
+        V: torch.Tensor,                  # (B, n_heads, L, head_dim)
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return ``(K + k_bias, V + v_bias)`` with the biases broadcast over heads."""
+        # Unpacking doubles as a check that h is rank-3.
+        _batch, _seq, _width = h.shape
+        # Soft assignment of every token to the codebook: (B, L, n_anchors).
+        projected = self.W_anchor_down(h)
+        assignment = torch.softmax(projected @ self.anchors.T, dim=-1)
+        # Convex combination of codebook rows: (B, L, d_anchor).
+        context = assignment @ self.anchors
+        # Insert a singleton head axis so one bias serves all heads.
+        key_shift = self.U_k(context).unsqueeze(1)     # (B, 1, L, head_dim)
+        value_shift = self.U_v(context).unsqueeze(1)   # (B, 1, L, head_dim)
+        return K + key_shift, V + value_shift
+# ---------------------------------------------------------------------------
+# 4. Attention
+# ---------------------------------------------------------------------------
+class SABERAttention(nn.Module):
+    """
+    Multi-head attention with:
+    * No projection biases.
+    * RoPE applied to Q and K after head reshape.
+    * Slip-anchor modulation of K and V after RoPE.
+    * F.scaled_dot_product_attention (FlashAttention 2 compatible).
+    KV-cache contract: the cached keys/values are stored *after* RoPE and
+    slip-anchor modulation, so each token's anchor bias is computed once from
+    its own hidden state and incremental decoding matches a full-sequence
+    forward pass.
+    """
+    def __init__(self, config: SABERConfig, layer_idx: int) -> None:
+        super().__init__()
+        self.config    = config
+        self.layer_idx = layer_idx
+        self.d_model   = config.d_model
+        self.n_heads   = config.n_heads
+        self.head_dim  = config.head_dim
+        # QKV and O projections — no bias throughout
+        self.q_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        self.k_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        self.v_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        self.o_proj = nn.Linear(self.d_model, self.d_model, bias=False)
+        # Rotary embeddings — instantiated per attention layer so the module
+        # is usable standalone.
+        self.rotary_emb = SABERRotaryEmbedding(
+            head_dim=self.head_dim,
+            max_seq_len=config.max_position_embeddings,
+            theta=config.rope_theta,
+        )
+        # Slip-anchors
+        self.slip_anchors = SlipAnchors(
+            d_model=self.d_model,
+            n_anchors=config.n_anchors,
+            d_anchor=config.d_anchor,
+            head_dim=self.head_dim,
+            n_heads=self.n_heads,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,          # (B, L, d_model)
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, ...]:
+        """
+        Run self-attention over ``hidden_states``.
+        Parameters
+        ----------
+        hidden_states  : (B, L, d_model) input for the L new tokens.
+        attention_mask : optional mask handed to SDPA unchanged; when absent
+                         a causal mask is used for L > 1.
+        position_ids   : optional (B, L) absolute positions for RoPE; when
+                         absent the new tokens are placed directly after the
+                         cached prefix.
+        past_key_value : optional ``(K, V)`` cache, each
+                         (B, n_heads, L_past, head_dim).
+        use_cache      : also return the updated ``(K, V)`` cache.
+        output_attentions : append a ``None`` placeholder (SDPA does not
+                         expose attention weights).
+        Returns
+        -------
+        ``(attn_out,)`` followed by ``present_kv`` if ``use_cache`` and by
+        ``None`` if ``output_attentions``.
+        """
+        B, L, _ = hidden_states.shape
+        # ---- QKV projections ----
+        Q = self.q_proj(hidden_states)   # (B, L, d_model)
+        K = self.k_proj(hidden_states)
+        V = self.v_proj(hidden_states)
+        # ---- Reshape to (B, n_heads, L, head_dim) ----
+        def _reshape(t: torch.Tensor) -> torch.Tensor:
+            return t.view(B, L, self.n_heads, self.head_dim).transpose(1, 2)
+        Q, K, V = _reshape(Q), _reshape(K), _reshape(V)
+        # ---- Apply RoPE to Q and K ----
+        past_len = past_key_value[0].shape[-2] if past_key_value is not None else 0
+        kv_seq_len = past_len + L
+        if position_ids is None and past_len > 0:
+            # The new tokens sit at absolute positions past_len .. kv_seq_len-1;
+            # make that explicit so they are not rotated as positions 0 .. L-1.
+            position_ids = torch.arange(
+                past_len, kv_seq_len, device=hidden_states.device
+            ).unsqueeze(0)
+        Q, K = self.rotary_emb(Q, K, seq_len=kv_seq_len, position_ids=position_ids)
+        # ---- Slip-anchor modulation of the *new* K and V ----
+        # Must happen before concatenating with the cache: the bias is derived
+        # from the L new hidden states and therefore only matches the L new
+        # key/value rows.  Pass raw h (pre-attn hidden state) to avoid
+        # circularity.
+        if getattr(self.config, 'enable_anchors', True):
+            K, V = self.slip_anchors(hidden_states, K, V)
+        # ---- KV cache ----
+        if past_key_value is not None:
+            K = torch.cat([past_key_value[0], K], dim=2)
+            V = torch.cat([past_key_value[1], V], dim=2)
+        present_kv = (K, V) if use_cache else None
+        # ---- Scaled dot-product attention (FlashAttention 2 compatible) ----
+        attn_mask = attention_mask
+        is_causal = attn_mask is None and L > 1
+        if is_causal and past_len > 0:
+            # SDPA's built-in causal mask is aligned to the top-left corner,
+            # which is wrong when queries are the tail of a longer key
+            # sequence.  Build the bottom-right aligned mask explicitly:
+            # query i (absolute position past_len + i) sees keys 0..past_len+i.
+            attn_mask = torch.ones(
+                L, kv_seq_len, dtype=torch.bool, device=Q.device
+            ).tril(diagonal=past_len)
+            is_causal = False
+        attn_out = F.scaled_dot_product_attention(
+            Q, K, V,
+            attn_mask=attn_mask,
+            dropout_p=0.0,
+            is_causal=is_causal,
+        )   # (B, n_heads, L, head_dim)
+        # ---- Merge heads and project ----
+        attn_out = attn_out.transpose(1, 2).contiguous().view(B, L, self.d_model)
+        attn_out = self.o_proj(attn_out)
+        outputs: Tuple = (attn_out,)
+        if use_cache:
+            outputs += (present_kv,)
+        if output_attentions:
+            # Attention weights are not explicitly computed when using SDPA
+            outputs += (None,)
+        return outputs
+# ---------------------------------------------------------------------------
+# 5. Experience Stream
+# ---------------------------------------------------------------------------
+class ExperienceStream(nn.Module):
+    """
+    One layer's update of the experience state, plus its curiosity loss.
+    The hidden state is summarised into experience space; the previous state
+    is asked to predict that summary (the squared error is the curiosity
+    loss); then the state is decayed, incremented and layer-normalised.
+    Parameters
+    ----------
+    d_model : residual hidden dimension
+    d_exp   : experience state dimension
+    """
+    def __init__(self, d_model: int, d_exp: int) -> None:
+        super().__init__()
+        # Hidden state → experience-space summary.
+        self.W_s    = nn.Linear(d_model, d_exp, bias=False)
+        # Previous state → predicted summary (drives the curiosity loss).
+        self.W_pred = nn.Linear(d_exp,   d_exp, bias=False)
+        # Summary → state increment.
+        self.W_e    = nn.Linear(d_exp,   d_exp, bias=False)
+        # Per-channel decay logit; sigmoid(3.0) ~ 0.95 keeps most of the state.
+        self.decay_raw = nn.Parameter(torch.full((d_exp,), 3.0))
+        # Keeps the state magnitude bounded from layer to layer.
+        self.exp_norm = nn.LayerNorm(d_exp)
+    def forward(
+        self,
+        h: torch.Tensor,                  # (B, L, d_model) hidden state
+        experience_state: torch.Tensor,   # (B, L, d_exp)   previous state
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns
+        -------
+        new_experience_state : (B, L, d_exp)
+        curiosity_loss       : scalar tensor
+        """
+        summary = self.W_s(h)                              # (B, L, d_exp)
+        predicted = self.W_pred(experience_state)          # (B, L, d_exp)
+        # The target is detached: the loss trains the predictor only and does
+        # not push gradients back through the summary projection.
+        prediction_error = summary.detach() - predicted
+        curiosity_loss = prediction_error.pow(2).mean()
+        # Leaky accumulation followed by normalisation.
+        keep = torch.sigmoid(self.decay_raw)               # (d_exp,)
+        increment = F.silu(self.W_e(summary))              # (B, L, d_exp)
+        updated = self.exp_norm(keep * experience_state + increment)
+        return updated, curiosity_loss
+# ---------------------------------------------------------------------------
+# 6. Feed-forward networks
+# ---------------------------------------------------------------------------
+class StandardFFN(nn.Module):
+    """Plain SwiGLU feed-forward block: ``W2(silu(W1 x) * W3 x)``."""
+    def __init__(self, d_model: int, d_ff: int) -> None:
+        super().__init__()
+        self.W1 = nn.Linear(d_model, d_ff, bias=False)   # gate branch
+        self.W3 = nn.Linear(d_model, d_ff, bias=False)   # up branch
+        self.W2 = nn.Linear(d_ff,   d_model, bias=False) # back to d_model
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Gate the up-projection with silu(gate), then project down."""
+        gate = F.silu(self.W1(x))
+        up = self.W3(x)
+        return self.W2(gate * up)
+class ResonantFFN(nn.Module):
+    """
+    SwiGLU feed-forward block with a learned sinusoidal modulation.
+    A scalar ``alpha = sigmoid(alpha_raw)`` blends the plain SwiGLU output
+    with a copy of it scaled by ``1 + sin(W_freq x)``:
+        ffn_out  = W2(silu(W1(x)) * W3(x))
+        mod      = sin(W_freq(x))
+        output   = alpha * ffn_out + (1 - alpha) * ffn_out * (1 + mod)
+    With the default ``alpha_init = 3.0`` alpha starts at about 0.95.
+    """
+    def __init__(self, d_model: int, d_ff: int, alpha_init: float = 3.0) -> None:
+        super().__init__()
+        # SwiGLU projections: gate, up, down.
+        self.W1 = nn.Linear(d_model, d_ff,    bias=False)
+        self.W3 = nn.Linear(d_model, d_ff,    bias=False)
+        self.W2 = nn.Linear(d_ff,    d_model, bias=False)
+        # Produces the phase fed to sin().
+        self.W_freq = nn.Linear(d_model, d_model, bias=False)
+        # Blend logit (one scalar per layer).
+        self.alpha_raw = nn.Parameter(torch.tensor(alpha_init))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the alpha-blend of plain and sinusoidally modulated SwiGLU."""
+        gate = F.silu(self.W1(x))
+        ffn_out = self.W2(gate * self.W3(x))                   # (B, L, d_model)
+        mod = torch.sin(self.W_freq(x))                        # (B, L, d_model)
+        alpha = torch.sigmoid(self.alpha_raw)                  # scalar in (0, 1)
+        plain_part = alpha * ffn_out
+        modulated_part = (1.0 - alpha) * (ffn_out * (1.0 + mod))
+        return plain_part + modulated_part
+# ---------------------------------------------------------------------------
+# 7. Transformer Block
+# ---------------------------------------------------------------------------
+class SABERBlock(nn.Module):
+    """
+    One SABER decoder layer (pre-norm residual layout).
+        h = h + Attention(RMSNorm(h))
+        h = h + FFN(RMSNorm(h))
+        experience_state, curiosity = ExperienceStream(h, experience_state)
+    The FFN is a ``ResonantFFN`` when ``layer_idx`` is listed in
+    ``config.resonant_layers`` and a ``StandardFFN`` otherwise.
+    """
+    def __init__(self, config: SABERConfig, layer_idx: int) -> None:
+        super().__init__()
+        self.config    = config
+        self.layer_idx = layer_idx
+        eps = config.rms_norm_eps
+        self.input_layernorm      = SABERRMSNorm(config.d_model, eps=eps)
+        self.post_attention_layernorm = SABERRMSNorm(config.d_model, eps=eps)
+        self.self_attn = SABERAttention(config, layer_idx=layer_idx)
+        # Pick the feed-forward flavour for this layer.
+        use_resonant = layer_idx in config.resonant_layers
+        if not use_resonant:
+            self.ffn: nn.Module = StandardFFN(d_model=config.d_model, d_ff=config.d_ff)
+        else:
+            self.ffn = ResonantFFN(
+                d_model=config.d_model,
+                d_ff=config.d_ff,
+                alpha_init=config.resonant_alpha_init,
+            )
+        self.experience_stream = ExperienceStream(
+            d_model=config.d_model,
+            d_exp=config.d_exp,
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,                   # (B, L, d_model)
+        experience_state: torch.Tensor,                # (B, L, d_exp)
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+    ) -> Tuple:
+        """
+        Run the layer.
+        Returns ``(hidden_states, experience_state, curiosity_loss)`` followed
+        by whatever extra items the attention module returned (the KV cache
+        and/or the attention-weights placeholder).
+        """
+        # ---- Attention sub-layer with residual ----
+        attn_outputs = self.self_attn(
+            self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + attn_outputs[0]
+        # ---- Feed-forward sub-layer with residual ----
+        ffn_out = self.ffn(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + ffn_out
+        # ---- Experience stream (skipped when ablated via the config) ----
+        if not getattr(self.config, 'enable_experience', True):
+            # State passes through untouched; the loss is a constant zero.
+            curiosity_loss = torch.tensor(0.0, device=hidden_states.device)
+        else:
+            experience_state, curiosity_loss = self.experience_stream(
+                hidden_states, experience_state
+            )
+        # Everything after the attention output is forwarded as-is.
+        return (hidden_states, experience_state, curiosity_loss, *attn_outputs[1:])
+# ---------------------------------------------------------------------------
+# 8. Base Model
+# ---------------------------------------------------------------------------
+class SABERModel(PreTrainedModel):
+    """
+    SABER base model: token embeddings → blocks → final RMSNorm.
+    Does not include the LM head — use ``SABERForCausalLM`` for training.
+    """
+    config_class = SABERConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["SABERBlock"]
+    _supports_flash_attn_2 = True
+    def __init__(self, config: SABERConfig) -> None:
+        super().__init__(config)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
+        self.layers       = nn.ModuleList(
+            [SABERBlock(config, layer_idx=i) for i in range(config.n_layers)]
+        )
+        self.norm = SABERRMSNorm(config.d_model, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        self.post_init()   # weight init + gradient-checkpointing setup
+    # ------------------------------------------------------------------ #
+    # Weight initialization (called by post_init via _init_weights)
+    # ------------------------------------------------------------------ #
+    def _init_weights(self, module: nn.Module) -> None:
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            nn.init.normal_(module.weight, mean=0.0, std=std)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=std)
+        elif isinstance(module, SABERRMSNorm):
+            nn.init.ones_(module.weight)
+        elif isinstance(module, SlipAnchors):
+            # Handled inside SlipAnchors._init_weights; no-op here
+            pass
+        # ResonantFFN.alpha_raw: initialised inside the class (default=3.0)
+    # ------------------------------------------------------------------ #
+    # Accessors
+    # ------------------------------------------------------------------ #
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+    def set_input_embeddings(self, value: nn.Embedding) -> None:
+        self.embed_tokens = value
+    # ------------------------------------------------------------------ #
+    # Forward
+    # ------------------------------------------------------------------ #
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutputWithPast, Tuple]:
+        use_cache           = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions   = output_attentions or False
+        output_hidden_states = output_hidden_states or False
+        return_dict         = return_dict if return_dict is not None else self.config.use_return_dict
+        # ---- Embeddings ----
+        if inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError("Provide either input_ids or inputs_embeds.")
+            inputs_embeds = self.embed_tokens(input_ids)
+        B, L, _ = inputs_embeds.shape
+        # ---- Position ids ----
+        if position_ids is None:
+            past_len    = past_key_values[0][0].shape[-2] if past_key_values else 0
+            position_ids = torch.arange(
+                past_len, past_len + L,
+                dtype=torch.long,
+                device=inputs_embeds.device,
+            ).unsqueeze(0).expand(B, -1)
+        # ---- Attention mask conversion for SDPA ----
+        # We rely on SDPA's built-in is_causal flag; user-supplied masks are
+        # passed as-is (e.g., padding masks in float format).
+        # If a 2-D (B, L) boolean mask is supplied, convert to additive float.
+        causal_mask: Optional[torch.Tensor] = None
+        if attention_mask is not None and attention_mask.dim() == 2:
+            # 0 → masked (−∞), 1 → attended (0)
+            # Expand to (B, 1, 1, L) for SDPA broadcasting
+            causal_mask = (
+                (1.0 - attention_mask[:, None, None, :].float())
+                * torch.finfo(inputs_embeds.dtype).min
+            )
+        # ---- Initialise experience state ----
+        # Shape: (B, L, d_exp) — zeros at the start of each sequence.
+        # Note: when using KV cache the sequence length L changes per step;
+        # experience state is kept external to the model for incremental
+        # decoding (callers may pass zeros each step for generation).
+        experience_state = torch.zeros(
+            B, L, self.config.d_exp,
+            dtype=inputs_embeds.dtype,
+            device=inputs_embeds.device,
+        )
+        # ---- Layer loop ----
+        hidden_states      = inputs_embeds
+        all_hidden_states  = () if output_hidden_states else None
+        all_self_attns     = () if output_attentions   else None
+        next_cache         = []
+        total_curiosity    = torch.tensor(0.0, device=inputs_embeds.device,
+                                          dtype=inputs_embeds.dtype)
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_kv = past_key_values[i] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                # Wrap block forward through torch.utils.checkpoint.
+                # Curiosity loss gradient flows normally; only activations
+                # are recomputed.
+                def _make_ckpt_fn(layer, experience_state):
+                    def _fn(hidden_states, causal_mask, position_ids):
+                        return layer(
+                            hidden_states,
+                            experience_state=experience_state,
+                            attention_mask=causal_mask,
+                            position_ids=position_ids,
+                            past_key_value=None,
+                            use_cache=False,
+                            output_attentions=output_attentions,
+                        )
+                    return _fn
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    _make_ckpt_fn(layer, experience_state),
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    use_reentrant=False,
+                )
+            else:
+                layer_outputs = layer(
+                    hidden_states,
+                    experience_state=experience_state,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_kv,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+            hidden_states    = layer_outputs[0]
+            experience_state = layer_outputs[1]
+            total_curiosity  = total_curiosity + layer_outputs[2]
+            # Collect KV cache
+            if use_cache:
+                # present_kv is at index 3 (after hidden, exp_state, curiosity)
+                next_cache.append(layer_outputs[3] if len(layer_outputs) > 3 else None)
+            if output_attentions:
+                # attn weights at last position when output_attentions=True
+                all_self_attns += (layer_outputs[-1],)
+        hidden_states = self.norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        # Average curiosity loss across layers
+        mean_curiosity = total_curiosity / self.config.n_layers
+        next_cache_out = next_cache if use_cache else None
+        if not return_dict:
+            # Always emit a fixed-position tuple so SABERForCausalLM can
+            # index reliably:
+            #   [0] hidden_states
+            #   [1] mean_curiosity
+            #   [2] past_key_values (None when use_cache=False)
+            #   [3] all_hidden_states (None when output_hidden_states=False)
+            #   [4] all_self_attns   (None when output_attentions=False)
+            return (
+                hidden_states,
+                mean_curiosity,
+                next_cache_out,
+                all_hidden_states,
+                all_self_attns,
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache_out,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        ), mean_curiosity
+# ---------------------------------------------------------------------------
+# 9. Causal LM wrapper
+# ---------------------------------------------------------------------------
+class SABERForCausalLM(PreTrainedModel, GenerationMixin):
+    """
+    Eve-3-SABER-1B for causal language modelling.
+    Compatible with HuggingFace ``Trainer``, ``SFTTrainer``, PEFT, and
+    standard ``generate()`` pipelines.
+    Loss = L_CE + curiosity_coeff * L_curiosity
+    """
+    config_class = SABERConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["SABERBlock"]
+    _supports_flash_attn_2 = True
+    # Map required for AutoModel/AutoModelForCausalLM
+    # Dict mapping parameter to its tied source (HF 5.x format)
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
+    def __init__(self, config: SABERConfig) -> None:
+        super().__init__(config)
+        self.model   = SABERModel(config)
+        # LM head — tied to token embeddings (no extra params)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        self.post_init()
+    # ------------------------------------------------------------------ #
+    # Weight tying (called by post_init)
+    # ------------------------------------------------------------------ #
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value: nn.Embedding) -> None:
+        self.model.embed_tokens = value
+    def get_output_embeddings(self) -> nn.Linear:
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
+        self.lm_head = new_embeddings
+    def tie_weights(self, **kwargs) -> None:
+        """Tie lm_head.weight ← embed_tokens.weight."""
+        self.lm_head.weight = self.model.embed_tokens.weight
+    # ------------------------------------------------------------------ #
+    # Forward
+    # ------------------------------------------------------------------ #
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[CausalLMOutputWithPast, Tuple]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # ---- Base model (always use tuple return for clean unpacking) ----
+        # SABERModel always returns (hidden_states, curiosity, [pkv], [all_hs], [all_attn])
+        # when return_dict=False.  We unpack manually and re-wrap for return_dict=True.
+        base_out = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=False,   # always False so we get a plain tuple
+        )
+        # base_out: (hidden_states, curiosity_loss, [pkv], [all_hs], [all_attn])
+        hidden_states  = base_out[0]                      # (B, L, d_model)
+        curiosity_loss = base_out[1]                      # scalar
+        pkv            = base_out[2] if len(base_out) > 2 else None
+        all_hs         = base_out[3] if len(base_out) > 3 else None
+        all_attn       = base_out[4] if len(base_out) > 4 else None
+        # ---- LM logits ----
+        logits = self.lm_head(hidden_states)   # (B, L, vocab_size)
+        # ---- Loss computation ----
+        loss: Optional[torch.Tensor] = None
+        if labels is not None:
+            # Causal LM: predict token t+1 from position t.
+            # Shift logits left by one, labels right by one.
+            shift_logits = logits[:, :-1, :].contiguous()   # (B, L-1, V)
+            shift_labels = labels[:, 1:].contiguous()        # (B, L-1)
+            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+            ce_loss  = loss_fct(
+                shift_logits.view(-1, self.config.vocab_size),
+                shift_labels.view(-1),
+            )
+            loss = ce_loss + self.config.curiosity_coeff * curiosity_loss
+        if not return_dict:
+            out = (logits,)
+            if loss is not None:
+                out = (loss,) + out
+            if pkv is not None:
+                out += (pkv,)
+            return out
+        # Return dict during training (allows extra keys), ModelOutput for inference
+        if labels is not None:
+            return {
+                "loss": loss,
+                "logits": logits,
+                "past_key_values": pkv,
+                "hidden_states": all_hs,
+                "attentions": all_attn,
+                "ce_loss": ce_loss,
+                "curiosity_loss": curiosity_loss,
+            }
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=pkv,
+            hidden_states=all_hs,
+            attentions=all_attn,
+        )
+    # ------------------------------------------------------------------ #
+    # Generation helpers
+    # ------------------------------------------------------------------ #
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> dict:
+        if past_key_values is not None:
+            # Only pass the last token during incremental decoding
+            input_ids = input_ids[:, -1:]
+        # Build position_ids from the current seq length
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values is not None:
+                position_ids = position_ids[:, -1:]
+        model_inputs: dict = {}
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs["inputs_embeds"] = inputs_embeds
+        else:
+            model_inputs["input_ids"] = input_ids
+        model_inputs.update(
+            {
+                "position_ids":    position_ids,
+                "past_key_values": past_key_values,
+                "use_cache":       kwargs.get("use_cache", True),
+                "attention_mask":  attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        beam_idx: torch.LongTensor,
+    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+        """Re-order KV cache for beam search."""
+        return [
+            (
+                past_kv[0].index_select(0, beam_idx.to(past_kv[0].device)),
+                past_kv[1].index_select(0, beam_idx.to(past_kv[1].device)),
+            )
+            for past_kv in past_key_values
+        ]
+# ---------------------------------------------------------------------------
+# Auto-class registration hint (used by HF hub auto-loading)
+# ---------------------------------------------------------------------------
+SABERConfig.register_for_auto_class("AutoConfig")
+SABERForCausalLM.register_for_auto_class("AutoModelForCausalLM")

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a361bdf219883b82d1436a8b13b5c5c17e522a9f40a59e952cadb1d4063a93e8
+size 3997491658

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "is_local": false,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}