Spaces:
Running
Running
"""T5-style Layer Normalization (RMSNorm without mean centering).

T5 uses a variant of RMSNorm that does NOT subtract the mean.
This is critical for matching T5's behavior.
"""
| import torch | |
| import torch.nn as nn | |
class T5LayerNorm(nn.Module):
    """T5-style layer normalization without mean centering.

    Similar to RMSNorm, but does NOT subtract the mean from the input.
    Formula: output = weight * x / sqrt(mean(x^2) + eps)

    Args:
        normalized_shape: Size of the trailing dimension to normalize
            (typically d_model).
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, normalized_shape: int, eps: float = 1e-6):
        super().__init__()
        # Scale-only parameter: T5's layer norm has no bias term.
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Normalize over the last dimension.

        Args:
            hidden_states: Tensor of shape (*, normalized_shape).

        Returns:
            Normalized tensor of the same shape, in the weight's dtype
            when the module runs in half precision.
        """
        # Compute variance = mean(x^2) in float32: accumulating squared
        # values in fp16/bf16 loses precision and can overflow, which
        # would NOT match T5's reference behavior. Note no mean
        # subtraction — this is the T5/RMSNorm variant.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # Cast back to the parameter dtype so mixed-precision callers get
        # half-precision activations out (no-op for fp32).
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            hidden_states = hidden_states.to(self.weight.dtype)

        # Scale by the learned weight (no bias in T5).
        return self.weight * hidden_states