Deployment: L25 RAW Probe | Acc: 75.76%
Browse files- README.md +25 -0
- config.json +14 -0
- model.py +15 -0
- model.safetensors +3 -0
README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
library_name: transformers
|
| 4 |
+
tags:
|
| 5 |
+
- fairsteer
|
| 6 |
+
- bias-detection
|
| 7 |
+
- interpretability
|
| 8 |
+
- llama-2
|
| 9 |
+
- linear-probe
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# BAD Classifier (FairSteer): Llama-2-7b-chat-hf Layer 25 (RAW Mode)
|
| 13 |
+
|
| 14 |
+
This **Biased Activation Detector (BAD)** was trained using the FairSteer methodology.
|
| 15 |
+
|
| 16 |
+
## Model Metadata
|
| 17 |
+
- **Base Model:** `meta-llama/Llama-2-7b-chat-hf`
|
| 18 |
+
- **Optimal Layer:** 25
|
| 19 |
+
- **Validation Accuracy:** 75.76%
|
| 20 |
+
- **Extraction Mode:** RAW (Directly matches FairSteer GitHub logic)
|
| 21 |
+
- **Protocol:** 1:1 Balanced Undersampling (Scenario-Grouped)
|
| 22 |
+
|
| 23 |
+
## Usage
|
| 24 |
+
1. Extract the residual-stream activation at the final token position (`hidden_states[:, -1, :]`) from layer 25 of the base model.
|
| 25 |
+
2. Pass the raw activation directly (no normalization) to the linear probe whose weights are stored in `model.safetensors`.
|
config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"project": "FairSteer Llama-2 Debiasing",
|
| 4 |
+
"timestamp": "20260111_1145",
|
| 5 |
+
"optimal_layer": 25,
|
| 6 |
+
"val_accuracy": 0.7575601374570446,
|
| 7 |
+
"extraction_mode": "raw"
|
| 8 |
+
},
|
| 9 |
+
"model_config": {
|
| 10 |
+
"base_model": "meta-llama/Llama-2-7b-chat-hf",
|
| 11 |
+
"input_dim": 4096,
|
| 12 |
+
"architecture": "LinearProbe"
|
| 13 |
+
}
|
| 14 |
+
}
|
model.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import torch.nn as nn


class BADClassifier(nn.Module):
    """Biased Activation Detector (BAD) linear probe (FairSteer standard).

    A single affine layer that maps one residual-stream activation vector
    to one scalar logit. Tuned for `meta-llama/Llama-2-7b-chat-hf`
    activations taken at layer 25.

    Note: the forward pass returns the *raw* logit — no sigmoid is
    applied — which mirrors the decision function of
    ``cuml.LogisticRegression`` used during training.
    """

    def __init__(self, input_dim: int = 4096) -> None:
        """Build the probe.

        Args:
            input_dim: Width of the residual stream (4096 for Llama-2-7b).
        """
        super().__init__()
        # Single output unit; state-dict key must remain "linear.*" so the
        # published model.safetensors checkpoint loads unchanged.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw bias logit for activation batch ``x``.

        Args:
            x: Tensor of shape ``(batch, input_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)`` — unnormalized logits; apply
            ``torch.sigmoid`` for probabilities.
        """
        # Direct linear pass matching cuml.LogisticRegression
        return self.linear(x)
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d2cebab4f6afde619bdb04044a5c304738016d9d72c01996c0a7d7ac6a7e3c7
|
| 3 |
+
size 16540
|