"""
OpsGate Hyperparameters
All training, environment, and scoring config in one place.
Mirrors the centralized config pattern from MADDPG hyperparameters.py
and the weighted scoring system from RoboGraph safety_score.py.
Adjust these before each training run.
"""
# ───────────────────────────────────────────────────────────────
# Environment
# ───────────────────────────────────────────────────────────────
# Episode budget and per-step penalties. Every penalty is a negative number.

# Upper bound on tool calls; the episode ends once this many have been made.
MAX_STEPS_PER_EPISODE: int = 15

# Charged on every tool call, to push the agent toward efficiency.
TOOL_CALL_PENALTY: float = -0.05

# Charged for a call with malformed arguments or an unknown tool.
INVALID_TOOL_PENALTY: float = -0.1

# Charged when a business rule is broken.
POLICY_VIOLATION_PENALTY: float = -0.5
# ───────────────────────────────────────────────────────────────
# Safety Score — Weighted Multi-Metric Scoring (100 pts total)
# Modeled after RoboGraph's _compute_score() system
# ───────────────────────────────────────────────────────────────
# One entry per metric. Each entry carries its "max_points" budget and a
# human-readable "description"; the budgets add up to 100
# (30 + 20 + 15 + 15 + 10 + 10). Some metrics also carry penalty settings,
# presumably points deducted per offending event (confirm against the scorer).
SCORE_WEIGHTS: dict = {
    # Largest share of the score: did the episode reach the right end state?
    "task_completion": {
        "max_points": 30,
        "description": "Correct final state across all tools",
    },
    # Business-rule adherence; two violations at 10 each use up the 20 points.
    "policy_compliance": {
        "max_points": 20,
        "penalty_per_violation": 10,
        "description": "No business rule violations",
    },
    # Call-count efficiency relative to an optimum of 4 calls.
    "tool_efficiency": {
        "max_points": 15,
        "optimal_calls": 4,
        "penalty_per_extra": 3,
        "description": "Fewest tool calls needed to complete task",
    },
    # Stakeholder notification coverage.
    "notification_completeness": {
        "max_points": 15,
        "description": "All stakeholder notifications delivered",
    },
    # Field-level correctness of the final state.
    "state_accuracy": {
        "max_points": 10,
        "description": "Precise field-level correctness in final state",
    },
    # Well-formedness of calls; two invalid calls at 5 each use up the 10 points.
    "action_hygiene": {
        "max_points": 10,
        "penalty_per_invalid": 5,
        "description": "No malformed or invalid calls",
    },
}
# Letter grade -> score threshold, listed from best to worst.
# Presumably each value is the minimum score that earns the grade; confirm
# against the grading code.
GRADE_THRESHOLDS: dict = {
    "A": 90,
    "B": 80,
    "C": 70,
    "D": 60,
    "F": 0,
}

# Letter grade -> color name (presumably consumed by a UI; verify at the caller).
GRADE_COLORS: dict = {
    "A": "emerald",
    "B": "blue",
    "C": "yellow",
    "D": "orange",
    "F": "red",
}

# 3-way verdict: PASS / HOLD / BLOCK
# The two cut-offs coincide with the "A" (90) and "D" (60) grade thresholds.
# Presumably: score >= pass_min_score is PASS, score >= hold_min_score is HOLD,
# anything lower is BLOCK. TODO confirm against the verdict logic.
VERDICT_THRESHOLDS: dict = {
    "pass_min_score": 90,
    "hold_min_score": 60,
}
# ───────────────────────────────────────────────────────────────
# RL Reward Mapping
# ───────────────────────────────────────────────────────────────
# One scalar reward per verdict name (PASS / HOLD / BLOCK).
# Only BLOCK is negative; HOLD earns a small positive reward.

REWARD_PASS: float = 1.0

REWARD_HOLD: float = 0.3

REWARD_BLOCK: float = -0.5
# ───────────────────────────────────────────────────────────────
# Model
# ───────────────────────────────────────────────────────────────

# Identifier of the base model.
MODEL_NAME: str = "unsloth/Llama-3.1-8B-Instruct"

# Sequence length limit (presumably in tokens; confirm where it is consumed).
MAX_SEQ_LENGTH: int = 4096

# LoRA adapter settings; alpha is twice the rank.
LORA_RANK: int = 16
LORA_ALPHA: int = 32

# Names of the projection modules targeted by LoRA.
LORA_TARGETS: list = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
# ───────────────────────────────────────────────────────────────
# GRPO Training
# ───────────────────────────────────────────────────────────────

# Optimizer step size.
LEARNING_RATE: float = 5e-6

# Batch size and gradient accumulation; 4 x 4 = 16 if the trainer multiplies
# them into an effective batch (verify against the trainer configuration).
BATCH_SIZE: int = 4
GRADIENT_ACCUMULATION_STEPS: int = 4

# Generation count (presumably completions sampled per prompt; TODO confirm).
NUM_GENERATIONS: int = 4

# Number of training epochs.
NUM_TRAIN_EPOCHS: int = 3

# Checkpoint and logging cadence, in steps.
SAVE_STEPS: int = 200
LOGGING_STEPS: int = 10

# Sampling settings used during training; temperature is higher than at eval.
MAX_COMPLETION_LENGTH: int = 256
TEMPERATURE: float = 0.7
# ───────────────────────────────────────────────────────────────
# Inference
# ───────────────────────────────────────────────────────────────

# Sampling temperature for evaluation.
EVAL_TEMPERATURE: float = 0.1

# Generation length cap for evaluation (presumably in tokens; confirm at the caller).
EVAL_MAX_TOKENS: int = 256
# ───────────────────────────────────────────────────────────────
# Paths
# ───────────────────────────────────────────────────────────────
# Both directories are relative paths (they start with "./").

# Directory for training checkpoints.
CHECKPOINT_DIR: str = "./opsgate_checkpoints"

# Directory for the final model.
FINAL_MODEL_DIR: str = "./opsgate_final"

# Weights & Biases project name.
WANDB_PROJECT: str = "opsgate"