"""
OpsGate Hyperparameters
All training, environment, and scoring config in one place.
Mirrors the centralized config pattern from MADDPG hyperparameters.py
and the weighted scoring system from RoboGraph safety_score.py.

Adjust these before each training run.
"""

# ═══════════════════════════════════════════════════════════════
#  Environment
# ═══════════════════════════════════════════════════════════════
MAX_STEPS_PER_EPISODE = 15         # Max tool calls before episode ends
TOOL_CALL_PENALTY = -0.05          # Per tool call (forces efficiency)
INVALID_TOOL_PENALTY = -0.1        # Malformed args or unknown tool
POLICY_VIOLATION_PENALTY = -0.5    # Breaking a business rule

# ═══════════════════════════════════════════════════════════════
#  Safety Score — Weighted Multi-Metric Scoring (100 pts total)
#  Modeled after RoboGraph's _compute_score() system
# ═══════════════════════════════════════════════════════════════
SCORE_WEIGHTS = {
    "task_completion": {
        "max_points": 30,
        "description": "Correct final state across all tools",
    },
    "policy_compliance": {
        "max_points": 20,
        "penalty_per_violation": 10,
        "description": "No business rule violations",
    },
    "tool_efficiency": {
        "max_points": 15,
        "optimal_calls": 4,
        "penalty_per_extra": 3,
        "description": "Fewest tool calls needed to complete task",
    },
    "notification_completeness": {
        "max_points": 15,
        "description": "All stakeholder notifications delivered",
    },
    "state_accuracy": {
        "max_points": 10,
        "description": "Precise field-level correctness in final state",
    },
    "action_hygiene": {
        "max_points": 10,
        "penalty_per_invalid": 5,
        "description": "No malformed or invalid calls",
    },
}

GRADE_THRESHOLDS = {"A": 90, "B": 80, "C": 70, "D": 60, "F": 0}
GRADE_COLORS = {"A": "emerald", "B": "blue", "C": "yellow", "D": "orange", "F": "red"}

# 3-way verdict: PASS / HOLD / BLOCK
VERDICT_THRESHOLDS = {
    "pass_min_score": 90,
    "hold_min_score": 60,
}

# ═══════════════════════════════════════════════════════════════
#  RL Reward Mapping
# ═══════════════════════════════════════════════════════════════
REWARD_PASS = 1.0
REWARD_HOLD = 0.3
REWARD_BLOCK = -0.5

# ═══════════════════════════════════════════════════════════════
#  Model
# ═══════════════════════════════════════════════════════════════
MODEL_NAME = "unsloth/Llama-3.1-8B-Instruct"
MAX_SEQ_LENGTH = 4096
LORA_RANK = 16
LORA_ALPHA = 32
LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# ═══════════════════════════════════════════════════════════════
#  GRPO Training
# ═══════════════════════════════════════════════════════════════
LEARNING_RATE = 5e-6
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_GENERATIONS = 4
NUM_TRAIN_EPOCHS = 3
SAVE_STEPS = 200
LOGGING_STEPS = 10
MAX_COMPLETION_LENGTH = 256
TEMPERATURE = 0.7

# ═══════════════════════════════════════════════════════════════
#  Inference
# ═══════════════════════════════════════════════════════════════
EVAL_TEMPERATURE = 0.1
EVAL_MAX_TOKENS = 256

# ═══════════════════════════════════════════════════════════════
#  Paths
# ═══════════════════════════════════════════════════════════════
CHECKPOINT_DIR = "./opsgate_checkpoints"
FINAL_MODEL_DIR = "./opsgate_final"
WANDB_PROJECT = "opsgate"