"""
benchmark_field_tests.py  — Suites 7, 8, 9, 10
Tests designed specifically for what a Hebbian attractor field can do
that pure cosine-similarity RAG fundamentally cannot.

Run standalone:  python benchmark_field_tests.py
Or append to benchmark.py by importing and calling run_field_suites().

The four suites:

  Suite 7 — ASSOCIATIVE INFERENCE
    Teach A→B and B→C in separate teach() calls.
    Ask for A→C.  No explicit link was ever stated.
    Field: co-activation of regions encoding A and C should create implicit link.
    RAG: A and C are separate embeddings with no stored connector.
    Expected: Hybrid > RAG on transitive / relational queries.

  Suite 8 — PATTERN EXTRACTION
    Teach 5-6 examples of an unstated rule (no rule is ever named).
    Ask model to infer the rule or apply it to a new case.
    Field: repeated co-activation of same regions strengthens attractor.
    RAG: returns 5 nearest examples, no pattern synthesis.
    Expected: Hybrid > RAG on novel-instance questions.

  Suite 9 — DEGRADED CUE COMPLETION
    Teach a fact with exact phrasing.
    Query with a paraphrased / partial / typo'd version.
    Field: attractor basin should pull in near-neighbor activations.
    RAG: cosine sim drops if phrasing diverges enough.
    Expected: Hybrid >= RAG; RAG degrades on severe paraphrase.

  Suite 10 — CONTRADICTION / UPDATE
    Teach fact F1.  Then teach a contradicting update F2.
    Query for the current value.  Correct answer = F2.
    Field: protected regions for F2 should dominate; F1 decays.
    RAG: both F1 and F2 are pinned; retrieval returns both, confusing LLM.
    Expected: Hybrid > RAG on update/contradiction.
"""

import os, sys, time, statistics, traceback
from dataclasses import dataclass

import torch
os.environ["TRANSFORMERS_DISABLE_FLASH_ATTN"] = "1"

try:
    from app import HybridLLM, RAGBaseline, ContextBaseline
except ImportError as e:
    print(f"[error] {e}"); sys.exit(1)

if not hasattr(HybridLLM, "reset_world"):
    def _rw(self):
        """Fallback ``reset_world`` used when HybridLLM does not define one.

        Zeroes the world's ``S`` and ``strength`` tensors, resets the step
        counter and memory slots, restores optional ``thresholds`` /
        ``protected`` buffers when they exist, clears ``episodes`` when
        present, and resets ``call_count``.
        """
        world = self.world
        world.S.zero_()
        world.strength.zero_()
        world.step_count = 0
        world.memories = [""] * world.n
        # These buffers are optional; only touch them when they exist.
        if hasattr(world, "thresholds"):
            world.thresholds.fill_(0.5)
        if hasattr(world, "protected"):
            world.protected.fill_(False)
        if hasattr(self, "episodes"):
            self.episodes.clear()
        self.call_count = 0

    HybridLLM.reset_world = _rw


# ─── helpers ──────────────────────────────────────────────────────────────────

def kw(response, required, any_of=None):
    """Score *response* by case-insensitive keyword (substring) presence.

    The base score is the fraction of ``required`` keywords found in the
    response (0.0 when ``required`` is empty).  If at least one ``any_of``
    keyword is found, a 0.2 bonus is added, capped at 1.0.  The result is
    rounded to 4 decimals.
    """
    haystack = response.lower()

    if required:
        found = [term for term in required if term.lower() in haystack]
        score = len(found) / len(required)
    else:
        score = 0.0

    if any_of:
        for term in any_of:
            if term.lower() in haystack:
                # One bonus at most, regardless of how many terms match.
                score = min(1.0, score + 0.2)
                break

    return round(score, 4)

def extract(text):
    """Return the text following the last ``"assistant\\n"`` marker, stripped.

    When the marker does not occur, the whole input is returned stripped.
    """
    _head, marker, tail = text.rpartition("assistant\n")
    if marker:
        return tail.strip()
    return text.strip()

def run(model, text, max_tokens=120):
    """Generate a reply from *model* for *text* and time the call.

    Args:
        model: object exposing ``generate(text, max_new_tokens=...)``.
        text: prompt passed to the model.
        max_tokens: forwarded as ``max_new_tokens``.

    Returns:
        ``(answer, seconds)`` where ``seconds`` is rounded to 3 decimals.
        On success ``answer`` is the extracted assistant reply; if
        ``generate`` raises, ``answer`` is ``"[ERR:<exception>]"`` so the
        benchmark keeps running (the trial is then scored on that string).
    """
    started = time.perf_counter()
    try:
        raw = model.generate(text, max_new_tokens=max_tokens)
    except Exception as e:
        # Deliberate best-effort: one failing generation must not abort the
        # whole benchmark. Elapsed time is rounded like the success path.
        return f"[ERR:{e}]", round(time.perf_counter() - started, 3)
    return extract(raw), round(time.perf_counter() - started, 3)

def bar(s, w=14):
    """Render score *s* (nominally 0..1) as a text bar exactly *w* cells wide.

    The filled-cell count is clamped to ``[0, w]`` so an out-of-range score
    cannot produce a bar longer than *w*.
    """
    filled = max(0, min(w, int(s * w)))
    return "█" * filled + "░" * (w - filled)

# Default number of repeated trials per (test, model) pair; used as the default
# `trials` argument of the suite runners in this file.
TRIALS = 3


# =============================================================================
#  SUITE 7 — ASSOCIATIVE INFERENCE
#  Teach two or three facts in separate teach() calls (their combined conclusion
#  is never stated), then ask a question that requires chaining them.
#  Harder than Suite 4: the required answer keywords never appear in the question.
# =============================================================================

# Test cases consumed by run_suite7. Keys per entry:
#   teach_a / teach_b  facts taught in separate teach() calls
#   teach_c            optional third fact
#   fillers            unrelated prompts run between teaching and the question
#   question           the query whose answer requires combining the facts
#   required           keywords scored as a fraction by kw()
#   any_of             keywords that earn the kw() bonus if any one appears
#   note               label printed in the report
ASSOC_TESTS = [
    {
        "teach_a": "Dr. Elena Vasquez leads the project HELIOS.",
        "teach_b": "Project HELIOS is classified at clearance level CRIMSON.",
        "fillers": [
            "What is REST?",
            "Explain DNS.",
            "What is a semaphore?",
        ],
        "question": "What clearance level does Dr. Elena Vasquez work at?",
        "required": ["crimson"],
        "any_of":   ["elena", "vasquez", "helios"],
        "note": "Person → project → clearance (2-hop)",
    },
    {
        "teach_a": "The database cluster is named ORION-DB.",
        "teach_b": "ORION-DB uses a replication lag budget of 50 milliseconds.",
        "teach_c": "The SLA for read operations requires responses under 20ms.",
        "fillers": [
            "What is a bloom filter?",
            "Explain eventual consistency.",
        ],
        "question": "Given the database replication lag and the read SLA, is there a potential violation? Explain.",
        "required": ["50", "20"],
        "any_of":   ["violation", "lag", "exceed", "budget", "replication"],
        "note": "DB lag + SLA → violation inference (2-hop)",
    },
    {
        "teach_a": "Engineer Riku Tanaka is responsible for the payment gateway service.",
        "teach_b": "The payment gateway service experienced a P1 incident on 2025-03-14.",
        "teach_c": "P1 incidents require a post-mortem within 48 hours.",
        "fillers": [
            "What is CAP theorem?",
            "What is a saga pattern?",
            "Explain write-ahead logging.",
            "What is a distributed hash table?",
        ],
        "question": "Who is responsible for writing the post-mortem for the March 14 incident, and what is the deadline constraint?",
        "required": ["riku", "48"],
        "any_of":   ["tanaka", "payment", "post-mortem", "p1"],
        "note": "Person → service → incident → SLA (3-hop)",
    },
    {
        "teach_a": "The config key MAX_CONNECTIONS is set to 100.",
        "teach_b": "Each WebSocket connection consumes 2 config units.",
        "fillers": [
            "What is backpressure?",
            "Describe circuit breakers.",
            "What is a lease?",
        ],
        "question": "How many simultaneous WebSocket connections can the system support given the config?",
        "required": ["50"],
        "any_of":   ["100", "max_connections", "websocket", "divide", "half"],
        "note": "Config value + unit cost → derived capacity (arithmetic)",
    },
]


def run_suite7(models, trials=TRIALS):
    """Run Suite 7 (associative inference) for every model.

    For each entry of ``ASSOC_TESTS`` and each model, the model is reset,
    taught the test's facts (``teach_a``, ``teach_b`` and optional
    ``teach_c``), fed the filler prompts, then asked the question.  The
    answer is scored with ``kw``; this is repeated ``trials`` times.

    Args:
        models: mapping of display name -> model exposing ``reset_world``,
            ``teach`` and ``generate``.
        trials: number of repetitions per (test, model) pair.

    Returns:
        dict mapping model name -> list of per-test mean scores, in
        ``ASSOC_TESTS`` order.
    """
    print("\n" + "═"*64)
    print("  SUITE 7 — ASSOCIATIVE INFERENCE")
    print("  Two or three facts taught separately; question requires combining them.")
    print("  RAG returns individual facts; field may bind co-activated regions.")
    print("═"*64)

    suite = {n: [] for n in models}

    for test in ASSOC_TESTS:
        print(f"\n  [{test['note']}]")
        facts = [test["teach_a"], test["teach_b"]]
        if "teach_c" in test:
            facts.append(test["teach_c"])
        fillers = test["fillers"]

        for name, model in models.items():
            scores = []
            for _ in range(trials):
                model.reset_world()
                for fact in facts:
                    model.teach(fact)
                # Unrelated prompts between teaching and the question.
                for filler in fillers:
                    run(model, filler, max_tokens=30)
                ans, _ = run(model, test["question"], max_tokens=150)
                scores.append(kw(ans, test["required"], test.get("any_of", [])))

            m = round(statistics.mean(scores), 4)
            # stdev needs at least two samples.
            s = round(statistics.stdev(scores), 4) if len(scores) > 1 else 0.0
            suite[name].append(m)
            print(f"    [{name:16s}] {bar(m)} {m:.3f}±{s:.3f}  trials={['✓' if x>=0.5 else '✗' for x in scores]}")

    print("\n  Suite 7 summary:")
    for name, sc in suite.items():
        avg = statistics.mean(sc)
        std = statistics.stdev(sc) if len(sc) > 1 else 0.0
        print(f"    [{name:16s}] mean={avg:.4f} ± {std:.4f}")
    return suite


# =============================================================================
#  SUITE 8 — PATTERN EXTRACTION
#  Teach N examples of an implicit rule. Never state the rule.
#  Ask model to apply the rule to a new case.
# =============================================================================

# Test cases consumed by run_suite8. Keys per entry:
#   examples   statements taught one per teach() call
#   question   asks the model to apply the pattern to a new case
#   required   keywords scored as a fraction by kw()
#   any_of     keywords that earn the kw() bonus if any one appears
#   note       label printed in the report
PATTERN_TESTS = [
    {
        "examples": [
            "Service auth-service: tier=critical, on-call rotation=24/7.",
            "Service payment-service: tier=critical, on-call rotation=24/7.",
            "Service logging-service: tier=standard, on-call rotation=business-hours.",
            "Service analytics-service: tier=standard, on-call rotation=business-hours.",
            "Service cache-service: tier=critical, on-call rotation=24/7.",
            "Service monitoring-service: tier=standard, on-call rotation=business-hours.",
        ],
        "question": "A new service called notification-service has tier=critical. What on-call rotation should it have?",
        "required": ["24/7"],
        "any_of":   ["critical", "24", "always"],
        "note": "tier→rotation pattern (6 examples, never stated)",
    },
    {
        "examples": [
            "Error code E-401: severity=warn, auto-retry=yes, max-retries=3.",
            "Error code E-403: severity=warn, auto-retry=yes, max-retries=3.",
            "Error code E-500: severity=critical, auto-retry=no, escalate=true.",
            "Error code E-503: severity=critical, auto-retry=no, escalate=true.",
            "Error code E-404: severity=warn, auto-retry=yes, max-retries=3.",
        ],
        "question": "A new error E-502 occurs with severity=critical. Based on the pattern, should it auto-retry?",
        "required": ["no"],
        "any_of":   ["escalate", "critical", "should not", "shouldn't"],
        "note": "severity→retry pattern (5 examples)",
    },
    {
        "examples": [
            "Region eu-west-1: data-residency=EU, encryption=AES-256, audit-log=enabled.",
            "Region eu-central-1: data-residency=EU, encryption=AES-256, audit-log=enabled.",
            "Region us-east-1: data-residency=US, encryption=AES-128, audit-log=disabled.",
            "Region us-west-2: data-residency=US, encryption=AES-128, audit-log=disabled.",
            "Region ap-southeast-1: data-residency=APAC, encryption=AES-128, audit-log=disabled.",
        ],
        "question": "A new region eu-north-1 is added with data-residency=EU. What encryption and audit-log settings should it have?",
        "required": ["aes-256", "enabled"],
        "any_of":   ["eu", "256", "audit"],
        "note": "region→config pattern (5 examples, GDPR-style)",
    },
]


def run_suite8(models, trials=TRIALS):
    """Run Suite 8 (pattern extraction) for every model.

    Each ``PATTERN_TESTS`` entry is taught example-by-example after a world
    reset, then the question is asked and the answer scored with ``kw``.
    Returns a dict mapping model name -> list of per-test mean scores.
    """
    rule = "═" * 64
    print("\n" + rule)
    print("  SUITE 8 — PATTERN EXTRACTION")
    print("  Teach N examples of an implicit rule; ask model to apply it.")
    print("  Field builds attractor from repeated co-activation; RAG has N separate facts.")
    print(rule)

    per_model = {model_name: [] for model_name in models}

    for case in PATTERN_TESTS:
        examples = case["examples"]
        print(f"\n  [{case['note']}]  ({len(examples)} examples)")

        for model_name, model in models.items():
            trial_scores = []
            for _trial in range(trials):
                model.reset_world()
                for example in examples:
                    model.teach(example)
                reply, _elapsed = run(model, case["question"], max_tokens=100)
                trial_scores.append(
                    kw(reply, case["required"], case.get("any_of", []))
                )

            mean_score = round(statistics.mean(trial_scores), 4)
            # A sample standard deviation needs at least two trials.
            if len(trial_scores) > 1:
                spread = round(statistics.stdev(trial_scores), 4)
            else:
                spread = 0.0
            per_model[model_name].append(mean_score)

            marks = ['✓' if x >= 0.5 else '✗' for x in trial_scores]
            print(f"    [{model_name:16s}] {bar(mean_score)} {mean_score:.3f}±{spread:.3f}  trials={marks}")

    print("\n  Suite 8 summary:")
    for model_name, means in per_model.items():
        overall = statistics.mean(means)
        deviation = statistics.stdev(means) if len(means) > 1 else 0.0
        print(f"    [{model_name:16s}] mean={overall:.4f} ± {deviation:.4f}")
    return per_model


# =============================================================================
#  SUITE 9 — DEGRADED CUE COMPLETION
#  Teach an exact fact; query with paraphrase / partial / wrong-word cue.
#  Attractor dynamics should complete from degraded input;
#  RAG cosine sim drops when phrasing diverges.
# =============================================================================

# Test cases consumed by run_suite9. Keys per entry:
#   teach           the fact taught before each query
#   query_exact     question phrased close to the taught fact
#   query_degraded  paraphrased / partial version of the same question
#   required        keywords scored as a fraction by kw()
#   note            label printed in the report
DEGRADED_TESTS = [
    {
        "teach":    "The master encryption key ID is ENC-KEY-2025-ALPHA-7742.",
        "query_exact":    "What is the master encryption key ID?",
        "query_degraded": "What's the key identifier for the main encryption credential?",
        "required": ["enc-key-2025-alpha-7742"],
        "note": "Paraphrase: 'master/identifier/main encryption credential'",
    },
    {
        "teach":    "The primary on-call contact for the payments cluster is engineer Sofia Berglund.",
        "query_exact":    "Who is the on-call contact for the payments cluster?",
        "query_degraded": "If the payment system goes down, who do we call first?",
        "required": ["sofia", "berglund"],
        "note": "Semantic paraphrase: 'system goes down' vs 'on-call contact'",
    },
    {
        "teach":    "The NEXUS-7 project's deployment window is every Tuesday between 22:00 and 02:00 UTC.",
        "query_exact":    "When is the NEXUS-7 deployment window?",
        "query_degraded": "What time can we push changes to the NEXUS project on a weekly basis?",
        "required": ["tuesday", "22"],
        "note": "Partial: 'NEXUS-7' → 'NEXUS', time paraphrased",
    },
    {
        "teach":    "The fallback database server hostname is db-fallback-prod-03.internal.",
        "query_exact":    "What is the fallback database server hostname?",
        "query_degraded": "Our main DB is down. Where is the backup server?",
        "required": ["db-fallback-prod-03"],
        "note": "Semantic drift: 'fallback/backup', no hostname in query",
    },
]


def run_suite9(models, trials=TRIALS):
    """
    Run Suite 9 (degraded cue completion) for every model.

    Every test is asked twice per trial, once with the exact query and once
    with the degraded query, each after a fresh reset + teach.  Both are
    scored, and the degradation delta (exact mean - degraded mean) is
    reported; a smaller delta means the model degrades less.

    Returns a dict with keys "exact", "degraded" and "delta", each mapping
    model name -> list of per-test values.
    """
    rule = "═" * 64
    print("\n" + rule)
    print("  SUITE 9 — DEGRADED CUE COMPLETION")
    print("  Same fact, exact vs paraphrased query.")
    print("  Delta = how much score drops on degraded cue. Lower delta = better.")
    print(rule)

    exact_by_model = {model_name: [] for model_name in models}
    degraded_by_model = {model_name: [] for model_name in models}
    delta_by_model = {model_name: [] for model_name in models}

    for case in DEGRADED_TESTS:
        print(f"\n  [{case['note']}]")
        print(f"    Exact:    '{case['query_exact'][:60]}'")
        print(f"    Degraded: '{case['query_degraded'][:60]}'")

        for model_name, model in models.items():
            exact_scores = []
            degraded_scores = []
            # Exact first, then degraded, each on a freshly reset world.
            passes = (
                ("query_exact", exact_scores),
                ("query_degraded", degraded_scores),
            )

            for _trial in range(trials):
                for query_key, bucket in passes:
                    model.reset_world()
                    model.teach(case["teach"])
                    reply, _elapsed = run(model, case[query_key], max_tokens=80)
                    bucket.append(kw(reply, case["required"]))

            exact_mean = round(statistics.mean(exact_scores), 4)
            degraded_mean = round(statistics.mean(degraded_scores), 4)
            drop = round(exact_mean - degraded_mean, 4)

            exact_by_model[model_name].append(exact_mean)
            degraded_by_model[model_name].append(degraded_mean)
            delta_by_model[model_name].append(drop)

            # NOTE(review): a model scoring 0 on both queries has drop == 0
            # and is therefore also labelled GOOD.
            if drop <= 0.1:
                verdict = '← GOOD'
            else:
                verdict = '← degrades'
            print(f"    [{model_name:16s}] exact={exact_mean:.3f} degraded={degraded_mean:.3f} "
                  f"Δ={drop:+.3f} {verdict}")

    print("\n  Suite 9 summary (mean degradation delta — lower is better):")
    for model_name in models:
        mean_drop = statistics.mean(delta_by_model[model_name])
        mean_exact = statistics.mean(exact_by_model[model_name])
        mean_degraded = statistics.mean(degraded_by_model[model_name])
        print(f"    [{model_name:16s}] exact={mean_exact:.4f}  degraded={mean_degraded:.4f}  "
              f"mean_delta={mean_drop:+.4f}")

    return {
        "exact": exact_by_model,
        "degraded": degraded_by_model,
        "delta": delta_by_model,
    }


# =============================================================================
#  SUITE 10 — CONTRADICTION / KNOWLEDGE UPDATE
#  Teach F1 (old value), then teach F2 (new value, contradicts F1).
#  Correct answer = F2. Tests whether model updates or gets confused.
# =============================================================================

# Test cases consumed by run_suite10. Keys per entry:
#   old_fact   the value taught first
#   new_fact   the contradicting update taught after the fillers
#   fillers    unrelated prompts run between old_fact and new_fact
#   question   asks for the current value
#   required   keywords scored as a fraction by kw()
#   anti       stale values; run_suite10 subtracts 0.4 if any appears in the answer
#   note       label printed in the report
UPDATE_TESTS = [
    {
        "old_fact":  "Server alpha's IP address is 10.0.0.42.",
        "new_fact":  "Server alpha has been migrated. Its new IP address is 10.0.1.99.",
        "fillers":   ["What is DNS?", "Explain load balancing.", "What is BGP?"],
        "question":  "What is the current IP address of server alpha?",
        "required":  ["10.0.1.99"],
        "anti":      ["10.0.0.42"],       # old value should NOT appear as answer
        "note": "IP update",
    },
    {
        "old_fact":  "The rate limit for the public API is 100 requests per minute.",
        "new_fact":  "The rate limit for the public API has been updated to 500 requests per minute.",
        "fillers":   ["What is OAuth?", "Explain JWT tokens."],
        "question":  "What is the current rate limit for the public API?",
        "required":  ["500"],
        "anti":      ["100"],
        "note": "Config value update",
    },
    {
        "old_fact":  "The project lead for NEXUS-7 is Marcus Reyes.",
        "new_fact":  "NEXUS-7 has a new project lead: Dr. Amara Singh replaced Marcus Reyes.",
        "fillers":   ["What is agile?", "Describe sprint planning.", "What is a retrospective?"],
        "question":  "Who is the current project lead for NEXUS-7?",
        "required":  ["amara", "singh"],
        "anti":      ["marcus reyes"],
        "note": "Personnel update (old and new names both in context)",
    },
]


def run_suite10(models, trials=TRIALS):
    """Run Suite 10 (contradiction / knowledge update) for every model.

    Per trial: reset, teach the old fact, run the fillers, teach the new
    fact, then ask for the current value.  The ``kw`` score is reduced by
    0.4 (floored at 0) when any ``anti`` value appears in the answer.
    Returns a dict mapping model name -> list of per-test mean scores.
    """
    rule = "═" * 64
    print("\n" + rule)
    print("  SUITE 10 — CONTRADICTION / KNOWLEDGE UPDATE")
    print("  Old fact → new fact. Correct answer = new value.")
    print("  RAG pins both; LLM may pick old. Field protects newer encoding.")
    print(rule)

    per_model = {model_name: [] for model_name in models}

    for case in UPDATE_TESTS:
        print(f"\n  [{case['note']}]")

        for model_name, model in models.items():
            trial_scores = []
            for _trial in range(trials):
                model.reset_world()
                model.teach(case["old_fact"])
                # Fillers sit between the old fact and its update.
                for filler in case["fillers"]:
                    run(model, filler, max_tokens=30)
                model.teach(case["new_fact"])
                reply, _elapsed = run(model, case["question"], max_tokens=80)

                score = kw(reply, case["required"])
                reply_lower = reply.lower()
                mentions_stale = any(
                    stale.lower() in reply_lower for stale in case.get("anti", [])
                )
                if mentions_stale:
                    # The old (wrong) value showing up costs 0.4.
                    score = max(0.0, score - 0.4)

                trial_scores.append(round(score, 4))

            mean_score = round(statistics.mean(trial_scores), 4)
            if len(trial_scores) > 1:
                spread = round(statistics.stdev(trial_scores), 4)
            else:
                spread = 0.0
            per_model[model_name].append(mean_score)

            marks = ['✓' if x >= 0.5 else '✗' for x in trial_scores]
            print(f"    [{model_name:16s}] {bar(mean_score)} {mean_score:.3f}±{spread:.3f}  trials={marks}")

    print("\n  Suite 10 summary:")
    for model_name, means in per_model.items():
        overall = statistics.mean(means)
        deviation = statistics.stdev(means) if len(means) > 1 else 0.0
        print(f"    [{model_name:16s}] mean={overall:.4f} ± {deviation:.4f}")
    return per_model


# =============================================================================
#  GRAPH
# =============================================================================

def graph_field_suites(results, models, output_dir="./benchmark_plots"):
    """Plot a grouped bar chart of per-suite mean scores for each model.

    Args:
        results: dict with optional keys "s7", "s8", "s10" (each mapping
            model name -> list of scores) and "s9" (a dict whose "exact"
            and "degraded" entries have that same shape).
        models: mapping whose keys are the model names to plot.
        output_dir: directory for the PNG; created if missing.

    Returns:
        A list with the saved image path, or ``[]`` when matplotlib/numpy
        cannot be imported or there is nothing to plot.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        import numpy as np
        from pathlib import Path
    except ImportError:
        print("[graphs] pip install matplotlib"); return []

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    saved = []

    COLORS = {"HybridLLM":"#3B8BD4","RAGBaseline":"#1D9E75","ContextBaseline":"#888780"}
    model_names = list(models.keys())

    suite_keys   = ["s7_assoc", "s8_pattern", "s9_exact", "s9_degraded", "s10_update"]
    suite_labels = ["Associative\nInference", "Pattern\nExtraction",
                    "Degraded Cue\n(exact)", "Degraded Cue\n(paraphrase)", "Knowledge\nUpdate"]

    # (plot key, key in `results`, sub-key inside that entry or None)
    layout = [
        ("s7_assoc",    "s7",  None),
        ("s8_pattern",  "s8",  None),
        ("s9_exact",    "s9",  "exact"),
        ("s9_degraded", "s9",  "degraded"),
        ("s10_update",  "s10", None),
    ]

    all_data = {}
    for plot_key, result_key, sub_key in layout:
        if result_key not in results:
            continue
        for n in model_names:
            per_model = results[result_key]
            if sub_key is not None:
                per_model = per_model[sub_key]
            # Models missing from a suite are plotted as 0.
            all_data.setdefault(plot_key, {})[n] = \
                statistics.mean(per_model.get(n, [0]))

    avail_keys   = [k for k in suite_keys if k in all_data]
    avail_labels = [suite_labels[suite_keys.index(k)] for k in avail_keys]

    if not avail_keys:
        return saved

    # rcParams are read when the figure and axes are created, so they must be
    # set BEFORE plt.subplots(); otherwise the spine/grid/dpi settings below
    # have no effect on this figure.
    plt.rcParams.update({"axes.spines.top":False,"axes.spines.right":False,
                          "axes.grid":True,"grid.alpha":0.3,"figure.dpi":150})

    fig, ax = plt.subplots(figsize=(11, 5.5))
    x = np.arange(len(avail_keys))
    n_m = len(model_names)
    bw  = 0.65 / n_m

    for i, name in enumerate(model_names):
        vals   = [all_data[k].get(name, 0) for k in avail_keys]
        # Centre the group of bars on each tick.
        offset = (i - n_m/2 + 0.5) * bw
        rects = ax.bar(x + offset, vals, bw,
                       color=COLORS.get(name,"#555"), label=name,
                       alpha=0.88, zorder=3)
        # `rect`, not `bar`: avoid shadowing the module-level bar() helper.
        for rect, v in zip(rects, vals):
            if v > 0.05:
                ax.text(rect.get_x()+rect.get_width()/2, rect.get_height()+0.02,
                        f"{v:.2f}", ha="center", va="bottom", fontsize=8)

    ax.set_xticks(x); ax.set_xticklabels(avail_labels, fontsize=10)
    ax.set_ylabel("Mean score  (0–1)"); ax.set_ylim(0, 1.3)
    ax.set_title("Figure 5  —  Field-Specific Tests\n"
                 "Tasks designed to expose what Hebbian field adds over pure RAG",
                 fontsize=12, fontweight="bold")
    ax.legend(framealpha=0.9, fontsize=10)
    fig.tight_layout()

    path = f"{output_dir}/fig5_field_tests.png"
    fig.savefig(path, bbox_inches="tight"); plt.close(fig)
    saved.append(path)
    print(f"  [Graph] {path}")
    return saved


# =============================================================================
#  MAIN
# =============================================================================

def run_field_suites(models, trials=TRIALS, graphs_dir="./benchmark_plots"):
    """Run suites 7-10, print the comparison table and render the graph.

    Args:
        models: mapping of display name -> model object.
        trials: repetitions per (test, model) pair, forwarded to each suite.
        graphs_dir: output directory handed to ``graph_field_suites``.

    Returns:
        dict with keys "s7", "s8", "s9", "s10" holding each suite's result.
    """
    # Dict literals evaluate in order, so the suites still run 7 -> 10.
    results = {
        "s7":  run_suite7(models, trials),
        "s8":  run_suite8(models, trials),
        "s9":  run_suite9(models, trials),
        "s10": run_suite10(models, trials),
    }

    # Final comparison table
    heavy_rule = "═" * 70
    print("\n" + heavy_rule)
    print("  FIELD TEST REPORT — HybridLLM vs RAGBaseline vs ContextBaseline")
    print(heavy_rule)

    model_names = list(models.keys())
    col = 18
    light_rule = "  " + "─" * (28 + col * len(model_names))

    print(f"  {'Suite':<28}" + "".join(f"{n:>{col}}" for n in model_names))
    print(light_rule)

    table = (
        ("Associative inference",     results["s7"]),
        ("Pattern extraction",        results["s8"]),
        ("Degraded cue (exact)",      results["s9"]["exact"]),
        ("Degraded cue (paraphrase)", results["s9"]["degraded"]),
        ("Knowledge update",          results["s10"]),
    )
    for label, per_model in table:
        cells = []
        for n in model_names:
            scores = per_model.get(n, [])
            if scores:
                cells.append(f"{statistics.mean(scores):>{col}.4f}")
            else:
                cells.append(f"{'N/A':>{col}}")
        print(f"  {label:<28}" + "".join(cells))

    print(light_rule)

    # Overall field-suite score: mean over every per-test score of the model
    # (the Suite 9 delta values are not included).
    overall_cells = []
    for n in model_names:
        pooled = (results["s7"].get(n, []) + results["s8"].get(n, []) +
                  results["s9"]["exact"].get(n, []) +
                  results["s9"]["degraded"].get(n, []) + results["s10"].get(n, []))
        overall = statistics.mean(pooled) if pooled else 0.0
        overall_cells.append(f"{overall:>{col}.4f}")
    print(f"  {'FIELD OVERALL':<28}" + "".join(overall_cells))
    print(heavy_rule)

    saved = graph_field_suites(results, models, graphs_dir)
    if saved:
        print(f"\n  Graph: {saved[0]}")

    return results


if __name__ == "__main__":
    # Script entry point: parse CLI flags, report the compute device, build
    # the models that share HybridLLM's tokenizer/model, and run all suites.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--trials", type=int, default=3)
    cli.add_argument("--graphs", type=str, default="./benchmark_plots")
    cli.add_argument("--no-context", action="store_true")
    args = cli.parse_args()

    banner = "═" * 64
    print("\n" + banner)
    print("  Field-Specific Benchmark Suites 7–10")
    print("  Tests what Hebbian field uniquely enables vs pure RAG")
    print(banner)

    if torch.cuda.is_available():
        DEVICE = "cuda"
        for gpu_index in range(torch.cuda.device_count()):
            gpu = torch.cuda.get_device_properties(gpu_index)
            vram_gb = gpu.total_memory // 1024**3
            print(f"  [GPU {gpu_index}] {gpu.name}  ({vram_gb} GB VRAM)  CUDA {torch.version.cuda}", flush=True)
    else:
        DEVICE = "cpu"
        # Not fatal: the run continues on CPU after the warning.
        print("  [ERROR] CUDA not available — install GPU torch:")
        print("          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 --upgrade")
        print("  Proceeding on CPU — benchmarks will be SLOW.", flush=True)
    print(f"  Device: {DEVICE}", flush=True)

    print("\n  Loading HybridLLM...")
    hybrid = HybridLLM()
    first_param = next(hybrid.model.parameters())
    print(f"  Model device: {first_param.device}  dtype={first_param.dtype}")

    # The baselines reuse HybridLLM's tokenizer and model objects.
    models = {
        "HybridLLM": hybrid,
        "RAGBaseline": RAGBaseline(hybrid.tokenizer, hybrid.model),
    }
    if not args.no_context:
        models["ContextBaseline"] = ContextBaseline(hybrid.tokenizer, hybrid.model)

    run_field_suites(models, trials=args.trials, graphs_dir=args.graphs)