""" benchmark_field_tests.py — Suite 7, 8, 9 Tests designed specifically for what a Hebbian attractor field can do that pure cosine-similarity RAG fundamentally cannot. Run standalone: python benchmark_field_tests.py Or append to benchmark.py by importing and calling run_field_suites(). The three suites: Suite 7 — ASSOCIATIVE INFERENCE Teach A→B and B→C in separate teach() calls. Ask for A→C. No explicit link was ever stated. Field: co-activation of regions encoding A and C should create implicit link. RAG: A and C are separate embeddings with no stored connector. Expected: Hybrid > RAG on transitive / relational queries. Suite 8 — PATTERN EXTRACTION Teach 6-8 examples of an unstated rule (no rule is ever named). Ask model to infer the rule or apply it to a new case. Field: repeated co-activation of same regions strengthens attractor. RAG: returns 5 nearest examples, no pattern synthesis. Expected: Hybrid > RAG on novel-instance questions. Suite 9 — DEGRADED CUE COMPLETION Teach a fact with exact phrasing. Query with a paraphrased / partial / typo'd version. Field: attractor basin should pull in near-neighbor activations. RAG: cosine sim drops if phrasing diverges enough. Expected: Hybrid >= RAG; RAG degrades on severe paraphrase. Suite 10 — CONTRADICTION / UPDATE Teach fact F1. Then teach a contradicting update F2. Query for the current value. Correct answer = F2. Field: protected regions for F2 should dominate; F1 decays. RAG: both F1 and F2 are pinned; retrieval returns both, confusing LLM. Expected: Hybrid > RAG on update/contradiction. """ import os, sys, time, statistics, traceback from dataclasses import dataclass import torch os.environ["TRANSFORMERS_DISABLE_FLASH_ATTN"] = "1" try: from app import HybridLLM, RAGBaseline, ContextBaseline except ImportError as e: print(f"[error] {e}"); sys.exit(1) if not hasattr(HybridLLM, "reset_world"): def _rw(self): self.world.S.zero_(); self.world.strength.zero_() self.world.step_count = 0; self.world.memories = [""] * self.world.n if hasattr(self.world,"thresholds"): self.world.thresholds.fill_(0.5) if hasattr(self.world,"protected"): self.world.protected.fill_(False) if hasattr(self,"episodes"): self.episodes.clear() self.call_count = 0 HybridLLM.reset_world = _rw # ─── helpers ────────────────────────────────────────────────────────────────── def kw(response, required, any_of=None): r = response.lower() hits = sum(1 for k in required if k.lower() in r) score = hits / len(required) if required else 0.0 if any_of and any(k.lower() in r for k in any_of): score = min(1.0, score + 0.2) return round(score, 4) def extract(text): m = "assistant\n" i = text.rfind(m) return text[i+len(m):].strip() if i != -1 else text.strip() def run(model, text, max_tokens=120): t0 = time.perf_counter() try: raw = model.generate(text, max_new_tokens=max_tokens) except Exception as e: return f"[ERR:{e}]", time.perf_counter()-t0 return extract(raw), round(time.perf_counter()-t0, 3) def bar(s, w=14): f = int(s*w) return "█"*f + "░"*(w-f) TRIALS = 3 # ============================================================================= # SUITE 7 — ASSOCIATIVE INFERENCE # Teach A, teach B (no explicit A→B link), ask for A∧B synthesis. # Harder than Suite 4: the two facts share NO keyword overlap at all. # ============================================================================= ASSOC_TESTS = [ { "teach_a": "Dr. Elena Vasquez leads the project HELIOS.", "teach_b": "Project HELIOS is classified at clearance level CRIMSON.", "fillers": [ "What is REST?", "Explain DNS.", "What is a semaphore?", ], "question": "What clearance level does Dr. Elena Vasquez work at?", "required": ["crimson"], "any_of": ["elena", "vasquez", "helios"], "note": "Person → project → clearance (2-hop)", }, { "teach_a": "The database cluster is named ORION-DB.", "teach_b": "ORION-DB uses a replication lag budget of 50 milliseconds.", "teach_c": "The SLA for read operations requires responses under 20ms.", "fillers": [ "What is a bloom filter?", "Explain eventual consistency.", ], "question": "Given the database replication lag and the read SLA, is there a potential violation? Explain.", "required": ["50", "20"], "any_of": ["violation", "lag", "exceed", "budget", "replication"], "note": "DB lag + SLA → violation inference (2-hop)", }, { "teach_a": "Engineer Riku Tanaka is responsible for the payment gateway service.", "teach_b": "The payment gateway service experienced a P1 incident on 2025-03-14.", "teach_c": "P1 incidents require a post-mortem within 48 hours.", "fillers": [ "What is CAP theorem?", "What is a saga pattern?", "Explain write-ahead logging.", "What is a distributed hash table?", ], "question": "Who is responsible for writing the post-mortem for the March 14 incident, and what is the deadline constraint?", "required": ["riku", "48"], "any_of": ["tanaka", "payment", "post-mortem", "p1"], "note": "Person → service → incident → SLA (3-hop)", }, { "teach_a": "The config key MAX_CONNECTIONS is set to 100.", "teach_b": "Each WebSocket connection consumes 2 config units.", "fillers": [ "What is backpressure?", "Describe circuit breakers.", "What is a lease?", ], "question": "How many simultaneous WebSocket connections can the system support given the config?", "required": ["50"], "any_of": ["100", "max_connections", "websocket", "divide", "half"], "note": "Config value + unit cost → derived capacity (arithmetic)", }, ] def run_suite7(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 7 — ASSOCIATIVE INFERENCE") print(" Two or three facts taught separately; question requires combining them.") print(" RAG returns individual facts; field may bind co-activated regions.") print("═"*64) suite = {n: [] for n in models} for test in ASSOC_TESTS: print(f"\n [{test['note']}]") facts = [test["teach_a"], test["teach_b"]] + \ ([test["teach_c"]] if "teach_c" in test else []) fillers = test["fillers"] for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() for f in facts: model.teach(f) for fl in fillers: run(model, fl, max_tokens=30) ans, _ = run(model, test["question"], max_tokens=150) scores.append(kw(ans, test["required"], test.get("any_of",[]))) m = round(statistics.mean(scores), 4) s = round(statistics.stdev(scores), 4) if len(scores)>1 else 0.0 suite[name].append(m) snip = scores # show trial pattern print(f" [{name:16s}] {bar(m)} {m:.3f}±{s:.3f} trials={['✓' if x>=0.5 else '✗' for x in scores]}") print("\n Suite 7 summary:") for name, sc in suite.items(): avg = statistics.mean(sc); std = statistics.stdev(sc) if len(sc)>1 else 0.0 print(f" [{name:16s}] mean={avg:.4f} ± {std:.4f}") return suite # ============================================================================= # SUITE 8 — PATTERN EXTRACTION # Teach N examples of an implicit rule. Never state the rule. # Ask model to apply the rule to a new case. # ============================================================================= PATTERN_TESTS = [ { "examples": [ "Service auth-service: tier=critical, on-call rotation=24/7.", "Service payment-service: tier=critical, on-call rotation=24/7.", "Service logging-service: tier=standard, on-call rotation=business-hours.", "Service analytics-service: tier=standard, on-call rotation=business-hours.", "Service cache-service: tier=critical, on-call rotation=24/7.", "Service monitoring-service: tier=standard, on-call rotation=business-hours.", ], "question": "A new service called notification-service has tier=critical. What on-call rotation should it have?", "required": ["24/7"], "any_of": ["critical", "24", "always"], "note": "tier→rotation pattern (6 examples, never stated)", }, { "examples": [ "Error code E-401: severity=warn, auto-retry=yes, max-retries=3.", "Error code E-403: severity=warn, auto-retry=yes, max-retries=3.", "Error code E-500: severity=critical, auto-retry=no, escalate=true.", "Error code E-503: severity=critical, auto-retry=no, escalate=true.", "Error code E-404: severity=warn, auto-retry=yes, max-retries=3.", ], "question": "A new error E-502 occurs with severity=critical. Based on the pattern, should it auto-retry?", "required": ["no"], "any_of": ["escalate", "critical", "should not", "shouldn't"], "note": "severity→retry pattern (5 examples)", }, { "examples": [ "Region eu-west-1: data-residency=EU, encryption=AES-256, audit-log=enabled.", "Region eu-central-1: data-residency=EU, encryption=AES-256, audit-log=enabled.", "Region us-east-1: data-residency=US, encryption=AES-128, audit-log=disabled.", "Region us-west-2: data-residency=US, encryption=AES-128, audit-log=disabled.", "Region ap-southeast-1: data-residency=APAC, encryption=AES-128, audit-log=disabled.", ], "question": "A new region eu-north-1 is added with data-residency=EU. What encryption and audit-log settings should it have?", "required": ["aes-256", "enabled"], "any_of": ["eu", "256", "audit"], "note": "region→config pattern (5 examples, GDPR-style)", }, ] def run_suite8(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 8 — PATTERN EXTRACTION") print(" Teach N examples of an implicit rule; ask model to apply it.") print(" Field builds attractor from repeated co-activation; RAG has N separate facts.") print("═"*64) suite = {n: [] for n in models} for test in PATTERN_TESTS: print(f"\n [{test['note']}] ({len(test['examples'])} examples)") for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() for ex in test["examples"]: model.teach(ex) ans, _ = run(model, test["question"], max_tokens=100) scores.append(kw(ans, test["required"], test.get("any_of",[]))) m = round(statistics.mean(scores), 4) s = round(statistics.stdev(scores), 4) if len(scores)>1 else 0.0 suite[name].append(m) print(f" [{name:16s}] {bar(m)} {m:.3f}±{s:.3f} trials={['✓' if x>=0.5 else '✗' for x in scores]}") print("\n Suite 8 summary:") for name, sc in suite.items(): avg = statistics.mean(sc); std = statistics.stdev(sc) if len(sc)>1 else 0.0 print(f" [{name:16s}] mean={avg:.4f} ± {std:.4f}") return suite # ============================================================================= # SUITE 9 — DEGRADED CUE COMPLETION # Teach an exact fact; query with paraphrase / partial / wrong-word cue. # Attractor dynamics should complete from degraded input; # RAG cosine sim drops when phrasing diverges. # ============================================================================= DEGRADED_TESTS = [ { "teach": "The master encryption key ID is ENC-KEY-2025-ALPHA-7742.", "query_exact": "What is the master encryption key ID?", "query_degraded": "What's the key identifier for the main encryption credential?", "required": ["enc-key-2025-alpha-7742"], "note": "Paraphrase: 'master/identifier/main encryption credential'", }, { "teach": "The primary on-call contact for the payments cluster is engineer Sofia Berglund.", "query_exact": "Who is the on-call contact for the payments cluster?", "query_degraded": "If the payment system goes down, who do we call first?", "required": ["sofia", "berglund"], "note": "Semantic paraphrase: 'system goes down' vs 'on-call contact'", }, { "teach": "The NEXUS-7 project's deployment window is every Tuesday between 22:00 and 02:00 UTC.", "query_exact": "When is the NEXUS-7 deployment window?", "query_degraded": "What time can we push changes to the NEXUS project on a weekly basis?", "required": ["tuesday", "22"], "note": "Partial: 'NEXUS-7' → 'NEXUS', time paraphrased", }, { "teach": "The fallback database server hostname is db-fallback-prod-03.internal.", "query_exact": "What is the fallback database server hostname?", "query_degraded": "Our main DB is down. Where is the backup server?", "required": ["db-fallback-prod-03"], "note": "Semantic drift: 'fallback/backup', no hostname in query", }, ] def run_suite9(models, trials=TRIALS): """ For each test: run BOTH exact and degraded query. Score both. Report degradation delta = exact_score - degraded_score. A field-based model should degrade less than pure cosine RAG. """ print("\n" + "═"*64) print(" SUITE 9 — DEGRADED CUE COMPLETION") print(" Same fact, exact vs paraphrased query.") print(" Delta = how much score drops on degraded cue. Lower delta = better.") print("═"*64) exact_suite = {n: [] for n in models} degraded_suite = {n: [] for n in models} delta_suite = {n: [] for n in models} for test in DEGRADED_TESTS: print(f"\n [{test['note']}]") print(f" Exact: '{test['query_exact'][:60]}'") print(f" Degraded: '{test['query_degraded'][:60]}'") for name, model in models.items(): exact_scores, degraded_scores = [], [] for _ in range(trials): # Exact model.reset_world() model.teach(test["teach"]) ans, _ = run(model, test["query_exact"], max_tokens=80) exact_scores.append(kw(ans, test["required"])) # Degraded model.reset_world() model.teach(test["teach"]) ans, _ = run(model, test["query_degraded"], max_tokens=80) degraded_scores.append(kw(ans, test["required"])) em = round(statistics.mean(exact_scores), 4) dm = round(statistics.mean(degraded_scores), 4) delta = round(em - dm, 4) exact_suite[name].append(em) degraded_suite[name].append(dm) delta_suite[name].append(delta) print(f" [{name:16s}] exact={em:.3f} degraded={dm:.3f} " f"Δ={delta:+.3f} {'← GOOD' if delta <= 0.1 else '← degrades'}") print("\n Suite 9 summary (mean degradation delta — lower is better):") for name in models: avg_delta = statistics.mean(delta_suite[name]) avg_exact = statistics.mean(exact_suite[name]) avg_deg = statistics.mean(degraded_suite[name]) print(f" [{name:16s}] exact={avg_exact:.4f} degraded={avg_deg:.4f} " f"mean_delta={avg_delta:+.4f}") return {"exact": exact_suite, "degraded": degraded_suite, "delta": delta_suite} # ============================================================================= # SUITE 10 — CONTRADICTION / KNOWLEDGE UPDATE # Teach F1 (old value), then teach F2 (new value, contradicts F1). # Correct answer = F2. Tests whether model updates or gets confused. # ============================================================================= UPDATE_TESTS = [ { "old_fact": "Server alpha's IP address is 10.0.0.42.", "new_fact": "Server alpha has been migrated. Its new IP address is 10.0.1.99.", "fillers": ["What is DNS?", "Explain load balancing.", "What is BGP?"], "question": "What is the current IP address of server alpha?", "required": ["10.0.1.99"], "anti": ["10.0.0.42"], # old value should NOT appear as answer "note": "IP update", }, { "old_fact": "The rate limit for the public API is 100 requests per minute.", "new_fact": "The rate limit for the public API has been updated to 500 requests per minute.", "fillers": ["What is OAuth?", "Explain JWT tokens."], "question": "What is the current rate limit for the public API?", "required": ["500"], "anti": ["100"], "note": "Config value update", }, { "old_fact": "The project lead for NEXUS-7 is Marcus Reyes.", "new_fact": "NEXUS-7 has a new project lead: Dr. Amara Singh replaced Marcus Reyes.", "fillers": ["What is agile?", "Describe sprint planning.", "What is a retrospective?"], "question": "Who is the current project lead for NEXUS-7?", "required": ["amara", "singh"], "anti": ["marcus reyes"], "note": "Personnel update (old and new names both in context)", }, ] def run_suite10(models, trials=TRIALS): print("\n" + "═"*64) print(" SUITE 10 — CONTRADICTION / KNOWLEDGE UPDATE") print(" Old fact → new fact. Correct answer = new value.") print(" RAG pins both; LLM may pick old. Field protects newer encoding.") print("═"*64) suite = {n: [] for n in models} for test in UPDATE_TESTS: print(f"\n [{test['note']}]") for name, model in models.items(): scores = [] for _ in range(trials): model.reset_world() model.teach(test["old_fact"]) # Filler between old and new fact for fl in test["fillers"]: run(model, fl, max_tokens=30) model.teach(test["new_fact"]) ans, _ = run(model, test["question"], max_tokens=80) s = kw(ans, test["required"]) # Penalise if old (wrong) value appears in answer if any(bad.lower() in ans.lower() for bad in test.get("anti", [])): s = max(0.0, s - 0.4) scores.append(round(s, 4)) m = round(statistics.mean(scores), 4) st = round(statistics.stdev(scores), 4) if len(scores)>1 else 0.0 suite[name].append(m) print(f" [{name:16s}] {bar(m)} {m:.3f}±{st:.3f} trials={['✓' if x>=0.5 else '✗' for x in scores]}") print("\n Suite 10 summary:") for name, sc in suite.items(): avg = statistics.mean(sc); std = statistics.stdev(sc) if len(sc)>1 else 0.0 print(f" [{name:16s}] mean={avg:.4f} ± {std:.4f}") return suite # ============================================================================= # GRAPH # ============================================================================= def graph_field_suites(results, models, output_dir="./benchmark_plots"): try: import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import numpy as np from pathlib import Path except ImportError: print("[graphs] pip install matplotlib"); return [] Path(output_dir).mkdir(parents=True, exist_ok=True) saved = [] COLORS = {"HybridLLM":"#3B8BD4","RAGBaseline":"#1D9E75","ContextBaseline":"#888780"} model_names = list(models.keys()) suite_keys = ["s7_assoc", "s8_pattern", "s9_exact", "s9_degraded", "s10_update"] suite_labels = ["Associative\nInference", "Pattern\nExtraction", "Degraded Cue\n(exact)", "Degraded Cue\n(paraphrase)", "Knowledge\nUpdate"] all_data = {} if "s7" in results: for n in model_names: all_data.setdefault("s7_assoc", {})[n] = \ statistics.mean(results["s7"].get(n, [0])) if "s8" in results: for n in model_names: all_data.setdefault("s8_pattern", {})[n] = \ statistics.mean(results["s8"].get(n, [0])) if "s9" in results: for k, sk in [("exact","s9_exact"), ("degraded","s9_degraded")]: for n in model_names: all_data.setdefault(sk, {})[n] = \ statistics.mean(results["s9"][k].get(n, [0])) if "s10" in results: for n in model_names: all_data.setdefault("s10_update", {})[n] = \ statistics.mean(results["s10"].get(n, [0])) avail_keys = [k for k in suite_keys if k in all_data] avail_labels = [suite_labels[suite_keys.index(k)] for k in avail_keys] if not avail_keys: return saved fig, ax = plt.subplots(figsize=(11, 5.5)) x = np.arange(len(avail_keys)) n_m = len(model_names) bw = 0.65 / n_m plt.rcParams.update({"axes.spines.top":False,"axes.spines.right":False, "axes.grid":True,"grid.alpha":0.3,"figure.dpi":150}) for i, name in enumerate(model_names): vals = [all_data[k].get(name, 0) for k in avail_keys] offset = (i - n_m/2 + 0.5) * bw bars = ax.bar(x + offset, vals, bw, color=COLORS.get(name,"#555"), label=name, alpha=0.88, zorder=3) for bar, v in zip(bars, vals): if v > 0.05: ax.text(bar.get_x()+bar.get_width()/2, bar.get_height()+0.02, f"{v:.2f}", ha="center", va="bottom", fontsize=8) ax.set_xticks(x); ax.set_xticklabels(avail_labels, fontsize=10) ax.set_ylabel("Mean score (0–1)"); ax.set_ylim(0, 1.3) ax.set_title("Figure 5 — Field-Specific Tests\n" "Tasks designed to expose what Hebbian field adds over pure RAG", fontsize=12, fontweight="bold") ax.legend(framealpha=0.9, fontsize=10) fig.tight_layout() path = f"{output_dir}/fig5_field_tests.png" fig.savefig(path, bbox_inches="tight"); plt.close(fig) saved.append(path) print(f" [Graph] {path}") return saved # ============================================================================= # MAIN # ============================================================================= def run_field_suites(models, trials=TRIALS, graphs_dir="./benchmark_plots"): results = {} results["s7"] = run_suite7(models, trials) results["s8"] = run_suite8(models, trials) results["s9"] = run_suite9(models, trials) results["s10"] = run_suite10(models, trials) # Final comparison table print("\n" + "═"*70) print(" FIELD TEST REPORT — HybridLLM vs RAGBaseline vs ContextBaseline") print("═"*70) model_names = list(models.keys()) col = 18 print(f" {'Suite':<28}" + "".join(f"{n:>{col}}" for n in model_names)) print(" " + "─"*(28 + col*len(model_names))) def row(label, data_dict): r = f" {label:<28}" for n in model_names: sc = data_dict.get(n, []) if sc: avg = statistics.mean(sc) r += f"{avg:>{col}.4f}" else: r += f"{'N/A':>{col}}" print(r) row("Associative inference", results["s7"]) row("Pattern extraction", results["s8"]) row("Degraded cue (exact)", results["s9"]["exact"]) row("Degraded cue (paraphrase)",results["s9"]["degraded"]) row("Knowledge update", results["s10"]) print(" " + "─"*(28 + col*len(model_names))) # Overall field-suite score r = f" {'FIELD OVERALL':<28}" for n in model_names: all_sc = (results["s7"].get(n,[]) + results["s8"].get(n,[]) + results["s9"]["exact"].get(n,[]) + results["s9"]["degraded"].get(n,[]) + results["s10"].get(n,[])) avg = statistics.mean(all_sc) if all_sc else 0.0 r += f"{avg:>{col}.4f}" print(r) print("═"*70) saved = graph_field_suites(results, models, graphs_dir) if saved: print(f"\n Graph: {saved[0]}") return results if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--trials", type=int, default=3) parser.add_argument("--graphs", type=str, default="./benchmark_plots") parser.add_argument("--no-context", action="store_true") args = parser.parse_args() print("\n" + "═"*64) print(" Field-Specific Benchmark Suites 7–10") print(" Tests what Hebbian field uniquely enables vs pure RAG") print("═"*64) import torch as _torch DEVICE = "cuda" if _torch.cuda.is_available() else "cpu" if DEVICE == "cpu": print(" [ERROR] CUDA not available — install GPU torch:") print(" pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 --upgrade") print(" Proceeding on CPU — benchmarks will be SLOW.", flush=True) else: for _i in range(_torch.cuda.device_count()): _props = _torch.cuda.get_device_properties(_i) print(f" [GPU {_i}] {_props.name} ({_props.total_memory // 1024**3} GB VRAM) CUDA {_torch.version.cuda}", flush=True) print(f" Device: {DEVICE}", flush=True) print("\n Loading HybridLLM...") hybrid = HybridLLM() print(f" Model device: {next(hybrid.model.parameters()).device} dtype={next(hybrid.model.parameters()).dtype}") models = {"HybridLLM": hybrid} models["RAGBaseline"] = RAGBaseline(hybrid.tokenizer, hybrid.model) if not args.no_context: models["ContextBaseline"] = ContextBaseline(hybrid.tokenizer, hybrid.model) run_field_suites(models, trials=args.trials, graphs_dir=args.graphs)