Spaces:

WHOAM-EYE
/

network_forensics

Running

App Files Files Community

WHOAM-EYE committed 4 days ago

Commit

c894ea4

verified ·

1 Parent(s): 3d0eba6

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

inference.py +588 -90
server/gradio_ui.py +0 -6

inference.py CHANGED Viewed

@@ -4,6 +4,7 @@ import sys
 import asyncio
 import inspect
 import random
 from pathlib import Path
 from typing import Any
@@ -37,28 +38,46 @@ _ASYNC_LOOP: asyncio.AbstractEventLoop | None = None
 SYSTEM_PROMPT = """You are a senior Network Forensics Analyst. Your goal is to investigate malicious network traffic and achieve a 100% detection score.
 ### SCORING RULES:
-- You MUST identify and `flag_as_suspicious` every malicious packet to increase RECALL.
 - Only grouped packets or flagged packets contribute towards your score.
-- If RECALL is < 0.5, your score will be 0.0. DO NOT stop until you have grouped at least 50% of the traffic.
 ### WORKFLOW:
 1. **Explore**: `inspect_packet` on suspicious samples.
-2. **Correlate**: `group_into_session` with descriptive names.
-3. **Classify**: `tag_pattern` with a valid type (ddos, web_sql_injection, heartbleed, etc.).
-4. **Report**: `submit_report` ONLY when you have covered all visible malicious sessions.
 ### JSON SCHEMA EXAMPLES (Use these exactly):
 - Inspect: {"action_type":"inspect_packet","packet_id":"pkt_0001"}
 - Flag: {"action_type":"flag_as_suspicious","packet_id":"pkt_0001"}
 - Group: {"action_type":"group_into_session","session_name":"DDoS_Burst_2","packet_ids":["pkt_0001","pkt_0002"]}
 - Tag: {"action_type":"tag_pattern","session_name":"DDoS_Burst_2","pattern_type":"ddos"}
-- Report: {"action_type":"submit_report","incident_summary":"Brief summary here.","claimed_entry_point":"pkt_0001"}"""
 HISTORY_WINDOW = 20
 REPEAT_ACTION_LIMIT = 3
 CORRECTION_WINDOW = 5
-UNTAGGED_BACKLOG_LIMIT = 4
 INSPECT_SOFT_RATIO_THRESHOLD = 0.60
 def build_client() -> OpenAI:
@@ -93,7 +112,7 @@ def format_action(action: NetworkForensicsAction) -> str:
 def summarize_observation(obs: Any, agent_state: dict[str, Any]) -> str:
-    """Provide a structured text summary for the LLM to learn from."""
     packets = obs.visible_packets
     revealed = [p for p in packets if p.is_revealed]
     revealed_ids = [p.packet_id for p in revealed]
@@ -104,14 +123,31 @@ def summarize_observation(obs: Any, agent_state: dict[str, Any]) -> str:
     reward_feedback = agent_state.get("last_reward_feedback", "n/a")
     recent_corrections = agent_state.get("recent_corrections", [])[-CORRECTION_WINDOW:]
     strategy_hints = agent_state.get("strategy_hints", [])
     summary = [
         f"Step: {obs.step_number}/{obs.step_number + obs.steps_remaining}",
         f"Current Progress: {obs.current_score_estimate:.2f}",
-        f"Recall Progress: {len(obs.flagged_packet_ids)} flagged / {len(obs.visible_packets)} visible",
         f"Last Step Reward: {last_reward:.2f}" if isinstance(last_reward, (int, float)) else "Last Step Reward: n/a",
         f"Last Reward Feedback: {reward_feedback}",
-        f"ALREADY REVEALED: {', '.join(revealed_ids[-10:])} " + ("..." if len(revealed_ids) > 10 else ""),
         "\n### SESSIONS PENDING TAGGING:",
     ]
@@ -126,13 +162,13 @@ def summarize_observation(obs: Any, agent_state: dict[str, Any]) -> str:
             summary.append(f"- {hint}")
     if untagged_sessions:
-        for s in untagged_sessions:
             summary.append(f"- {s} ({len(sessions[s])} packets)")
     else:
         summary.append("- [No pending sessions]")
     summary.append("\n### REVEALED INDICATORS:")
-    for p in revealed[-8:]: # Show last 8 revealed for context
         payload = (p.full_payload or "")[:150]
         if payload:
             summary.append(f"- {p.packet_id}: {payload}")
@@ -207,30 +243,72 @@ def packet_payload_text(packet: Any) -> str:
 def keyword_to_pattern(payload: str) -> str | None:
     text = payload.lower()
     if "slowloris" in text:
         return "dos_slowloris"
-    if "slowhttptest" in text:
         return "dos_slowhttptest"
-    if "goldeneye" in text:
         return "dos_goldeneye"
     if "hulk" in text:
         return "dos_hulk"
-    if "heartbeat" in text or "tls" in text:
         return "heartbleed"
-    if "xss" in text or "<script>" in text or "<scrip" in text or "/search?q=" in text:
         return "web_xss"
     if (
         "or 1=1" in text
         or "%20or" in text
         or "/items?id=" in text
         or "1=1" in text
         or "sql" in text
     ):
         return "web_sql_injection"
-    if "login" in text or "username=admin" in text:
         return "web_bruteforce"
-    if "flood" in text or "burst" in text:
-        return "ddos"
     return None
@@ -245,9 +323,40 @@ def packet_signature(packet: Any, pattern: str) -> tuple[str, str, int, str]:
     return (packet.src_ip, packet.dst_ip, packet.dst_port, pattern)
 def session_candidates(obs: Any) -> list[tuple[tuple[str, str, int, str], list[Any]]]:
     grouped: dict[tuple[str, str, int, str], list[Any]] = {}
     attack_source_ports: dict[tuple[str, str, int, str], set[int]] = {}
     for packet in obs.visible_packets:
         pattern = keyword_to_pattern(packet_payload_text(packet))
         if pattern:
@@ -255,6 +364,7 @@ def session_candidates(obs: Any) -> list[tuple[tuple[str, str, int, str], list[A
             grouped.setdefault(key, []).append(packet)
             attack_source_ports.setdefault(key, set()).add(packet.src_port)
     for key, source_ports in attack_source_ports.items():
         src_ip, dst_ip, dst_port, _pattern = key
         for packet in obs.visible_packets:
@@ -267,6 +377,30 @@ def session_candidates(obs: Any) -> list[tuple[tuple[str, str, int, str], list[A
             if is_reverse_response:
                 grouped[key].append(packet)
     candidates = [
         (
             key,
@@ -287,8 +421,19 @@ def required_tag_count(task_name: str, total_sessions: int) -> int:
     return 0
-def select_inspect_packet(obs: Any, inspected_ids: set[str]) -> str | None:
-    unrevealed = [p for p in obs.visible_packets if not p.is_revealed]
     if not unrevealed:
         return None
@@ -314,6 +459,9 @@ def select_inspect_packet(obs: Any, inspected_ids: set[str]) -> str | None:
 def append_action_history(agent_state: dict[str, Any], action: NetworkForensicsAction) -> None:
     history = agent_state.setdefault("previous_actions", [])
     history.append(format_action(action))
     if len(history) > HISTORY_WINDOW:
         del history[:-HISTORY_WINDOW]
@@ -355,25 +503,29 @@ def group_meets_evidence_gate(
         candidate_packets, flagged_ids, visible_by_id
     )
     size = len(candidate_packets)
     if task_name == "easy":
         min_flagged = 1 if size >= 2 else 0
     elif task_name == "medium":
-        min_flagged = 1 if size >= 3 else 0
     else:
-        min_flagged = 2 if size >= 4 else 1
-    if trusted_pattern and size >= 4:
         min_flagged = 1
     if flagged >= min_flagged:
         return True
     # Allow grouping with strong revealed malicious evidence.
-    if malicious_revealed >= min_flagged and revealed >= min(3, size):
         return True
-    # After a pattern has been confirmed by tagging, allow structure-first grouping.
-    if trusted_pattern and size >= 5:
         return True
-    if task_name == "easy" and malicious_revealed >= 1:
         return True
-    if task_name == "medium" and malicious_revealed >= 1 and revealed >= 2:
         return True
     return False
@@ -415,10 +567,10 @@ def derive_strategy_hints(obs: Any, agent_state: dict[str, Any]) -> list[str]:
         )
     inspect_limit = {
-        "easy": 2,
-        "medium": 4,
-        "hard": 6,
-    }.get(agent_state.get("current_task_name", ""), 8)
     if len(previous_actions) >= inspect_limit and inspect_ratio >= INSPECT_SOFT_RATIO_THRESHOLD:
         hints.append(
             "You are over-inspecting. Shift to flagging, grouping, tagging, or report submission unless the next packet is clearly high-value."
@@ -426,10 +578,43 @@ def derive_strategy_hints(obs: Any, agent_state: dict[str, Any]) -> list[str]:
     return hints
 def build_fallback_action(
     task_name: str, obs: Any, agent_state: dict[str, Any]
 ) -> NetworkForensicsAction:
-    """Smart workflow engine: Inspect -> Flag -> Group -> Tag -> Report."""
     inspected_ids = agent_state.setdefault("inspected_ids", set())
     flagged_ids = agent_state.setdefault("flagged_ids", set())
     session_map = agent_state.setdefault("sessions", {})  # key -> session_name
@@ -438,7 +623,7 @@ def build_fallback_action(
     visible_by_id = {p.packet_id: p for p in obs.visible_packets}
     trusted = trusted_patterns(session_map, tagged_sessions)
-    if obs.steps_remaining <= 1:
         summary = _build_report_summary(obs, agent_state)
         return NetworkForensicsAction(
             action_type="submit_report",
@@ -446,21 +631,32 @@ def build_fallback_action(
             claimed_entry_point=claimed_entry,
         )
-    # PHASE 1: Flag revealed malicious packets
     for packet in obs.visible_packets:
         if packet.is_revealed and packet.packet_id not in flagged_ids:
             payload = packet.full_payload or ""
             pattern = keyword_to_pattern(payload)
             if pattern:
-                flagged_ids.add(packet.packet_id)
                 return NetworkForensicsAction(
                     action_type="flag_as_suspicious",
-                    packet_id=packet.packet_id,
                 )
     # PHASE 2: Group flagged packets into sessions with evidence gate and backlog pacing.
     untagged_backlog = max(0, len(session_map) - len(tagged_sessions))
-    if untagged_backlog <= UNTAGGED_BACKLOG_LIMIT:
         candidates = session_candidates(obs)
         for key, items in candidates:
             if key in session_map:
@@ -482,14 +678,54 @@ def build_fallback_action(
                 packet_ids=packet_ids,
             )
-    # PHASE 3: Tag ungrouped sessions.
-    # Easy mode prioritizes coverage/recall and skips tagging to spend turns on recovery.
-    allow_tagging = task_name != "easy"
-    if allow_tagging:
-        for key, session_name in session_map.items():
-            if session_name in tagged_sessions:
-                continue
-            _src_ip, _dst_ip, _dst_port, pattern = key
             tagged_sessions.add(session_name)
             return NetworkForensicsAction(
                 action_type="tag_pattern",
@@ -497,19 +733,52 @@ def build_fallback_action(
                 pattern_type=pattern,
             )
-    # PHASE 4: Identify entry point only when confidence is higher or near episode end.
-    if not claimed_entry and flagged_ids and (
-        len(tagged_sessions) >= 3 or obs.steps_remaining <= 8
-    ):
-        earliest = min(flagged_ids, key=lambda pid: packet_sort_key(pid))
-        agent_state["claimed_entry_point"] = earliest
-        return NetworkForensicsAction(
-            action_type="identify_entry_point",
-            claimed_entry_point=earliest,
-        )
-    # PHASE 5: Inspect more unrevealed packets
-    inspect_id = select_inspect_packet(obs, inspected_ids)
     if inspect_id is not None:
         return NetworkForensicsAction(action_type="inspect_packet", packet_id=inspect_id)
@@ -523,19 +792,50 @@ def build_fallback_action(
 def _build_report_summary(obs: Any, agent_state: dict[str, Any]) -> str:
-    """Generate a meaningful incident summary for the report."""
     flagged = agent_state.get("flagged_ids", set())
     sessions = agent_state.get("sessions", {})
     tagged = agent_state.get("tagged_sessions", set())
-    patterns = set()
-    for key in sessions:
         if len(key) >= 4:
-            patterns.add(key[3])
     return (
-        f"Incident report: Detected {len(flagged)} malicious packets across "
-        f"{len(sessions)} attack sessions. Attack patterns observed: "
-        f"{', '.join(patterns) if patterns else 'unknown'}. "
-        f"{len(tagged)} sessions were classified."
     )
@@ -556,10 +856,10 @@ def should_override_action(
     inspect_count = sum(1 for a in previous_actions if '"inspect_packet"' in a)
     revealed_count = sum(1 for p in obs.visible_packets if p.is_revealed)
     inspect_limit = {
-        "easy": 2,
-        "medium": 4,
-        "hard": 6,
-    }.get(task_name, 8)
     if action.action_type not in {
         "inspect_packet",
@@ -580,13 +880,52 @@ def should_override_action(
             return "Missing packet_id for inspect_packet"
         if action.packet_id not in {p.packet_id for p in obs.visible_packets}:
             return f"Invalid packet_id {action.packet_id} - not in visible_packets"
         revealed_ids = {p.packet_id for p in obs.visible_packets if p.is_revealed}
         if action.packet_id in revealed_ids:
             return f"Packet {action.packet_id} is ALREADY revealed. Choose a HIDDEN packet."
-        if inspect_count >= inspect_limit and (len(sessions) > 0 or len(flagged_ids) > 0 or revealed_count >= 4):
             return (
-                f"Inspection budget reached for {task_name}. Shift to flagging, grouping, tagging, or report submission."
             )
     if action.action_type == "flag_as_suspicious":
         if not action.packet_id:
@@ -606,6 +945,22 @@ def should_override_action(
         }
         if invalid_ids:
             return f"Invalid packet_ids in session: {invalid_ids}"
         untagged_backlog = max(0, len(sessions) - len(tagged_sessions))
         if untagged_backlog > UNTAGGED_BACKLOG_LIMIT:
             return (
@@ -631,18 +986,61 @@ def should_override_action(
     if action.action_type == "submit_report":
         untagged_backlog = max(0, len(sessions) - len(tagged_sessions))
-        if obs.steps_remaining > 2 and obs.current_score_estimate < 0.60:
             return (
                 "Premature report submission. Improve coverage and score estimate before submit_report."
             )
-        if task_name != "easy" and obs.steps_remaining > 2 and untagged_backlog > 0:
             return "Premature report submission. Tag pending sessions before submitting report."
     if action.action_type == "tag_pattern":
         if not action.session_name:
             return "Missing session_name for tag_pattern"
         if not action.pattern_type:
             return "Missing pattern_type for tag_pattern"
         valid_patterns = {
             "ddos", "dos_slowloris", "dos_slowhttptest", "dos_goldeneye", "dos_hulk",
             "heartbleed", "web_sql_injection", "web_xss", "web_bruteforce",
@@ -654,7 +1052,9 @@ def should_override_action(
     if action.action_type == "identify_entry_point":
         if not action.claimed_entry_point:
             return "Missing claimed_entry_point for identify_entry_point"
-        if obs.steps_remaining > 8 and len(flagged_ids) < 3:
             return (
                 "Premature entry-point claim. Gather and flag more evidence before identify_entry_point."
             )
@@ -671,6 +1071,14 @@ def choose_action(
 ) -> NetworkForensicsAction:
     agent_state["current_task_name"] = task_name
     agent_state["strategy_hints"] = derive_strategy_hints(obs, agent_state)
     history = agent_state.get("previous_actions", [])[-HISTORY_WINDOW:]
     history_str = "\n".join([f"Step {i+1}: {a}" for i, a in enumerate(history)])
@@ -685,17 +1093,24 @@ def choose_action(
             "Follow the JSON schema in the system prompt."
         )
-    response = client.chat.completions.create(
-        model=model_name or MODEL_NAME,
-        temperature=0.1,
-        messages=[
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {
-                "role": "user",
-                "content": f"TASK: {task_name}{correction_text}\n\n### RECENT HISTORY:\n{history_str}\n\n### CURRENT OBSERVATION:\n{summarize_observation(obs, agent_state)}",
-            },
-        ],
-    )
     content = response.choices[0].message.content or ""
     try:
         action = sanitize_action(parse_action(content))
@@ -857,6 +1272,58 @@ def step_env(env: NetworkForensicsEnv, action: NetworkForensicsAction) -> Any:
     return result
 def close_env(env: NetworkForensicsEnv | None) -> None:
     if env is None:
         return
@@ -889,19 +1356,50 @@ def run_task(task_name: str) -> None:
         obs = reset_result.observation
         sync_agent_state(obs, agent_state)
         max_steps = obs.steps_remaining or 50
-        for _ in range(max_steps):
             if obs.done:
                 break
             error = None
             try:
-                action = choose_action(client, task_name, obs, agent_state)
             except Exception as exc:
                 error = str(exc).replace("\n", " ")
                 action = build_fallback_action(task_name, obs, agent_state)
-            step_result = step_env(env, action)
             obs = step_result.observation
             sync_agent_state(obs, agent_state)
             step_reward = float(step_result.reward or 0.0)

 import asyncio
 import inspect
 import random
+import time
 from pathlib import Path
 from typing import Any
 SYSTEM_PROMPT = """You are a senior Network Forensics Analyst. Your goal is to investigate malicious network traffic and achieve a 100% detection score.
 ### SCORING RULES:
+- You MUST identify and `flag_as_suspicious` EVERY malicious packet to maximize RECALL (very important!).
 - Only grouped packets or flagged packets contribute towards your score.
+- If RECALL is < 0.5, your score will be 0.0. DO NOT stop until you have flagged/grouped at least 60% of visible malicious packets.
+- Entry point must be the EARLIEST packet that initiated the attack (often in first group).
+- For HARD tasks: wrong entry point = score 0. Always identify_entry_point before submitting.
 ### WORKFLOW:
 1. **Explore**: `inspect_packet` on suspicious samples.
+2. **Flag**: `flag_as_suspicious` on ALL revealed malicious packets.
+3. **Correlate**: `group_into_session` with descriptive names.
+4. **Classify**: `tag_pattern` with a valid type.
+5. **Root Cause**: `identify_entry_point` with the earliest malicious packet.
+6. **Report**: `submit_report` ONLY when you have covered all visible malicious sessions.
+### VALID PATTERN TYPES:
+ddos, dos_slowloris, dos_slowhttptest, dos_goldeneye, dos_hulk, heartbleed, web_sql_injection, web_xss, web_bruteforce, c2, exfiltration, scan, lateral
 ### JSON SCHEMA EXAMPLES (Use these exactly):
 - Inspect: {"action_type":"inspect_packet","packet_id":"pkt_0001"}
 - Flag: {"action_type":"flag_as_suspicious","packet_id":"pkt_0001"}
 - Group: {"action_type":"group_into_session","session_name":"DDoS_Burst_2","packet_ids":["pkt_0001","pkt_0002"]}
 - Tag: {"action_type":"tag_pattern","session_name":"DDoS_Burst_2","pattern_type":"ddos"}
+- Entry: {"action_type":"identify_entry_point","claimed_entry_point":"pkt_0001"}
+- Report: {"action_type":"submit_report","incident_summary":"Detailed incident summary here.","claimed_entry_point":"pkt_0001"}"""
 HISTORY_WINDOW = 20
 REPEAT_ACTION_LIMIT = 3
 CORRECTION_WINDOW = 5
+UNTAGGED_BACKLOG_LIMIT = 6
 INSPECT_SOFT_RATIO_THRESHOLD = 0.60
+SOFT_STEP_BUDGETS = {"easy": 14, "medium": 28, "hard": 40}
+HARD_STEP_CAPS = {"easy": 30, "medium": 50, "hard": 65}
+TASK_SCORE_TARGETS = {"easy": 0.70, "medium": 0.68, "hard": 0.66}
+TASK_COVERAGE_TARGETS = {"easy": 0.32, "medium": 0.24, "hard": 0.20}
+MAX_TASK_SECONDS = float(os.getenv("MAX_TASK_SECONDS", "780"))
+TASK_TIME_BUDGET_SECONDS = {
+    "easy": float(os.getenv("EASY_MAX_SECONDS", "150")),
+    "medium": float(os.getenv("MEDIUM_MAX_SECONDS", "220")),
+    "hard": float(os.getenv("HARD_MAX_SECONDS", "320")),
+}
 def build_client() -> OpenAI:
 def summarize_observation(obs: Any, agent_state: dict[str, Any]) -> str:
+    """Provide a compact structured summary for low-latency policy learning."""
     packets = obs.visible_packets
     revealed = [p for p in packets if p.is_revealed]
     revealed_ids = [p.packet_id for p in revealed]
     reward_feedback = agent_state.get("last_reward_feedback", "n/a")
     recent_corrections = agent_state.get("recent_corrections", [])[-CORRECTION_WINDOW:]
     strategy_hints = agent_state.get("strategy_hints", [])
+    task_name = agent_state.get("current_task_name", "")
+    flagged_count = len(obs.flagged_packet_ids)
+    total_visible = max(1, len(obs.visible_packets))
+    coverage = flagged_count / total_visible
+    coverage_target = TASK_COVERAGE_TARGETS.get(task_name, 0.25)
+    score_target = TASK_SCORE_TARGETS.get(task_name, 0.65)
+    grouped_count = len(sessions)
+    tagged_count = len(tags)
+    ready_to_submit = (
+        obs.current_score_estimate >= score_target
+        and coverage >= coverage_target
+        and (task_name == "easy" or grouped_count >= 2)
+        and (task_name == "easy" or tagged_count >= 1)
+    )
     summary = [
         f"Step: {obs.step_number}/{obs.step_number + obs.steps_remaining}",
         f"Current Progress: {obs.current_score_estimate:.2f}",
+        f"Coverage: {flagged_count}/{total_visible} ({coverage:.2%}) | target {coverage_target:.0%}",
+        f"Sessions: grouped={grouped_count}, tagged={tagged_count}",
+        f"Submit Readiness: {'READY' if ready_to_submit else 'KEEP INVESTIGATING'}",
         f"Last Step Reward: {last_reward:.2f}" if isinstance(last_reward, (int, float)) else "Last Step Reward: n/a",
         f"Last Reward Feedback: {reward_feedback}",
+        f"ALREADY REVEALED: {', '.join(revealed_ids[-6:])} " + ("..." if len(revealed_ids) > 6 else ""),
         "\n### SESSIONS PENDING TAGGING:",
     ]
             summary.append(f"- {hint}")
     if untagged_sessions:
+        for s in untagged_sessions[:6]:
             summary.append(f"- {s} ({len(sessions[s])} packets)")
     else:
         summary.append("- [No pending sessions]")
     summary.append("\n### REVEALED INDICATORS:")
+    for p in revealed[-4:]:
         payload = (p.full_payload or "")[:150]
         if payload:
             summary.append(f"- {p.packet_id}: {payload}")
 def keyword_to_pattern(payload: str) -> str | None:
     text = payload.lower()
+    # --- DoS / DDoS variants ---
     if "slowloris" in text:
         return "dos_slowloris"
+    if "slowhttptest" in text or "slow http" in text:
         return "dos_slowhttptest"
+    if "goldeneye" in text or "golden eye" in text:
         return "dos_goldeneye"
     if "hulk" in text:
         return "dos_hulk"
+    if "heartbeat" in text or "heartbleed" in text or ("tls" in text and "ext" in text):
         return "heartbleed"
+    if "flood" in text or "burst" in text or "ddos" in text:
+        return "ddos"
+    # HTTP flood indicators (repeated GET/POST to same endpoint)
+    if text.startswith("get /") or text.startswith("post /") or text.startswith("get http"):
+        if "accept-encoding" in text or "connection" in text or "keep-alive" in text:
+            return "ddos"
+    # SYN flood / connection flood
+    if "syn" in text and "ack" not in text and len(text) < 30:
+        return "ddos"
+    # ICMP flood
+    if "icmp" in text and ("echo" in text or "request" in text or len(text) < 20):
+        return "ddos"
+    # --- Web attacks ---
+    if "xss" in text or "<script>" in text or "<scrip" in text or "/search?q=" in text or "onerror" in text or "onload" in text or "javascript:" in text or "alert(" in text or "%3cscript" in text:
         return "web_xss"
     if (
         "or 1=1" in text
         or "%20or" in text
         or "/items?id=" in text
         or "1=1" in text
+        or "' or " in text
+        or "'--" in text
+        or "union select" in text
+        or "union all select" in text
+        or "drop table" in text
+        or "select * from" in text
         or "sql" in text
+        or "%27" in text  # URL-encoded single quote
+        or "' and " in text
+        or "admin'--" in text
     ):
         return "web_sql_injection"
+    if (
+        "login" in text
+        or "username=admin" in text
+        or "password=" in text
+        or "passwd=" in text
+        or "user=admin" in text
+        or "brute" in text
+        or "/login" in text
+        or "/signin" in text
+        or "/auth" in text
+        or "post /login" in text
+        or "post /sign" in text
+    ):
         return "web_bruteforce"
+    # --- C2 / exfil / scan / lateral ---
+    if "c2" in text or "command" in text or "shell" in text or "cmd" in text or "/bin/" in text or "reverse" in text:
+        return "c2"
+    if "exfil" in text or "exfiltrat" in text or "data_leak" in text or "dns_tunnel" in text:
+        return "exfiltration"
+    if "scan" in text or "nmap" in text or "port_scan" in text or "recon" in text:
+        return "scan"
+    if "lateral" in text or "pivot" in text or "spread" in text or "propagat" in text:
+        return "lateral"
     return None
     return (packet.src_ip, packet.dst_ip, packet.dst_port, pattern)
+SUSPICIOUS_PORTS = {22, 23, 445, 1433, 3306, 5432, 4444, 5555, 6666, 6667, 7777, 8888, 9999, 31337}
+SUSPICIOUS_PROTOCOLS = {"ICMP"}
+def _infer_flow_pattern(packet: Any, flow_size: int) -> str | None:
+    """Heuristic pattern inference from flow characteristics when keyword matching fails."""
+    dst_port = packet.dst_port
+    protocol = packet.protocol
+    flags = getattr(packet, "flags", []) or []
+    # High-density flows to web ports → likely DDoS
+    if flow_size >= 5 and dst_port in {80, 8080, 443, 8443}:
+        return "ddos"
+    # SYN-only flood
+    if flow_size >= 5 and flags == ["SYN"]:
+        return "ddos"
+    # Suspicious ports → C2 or lateral
+    if dst_port in SUSPICIOUS_PORTS:
+        if dst_port in {4444, 5555, 6666, 7777, 31337}:
+            return "c2"
+        if dst_port in {445, 1433, 3306, 5432}:
+            return "lateral"
+    # ICMP flood
+    if protocol in SUSPICIOUS_PROTOCOLS and flow_size >= 3:
+        return "ddos"
+    # High-density flow to non-standard port
+    if flow_size >= 8 and dst_port not in {53, 80, 443, 8080}:
+        return "scan"
+    return None
 def session_candidates(obs: Any) -> list[tuple[tuple[str, str, int, str], list[Any]]]:
     grouped: dict[tuple[str, str, int, str], list[Any]] = {}
     attack_source_ports: dict[tuple[str, str, int, str], set[int]] = {}
+    # Phase 1: keyword-based grouping (high confidence)
     for packet in obs.visible_packets:
         pattern = keyword_to_pattern(packet_payload_text(packet))
         if pattern:
             grouped.setdefault(key, []).append(packet)
             attack_source_ports.setdefault(key, set()).add(packet.src_port)
+    # Add reverse-response packets to keyword-matched sessions
     for key, source_ports in attack_source_ports.items():
         src_ip, dst_ip, dst_port, _pattern = key
         for packet in obs.visible_packets:
             if is_reverse_response:
                 grouped[key].append(packet)
+    # Phase 2: flow-based grouping for packets without keyword match
+    # Group unclaimed packets by (src_ip, dst_ip, dst_port) and infer pattern
+    claimed_ids: set[str] = set()
+    for items in grouped.values():
+        for p in items:
+            claimed_ids.add(p.packet_id)
+    flow_buckets: dict[tuple[str, str, int], list[Any]] = {}
+    for packet in obs.visible_packets:
+        if packet.packet_id in claimed_ids:
+            continue
+        flow_key = (packet.src_ip, packet.dst_ip, packet.dst_port)
+        flow_buckets.setdefault(flow_key, []).append(packet)
+    for flow_key, items in flow_buckets.items():
+        if len(items) < 2:
+            continue
+        pattern = _infer_flow_pattern(items[0], len(items))
+        if pattern:
+            session_key = (*flow_key, pattern)
+            grouped.setdefault(session_key, []).extend(items)
+            for p in items:
+                claimed_ids.add(p.packet_id)
     candidates = [
         (
             key,
     return 0
+def select_inspect_packet(
+    obs: Any,
+    inspected_ids: set[str],
+    flagged_ids: set[str] | None = None,
+) -> str | None:
+    flagged_ids = flagged_ids or set()
+    unrevealed = [
+        p
+        for p in obs.visible_packets
+        if (not p.is_revealed)
+        and (p.packet_id not in inspected_ids)
+        and (p.packet_id not in flagged_ids)
+    ]
     if not unrevealed:
         return None
 def append_action_history(agent_state: dict[str, Any], action: NetworkForensicsAction) -> None:
     history = agent_state.setdefault("previous_actions", [])
     history.append(format_action(action))
+    if action.action_type == "inspect_packet" and action.packet_id:
+        inspected_ids = agent_state.setdefault("inspected_ids", set())
+        inspected_ids.add(action.packet_id)
     if len(history) > HISTORY_WINDOW:
         del history[:-HISTORY_WINDOW]
         candidate_packets, flagged_ids, visible_by_id
     )
     size = len(candidate_packets)
+    # Lowered thresholds for more aggressive grouping
     if task_name == "easy":
         min_flagged = 1 if size >= 2 else 0
     elif task_name == "medium":
+        min_flagged = 1 if size >= 2 else 0
     else:
+        min_flagged = 1 if size >= 3 else 0
+    if trusted_pattern and size >= 3:
         min_flagged = 1
     if flagged >= min_flagged:
         return True
     # Allow grouping with strong revealed malicious evidence.
+    if task_name == "easy" and (malicious_revealed >= 1 or revealed >= 1):
         return True
+    if task_name == "medium" and malicious_revealed >= 1 and revealed >= 1:
         return True
+    if malicious_revealed >= 1 and revealed >= min(2, size):
+        return True
+    # After a pattern has been confirmed by tagging, allow structure-first grouping.
+    if trusted_pattern and size >= 3:
         return True
+    # Large flows are very likely attack sessions - allow with minimal evidence
+    if size >= 6 and (flagged >= 1 or revealed >= 2 or malicious_revealed >= 1):
         return True
     return False
         )
     inspect_limit = {
+        "easy": 18,
+        "medium": 20,
+        "hard": 25,
+    }.get(agent_state.get("current_task_name", ""), 15)
     if len(previous_actions) >= inspect_limit and inspect_ratio >= INSPECT_SOFT_RATIO_THRESHOLD:
         hints.append(
             "You are over-inspecting. Shift to flagging, grouping, tagging, or report submission unless the next packet is clearly high-value."
     return hints
def should_submit_early(task_name: str, obs: Any, agent_state: dict[str, Any]) -> bool:
    """Decide whether progress is good enough to submit the report now.

    Coverage is flagged packets divided by visible packets (with the
    denominator floored at 1). Thresholds come from ``TASK_SCORE_TARGETS``
    and ``TASK_COVERAGE_TARGETS``, falling back to 0.65 and 0.25.

    Args:
        task_name: ``"easy"``, ``"medium"``, or anything else (which uses the
            strictest rule set).
        obs: Current observation.
        agent_state: Agent bookkeeping; only ``"claimed_entry_point"`` is read.

    Returns:
        ``True`` when every condition for the task's rule set holds.
    """
    n_flagged = len(obs.flagged_packet_ids)
    n_visible = max(1, len(obs.visible_packets))
    coverage = n_flagged / n_visible
    score = float(obs.current_score_estimate)
    session_total = len(obs.grouped_sessions or {})
    tag_total = len(obs.tagged_patterns or {})
    score_target = TASK_SCORE_TARGETS.get(task_name, 0.65)
    coverage_target = TASK_COVERAGE_TARGETS.get(task_name, 0.25)

    if task_name == "easy":
        # Easy ignores the score estimate: coverage, flag count and one session.
        coverage_floor = max(coverage_target * 0.7, 0.20)
        return coverage >= coverage_floor and n_flagged >= 6 and session_total >= 1

    # Requirements shared by the medium rule set and the default rule set.
    baseline_met = (
        score >= score_target * 0.8
        and coverage >= coverage_target * 0.7
        and tag_total >= 1
    )
    if task_name == "medium":
        return baseline_met and session_total >= 1

    if not (baseline_met and session_total >= 2):
        return False
    # The default rule set additionally needs an entry point to be claimed.
    return bool(agent_state.get("claimed_entry_point") or obs.claimed_entry_point)
 def build_fallback_action(
     task_name: str, obs: Any, agent_state: dict[str, Any]
 ) -> NetworkForensicsAction:
+    """Smart workflow engine: Flag aggressive -> Group -> Tag -> Entry Point -> Report."""
     inspected_ids = agent_state.setdefault("inspected_ids", set())
     flagged_ids = agent_state.setdefault("flagged_ids", set())
     session_map = agent_state.setdefault("sessions", {})  # key -> session_name
     visible_by_id = {p.packet_id: p for p in obs.visible_packets}
     trusted = trusted_patterns(session_map, tagged_sessions)
+    if obs.steps_remaining <= 1 or should_submit_early(task_name, obs, agent_state):
         summary = _build_report_summary(obs, agent_state)
         return NetworkForensicsAction(
             action_type="submit_report",
             claimed_entry_point=claimed_entry,
         )
+    # PHASE 1: Aggressive flag of ALL revealed malicious packets
+    # This maximizes recall by comprehensively flagging known-bad traffic
+    unflagged_malicious = []
     for packet in obs.visible_packets:
         if packet.is_revealed and packet.packet_id not in flagged_ids:
             payload = packet.full_payload or ""
             pattern = keyword_to_pattern(payload)
             if pattern:
+                unflagged_malicious.append(packet.packet_id)
+    if unflagged_malicious:
+        # Flag up to 5 per turn for aggressive recall buildup
+        target = min(5, len(unflagged_malicious))
+        for _ in range(target):
+            if unflagged_malicious:
+                pid = unflagged_malicious.pop(0)
+                flagged_ids.add(pid)
                 return NetworkForensicsAction(
                     action_type="flag_as_suspicious",
+                    packet_id=pid,
                 )
     # PHASE 2: Group flagged packets into sessions with evidence gate and backlog pacing.
+    min_flagged_before_group = 1 if task_name == "easy" else 2
     untagged_backlog = max(0, len(session_map) - len(tagged_sessions))
+    if len(flagged_ids) >= min_flagged_before_group and untagged_backlog <= UNTAGGED_BACKLOG_LIMIT:
         candidates = session_candidates(obs)
         for key, items in candidates:
             if key in session_map:
                 packet_ids=packet_ids,
             )
+    # PHASE 2.5: Recall sweep - flag packets that are already part of grouped sessions.
+    # This boosts recall quickly without requiring more inspections.
+    grouped_packets = []
+    for packet_ids in (obs.grouped_sessions or {}).values():
+        grouped_packets.extend(packet_ids)
+    for pid in sorted(set(grouped_packets), key=packet_sort_key):
+        if pid in flagged_ids:
+            continue
+        if pid in visible_by_id:
+            flagged_ids.add(pid)
+            return NetworkForensicsAction(
+                action_type="flag_as_suspicious",
+                packet_id=pid,
+            )
+    # PHASE 3: Tag ALL untagged sessions aggressively (critical for medium/hard logic_score).
+    # Tagging helps LLM report score and logic_score for all difficulties.
+    for key, session_name in session_map.items():
+        if session_name in tagged_sessions:
+            continue
+        _src_ip, _dst_ip, _dst_port, pattern = key
+        tagged_sessions.add(session_name)
+        return NetworkForensicsAction(
+            action_type="tag_pattern",
+            session_name=session_name,
+            pattern_type=pattern,
+        )
+    # Also tag any observed sessions not yet in our session_map
+    for session_name, session_data in (obs.grouped_sessions or {}).items():
+        if session_name in tagged_sessions:
+            continue
+        if session_name in (obs.tagged_patterns or {}):
+            tagged_sessions.add(session_name)
+            continue
+        # Infer pattern from session packets
+        pattern = None
+        for pid in session_data:
+            pkt = visible_by_id.get(pid)
+            if pkt and pkt.is_revealed:
+                pattern = keyword_to_pattern(packet_payload_text(pkt))
+                if pattern:
+                    break
+        if not pattern:
+            # Try flow-based inference
+            pkt = visible_by_id.get(session_data[0]) if session_data else None
+            if pkt:
+                pattern = _infer_flow_pattern(pkt, len(session_data))
+        if pattern:
             tagged_sessions.add(session_name)
             return NetworkForensicsAction(
                 action_type="tag_pattern",
                 pattern_type=pattern,
             )
+    # PHASE 4: Identify entry point - CRITICAL for hard mode (score=0 without it)
+    if not claimed_entry:
+        entry_candidate = None
+        # Strategy 1: earliest packet in any grouped session from observation
+        try:
+            grouped_packets = set()
+            for session_name in session_map.values():
+                if obs.grouped_sessions and session_name in obs.grouped_sessions:
+                    grouped_packets.update(obs.grouped_sessions[session_name])
+            if grouped_packets:
+                entry_candidate = min(grouped_packets, key=lambda pid: packet_sort_key(pid))
+        except Exception:
+            pass
+        # Strategy 2: earliest flagged packet (often the first discovered attack)
+        if not entry_candidate and flagged_ids:
+            entry_candidate = min(flagged_ids, key=lambda pid: packet_sort_key(pid))
+        # Strategy 3: earliest revealed malicious packet
+        if not entry_candidate:
+            revealed_malicious = [
+                p for p in obs.visible_packets
+                if p.is_revealed and keyword_to_pattern(packet_payload_text(p))
+            ]
+            if revealed_malicious:
+                entry_candidate = min(
+                    revealed_malicious, key=lambda p: packet_sort_key(p.packet_id)
+                ).packet_id
+        # Strategy 4: earliest packet in session_candidates
+        if not entry_candidate:
+            all_session_packets = []
+            for key, items in session_candidates(obs):
+                for p in items:
+                    all_session_packets.append(p.packet_id)
+            if all_session_packets:
+                entry_candidate = min(all_session_packets, key=packet_sort_key)
+        # Strategy 5: earliest flagged packet from observation
+        if not entry_candidate and obs.flagged_packet_ids:
+            entry_candidate = min(obs.flagged_packet_ids, key=packet_sort_key)
+        if entry_candidate:
+            agent_state["claimed_entry_point"] = entry_candidate
+            return NetworkForensicsAction(
+                action_type="identify_entry_point",
+                claimed_entry_point=entry_candidate,
+            )
+    # PHASE 5: Inspect more unrevealed packets (to discover more malicious traffic)
+    inspect_id = select_inspect_packet(obs, inspected_ids, flagged_ids)
     if inspect_id is not None:
         return NetworkForensicsAction(action_type="inspect_packet", packet_id=inspect_id)
 def _build_report_summary(obs: Any, agent_state: dict[str, Any]) -> str:
+    """Generate a detailed incident summary for high LLM judge scores."""
     flagged = agent_state.get("flagged_ids", set())
     sessions = agent_state.get("sessions", {})
     tagged = agent_state.get("tagged_sessions", set())
+    entry_point = agent_state.get("claimed_entry_point") or getattr(obs, "claimed_entry_point", None)
+    patterns_by_session: dict[str, str] = {}
+    src_ips_by_pattern: dict[str, set[str]] = {}
+    dst_ips_by_pattern: dict[str, set[str]] = {}
+    for key, session_name in sessions.items():
         if len(key) >= 4:
+            pattern = key[3]
+            patterns_by_session[session_name] = pattern
+            src_ips_by_pattern.setdefault(pattern, set()).add(key[0])
+            dst_ips_by_pattern.setdefault(pattern, set()).add(key[1])
+    # Build detailed per-pattern section
+    pattern_details = []
+    for pattern in sorted(set(patterns_by_session.values())):
+        srcs = ", ".join(sorted(src_ips_by_pattern.get(pattern, set()))[:5])
+        dsts = ", ".join(sorted(dst_ips_by_pattern.get(pattern, set()))[:5])
+        session_names = [n for n, p in patterns_by_session.items() if p == pattern]
+        pattern_details.append(
+            f"  - {pattern}: {len(session_names)} session(s) from {srcs} targeting {dsts}"
+        )
+    pattern_section = "\n".join(pattern_details) if pattern_details else "  - No patterns classified"
+    # Tagged pattern summary
+    tagged_details = []
+    for session_name in sorted(tagged):
+        pattern = patterns_by_session.get(session_name, "unknown")
+        tagged_details.append(f"{session_name}={pattern}")
+    tagged_section = "; ".join(tagged_details) if tagged_details else "none"
+    entry_section = f"Entry point: {entry_point}" if entry_point else "Entry point: not identified"
     return (
+        f"INCIDENT REPORT\n\n"
+        f"Summary: Detected {len(flagged)} malicious packets across "
+        f"{len(sessions)} attack sessions.\n\n"
+        f"Attack Patterns:\n{pattern_section}\n\n"
+        f"Tagged Sessions: {tagged_section}\n\n"
+        f"{entry_section}\n\n"
+        f"Total flagged: {len(flagged)} | Total sessions: {len(sessions)} | "
+        f"Classified sessions: {len(tagged)}"
     )
     inspect_count = sum(1 for a in previous_actions if '"inspect_packet"' in a)
     revealed_count = sum(1 for p in obs.visible_packets if p.is_revealed)
     inspect_limit = {
+        "easy": 25,
+        "medium": 18,
+        "hard": 25,
+    }.get(task_name, 15)
     if action.action_type not in {
         "inspect_packet",
             return "Missing packet_id for inspect_packet"
         if action.packet_id not in {p.packet_id for p in obs.visible_packets}:
             return f"Invalid packet_id {action.packet_id} - not in visible_packets"
+        inspected_ids = agent_state.setdefault("inspected_ids", set())
+        if action.packet_id in inspected_ids:
+            return f"Packet {action.packet_id} was already inspected. Choose a different hidden packet."
         revealed_ids = {p.packet_id for p in obs.visible_packets if p.is_revealed}
         if action.packet_id in revealed_ids:
             return f"Packet {action.packet_id} is ALREADY revealed. Choose a HIDDEN packet."
+        if action.packet_id in set(obs.flagged_packet_ids):
+            return (
+                f"Packet {action.packet_id} is already flagged. Inspect a new hidden unflagged packet instead."
+            )
+        revealed_unflagged_malicious = [
+            p.packet_id
+            for p in obs.visible_packets
+            if p.is_revealed
+            and p.packet_id not in set(obs.flagged_packet_ids)
+            and keyword_to_pattern(packet_payload_text(p))
+        ]
+        if revealed_unflagged_malicious:
+            return (
+                "Recall-first policy: revealed malicious packets exist and must be flagged before new inspection."
+            )
+        grouped_unflagged = [
+            pid
+            for packet_ids in (obs.grouped_sessions or {}).values()
+            for pid in packet_ids
+            if pid not in set(obs.flagged_packet_ids)
+        ]
+        if grouped_unflagged:
             return (
+                "Recall-first policy: grouped session packets remain unflagged. Flag them before further inspection."
             )
+        if task_name == "easy" and len(flagged_ids) >= 4:
+            grouped_session_names = set((obs.grouped_sessions or {}).keys())
+            for key, items in session_candidates(obs):
+                if key in sessions:
+                    continue
+                if len(items) >= 4:
+                    return (
+                        "Exploit mode: enough evidence exists. Group high-confidence attack flows before more inspection."
+                    )
+        if inspect_count >= inspect_limit and (len(sessions) > 0 or len(flagged_ids) > 0 or revealed_count >= 4):
+            # Only block inspections for medium/hard modes; easy mode needs discovery
+            if task_name != "easy":
+                return (
+                    f"Inspection budget reached for {task_name}. Shift to flagging, grouping, tagging, or report submission."
+                )
     if action.action_type == "flag_as_suspicious":
         if not action.packet_id:
         }
         if invalid_ids:
             return f"Invalid packet_ids in session: {invalid_ids}"
+        if action.session_name in sessions.values():
+            return f"Session name {action.session_name} is already used."
+        min_flagged_before_group = 1 if task_name == "easy" else 1
+        if len(flagged_ids) < min_flagged_before_group:
+            return (
+                f"Group blocked until enough evidence is flagged ({len(flagged_ids)}/{min_flagged_before_group}). "
+                "Inspect and flag suspicious packets first."
+            )
+        new_group_ids = set(action.packet_ids)
+        for existing_ids in (obs.grouped_sessions or {}).values():
+            existing_set = set(existing_ids)
+            if not existing_set:
+                continue
+            overlap = len(new_group_ids & existing_set) / max(1, len(new_group_ids))
+            if overlap >= 0.8:
+                return "This grouping heavily overlaps an existing session. Prioritize new evidence."
         untagged_backlog = max(0, len(sessions) - len(tagged_sessions))
         if untagged_backlog > UNTAGGED_BACKLOG_LIMIT:
             return (
     if action.action_type == "submit_report":
         untagged_backlog = max(0, len(sessions) - len(tagged_sessions))
+        total_visible = max(1, len(obs.visible_packets))
+        flagged_count = len(obs.flagged_packet_ids)
+        coverage = flagged_count / total_visible
+        min_cov = TASK_COVERAGE_TARGETS.get(task_name, 0.25) * 0.6
+        min_flags = 4 if task_name == "easy" else (3 if task_name == "medium" else 4)
+        min_groups = 1 if task_name == "easy" else (2 if task_name == "medium" else 2)
+        if (
+            obs.steps_remaining > 2
+            and obs.current_score_estimate < 0.40
+            and not should_submit_early(task_name, obs, agent_state)
+        ):
             return (
                 "Premature report submission. Improve coverage and score estimate before submit_report."
             )
+        if obs.steps_remaining > 1 and (coverage < min_cov or flagged_count < min_flags):
+            return (
+                f"Premature report submission. Need stronger recall coverage before submit_report "
+                f"(coverage {coverage:.0%}/{min_cov:.0%}, flags {flagged_count}/{min_flags})."
+            )
+        if obs.steps_remaining > 1 and len(sessions) < min_groups:
+            return (
+                f"Premature report submission. Need stronger session evidence before submit_report "
+                f"(grouped {len(sessions)}/{min_groups})."
+            )
+        if task_name == "hard" and obs.steps_remaining > 3 and untagged_backlog > 0:
             return "Premature report submission. Tag pending sessions before submitting report."
+        # CRITICAL: Hard mode zero-out if no entry point identified
+        if task_name == "hard" and not (agent_state.get("claimed_entry_point") or obs.claimed_entry_point):
+            return (
+                "FATAL: Hard mode requires identify_entry_point before submit_report. "
+                "No entry point claimed yet — score will be 0.0 without it. "
+                "Use identify_entry_point with the earliest malicious packet first."
+            )
+        # Medium mode: need entry point for good logic_score
+        if task_name == "medium" and obs.steps_remaining > 5 and not (agent_state.get("claimed_entry_point") or obs.claimed_entry_point):
+            return (
+                "Missing entry point. Use identify_entry_point before submit_report for higher score."
+            )
+        # Require minimum tagging coverage for medium/hard
+        min_tagged = 1 if task_name == "medium" else 2
+        if task_name in {"medium", "hard"} and len(tagged_sessions) < min_tagged and obs.steps_remaining > 3:
+            return (
+                f"Premature report submission. Need at least {min_tagged} tagged session(s) before submit_report "
+                f"(currently {len(tagged_sessions)})."
+            )
     if action.action_type == "tag_pattern":
         if not action.session_name:
             return "Missing session_name for tag_pattern"
         if not action.pattern_type:
             return "Missing pattern_type for tag_pattern"
+        if action.session_name in set((obs.tagged_patterns or {}).keys()):
+            return f"Session {action.session_name} is already tagged."
+        if task_name == "easy" and obs.steps_remaining > 8:
+            return "For easy mode, prioritize recall actions (inspect/flag/group) before tagging."
         valid_patterns = {
             "ddos", "dos_slowloris", "dos_slowhttptest", "dos_goldeneye", "dos_hulk",
             "heartbleed", "web_sql_injection", "web_xss", "web_bruteforce",
     if action.action_type == "identify_entry_point":
         if not action.claimed_entry_point:
             return "Missing claimed_entry_point for identify_entry_point"
+        # Lenient gating for easy mode
+        min_flags_needed = 1 if task_name == "easy" else (2 if task_name == "medium" else 2)
+        if obs.steps_remaining > 8 and len(flagged_ids) < min_flags_needed:
             return (
                 "Premature entry-point claim. Gather and flag more evidence before identify_entry_point."
             )
 ) -> NetworkForensicsAction:
     agent_state["current_task_name"] = task_name
     agent_state["strategy_hints"] = derive_strategy_hints(obs, agent_state)
+    if should_submit_early(task_name, obs, agent_state):
+        action = NetworkForensicsAction(
+            action_type="submit_report",
+            incident_summary=_build_report_summary(obs, agent_state),
+            claimed_entry_point=agent_state.get("claimed_entry_point") or obs.claimed_entry_point,
+        )
+        append_action_history(agent_state, action)
+        return action
     history = agent_state.get("previous_actions", [])[-HISTORY_WINDOW:]
     history_str = "\n".join([f"Step {i+1}: {a}" for i, a in enumerate(history)])
             "Follow the JSON schema in the system prompt."
         )
+    try:
+        response = client.chat.completions.create(
+            model=model_name or MODEL_NAME,
+            temperature=0.1,
+            timeout=LLM_TIMEOUT_S,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {
+                    "role": "user",
+                    "content": f"TASK: {task_name}{correction_text}\n\n### RECENT HISTORY:\n{history_str}\n\n### CURRENT OBSERVATION:\n{summarize_observation(obs, agent_state)}",
+                },
+            ],
+        )
+    except Exception as llm_exc:
+        print(f"[WARN] LLM call failed/timed out: {llm_exc}")
+        fallback = build_fallback_action(task_name, obs, agent_state)
+        append_action_history(agent_state, fallback)
+        return fallback
     content = response.choices[0].message.content or ""
     try:
         action = sanitize_action(parse_action(content))
     return result
# Maximum number of attempts for a single environment step.
WS_RETRY_COUNT = 3
# Base back-off in seconds; the wait before retry k is WS_RETRY_DELAY_S * k.
WS_RETRY_DELAY_S = 2.0
# Timeout (seconds) passed to the chat-completion request.
LLM_TIMEOUT_S = 45.0


def step_env_with_retry(
    env: NetworkForensicsEnv,
    action: NetworkForensicsAction,
    task_name: str,
    agent_state: dict[str, Any],
) -> tuple[Any, NetworkForensicsEnv | None]:
    """Run ``step_env`` and retry on connection-style failures.

    When a step fails with an error whose message looks like a WebSocket or
    connection problem, the current environment is closed, a new one is
    created and reset for ``task_name``, ``agent_state`` is re-synced from the
    reset observation, and the same action is tried again.

    Args:
        env: Environment to step.
        action: Action to submit.
        task_name: Task passed to ``reset_env`` after a reconnect.
        agent_state: Agent bookkeeping, re-synced after a reconnect.

    Returns:
        ``(step_result, replacement_env)``. ``replacement_env`` is the
        environment created during a reconnect, or ``None`` when the original
        ``env`` served the step. Callers must switch to it when it is set,
        because the original environment has been closed by then.

    Raises:
        Exception: The original error when it is not connection related, or
            the last connection error once all attempts are exhausted.
        RuntimeError: If ``WS_RETRY_COUNT`` allows no attempt at all.
    """
    replacement_env: NetworkForensicsEnv | None = None
    last_exc: Exception | None = None

    for attempt in range(1, WS_RETRY_COUNT + 1):
        try:
            # Hand back the replacement env (if any) so the caller stops using
            # the closed one; previously this was always None.
            return step_env(env, action), replacement_env
        except Exception as exc:
            last_exc = exc
            exc_str = str(exc).lower()
            is_ws_timeout = any(
                kw in exc_str
                for kw in ("keepalive", "ping timeout", "1011", "websocket", "connection")
            )
            if not is_ws_timeout:
                raise
            print(
                f"[WARN] WebSocket timeout on attempt {attempt}/{WS_RETRY_COUNT}: {exc}"
            )
            if attempt >= WS_RETRY_COUNT:
                break

            # Linear back-off before reconnecting.
            time.sleep(WS_RETRY_DELAY_S * attempt)
            try:
                close_env(env)
            except Exception as close_exc:
                # Best effort: the connection is presumably already dead.
                print(f"[WARN] Closing stale environment failed: {close_exc}")
            try:
                env = create_env()
                replacement_env = env
                # NOTE(review): reset_env presumably restarts the episode, so
                # the retried action runs against a fresh one - confirm this
                # is the intended meaning of "resuming".
                reset_result = reset_env(env, task_name)
                sync_agent_state(reset_result.observation, agent_state)
                print(f"[INFO] Reconnected to environment, resuming task={task_name}")
            except Exception as reconnect_exc:
                print(f"[WARN] Reconnect failed: {reconnect_exc}")

    # All attempts failed. The caller never receives the replacement env on
    # this path, so release it here instead of leaking the connection.
    if replacement_env is not None:
        try:
            close_env(replacement_env)
        except Exception as close_exc:
            print(f"[WARN] Closing replacement environment failed: {close_exc}")
    if last_exc is None:
        raise RuntimeError("step_env_with_retry made no attempt; WS_RETRY_COUNT must be >= 1")
    raise last_exc
 def close_env(env: NetworkForensicsEnv | None) -> None:
     if env is None:
         return
         obs = reset_result.observation
         sync_agent_state(obs, agent_state)
         max_steps = obs.steps_remaining or 50
+        soft_budget = min(max_steps, SOFT_STEP_BUDGETS.get(task_name, max_steps))
+        hard_budget = min(max_steps, HARD_STEP_CAPS.get(task_name, max_steps))
+        start_ts = time.monotonic()
+        task_time_budget = min(MAX_TASK_SECONDS, TASK_TIME_BUDGET_SECONDS.get(task_name, MAX_TASK_SECONDS))
+        for _ in range(hard_budget):
             if obs.done:
                 break
+            elapsed = time.monotonic() - start_ts
+            total_visible = max(1, len(obs.visible_packets))
+            current_coverage = len(obs.flagged_packet_ids) / total_visible
+            min_cov = TASK_COVERAGE_TARGETS.get(task_name, 0.25)
+            ready_for_budget_submit = (
+                obs.step_number >= soft_budget
+                and should_submit_early(task_name, obs, agent_state)
+            )
+            forced_at_hard_cap = (
+                obs.step_number >= max(1, hard_budget - 1)
+                and (should_submit_early(task_name, obs, agent_state) or task_name != "easy")
+            )
+            nearing_time_limit = elapsed >= max(20.0, task_time_budget - 12.0)
             error = None
             try:
+                if forced_at_hard_cap or nearing_time_limit or ready_for_budget_submit:
+                    action = NetworkForensicsAction(
+                        action_type="submit_report",
+                        incident_summary=_build_report_summary(obs, agent_state),
+                        claimed_entry_point=agent_state.get("claimed_entry_point") or obs.claimed_entry_point,
+                    )
+                else:
+                    action = choose_action(client, task_name, obs, agent_state)
             except Exception as exc:
                 error = str(exc).replace("\n", " ")
                 action = build_fallback_action(task_name, obs, agent_state)
+            try:
+                step_result, new_env = step_env_with_retry(env, action, task_name, agent_state)
+                if new_env is not None:
+                    env = new_env
+            except Exception as exc:
+                print(f"[WARN] step failure on task={task_name}: {exc}")
+                break
             obs = step_result.observation
             sync_agent_state(obs, agent_state)
             step_reward = float(step_result.reward or 0.0)

server/gradio_ui.py CHANGED Viewed

@@ -453,12 +453,6 @@ def create_demo() -> gr.Blocks:
     with gr.Blocks(
         title="NetForensics-RL · Analyst Console",
-        theme=gr.themes.Base(
-            primary_hue="blue",
-            neutral_hue="slate",
-            font=gr.themes.GoogleFont("Inter"),
-        ),
-        css=css,
     ) as demo:
         with gr.Column(elem_classes=["app-shell"]):
             gr.HTML(f"<style>{css}</style>")

     with gr.Blocks(
         title="NetForensics-RL · Analyst Console",
     ) as demo:
         with gr.Column(elem_classes=["app-shell"]):
             gr.HTML(f"<style>{css}</style>")