Spaces:

nvidia
/

LocateAnything

Running on Zero

App Files Files Community

gradio server

by akhaliq HF Staff - opened 6 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+1134

-854

Files changed (4) hide show

README.md +1 -1
app.py +206 -852
index.html +926 -0
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 💬
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 6.5.1
 python_version: "3.10.13"
 app_file: app.py
 pinned: false

 colorFrom: yellow
 colorTo: purple
 sdk: gradio
+sdk_version: 6.14.0
 python_version: "3.10.13"
 app_file: app.py
 pinned: false

app.py CHANGED Viewed

@@ -1,9 +1,17 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import gradio as gr
 import cv2
 import numpy as np
 import os
 import tempfile
 import re
 import time
@@ -13,50 +21,16 @@ import io
 import json
 import uuid
 from pathlib import Path
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoProcessor, AutoModel, AutoTokenizer
-from huggingface_hub import CommitScheduler
-import spaces
 _FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "LXGWWenKai-Bold.ttf")
-def _get_first_env(*names):
-    for name in names:
-        value = os.environ.get(name)
-        if value and value.strip():
-            return value.strip()
-    return None
-def _configure_hf_auth():
-    model_token = _get_first_env(
-        "MODEL_HF_TOKEN",
-        "LOG_HF_TOKEN",
-        "HF_TOKEN",
-        "HUGGINGFACE_HUB_TOKEN",
-        "HUGGINGFACEHUB_API_TOKEN",
-    )
-    log_token = _get_first_env(
-        "LOG_HF_TOKEN",
-        "MODEL_HF_TOKEN",
-        "HF_TOKEN",
-        "HUGGINGFACE_HUB_TOKEN",
-        "HUGGINGFACEHUB_API_TOKEN",
-    )
-    shared_token = model_token or log_token
-    if shared_token:
-        # Some downstream hub calls still rely on standard env var names.
-        for name in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
-            os.environ[name] = shared_token
-    return model_token, log_token
-MODEL_HF_TOKEN, LOG_HF_TOKEN = _configure_hf_auth()
 def _load_font(size=20):
     """加载中文字体（LXGW WenKai），需提前放置到 assets/ 目录"""
@@ -233,23 +207,22 @@ class EagleWorker:
         self.device = device
         self.dtype = torch.bfloat16
         self.generation_mode = generation_mode
-        self.hf_token = MODEL_HF_TOKEN
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
             trust_remote_code=True,
-            token=self.hf_token,
         )
         self.processor = AutoProcessor.from_pretrained(
             model_path,
             trust_remote_code=True,
-            token=self.hf_token,
         )
         self.model = AutoModel.from_pretrained(
             model_path,
             torch_dtype=self.dtype,
             _attn_implementation="sdpa",
             trust_remote_code=True,
-            token=self.hf_token,
         ).to(device).eval()
         print("Model Loaded Successfully!")
@@ -299,7 +272,7 @@ class EagleWorker:
 # ============================================================
-# 后处理 / HTML
 # ============================================================
 def _postprocess_detections(detections, w, h):
     valid = []
@@ -333,106 +306,6 @@ def _parse_out_info_dict(out_info: str) -> dict:
     return stats
-def generate_dynamic_html(token_sequence, out_info, raw_text):
-    uid = f"a{int(time.time() * 1000)}"
-    css = f"""
-    <style>
-        .dc-root {{
-            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
-            border: 1px solid #cce875; border-radius: 10px; background: #ffffff; overflow: hidden;
-        }}
-        .dc-header {{
-            display: flex; align-items: center; justify-content: space-between;
-            padding: 12px 18px;
-            background: linear-gradient(135deg, #76b900 0%, #649d00 100%);
-            border-bottom: 1px solid #527f00;
-        }}
-        .dc-header-title {{ font-weight: 700; font-size: 0.95em; color: #ffffff !important; letter-spacing: 0.3px; }}
-        .dc-legend {{ display: flex; gap: 16px; align-items: center; }}
-        .dc-legend-item {{ display: flex; align-items: center; gap: 5px; font-size: 0.78em; color: rgba(255,255,255,0.92); font-weight: 500; }}
-        .dc-legend-dot {{ width: 10px; height: 10px; border-radius: 3px; display: inline-block; border: 1px solid rgba(255,255,255,0.5); }}
-        .dc-row {{ display: flex; gap: 10px; padding: 14px 18px; border-bottom: 1px solid #eef7d1; }}
-        .dc-row:last-child {{ border-bottom: none; }}
-        .dc-val {{ flex: 1; line-height: 2.3; word-wrap: break-word; color: #4b5563; font-size: 0.92em; }}
-        @keyframes tk-{uid} {{
-            0%   {{ opacity: 0; transform: translateY(8px) scale(0.92); }}
-            60%  {{ opacity: 1; transform: translateY(-2px) scale(1.02); }}
-            100% {{ opacity: 1; transform: translateY(0) scale(1); }}
-        }}
-        .tk-mtp-{uid}, .tk-ar-{uid} {{
-            opacity: 0; animation: tk-{uid} 0.35s ease-out forwards;
-            border-radius: 5px; padding: 2px 7px; margin: 2px 1px; display: inline-block;
-            font-size: 0.80em; font-weight: 600;
-            font-family: 'SFMono-Regular', Consolas, 'Courier New', monospace; white-space: nowrap;
-        }}
-        .tk-mtp-{uid} {{ background: #e8f5e9; border: 2px solid #76b900; color: #2d4400; box-shadow: 0 1px 2px rgba(118,185,0,0.15); }}
-        .tk-ar-{uid} {{ background: #fff3e0; border: 2px solid #e65100; color: #bf360c; box-shadow: 0 1px 2px rgba(230,81,0,0.15); }}
-        .tk-stat-{uid} {{
-            opacity: 0; animation: tk-{uid} 0.4s ease-out forwards;
-            background: #f0f9e2; border: 1px solid #a4d422; border-radius: 6px;
-            padding: 5px 14px; display: inline-block; font-size: 0.82em; color: #3f6200; font-weight: 600;
-        }}
-        .dc-raw {{ padding: 0 18px 14px; }}
-        .dc-raw summary {{ cursor: pointer; color: #9ca3af; font-size: 0.82em; user-select: none; transition: color .15s; }}
-        .dc-raw summary:hover {{ color: #649d00; }}
-        .dc-raw-pre {{
-            background: #f7fbe8; border: 1px solid #ddf0a3; border-radius: 6px;
-            padding: 12px; margin-top: 8px;
-            font-family: 'SFMono-Regular', Consolas, 'Courier New', monospace;
-            font-size: 0.78em; color: #374151; white-space: pre-wrap; word-break: break-all;
-            max-height: 200px; overflow-y: auto;
-        }}
-        @media (max-width: 640px) {{
-            .dc-header {{ flex-direction: column; gap: 8px; align-items: flex-start; }}
-            .dc-row {{ flex-direction: column; gap: 4px; }}
-        }}
-    </style>
-    """
-    h = css + '<div class="dc-root">'
-    h += ('<div class="dc-header">'
-          '<span class="dc-header-title">LocateAnything Decoding Trace</span>'
-          '<div class="dc-legend">'
-          '<div class="dc-legend-item"><span class="dc-legend-dot" style="background:#76b900;"></span>MTP &mdash; Parallel Box Decoding</div>'
-          '<div class="dc-legend-item"><span class="dc-legend-dot" style="background:#e65100;"></span>AR &mdash; NTP Fallback (Re-decoding)</div>'
-          '</div></div>')
-    h += '<div class="dc-row"><div class="dc-val">'
-    tok_idx = 0
-    if token_sequence:
-        for item in token_sequence:
-            if not isinstance(item, (list, tuple)) or len(item) < 2:
-                continue
-            decode_type = str(item[0]).lower()
-            text = str(item[1])
-            safe = text.replace("<", "&lt;").replace(">", "&gt;")
-            delay = f"{tok_idx * 0.06:.2f}s"
-            cls = f"tk-ar-{uid}" if decode_type == "ar" else f"tk-mtp-{uid}"
-            h += f'<span class="{cls}" style="animation-delay:{delay}">{safe}</span> '
-            tok_idx += 1
-    h += '</div></div>'
-    if out_info:
-        stats = _parse_out_info_dict(out_info)
-        bits = []
-        if "forward_step" in stats: bits.append(f"{stats['forward_step']} steps")
-        if "num_tokens" in stats: bits.append(f"{stats['num_tokens']} tokens")
-        if "num_boxes" in stats: bits.append(f"{stats['num_boxes']} boxes")
-        if "switch_to_ar" in stats:
-            n = stats["switch_to_ar"]
-            bits.append(f"{n} AR Fallback{'s' if n != '1' else ''}")
-        if "ar_step" in stats: bits.append(f"{stats['ar_step']} AR steps")
-        if "tps" in stats: bits.append(f"{stats['tps']} tok/s")
-        if "bps" in stats: bits.append(f"{stats['bps']} box/s")
-        summary = " &middot; ".join(bits) if bits else out_info.strip()
-        stat_delay = f"{tok_idx * 0.06 + 0.3:.2f}s"
-        h += (f'<div class="dc-row" style="justify-content:flex-end;padding-top:4px;padding-bottom:10px;border-bottom:none;">'
-              f'<span class="tk-stat-{uid}" style="animation-delay:{stat_delay}">⚡ {summary}</span></div>')
-    if raw_text:
-        safe_raw = raw_text.replace("<", "&lt;").replace(">", "&gt;")
-        h += (f'<div class="dc-raw"><details><summary>📄 Show Raw Response</summary>'
-              f'<div class="dc-raw-pre">{safe_raw}</div></details></div>')
-    h += '</div>'
-    return h
 def generate_raw_prompt(task_type, category):
     if not category:
         category = "objects"
@@ -454,123 +327,21 @@ def generate_raw_prompt(task_type, category):
 # ============================================================
 # 模型初始化
 # ============================================================
-try:
-    MODEL_PATH = os.environ.get("MODEL_PATH", "woshichaoren123/test001")
-    GLOBAL_WORKER = EagleWorker(MODEL_PATH)
-except Exception as e:
-    print(f"Failed to load model: {e}. Will run in Mock Mode.")
-    GLOBAL_WORKER = None
-# ============================================================
-# 用户数据收集（HuggingFace Public Dataset）
-#
-# 策略：one-record-per-file，配合按天目录 + 容器级 SESSION_ID。
-# 这样可以解决两个问题：
-#   1. 容器被回收时，本地 ephemeral 目录被清空。原来所有 session 都
-#      写同一个 logs_<date>.jsonl，新容器起来后会用空文件把 dataset 里
-#      旧的同名文件覆盖掉，造成数据丢失。
-#   2. 每次 commit 都要重传整份 LFS（appended 文件 hash 变了），浪费带宽。
-#
-# 现在每条记录写成独立的 JSONL 文件：
-#   data/<date>/<SESSION_ID>__<entry_id>.jsonl
-# CommitScheduler 只会“新增”文件，永远不会覆盖其它 session 的数据；
-# 单文件上传后即被封存，不会重复上传。
-# ============================================================
-LOG_DATASET_REPO = os.environ.get("LOG_DATASET_REPO", "woshichaoren123/log")
-_LOG_DIR = Path(tempfile.mkdtemp(prefix="hf_log_"))
-_SESSION_ID = uuid.uuid4().hex[:8]
-_log_scheduler = None
-if LOG_DATASET_REPO and LOG_HF_TOKEN:
-    try:
-        _log_scheduler = CommitScheduler(
-            repo_id=LOG_DATASET_REPO,
-            repo_type="dataset",
-            folder_path=str(_LOG_DIR),
-            path_in_repo="data",
-            every=3,
-            token=LOG_HF_TOKEN,
-            squash_history=False,
-        )
-        print(f"[LOG] Dataset logging enabled → {LOG_DATASET_REPO} "
-              f"(session={_SESSION_ID}, dir={_LOG_DIR})")
-    except Exception as e:
-        _log_scheduler = None
-        print(f"[LOG] Dataset logging disabled: {e}")
-else:
-    print("[LOG] Dataset logging disabled (LOG_HF_TOKEN not set)")
-def _pil_to_b64(pil_img):
-    """将 PIL 图片无损转为 PNG base64 字符串。"""
-    buf = io.BytesIO()
-    pil_img.save(buf, "PNG")
-    return base64.b64encode(buf.getvalue()).decode("ascii")
-def _atomic_write_text(path: Path, text: str):
-    """原子写入：先写临时文件再 rename，避免 CommitScheduler 读到半截文件。"""
-    tmp_path = path.with_name(path.name + ".tmp")
-    with open(tmp_path, "w", encoding="utf-8") as f:
-        f.write(text)
-    os.replace(tmp_path, path)
-def _log_to_dataset(
-    input_type, category, model_mode, raw_prompt,
-    output_text="", input_image=None, output_image=None,
-    extra=None,
-):
-    """每条记录写到独立的 JSONL 文件，按日期分目录、文件名包含 session_id。
-    最终落盘路径（也是 dataset 里的路径）：
-        data/<YYYY-MM-DD>/<session_id>__<entry_id>.jsonl
-    """
-    if _log_scheduler is None:
-        return
-    try:
-        entry_id = f"{int(time.time())}_{uuid.uuid4().hex[:6]}"
-        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
-        date_str = time.strftime("%Y-%m-%d", time.gmtime())
-        input_b64 = None
-        if input_image is not None and isinstance(input_image, Image.Image):
-            input_b64 = _pil_to_b64(input_image)
-        output_b64 = None
-        if output_image is not None and isinstance(output_image, Image.Image):
-            output_b64 = _pil_to_b64(output_image)
-        record = {
-            "id": entry_id,
-            "session_id": _SESSION_ID,
-            "timestamp": ts,
-            "input_type": input_type,
-            "category": category,
-            "model_mode": model_mode,
-            "raw_prompt": raw_prompt,
-            "output_text": output_text,
-            "input_image_b64": input_b64,
-            "output_image_b64": output_b64,
-        }
-        if extra:
-            record.update(extra)
-        day_dir = _LOG_DIR / date_str
-        day_dir.mkdir(parents=True, exist_ok=True)
-        log_file = day_dir / f"{_SESSION_ID}__{entry_id}.jsonl"
-        payload = json.dumps(record, ensure_ascii=False) + "\n"
-        with _log_scheduler.lock:
-            _atomic_write_text(log_file, payload)
-    except Exception as e:
-        print(f"[LOG] Failed to log to dataset: {e}")
-# ============================================================
-# 公用预处理
-# ============================================================
 def _prepare_image_for_model(pil_img, short_size):
     process_img = pil_img.copy()
     if short_size is not None and short_size > 0:
@@ -582,104 +353,77 @@ def _prepare_image_for_model(pil_img, short_size):
 # ============================================================
-# GPU 时间预算常量（按模式区分）
 # ============================================================
-GPU_HARD_LIMIT_IMAGE = 30     # Image 模式 @spaces.GPU(duration=...)
-GPU_HARD_LIMIT_VIDEO = 240    # Video 模式 @spaces.GPU(duration=...)
-PHASE2_RESERVE = 55           # 留给 Phase 2（绘制 + ffmpeg）的秒数
-SAFETY_MARGIN = 25            # 额外安全裕量，永远不要触碰硬限制
-INFERENCE_BUDGET = GPU_HARD_LIMIT_VIDEO - PHASE2_RESERVE - SAFETY_MARGIN
-EST_SECONDS_PER_FRAME = 20    # 保守估计：每帧推理耗时
-# ============================================================
-# ✅ 图像推理（独立函数）
-# ============================================================
-def _run_image_inference(
-    image_in, categories_list, category_str,
-    model_mode, temp, top_p, top_k, short_size, question_override,
-    progress=None,  # 接收 progress
 ):
-    if image_in is None:
-        return (
-            gr.update(value=None, visible=True),
-            gr.update(value=None, visible=False),
-            "<p style='color:#ef4444;padding:12px;'>⚠️ Please upload an image first.</p>",
-        )
-    if progress is not None:  # 进度提示
-        progress(0.1, desc="Preprocessing image ...")
     process_img = _prepare_image_for_model(image_in, short_size)
-    if progress is not None:
-        progress(0.2, desc="Running model inference ...")
-    if GLOBAL_WORKER:
-        output_text, token_sequence, out_info = GLOBAL_WORKER.generate(
             process_img, categories_list, model_mode,
             temp=temp, top_p=top_p, top_k=top_k,
             question_override=question_override,
         )
     else:
-        output_text, token_sequence, out_info = "", [], ""
-    if progress is not None:
-        progress(0.8, desc="Drawing results ...")
     detections = parse_mixed_results(output_text, category_str)
     frame_bgr = cv2.cvtColor(np.array(image_in), cv2.COLOR_RGB2BGR)
     out_img_bgr = draw_on_frame(frame_bgr, detections, draw_label=True)
     output_image = Image.fromarray(cv2.cvtColor(out_img_bgr, cv2.COLOR_BGR2RGB))
-    html = generate_dynamic_html(token_sequence, out_info, output_text)
-    _log_to_dataset(
-        input_type="image",
-        category=", ".join(categories_list),
-        model_mode=model_mode,
-        raw_prompt=question_override or category_str,
-        output_text=output_text,
-        input_image=image_in,
-        output_image=output_image,
-    )
-    if progress is not None:
-        progress(1.0, desc="Done!")
-    return (
-        gr.update(value=output_image, visible=True),
-        gr.update(value=None, visible=False),
-        html,
-    )
-# ============================================================
-# ✅ 视频推理（独立函数 — 带完整超时保护）
-# ============================================================
-def _run_video_inference(
-    video_in, categories_list, category_str,
-    model_mode, temp, top_p, top_k, short_size, question_override,
-    max_video_frames,  # 可调帧数
-    progress=None,     # 接收 progress
 ):
     import subprocess as _sp
-    if video_in is None:
-        return (
-            gr.update(value=None, visible=False),
-            gr.update(value=None, visible=True),
-            "<p style='color:#ef4444;padding:12px;'>⚠️ Please upload a video first.</p>",
-        )
     total_start = time.time()
     max_frames = int(max_video_frames) if max_video_frames else 4
-    if progress is not None:
-        progress(0.0, desc="Reading video ...")
-    # ---------- 读取视频 ----------
-    t0 = time.time()
-    cap = cv2.VideoCapture(video_in)
     fps = cap.get(cv2.CAP_PROP_FPS)
     vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -692,212 +436,103 @@ def _run_video_inference(
         all_frames.append(frame)
     cap.release()
     total = len(all_frames)
-    read_elapsed = time.time() - t0
-    print(f"[TIMING] Video read: {read_elapsed:.2f}s, total frames={total}, "
-          f"resolution={vid_w}x{vid_h}, fps={fps:.1f}")
     if total == 0:
-        return (
-            gr.update(value=None, visible=False),
-            gr.update(value=None, visible=True),
-            "<p style='color:#ef4444;padding:12px;'>⚠️ Failed to read any frames from the video.</p>",
-        )
-    # ---------- 采样帧 ----------
     if total <= max_frames:
         sample_indices = list(range(total))
     else:
-        sample_indices = [int(round(i * (total - 1) / (max_frames - 1)))
-                          for i in range(max_frames)]
     sampled_frames = [all_frames[i] for i in sample_indices]
     n_sampled = len(sampled_frames)
-    # ============================================================
-    # 🛡️ 预估检查：在开跑前判断能不能在 GPU 时间预算内跑完
-    # ============================================================
     time_already_used = time.time() - total_start
     available_for_inference = GPU_HARD_LIMIT_VIDEO - time_already_used - PHASE2_RESERVE - SAFETY_MARGIN
     estimated_inference_time = n_sampled * EST_SECONDS_PER_FRAME
     if estimated_inference_time > available_for_inference:
-        # 尝试自动缩减帧数
-        max_feasible = max(0, int(available_for_inference // EST_SECONDS_PER_FRAME))
-        print(f"[PRE-CHECK] Estimated {estimated_inference_time:.0f}s > budget {available_for_inference:.0f}s, "
-              f"reducing from {n_sampled} to {max_feasible} frames")
-        if max_feasible < 1:
-            # 连 1 帧都跑不了，直接拒绝
-            del all_frames
-            gc.collect()
-            return (
-                gr.update(value=None, visible=False),
-                gr.update(value=None, visible=True),
-                "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
-                "padding:16px;margin:8px 0;'>"
-                "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
-                "⚠️ Video too large to process</p>"
-                f"<p style='color:#7f1d1d;margin:0;font-size:0.92em;'>"
-                f"This video has <b>{total}</b> frames. "
-                f"Even processing <b>1</b> sampled frame (~{EST_SECONDS_PER_FRAME}s) "
-                f"would exceed the <b>{GPU_HARD_LIMIT_VIDEO}s</b> GPU time limit.<br><br>"
-                "💡 <b>Suggestions:</b> use a shorter / lower-resolution video, "
-                "or switch to <b>Image</b> mode with a single frame screenshot.</p></div>",
-            )
-        # 用缩减后的帧数重新采样
         if total <= max_feasible:
             sample_indices = list(range(total))
         else:
-            sample_indices = [int(round(i * (total - 1) / (max_feasible - 1)))
-                              for i in range(max_feasible)]
         sampled_frames = [all_frames[i] for i in sample_indices]
         n_sampled = len(sampled_frames)
-    # 释放原始帧列表，节省内存
     out_fps = max(1.0, n_sampled / (total / fps)) if fps > 0 else 5.0
     del all_frames
     gc.collect()
-    print(f"[TIMING] Sampled {n_sampled} frames, output fps: {out_fps:.2f}")
-    # ============================================================
-    # 阶段一：推理（逐帧检查剩余时间）
-    # ============================================================
-    print("=" * 60)
-    print("[PHASE 1] Starting model inference ...")
-    print("=" * 60)
     inference_results = []
-    phase1_start = time.time()
     processed_count = 0
     early_stopped = False
     early_stop_reason = ""
     for i, frame in enumerate(sampled_frames):
-        # ---- 🛡️ 运行时时间检查：还够不够跑下一帧 + Phase 2？----
         elapsed_since_start = time.time() - total_start
         remaining_total = GPU_HARD_LIMIT_VIDEO - elapsed_since_start
         if remaining_total < PHASE2_RESERVE + SAFETY_MARGIN:
             early_stopped = True
-            early_stop_reason = (
-                f"GPU time budget is running out: "
-                f"{elapsed_since_start:.0f}s used, only {remaining_total:.0f}s left "
-                f"(need ≥{PHASE2_RESERVE}s for video encoding). "
-                f"Successfully processed {processed_count}/{n_sampled} frames."
-            )
-            print(f"[⏰ EARLY STOP] {early_stop_reason}")
             break
-        if progress is not None:
-            progress(
-                (i / n_sampled) * 0.85,
-                desc=f"🧠 Inference: frame {i + 1}/{n_sampled} "
-                     f"(⏱️ {elapsed_since_start:.0f}s / {GPU_HARD_LIMIT_VIDEO}s) ...",
-            )
-        frame_t0 = time.time()
-        # 预处理
-        prep_t0 = time.time()
         pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         process_img = _prepare_image_for_model(pil_img, short_size)
-        prep_time = time.time() - prep_t0
-        # 推理
-        infer_t0 = time.time()
-        if GLOBAL_WORKER:
-            output_text, _, _ = GLOBAL_WORKER.generate(
                 process_img, categories_list, model_mode,
                 temp=temp, top_p=top_p, top_k=top_k,
                 question_override=question_override,
             )
         else:
-            output_text = ""
-        infer_time = time.time() - infer_t0
         inference_results.append(output_text)
         processed_count += 1
-        # 清理 GPU 缓存
-        cleanup_t0 = time.time()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         gc.collect()
-        cleanup_time = time.time() - cleanup_t0
-        total_frame_time = time.time() - frame_t0
-        print(f"[PHASE 1] Frame {i + 1}/{n_sampled} done: "
-              f"prep={prep_time:.2f}s, infer={infer_time:.2f}s, "
-              f"cleanup={cleanup_time:.2f}s, total={total_frame_time:.2f}s")
-        if torch.cuda.is_available():
-            allocated = torch.cuda.memory_allocated() / 1024**3
-            reserved = torch.cuda.memory_reserved() / 1024**3
-            print(f"         GPU mem: allocated={allocated:.2f}GB, reserved={reserved:.2f}GB")
-    phase1_time = time.time() - phase1_start
-    print(f"[PHASE 1] COMPLETE: {phase1_time:.2f}s for {processed_count} frames "
-          f"({phase1_time / max(processed_count, 1):.2f}s/frame)")
-    # 如果 1 帧都没处理完，返回错误
     if processed_count == 0:
-        return (
-            gr.update(value=None, visible=False),
-            gr.update(value=None, visible=True),
-            "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
-            "padding:16px;margin:8px 0;'>"
-            "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
-            "⚠️ Could not process any frames</p>"
-            "<p style='color:#7f1d1d;margin:0;font-size:0.92em;'>"
-            "The GPU time limit was reached before even one frame could be processed. "
-            "Please try a lower resolution video or use Image mode instead.</p></div>",
-        )
-    # 裁剪到实际处理过的帧
     sampled_frames_for_draw = sampled_frames[:processed_count]
     inference_results_for_draw = inference_results[:processed_count]
-    # ============================================================
-    # 阶段二：绘制 + 编码（只处理已推理完的帧）
-    # ============================================================
-    if progress is not None:
-        progress(0.88, desc="🎨 Drawing & encoding video ...")
-    print("=" * 60)
-    print(f"[PHASE 2] Drawing & video encoding ({processed_count} frames) ...")
-    print("=" * 60)
-    phase2_start = time.time()
     tmp_raw = tempfile.mktemp(suffix=".raw.mp4")
     out_video_path = tempfile.mktemp(suffix=".mp4")
-    out = cv2.VideoWriter(tmp_raw, cv2.VideoWriter_fourcc(*"mp4v"),
-                          out_fps, (vid_w, vid_h))
-    for i, (frame, output_text) in enumerate(
-            zip(sampled_frames_for_draw, inference_results_for_draw)):
-        draw_t0 = time.time()
         detections = parse_mixed_results(output_text, category_str)
         valid_results = _postprocess_detections(detections, vid_w, vid_h)
         frame_to_draw = draw_on_frame(frame, valid_results, draw_label=True)
         out.write(frame_to_draw)
-        draw_time = time.time() - draw_t0
-        print(f"[PHASE 2] Frame {i + 1}/{processed_count}: "
-              f"draw={draw_time:.3f}s, det={len(valid_results)}")
     out.release()
-    phase2_draw_time = time.time() - phase2_start
-    # ---- ffmpeg 重编码（如果还有时间的话） ----
     elapsed_now = time.time() - total_start
     remaining_now = GPU_HARD_LIMIT_VIDEO - elapsed_now
-    if progress is not None:
-        progress(0.95, desc="📦 Re-encoding with ffmpeg ...")
-    ffmpeg_t0 = time.time()
     if remaining_now > 15:
-        # 还有时间，用 ffmpeg 重编码（兼容性更好）
         try:
             ffmpeg_timeout = max(10, int(remaining_now - 5))
             _sp.run(
@@ -907,416 +542,135 @@ def _run_video_inference(
                 check=True, capture_output=True, timeout=ffmpeg_timeout,
             )
             os.remove(tmp_raw)
-        except Exception as ffmpeg_err:
-            print(f"[PHASE 2] ffmpeg failed or timed out: {ffmpeg_err}, using raw file")
             if os.path.exists(tmp_raw):
                 os.replace(tmp_raw, out_video_path)
     else:
-        # 时间不够了，直接用 mp4v 原始文件
         os.replace(tmp_raw, out_video_path)
-        print("[PHASE 2] Skipped ffmpeg re-encoding due to time constraint")
-    ffmpeg_time = time.time() - ffmpeg_t0
     total_time = time.time() - total_start
-    print("=" * 60)
-    print(f"[TOTAL] {total_time:.2f}s  |  inference={phase1_time:.2f}s  "
-          f"draw={phase2_draw_time:.2f}s  ffmpeg={ffmpeg_time:.2f}s  "
-          f"frames_done={processed_count}/{n_sampled}")
-    print("=" * 60)
-    # ---- 构建结果 HTML ----
-    warning_html = ""
-    if early_stopped:
-        warning_html = (
-            "<div style='background:#fefce8;border:1px solid #fde047;border-radius:8px;"
-            "padding:14px;margin-bottom:12px;'>"
-            "<p style='color:#a16207;font-weight:700;font-size:1.02em;margin:0 0 6px;'>"
-            "⚡ Partial Result — Early Stop Due to GPU Time Limit</p>"
-            f"<p style='color:#854d0e;margin:0;font-size:0.9em;'>{early_stop_reason}</p>"
-            "<p style='color:#854d0e;margin:6px 0 0;font-size:0.88em;'>"
-            "💡 <b>Tip:</b> Reduce <b>Max Video Frames</b> slider or use a shorter video "
-            "to process all frames within the GPU budget.</p>"
-            "</div>"
-        )
-    timing_summary = (
-        f"Video: {total} total frames, sampled {n_sampled}, "
-        f"processed {processed_count} | "
-        f"Inference: {phase1_time:.1f}s ({phase1_time / max(processed_count, 1):.1f}s/frame) | "
-        f"Drawing: {phase2_draw_time:.1f}s | ffmpeg: {ffmpeg_time:.1f}s | "
-        f"Total: {total_time:.1f}s / {GPU_HARD_LIMIT_VIDEO}s budget"
-    )
-    html = warning_html + generate_dynamic_html(
-        token_sequence=[], out_info="", raw_text=timing_summary)
-    try:
-        thumb = Image.fromarray(
-            cv2.cvtColor(sampled_frames_for_draw[0], cv2.COLOR_BGR2RGB))
-    except Exception:
-        thumb = None
-    _log_to_dataset(
-        input_type="video",
-        category=", ".join(categories_list),
-        model_mode=model_mode,
-        raw_prompt=question_override or category_str,
-        output_text="\n---\n".join(inference_results_for_draw),
-        input_image=thumb,
-        extra={
-            "video_total_frames": total,
-            "video_sampled_frames": n_sampled,
-            "video_processed_frames": processed_count,
-        },
-    )
-    if progress is not None:
-        progress(1.0, desc="Done!")
-    return (
-        gr.update(value=None, visible=False),
-        gr.update(value=out_video_path, visible=True),
-        html,
-    )
 # ============================================================
-# 🛡️ 主入口：按模式分配不同 GPU 时长
 # ============================================================
-def _build_error_html(e, gpu_limit, input_type):
-    """统一的异常→友好 HTML 构建。"""
-    import traceback
-    traceback.print_exc()
-    error_type = type(e).__name__
-    error_msg = str(e)
-    is_timeout = ("timeout" in error_msg.lower()
-                  or "timelimit" in error_msg.lower()
-                  or "time limit" in error_msg.lower()
-                  or "duration" in error_msg.lower())
-    if is_timeout:
-        detail = (
-            f"The GPU time limit ({gpu_limit}s) was exceeded before the result "
-            "could be fully assembled. This typically happens with large videos."
-        )
-        suggestion = (
-            "Please reduce <b>Max Video Frames</b>, use a shorter / smaller video, "
-            "or switch to <b>Image</b> mode."
-        )
-    else:
-        detail = f"{error_type}: {error_msg}"
-        suggestion = (
-            "If the problem persists, try reducing video size or "
-            "switching to Image mode."
-        )
-    error_html = (
-        "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
-        "padding:16px;margin:8px 0;'>"
-        "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
-        "⚠️ Processing interrupted</p>"
-        f"<p style='color:#7f1d1d;margin:0 0 8px;font-size:0.92em;'>{detail}</p>"
-        f"<p style='color:#7f1d1d;margin:0;font-size:0.88em;'>💡 {suggestion}</p>"
-        "</div>"
-    )
-    return (
-        gr.update(value=None, visible=(input_type == "Image")),
-        gr.update(value=None, visible=(input_type == "Video")),
-        error_html,
-    )
-@spaces.GPU(duration=GPU_HARD_LIMIT_IMAGE)
-def _run_image_gpu(
-    image_in, category, model_mode, temp, top_p, top_k,
-    short_size, question_override, progress,
-):
-    try:
-        categories_list = [c.strip() for c in category.split(",") if c.strip()]
-        category_str = "</c>".join(categories_list)
-        return _run_image_inference(
-            image_in, categories_list, category_str,
-            model_mode, temp, top_p, top_k, short_size, question_override,
-            progress=progress,
-        )
-    except Exception as e:
-        return _build_error_html(e, GPU_HARD_LIMIT_IMAGE, "Image")
-@spaces.GPU(duration=GPU_HARD_LIMIT_VIDEO)
-def _run_video_gpu(
-    video_in, category, model_mode, temp, top_p, top_k,
-    short_size, question_override, max_video_frames, progress,
-):
     try:
-        categories_list = [c.strip() for c in category.split(",") if c.strip()]
-        category_str = "</c>".join(categories_list)
-        return _run_video_inference(
-            video_in, categories_list, category_str,
-            model_mode, temp, top_p, top_k, short_size, question_override,
-            max_video_frames=max_video_frames,
-            progress=progress,
-        )
-    except Exception as e:
-        return _build_error_html(e, GPU_HARD_LIMIT_VIDEO, "Video")
-def run_inference(
-    input_type, image_in, video_in, task_type, category,
-    model_mode, temp, top_p, top_k, short_size, question_override,
-    max_video_frames,
-    progress=gr.Progress(track_tqdm=False),
-):
-    if input_type == "Image":
-        return _run_image_gpu(
-            image_in, category, model_mode, temp, top_p, top_k,
-            short_size, question_override, progress,
-        )
-    else:
-        return _run_video_gpu(
-            video_in, category, model_mode, temp, top_p, top_k,
-            short_size, question_override, max_video_frames, progress,
-        )
-# ============================================================
-# 按钮状态
-# ============================================================
-def _disable_run_btn():
-    return gr.update(interactive=False, value="⏳ Running ...")
-def _enable_run_btn():
-    return gr.update(interactive=True, value="🧠 Run Inference")
-# ============================================================
-# Examples
-# ============================================================
-EXAMPLE_CONFIGS = [
-    {"name": "Book", "input_type": "Image", "image": "./assets/book.jpg", "video": None,
-     "task": "Detection", "category": "book", "mode": "hybrid"},
-    {"name": "Sweet", "input_type": "Image", "image": "./assets/sweet.jpg", "video": None,
-     "task": "Detection", "category": "sweet", "mode": "hybrid"},
-    {"name": "Person", "input_type": "Image", "image": "./assets/person.jpg", "video": None,
-     "task": "Detection", "category": "person", "mode": "hybrid"},
-    {"name": "OCR", "input_type": "Image", "image": "./assets/ocr.jpg", "video": None,
-     "task": "OCR", "category": "text", "mode": "fast"},
-]
-def prepare_gallery_data():
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    gallery_images, gallery_captions = [], []
-    for config in EXAMPLE_CONFIGS:
-        img_path = (os.path.normpath(os.path.join(base_dir, config["image"]))
-                    if config["image"] else None)
-        if img_path and os.path.exists(img_path):
-            gallery_images.append(img_path)
         else:
-            gallery_images.append(Image.new("RGB", (200, 200), color="black"))
-        gallery_captions.append(config["name"])
-    return gallery_images, gallery_captions
-def update_example_selection(evt: gr.SelectData):
-    config = EXAMPLE_CONFIGS[evt.index]
-    base_dir = os.path.dirname(os.path.abspath(__file__))
-    img_path = (os.path.normpath(os.path.join(base_dir, config["image"]))
-                if config["image"] else None)
-    vid_path = (os.path.normpath(os.path.join(base_dir, config["video"]))
-                if config["video"] else None)
-    return (
-        config["input_type"],
-        gr.update(value=img_path, visible=(config["input_type"] == "Image")),
-        gr.update(value=vid_path, visible=(config["input_type"] == "Video")),
-        config["task"], config["category"], config["mode"],
-    )
-# ============================================================
-# UI
-# ============================================================
-def create_demo():
-    nv_green = gr.themes.Color(
-        c50="#f7fbe8", c100="#eef7d1", c200="#ddf0a3",
-        c300="#cce875", c400="#a4d422", c500="#76b900",
-        c600="#649d00", c700="#527f00", c800="#3f6200",
-        c900="#2d4400", c950="#1a2700",
-    )
-    with gr.Blocks(
-        theme=gr.themes.Soft(primary_hue=nv_green, secondary_hue=nv_green),
-        title="LocateAnything",
-    ) as demo:
-        with gr.Row():
-            with gr.Column(scale=2):
-                gr.Markdown("# 🚀 LocateAnything")
-                gr.Markdown(
-                    "> **Locate any object in images or videos with natural language.**  \n"
-                    "> Upload an image/video on the left, choose a task type, enter what you want to find, "
-                    "then click **Run Inference**. Results with bounding boxes will appear on the right.\n"
-                    ">\n"
-                    "> **Quick Start:** "
-                    "① Select *Image* or *Video* → "
-                    "② Pick a *Task Type* (Detection / Grounding / OCR / GUI / Pointing) → "
-                    "③ Type your *Categories* (comma-separated) → "
-                    "④ Click **🧠 Run Inference**"
-                )
-            with gr.Column(scale=1):
-                gr.Markdown(
-                    "> ⚠️ **Note:** `magi-attention` cannot be installed in this Hugging Face Space, "
-                    "so inputs larger than 1K are resized to 1K in this demo.\n"
-                    ">\n"
-                    "> For full-resolution inference, please download the weights and run the model locally."
-                )
-        with gr.Row():
-            # ===== COL 1: Settings =====
-            with gr.Column(scale=1):
-                gr.Markdown("### ⚙️ Settings")
-                input_type = gr.Radio(
-                    ["Image", "Video"], label="1. Input Media Type", value="Image",
-                    info="Select whether to process a single image or a video clip.",
-                )
-                task_dropdown = gr.Dropdown(
-                    choices=["Detection", "Grounding", "OCR", "GUI", "Pointing"],
-                    value="Detection", label="2. Task Type",
-                    info="Detection: find all instances | Grounding: match description | "
-                         "OCR: extract text | GUI: locate UI element | Pointing: point to target",
-                )
-                category_input = gr.Textbox(
-                    label="3. Categories",
-                    value="car, bus, person, potted plant",
-                    placeholder="e.g.  car, person, dog  (comma-separated, supports Chinese)",
-                    info="Enter one or more categories separated by commas. "
-                         "Supports both English and Chinese (e.g. 汽车, 行人).",
-                )
-                model_dropdown = gr.Dropdown(
-                    choices=["fast", "slow", "hybrid"],
-                    value="hybrid", label="4. Inference Mode",
-                    info="fast: MTP parallel decoding | slow: standard AR decoding | "
-                         "hybrid: auto-switch for best quality-speed balance",
-                )
-                with gr.Accordion("5. Advanced Settings", open=False):
-                    gr.Markdown(
-                        "*Adjust these only if needed. Default values work well for most cases.*"
-                    )
-                    temp_slider = gr.Slider(
-                        minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature",
-                        info="Higher = more diverse results; lower = more deterministic.",
-                    )
-                    top_p_slider = gr.Slider(
-                        minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P",
-                        info="Nucleus sampling threshold.",
-                    )
-                    top_k_slider = gr.Slider(
-                        minimum=1, maximum=100, value=20, step=1, label="Top K",
-                        info="Top-K sampling: number of highest probability tokens to consider.",
-                    )
-                    short_size_input = gr.Number(
-                        label="Short Side Size (px)", value=None, precision=0,
-                        info="Resize the short side of the image to this value before inference. "
-                             "Leave empty to keep original size (auto-capped at 1024).",
-                    )
-                    max_video_frames_slider = gr.Slider(
-                        minimum=1, maximum=10, value=4, step=1,
-                        label="Max Video Frames",
-                        info="Number of frames to sample from the video for inference. "
-                             "Each frame takes ~15-20s. Keep ≤ 6 to avoid GPU timeout.",
-                    )
-                run_btn = gr.Button("🧠 Run Inference", variant="primary", size="lg")
-            # ===== COL 2: Main =====
-            with gr.Column(scale=3):
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("### 📥 Input Media")
-                        image_input = gr.Image(
-                            label="Input Image", type="pil", visible=True,
-                        )
-                        video_input = gr.Video(
-                            label="Input Video",
-                            visible=False,
-                        )
-                    with gr.Column(scale=1):
-                        gr.Markdown("### 📤 Output Result")
-                        output_image = gr.Image(
-                            label="Detection Result", type="pil", visible=True,
-                        )
-                        output_video = gr.Video(
-                            label="Video Result", visible=False,
-                        )
-                gr.Markdown("### 📝 Raw Input Prompt")
-                raw_prompt_box = gr.Textbox(
-                    value=generate_raw_prompt("Detection", "car, bus, person, potted plant"),
-                    interactive=False, lines=2,
-                    info="This is the prompt sent to the model (auto-generated from your settings above).",
-                )
-                gr.Markdown("### 🔍 Decoding Visualization")
-                raw_output_box = gr.HTML(label="Decoding Steps")
-        # ===== EXAMPLES =====
-        gr.Markdown("---")
-        gr.Markdown(
-            "## 🖼️ Examples\n"
-            "Click any example below to auto-fill the settings and input image."
-        )
-        gallery_images, gallery_captions = prepare_gallery_data()
-        example_gallery = gr.Gallery(
-            value=list(zip(gallery_images, gallery_captions)),
-            show_label=True, columns=4, rows=1, height="auto", allow_preview=False,
-        )
-        # ===== EVENTS =====
-        input_type.change(
-            fn=lambda c: (gr.update(visible=(c == "Image")), gr.update(visible=(c == "Video"))),
-            inputs=input_type, outputs=[image_input, video_input],
-        )
-        for comp in [task_dropdown, category_input]:
-            comp.change(
-                fn=generate_raw_prompt,
-                inputs=[task_dropdown, category_input],
-                outputs=raw_prompt_box,
             )
-        run_btn.click(
-            fn=_disable_run_btn,
-            inputs=None,
-            outputs=[run_btn],
-        ).then(
-            fn=run_inference,
-            inputs=[
-                input_type, image_input, video_input,
-                task_dropdown, category_input, model_dropdown,
-                temp_slider, top_p_slider, top_k_slider,
-                short_size_input, raw_prompt_box,
-                max_video_frames_slider,
-            ],
-            outputs=[output_image, output_video, raw_output_box],
-        ).then(
-            fn=_enable_run_btn,
-            inputs=None,
-            outputs=[run_btn],
-        )
-        example_gallery.select(
-            fn=update_example_selection,
-            outputs=[input_type, image_input, video_input,
-                     task_dropdown, category_input, model_dropdown],
-        ).then(
-            fn=generate_raw_prompt,
-            inputs=[task_dropdown, category_input],
-            outputs=raw_prompt_box,
-        )
-    return demo
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch(debug=True)

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import spaces  # MUST BE THE ABSOLUTE FIRST IMPORT FOR ZEROGPU EMULATION
 import gradio as gr
+from gradio import Server
+from gradio.data_classes import FileData
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
 import cv2
 import numpy as np
 import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import tempfile
 import re
 import time
 import json
 import uuid
 from pathlib import Path
+from typing import Any
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoProcessor, AutoModel, AutoTokenizer
 _FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "LXGWWenKai-Bold.ttf")
+# Retrieve optional HF Token from typical env variables
+HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("MODEL_HF_TOKEN")
 def _load_font(size=20):
     """加载中文字体（LXGW WenKai），需提前放置到 assets/ 目录"""
         self.device = device
         self.dtype = torch.bfloat16
         self.generation_mode = generation_mode
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
             trust_remote_code=True,
+            token=HF_TOKEN if HF_TOKEN else None,
         )
         self.processor = AutoProcessor.from_pretrained(
             model_path,
             trust_remote_code=True,
+            token=HF_TOKEN if HF_TOKEN else None,
         )
         self.model = AutoModel.from_pretrained(
             model_path,
             torch_dtype=self.dtype,
             _attn_implementation="sdpa",
             trust_remote_code=True,
+            token=HF_TOKEN if HF_TOKEN else None,
         ).to(device).eval()
         print("Model Loaded Successfully!")
 # ============================================================
+# Post-processing
 # ============================================================
 def _postprocess_detections(detections, w, h):
     valid = []
     return stats
 def generate_raw_prompt(task_type, category):
     if not category:
         category = "objects"
 # ============================================================
 # Model initialization
 # ============================================================
+GLOBAL_WORKER = None
+def get_worker():
+    global GLOBAL_WORKER
+    if GLOBAL_WORKER is None:
+        try:
+            MODEL_PATH = os.environ.get("MODEL_PATH", "nvidia/LocateAnything-3B")
+            print(f"Loading model inside @spaces.GPU context: {MODEL_PATH}")
+            GLOBAL_WORKER = EagleWorker(MODEL_PATH)
+        except Exception as e:
+            print(f"Failed to load model: {e}. Will run in Mock Mode.")
+            GLOBAL_WORKER = None
+    return GLOBAL_WORKER
 def _prepare_image_for_model(pil_img, short_size):
     process_img = pil_img.copy()
     if short_size is not None and short_size > 0:
 # ============================================================
+# GPU time budget and inference guards (per mode)
 # ============================================================
+GPU_HARD_LIMIT_IMAGE = 30  # seconds; NOTE(review): not referenced by the new code shown here, and run_image_gpu_api requests duration=120 - confirm which value is intended
+GPU_HARD_LIMIT_VIDEO = 240  # seconds; total budget the video path measures elapsed time against (same value as its @spaces.GPU duration)
+PHASE2_RESERVE = 55  # seconds subtracted from the video inference budget; presumably kept for drawing/encoding after inference - confirm
+SAFETY_MARGIN = 25  # extra seconds subtracted from the video inference budget alongside PHASE2_RESERVE
+EST_SECONDS_PER_FRAME = 20  # assumed inference cost per sampled frame, used to cap how many frames are processed
+@spaces.GPU(duration=120, size="xlarge")
+def run_image_gpu_api(
+    image_path: str, category: str, model_mode: str, temp: float, top_p: float, top_k: int,
+    short_size: int | None, question_override: str | None
 ):
+    image_in = Image.open(image_path).convert("RGB")
+    categories_list = [c.strip() for c in category.split(",") if c.strip()]
+    category_str = "</c>".join(categories_list)
     process_img = _prepare_image_for_model(image_in, short_size)
+    worker = get_worker()
+    if worker:
+        output_text, token_sequence, out_info = worker.generate(
             process_img, categories_list, model_mode,
             temp=temp, top_p=top_p, top_k=top_k,
             question_override=question_override,
         )
     else:
+        # Mock mode fallback
+        output_text = "Mock detection: <ref>sweet</ref><box><240><480><620><940></box> and <ref>book</ref><box><50><120><400><380></box>"
+        token_sequence = []
+        out_info = "forward_step=1;num_tokens=18;num_boxes=2;tps=45;bps=15"
     detections = parse_mixed_results(output_text, category_str)
     frame_bgr = cv2.cvtColor(np.array(image_in), cv2.COLOR_RGB2BGR)
     out_img_bgr = draw_on_frame(frame_bgr, detections, draw_label=True)
     output_image = Image.fromarray(cv2.cvtColor(out_img_bgr, cv2.COLOR_BGR2RGB))
+    # Save to temp file
+    temp_dir = tempfile.mkdtemp()
+    out_img_path = os.path.join(temp_dir, "output.png")
+    output_image.save(out_img_path)
+    stats = _parse_out_info_dict(out_info)
+    # Simplified summary lists
+    detections_summary = []
+    for det in detections:
+        detections_summary.append({
+            "label": det.get("label", "object"),
+            "type": det.get("type", "box"),
+            "coords": [round(c, 2) for c in det.get("coords", [])]
+        })
+    return out_img_path, stats, output_text, detections_summary
+@spaces.GPU(duration=240, size="xlarge")
+def run_video_gpu_api(
+    video_path: str, category: str, model_mode: str, temp: float, top_p: float, top_k: int,
+    short_size: int | None, question_override: str | None, max_video_frames: int
 ):
     import subprocess as _sp
     total_start = time.time()
     max_frames = int(max_video_frames) if max_video_frames else 4
+    categories_list = [c.strip() for c in category.split(",") if c.strip()]
+    category_str = "</c>".join(categories_list)
+    cap = cv2.VideoCapture(video_path)
     fps = cap.get(cv2.CAP_PROP_FPS)
     vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         all_frames.append(frame)
     cap.release()
     total = len(all_frames)
     if total == 0:
+        raise ValueError("Failed to read any frames from the video.")
+    # Sample frames
     if total <= max_frames:
         sample_indices = list(range(total))
     else:
+        sample_indices = [int(round(i * (total - 1) / (max_frames - 1))) for i in range(max_frames)]
     sampled_frames = [all_frames[i] for i in sample_indices]
     n_sampled = len(sampled_frames)
+    # Budget check
     time_already_used = time.time() - total_start
     available_for_inference = GPU_HARD_LIMIT_VIDEO - time_already_used - PHASE2_RESERVE - SAFETY_MARGIN
     estimated_inference_time = n_sampled * EST_SECONDS_PER_FRAME
     if estimated_inference_time > available_for_inference:
+        max_feasible = max(1, int(available_for_inference // EST_SECONDS_PER_FRAME))
         if total <= max_feasible:
             sample_indices = list(range(total))
         else:
+            sample_indices = [int(round(i * (total - 1) / (max_feasible - 1))) for i in range(max_feasible)]
         sampled_frames = [all_frames[i] for i in sample_indices]
         n_sampled = len(sampled_frames)
     out_fps = max(1.0, n_sampled / (total / fps)) if fps > 0 else 5.0
     del all_frames
     gc.collect()
     inference_results = []
     processed_count = 0
     early_stopped = False
     early_stop_reason = ""
     for i, frame in enumerate(sampled_frames):
         elapsed_since_start = time.time() - total_start
         remaining_total = GPU_HARD_LIMIT_VIDEO - elapsed_since_start
         if remaining_total < PHASE2_RESERVE + SAFETY_MARGIN:
             early_stopped = True
+            early_stop_reason = f"GPU time budget running out. Only {remaining_total:.0f}s left."
             break
         pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         process_img = _prepare_image_for_model(pil_img, short_size)
+        worker = get_worker()
+        if worker:
+            output_text, _, _ = worker.generate(
                 process_img, categories_list, model_mode,
                 temp=temp, top_p=top_p, top_k=top_k,
                 question_override=question_override,
             )
         else:
+            output_text = f"Mock video detection: <ref>person</ref><box><100><150><800><900></box>"
         inference_results.append(output_text)
         processed_count += 1
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         gc.collect()
     if processed_count == 0:
+        raise RuntimeError("GPU budget exceeded before processing any frames.")
     sampled_frames_for_draw = sampled_frames[:processed_count]
     inference_results_for_draw = inference_results[:processed_count]
     tmp_raw = tempfile.mktemp(suffix=".raw.mp4")
     out_video_path = tempfile.mktemp(suffix=".mp4")
+    out = cv2.VideoWriter(tmp_raw, cv2.VideoWriter_fourcc(*"mp4v"), out_fps, (vid_w, vid_h))
+    detections_summary = []
+    for i, (frame, output_text) in enumerate(zip(sampled_frames_for_draw, inference_results_for_draw)):
         detections = parse_mixed_results(output_text, category_str)
         valid_results = _postprocess_detections(detections, vid_w, vid_h)
         frame_to_draw = draw_on_frame(frame, valid_results, draw_label=True)
         out.write(frame_to_draw)
+        for det in valid_results:
+            detections_summary.append({
+                "frame": i + 1,
+                "label": det.get("label", "object"),
+                "type": det.get("type", "box"),
+                "coords": det.get("coords", [])
+            })
     out.release()
+    # ffmpeg re-encode
     elapsed_now = time.time() - total_start
     remaining_now = GPU_HARD_LIMIT_VIDEO - elapsed_now
     if remaining_now > 15:
         try:
             ffmpeg_timeout = max(10, int(remaining_now - 5))
             _sp.run(
                 check=True, capture_output=True, timeout=ffmpeg_timeout,
             )
             os.remove(tmp_raw)
+        except Exception:
             if os.path.exists(tmp_raw):
                 os.replace(tmp_raw, out_video_path)
     else:
         os.replace(tmp_raw, out_video_path)
     total_time = time.time() - total_start
+    stats = {
+        "total_frames": total,
+        "sampled_frames": n_sampled,
+        "processed_frames": processed_count,
+        "total_time_seconds": round(total_time, 2),
+        "early_stopped": early_stopped,
+        "early_stop_reason": early_stop_reason
+    }
+    return out_video_path, stats, "\n---\n".join(inference_results_for_draw), detections_summary
 # ============================================================
+# GRADIO SERVER APP
 # ============================================================
+app = Server()  # gradio Server instance; the route and API endpoint below are registered on it
+# Serve the static assets folder that sits next to this file
+assets_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
+if os.path.exists(assets_dir):  # mount only when the folder is actually present
+    app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
+@app.get("/")
+async def homepage():
+    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
+    if os.path.exists(html_path):
+        with open(html_path, "r", encoding="utf-8") as f:
+            return HTMLResponse(f.read())
+    return HTMLResponse("<h1 style='color: #ef4444; font-family: Inter, sans-serif; text-align: center; margin-top: 100px;'>index.html is missing</h1>")
+@app.api(name="run_inference")
+def run_inference_api(
+    input_type: str,
+    image_file: Any = None,
+    video_file: Any = None,
+    task_type: str = "Detection",
+    category: str = "objects",
+    model_mode: str = "hybrid",
+    temp: float = 0.7,
+    top_p: float = 0.9,
+    top_k: int = 20,
+    short_size: int | None = None,
+    question_override: str | None = None,
+    max_video_frames: int = 4
+) -> tuple[FileData | None, FileData | None, dict]:
+    """Exposed Gradio Queueing Endpoint for custom frontend interactions.
+    ZeroGPU allocation is triggered directly at this endpoint boundary.
+    Supports both FileData dict (from web uploads) and local strings (for examples).
+    """
     try:
+        if not category:
+            category = "objects"
+        final_prompt = question_override
+        if not final_prompt:
+            final_prompt = generate_raw_prompt(task_type, category)
+        if input_type == "Image":
+            if not image_file:
+                return None, None, {"success": False, "error": "Please upload an image."}
+            # Resolve image path (from either FileData upload or local example string)
+            if isinstance(image_file, str):
+                img_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), image_file)
+            elif isinstance(image_file, dict):
+                img_path = image_file.get("path")
+            else:
+                img_path = getattr(image_file, "path", None)
+            if not img_path or not os.path.exists(img_path):
+                return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}
+            out_img_path, stats, raw_text, detections = run_image_gpu_api(
+                img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
+            )
+            meta = {
+                "success": True,
+                "input_type": "Image",
+                "stats": stats,
+                "raw_text": raw_text,
+                "detections": detections,
+                "final_prompt": final_prompt
+            }
+            return FileData(path=out_img_path), None, meta
         else:
+            if not video_file:
+                return None, None, {"success": False, "error": "Please upload a video."}
+            # Resolve video path (from either FileData upload or local example string)
+            if isinstance(video_file, str):
+                vid_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), video_file)
+            elif isinstance(video_file, dict):
+                vid_path = video_file.get("path")
+            else:
+                vid_path = getattr(video_file, "path", None)
+            if not vid_path or not os.path.exists(vid_path):
+                return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}
+            out_vid_path, stats, raw_text, detections = run_video_gpu_api(
+                vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
             )
+            meta = {
+                "success": True,
+                "input_type": "Video",
+                "stats": stats,
+                "raw_text": raw_text,
+                "detections": detections,
+                "final_prompt": final_prompt
+            }
+            return None, FileData(path=out_vid_path), meta
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return None, None, {"success": False, "error": str(e)}
 if __name__ == "__main__":
+    app.launch(show_error=True)  # start the Server app only when this file is run as a script

index.html ADDED Viewed

	@@ -0,0 +1,926 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>NVIDIA LocateAnything - Fast Vision-Language Grounding</title>
+    <!-- Premium Google Fonts -->
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Outfit:wght@500;600;700;800;900&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
+    <!-- Tailwind CSS CDN -->
+    <script src="https://cdn.tailwindcss.com"></script>
+    <script>
+        tailwind.config = {
+            theme: {
+                extend: {
+                    fontFamily: {
+                        sans: ['Inter', 'sans-serif'],
+                        outfit: ['Outfit', 'sans-serif'],
+                        mono: ['Fira Code', 'monospace'],
+                    },
+                    colors: {
+                        nvidia: {
+                            light: '#76b900',
+                            brand: '#76b900',
+                            dark: '#5c9000',
+                            hover: '#87d300',
+                        },
+                        dark: {
+                            50: '#222222',
+                            100: '#1a1a1a',
+                            200: '#121212',
+                            300: '#0a0a0a',
+                            400: '#050505',
+                        }
+                    }
+                }
+            }
+        }
+    </script>
+    <style>
+        body {
+            background-color: #050505;
+            background-image:
+                radial-gradient(circle at 10% 20%, rgba(118, 185, 0, 0.08) 0%, transparent 45%),
+                radial-gradient(circle at 90% 80%, rgba(99, 102, 241, 0.05) 0%, transparent 45%);
+            background-attachment: fixed;
+        }
+        /* NVIDIA-style Carbon Triangle Grid Pattern */
+        .carbon-grid {
+            background-image:
+                linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
+                linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
+                linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
+                linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
+                linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717),
+                linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717);
+            background-size: 80px 140px;
+            background-position: 0 0, 0 0, 40px 70px, 40px 70px, 0 0, 40px 70px;
+        }
+        /* Glassmorphism Styles */
+        .glass-panel {
+            background: rgba(18, 18, 18, 0.65);
+            backdrop-filter: blur(20px);
+            -webkit-backdrop-filter: blur(20px);
+            border: 1px solid rgba(255, 255, 255, 0.04);
+            box-shadow: 0 24px 64px 0 rgba(0, 0, 0, 0.7);
+        }
+        .glass-panel-interactive {
+            transition: all 0.4s cubic-bezier(0.16, 1, 0.3, 1);
+        }
+        .glass-panel-interactive:hover {
+            border-color: rgba(118, 185, 0, 0.25);
+            box-shadow: 0 30px 80px 0 rgba(118, 185, 0, 0.08);
+            transform: translateY(-2px);
+        }
+        /* SAM 3 Style Glassmorphic Float Input */
+        .sam-input-bar {
+            background: rgba(255, 255, 255, 0.06);
+            backdrop-filter: blur(25px);
+            -webkit-backdrop-filter: blur(25px);
+            border: 1px solid rgba(255, 255, 255, 0.08);
+            box-shadow: 0 16px 40px rgba(0, 0, 0, 0.5);
+            transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
+        }
+        .sam-input-bar:focus-within {
+            background: rgba(255, 255, 255, 0.09);
+            border-color: rgba(118, 185, 0, 0.6);
+            box-shadow: 0 20px 48px rgba(118, 185, 0, 0.15);
+        }
+        /* Hexagonal Glowing Border for Media Workspace (NVIDIA GTC Keynote Style) */
+        .gtc-polygon-wrapper {
+            position: relative;
+            background: #0f1218;
+            border: 1px solid rgba(118, 185, 0, 0.15);
+            box-shadow: 0 0 50px rgba(0, 0, 0, 0.8);
+            overflow: hidden;
+            clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
+        }
+        .gtc-polygon-wrapper::before {
+            content: '';
+            position: absolute;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            border: 2px solid #76b900;
+            pointer-events: none;
+            clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
+            opacity: 0.8;
+            box-shadow: inset 0 0 20px rgba(118, 185, 0, 0.3);
+        }
+        .gtc-neon-border {
+            position: absolute;
+            top: -2px;
+            left: -2px;
+            right: -2px;
+            bottom: -2px;
+            background: linear-gradient(135deg, #76b900, #3f6200, #76b900);
+            z-index: 0;
+            pointer-events: none;
+            opacity: 0.95;
+            clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
+        }
+        .gtc-inner-box {
+            position: relative;
+            background: #080a0e;
+            z-index: 10;
+            height: 100%;
+            clip-path: polygon(8.1% 0.1%, 99.9% 0.1%, 99.9% 91.9%, 91.9% 99.9%, 0.1% 99.9%, 0.1% 8.1%);
+        }
+        /* Pill Buttons styling */
+        .pill-btn-green {
+            background-color: #76b900;
+            transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
+        }
+        .pill-btn-green:hover {
+            background-color: #87d300;
+            box-shadow: 0 0 24px rgba(118, 185, 0, 0.45);
+            transform: translateY(-1px);
+        }
+        .pill-btn-green:active {
+            transform: translateY(1px);
+        }
+        /* Custom Scrollbar */
+        ::-webkit-scrollbar {
+            width: 6px;
+            height: 6px;
+        }
+        ::-webkit-scrollbar-track {
+            background: #0a0a0a;
+        }
+        ::-webkit-scrollbar-thumb {
+            background: #222;
+            border-radius: 3px;
+        }
+        ::-webkit-scrollbar-thumb:hover {
+            background: #333;
+        }
+        /* Pulse loaders */
+        .dot-pulse {
+            animation: pulse 1.4s infinite ease-in-out;
+        }
+        @keyframes pulse {
+            0%, 100% { opacity: 0.3; transform: scale(0.9); }
+            50% { opacity: 1; transform: scale(1.1); }
+        }
+        .drop-zone-active {
+            border-color: #76b900 !important;
+            background: rgba(118, 185, 0, 0.04) !important;
+        }
+    </style>
+</head>
+<body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
+    <!-- NVIDIA Brand Navigation Header (Transparent dark blur) -->
+    <nav class="bg-black/40 backdrop-blur-md sticky top-0 z-50 px-6 py-3.5 border-b border-white/5 shadow-lg">
+        <div class="max-w-7xl mx-auto flex items-center justify-between">
+            <!-- Official Styled NVIDIA Brand Text Logo -->
+            <a href="#" class="flex items-center gap-1.5 select-none group">
+                <svg class="h-6 w-6 text-nvidia-brand transition-transform duration-500 group-hover:rotate-180" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5">
+                    <path stroke-linecap="round" stroke-linejoin="round" d="M9 3v2m6-2v2M9 19v2m6-2v2M5 9H3m2 6H3m18-6h-2m2 6h-2M7 19h10a2 2 0 002-2V7a2 2 0 00-2-2H7a2 2 0 00-2 2v10a2 2 0 002 2z" />
+                </svg>
+                <span class="font-outfit text-[22px] font-black tracking-tighter text-white">
+                    NVIDIA <span class="font-light tracking-wide text-slate-400">LocateAnything</span>
+                </span>
+            </a>
+            <span class="px-3 py-1 text-xs font-semibold rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 flex items-center gap-1.5 font-mono">
+                <span class="h-1.5 w-1.5 rounded-full bg-nvidia-brand animate-pulse"></span>
+                ZeroGPU Server
+            </span>
+        </div>
+    </nav>
+    <!-- MAIN MINIMAL LAYOUT CONTAINER -->
+    <main class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 pt-8 lg:pt-10 space-y-8">
+        <!-- Giant Showcase Container (SAM 3 Full-Bleed Style) -->
+        <div class="relative w-full rounded-[32px] overflow-hidden border border-white/5 bg-[#080a0e] shadow-2xl h-[580px] lg:h-[640px] flex select-none">
+            <!-- 1. Background Media Canvas (Coverage Layer) -->
+            <div class="absolute inset-0 z-0 flex items-center justify-center bg-black/40">
+                <!-- Drop Zone (Initially shown) -->
+                <div id="drop-zone" class="absolute inset-0 border-none rounded-none bg-transparent flex flex-col items-center justify-center p-4 text-center cursor-pointer transition-all z-10">
+                    <div id="upload-prompt" class="space-y-3 opacity-60 hover:opacity-100 transition-opacity">
+                        <div class="inline-flex h-12 w-12 rounded-full bg-white/5 items-center justify-center text-slate-300">
+                            <svg class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
+                                <path stroke-linecap="round" stroke-linejoin="round" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12" />
+                            </svg>
+                        </div>
+                        <div>
+                            <p class="text-xs font-bold text-slate-200">Drag & drop your file here</p>
+                            <p class="text-[10px] text-slate-500 mt-1">or click to browse local folders</p>
+                        </div>
+                    </div>
+                    <!-- Dynamic Preview Media -->
+                    <img id="preview-image" src="" alt="Input Preview" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5">
+                    <video id="preview-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5"></video>
+                    <!-- File Input -->
+                    <input type="file" id="media-file-input" accept="image/*,video/*" class="absolute inset-0 opacity-0 cursor-pointer z-30">
+                </div>
+                <!-- Inference Output Zone -->
+                <div class="absolute inset-0 pointer-events-none flex items-center justify-center z-20">
+                    <img id="output-image" src="" alt="Inference Output" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5">
+                    <video id="output-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5"></video>
+                </div>
+                <!-- Processing Overlays -->
+                <div id="processing-overlay" class="absolute inset-0 bg-black/85 backdrop-blur-sm hidden flex-col items-center justify-center gap-4 z-40">
+                    <div class="flex gap-1.5">
+                        <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-nvidia-brand" style="animation-delay: 0s;"></span>
+                        <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-400" style="animation-delay: 0.2s;"></span>
+                        <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-300" style="animation-delay: 0.4s;"></span>
+                    </div>
+                    <div class="text-center space-y-1">
+                        <p id="processing-status" class="text-[11px] font-bold tracking-widest text-slate-200 uppercase">Executing Model...</p>
+                        <p class="text-[9px] text-slate-500 uppercase tracking-wider font-mono">ZeroGPU Queue Active</p>
+                    </div>
+                </div>
+            </div>
+            <!-- 2. Left Floating Overlay Panel (Title, simple selectors, accordion, and action buttons) -->
+            <div class="absolute left-6 lg:left-12 top-8 bottom-8 z-30 flex flex-col justify-between max-w-sm sm:max-w-md pointer-events-none">
+                <!-- Main Header Overlay text -->
+                <div class="space-y-3 pt-4 pointer-events-auto bg-gradient-to-b from-[#080a0e]/90 via-[#080a0e]/60 to-transparent p-4 rounded-2xl">
+                    <span class="text-[9px] font-bold text-nvidia-brand uppercase tracking-widest block font-mono">AI Research from NVIDIA</span>
+                    <h1 class="font-outfit text-3xl sm:text-5xl font-black tracking-tight text-white leading-none">
+                        Locate<span class="text-nvidia-brand font-light">Anything</span>
+                    </h1>
+                    <p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
+                        NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
+                    </p>
+                </div>
+                <!-- Setup Glass Card Controls -->
+                <div class="glass-panel rounded-2xl p-4 space-y-4 pointer-events-auto max-w-xs shadow-2xl">
+                    <div class="grid grid-cols-2 gap-3">
+                        <!-- Media Type toggle selection -->
+                        <div class="space-y-1">
+                            <label class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Media Type</label>
+                            <div class="grid grid-cols-2 gap-0.5 bg-black/40 p-0.5 rounded-lg border border-white/5 text-center">
+                                <button id="media-type-image" class="py-1 rounded-md font-semibold text-[9px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10">
+                                    Image
+                                </button>
+                                <button id="media-type-video" class="py-1 rounded-md font-semibold text-[9px] text-slate-400 hover:text-slate-200 transition-all">
+                                    Video
+                                </button>
+                            </div>
+                        </div>
+                        <!-- Task Selector -->
+                        <div class="space-y-1">
+                            <label for="task-type" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Task Type</label>
+                            <select id="task-type" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-semibold">
+                                <option value="Detection">Detection</option>
+                                <option value="Grounding">Grounding</option>
+                                <option value="OCR">OCR</option>
+                                <option value="GUI">GUI</option>
+                                <option value="Pointing">Pointing</option>
+                            </select>
+                        </div>
+                    </div>
+                    <!-- Advanced parameters sliders (Collapsible details inside the left overlay) -->
+                    <details class="group border-t border-white/5 pt-3">
+                        <summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-slate-400 tracking-wider uppercase hover:text-slate-200 transition-colors">
+                            <span>⚙️ Advanced parameters</span>
+                            <svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+                                <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
+                            </svg>
+                        </summary>
+                        <div class="space-y-3 pt-3">
+                            <!-- Inference Mode Selection -->
+                            <div class="space-y-1">
+                                <label for="inference-mode" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Inference Mode</label>
+                                <select id="inference-mode" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200">
+                                    <option value="hybrid">Hybrid</option>
+                                    <option value="fast">Fast</option>
+                                    <option value="slow">Slow</option>
+                                </select>
+                            </div>
+                            <!-- Short side resize cap -->
+                            <div class="space-y-1">
+                                <label for="short-size" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Resize Cap (px)</label>
+                                <input type="number" id="short-size" placeholder="Auto-Cap (1024)" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-mono">
+                            </div>
+                            <!-- Temp -->
+                            <div class="space-y-1">
+                                <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
+                                    <span>Temperature</span>
+                                    <span id="temp-val" class="font-mono text-nvidia-brand">0.7</span>
+                                </div>
+                                <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
+                            </div>
+                            <!-- Top P -->
+                            <div class="space-y-1">
+                                <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
+                                    <span>Top P</span>
+                                    <span id="topp-val" class="font-mono text-nvidia-brand">0.9</span>
+                                </div>
+                                <input type="range" id="topp" min="0.05" max="1.0" step="0.05" value="0.9" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
+                            </div>
+                            <!-- Top K -->
+                            <div class="space-y-1">
+                                <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
+                                    <span>Top K</span>
+                                    <span id="topk-val" class="font-mono text-nvidia-brand">20</span>
+                                </div>
+                                <input type="range" id="topk" min="1" max="100" step="1" value="20" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
+                            </div>
+                            <!-- Video Frames (starts dimmed and disabled; setMediaType() hides it in Image mode and enables it in Video mode) -->
+                            <div id="video-frames-wrapper" class="space-y-1 opacity-50 pointer-events-none transition-opacity duration-300">
+                                <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
+                                    <span>Max Video Frames</span>
+                                    <span id="frames-val" class="font-mono text-nvidia-brand">4</span>
+                                </div>
+                                <input type="range" id="max-frames" min="1" max="10" step="1" value="4" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand" disabled>
+                            </div>
+                        </div>
+                    </details>
+                </div>
+                <!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
+                <div class="pointer-events-auto pt-2 max-w-xs">
+                    <button id="run-btn" class="pill-btn-green w-full py-3 px-6 rounded-full text-black font-extrabold text-sm flex items-center justify-center gap-2 select-none shadow-2xl">
+                        <span id="btn-icon">🧠</span>
+                        <span id="btn-text">Run Inference</span>
+                    </button>
+                </div>
+            </div>
+            <!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
+            <div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex justify-center pointer-events-none w-full max-w-xs">
+                <!-- Categories search box: the text input and the icon-only clear button carry aria-labels so assistive technology can name them -->
+                <div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
+                    <svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5" aria-hidden="true">
+                        <path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
+                    </svg>
+                    <input type="text" id="categories" value="car, bus, person, potted plant" placeholder="Describe objects to locate..." aria-label="Objects to locate" class="bg-transparent border-none outline-none focus:outline-none w-full text-slate-100 placeholder-slate-600 font-semibold text-xs">
+                    <button id="clear-search-btn" type="button" aria-label="Clear categories" class="text-slate-500 hover:text-white transition-colors p-0.5 rounded-full hover:bg-white/5 shrink-0">
+                        <svg class="h-3.5 w-3.5" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5" aria-hidden="true">
+                            <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
+                        </svg>
+                    </button>
+                </div>
+            </div>
+            <!-- Floating Workspace Status -->
+            <div class="absolute bottom-4 right-4 z-30 bg-black/60 backdrop-blur px-2.5 py-1 rounded-lg border border-white/10 text-[9px] text-slate-400 font-mono select-none pointer-events-none">
+                status: <span id="workspace-status" class="text-slate-200 font-semibold">No Media Loaded</span>
+            </div>
+        </div>
+        <!-- Shelf Section (Examples and Log metrics placed directly below the giant showcase) -->
+        <div class="grid grid-cols-1 lg:grid-cols-12 gap-6 items-start">
+            <!-- Left: Examples Library Shelf (Col Span: 5) -->
+            <div class="lg:col-span-5 space-y-4">
+                <div class="glass-panel rounded-2xl p-5 space-y-4">
+                    <span class="text-[9px] font-bold text-slate-400 uppercase tracking-widest block font-mono">🖼️ Interactive Quick Sandbox</span>
+                    <div class="grid grid-cols-4 gap-3">
+                        <!-- Card 1 -->
+                        <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Book" data-category="book" data-task="Detection" data-mode="hybrid" data-asset="assets/book.jpg">
+                            <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/book.jpg');"></div>
+                            <span class="text-[9px] font-semibold text-slate-300 block truncate">Book</span>
+                        </div>
+                        <!-- Card 2 -->
+                        <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Sweet" data-category="sweet" data-task="Detection" data-mode="hybrid" data-asset="assets/sweet.jpg">
+                            <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/sweet.jpg');"></div>
+                            <span class="text-[9px] font-semibold text-slate-300 block truncate">Sweet</span>
+                        </div>
+                        <!-- Card 3 -->
+                        <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Person" data-category="person" data-task="Detection" data-mode="hybrid" data-asset="assets/person.jpg">
+                            <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/person.jpg');"></div>
+                            <span class="text-[9px] font-semibold text-slate-300 block truncate">People</span>
+                        </div>
+                        <!-- Card 4 -->
+                        <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="OCR" data-category="text" data-task="OCR" data-mode="fast" data-asset="assets/ocr.jpg">
+                            <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/ocr.jpg');"></div>
+                            <span class="text-[9px] font-semibold text-slate-300 block truncate">OCR</span>
+                        </div>
+                    </div>
+                </div>
+                <!-- Text Prompt logs -->
+                <div class="glass-panel rounded-2xl p-4 text-[10px] text-slate-500 font-mono flex justify-between items-center select-none bg-black/40">
+                    <span class="truncate block">compiled: <span id="raw-prompt-preview" class="text-slate-400"></span></span>
+                </div>
+            </div>
+            <!-- Right: Performance Metrics & Tag draw overlays (Col Span: 7) -->
+            <div class="lg:col-span-7 space-y-4">
+                <div class="glass-panel rounded-2xl p-5 space-y-4">
+                    <div class="grid grid-cols-1 sm:grid-cols-12 gap-4 items-stretch">
+                        <!-- Performance Statistics Metrics Console (Grid: 5) -->
+                        <div class="sm:col-span-5 bg-black/60 rounded-xl p-4 border border-white/5 font-mono text-[10px] text-slate-300 space-y-2 leading-normal">
+                            <div class="text-nvidia-brand font-bold border-b border-white/5 pb-1 mb-1.5 uppercase tracking-widest text-[9px] font-mono">📊 Metrics Log</div>
+                            <div class="flex justify-between"><span class="text-slate-500">Status:</span> <span id="meta-status" class="text-emerald-500 font-semibold">Idle</span></div>
+                            <div class="flex justify-between"><span class="text-slate-500">Tokens/Frames:</span> <span id="meta-tokens">-</span></div>
+                            <div class="flex justify-between"><span class="text-slate-500">Detections:</span> <span id="meta-boxes">-</span></div>
+                            <div class="flex justify-between"><span class="text-slate-500">TPS / BPS:</span> <span><span id="meta-tps">-</span> / <span id="meta-bps">-</span></span></div>
+                            <div class="flex justify-between"><span class="text-slate-500">Time:</span> <span id="meta-time">-</span></div>
+                        </div>
+                        <!-- Tag drawer box list (Grid: 7) -->
+                        <div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col">
+                            <div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
+                                <span>🎯 Detected Target Overlays</span>
+                                <span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
+                            </div>
+                            <!-- Tag list: wraps chips and packs wrapped rows to the top (Tailwind `content-start`) -->
+                            <div id="detection-tags-wrapper" class="flex-1 flex flex-wrap gap-1.5 max-h-[100px] overflow-y-auto pt-1 content-start text-[10px] text-slate-500">
+                                Run inference to populate target tags here.
+                            </div>
+                        </div>
+                    </div>
+                    <!-- Optional dynamic trace wrapper -->
+                    <div id="rich-trace-log" class="hidden border-t border-white/5 pt-3"></div>
+                </div>
+            </div>
+        </div>
+    </main>
+    <!-- Gradio client connection & app runtime logic -->
+    <script type="module">
+        import { client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
+        // Module-level state shared by the handlers below.
+        let selectedMediaType = "Image"; // current mode; written by setMediaType()
+        let activeFile = null;           // reset to null by clearWorkspace()
+        let clientInstance = null;       // cached Gradio client, filled lazily by getClient()
+        // Cached DOM references: setup controls
+        const mediaTypeImageBtn = document.getElementById("media-type-image");
+        const mediaTypeVideoBtn = document.getElementById("media-type-video");
+        const videoFramesWrapper = document.getElementById("video-frames-wrapper");
+        const taskTypeSelect = document.getElementById("task-type");
+        const categoriesInput = document.getElementById("categories");
+        const clearSearchBtn = document.getElementById("clear-search-btn");
+        const inferenceModeSelect = document.getElementById("inference-mode");
+        const rawPromptPreview = document.getElementById("raw-prompt-preview");
+        // Advanced control sliders, each paired with the <span> that echoes its value
+        const tempSlider = document.getElementById("temp");
+        const tempVal = document.getElementById("temp-val");
+        const toppSlider = document.getElementById("topp");
+        const toppVal = document.getElementById("topp-val");
+        const topkSlider = document.getElementById("topk");
+        const topkVal = document.getElementById("topk-val");
+        const shortSizeInput = document.getElementById("short-size");
+        const maxFramesSlider = document.getElementById("max-frames");
+        const maxFramesVal = document.getElementById("frames-val");
+        // Workspace preview elements (drop zone, input previews, status badge)
+        const dropZone = document.getElementById("drop-zone");
+        const uploadPrompt = document.getElementById("upload-prompt");
+        const previewImage = document.getElementById("preview-image");
+        const previewVideo = document.getElementById("preview-video");
+        const fileInput = document.getElementById("media-file-input");
+        const workspaceStatus = document.getElementById("workspace-status");
+        // Output result elements
+        // NOTE(review): no element with id "output-empty" appears in the markup above
+        // this script, so this lookup presumably yields null; clearWorkspace()
+        // null-checks it before use.
+        const outputEmpty = document.getElementById("output-empty");
+        const outputImage = document.getElementById("output-image");
+        const outputVideo = document.getElementById("output-video");
+        // Run button (with its icon/text spans) and the processing overlay
+        const runBtn = document.getElementById("run-btn");
+        const btnText = document.getElementById("btn-text");
+        const btnIcon = document.getElementById("btn-icon");
+        const processingOverlay = document.getElementById("processing-overlay");
+        const processingStatus = document.getElementById("processing-status");
+        // Metrics log fields, detection tag list and trace container
+        const metaStatus = document.getElementById("meta-status");
+        const metaTokens = document.getElementById("meta-tokens");
+        const metaBoxes = document.getElementById("meta-boxes");
+        const metaTps = document.getElementById("meta-tps");
+        const metaBps = document.getElementById("meta-bps");
+        const metaTime = document.getElementById("meta-time");
+        const detectionTagsWrapper = document.getElementById("detection-tags-wrapper");
+        const detectionCountBadge = document.getElementById("detection-count-badge");
+        const richTraceLog = document.getElementById("rich-trace-log");
+        // Connect client
+        async function getClient() {
+            if (!clientInstance) {
+                try {
+                    clientInstance = await client(window.location.origin);
+                } catch (e) {
+                    console.error("Gradio Server connection failed:", e);
+                    alert("Could not connect to Gradio backend. Ensure the server is active.");
+                }
+            }
+            return clientInstance;
+        }
+        // Live values updater
+        function setupLiveUpdaters() {
+            tempSlider.addEventListener("input", (e) => tempVal.textContent = e.target.value);
+            toppSlider.addEventListener("input", (e) => toppVal.textContent = e.target.value);
+            topkSlider.addEventListener("input", (e) => topkVal.textContent = e.target.value);
+            maxFramesSlider.addEventListener("input", (e) => maxFramesVal.textContent = e.target.value);
+            // Clear search categories button
+            clearSearchBtn.addEventListener("click", () => {
+                categoriesInput.value = "";
+                categoriesInput.focus();
+                triggerPromptUpdate();
+            });
+            // Trigger prompt generation updates
+            const triggerPromptUpdate = () => {
+                const task = taskTypeSelect.value;
+                const cat = categoriesInput.value;
+                rawPromptPreview.textContent = generateRawPromptText(task, cat);
+            };
+            taskTypeSelect.addEventListener("change", triggerPromptUpdate);
+            categoriesInput.addEventListener("input", triggerPromptUpdate);
+            // Run prompt builder initially
+            triggerPromptUpdate();
+        }
+        // Prompt builder mirroring python logic
+        function generateRawPromptText(taskType, category) {
+            if (!category) category = "objects";
+            const cats = category.split(",")
+                                 .map(c => c.trim())
+                                 .filter(c => c.length > 0)
+                                 .join("</c>");
+            switch (taskType) {
+                case "Detection": return `Locate all the instances that matches the following description: ${cats}.`;
+                case "Grounding": return `Locate all the instances that match the following description: ${cats}.`;
+                case "OCR": return "Detect all the text in box format.";
+                case "GUI": return `Locate the region that matches the following description: ${cats}.`;
+                case "Pointing": return `Point to: ${cats}.`;
+                default: return `Locate all the instances that matches the following description: ${cats}.`;
+            }
+        }
+        // Switch workspace input styles without clearing
+        function setMediaType(type) {
+            selectedMediaType = type;
+            if (type === "Image") {
+                mediaTypeImageBtn.className = "py-1.5 rounded-lg font-semibold text-[10px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10";
+                mediaTypeVideoBtn.className = "py-1.5 rounded-lg font-semibold text-[10px] text-slate-400 hover:text-slate-200 transition-all";
+                videoFramesWrapper.classList.add("hidden");
+                videoFramesWrapper.classList.add("opacity-50");
+                videoFramesWrapper.classList.add("pointer-events-none");
+                maxFramesSlider.disabled = true;
+                fileInput.accept = "image/*";
+                workspaceStatus.textContent = activeFile ? "Image Loaded" : "No Media Loaded";
+            } else {
+                mediaTypeVideoBtn.className = "py-1.5 rounded-lg font-semibold text-[10px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10";
+                mediaTypeImageBtn.className = "py-1.5 rounded-lg font-semibold text-[10px] text-slate-400 hover:text-slate-200 transition-all";
+                videoFramesWrapper.classList.remove("hidden");
+                videoFramesWrapper.classList.remove("opacity-50");
+                videoFramesWrapper.classList.remove("pointer-events-none");
+                maxFramesSlider.disabled = false;
+                fileInput.accept = "video/*";
+                workspaceStatus.textContent = activeFile ? "Video Loaded" : "No Media Loaded";
+            }
+        }
+        // Reset elements
+        function clearWorkspace() {
+            activeFile = null;
+            previewImage.src = "";
+            previewImage.classList.add("hidden");
+            previewVideo.src = "";
+            previewVideo.classList.add("hidden");
+            uploadPrompt.classList.remove("hidden");
+            if (outputEmpty) outputEmpty.classList.remove("hidden");
+            outputImage.src = "";
+            outputImage.classList.add("hidden");
+            outputVideo.src = "";
+            outputVideo.classList.add("hidden");
+            workspaceStatus.textContent = "Workspace Cleared";
+        }
+        // Drag and drop utilities
+        function setupDragDrop() {
+            ['dragenter', 'dragover'].forEach(eventName => {
+                dropZone.addEventListener(eventName, (e) => {
+                    e.preventDefault();
+                    dropZone.classList.add('drop-zone-active');
+                }, false);
+            });
+            ['dragleave', 'drop'].forEach(eventName => {
+                dropZone.addEventListener(eventName, (e) => {
+                    e.preventDefault();
+                    dropZone.classList.remove('drop-zone-active');
+                }, false);
+            });
+            dropZone.addEventListener('drop', (e) => {
+                const dt = e.dataTransfer;
+                const file = dt.files[0];
+                if (file) handleFileImport(file);
+            });
+            fileInput.addEventListener('change', (e) => {
+                const file = e.target.files[0];
+                if (file) handleFileImport(file);
+            });
+        }
+        // Display imported media
+        function handleFileImport(file) {
+            uploadPrompt.classList.add("hidden");
+            if (file.type.startsWith("image/")) {
+                setMediaType("Image");
+                activeFile = file;
+                const reader = new FileReader();
+                reader.onload = (e) => {
+                    previewImage.src = e.target.result;
+                    previewImage.classList.remove("hidden");
+                    previewVideo.classList.add("hidden");
+                };
+                reader.readAsDataURL(file);
+                workspaceStatus.textContent = `Image Loaded: ${file.name}`;
+            } else if (file.type.startsWith("video/")) {
+                setMediaType("Video");
+                activeFile = file;
+                previewVideo.src = URL.createObjectURL(file);
+                previewVideo.classList.remove("hidden");
+                previewImage.classList.add("hidden");
+                workspaceStatus.textContent = `Video Loaded: ${file.name}`;
+            }
+        }
+        // Initialize preloaded examples click actions
+        // Utility to fetch preloaded example assets and convert to File
+        async function loadExampleFromAsset(url, filename) {
+            try {
+                const response = await fetch(url);
+                const blob = await response.blob();
+                return new File([blob], filename, { type: blob.type });
+            } catch (err) {
+                console.error("Failed to load example asset:", err);
+                return null;
+            }
+        }
+        // Initialize preloaded examples click actions
+        function setupExamples() {
+            document.querySelectorAll(".example-card").forEach(card => {
+                card.addEventListener("click", async () => {
+                    const type = card.getAttribute("data-type");
+                    const name = card.getAttribute("data-name");
+                    const category = card.getAttribute("data-category");
+                    const task = card.getAttribute("data-task");
+                    const mode = card.getAttribute("data-mode");
+                    const assetPath = card.getAttribute("data-asset"); // e.g. "assets/book.jpg"
+                    clearWorkspace();
+                    workspaceStatus.textContent = `Loading ${name} example...`;
+                    // Set parameters
+                    taskTypeSelect.value = task;
+                    categoriesInput.value = category;
+                    inferenceModeSelect.value = mode;
+                    // Trigger live prompt update
+                    taskTypeSelect.dispatchEvent(new Event("change"));
+                    // Setup Media type
+                    setMediaType(type);
+                    // Fetch asset file with robust absolute URL resolution (works in iframe)
+                    const ext = type === "Image" ? "jpg" : "mp4";
+                    const resolvedAssetUrl = new URL(assetPath, window.location.href).href;
+                    console.log("Fetching example from:", resolvedAssetUrl);
+                    const file = await loadExampleFromAsset(resolvedAssetUrl, `${name.toLowerCase()}.${ext}`);
+                    if (file) {
+                        activeFile = file;
+                        uploadPrompt.classList.add("hidden");
+                        if (type === "Image") {
+                            previewImage.src = URL.createObjectURL(file);
+                            previewImage.classList.remove("hidden");
+                            previewVideo.classList.add("hidden");
+                            workspaceStatus.textContent = `Example Image Loaded: ${name}`;
+                        } else {
+                            previewVideo.src = URL.createObjectURL(file);
+                            previewVideo.classList.remove("hidden");
+                            previewImage.classList.add("hidden");
+                            workspaceStatus.textContent = `Example Video Loaded: ${name}`;
+                        }
+                    } else {
+                        workspaceStatus.textContent = `Failed to load ${name} example`;
+                    }
+                });
+            });
+        }
+        // Execution logic
+        async function executeInference() {
+            if (!activeFile) {
+                alert("Please upload a media file (Image or Video) or select an example first.");
+                return;
+            }
+            // Set loading state
+            runBtn.disabled = true;
+            btnText.textContent = "⏳ Queueing Request...";
+            btnIcon.textContent = "🔒";
+            processingOverlay.classList.remove("hidden");
+            processingStatus.textContent = "Waiting for Gradio queue...";
+            // Clean outputs
+            if (outputEmpty) outputEmpty.classList.add("hidden");
+            outputImage.classList.add("hidden");
+            outputVideo.classList.add("hidden");
+            richTraceLog.innerHTML = "";
+            richTraceLog.classList.add("hidden");
+            metaStatus.textContent = "Processing...";
+            metaStatus.className = "text-yellow-500 font-semibold";
+            detectionTagsWrapper.innerHTML = "Processing objects in backend...";
+            detectionCountBadge.textContent = "0";
+            try {
+                const clientInstance = await getClient();
+                if (!clientInstance) {
+                    throw new Error("Unable to create Gradio Client instance.");
+                }
+                // Handle file parameter wrapping using Gradio client handle_file
+                const wrappedFile = activeFile ? handle_file(activeFile) : null;
+                const imageFile = (selectedMediaType === "Image") ? wrappedFile : null;
+                const videoFile = (selectedMediaType === "Video") ? wrappedFile : null;
+                // Collect configuration values
+                const taskType = taskTypeSelect.value;
+                const category = categoriesInput.value;
+                const modelMode = inferenceModeSelect.value;
+                const temp = parseFloat(tempSlider.value);
+                const topp = parseFloat(toppSlider.value);
+                const topk = parseInt(topkSlider.value);
+                const shortSize = shortSizeInput.value ? parseInt(shortSizeInput.value) : null;
+                const maxVideoFrames = parseInt(maxFramesSlider.value);
+                processingStatus.textContent = "Running Vision Model (duration-locked)...";
+                // Execute predictions using named parameters object matching app.py signature
+                const result = await clientInstance.predict("/run_inference", {
+                    input_type: selectedMediaType,
+                    image_file: imageFile,
+                    video_file: videoFile,
+                    task_type: taskType,
+                    category: category,
+                    model_mode: modelMode,
+                    temp: temp,
+                    top_p: topp,
+                    top_k: topk,
+                    short_size: shortSize,
+                    question_override: null,
+                    max_video_frames: maxVideoFrames
+                });
+                console.log("Inference complete. API outputs:", result);
+                // Unpack result values
+                const [outImageObj, outVideoObj, meta] = result.data;
+                if (!meta.success) {
+                    throw new Error(meta.error || "Backend returned processing failure.");
+                }
+                // Process image result
+                if (selectedMediaType === "Image" && outImageObj) {
+                    outputImage.src = outImageObj.url;
+                    outputImage.classList.remove("hidden");
+                    outputVideo.classList.add("hidden");
+                }
+                // Process video result
+                else if (selectedMediaType === "Video" && outVideoObj) {
+                    outputVideo.src = outVideoObj.url;
+                    outputVideo.classList.remove("hidden");
+                    outputImage.classList.add("hidden");
+                }
+                // Render metrics logs
+                metaStatus.textContent = "Success";
+                metaStatus.className = "text-emerald-500 font-semibold";
+                const stats = meta.stats || {};
+                metaTokens.textContent = stats.num_tokens || stats.total_frames || "-";
+                metaBoxes.textContent = stats.num_boxes || stats.processed_frames || "-";
+                metaTps.textContent = stats.tps || "-";
+                metaBps.textContent = stats.bps || "-";
+                metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";
+                // Render detection tags
+                const detections = meta.detections || [];
+                detectionCountBadge.textContent = detections.length;
+                if (detections.length === 0) {
+                    detectionTagsWrapper.innerHTML = "No objects matched categories.";
+                } else {
+                    detectionTagsWrapper.innerHTML = "";
+                    detections.forEach(det => {
+                        const tag = document.createElement("span");
+                        tag.className = "px-2 py-0.5 rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 font-bold uppercase tracking-wider text-[8px] animate-fade-in";
+                        tag.textContent = det.frame ? `[Frame ${det.frame}] ${det.label}` : det.label;
+                        detectionTagsWrapper.appendChild(tag);
+                    });
+                }
+                // Render logs trace
+                if (meta.html) {
+                    richTraceLog.innerHTML = meta.html;
+                    richTraceLog.classList.remove("hidden");
+                }
+            } catch (err) {
+                console.error("Execution failed:", err);
+                metaStatus.textContent = "Error";
+                metaStatus.className = "text-red-500 font-semibold";
+                detectionTagsWrapper.innerHTML = `<span class="text-red-400">Failed: ${err.message}</span>`;
+                alert(`Inference failed: ${err.message}`);
+                if (outputEmpty) outputEmpty.classList.remove("hidden");
+            } finally {
+                // Restore UI state
+                runBtn.disabled = false;
+                btnText.textContent = "Run Inference";
+                btnIcon.textContent = "🧠";
+                processingOverlay.classList.add("hidden");
+            }
+        }
+        // Add event listeners on load
+        document.addEventListener("DOMContentLoaded", () => {
+            mediaTypeImageBtn.addEventListener("click", () => {
+                if (selectedMediaType !== "Image") {
+                    setMediaType("Image");
+                    clearWorkspace();
+                }
+            });
+            mediaTypeVideoBtn.addEventListener("click", () => {
+                if (selectedMediaType !== "Video") {
+                    setMediaType("Video");
+                    clearWorkspace();
+                }
+            });
+            runBtn.addEventListener("click", executeInference);
+            // Bind enter key press in Categories float bar input
+            categoriesInput.addEventListener("keydown", (e) => {
+                if (e.key === "Enter") {
+                    e.preventDefault();
+                    executeInference();
+                }
+            });
+            setupLiveUpdaters();
+            setupDragDrop();
+            setupExamples();
+        });
+    </script>
+</body>
+</html>

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 opencv-python-headless==4.11.0.86
-transformers==4.51.0
 torch==2.8.0
 torchvision==0.23.0
 numpy==1.25.0

 opencv-python-headless==4.11.0.86
+transformers==4.57.1
 torch==2.8.0
 torchvision==0.23.0
 numpy==1.25.0