gradio server

#1
by akhaliq HF Staff - opened
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +206 -852
  3. index.html +926 -0
  4. requirements.txt +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💬
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  python_version: "3.10.13"
9
  app_file: app.py
10
  pinned: false
 
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 6.14.0
8
  python_version: "3.10.13"
9
  app_file: app.py
10
  pinned: false
app.py CHANGED
@@ -1,9 +1,17 @@
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
 
 
3
  import gradio as gr
 
 
 
 
 
4
  import cv2
5
  import numpy as np
6
  import os
 
7
  import tempfile
8
  import re
9
  import time
@@ -13,50 +21,16 @@ import io
13
  import json
14
  import uuid
15
  from pathlib import Path
 
16
 
17
  import torch
18
  from PIL import Image, ImageDraw, ImageFont
19
  from transformers import AutoProcessor, AutoModel, AutoTokenizer
20
- from huggingface_hub import CommitScheduler
21
-
22
- import spaces
23
 
24
  _FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "LXGWWenKai-Bold.ttf")
25
 
26
-
27
- def _get_first_env(*names):
28
- for name in names:
29
- value = os.environ.get(name)
30
- if value and value.strip():
31
- return value.strip()
32
- return None
33
-
34
-
35
- def _configure_hf_auth():
36
- model_token = _get_first_env(
37
- "MODEL_HF_TOKEN",
38
- "LOG_HF_TOKEN",
39
- "HF_TOKEN",
40
- "HUGGINGFACE_HUB_TOKEN",
41
- "HUGGINGFACEHUB_API_TOKEN",
42
- )
43
- log_token = _get_first_env(
44
- "LOG_HF_TOKEN",
45
- "MODEL_HF_TOKEN",
46
- "HF_TOKEN",
47
- "HUGGINGFACE_HUB_TOKEN",
48
- "HUGGINGFACEHUB_API_TOKEN",
49
- )
50
- shared_token = model_token or log_token
51
- if shared_token:
52
- # Some downstream hub calls still rely on standard env var names.
53
- for name in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
54
- os.environ[name] = shared_token
55
- return model_token, log_token
56
-
57
-
58
- MODEL_HF_TOKEN, LOG_HF_TOKEN = _configure_hf_auth()
59
-
60
 
61
  def _load_font(size=20):
62
  """加载中文字体(LXGW WenKai),需提前放置到 assets/ 目录"""
@@ -233,23 +207,22 @@ class EagleWorker:
233
  self.device = device
234
  self.dtype = torch.bfloat16
235
  self.generation_mode = generation_mode
236
- self.hf_token = MODEL_HF_TOKEN
237
  self.tokenizer = AutoTokenizer.from_pretrained(
238
  model_path,
239
  trust_remote_code=True,
240
- token=self.hf_token,
241
  )
242
  self.processor = AutoProcessor.from_pretrained(
243
  model_path,
244
  trust_remote_code=True,
245
- token=self.hf_token,
246
  )
247
  self.model = AutoModel.from_pretrained(
248
  model_path,
249
  torch_dtype=self.dtype,
250
  _attn_implementation="sdpa",
251
  trust_remote_code=True,
252
- token=self.hf_token,
253
  ).to(device).eval()
254
  print("Model Loaded Successfully!")
255
 
@@ -299,7 +272,7 @@ class EagleWorker:
299
 
300
 
301
  # ============================================================
302
- # 后处理 / HTML
303
  # ============================================================
304
  def _postprocess_detections(detections, w, h):
305
  valid = []
@@ -333,106 +306,6 @@ def _parse_out_info_dict(out_info: str) -> dict:
333
  return stats
334
 
335
 
336
- def generate_dynamic_html(token_sequence, out_info, raw_text):
337
- uid = f"a{int(time.time() * 1000)}"
338
- css = f"""
339
- <style>
340
- .dc-root {{
341
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
342
- border: 1px solid #cce875; border-radius: 10px; background: #ffffff; overflow: hidden;
343
- }}
344
- .dc-header {{
345
- display: flex; align-items: center; justify-content: space-between;
346
- padding: 12px 18px;
347
- background: linear-gradient(135deg, #76b900 0%, #649d00 100%);
348
- border-bottom: 1px solid #527f00;
349
- }}
350
- .dc-header-title {{ font-weight: 700; font-size: 0.95em; color: #ffffff !important; letter-spacing: 0.3px; }}
351
- .dc-legend {{ display: flex; gap: 16px; align-items: center; }}
352
- .dc-legend-item {{ display: flex; align-items: center; gap: 5px; font-size: 0.78em; color: rgba(255,255,255,0.92); font-weight: 500; }}
353
- .dc-legend-dot {{ width: 10px; height: 10px; border-radius: 3px; display: inline-block; border: 1px solid rgba(255,255,255,0.5); }}
354
- .dc-row {{ display: flex; gap: 10px; padding: 14px 18px; border-bottom: 1px solid #eef7d1; }}
355
- .dc-row:last-child {{ border-bottom: none; }}
356
- .dc-val {{ flex: 1; line-height: 2.3; word-wrap: break-word; color: #4b5563; font-size: 0.92em; }}
357
- @keyframes tk-{uid} {{
358
- 0% {{ opacity: 0; transform: translateY(8px) scale(0.92); }}
359
- 60% {{ opacity: 1; transform: translateY(-2px) scale(1.02); }}
360
- 100% {{ opacity: 1; transform: translateY(0) scale(1); }}
361
- }}
362
- .tk-mtp-{uid}, .tk-ar-{uid} {{
363
- opacity: 0; animation: tk-{uid} 0.35s ease-out forwards;
364
- border-radius: 5px; padding: 2px 7px; margin: 2px 1px; display: inline-block;
365
- font-size: 0.80em; font-weight: 600;
366
- font-family: 'SFMono-Regular', Consolas, 'Courier New', monospace; white-space: nowrap;
367
- }}
368
- .tk-mtp-{uid} {{ background: #e8f5e9; border: 2px solid #76b900; color: #2d4400; box-shadow: 0 1px 2px rgba(118,185,0,0.15); }}
369
- .tk-ar-{uid} {{ background: #fff3e0; border: 2px solid #e65100; color: #bf360c; box-shadow: 0 1px 2px rgba(230,81,0,0.15); }}
370
- .tk-stat-{uid} {{
371
- opacity: 0; animation: tk-{uid} 0.4s ease-out forwards;
372
- background: #f0f9e2; border: 1px solid #a4d422; border-radius: 6px;
373
- padding: 5px 14px; display: inline-block; font-size: 0.82em; color: #3f6200; font-weight: 600;
374
- }}
375
- .dc-raw {{ padding: 0 18px 14px; }}
376
- .dc-raw summary {{ cursor: pointer; color: #9ca3af; font-size: 0.82em; user-select: none; transition: color .15s; }}
377
- .dc-raw summary:hover {{ color: #649d00; }}
378
- .dc-raw-pre {{
379
- background: #f7fbe8; border: 1px solid #ddf0a3; border-radius: 6px;
380
- padding: 12px; margin-top: 8px;
381
- font-family: 'SFMono-Regular', Consolas, 'Courier New', monospace;
382
- font-size: 0.78em; color: #374151; white-space: pre-wrap; word-break: break-all;
383
- max-height: 200px; overflow-y: auto;
384
- }}
385
- @media (max-width: 640px) {{
386
- .dc-header {{ flex-direction: column; gap: 8px; align-items: flex-start; }}
387
- .dc-row {{ flex-direction: column; gap: 4px; }}
388
- }}
389
- </style>
390
- """
391
- h = css + '<div class="dc-root">'
392
- h += ('<div class="dc-header">'
393
- '<span class="dc-header-title">LocateAnything Decoding Trace</span>'
394
- '<div class="dc-legend">'
395
- '<div class="dc-legend-item"><span class="dc-legend-dot" style="background:#76b900;"></span>MTP &mdash; Parallel Box Decoding</div>'
396
- '<div class="dc-legend-item"><span class="dc-legend-dot" style="background:#e65100;"></span>AR &mdash; NTP Fallback (Re-decoding)</div>'
397
- '</div></div>')
398
- h += '<div class="dc-row"><div class="dc-val">'
399
- tok_idx = 0
400
- if token_sequence:
401
- for item in token_sequence:
402
- if not isinstance(item, (list, tuple)) or len(item) < 2:
403
- continue
404
- decode_type = str(item[0]).lower()
405
- text = str(item[1])
406
- safe = text.replace("<", "&lt;").replace(">", "&gt;")
407
- delay = f"{tok_idx * 0.06:.2f}s"
408
- cls = f"tk-ar-{uid}" if decode_type == "ar" else f"tk-mtp-{uid}"
409
- h += f'<span class="{cls}" style="animation-delay:{delay}">{safe}</span> '
410
- tok_idx += 1
411
- h += '</div></div>'
412
- if out_info:
413
- stats = _parse_out_info_dict(out_info)
414
- bits = []
415
- if "forward_step" in stats: bits.append(f"{stats['forward_step']} steps")
416
- if "num_tokens" in stats: bits.append(f"{stats['num_tokens']} tokens")
417
- if "num_boxes" in stats: bits.append(f"{stats['num_boxes']} boxes")
418
- if "switch_to_ar" in stats:
419
- n = stats["switch_to_ar"]
420
- bits.append(f"{n} AR Fallback{'s' if n != '1' else ''}")
421
- if "ar_step" in stats: bits.append(f"{stats['ar_step']} AR steps")
422
- if "tps" in stats: bits.append(f"{stats['tps']} tok/s")
423
- if "bps" in stats: bits.append(f"{stats['bps']} box/s")
424
- summary = " &middot; ".join(bits) if bits else out_info.strip()
425
- stat_delay = f"{tok_idx * 0.06 + 0.3:.2f}s"
426
- h += (f'<div class="dc-row" style="justify-content:flex-end;padding-top:4px;padding-bottom:10px;border-bottom:none;">'
427
- f'<span class="tk-stat-{uid}" style="animation-delay:{stat_delay}">⚡ {summary}</span></div>')
428
- if raw_text:
429
- safe_raw = raw_text.replace("<", "&lt;").replace(">", "&gt;")
430
- h += (f'<div class="dc-raw"><details><summary>📄 Show Raw Response</summary>'
431
- f'<div class="dc-raw-pre">{safe_raw}</div></details></div>')
432
- h += '</div>'
433
- return h
434
-
435
-
436
  def generate_raw_prompt(task_type, category):
437
  if not category:
438
  category = "objects"
@@ -454,123 +327,21 @@ def generate_raw_prompt(task_type, category):
454
  # ============================================================
455
  # 模型初始化
456
  # ============================================================
457
- try:
458
- MODEL_PATH = os.environ.get("MODEL_PATH", "woshichaoren123/test001")
459
- GLOBAL_WORKER = EagleWorker(MODEL_PATH)
460
- except Exception as e:
461
- print(f"Failed to load model: {e}. Will run in Mock Mode.")
462
- GLOBAL_WORKER = None
463
-
464
 
465
- # ============================================================
466
- # 用户数据收集(HuggingFace Public Dataset)
467
- #
468
- # 策略:one-record-per-file,配合按天目录 + 容器级 SESSION_ID。
469
- # 这样可以解决两个问题:
470
- # 1. 容器被回收时,本地 ephemeral 目录被清空。原来所有 session
471
- # 写同一个 logs_<date>.jsonl,新容器起来后会用空文件把 dataset 里
472
- # 旧的同名文件覆盖掉,造成数据丢失。
473
- # 2. 每次 commit 都要重传整份 LFS(appended 文件 hash 变了),浪费带宽。
474
- #
475
- # 现在每条记录写成独立的 JSONL 文件:
476
- # data/<date>/<SESSION_ID>__<entry_id>.jsonl
477
- # CommitScheduler 只会“新增”文件,永远不会覆盖其它 session 的数据;
478
- # 单文件上传后即被封存,不会重复上传。
479
- # ============================================================
480
- LOG_DATASET_REPO = os.environ.get("LOG_DATASET_REPO", "woshichaoren123/log")
481
- _LOG_DIR = Path(tempfile.mkdtemp(prefix="hf_log_"))
482
- _SESSION_ID = uuid.uuid4().hex[:8]
483
- _log_scheduler = None
484
-
485
- if LOG_DATASET_REPO and LOG_HF_TOKEN:
486
- try:
487
- _log_scheduler = CommitScheduler(
488
- repo_id=LOG_DATASET_REPO,
489
- repo_type="dataset",
490
- folder_path=str(_LOG_DIR),
491
- path_in_repo="data",
492
- every=3,
493
- token=LOG_HF_TOKEN,
494
- squash_history=False,
495
- )
496
- print(f"[LOG] Dataset logging enabled → {LOG_DATASET_REPO} "
497
- f"(session={_SESSION_ID}, dir={_LOG_DIR})")
498
- except Exception as e:
499
- _log_scheduler = None
500
- print(f"[LOG] Dataset logging disabled: {e}")
501
- else:
502
- print("[LOG] Dataset logging disabled (LOG_HF_TOKEN not set)")
503
-
504
-
505
- def _pil_to_b64(pil_img):
506
- """将 PIL 图片无损转为 PNG base64 字符串。"""
507
- buf = io.BytesIO()
508
- pil_img.save(buf, "PNG")
509
- return base64.b64encode(buf.getvalue()).decode("ascii")
510
-
511
-
512
- def _atomic_write_text(path: Path, text: str):
513
- """原子写入:先写临时文件再 rename,避免 CommitScheduler 读到半截文件。"""
514
- tmp_path = path.with_name(path.name + ".tmp")
515
- with open(tmp_path, "w", encoding="utf-8") as f:
516
- f.write(text)
517
- os.replace(tmp_path, path)
518
-
519
-
520
- def _log_to_dataset(
521
- input_type, category, model_mode, raw_prompt,
522
- output_text="", input_image=None, output_image=None,
523
- extra=None,
524
- ):
525
- """每条记录写到独立的 JSONL 文件,按日期分目录、文件名包含 session_id。
526
-
527
- 最终落盘路径(也是 dataset 里的路径):
528
- data/<YYYY-MM-DD>/<session_id>__<entry_id>.jsonl
529
- """
530
- if _log_scheduler is None:
531
- return
532
- try:
533
- entry_id = f"{int(time.time())}_{uuid.uuid4().hex[:6]}"
534
- ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
535
- date_str = time.strftime("%Y-%m-%d", time.gmtime())
536
-
537
- input_b64 = None
538
- if input_image is not None and isinstance(input_image, Image.Image):
539
- input_b64 = _pil_to_b64(input_image)
540
-
541
- output_b64 = None
542
- if output_image is not None and isinstance(output_image, Image.Image):
543
- output_b64 = _pil_to_b64(output_image)
544
-
545
- record = {
546
- "id": entry_id,
547
- "session_id": _SESSION_ID,
548
- "timestamp": ts,
549
- "input_type": input_type,
550
- "category": category,
551
- "model_mode": model_mode,
552
- "raw_prompt": raw_prompt,
553
- "output_text": output_text,
554
- "input_image_b64": input_b64,
555
- "output_image_b64": output_b64,
556
- }
557
- if extra:
558
- record.update(extra)
559
-
560
- day_dir = _LOG_DIR / date_str
561
- day_dir.mkdir(parents=True, exist_ok=True)
562
- log_file = day_dir / f"{_SESSION_ID}__{entry_id}.jsonl"
563
-
564
- payload = json.dumps(record, ensure_ascii=False) + "\n"
565
- with _log_scheduler.lock:
566
- _atomic_write_text(log_file, payload)
567
- except Exception as e:
568
- print(f"[LOG] Failed to log to dataset: {e}")
569
 
570
 
571
- # ============================================================
572
- # 公用预处理
573
- # ============================================================
574
  def _prepare_image_for_model(pil_img, short_size):
575
  process_img = pil_img.copy()
576
  if short_size is not None and short_size > 0:
@@ -582,104 +353,77 @@ def _prepare_image_for_model(pil_img, short_size):
582
 
583
 
584
  # ============================================================
585
- # GPU 时间预算常量(按模式区分)
586
  # ============================================================
587
- GPU_HARD_LIMIT_IMAGE = 30 # Image 模式 @spaces.GPU(duration=...)
588
- GPU_HARD_LIMIT_VIDEO = 240 # Video 模式 @spaces.GPU(duration=...)
589
- PHASE2_RESERVE = 55 # 留给 Phase 2(绘制 + ffmpeg)的秒数
590
- SAFETY_MARGIN = 25 # 额外安全裕量,永远不会触碰硬限制
591
- INFERENCE_BUDGET = GPU_HARD_LIMIT_VIDEO - PHASE2_RESERVE - SAFETY_MARGIN
592
- EST_SECONDS_PER_FRAME = 20 # 保守估计:每帧推理耗时
593
 
594
 
595
- # ============================================================
596
- # ✅ 图像推理(独立函数)
597
- # ============================================================
598
- def _run_image_inference(
599
- image_in, categories_list, category_str,
600
- model_mode, temp, top_p, top_k, short_size, question_override,
601
- progress=None, # 接收 progress
602
  ):
603
- if image_in is None:
604
- return (
605
- gr.update(value=None, visible=True),
606
- gr.update(value=None, visible=False),
607
- "<p style='color:#ef4444;padding:12px;'>⚠️ Please upload an image first.</p>",
608
- )
609
-
610
- if progress is not None: # 进度提示
611
- progress(0.1, desc="Preprocessing image ...")
612
 
613
  process_img = _prepare_image_for_model(image_in, short_size)
614
 
615
- if progress is not None:
616
- progress(0.2, desc="Running model inference ...")
617
-
618
- if GLOBAL_WORKER:
619
- output_text, token_sequence, out_info = GLOBAL_WORKER.generate(
620
  process_img, categories_list, model_mode,
621
  temp=temp, top_p=top_p, top_k=top_k,
622
  question_override=question_override,
623
  )
624
  else:
625
- output_text, token_sequence, out_info = "", [], ""
626
-
627
- if progress is not None:
628
- progress(0.8, desc="Drawing results ...")
629
 
630
  detections = parse_mixed_results(output_text, category_str)
631
  frame_bgr = cv2.cvtColor(np.array(image_in), cv2.COLOR_RGB2BGR)
632
  out_img_bgr = draw_on_frame(frame_bgr, detections, draw_label=True)
633
  output_image = Image.fromarray(cv2.cvtColor(out_img_bgr, cv2.COLOR_BGR2RGB))
634
- html = generate_dynamic_html(token_sequence, out_info, output_text)
635
 
636
- _log_to_dataset(
637
- input_type="image",
638
- category=", ".join(categories_list),
639
- model_mode=model_mode,
640
- raw_prompt=question_override or category_str,
641
- output_text=output_text,
642
- input_image=image_in,
643
- output_image=output_image,
644
- )
645
 
646
- if progress is not None:
647
- progress(1.0, desc="Done!")
648
 
649
- return (
650
- gr.update(value=output_image, visible=True),
651
- gr.update(value=None, visible=False),
652
- html,
653
- )
 
 
 
654
 
 
655
 
656
- # ============================================================
657
- # ✅ 视频推理(独立函数 — 带完整超时保护)
658
- # ============================================================
659
- def _run_video_inference(
660
- video_in, categories_list, category_str,
661
- model_mode, temp, top_p, top_k, short_size, question_override,
662
- max_video_frames, # 可调帧数
663
- progress=None, # 接收 progress
664
  ):
665
  import subprocess as _sp
666
 
667
- if video_in is None:
668
- return (
669
- gr.update(value=None, visible=False),
670
- gr.update(value=None, visible=True),
671
- "<p style='color:#ef4444;padding:12px;'>⚠️ Please upload a video first.</p>",
672
- )
673
-
674
  total_start = time.time()
675
  max_frames = int(max_video_frames) if max_video_frames else 4
676
 
677
- if progress is not None:
678
- progress(0.0, desc="Reading video ...")
679
 
680
- # ---------- 读取视频 ----------
681
- t0 = time.time()
682
- cap = cv2.VideoCapture(video_in)
683
  fps = cap.get(cv2.CAP_PROP_FPS)
684
  vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
685
  vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -692,212 +436,103 @@ def _run_video_inference(
692
  all_frames.append(frame)
693
  cap.release()
694
  total = len(all_frames)
695
- read_elapsed = time.time() - t0
696
- print(f"[TIMING] Video read: {read_elapsed:.2f}s, total frames={total}, "
697
- f"resolution={vid_w}x{vid_h}, fps={fps:.1f}")
698
 
699
  if total == 0:
700
- return (
701
- gr.update(value=None, visible=False),
702
- gr.update(value=None, visible=True),
703
- "<p style='color:#ef4444;padding:12px;'>⚠️ Failed to read any frames from the video.</p>",
704
- )
705
 
706
- # ---------- 采样帧 ----------
707
  if total <= max_frames:
708
  sample_indices = list(range(total))
709
  else:
710
- sample_indices = [int(round(i * (total - 1) / (max_frames - 1)))
711
- for i in range(max_frames)]
712
 
713
  sampled_frames = [all_frames[i] for i in sample_indices]
714
  n_sampled = len(sampled_frames)
715
 
716
- # ============================================================
717
- # 🛡️ 预估检查:在开跑前判断能不能在 GPU 时间预算内跑完
718
- # ============================================================
719
  time_already_used = time.time() - total_start
720
  available_for_inference = GPU_HARD_LIMIT_VIDEO - time_already_used - PHASE2_RESERVE - SAFETY_MARGIN
721
  estimated_inference_time = n_sampled * EST_SECONDS_PER_FRAME
722
 
723
  if estimated_inference_time > available_for_inference:
724
- # 尝试自动缩减帧数
725
- max_feasible = max(0, int(available_for_inference // EST_SECONDS_PER_FRAME))
726
- print(f"[PRE-CHECK] Estimated {estimated_inference_time:.0f}s > budget {available_for_inference:.0f}s, "
727
- f"reducing from {n_sampled} to {max_feasible} frames")
728
-
729
- if max_feasible < 1:
730
- # 连 1 帧都跑不了,直接拒绝
731
- del all_frames
732
- gc.collect()
733
- return (
734
- gr.update(value=None, visible=False),
735
- gr.update(value=None, visible=True),
736
- "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
737
- "padding:16px;margin:8px 0;'>"
738
- "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
739
- "⚠️ Video too large to process</p>"
740
- f"<p style='color:#7f1d1d;margin:0;font-size:0.92em;'>"
741
- f"This video has <b>{total}</b> frames. "
742
- f"Even processing <b>1</b> sampled frame (~{EST_SECONDS_PER_FRAME}s) "
743
- f"would exceed the <b>{GPU_HARD_LIMIT_VIDEO}s</b> GPU time limit.<br><br>"
744
- "💡 <b>Suggestions:</b> use a shorter / lower-resolution video, "
745
- "or switch to <b>Image</b> mode with a single frame screenshot.</p></div>",
746
- )
747
-
748
- # 用缩减后的帧数重新采样
749
  if total <= max_feasible:
750
  sample_indices = list(range(total))
751
  else:
752
- sample_indices = [int(round(i * (total - 1) / (max_feasible - 1)))
753
- for i in range(max_feasible)]
754
  sampled_frames = [all_frames[i] for i in sample_indices]
755
  n_sampled = len(sampled_frames)
756
 
757
- # 释放原始帧列表,节省内存
758
  out_fps = max(1.0, n_sampled / (total / fps)) if fps > 0 else 5.0
759
  del all_frames
760
  gc.collect()
761
 
762
- print(f"[TIMING] Sampled {n_sampled} frames, output fps: {out_fps:.2f}")
763
-
764
- # ============================================================
765
- # 阶段一:推理(逐帧检查剩余时间)
766
- # ============================================================
767
- print("=" * 60)
768
- print("[PHASE 1] Starting model inference ...")
769
- print("=" * 60)
770
-
771
  inference_results = []
772
- phase1_start = time.time()
773
  processed_count = 0
774
  early_stopped = False
775
  early_stop_reason = ""
776
 
777
  for i, frame in enumerate(sampled_frames):
778
- # ---- 🛡️ 运行时时间检查:还够不够跑下一帧 + Phase 2?----
779
  elapsed_since_start = time.time() - total_start
780
  remaining_total = GPU_HARD_LIMIT_VIDEO - elapsed_since_start
781
 
782
  if remaining_total < PHASE2_RESERVE + SAFETY_MARGIN:
783
  early_stopped = True
784
- early_stop_reason = (
785
- f"GPU time budget is running out: "
786
- f"{elapsed_since_start:.0f}s used, only {remaining_total:.0f}s left "
787
- f"(need ≥{PHASE2_RESERVE}s for video encoding). "
788
- f"Successfully processed {processed_count}/{n_sampled} frames."
789
- )
790
- print(f"[⏰ EARLY STOP] {early_stop_reason}")
791
  break
792
 
793
- if progress is not None:
794
- progress(
795
- (i / n_sampled) * 0.85,
796
- desc=f"🧠 Inference: frame {i + 1}/{n_sampled} "
797
- f"(⏱️ {elapsed_since_start:.0f}s / {GPU_HARD_LIMIT_VIDEO}s) ...",
798
- )
799
-
800
- frame_t0 = time.time()
801
-
802
- # 预处理
803
- prep_t0 = time.time()
804
  pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
805
  process_img = _prepare_image_for_model(pil_img, short_size)
806
- prep_time = time.time() - prep_t0
807
 
808
- # 推理
809
- infer_t0 = time.time()
810
- if GLOBAL_WORKER:
811
- output_text, _, _ = GLOBAL_WORKER.generate(
812
  process_img, categories_list, model_mode,
813
  temp=temp, top_p=top_p, top_k=top_k,
814
  question_override=question_override,
815
  )
816
  else:
817
- output_text = ""
818
- infer_time = time.time() - infer_t0
819
 
820
  inference_results.append(output_text)
821
  processed_count += 1
822
-
823
- # 清理 GPU 缓存
824
- cleanup_t0 = time.time()
825
  if torch.cuda.is_available():
826
  torch.cuda.empty_cache()
827
  gc.collect()
828
- cleanup_time = time.time() - cleanup_t0
829
-
830
- total_frame_time = time.time() - frame_t0
831
- print(f"[PHASE 1] Frame {i + 1}/{n_sampled} done: "
832
- f"prep={prep_time:.2f}s, infer={infer_time:.2f}s, "
833
- f"cleanup={cleanup_time:.2f}s, total={total_frame_time:.2f}s")
834
- if torch.cuda.is_available():
835
- allocated = torch.cuda.memory_allocated() / 1024**3
836
- reserved = torch.cuda.memory_reserved() / 1024**3
837
- print(f" GPU mem: allocated={allocated:.2f}GB, reserved={reserved:.2f}GB")
838
 
839
- phase1_time = time.time() - phase1_start
840
- print(f"[PHASE 1] COMPLETE: {phase1_time:.2f}s for {processed_count} frames "
841
- f"({phase1_time / max(processed_count, 1):.2f}s/frame)")
842
-
843
- # 如果 1 帧都没处理完,返回错误
844
  if processed_count == 0:
845
- return (
846
- gr.update(value=None, visible=False),
847
- gr.update(value=None, visible=True),
848
- "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
849
- "padding:16px;margin:8px 0;'>"
850
- "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
851
- "⚠️ Could not process any frames</p>"
852
- "<p style='color:#7f1d1d;margin:0;font-size:0.92em;'>"
853
- "The GPU time limit was reached before even one frame could be processed. "
854
- "Please try a lower resolution video or use Image mode instead.</p></div>",
855
- )
856
 
857
- # 裁剪到实际处理过的帧
858
  sampled_frames_for_draw = sampled_frames[:processed_count]
859
  inference_results_for_draw = inference_results[:processed_count]
860
 
861
- # ============================================================
862
- # 阶段二:绘制 + 编码(只处理已推理完的帧)
863
- # ============================================================
864
- if progress is not None:
865
- progress(0.88, desc="🎨 Drawing & encoding video ...")
866
-
867
- print("=" * 60)
868
- print(f"[PHASE 2] Drawing & video encoding ({processed_count} frames) ...")
869
- print("=" * 60)
870
-
871
- phase2_start = time.time()
872
  tmp_raw = tempfile.mktemp(suffix=".raw.mp4")
873
  out_video_path = tempfile.mktemp(suffix=".mp4")
874
- out = cv2.VideoWriter(tmp_raw, cv2.VideoWriter_fourcc(*"mp4v"),
875
- out_fps, (vid_w, vid_h))
876
 
877
- for i, (frame, output_text) in enumerate(
878
- zip(sampled_frames_for_draw, inference_results_for_draw)):
879
- draw_t0 = time.time()
880
  detections = parse_mixed_results(output_text, category_str)
881
  valid_results = _postprocess_detections(detections, vid_w, vid_h)
882
  frame_to_draw = draw_on_frame(frame, valid_results, draw_label=True)
883
  out.write(frame_to_draw)
884
- draw_time = time.time() - draw_t0
885
- print(f"[PHASE 2] Frame {i + 1}/{processed_count}: "
886
- f"draw={draw_time:.3f}s, det={len(valid_results)}")
 
 
 
 
 
887
 
888
  out.release()
889
- phase2_draw_time = time.time() - phase2_start
890
 
891
- # ---- ffmpeg 重编码(如果还有时间的话) ----
892
  elapsed_now = time.time() - total_start
893
  remaining_now = GPU_HARD_LIMIT_VIDEO - elapsed_now
894
 
895
- if progress is not None:
896
- progress(0.95, desc="📦 Re-encoding with ffmpeg ...")
897
-
898
- ffmpeg_t0 = time.time()
899
  if remaining_now > 15:
900
- # 还有时间,用 ffmpeg 重编码(兼容性更好)
901
  try:
902
  ffmpeg_timeout = max(10, int(remaining_now - 5))
903
  _sp.run(
@@ -907,416 +542,135 @@ def _run_video_inference(
907
  check=True, capture_output=True, timeout=ffmpeg_timeout,
908
  )
909
  os.remove(tmp_raw)
910
- except Exception as ffmpeg_err:
911
- print(f"[PHASE 2] ffmpeg failed or timed out: {ffmpeg_err}, using raw file")
912
  if os.path.exists(tmp_raw):
913
  os.replace(tmp_raw, out_video_path)
914
  else:
915
- # 时间不够了,直接用 mp4v 原始文件
916
  os.replace(tmp_raw, out_video_path)
917
- print("[PHASE 2] Skipped ffmpeg re-encoding due to time constraint")
918
 
919
- ffmpeg_time = time.time() - ffmpeg_t0
920
  total_time = time.time() - total_start
 
 
 
 
 
 
 
 
921
 
922
- print("=" * 60)
923
- print(f"[TOTAL] {total_time:.2f}s | inference={phase1_time:.2f}s "
924
- f"draw={phase2_draw_time:.2f}s ffmpeg={ffmpeg_time:.2f}s "
925
- f"frames_done={processed_count}/{n_sampled}")
926
- print("=" * 60)
927
-
928
- # ---- 构建结果 HTML ----
929
- warning_html = ""
930
- if early_stopped:
931
- warning_html = (
932
- "<div style='background:#fefce8;border:1px solid #fde047;border-radius:8px;"
933
- "padding:14px;margin-bottom:12px;'>"
934
- "<p style='color:#a16207;font-weight:700;font-size:1.02em;margin:0 0 6px;'>"
935
- "⚡ Partial Result — Early Stop Due to GPU Time Limit</p>"
936
- f"<p style='color:#854d0e;margin:0;font-size:0.9em;'>{early_stop_reason}</p>"
937
- "<p style='color:#854d0e;margin:6px 0 0;font-size:0.88em;'>"
938
- "💡 <b>Tip:</b> Reduce <b>Max Video Frames</b> slider or use a shorter video "
939
- "to process all frames within the GPU budget.</p>"
940
- "</div>"
941
- )
942
-
943
- timing_summary = (
944
- f"Video: {total} total frames, sampled {n_sampled}, "
945
- f"processed {processed_count} | "
946
- f"Inference: {phase1_time:.1f}s ({phase1_time / max(processed_count, 1):.1f}s/frame) | "
947
- f"Drawing: {phase2_draw_time:.1f}s | ffmpeg: {ffmpeg_time:.1f}s | "
948
- f"Total: {total_time:.1f}s / {GPU_HARD_LIMIT_VIDEO}s budget"
949
- )
950
- html = warning_html + generate_dynamic_html(
951
- token_sequence=[], out_info="", raw_text=timing_summary)
952
-
953
- try:
954
- thumb = Image.fromarray(
955
- cv2.cvtColor(sampled_frames_for_draw[0], cv2.COLOR_BGR2RGB))
956
- except Exception:
957
- thumb = None
958
- _log_to_dataset(
959
- input_type="video",
960
- category=", ".join(categories_list),
961
- model_mode=model_mode,
962
- raw_prompt=question_override or category_str,
963
- output_text="\n---\n".join(inference_results_for_draw),
964
- input_image=thumb,
965
- extra={
966
- "video_total_frames": total,
967
- "video_sampled_frames": n_sampled,
968
- "video_processed_frames": processed_count,
969
- },
970
- )
971
-
972
- if progress is not None:
973
- progress(1.0, desc="Done!")
974
-
975
- return (
976
- gr.update(value=None, visible=False),
977
- gr.update(value=out_video_path, visible=True),
978
- html,
979
- )
980
 
981
 
982
  # ============================================================
983
- # 🛡️ Main entry points: each mode gets its own GPU time budget
984
  # ============================================================
985
-
986
def _build_error_html(e, gpu_limit, input_type):
    """Convert an exception into the friendly three-output error tuple.

    Prints the active traceback, classifies the failure as a timeout when the
    message mentions a time limit, and renders a red HTML notice.

    Args:
        e: The exception caught by the caller.
        gpu_limit: GPU budget in seconds, quoted in the timeout message.
        input_type: "Image" or "Video"; decides which output component stays
            visible.

    Returns:
        ``(image_update, video_update, error_html)``; both updates clear
        their value.
    """
    import html as _html
    import traceback

    traceback.print_exc()

    error_type = type(e).__name__
    error_msg = str(e)

    lowered_msg = error_msg.lower()
    is_timeout = any(
        marker in lowered_msg
        for marker in ("timeout", "timelimit", "time limit", "duration")
    )

    if is_timeout:
        detail = (
            f"The GPU time limit ({gpu_limit}s) was exceeded before the result "
            "could be fully assembled. This typically happens with large videos."
        )
        suggestion = (
            "Please reduce <b>Max Video Frames</b>, use a shorter / smaller video, "
            "or switch to <b>Image</b> mode."
        )
    else:
        # Exception text is arbitrary (paths, model output, "<class ...>");
        # escape it so it cannot break or inject markup into the notice.
        detail = _html.escape(f"{error_type}: {error_msg}")
        suggestion = (
            "If the problem persists, try reducing video size or "
            "switching to Image mode."
        )

    error_html = (
        "<div style='background:#fef2f2;border:1px solid #fca5a5;border-radius:8px;"
        "padding:16px;margin:8px 0;'>"
        "<p style='color:#dc2626;font-weight:700;font-size:1.05em;margin:0 0 8px;'>"
        "⚠️ Processing interrupted</p>"
        f"<p style='color:#7f1d1d;margin:0 0 8px;font-size:0.92em;'>{detail}</p>"
        f"<p style='color:#7f1d1d;margin:0;font-size:0.88em;'>💡 {suggestion}</p>"
        "</div>"
    )

    return (
        gr.update(value=None, visible=(input_type == "Image")),
        gr.update(value=None, visible=(input_type == "Video")),
        error_html,
    )
1030
-
1031
-
1032
@spaces.GPU(duration=GPU_HARD_LIMIT_IMAGE)
def _run_image_gpu(
    image_in, category, model_mode, temp, top_p, top_k,
    short_size, question_override, progress,
):
    """GPU-scoped image entry point.

    Splits the comma-separated ``category`` string into labels, delegates to
    ``_run_image_inference`` and turns any exception into the error tuple
    built by ``_build_error_html``.
    """
    try:
        labels = []
        for piece in category.split(","):
            cleaned = piece.strip()
            if cleaned:
                labels.append(cleaned)
        joined_labels = "</c>".join(labels)
        outcome = _run_image_inference(
            image_in, labels, joined_labels,
            model_mode, temp, top_p, top_k, short_size, question_override,
            progress=progress,
        )
    except Exception as err:
        outcome = _build_error_html(err, GPU_HARD_LIMIT_IMAGE, "Image")
    return outcome
1047
-
1048
-
1049
@spaces.GPU(duration=GPU_HARD_LIMIT_VIDEO)
def _run_video_gpu(
    video_in, category, model_mode, temp, top_p, top_k,
    short_size, question_override, max_video_frames, progress,
):
    """GPU-scoped video entry point.

    Splits the comma-separated ``category`` string into labels, delegates to
    ``_run_video_inference`` and turns any exception into the error tuple
    built by ``_build_error_html``.
    """
    try:
        labels = []
        for piece in category.split(","):
            cleaned = piece.strip()
            if cleaned:
                labels.append(cleaned)
        joined_labels = "</c>".join(labels)
        outcome = _run_video_inference(
            video_in, labels, joined_labels,
            model_mode, temp, top_p, top_k, short_size, question_override,
            max_video_frames=max_video_frames,
            progress=progress,
        )
    except Exception as err:
        outcome = _build_error_html(err, GPU_HARD_LIMIT_VIDEO, "Video")
    return outcome
1065
-
1066
-
1067
def run_inference(
    input_type, image_in, video_in, task_type, category,
    model_mode, temp, top_p, top_k, short_size, question_override,
    max_video_frames,
    progress=gr.Progress(track_tqdm=False),
):
    """Dispatch a request to the image or video GPU entry point.

    ``input_type == "Image"`` selects ``_run_image_gpu``; any other value
    selects ``_run_video_gpu``. ``task_type`` is accepted but not forwarded.
    """
    shared_args = (
        category, model_mode, temp, top_p, top_k,
        short_size, question_override,
    )
    if input_type != "Image":
        return _run_video_gpu(video_in, *shared_args, max_video_frames, progress)
    return _run_image_gpu(image_in, *shared_args, progress)
1083
-
1084
-
1085
- # ============================================================
1086
- # Run-button state
1087
- # ============================================================
1088
def _disable_run_btn():
    """Return the update that greys out the run button and shows a busy label."""
    busy_state = {"interactive": False, "value": "⏳ Running ..."}
    return gr.update(**busy_state)
1090
-
1091
-
1092
def _enable_run_btn():
    """Return the update that re-enables the run button with its idle label."""
    idle_state = {"interactive": True, "value": "🧠 Run Inference"}
    return gr.update(**idle_state)
1094
-
1095
 
1096
- # ============================================================
1097
- # Examples
1098
- # ============================================================
1099
# Gallery examples. Every entry is an image example (no video); the tuple
# columns are: caption, image path relative to this file, task, category, mode.
EXAMPLE_CONFIGS = [
    {
        "name": name,
        "input_type": "Image",
        "image": image,
        "video": None,
        "task": task,
        "category": category,
        "mode": mode,
    }
    for name, image, task, category, mode in (
        ("Book", "./assets/book.jpg", "Detection", "book", "hybrid"),
        ("Sweet", "./assets/sweet.jpg", "Detection", "sweet", "hybrid"),
        ("Person", "./assets/person.jpg", "Detection", "person", "hybrid"),
        ("OCR", "./assets/ocr.jpg", "OCR", "text", "fast"),
    )
]
1109
-
1110
-
1111
def prepare_gallery_data():
    """Build the parallel image and caption lists for the examples gallery.

    Returns:
        ``(images, captions)``. Each image is the normalized path of the
        example file when it exists on disk, otherwise a black 200x200
        placeholder; each caption is the example's ``name``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    pictures = []
    captions = []
    for entry in EXAMPLE_CONFIGS:
        relative = entry["image"]
        resolved = os.path.normpath(os.path.join(here, relative)) if relative else None
        usable = bool(resolved) and os.path.exists(resolved)
        # Keep the gallery aligned with EXAMPLE_CONFIGS even when a file is missing.
        pictures.append(
            resolved if usable else Image.new("RGB", (200, 200), color="black")
        )
        captions.append(entry["name"])
    return pictures, captions
1123
-
1124
-
1125
def update_example_selection(evt: gr.SelectData):
    """Fill the form from the gallery example the user clicked.

    Returns, in order: input type, image update, video update, task,
    category and inference mode of ``EXAMPLE_CONFIGS[evt.index]``.
    """
    chosen = EXAMPLE_CONFIGS[evt.index]
    here = os.path.dirname(os.path.abspath(__file__))

    def _absolute(relative):
        # Entries without a file (None / empty) stay None.
        if not relative:
            return None
        return os.path.normpath(os.path.join(here, relative))

    kind = chosen["input_type"]
    image_update = gr.update(value=_absolute(chosen["image"]), visible=(kind == "Image"))
    video_update = gr.update(value=_absolute(chosen["video"]), visible=(kind == "Video"))
    return (
        kind,
        image_update,
        video_update,
        chosen["task"], chosen["category"], chosen["mode"],
    )
1138
-
1139
-
1140
- # ============================================================
1141
- # UI
1142
- # ============================================================
1143
def create_demo():
    """Build the Gradio Blocks UI, wire its events, and return it.

    Layout: a header row, a settings column next to an input/output column,
    and an examples gallery at the bottom.

    NOTE(review): the nesting of the ``with`` blocks below was reconstructed
    from a rendering that had lost its indentation — confirm against the
    original file before relying on it.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # Custom green palette used for both the primary and secondary hue.
    nv_green = gr.themes.Color(
        c50="#f7fbe8", c100="#eef7d1", c200="#ddf0a3",
        c300="#cce875", c400="#a4d422", c500="#76b900",
        c600="#649d00", c700="#527f00", c800="#3f6200",
        c900="#2d4400", c950="#1a2700",
    )
    with gr.Blocks(
        theme=gr.themes.Soft(primary_hue=nv_green, secondary_hue=nv_green),
        title="LocateAnything",
    ) as demo:
        # ===== Header: title + quick start on the left, resolution note on the right =====
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("# 🚀 LocateAnything")
                gr.Markdown(
                    "> **Locate any object in images or videos with natural language.** \n"
                    "> Upload an image/video on the left, choose a task type, enter what you want to find, "
                    "then click **Run Inference**. Results with bounding boxes will appear on the right.\n"
                    ">\n"
                    "> **Quick Start:** "
                    "① Select *Image* or *Video* → "
                    "② Pick a *Task Type* (Detection / Grounding / OCR / GUI / Pointing) → "
                    "③ Type your *Categories* (comma-separated) → "
                    "④ Click **🧠 Run Inference**"
                )
            with gr.Column(scale=1):
                gr.Markdown(
                    "> ⚠️ **Note:** `magi-attention` cannot be installed in this Hugging Face Space, "
                    "so inputs larger than 1K are resized to 1K in this demo.\n"
                    ">\n"
                    "> For full-resolution inference, please download the weights and run the model locally."
                )

        with gr.Row():
            # ===== COL 1: Settings =====
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Settings")
                input_type = gr.Radio(
                    ["Image", "Video"], label="1. Input Media Type", value="Image",
                    info="Select whether to process a single image or a video clip.",
                )
                task_dropdown = gr.Dropdown(
                    choices=["Detection", "Grounding", "OCR", "GUI", "Pointing"],
                    value="Detection", label="2. Task Type",
                    info="Detection: find all instances | Grounding: match description | "
                         "OCR: extract text | GUI: locate UI element | Pointing: point to target",
                )
                category_input = gr.Textbox(
                    label="3. Categories",
                    value="car, bus, person, potted plant",
                    placeholder="e.g. car, person, dog (comma-separated, supports Chinese)",
                    info="Enter one or more categories separated by commas. "
                         "Supports both English and Chinese (e.g. 汽车, 行人).",
                )
                model_dropdown = gr.Dropdown(
                    choices=["fast", "slow", "hybrid"],
                    value="hybrid", label="4. Inference Mode",
                    info="fast: MTP parallel decoding | slow: standard AR decoding | "
                         "hybrid: auto-switch for best quality-speed balance",
                )
                with gr.Accordion("5. Advanced Settings", open=False):
                    gr.Markdown(
                        "*Adjust these only if needed. Default values work well for most cases.*"
                    )
                    temp_slider = gr.Slider(
                        minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature",
                        info="Higher = more diverse results; lower = more deterministic.",
                    )
                    top_p_slider = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P",
                        info="Nucleus sampling threshold.",
                    )
                    top_k_slider = gr.Slider(
                        minimum=1, maximum=100, value=20, step=1, label="Top K",
                        info="Top-K sampling: number of highest probability tokens to consider.",
                    )
                    short_size_input = gr.Number(
                        label="Short Side Size (px)", value=None, precision=0,
                        info="Resize the short side of the image to this value before inference. "
                             "Leave empty to keep original size (auto-capped at 1024).",
                    )
                    max_video_frames_slider = gr.Slider(
                        minimum=1, maximum=10, value=4, step=1,
                        label="Max Video Frames",
                        info="Number of frames to sample from the video for inference. "
                             "Each frame takes ~15-20s. Keep ≤ 6 to avoid GPU timeout.",
                    )
                run_btn = gr.Button("🧠 Run Inference", variant="primary", size="lg")

            # ===== COL 2: Main =====
            with gr.Column(scale=3):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### 📥 Input Media")
                        # Only one of the two inputs is visible at a time
                        # (toggled by the input_type.change handler below).
                        image_input = gr.Image(
                            label="Input Image", type="pil", visible=True,
                        )
                        video_input = gr.Video(
                            label="Input Video",
                            visible=False,
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("### 📤 Output Result")
                        output_image = gr.Image(
                            label="Detection Result", type="pil", visible=True,
                        )
                        output_video = gr.Video(
                            label="Video Result", visible=False,
                        )

                gr.Markdown("### 📝 Raw Input Prompt")
                # Read-only box; its value is also passed to run_inference
                # as the question_override argument (see the click chain).
                raw_prompt_box = gr.Textbox(
                    value=generate_raw_prompt("Detection", "car, bus, person, potted plant"),
                    interactive=False, lines=2,
                    info="This is the prompt sent to the model (auto-generated from your settings above).",
                )
                gr.Markdown("### 🔍 Decoding Visualization")
                raw_output_box = gr.HTML(label="Decoding Steps")

        # ===== EXAMPLES =====
        gr.Markdown("---")
        gr.Markdown(
            "## 🖼️ Examples\n"
            "Click any example below to auto-fill the settings and input image."
        )
        gallery_images, gallery_captions = prepare_gallery_data()
        example_gallery = gr.Gallery(
            value=list(zip(gallery_images, gallery_captions)),
            show_label=True, columns=4, rows=1, height="auto", allow_preview=False,
        )

        # ===== EVENTS =====
        # Show the image input for "Image" and the video input for "Video".
        input_type.change(
            fn=lambda c: (gr.update(visible=(c == "Image")), gr.update(visible=(c == "Video"))),
            inputs=input_type, outputs=[image_input, video_input],
        )

        # Regenerate the raw prompt whenever the task or the categories change.
        for comp in [task_dropdown, category_input]:
            comp.change(
                fn=generate_raw_prompt,
                inputs=[task_dropdown, category_input],
                outputs=raw_prompt_box,
            )

        # Click chain: lock the button, run inference, then unlock the button.
        run_btn.click(
            fn=_disable_run_btn,
            inputs=None,
            outputs=[run_btn],
        ).then(
            fn=run_inference,
            inputs=[
                input_type, image_input, video_input,
                task_dropdown, category_input, model_dropdown,
                temp_slider, top_p_slider, top_k_slider,
                short_size_input, raw_prompt_box,
                max_video_frames_slider,
            ],
            outputs=[output_image, output_video, raw_output_box],
        ).then(
            fn=_enable_run_btn,
            inputs=None,
            outputs=[run_btn],
        )

        # Selecting an example fills the form, then refreshes the raw prompt.
        example_gallery.select(
            fn=update_example_selection,
            outputs=[input_type, image_input, video_input,
                     task_dropdown, category_input, model_dropdown],
        ).then(
            fn=generate_raw_prompt,
            inputs=[task_dropdown, category_input],
            outputs=raw_prompt_box,
        )

    return demo
1318
 
1319
 
1320
if __name__ == "__main__":
    # Script entry point: build the Blocks UI and launch it with debug=True.
    demo = create_demo()
    demo.launch(debug=True)
 
1
  #!/usr/bin/env python
2
  # -*- coding: utf-8 -*-
3
+ import spaces # MUST BE THE ABSOLUTE FIRST IMPORT FOR ZEROGPU EMULATION
4
+
5
  import gradio as gr
6
+ from gradio import Server
7
+ from gradio.data_classes import FileData
8
+ from fastapi.responses import HTMLResponse
9
+ from fastapi.staticfiles import StaticFiles
10
+
11
  import cv2
12
  import numpy as np
13
  import os
14
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
  import tempfile
16
  import re
17
  import time
 
21
  import json
22
  import uuid
23
  from pathlib import Path
24
+ from typing import Any
25
 
26
  import torch
27
  from PIL import Image, ImageDraw, ImageFont
28
  from transformers import AutoProcessor, AutoModel, AutoTokenizer
 
 
 
29
 
30
  _FONT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "LXGWWenKai-Bold.ttf")
31
 
32
# Optional Hugging Face token: the first non-empty of these environment
# variables wins; if none is set to a non-empty value, the last lookup's
# result (None or "") is kept.
HF_TOKEN = None
for _token_var in ("HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "MODEL_HF_TOKEN"):
    HF_TOKEN = os.environ.get(_token_var)
    if HF_TOKEN:
        break
del _token_var
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def _load_font(size=20):
36
  """加载中文字体(LXGW WenKai),需提前放置到 assets/ 目录"""
 
207
  self.device = device
208
  self.dtype = torch.bfloat16
209
  self.generation_mode = generation_mode
 
210
  self.tokenizer = AutoTokenizer.from_pretrained(
211
  model_path,
212
  trust_remote_code=True,
213
+ token=HF_TOKEN if HF_TOKEN else None,
214
  )
215
  self.processor = AutoProcessor.from_pretrained(
216
  model_path,
217
  trust_remote_code=True,
218
+ token=HF_TOKEN if HF_TOKEN else None,
219
  )
220
  self.model = AutoModel.from_pretrained(
221
  model_path,
222
  torch_dtype=self.dtype,
223
  _attn_implementation="sdpa",
224
  trust_remote_code=True,
225
+ token=HF_TOKEN if HF_TOKEN else None,
226
  ).to(device).eval()
227
  print("Model Loaded Successfully!")
228
 
 
272
 
273
 
274
  # ============================================================
275
+ # Post-processing
276
  # ============================================================
277
  def _postprocess_detections(detections, w, h):
278
  valid = []
 
306
  return stats
307
 
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  def generate_raw_prompt(task_type, category):
310
  if not category:
311
  category = "objects"
 
327
  # ============================================================
328
  # Model initialization
329
  # ============================================================
330
# Lazily created model worker shared by all requests (see get_worker).
GLOBAL_WORKER = None


def get_worker():
    """Return the shared ``EagleWorker``, creating it on first use.

    The model path comes from the ``MODEL_PATH`` environment variable and
    defaults to ``nvidia/LocateAnything-3B``. If construction raises, the
    error is printed and ``None`` is returned, which callers treat as mock
    mode; the next call attempts the load again.
    """
    global GLOBAL_WORKER
    if GLOBAL_WORKER is not None:
        return GLOBAL_WORKER
    try:
        model_path = os.environ.get("MODEL_PATH", "nvidia/LocateAnything-3B")
        print(f"Loading model inside @spaces.GPU context: {model_path}")
        GLOBAL_WORKER = EagleWorker(model_path)
    except Exception as load_error:
        print(f"Failed to load model: {load_error}. Will run in Mock Mode.")
        GLOBAL_WORKER = None
    return GLOBAL_WORKER
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
 
 
 
 
345
  def _prepare_image_for_model(pil_img, short_size):
346
  process_img = pil_img.copy()
347
  if short_size is not None and short_size > 0:
 
353
 
354
 
355
  # ============================================================
356
+ # GPU time budget and inference guards (per input mode)
357
  # ============================================================
358
# All values are in seconds.
# NOTE(review): not referenced by the GPU entry points visible in this part of
# the file (run_image_gpu_api is decorated with duration=120) — confirm usage.
GPU_HARD_LIMIT_IMAGE = 30
# Total budget that run_video_gpu_api measures its elapsed time against.
GPU_HARD_LIMIT_VIDEO = 240
# Held back from the video budget; presumably covers drawing and encoding.
PHASE2_RESERVE = 55
# Extra slack subtracted from the video budget together with PHASE2_RESERVE.
SAFETY_MARGIN = 25
# Per-frame inference estimate used to cap how many frames are sampled.
EST_SECONDS_PER_FRAME = 20
 
363
 
364
 
365
@spaces.GPU(duration=120, size="xlarge")
def run_image_gpu_api(
    image_path: str, category: str, model_mode: str, temp: float, top_p: float, top_k: int,
    short_size: int | None, question_override: str | None
):
    """Run detection on one image and save the annotated result to disk.

    Args:
        image_path: Path of the input image on the local filesystem.
        category: Comma-separated category names.
        model_mode: Generation mode forwarded to the worker.
        temp: Sampling temperature forwarded to the worker.
        top_p: Nucleus sampling threshold forwarded to the worker.
        top_k: Top-k sampling size forwarded to the worker.
        short_size: Optional short-side size used when preparing the image.
        question_override: Optional prompt forwarded to the worker.

    Returns:
        ``(out_img_path, stats, output_text, detections_summary)``: the path
        of the annotated PNG (inside a fresh temporary directory that the
        caller owns), the parsed generation statistics, the raw model text and
        a list of ``{"label", "type", "coords"}`` dicts.
    """
    # Decode inside a context manager so the source file handle is released
    # as soon as the pixels are loaded; convert() returns an independent image.
    with Image.open(image_path) as source_image:
        image_in = source_image.convert("RGB")
    categories_list = [c.strip() for c in category.split(",") if c.strip()]
    category_str = "</c>".join(categories_list)

    process_img = _prepare_image_for_model(image_in, short_size)

    worker = get_worker()
    if worker:
        output_text, _, out_info = worker.generate(
            process_img, categories_list, model_mode,
            temp=temp, top_p=top_p, top_k=top_k,
            question_override=question_override,
        )
    else:
        # Mock mode fallback: no model available, return canned output.
        output_text = "Mock detection: <ref>sweet</ref><box><240><480><620><940></box> and <ref>book</ref><box><50><120><400><380></box>"
        out_info = "forward_step=1;num_tokens=18;num_boxes=2;tps=45;bps=15"

    detections = parse_mixed_results(output_text, category_str)
    # Drawing happens on the original-resolution image, in OpenCV's BGR order.
    frame_bgr = cv2.cvtColor(np.array(image_in), cv2.COLOR_RGB2BGR)
    out_img_bgr = draw_on_frame(frame_bgr, detections, draw_label=True)
    output_image = Image.fromarray(cv2.cvtColor(out_img_bgr, cv2.COLOR_BGR2RGB))

    # Save to a temp file; the directory is intentionally left in place
    # because the returned path must outlive this call.
    temp_dir = tempfile.mkdtemp()
    out_img_path = os.path.join(temp_dir, "output.png")
    output_image.save(out_img_path)

    stats = _parse_out_info_dict(out_info)

    # Simplified, JSON-friendly view of the detections.
    detections_summary = [
        {
            "label": det.get("label", "object"),
            "type": det.get("type", "box"),
            "coords": [round(c, 2) for c in det.get("coords", [])],
        }
        for det in detections
    ]

    return out_img_path, stats, output_text, detections_summary
411
 
412
+
413
+ @spaces.GPU(duration=240, size="xlarge")
414
+ def run_video_gpu_api(
415
+ video_path: str, category: str, model_mode: str, temp: float, top_p: float, top_k: int,
416
+ short_size: int | None, question_override: str | None, max_video_frames: int
 
 
 
417
  ):
418
  import subprocess as _sp
419
 
 
 
 
 
 
 
 
420
  total_start = time.time()
421
  max_frames = int(max_video_frames) if max_video_frames else 4
422
 
423
+ categories_list = [c.strip() for c in category.split(",") if c.strip()]
424
+ category_str = "</c>".join(categories_list)
425
 
426
+ cap = cv2.VideoCapture(video_path)
 
 
427
  fps = cap.get(cv2.CAP_PROP_FPS)
428
  vid_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
429
  vid_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
 
436
  all_frames.append(frame)
437
  cap.release()
438
  total = len(all_frames)
 
 
 
439
 
440
  if total == 0:
441
+ raise ValueError("Failed to read any frames from the video.")
 
 
 
 
442
 
443
+ # Sample frames
444
  if total <= max_frames:
445
  sample_indices = list(range(total))
446
  else:
447
+ sample_indices = [int(round(i * (total - 1) / (max_frames - 1))) for i in range(max_frames)]
 
448
 
449
  sampled_frames = [all_frames[i] for i in sample_indices]
450
  n_sampled = len(sampled_frames)
451
 
452
+ # Budget check
 
 
453
  time_already_used = time.time() - total_start
454
  available_for_inference = GPU_HARD_LIMIT_VIDEO - time_already_used - PHASE2_RESERVE - SAFETY_MARGIN
455
  estimated_inference_time = n_sampled * EST_SECONDS_PER_FRAME
456
 
457
  if estimated_inference_time > available_for_inference:
458
+ max_feasible = max(1, int(available_for_inference // EST_SECONDS_PER_FRAME))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  if total <= max_feasible:
460
  sample_indices = list(range(total))
461
  else:
462
+ sample_indices = [int(round(i * (total - 1) / (max_feasible - 1))) for i in range(max_feasible)]
 
463
  sampled_frames = [all_frames[i] for i in sample_indices]
464
  n_sampled = len(sampled_frames)
465
 
 
466
  out_fps = max(1.0, n_sampled / (total / fps)) if fps > 0 else 5.0
467
  del all_frames
468
  gc.collect()
469
 
 
 
 
 
 
 
 
 
 
470
  inference_results = []
 
471
  processed_count = 0
472
  early_stopped = False
473
  early_stop_reason = ""
474
 
475
  for i, frame in enumerate(sampled_frames):
 
476
  elapsed_since_start = time.time() - total_start
477
  remaining_total = GPU_HARD_LIMIT_VIDEO - elapsed_since_start
478
 
479
  if remaining_total < PHASE2_RESERVE + SAFETY_MARGIN:
480
  early_stopped = True
481
+ early_stop_reason = f"GPU time budget running out. Only {remaining_total:.0f}s left."
 
 
 
 
 
 
482
  break
483
 
 
 
 
 
 
 
 
 
 
 
 
484
  pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
485
  process_img = _prepare_image_for_model(pil_img, short_size)
 
486
 
487
+ worker = get_worker()
488
+ if worker:
489
+ output_text, _, _ = worker.generate(
 
490
  process_img, categories_list, model_mode,
491
  temp=temp, top_p=top_p, top_k=top_k,
492
  question_override=question_override,
493
  )
494
  else:
495
+ output_text = f"Mock video detection: <ref>person</ref><box><100><150><800><900></box>"
 
496
 
497
  inference_results.append(output_text)
498
  processed_count += 1
499
+
 
 
500
  if torch.cuda.is_available():
501
  torch.cuda.empty_cache()
502
  gc.collect()
 
 
 
 
 
 
 
 
 
 
503
 
 
 
 
 
 
504
  if processed_count == 0:
505
+ raise RuntimeError("GPU budget exceeded before processing any frames.")
 
 
 
 
 
 
 
 
 
 
506
 
 
507
  sampled_frames_for_draw = sampled_frames[:processed_count]
508
  inference_results_for_draw = inference_results[:processed_count]
509
 
 
 
 
 
 
 
 
 
 
 
 
510
  tmp_raw = tempfile.mktemp(suffix=".raw.mp4")
511
  out_video_path = tempfile.mktemp(suffix=".mp4")
512
+ out = cv2.VideoWriter(tmp_raw, cv2.VideoWriter_fourcc(*"mp4v"), out_fps, (vid_w, vid_h))
 
513
 
514
+ detections_summary = []
515
+ for i, (frame, output_text) in enumerate(zip(sampled_frames_for_draw, inference_results_for_draw)):
 
516
  detections = parse_mixed_results(output_text, category_str)
517
  valid_results = _postprocess_detections(detections, vid_w, vid_h)
518
  frame_to_draw = draw_on_frame(frame, valid_results, draw_label=True)
519
  out.write(frame_to_draw)
520
+
521
+ for det in valid_results:
522
+ detections_summary.append({
523
+ "frame": i + 1,
524
+ "label": det.get("label", "object"),
525
+ "type": det.get("type", "box"),
526
+ "coords": det.get("coords", [])
527
+ })
528
 
529
  out.release()
 
530
 
531
+ # ffmpeg re-encode
532
  elapsed_now = time.time() - total_start
533
  remaining_now = GPU_HARD_LIMIT_VIDEO - elapsed_now
534
 
 
 
 
 
535
  if remaining_now > 15:
 
536
  try:
537
  ffmpeg_timeout = max(10, int(remaining_now - 5))
538
  _sp.run(
 
542
  check=True, capture_output=True, timeout=ffmpeg_timeout,
543
  )
544
  os.remove(tmp_raw)
545
+ except Exception:
 
546
  if os.path.exists(tmp_raw):
547
  os.replace(tmp_raw, out_video_path)
548
  else:
 
549
  os.replace(tmp_raw, out_video_path)
 
550
 
 
551
  total_time = time.time() - total_start
552
+ stats = {
553
+ "total_frames": total,
554
+ "sampled_frames": n_sampled,
555
+ "processed_frames": processed_count,
556
+ "total_time_seconds": round(total_time, 2),
557
+ "early_stopped": early_stopped,
558
+ "early_stop_reason": early_stop_reason
559
+ }
560
 
561
+ return out_video_path, stats, "\n---\n".join(inference_results_for_draw), detections_summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
 
563
 
564
  # ============================================================
565
+ # GRADIO SERVER APP
566
  # ============================================================
567
# Server instance on which the routes and API endpoints below are registered.
app = Server()

# Serve static assets folder
assets_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
# Mount only when an "assets" entry exists next to this file.
if os.path.exists(assets_dir):
    app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")
573
+
574
@app.get("/")
async def homepage():
    """Serve ``index.html`` from this file's directory.

    When the file is absent, a short HTML error heading is returned instead.
    """
    index_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
    if not os.path.exists(index_file):
        return HTMLResponse("<h1 style='color: #ef4444; font-family: Inter, sans-serif; text-align: center; margin-top: 100px;'>index.html is missing</h1>")
    with open(index_file, "r", encoding="utf-8") as handle:
        page = handle.read()
    return HTMLResponse(page)
581
+
582
+
583
def _resolve_media_path(file_ref):
    """Map an API file argument to a local filesystem path, or ``None``.

    Strings are treated as example paths relative to this file's directory and
    are rejected when they resolve outside of it. Dicts and other objects are
    expected to carry the path in a ``path`` entry / attribute.
    """
    if isinstance(file_ref, str):
        base_dir = os.path.realpath(os.path.dirname(os.path.abspath(__file__)))
        candidate = os.path.realpath(os.path.join(base_dir, file_ref))
        # The string comes straight from the API client: refuse absolute paths
        # and ".." sequences that would escape the application directory.
        try:
            inside = os.path.commonpath([base_dir, candidate]) == base_dir
        except ValueError:  # e.g. paths on different drives
            inside = False
        return candidate if inside else None
    if isinstance(file_ref, dict):
        # NOTE(review): the upload path is taken as-is; confirm the server
        # validates it before it reaches this endpoint.
        return file_ref.get("path")
    return getattr(file_ref, "path", None)


@app.api(name="run_inference")
def run_inference_api(
    input_type: str,
    image_file: Any = None,
    video_file: Any = None,
    task_type: str = "Detection",
    category: str = "objects",
    model_mode: str = "hybrid",
    temp: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 20,
    short_size: int | None = None,
    question_override: str | None = None,
    max_video_frames: int = 4
) -> tuple[FileData | None, FileData | None, dict]:
    """Exposed Gradio Queueing Endpoint for custom frontend interactions.

    ZeroGPU allocation is triggered directly at this endpoint boundary.
    Supports both FileData dict (from web uploads) and local strings (for examples).

    Args:
        input_type: "Image" selects the image path; anything else is video.
        image_file: Image reference (FileData-like dict/object or a string
            relative to this file's directory).
        video_file: Video reference, same forms as ``image_file``.
        task_type: Task used to build the prompt when no override is given.
        category: Comma-separated categories; empty falls back to "objects".
        model_mode: Generation mode forwarded to the GPU function.
        temp: Sampling temperature.
        top_p: Nucleus sampling threshold.
        top_k: Top-k sampling size.
        short_size: Optional short-side resize value.
        question_override: Prompt to use instead of the generated one.
        max_video_frames: Maximum number of video frames to sample.

    Returns:
        ``(image, video, meta)``. Exactly one of ``image`` / ``video`` is a
        ``FileData`` on success. ``meta["success"]`` is False and
        ``meta["error"]`` holds the message on any failure; exceptions are
        caught and reported this way rather than raised.
    """
    try:
        if not category:
            category = "objects"

        final_prompt = question_override or generate_raw_prompt(task_type, category)

        if input_type == "Image":
            if not image_file:
                return None, None, {"success": False, "error": "Please upload an image."}

            # Resolve image path (from either FileData upload or local example string)
            img_path = _resolve_media_path(image_file)
            if not img_path or not os.path.exists(img_path):
                return None, None, {"success": False, "error": f"Invalid image file path: {img_path}"}

            out_img_path, stats, raw_text, detections = run_image_gpu_api(
                img_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt
            )

            meta = {
                "success": True,
                "input_type": "Image",
                "stats": stats,
                "raw_text": raw_text,
                "detections": detections,
                "final_prompt": final_prompt
            }
            return FileData(path=out_img_path), None, meta

        if not video_file:
            return None, None, {"success": False, "error": "Please upload a video."}

        # Resolve video path (from either FileData upload or local example string)
        vid_path = _resolve_media_path(video_file)
        if not vid_path or not os.path.exists(vid_path):
            return None, None, {"success": False, "error": f"Invalid video file path: {vid_path}"}

        out_vid_path, stats, raw_text, detections = run_video_gpu_api(
            vid_path, category, model_mode, temp, top_p, top_k, short_size, final_prompt, max_video_frames
        )

        meta = {
            "success": True,
            "input_type": "Video",
            "stats": stats,
            "raw_text": raw_text,
            "detections": detections,
            "final_prompt": final_prompt
        }
        return None, FileData(path=out_vid_path), meta

    except Exception as e:
        # Endpoint boundary: log the traceback and report the failure in-band.
        import traceback
        traceback.print_exc()
        return None, None, {"success": False, "error": str(e)}
 
 
 
 
 
 
 
673
 
674
 
675
if __name__ == "__main__":
    # Script entry point: start the server with show_error=True.
    app.launch(show_error=True)
 
index.html ADDED
@@ -0,0 +1,926 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>NVIDIA LocateAnything - Fast Vision-Language Grounding</title>
7
+
8
+ <!-- Premium Google Fonts -->
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Outfit:wght@500;600;700;800;900&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
12
+
13
+ <!-- Tailwind CSS CDN -->
14
+ <script src="https://cdn.tailwindcss.com"></script>
15
+
16
+ <script>
17
+ tailwind.config = {
18
+ theme: {
19
+ extend: {
20
+ fontFamily: {
21
+ sans: ['Inter', 'sans-serif'],
22
+ outfit: ['Outfit', 'sans-serif'],
23
+ mono: ['Fira Code', 'monospace'],
24
+ },
25
+ colors: {
26
+ nvidia: {
27
+ light: '#76b900',
28
+ brand: '#76b900',
29
+ dark: '#5c9000',
30
+ hover: '#87d300',
31
+ },
32
+ dark: {
33
+ 50: '#222222',
34
+ 100: '#1a1a1a',
35
+ 200: '#121212',
36
+ 300: '#0a0a0a',
37
+ 400: '#050505',
38
+ }
39
+ }
40
+ }
41
+ }
42
+ }
43
+ </script>
44
+
45
+ <style>
46
+ body {
47
+ background-color: #050505;
48
+ background-image:
49
+ radial-gradient(circle at 10% 20%, rgba(118, 185, 0, 0.08) 0%, transparent 45%),
50
+ radial-gradient(circle at 90% 80%, rgba(99, 102, 241, 0.05) 0%, transparent 45%);
51
+ background-attachment: fixed;
52
+ }
53
+
54
+ /* NVIDIA-style Carbon Triangle Grid Pattern */
55
+ .carbon-grid {
56
+ background-image:
57
+ linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
58
+ linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
59
+ linear-gradient(30deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
60
+ linear-gradient(150deg, #0f0f0f 12%, transparent 12.5%, transparent 87%, #0f0f0f 87.5%, #0f0f0f),
61
+ linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717),
62
+ linear-gradient(60deg, #171717 25%, transparent 25.5%, transparent 75%, #171717 75.5%, #171717);
63
+ background-size: 80px 140px;
64
+ background-position: 0 0, 0 0, 40px 70px, 40px 70px, 0 0, 40px 70px;
65
+ }
66
+
67
+ /* Glassmorphism Styles */
68
+ .glass-panel {
69
+ background: rgba(18, 18, 18, 0.65);
70
+ backdrop-filter: blur(20px);
71
+ -webkit-backdrop-filter: blur(20px);
72
+ border: 1px solid rgba(255, 255, 255, 0.04);
73
+ box-shadow: 0 24px 64px 0 rgba(0, 0, 0, 0.7);
74
+ }
75
+
76
+ .glass-panel-interactive {
77
+ transition: all 0.4s cubic-bezier(0.16, 1, 0.3, 1);
78
+ }
79
+ .glass-panel-interactive:hover {
80
+ border-color: rgba(118, 185, 0, 0.25);
81
+ box-shadow: 0 30px 80px 0 rgba(118, 185, 0, 0.08);
82
+ transform: translateY(-2px);
83
+ }
84
+
85
+ /* SAM 3 Style Glassmorphic Float Input */
86
+ .sam-input-bar {
87
+ background: rgba(255, 255, 255, 0.06);
88
+ backdrop-filter: blur(25px);
89
+ -webkit-backdrop-filter: blur(25px);
90
+ border: 1px solid rgba(255, 255, 255, 0.08);
91
+ box-shadow: 0 16px 40px rgba(0, 0, 0, 0.5);
92
+ transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
93
+ }
94
+ .sam-input-bar:focus-within {
95
+ background: rgba(255, 255, 255, 0.09);
96
+ border-color: rgba(118, 185, 0, 0.6);
97
+ box-shadow: 0 20px 48px rgba(118, 185, 0, 0.15);
98
+ }
99
+
100
+ /* Hexagonal Glowing Border for Media Workspace (NVIDIA GTC Keynote Style) */
101
+ .gtc-polygon-wrapper {
102
+ position: relative;
103
+ background: #0f1218;
104
+ border: 1px solid rgba(118, 185, 0, 0.15);
105
+ box-shadow: 0 0 50px rgba(0, 0, 0, 0.8);
106
+ overflow: hidden;
107
+ clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
108
+ }
109
+ .gtc-polygon-wrapper::before {
110
+ content: '';
111
+ position: absolute;
112
+ top: 0;
113
+ left: 0;
114
+ width: 100%;
115
+ height: 100%;
116
+ border: 2px solid #76b900;
117
+ pointer-events: none;
118
+ clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
119
+ opacity: 0.8;
120
+ box-shadow: inset 0 0 20px rgba(118, 185, 0, 0.3);
121
+ }
122
+
123
+ .gtc-neon-border {
124
+ position: absolute;
125
+ top: -2px;
126
+ left: -2px;
127
+ right: -2px;
128
+ bottom: -2px;
129
+ background: linear-gradient(135deg, #76b900, #3f6200, #76b900);
130
+ z-index: 0;
131
+ pointer-events: none;
132
+ opacity: 0.95;
133
+ clip-path: polygon(8% 0%, 100% 0%, 100% 92%, 92% 100%, 0% 100%, 0% 8%);
134
+ }
135
+
136
+ .gtc-inner-box {
137
+ position: relative;
138
+ background: #080a0e;
139
+ z-index: 10;
140
+ height: 100%;
141
+ clip-path: polygon(8.1% 0.1%, 99.9% 0.1%, 99.9% 91.9%, 91.9% 99.9%, 0.1% 99.9%, 0.1% 8.1%);
142
+ }
143
+
144
+ /* Pill Buttons styling */
145
+ .pill-btn-green {
146
+ background-color: #76b900;
147
+ transition: all 0.3s cubic-bezier(0.16, 1, 0.3, 1);
148
+ }
149
+ .pill-btn-green:hover {
150
+ background-color: #87d300;
151
+ box-shadow: 0 0 24px rgba(118, 185, 0, 0.45);
152
+ transform: translateY(-1px);
153
+ }
154
+ .pill-btn-green:active {
155
+ transform: translateY(1px);
156
+ }
157
+
158
+ /* Custom Scrollbar */
159
+ ::-webkit-scrollbar {
160
+ width: 6px;
161
+ height: 6px;
162
+ }
163
+ ::-webkit-scrollbar-track {
164
+ background: #0a0a0a;
165
+ }
166
+ ::-webkit-scrollbar-thumb {
167
+ background: #222;
168
+ border-radius: 3px;
169
+ }
170
+ ::-webkit-scrollbar-thumb:hover {
171
+ background: #333;
172
+ }
173
+
174
+ /* Pulse loaders */
175
+ .dot-pulse {
176
+ animation: pulse 1.4s infinite ease-in-out;
177
+ }
178
+ @keyframes pulse {
179
+ 0%, 100% { opacity: 0.3; transform: scale(0.9); }
180
+ 50% { opacity: 1; transform: scale(1.1); }
181
+ }
182
+
183
+ .drop-zone-active {
184
+ border-color: #76b900 !important;
185
+ background: rgba(118, 185, 0, 0.04) !important;
186
+ }
187
+ </style>
188
+ </head>
189
+ <body class="text-slate-100 font-sans min-h-screen pb-16 carbon-grid">
190
+
191
+ <!-- NVIDIA Brand Navigation Header (Transparent dark blur) -->
192
+ <nav class="bg-black/40 backdrop-blur-md sticky top-0 z-50 px-6 py-3.5 border-b border-white/5 shadow-lg">
193
+ <div class="max-w-7xl mx-auto flex items-center justify-between">
194
+ <!-- Official Styled NVIDIA Brand Text Logo -->
195
+ <a href="#" class="flex items-center gap-1.5 select-none group">
196
+ <svg class="h-6 w-6 text-nvidia-brand transition-transform duration-500 group-hover:rotate-180" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5">
197
+ <path stroke-linecap="round" stroke-linejoin="round" d="M9 3v2m6-2v2M9 19v2m6-2v2M5 9H3m2 6H3m18-6h-2m2 6h-2M7 19h10a2 2 0 002-2V7a2 2 0 00-2-2H7a2 2 0 00-2 2v10a2 2 0 002 2z" />
198
+ </svg>
199
+ <span class="font-outfit text-[22px] font-black tracking-tighter text-white">
200
+ NVIDIA <span class="font-light tracking-wide text-slate-400">LocateAnything</span>
201
+ </span>
202
+ </a>
203
+
204
+ <span class="px-3 py-1 text-xs font-semibold rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 flex items-center gap-1.5 font-mono">
205
+ <span class="h-1.5 w-1.5 rounded-full bg-nvidia-brand animate-pulse"></span>
206
+ ZeroGPU Server
207
+ </span>
208
+ </div>
209
+ </nav>
210
+
211
+ <!-- MAIN MINIMAL LAYOUT CONTAINER -->
212
+ <main class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 pt-8 lg:pt-10 space-y-8">
213
+
214
+ <!-- Giant Showcase Container (SAM 3 Full-Bleed Style) -->
215
+ <div class="relative w-full rounded-[32px] overflow-hidden border border-white/5 bg-[#080a0e] shadow-2xl h-[580px] lg:h-[640px] flex select-none">
216
+
217
+ <!-- 1. Background Media Canvas (Coverage Layer) -->
218
+ <div class="absolute inset-0 z-0 flex items-center justify-center bg-black/40">
219
+ <!-- Drop Zone (Initially shown) -->
220
+ <div id="drop-zone" class="absolute inset-0 border-none rounded-none bg-transparent flex flex-col items-center justify-center p-4 text-center cursor-pointer transition-all z-10">
221
+ <div id="upload-prompt" class="space-y-3 opacity-60 hover:opacity-100 transition-opacity">
222
+ <div class="inline-flex h-12 w-12 rounded-full bg-white/5 items-center justify-center text-slate-300">
223
+ <svg class="h-6 w-6" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
224
+ <path stroke-linecap="round" stroke-linejoin="round" d="M4 16v1a3 3 0 003 3h10a3 3 0 003-3v-1m-4-8l-4-4m0 0L8 8m4-4v12" />
225
+ </svg>
226
+ </div>
227
+ <div>
228
+ <p class="text-xs font-bold text-slate-200">Drag & drop your file here</p>
229
+ <p class="text-[10px] text-slate-500 mt-1">or click to browse local folders</p>
230
+ </div>
231
+ </div>
232
+
233
+ <!-- Dynamic Preview Media -->
234
+ <img id="preview-image" src="" alt="Input Preview" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5">
235
+ <video id="preview-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl z-20 border border-white/5"></video>
236
+
237
+ <!-- File Input -->
238
+ <input type="file" id="media-file-input" accept="image/*,video/*" class="absolute inset-0 opacity-0 cursor-pointer z-30">
239
+ </div>
240
+
241
+ <!-- Inference Output Zone -->
242
+ <div class="absolute inset-0 pointer-events-none flex items-center justify-center z-20">
243
+ <img id="output-image" src="" alt="Inference Output" class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5">
244
+ <video id="output-video" src="" controls class="hidden max-h-full max-w-full rounded-2xl object-contain shadow-2xl pointer-events-auto border border-white/5"></video>
245
+ </div>
246
+
247
+ <!-- Processing Overlays -->
248
+ <div id="processing-overlay" class="absolute inset-0 bg-black/85 backdrop-blur-sm hidden flex-col items-center justify-center gap-4 z-40">
249
+ <div class="flex gap-1.5">
250
+ <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-nvidia-brand" style="animation-delay: 0s;"></span>
251
+ <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-400" style="animation-delay: 0.2s;"></span>
252
+ <span class="dot-pulse inline-block h-3 w-3 rounded-full bg-emerald-300" style="animation-delay: 0.4s;"></span>
253
+ </div>
254
+ <div class="text-center space-y-1">
255
+ <p id="processing-status" class="text-[11px] font-bold tracking-widest text-slate-200 uppercase">Executing Model...</p>
256
+ <p class="text-[9px] text-slate-500 uppercase tracking-wider font-mono">ZeroGPU Queue Active</p>
257
+ </div>
258
+ </div>
259
+ </div>
260
+
261
+ <!-- 2. Left Floating Overlay Panel (Title, simple selectors, accordion, and action buttons) -->
262
+ <div class="absolute left-6 lg:left-12 top-8 bottom-8 z-30 flex flex-col justify-between max-w-sm sm:max-w-md pointer-events-none">
263
+
264
+ <!-- Main Header Overlay text -->
265
+ <div class="space-y-3 pt-4 pointer-events-auto bg-gradient-to-b from-[#080a0e]/90 via-[#080a0e]/60 to-transparent p-4 rounded-2xl">
266
+ <span class="text-[9px] font-bold text-nvidia-brand uppercase tracking-widest block font-mono">AI Research from NVIDIA</span>
267
+ <h1 class="font-outfit text-3xl sm:text-5xl font-black tracking-tight text-white leading-none">
268
+ Locate<span class="text-nvidia-brand font-light">Anything</span>
269
+ </h1>
270
+ <p class="text-xs text-slate-400 max-w-sm font-medium leading-relaxed">
271
+ NVIDIA's advanced 3B vision-language model. Locate any object, UI target, or text in images and videos with natural language.
272
+ </p>
273
+ </div>
274
+
275
+ <!-- Setup Glass Card Controls -->
276
+ <div class="glass-panel rounded-2xl p-4 space-y-4 pointer-events-auto max-w-xs shadow-2xl">
277
+ <div class="grid grid-cols-2 gap-3">
278
+
279
+ <!-- Media Type toggle selection -->
280
+ <div class="space-y-1">
281
+ <label class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Media Type</label>
282
+ <div class="grid grid-cols-2 gap-0.5 bg-black/40 p-0.5 rounded-lg border border-white/5 text-center">
283
+ <button id="media-type-image" class="py-1 rounded-md font-semibold text-[9px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10">
284
+ Image
285
+ </button>
286
+ <button id="media-type-video" class="py-1 rounded-md font-semibold text-[9px] text-slate-400 hover:text-slate-200 transition-all">
287
+ Video
288
+ </button>
289
+ </div>
290
+ </div>
291
+
292
+ <!-- Task Selector -->
293
+ <div class="space-y-1">
294
+ <label for="task-type" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Task Type</label>
295
+ <select id="task-type" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-semibold">
296
+ <option value="Detection">Detection</option>
297
+ <option value="Grounding">Grounding</option>
298
+ <option value="OCR">OCR</option>
299
+ <option value="GUI">GUI</option>
300
+ <option value="Pointing">Pointing</option>
301
+ </select>
302
+ </div>
303
+
304
+ </div>
305
+
306
+ <!-- Advanced parameters sliders (Collapsible details inside the left overlay) -->
307
+ <details class="group border-t border-white/5 pt-3">
308
+ <summary class="list-none flex justify-between items-center cursor-pointer select-none text-[8px] font-bold text-slate-400 tracking-wider uppercase hover:text-slate-200 transition-colors">
309
+ <span>⚙️ Advanced parameters</span>
310
+ <svg class="h-3 w-3 transform group-open:rotate-180 transition-transform text-slate-500" fill="none" viewBox="0 0 24 24" stroke="currentColor">
311
+ <path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 9l-7 7-7-7" />
312
+ </svg>
313
+ </summary>
314
+ <div class="space-y-3 pt-3">
315
+
316
+ <!-- Inference Mode Selection -->
317
+ <div class="space-y-1">
318
+ <label for="inference-mode" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Inference Mode</label>
319
+ <select id="inference-mode" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200">
320
+ <option value="hybrid">Hybrid</option>
321
+ <option value="fast">Fast</option>
322
+ <option value="slow">Slow</option>
323
+ </select>
324
+ </div>
325
+
326
+ <!-- Short side resize cap -->
327
+ <div class="space-y-1">
328
+ <label for="short-size" class="text-[8px] font-bold text-slate-400 uppercase tracking-widest">Resize Cap (px)</label>
329
+ <input type="number" id="short-size" placeholder="Auto-Cap (1024)" class="w-full bg-black/40 border border-white/5 rounded-lg px-2 py-1 text-[9px] focus:border-nvidia-brand focus:outline-none transition-all text-slate-200 font-mono">
330
+ </div>
331
+
332
+ <!-- Temp -->
333
+ <div class="space-y-1">
334
+ <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
335
+ <span>Temperature</span>
336
+ <span id="temp-val" class="font-mono text-nvidia-brand">0.7</span>
337
+ </div>
338
+ <input type="range" id="temp" min="0.1" max="2.0" step="0.1" value="0.7" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
339
+ </div>
340
+
341
+ <!-- Top P -->
342
+ <div class="space-y-1">
343
+ <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
344
+ <span>Top P</span>
345
+ <span id="topp-val" class="font-mono text-nvidia-brand">0.9</span>
346
+ </div>
347
+ <input type="range" id="topp" min="0.05" max="1.0" step="0.05" value="0.9" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
348
+ </div>
349
+
350
+ <!-- Top K -->
351
+ <div class="space-y-1">
352
+ <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
353
+ <span>Top K</span>
354
+ <span id="topk-val" class="font-mono text-nvidia-brand">20</span>
355
+ </div>
356
+ <input type="range" id="topk" min="1" max="100" step="1" value="20" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand">
357
+ </div>
358
+
359
+ <!-- Video Frames (Only displayed for Video mode) -->
360
+ <div id="video-frames-wrapper" class="space-y-1 opacity-50 pointer-events-none transition-opacity duration-300">
361
+ <div class="flex justify-between text-[8px] uppercase font-bold text-slate-400 tracking-wider">
362
+ <span>Max Video Frames</span>
363
+ <span id="frames-val" class="font-mono text-nvidia-brand">4</span>
364
+ </div>
365
+ <input type="range" id="max-frames" min="1" max="10" step="1" value="4" class="w-full h-0.5 bg-black rounded appearance-none cursor-pointer accent-nvidia-brand" disabled>
366
+ </div>
367
+
368
+ </div>
369
+ </details>
370
+ </div>
371
+
372
+ <!-- CTA Action Button (Floats at bottom-left corner of visual container) -->
373
+ <div class="pointer-events-auto pt-2 max-w-xs">
374
+ <button id="run-btn" class="pill-btn-green w-full py-3 px-6 rounded-full text-black font-extrabold text-sm flex items-center justify-center gap-2 select-none shadow-2xl">
375
+ <span id="btn-icon">🧠</span>
376
+ <span id="btn-text">Run Inference</span>
377
+ </button>
378
+ </div>
379
+
380
+ </div>
381
+
382
+ <!-- 3. Floating Categories Search Bar Overlay (Right/Center side, extremely clean glass box) -->
383
+ <div class="absolute top-1/2 right-6 lg:right-16 -translate-y-1/2 z-30 flex justify-center pointer-events-none w-full max-w-xs">
384
+ <div class="sam-input-bar rounded-2xl px-3.5 py-2.5 flex items-center gap-2 w-full pointer-events-auto">
385
+ <svg class="h-4 w-4 text-nvidia-brand shrink-0" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
386
+ <path stroke-linecap="round" stroke-linejoin="round" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" />
387
+ </svg>
388
+ <input type="text" id="categories" value="car, bus, person, potted plant" placeholder="Describe objects to locate..." class="bg-transparent border-none outline-none focus:outline-none w-full text-slate-100 placeholder-slate-600 font-semibold text-xs">
389
+ <button id="clear-search-btn" class="text-slate-500 hover:text-white transition-colors p-0.5 rounded-full hover:bg-white/5 shrink-0">
390
+ <svg class="h-3.5 w-3.5" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2.5">
391
+ <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
392
+ </svg>
393
+ </button>
394
+ </div>
395
+ </div>
396
+
397
+ <!-- Floating Workspace Status -->
398
+ <div class="absolute bottom-4 right-4 z-30 bg-black/60 backdrop-blur px-2.5 py-1 rounded-lg border border-white/10 text-[9px] text-slate-400 font-mono select-none pointer-events-none">
399
+ status: <span id="workspace-status" class="text-slate-200 font-semibold">No Media Loaded</span>
400
+ </div>
401
+
402
+ </div>
403
+
404
+ <!-- Shelf Section (Examples and Log metrics placed directly below the giant showcase) -->
405
+ <div class="grid grid-cols-1 lg:grid-cols-12 gap-6 items-start">
406
+
407
+ <!-- Left: Examples Library Shelf (Col Span: 5) -->
408
+ <div class="lg:col-span-5 space-y-4">
409
+ <div class="glass-panel rounded-2xl p-5 space-y-4">
410
+ <span class="text-[9px] font-bold text-slate-400 uppercase tracking-widest block font-mono">🖼️ Interactive Quick Sandbox</span>
411
+ <div class="grid grid-cols-4 gap-3">
412
+
413
+ <!-- Card 1 -->
414
+ <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Book" data-category="book" data-task="Detection" data-mode="hybrid" data-asset="assets/book.jpg">
415
+ <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/book.jpg');"></div>
416
+ <span class="text-[9px] font-semibold text-slate-300 block truncate">Book</span>
417
+ </div>
418
+
419
+ <!-- Card 2 -->
420
+ <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Sweet" data-category="sweet" data-task="Detection" data-mode="hybrid" data-asset="assets/sweet.jpg">
421
+ <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/sweet.jpg');"></div>
422
+ <span class="text-[9px] font-semibold text-slate-300 block truncate">Sweet</span>
423
+ </div>
424
+
425
+ <!-- Card 3 -->
426
+ <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="Person" data-category="person" data-task="Detection" data-mode="hybrid" data-asset="assets/person.jpg">
427
+ <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/person.jpg');"></div>
428
+ <span class="text-[9px] font-semibold text-slate-300 block truncate">People</span>
429
+ </div>
430
+
431
+ <!-- Card 4 -->
432
+ <div class="example-card border border-white/5 rounded-xl p-1 cursor-pointer group space-y-1 bg-black/35 hover:border-nvidia-brand/20 transition-all text-center" data-type="Image" data-name="OCR" data-category="text" data-task="OCR" data-mode="fast" data-asset="assets/ocr.jpg">
433
+ <div class="h-12 w-full rounded-lg bg-cover bg-center overflow-hidden bg-slate-900" style="background-image: url('/assets/ocr.jpg');"></div>
434
+ <span class="text-[9px] font-semibold text-slate-300 block truncate">OCR</span>
435
+ </div>
436
+
437
+ </div>
438
+ </div>
439
+
440
+ <!-- Text Prompt logs -->
441
+ <div class="glass-panel rounded-2xl p-4 text-[10px] text-slate-500 font-mono flex justify-between items-center select-none bg-black/40">
442
+ <span class="truncate block">compiled: <span id="raw-prompt-preview" class="text-slate-400"></span></span>
443
+ </div>
444
+ </div>
445
+
446
+ <!-- Right: Performance Metrics & Tag draw overlays (Col Span: 7) -->
447
+ <div class="lg:col-span-7 space-y-4">
448
+ <div class="glass-panel rounded-2xl p-5 space-y-4">
449
+ <div class="grid grid-cols-1 sm:grid-cols-12 gap-4 items-stretch">
450
+
451
+ <!-- Performance Statistics Metrics Console (Grid: 5) -->
452
+ <div class="sm:col-span-5 bg-black/60 rounded-xl p-4 border border-white/5 font-mono text-[10px] text-slate-300 space-y-2 leading-normal">
453
+ <div class="text-nvidia-brand font-bold border-b border-white/5 pb-1 mb-1.5 uppercase tracking-widest text-[9px] font-mono">📊 Metrics Log</div>
454
+ <div class="flex justify-between"><span class="text-slate-500">Status:</span> <span id="meta-status" class="text-emerald-500 font-semibold">Idle</span></div>
455
+ <div class="flex justify-between"><span class="text-slate-500">Tokens/Frames:</span> <span id="meta-tokens">-</span></div>
456
+ <div class="flex justify-between"><span class="text-slate-500">Detections:</span> <span id="meta-boxes">-</span></div>
457
+ <div class="flex justify-between"><span class="text-slate-500">TPS / BPS:</span> <span><span id="meta-tps">-</span> / <span id="meta-bps">-</span></span></div>
458
+ <div class="flex justify-between"><span class="text-slate-500">Time:</span> <span id="meta-time">-</span></div>
459
+ </div>
460
+
461
+ <!-- Tag drawer box list (Grid: 7) -->
462
+ <div class="sm:col-span-7 bg-black/60 rounded-xl p-4 border border-white/5 flex flex-col">
463
+ <div class="text-nvidia-brand font-mono font-bold border-b border-white/5 pb-1 mb-2 uppercase tracking-widest text-[9px] flex justify-between shrink-0">
464
+ <span>🎯 Detected Target Overlays</span>
465
+ <span id="detection-count-badge" class="text-[8px] bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 px-1.5 py-0.5 rounded-full font-bold">0</span>
466
+ </div>
467
+ <div id="detection-tags-wrapper" class="flex-1 flex flex-wrap gap-1.5 max-h-[100px] overflow-y-auto pt-1 align-content-start text-[10px] text-slate-500">
468
+ Run inference to populate target tags here.
469
+ </div>
470
+ </div>
471
+
472
+ </div>
473
+
474
+ <!-- Optional dynamic trace wrapper -->
475
+ <div id="rich-trace-log" class="hidden border-t border-white/5 pt-3"></div>
476
+ </div>
477
+ </div>
478
+
479
+ </div>
480
+
481
+ </main>
482
+
483
+ <!-- Gradio client connection & app runtime logic -->
484
+ <script type="module">
485
+ import { client, handle_file } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";
486
+
487
+ // State variables
488
+ let selectedMediaType = "Image";
489
+ let activeFile = null;
490
+ let clientInstance = null;
491
+
492
+ // Cache elements
493
+ const mediaTypeImageBtn = document.getElementById("media-type-image");
494
+ const mediaTypeVideoBtn = document.getElementById("media-type-video");
495
+ const videoFramesWrapper = document.getElementById("video-frames-wrapper");
496
+ const taskTypeSelect = document.getElementById("task-type");
497
+ const categoriesInput = document.getElementById("categories");
498
+ const clearSearchBtn = document.getElementById("clear-search-btn");
499
+ const inferenceModeSelect = document.getElementById("inference-mode");
500
+ const rawPromptPreview = document.getElementById("raw-prompt-preview");
501
+
502
+ // Advanced Controls Elements
503
+ const tempSlider = document.getElementById("temp");
504
+ const tempVal = document.getElementById("temp-val");
505
+ const toppSlider = document.getElementById("topp");
506
+ const toppVal = document.getElementById("topp-val");
507
+ const topkSlider = document.getElementById("topk");
508
+ const topkVal = document.getElementById("topk-val");
509
+ const shortSizeInput = document.getElementById("short-size");
510
+ const maxFramesSlider = document.getElementById("max-frames");
511
+ const maxFramesVal = document.getElementById("frames-val");
512
+
513
+ // Workspace Preview elements
514
+ const dropZone = document.getElementById("drop-zone");
515
+ const uploadPrompt = document.getElementById("upload-prompt");
516
+ const previewImage = document.getElementById("preview-image");
517
+ const previewVideo = document.getElementById("preview-video");
518
+ const fileInput = document.getElementById("media-file-input");
519
+ const workspaceStatus = document.getElementById("workspace-status");
520
+
521
+ // Output result elements
522
+ const outputEmpty = document.getElementById("output-empty");
523
+ const outputImage = document.getElementById("output-image");
524
+ const outputVideo = document.getElementById("output-video");
525
+
526
+ // Overlay and run button
527
+ const runBtn = document.getElementById("run-btn");
528
+ const btnText = document.getElementById("btn-text");
529
+ const btnIcon = document.getElementById("btn-icon");
530
+ const processingOverlay = document.getElementById("processing-overlay");
531
+ const processingStatus = document.getElementById("processing-status");
532
+
533
+ // Logging & Trace elements
534
+ const metaStatus = document.getElementById("meta-status");
535
+ const metaTokens = document.getElementById("meta-tokens");
536
+ const metaBoxes = document.getElementById("meta-boxes");
537
+ const metaTps = document.getElementById("meta-tps");
538
+ const metaBps = document.getElementById("meta-bps");
539
+ const metaTime = document.getElementById("meta-time");
540
+ const detectionTagsWrapper = document.getElementById("detection-tags-wrapper");
541
+ const detectionCountBadge = document.getElementById("detection-count-badge");
542
+ const richTraceLog = document.getElementById("rich-trace-log");
543
+
544
+ // Connect client
545
/**
 * Return the shared Gradio client, connecting to this page's origin on first use.
 *
 * A failed connection is logged to the console and reported with an alert.
 * The cached instance is left untouched in that case, so the caller receives
 * whatever `clientInstance` held before the attempt.
 */
async function getClient() {
    // Reuse the connection established by an earlier call.
    if (clientInstance) {
        return clientInstance;
    }

    try {
        clientInstance = await client(window.location.origin);
    } catch (err) {
        console.error("Gradio Server connection failed:", err);
        alert("Could not connect to Gradio backend. Ensure the server is active.");
    }
    return clientInstance;
}
556
+
557
+ // Live values updater
558
/**
 * Wire up the controls that update the page as the user edits them:
 * slider value read-outs, the clear-categories button, and the compiled
 * prompt preview. Also renders the prompt preview once immediately.
 */
function setupLiveUpdaters() {
    // Rebuild the prompt preview from the current task type and categories.
    function refreshPromptPreview() {
        rawPromptPreview.textContent = generateRawPromptText(
            taskTypeSelect.value,
            categoriesInput.value
        );
    }

    // Each slider mirrors its current value into the label next to it.
    const sliderReadouts = [
        [tempSlider, tempVal],
        [toppSlider, toppVal],
        [topkSlider, topkVal],
        [maxFramesSlider, maxFramesVal],
    ];
    for (const [slider, readout] of sliderReadouts) {
        slider.addEventListener("input", (event) => {
            readout.textContent = event.target.value;
        });
    }

    // Clear button: empty the categories field, refocus it, refresh the preview.
    clearSearchBtn.addEventListener("click", () => {
        categoriesInput.value = "";
        categoriesInput.focus();
        refreshPromptPreview();
    });

    // Any change to the task or categories regenerates the preview.
    taskTypeSelect.addEventListener("change", refreshPromptPreview);
    categoriesInput.addEventListener("input", refreshPromptPreview);

    // Show the preview for the initial control values.
    refreshPromptPreview();
}
583
+
584
+ // Prompt builder mirroring python logic
585
/**
 * Build the text prompt for a task type and a comma-separated category list.
 *
 * Categories are split on commas, trimmed, stripped of empty entries and
 * joined with the "</c>" separator. A falsy `category` falls back to
 * "objects". Unknown task types use the same template as "Detection".
 *
 * @param {string} taskType - One of "Detection", "Grounding", "OCR", "GUI", "Pointing".
 * @param {string} category - Comma-separated descriptions typed by the user.
 * @returns {string} The prompt text.
 */
function generateRawPromptText(taskType, category) {
    const source = category ? category : "objects";
    const joined = source
        .split(",")
        .map((part) => part.trim())
        .filter((part) => part.length > 0)
        .join("</c>");

    if (taskType === "Grounding") {
        return `Locate all the instances that match the following description: ${joined}.`;
    }
    if (taskType === "OCR") {
        // OCR ignores the category list entirely.
        return "Detect all the text in box format.";
    }
    if (taskType === "GUI") {
        return `Locate the region that matches the following description: ${joined}.`;
    }
    if (taskType === "Pointing") {
        return `Point to: ${joined}.`;
    }
    // "Detection" and every unrecognised task type share this template.
    return `Locate all the instances that matches the following description: ${joined}.`;
}
601
+
602
+ // Switch workspace input styles without clearing
603
/**
 * Switch the workspace between "Image" and "Video" input modes without
 * clearing the loaded media. Any value other than "Image" selects video mode.
 *
 * Updates the toggle-button highlight, shows and enables the max-frames
 * slider only in video mode, restricts the file picker to the matching
 * MIME family, and refreshes the workspace status label.
 *
 * @param {string} type - "Image" or "Video".
 */
function setMediaType(type) {
    // These class lists match the initial markup of the two toggle buttons,
    // so switching modes no longer changes their padding, radius or font size.
    const ACTIVE_BTN = "py-1 rounded-md font-semibold text-[9px] transition-all bg-nvidia-brand text-black font-outfit font-black shadow shadow-nvidia-brand/10";
    const INACTIVE_BTN = "py-1 rounded-md font-semibold text-[9px] text-slate-400 hover:text-slate-200 transition-all";

    selectedMediaType = type;
    const isImage = type === "Image";

    mediaTypeImageBtn.className = isImage ? ACTIVE_BTN : INACTIVE_BTN;
    mediaTypeVideoBtn.className = isImage ? INACTIVE_BTN : ACTIVE_BTN;

    // The frame-count control is hidden, dimmed and inert in image mode.
    for (const cls of ["hidden", "opacity-50", "pointer-events-none"]) {
        videoFramesWrapper.classList.toggle(cls, isImage);
    }
    maxFramesSlider.disabled = isImage;

    fileInput.accept = isImage ? "image/*" : "video/*";

    const loadedLabel = isImage ? "Image Loaded" : "Video Loaded";
    workspaceStatus.textContent = activeFile ? loadedLabel : "No Media Loaded";
}
625
+
626
/**
 * Reset the workspace: forget the active file, blank and hide every
 * preview/output media element, and show the upload and empty-output
 * placeholders again.
 *
 * Preview sources may be object URLs created with URL.createObjectURL;
 * those keep the underlying blob alive until they are revoked, so they
 * are released here before the source is reset.
 */
function clearWorkspace() {
    activeFile = null;

    [previewImage, previewVideo, outputImage, outputVideo].forEach((el) => {
        const current = el.src;
        if (current && current.startsWith("blob:")) {
            // Release the blob backing this preview to avoid leaking memory.
            URL.revokeObjectURL(current);
        }
        el.src = "";
        el.classList.add("hidden");
    });

    uploadPrompt.classList.remove("hidden");
    // outputEmpty is checked before use, as in the rest of the script.
    if (outputEmpty) outputEmpty.classList.remove("hidden");
    workspaceStatus.textContent = "Workspace Cleared";
}
641
+
642
/**
 * Wire up drag-and-drop on the drop zone and the file picker.
 *
 * Dragging over the zone highlights it; leaving or dropping removes the
 * highlight. The first file of a drop or of a picker selection is passed
 * to handleFileImport.
 */
function setupDragDrop() {
    const highlightClass = 'drop-zone-active';

    // Builds a listener that suppresses the default action and sets the highlight.
    const makeHighlighter = (highlighted) => (event) => {
        event.preventDefault();
        dropZone.classList.toggle(highlightClass, highlighted);
    };

    const importFirst = (files) => {
        const first = files[0];
        if (first) handleFileImport(first);
    };

    for (const eventName of ['dragenter', 'dragover']) {
        dropZone.addEventListener(eventName, makeHighlighter(true), false);
    }
    for (const eventName of ['dragleave', 'drop']) {
        dropZone.addEventListener(eventName, makeHighlighter(false), false);
    }

    // Registered after the highlight listeners so the default action is
    // already prevented when the dropped file is read.
    dropZone.addEventListener('drop', (event) => {
        importFirst(event.dataTransfer.files);
    });

    fileInput.addEventListener('change', (event) => {
        importFirst(event.target.files);
    });
}
669
+
670
/**
 * Display an imported media file and make it the active file.
 *
 * Image files are previewed through a FileReader data URL, video files
 * through an object URL. Files whose MIME type is neither image/* nor
 * video/* are rejected with a status message and leave the workspace
 * untouched (previously the upload prompt was hidden before the type
 * check, leaving a blank workspace).
 *
 * @param {File} file - The file picked or dropped by the user.
 */
function handleFileImport(file) {
    const mimeType = file.type || "";
    const isImage = mimeType.startsWith("image/");
    const isVideo = mimeType.startsWith("video/");

    if (!isImage && !isVideo) {
        workspaceStatus.textContent = `Unsupported file type: ${mimeType || "unknown"}`;
        return;
    }

    uploadPrompt.classList.add("hidden");

    if (isImage) {
        setMediaType("Image");
        activeFile = file;

        const reader = new FileReader();
        reader.onload = (e) => {
            // The read is asynchronous: skip the preview if another file
            // (or a workspace reset) replaced this one in the meantime.
            if (activeFile !== file) return;
            previewImage.src = e.target.result;
            previewImage.classList.remove("hidden");
            previewVideo.classList.add("hidden");
        };
        reader.readAsDataURL(file);
        workspaceStatus.textContent = `Image Loaded: ${file.name}`;
    } else {
        setMediaType("Video");
        activeFile = file;

        // Release the previous preview's object URL before replacing it.
        const previous = previewVideo.src;
        if (previous && previous.startsWith("blob:")) {
            URL.revokeObjectURL(previous);
        }
        previewVideo.src = URL.createObjectURL(file);
        previewVideo.classList.remove("hidden");
        previewImage.classList.add("hidden");
        workspaceStatus.textContent = `Video Loaded: ${file.name}`;
    }
}
696
+
697
/**
 * Fetch a preloaded example asset and wrap it in a File.
 *
 * A non-2xx response is treated as a failure; without that check the body
 * of an error page (e.g. a 404) would be returned as if it were the asset.
 *
 * @param {string} url - URL of the asset to download.
 * @param {string} filename - Name given to the resulting File.
 * @returns {Promise<File|null>} The downloaded file, or null when the
 *     request fails (the error is logged to the console).
 */
async function loadExampleFromAsset(url, filename) {
    try {
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} ${response.statusText} for ${url}`);
        }
        const blob = await response.blob();
        return new File([blob], filename, { type: blob.type });
    } catch (err) {
        console.error("Failed to load example asset:", err);
        return null;
    }
}
709
+
710
/**
 * Attach click handlers to every ".example-card" element.
 *
 * Clicking a card clears the workspace, applies the card's task, category
 * and inference-mode settings, switches the media type, then downloads
 * the card's asset and shows it as the active file.
 *
 * The download is asynchronous, so each click gets a request id; a result
 * is discarded when a newer example was requested or another file became
 * active while it was loading.
 */
function setupExamples() {
    let latestRequestId = 0;

    document.querySelectorAll(".example-card").forEach(card => {
        card.addEventListener("click", async () => {
            const type = card.getAttribute("data-type");
            const name = card.getAttribute("data-name");
            const category = card.getAttribute("data-category");
            const task = card.getAttribute("data-task");
            const mode = card.getAttribute("data-mode");
            const assetPath = card.getAttribute("data-asset"); // e.g. "assets/book.jpg"

            const requestId = ++latestRequestId;

            clearWorkspace();
            workspaceStatus.textContent = `Loading ${name} example...`;

            // Set parameters
            taskTypeSelect.value = task;
            categoriesInput.value = category;
            inferenceModeSelect.value = mode;

            // Trigger live prompt update
            taskTypeSelect.dispatchEvent(new Event("change"));

            // Setup Media type
            setMediaType(type);

            // Without an asset path the URL below would resolve to ".../null".
            if (!assetPath) {
                workspaceStatus.textContent = `Failed to load ${name} example`;
                return;
            }

            // Resolve against the page URL so relative paths also work in an iframe.
            const ext = type === "Image" ? "jpg" : "mp4";
            const resolvedAssetUrl = new URL(assetPath, window.location.href).href;
            console.log("Fetching example from:", resolvedAssetUrl);
            // A missing data-name must not throw on toLowerCase().
            const baseName = (name || "example").toLowerCase();
            const file = await loadExampleFromAsset(resolvedAssetUrl, `${baseName}.${ext}`);

            // clearWorkspace() reset activeFile to null above; if it is set
            // now, or a newer click happened, this result is stale.
            if (requestId !== latestRequestId || activeFile) {
                return;
            }

            if (!file) {
                workspaceStatus.textContent = `Failed to load ${name} example`;
                return;
            }

            activeFile = file;
            uploadPrompt.classList.add("hidden");
            if (type === "Image") {
                previewImage.src = URL.createObjectURL(file);
                previewImage.classList.remove("hidden");
                previewVideo.classList.add("hidden");
                workspaceStatus.textContent = `Example Image Loaded: ${name}`;
            } else {
                previewVideo.src = URL.createObjectURL(file);
                previewVideo.classList.remove("hidden");
                previewImage.classList.add("hidden");
                workspaceStatus.textContent = `Example Video Loaded: ${name}`;
            }
        });
    });
}
760
+
761
/**
 * Run inference on the active file through the Gradio client and render
 * the result (output media, metrics, detection tags and trace log).
 *
 * The UI is put into a loading state for the duration of the request and
 * restored afterwards, whether the request succeeded or failed.
 *
 * @returns {Promise<void>}
 */
async function executeInference() {
    // The Enter-key shortcut calls this function directly, so a disabled
    // button alone does not stop a second request from starting while one
    // is already in flight.
    if (runBtn.disabled) {
        return;
    }

    if (!activeFile) {
        alert("Please upload a media file (Image or Video) or select an example first.");
        return;
    }

    // Set loading state
    runBtn.disabled = true;
    btnText.textContent = "⏳ Queueing Request...";
    btnIcon.textContent = "🔒";
    processingOverlay.classList.remove("hidden");
    processingStatus.textContent = "Waiting for Gradio queue...";

    // Clean outputs
    if (outputEmpty) outputEmpty.classList.add("hidden");
    outputImage.classList.add("hidden");
    outputVideo.classList.add("hidden");
    richTraceLog.innerHTML = "";
    richTraceLog.classList.add("hidden");
    metaStatus.textContent = "Processing...";
    metaStatus.className = "text-yellow-500 font-semibold";
    detectionTagsWrapper.innerHTML = "Processing objects in backend...";
    detectionCountBadge.textContent = "0";

    try {
        const clientInstance = await getClient();
        if (!clientInstance) {
            throw new Error("Unable to create Gradio Client instance.");
        }

        // Wrap the file with the Gradio client's handle_file; only the slot
        // matching the selected media type is filled.
        const wrappedFile = activeFile ? handle_file(activeFile) : null;
        const imageFile = (selectedMediaType === "Image") ? wrappedFile : null;
        const videoFile = (selectedMediaType === "Video") ? wrappedFile : null;

        // Collect configuration values (explicit radix for integer fields).
        const taskType = taskTypeSelect.value;
        const category = categoriesInput.value;
        const modelMode = inferenceModeSelect.value;
        const temp = parseFloat(tempSlider.value);
        const topp = parseFloat(toppSlider.value);
        const topk = parseInt(topkSlider.value, 10);
        const shortSize = shortSizeInput.value ? parseInt(shortSizeInput.value, 10) : null;
        const maxVideoFrames = parseInt(maxFramesSlider.value, 10);

        processingStatus.textContent = "Running Vision Model (duration-locked)...";

        // Named parameters; per the original comment they match the
        // run_inference signature in app.py.
        const result = await clientInstance.predict("/run_inference", {
            input_type: selectedMediaType,
            image_file: imageFile,
            video_file: videoFile,
            task_type: taskType,
            category: category,
            model_mode: modelMode,
            temp: temp,
            top_p: topp,
            top_k: topk,
            short_size: shortSize,
            question_override: null,
            max_video_frames: maxVideoFrames
        });

        console.log("Inference complete. API outputs:", result);

        // Unpack result values
        const [outImageObj, outVideoObj, meta] = result.data;

        // A missing metadata object is a failure too, not a TypeError.
        if (!meta || !meta.success) {
            throw new Error((meta && meta.error) || "Backend returned processing failure.");
        }

        // Process image result
        if (selectedMediaType === "Image" && outImageObj) {
            outputImage.src = outImageObj.url;
            outputImage.classList.remove("hidden");
            outputVideo.classList.add("hidden");
        }
        // Process video result
        else if (selectedMediaType === "Video" && outVideoObj) {
            outputVideo.src = outVideoObj.url;
            outputVideo.classList.remove("hidden");
            outputImage.classList.add("hidden");
        }

        // Render metrics logs
        metaStatus.textContent = "Success";
        metaStatus.className = "text-emerald-500 font-semibold";

        const stats = meta.stats || {};
        metaTokens.textContent = stats.num_tokens || stats.total_frames || "-";
        metaBoxes.textContent = stats.num_boxes || stats.processed_frames || "-";
        metaTps.textContent = stats.tps || "-";
        metaBps.textContent = stats.bps || "-";
        metaTime.textContent = stats.total_time_seconds ? `${stats.total_time_seconds}s` : "Optimal";

        // Render detection tags
        const detections = meta.detections || [];
        detectionCountBadge.textContent = detections.length;

        if (detections.length === 0) {
            detectionTagsWrapper.innerHTML = "No objects matched categories.";
        } else {
            detectionTagsWrapper.innerHTML = "";
            detections.forEach(det => {
                const tag = document.createElement("span");
                tag.className = "px-2 py-0.5 rounded bg-nvidia-brand/10 text-nvidia-brand border border-nvidia-brand/20 font-bold uppercase tracking-wider text-[8px] animate-fade-in";
                tag.textContent = det.frame ? `[Frame ${det.frame}] ${det.label}` : det.label;
                detectionTagsWrapper.appendChild(tag);
            });
        }

        // Render logs trace
        // NOTE(review): meta.html is inserted as markup, so it must only
        // ever contain HTML produced and escaped by the backend — confirm.
        if (meta.html) {
            richTraceLog.innerHTML = meta.html;
            richTraceLog.classList.remove("hidden");
        }

    } catch (err) {
        console.error("Execution failed:", err);
        metaStatus.textContent = "Error";
        metaStatus.className = "text-red-500 font-semibold";

        // The message can carry backend-supplied text, so it is written as
        // text content instead of being interpolated into markup.
        const failure = document.createElement("span");
        failure.className = "text-red-400";
        failure.textContent = `Failed: ${err.message}`;
        detectionTagsWrapper.innerHTML = "";
        detectionTagsWrapper.appendChild(failure);

        alert(`Inference failed: ${err.message}`);
        if (outputEmpty) outputEmpty.classList.remove("hidden");
    } finally {
        // Restore UI state
        runBtn.disabled = false;
        btnText.textContent = "Run Inference";
        btnIcon.textContent = "🧠";
        processingOverlay.classList.add("hidden");
    }
}
895
+
896
// Register all UI event listeners once the document has been parsed.
document.addEventListener("DOMContentLoaded", () => {
    // Switch the media type and reset the workspace, but only when the
    // requested type differs from the current one.
    const switchMediaType = (target) => {
        if (selectedMediaType === target) return;
        setMediaType(target);
        clearWorkspace();
    };

    mediaTypeImageBtn.addEventListener("click", () => switchMediaType("Image"));
    mediaTypeVideoBtn.addEventListener("click", () => switchMediaType("Video"));
    runBtn.addEventListener("click", executeInference);

    // Pressing Enter in the categories input starts inference.
    categoriesInput.addEventListener("keydown", (event) => {
        if (event.key !== "Enter") return;
        event.preventDefault();
        executeInference();
    });

    setupLiveUpdaters();
    setupDragDrop();
    setupExamples();
});
924
+ </script>
925
+ </body>
926
+ </html>
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  opencv-python-headless==4.11.0.86
2
- transformers==4.51.0
3
  torch==2.8.0
4
  torchvision==0.23.0
5
  numpy==1.25.0
 
1
  opencv-python-headless==4.11.0.86
2
+ transformers==4.57.1
3
  torch==2.8.0
4
  torchvision==0.23.0
5
  numpy==1.25.0