Auto-deploy from GitHub
Browse files
- README.md +1 -3
- index.html +1 -2
- results.csv +148 -88
- results.csv.timestamp +1 -1
- scripts/leaderboard.js +27 -7
- scripts/pareto.js +0 -386
- styles.css +3 -1
README.md
CHANGED
|
@@ -14,7 +14,5 @@ Interactive leaderboard and efficiency analysis for general-purpose AI agents ev
|
|
| 14 |
|
| 15 |
- **Benchmarks**: AppWorld, BrowseComp+, SWE-bench, TauBench (Airline, Retail, Telecom)
|
| 16 |
- **Paper**: [arXiv:2602.22953](https://arxiv.org/abs/2602.22953)
|
| 17 |
-
- **
|
| 18 |
-
- **GitHub**: [Exgentic/open-agent-leaderboard](https://github.com/Exgentic/open-agent-leaderboard)
|
| 19 |
- **Website**: [exgentic.github.io](https://exgentic.github.io)
|
| 20 |
-
- **Submit Results**: [CONTRIBUTING.md](https://github.com/Exgentic/open-agent-leaderboard/blob/main/CONTRIBUTING.md)
|
|
|
|
| 14 |
|
| 15 |
- **Benchmarks**: AppWorld, BrowseComp+, SWE-bench, TauBench (Airline, Retail, Telecom)
|
| 16 |
- **Paper**: [arXiv:2602.22953](https://arxiv.org/abs/2602.22953)
|
| 17 |
+
- **GitHub**: [Exgentic/exgentic](https://github.com/Exgentic/exgentic)
|
|
|
|
| 18 |
- **Website**: [exgentic.github.io](https://exgentic.github.io)
|
|
|
index.html
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
<!--
|
| 2 |
SPDX-License-Identifier: Apache-2.0
|
| 3 |
-
Copyright (C) 2025
|
| 4 |
-->
|
| 5 |
<!doctype html>
|
| 6 |
<html lang="en">
|
|
@@ -82,7 +82,6 @@ const header=document.getElementById('header');
|
|
| 82 |
window.addEventListener('scroll',()=>{header.classList.toggle('scrolled',window.scrollY>50)},{passive:true});
|
| 83 |
document.getElementById('mobileToggle').addEventListener('click',()=>{document.getElementById('headerNav').classList.toggle('open')});
|
| 84 |
</script>
|
| 85 |
-
<script src="./scripts/pareto.js"></script>
|
| 86 |
<script src="./scripts/leaderboard.js" data-base="./"></script>
|
| 87 |
<script>
|
| 88 |
// Theme toggle
|
|
|
|
| 1 |
<!--
|
| 2 |
SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
Copyright (C) 2025, The Exgentic organization and its contributors.
|
| 4 |
-->
|
| 5 |
<!doctype html>
|
| 6 |
<html lang="en">
|
|
|
|
| 82 |
window.addEventListener('scroll',()=>{header.classList.toggle('scrolled',window.scrollY>50)},{passive:true});
|
| 83 |
document.getElementById('mobileToggle').addEventListener('click',()=>{document.getElementById('headerNav').classList.toggle('open')});
|
| 84 |
</script>
|
|
|
|
| 85 |
<script src="./scripts/leaderboard.js" data-base="./"></script>
|
| 86 |
<script>
|
| 87 |
// Theme toggle
|
results.csv
CHANGED
|
@@ -1,91 +1,151 @@
|
|
| 1 |
agent,agent_normalized,visible_agent_name,agent_version,avg_cost,avg_steps,benchmark,finished_pct,model,model_normalized,num_tasks,score,total_cost
|
| 2 |
-
claudecode,claude-code,Claude_Code,claude_code_2.1.7,
|
| 3 |
-
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.10550762,38.01,AppWorld,0.86,gemini,gemini-3-pro,100,0.36,310.550762
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,
|
| 21 |
-
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.
|
| 22 |
-
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.17156755,6.57,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.26,17.156755
|
| 25 |
-
smolagent,smolagents,Smolagent,smolagents_1.24.0,
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
smolagent,smolagents,Smolagent,smolagents_1.24.0,
|
| 39 |
-
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.
|
| 40 |
-
smolagent,smolagents,Smolagent,smolagents_1.24.0,
|
| 41 |
-
litellm,litellm-react,React,exgentic_0.1.0,
|
| 42 |
-
litellm,litellm-react,React,exgentic_0.1.0,0.
|
| 43 |
-
litellm,litellm-react,React,exgentic_0.1.0,0.
|
| 44 |
-
litellm
|
| 45 |
-
litellm
|
| 46 |
-
litellm
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
litellm,litellm-react,React,exgentic_0.1.0,0.
|
| 57 |
-
litellm,litellm-react,React,exgentic_0.1.0,0.
|
| 58 |
-
litellm,litellm-react,React,exgentic_0.1.0,0.
|
| 59 |
-
litellm
|
| 60 |
-
litellm
|
| 61 |
-
litellm
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
litellm,litellm-
|
| 72 |
-
litellm,litellm-
|
| 73 |
-
litellm,litellm-
|
| 74 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.
|
| 75 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.
|
| 76 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
litellm,litellm-
|
| 87 |
-
litellm,litellm-
|
| 88 |
-
litellm,litellm-
|
| 89 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,
|
| 90 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,
|
| 91 |
-
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
agent,agent_normalized,visible_agent_name,agent_version,avg_cost,avg_steps,benchmark,finished_pct,model,model_normalized,num_tasks,score,total_cost
|
| 2 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.5026066430000001,18.94,AppWorld,0.46,deepseek,deepseek-v3.2,100,0.03,50.26066430000001
|
| 3 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.10630119119999998,20.82,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.48,10.630119119999998
|
| 4 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.20144125259999995,54.16,SWE-bench,0.89,deepseek,deepseek-v3.2,100,0.64,20.144125259999996
|
| 5 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.028655971600000007,12.06,TauBench-Airline,0.98,deepseek,deepseek-v3.2,50,0.28,1.4327985800000003
|
| 6 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.043596285400000004,12.51,TauBench-Retail,0.99,deepseek,deepseek-v3.2,100,0.65,4.35962854
|
| 7 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.07225669540000003,20.03,TauBench-Telecom,0.97,deepseek,deepseek-v3.2,100,0.61,7.225669540000003
|
| 8 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.8930315160000002,36.62,AppWorld,0.78,kimi,kimi-k2.5,100,0.08,89.30315160000002
|
| 9 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.7320507719999999,25.02,BrowseComp+,0.86,kimi,kimi-k2.5,100,0.56,73.20507719999999
|
| 10 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.7822629151515151,38.484848484848484,SWE-bench,0.7474747474747475,kimi,kimi-k2.5,100,0.5204081632653061,77.4440286
|
| 11 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.032654508000000006,2.58,TauBench-Airline,0.9,kimi,kimi-k2.5,50,0.12,1.6327254000000002
|
| 12 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.018570114000000006,0.56,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.03,1.8570114000000004
|
| 13 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.005500535999999998,0.0,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.0,0.5500535999999998
|
| 14 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
|
| 15 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.4293807000000001,8.97,BrowseComp+,1.0,gpt52,gpt-5.2,100,0.43,42.93807000000001
|
| 16 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.9397547250000001,23.99,SWE-bench,1.0,gpt52,gpt-5.2,100,0.58,93.97547250000001
|
| 17 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21338296,10.18,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.48,10.669148
|
| 18 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.1200643675,9.92,TauBench-Retail,0.98,gpt52,gpt-5.2,100,0.64,12.00643675
|
| 19 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.0980149625,9.36,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.55,9.80149625
|
| 20 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,13.08379215,49.69,AppWorld,0.74,claude,claude-opus-4.5,100,0.66,1308.3792150000002
|
| 21 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,11.658796372549022,31.03921568627451,BrowseComp+,0.8431372549019608,claude,claude-opus-4.5,51,0.5294117647058824,594.5986150000001
|
| 22 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,5.604360979381441,31.762886597938145,SWE-bench,1.0,claude,claude-opus-4.5,97,0.7422680412371134,543.6230149999998
|
| 23 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.2993549000000004,11.5,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,64.96774500000002
|
| 24 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.5985232500000006,12.54,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.83,159.85232500000006
|
| 25 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.448864600000001,18.71,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,244.8864600000001
|
| 26 |
claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.10550762,38.01,AppWorld,0.86,gemini,gemini-3-pro,100,0.36,310.550762
|
| 27 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.8452944199999997,22.88,BrowseComp+,0.7,gemini,gemini-3-pro,100,0.51,284.52944199999996
|
| 28 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.679668759999999,43.72,SWE-bench,1.0,gemini,gemini-3-pro,100,0.67,367.9668759999999
|
| 29 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.3394426000000001,12.62,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,16.972130000000003
|
| 30 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.18525811999999994,11.18,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.71,18.525811999999995
|
| 31 |
+
claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21064223999999993,9.9,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.71,21.064223999999992
|
| 32 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3050881588000001,15.63,AppWorld,0.37,deepseek,deepseek-v3.2,100,0.06,30.50881588000001
|
| 33 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.09532471040000004,13.13,BrowseComp+,0.62,deepseek,deepseek-v3.2,100,0.3,9.532471040000004
|
| 34 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.11237704080000001,41.37,SWE-bench,0.38,deepseek,deepseek-v3.2,100,0.7368421052631579,11.23770408
|
| 35 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.016994613600000003,6.72,TauBench-Airline,0.94,deepseek,deepseek-v3.2,50,0.2,0.8497306800000002
|
| 36 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.011356187600000003,4.82,TauBench-Retail,0.97,deepseek,deepseek-v3.2,100,0.19,1.1356187600000003
|
| 37 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.023369379999999995,7.0,TauBench-Telecom,0.97,deepseek,deepseek-v3.2,100,0.18,2.3369379999999995
|
| 38 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3024222959999999,19.59,AppWorld,0.48,kimi,kimi-k2.5,100,0.08,30.242229599999995
|
| 39 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.25144734599999996,17.61,BrowseComp+,0.53,kimi,kimi-k2.5,100,0.35,25.144734599999996
|
| 40 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3341831760000001,38.74,SWE-bench,0.78,kimi,kimi-k2.5,100,0.5670103092783505,33.41831760000001
|
| 41 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.004579271999999998,0.5,TauBench-Airline,0.88,kimi,kimi-k2.5,50,0.0,0.2289635999999999
|
| 42 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0042811740000000004,0.41,TauBench-Retail,0.96,kimi,kimi-k2.5,100,0.01,0.42811740000000004
|
| 43 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0046396319999999994,0.0,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.0,0.46396319999999996
|
| 44 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
|
| 45 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3804372600000001,14.27,BrowseComp+,1.0,gpt52,gpt-5.2,100,0.48,38.043726000000014
|
| 46 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.259017835858586,20.444444444444443,SWE-bench,1.0,gpt52,gpt-5.2,99,0.5454545454545454,25.642765750000013
|
| 47 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.10711806000000001,11.4,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.5,5.3559030000000005
|
| 48 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.10778330499999997,9.55,TauBench-Retail,0.99,gpt52,gpt-5.2,100,0.53,10.778330499999997
|
| 49 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.14717545500000007,9.92,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.53,14.717545500000005
|
| 50 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,22.764767600000006,47.65,AppWorld,0.77,claude,claude-opus-4.5,100,0.68,2276.4767600000005
|
| 51 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,7.592457349999998,27.18,BrowseComp+,1.0,claude,claude-opus-4.5,100,0.61,759.2457349999999
|
| 52 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,2.9612612048192783,34.096385542168676,SWE-bench,1.0,claude,claude-opus-4.5,83,0.8072289156626506,245.7846800000001
|
| 53 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.7165842000000003,12.22,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.74,35.82921000000002
|
| 54 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.5495812000000001,12.54,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.85,54.958120000000015
|
| 55 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.2459395,17.15,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.84,124.59395
|
| 56 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,8.695528419999997,33.49,AppWorld,0.98,gemini,gemini-3-pro,100,0.57,869.5528419999997
|
| 57 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.6433071111111112,8.454545454545455,BrowseComp+,0.6060606060606061,gemini,gemini-3-pro,99,0.3333333333333333,63.687404
|
| 58 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.5791779787234044,32.361702127659576,SWE-bench,1.0,gemini,gemini-3-pro,94,0.723404255319149,148.44273
|
| 59 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.21380211999999996,10.9,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.62,10.690105999999998
|
| 60 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.26648499999999986,10.62,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.73,26.648499999999988
|
| 61 |
+
openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.5351315199999997,10.82,TauBench-Telecom,0.89,gemini,gemini-3-pro,100,0.79,53.51315199999998
|
| 62 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2420605992000001,54.69,AppWorld,0.81,deepseek,deepseek-v3.2,100,0.13,24.20605992000001
|
| 63 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.07768438359999996,12.01,BrowseComp+,1.0,deepseek,deepseek-v3.2,100,0.21,7.768438359999997
|
| 64 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.5519223772,69.43,SWE-bench,0.98,deepseek,deepseek-v3.2,100,0.56,55.19223772000001
|
| 65 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.055420856,18.94,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.6,2.7710428
|
| 66 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.05129110320000001,14.06,TauBench-Retail,0.98,deepseek,deepseek-v3.2,100,0.77,5.1291103200000006
|
| 67 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.06806756960000002,18.3,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.84,6.806756960000001
|
| 68 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.6740286120000001,50.63,AppWorld,0.91,kimi,kimi-k2.5,100,0.11,67.4028612
|
| 69 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.4948784999999996,29.79,BrowseComp+,0.95,kimi,kimi-k2.5,100,0.33,49.48784999999996
|
| 70 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,1.3741787032258057,65.91397849462365,SWE-bench,0.8709677419354839,kimi,kimi-k2.5,100,0.5760869565217391,127.79861939999994
|
| 71 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.09928738799999998,13.0,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.56,4.964369399999999
|
| 72 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.06753205714285712,12.887755102040817,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.7244897959183674,6.618141599999998
|
| 73 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.15341341818181825,19.141414141414142,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.7070707070707071,15.187928400000008
|
| 74 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.5503114225,51.59,AppWorld,0.61,gpt52,gpt-5.2,100,0.07,55.03114225
|
| 75 |
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.17156755,6.57,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.26,17.156755
|
| 76 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.45031795454545454,19.97979797979798,SWE-bench,1.0,gpt52,gpt-5.2,99,0.5252525252525253,44.5814775
|
| 77 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2932717550000001,10.68,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.6,14.663587750000003
|
| 78 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.25194685249999993,11.08,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.68,25.194685249999992
|
| 79 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.30355347749999995,10.11,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.71,30.355347749999993
|
| 80 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,5.585109100000002,41.07,AppWorld,0.82,claude,claude-opus-4.5,100,0.7,558.5109100000002
|
| 81 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,6.303725049999997,24.16,BrowseComp+,1.0,claude,claude-opus-4.5,100,0.61,630.3725049999997
|
| 82 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,4.8521832499999995,39.13,SWE-bench,1.0,claude,claude-opus-4.5,100,0.65,485.218325
|
| 83 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.7801527999999999,11.88,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.72,39.007639999999995
|
| 84 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.6711861,11.71,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,67.11861
|
| 85 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,1.0643879000000003,13.77,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.58,106.43879000000003
|
| 86 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,2.542546020000001,49.13,AppWorld,0.71,gemini,gemini-3-pro,100,0.13,254.2546020000001
|
| 87 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,2.388851260000001,29.63,BrowseComp+,0.69,gemini,gemini-3-pro,100,0.57,238.88512600000007
|
| 88 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,2.209661838383839,38.101010101010104,SWE-bench,1.0,gemini,gemini-3-pro,99,0.7575757575757576,218.75652200000005
|
| 89 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.19590487999999995,12.28,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.68,9.795243999999997
|
| 90 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2056864,11.3,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.75,20.56864
|
| 91 |
+
smolagent,smolagents,Smolagent,smolagents_1.24.0,0.3461698999999998,12.71,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.88,34.61698999999998
|
| 92 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.8332616264000001,31.69,AppWorld,0.88,deepseek,deepseek-v3.2,100,0.09,83.32616264
|
| 93 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.1679167768,34.19,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.36,16.79167768
|
| 94 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.47889091680000007,77.44,SWE-bench,0.94,deepseek,deepseek-v3.2,100,0.6875,47.88909168000001
|
| 95 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.033952516,15.28,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.56,1.6976258
|
| 96 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.029598555599999986,14.44,TauBench-Retail,1.0,deepseek,deepseek-v3.2,100,0.82,2.9598555599999985
|
| 97 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.04940463720000001,18.11,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.71,4.940463720000001
|
| 98 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.8925547499999997,25.05,AppWorld,0.9,kimi,kimi-k2.5,100,0.09,89.25547499999998
|
| 99 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.3791127780000001,26.27,BrowseComp+,0.74,kimi,kimi-k2.5,100,0.34,37.91127780000001
|
| 100 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.590102075510204,48.765306122448976,SWE-bench,0.8775510204081632,kimi,kimi-k2.5,100,0.5714285714285714,57.830003399999995
|
| 101 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.03922161599999999,9.6,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.62,1.9610807999999995
|
| 102 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.04560821212121211,12.313131313131313,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.6464646464646465,4.515212999999999
|
| 103 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.092763078,16.14,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.83,9.2763078
|
| 104 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
|
| 105 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.2961726250000001,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.617262500000006
|
| 106 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.247620555,20.47,SWE-bench,1.0,gpt52,gpt-5.2,100,0.57,24.7620555
|
| 107 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.12510473500000002,11.22,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.54,6.255236750000001
|
| 108 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.11093171249999992,10.33,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.73,11.093171249999992
|
| 109 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.14089510749999992,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.53,14.089510749999993
|
| 110 |
+
litellm,litellm-react,React,exgentic_0.1.0,11.324659950000004,21.99,AppWorld,0.83,claude,claude-opus-4.5,100,0.61,1132.4659950000005
|
| 111 |
+
litellm,litellm-react,React,exgentic_0.1.0,7.093632849999999,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.3632849999999
|
| 112 |
+
litellm,litellm-react,React,exgentic_0.1.0,3.971294949494951,43.44444444444444,SWE-bench,1.0,claude,claude-opus-4.5,99,0.6060606060606061,393.15820000000014
|
| 113 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.46923760000000003,10.0,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,23.46188
|
| 114 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.46742524999999985,11.33,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,46.742524999999986
|
| 115 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.9157812500000004,17.22,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,91.57812500000004
|
| 116 |
+
litellm,litellm-react,React,exgentic_0.1.0,1.8818770200000003,21.76,AppWorld,0.99,gemini,gemini-3-pro,100,0.5,188.18770200000003
|
| 117 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.4394335600000001,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.94335600000001
|
| 118 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.6956134199999998,32.55,SWE-bench,1.0,gemini,gemini-3-pro,100,0.71,69.56134199999998
|
| 119 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.15854967999999997,10.14,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,7.927483999999998
|
| 120 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.15649691999999996,11.25,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.82,15.649691999999995
|
| 121 |
+
litellm,litellm-react,React,exgentic_0.1.0,0.29797836000000005,14.84,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.73,29.797836000000004
|
| 122 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.23222570840000004,28.51,AppWorld,0.52,deepseek,deepseek-v3.2,100,0.04,23.222570840000003
|
| 123 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.1679167768,34.19,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.36,16.79167768
|
| 124 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.47889091680000007,77.44,SWE-bench,0.94,deepseek,deepseek-v3.2,100,0.6875,47.88909168000001
|
| 125 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.033952516,15.28,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.56,1.6976258
|
| 126 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.029598555599999986,14.44,TauBench-Retail,1.0,deepseek,deepseek-v3.2,100,0.82,2.9598555599999985
|
| 127 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.04940463720000001,18.11,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.71,4.940463720000001
|
| 128 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.6031619339999997,23.53,AppWorld,0.89,kimi,kimi-k2.5,100,0.1,60.316193399999975
|
| 129 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.3791127780000001,26.27,BrowseComp+,0.74,kimi,kimi-k2.5,100,0.34,37.91127780000001
|
| 130 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.590102075510204,48.765306122448976,SWE-bench,0.8775510204081632,kimi,kimi-k2.5,100,0.5714285714285714,57.830003399999995
|
| 131 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.03922161599999999,9.6,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.62,1.9610807999999995
|
| 132 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.04560821212121211,12.313131313131313,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.6464646464646465,4.515212999999999
|
| 133 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.092763078,16.14,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.83,9.2763078
|
| 134 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.36371315749999994,10.05,AppWorld,1.0,gpt52,gpt-5.2,100,0.22,36.371315749999994
|
| 135 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.2961726250000001,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.617262500000006
|
| 136 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.247620555,20.47,SWE-bench,1.0,gpt52,gpt-5.2,100,0.57,24.7620555
|
| 137 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.12510473500000002,11.22,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.54,6.255236750000001
|
| 138 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.11093171249999992,10.33,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.73,11.093171249999992
|
| 139 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.14089510749999992,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.53,14.089510749999993
|
| 140 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.4332017499999994,20.06,AppWorld,0.82,claude,claude-opus-4.5,100,0.64,343.32017499999995
|
| 141 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,7.093632849999999,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.3632849999999
|
| 142 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.971294949494951,43.44444444444444,SWE-bench,1.0,claude,claude-opus-4.5,99,0.6060606060606061,393.15820000000014
|
| 143 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.46923760000000003,10.0,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,23.46188
|
| 144 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.46742524999999985,11.33,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,46.742524999999986
|
| 145 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.9157812500000004,17.22,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,91.57812500000004
|
| 146 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,1.30485292,22.59,AppWorld,1.0,gemini,gemini-3-pro,100,0.55,130.48529200000002
|
| 147 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.4394335600000001,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.94335600000001
|
| 148 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.6956134199999998,32.55,SWE-bench,1.0,gemini,gemini-3-pro,100,0.71,69.56134199999998
|
| 149 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15854967999999997,10.14,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,7.927483999999998
|
| 150 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15649691999999996,11.25,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.82,15.649691999999995
|
| 151 |
+
litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.29797836000000005,14.84,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.73,29.797836000000004
|
results.csv.timestamp
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
|
|
|
|
| 1 |
+
Wednesday, May 13, 2026 12:25:52 PM
|
scripts/leaderboard.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
// ===== 9. LEADERBOARD =====
|
| 2 |
const BENCHMARKS=['AppWorld','BrowseComp+','SWE-bench','TauBench-Airline','TauBench-Retail','TauBench-Telecom'];
|
| 3 |
const BENCH_SHORT={'AppWorld':'App','BrowseComp+':'Browse','SWE-bench':'SWE','TauBench-Airline':'Tau-Air','TauBench-Retail':'Tau-Ret','TauBench-Telecom':'Tau-Tel'};
|
| 4 |
-
const MODEL_DISPLAY={'claude-opus-4.5':'Claude Opus 4.5','gpt-5.2':'GPT 5.2','gemini-3-pro':'Gemini Pro 3'};
|
| 5 |
-
const MODEL_URLS={'claude-opus-4.5':'https://www.anthropic.com/claude','gpt-5.2':'https://openai.com/','gemini-3-pro':'https://deepmind.google/technologies/gemini/'};
|
| 6 |
const AGENT_DISPLAY={'Claude_Code':'Claude Code','OpenAI_Solo':'OpenAI Solo','Smolagent':'Smolagent','React':'React','React_+_Shortlisting':'React + Shortlist'};
|
| 7 |
const AGENT_URLS={'Claude_Code':'https://github.com/anthropics/claude-code','OpenAI_Solo':'https://github.com/openai/openai-agents-python','Smolagent':'https://github.com/huggingface/smolagents','React':'https://github.com/BerriAI/litellm','React_+_Shortlisting':'https://github.com/BerriAI/litellm'};
|
| 8 |
|
|
@@ -27,10 +27,12 @@ function processData(rows,modelFilter){
|
|
| 27 |
const BENCH_WEIGHT={};
|
| 28 |
BENCHMARKS.forEach(b=>{BENCH_WEIGHT[b]=b.startsWith('TauBench')?1/12:1/4});
|
| 29 |
return Object.values(groups).map(g=>{
|
|
|
|
|
|
|
| 30 |
const bs=BENCHMARKS.map(b=>g.benchmarks[b]||0);
|
| 31 |
-
let wSum=0
|
| 32 |
-
BENCHMARKS.forEach((b,i)=>{
|
| 33 |
-
const avg=
|
| 34 |
const cs=Object.values(g.costs).filter(c=>c>0);
|
| 35 |
const avgCost=cs.length?cs.reduce((a,b)=>a+b,0)/cs.length:0;
|
| 36 |
return{...g,avg,avgCost,benchScores:bs};
|
|
@@ -81,8 +83,7 @@ function renderTable(data){
|
|
| 81 |
html+=`<td class="score-cell ${scoreClass(row.avg)}"><div class="bar bar-cyan" style="width:${row.avg*100}%"></div><span class="val">${fmtPct(row.avg)}</span></td>`;
|
| 82 |
html+=`<td class="cost-cell">$${row.avgCost.toFixed(2)}</td>`;
|
| 83 |
row.benchScores.forEach(s=>{
|
| 84 |
-
|
| 85 |
-
html+=`<td class="score-cell ${scoreClass(s)}"><div class="bar bar-purple" style="width:${barW}%"></div><span class="val">${s>0?fmtPct(s):'—'}</span></td>`;
|
| 86 |
});
|
| 87 |
html+=`</tr>`;return html;
|
| 88 |
}).join('');
|
|
@@ -98,6 +99,25 @@ function renderTable(data){
|
|
| 98 |
renderTable(data);
|
| 99 |
});
|
| 100 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
}
|
| 102 |
|
| 103 |
// ===== CHART =====
|
|
|
|
| 1 |
// ===== 9. LEADERBOARD =====
|
| 2 |
const BENCHMARKS=['AppWorld','BrowseComp+','SWE-bench','TauBench-Airline','TauBench-Retail','TauBench-Telecom'];
|
| 3 |
const BENCH_SHORT={'AppWorld':'App','BrowseComp+':'Browse','SWE-bench':'SWE','TauBench-Airline':'Tau-Air','TauBench-Retail':'Tau-Ret','TauBench-Telecom':'Tau-Tel'};
|
| 4 |
+
const MODEL_DISPLAY={'claude-opus-4.5':'Claude Opus 4.5','gpt-5.2':'GPT 5.2','gemini-3-pro':'Gemini Pro 3','deepseek-v3.2':'DeepSeek V3.2','kimi-k2.5':'Kimi K2.5'};
|
| 5 |
+
const MODEL_URLS={'claude-opus-4.5':'https://www.anthropic.com/claude','gpt-5.2':'https://openai.com/','gemini-3-pro':'https://deepmind.google/technologies/gemini/','deepseek-v3.2':'https://www.deepseek.com/','kimi-k2.5':'https://www.moonshot.ai/'};
|
| 6 |
const AGENT_DISPLAY={'Claude_Code':'Claude Code','OpenAI_Solo':'OpenAI Solo','Smolagent':'Smolagent','React':'React','React_+_Shortlisting':'React + Shortlist'};
|
| 7 |
const AGENT_URLS={'Claude_Code':'https://github.com/anthropics/claude-code','OpenAI_Solo':'https://github.com/openai/openai-agents-python','Smolagent':'https://github.com/huggingface/smolagents','React':'https://github.com/BerriAI/litellm','React_+_Shortlisting':'https://github.com/BerriAI/litellm'};
|
| 8 |
|
|
|
|
| 27 |
const BENCH_WEIGHT={};
|
| 28 |
BENCHMARKS.forEach(b=>{BENCH_WEIGHT[b]=b.startsWith('TauBench')?1/12:1/4});
|
| 29 |
return Object.values(groups).map(g=>{
|
| 30 |
+
// Every (model, agent, benchmark) cell is populated; zero is a real score
|
| 31 |
+
// (TauBench protocol failures, AppWorld tool-limit failures). Include all six.
|
| 32 |
const bs=BENCHMARKS.map(b=>g.benchmarks[b]||0);
|
| 33 |
+
let wSum=0;
|
| 34 |
+
BENCHMARKS.forEach((b,i)=>{wSum+=bs[i]*BENCH_WEIGHT[b]});
|
| 35 |
+
const avg=wSum; // weights sum to 1
|
| 36 |
const cs=Object.values(g.costs).filter(c=>c>0);
|
| 37 |
const avgCost=cs.length?cs.reduce((a,b)=>a+b,0)/cs.length:0;
|
| 38 |
return{...g,avg,avgCost,benchScores:bs};
|
|
|
|
| 83 |
html+=`<td class="score-cell ${scoreClass(row.avg)}"><div class="bar bar-cyan" style="width:${row.avg*100}%"></div><span class="val">${fmtPct(row.avg)}</span></td>`;
|
| 84 |
html+=`<td class="cost-cell">$${row.avgCost.toFixed(2)}</td>`;
|
| 85 |
row.benchScores.forEach(s=>{
|
| 86 |
+
html+=`<td class="score-cell ${scoreClass(s)}"><div class="bar bar-purple" style="width:${s*100}%"></div><span class="val">${fmtPct(s)}</span></td>`;
|
|
|
|
| 87 |
});
|
| 88 |
html+=`</tr>`;return html;
|
| 89 |
}).join('');
|
|
|
|
| 99 |
renderTable(data);
|
| 100 |
});
|
| 101 |
});
|
| 102 |
+
|
| 103 |
+
// Remove any legacy "See all" button if it exists from a previous render
|
| 104 |
+
const oldBtn=document.getElementById('lbSeeAll');
|
| 105 |
+
if(oldBtn)oldBtn.remove();
|
| 106 |
+
|
| 107 |
+
// Toggle bottom-fade hint based on whether the table can still scroll down
|
| 108 |
+
const tableWrap=document.querySelector('.table-wrap');
|
| 109 |
+
const updateFade=()=>{
|
| 110 |
+
const atBottom=tableWrap.scrollTop+tableWrap.clientHeight>=tableWrap.scrollHeight-2;
|
| 111 |
+
const overflows=tableWrap.scrollHeight>tableWrap.clientHeight+2;
|
| 112 |
+
tableWrap.classList.toggle('lb-can-scroll',overflows&&!atBottom);
|
| 113 |
+
};
|
| 114 |
+
if(!tableWrap.dataset.scrollBound){
|
| 115 |
+
tableWrap.addEventListener('scroll',updateFade,{passive:true});
|
| 116 |
+
window.addEventListener('resize',updateFade);
|
| 117 |
+
tableWrap.dataset.scrollBound='1';
|
| 118 |
+
}
|
| 119 |
+
// Defer to next frame so layout is settled
|
| 120 |
+
requestAnimationFrame(updateFade);
|
| 121 |
}
|
| 122 |
|
| 123 |
// ===== CHART =====
|
scripts/pareto.js
DELETED
|
@@ -1,386 +0,0 @@
|
|
| 1 |
-
// SVG Pareto Chart — reads processed data from leaderboard.js via window._chartData
|
| 2 |
-
(() => {
|
| 3 |
-
const MODEL_DISPLAY = {
|
| 4 |
-
'claude-opus-4.5': 'Claude Opus 4.5',
|
| 5 |
-
'gpt-5.2': 'GPT 5.2',
|
| 6 |
-
'gemini-3-pro': 'Gemini Pro 3'
|
| 7 |
-
};
|
| 8 |
-
const MODEL_COLORS = {
|
| 9 |
-
'claude-opus-4.5': '#5ba8a0',
|
| 10 |
-
'gpt-5.2': '#9b8ec4',
|
| 11 |
-
'gemini-3-pro': '#6a8cbe'
|
| 12 |
-
};
|
| 13 |
-
const AGENT_DISPLAY = {
|
| 14 |
-
'Claude_Code': 'Claude Code',
|
| 15 |
-
'OpenAI_Solo': 'OpenAI Solo',
|
| 16 |
-
'Smolagent': 'Smolagent',
|
| 17 |
-
'React': 'React',
|
| 18 |
-
'React_+_Shortlisting': 'React + Shortlist'
|
| 19 |
-
};
|
| 20 |
-
|
| 21 |
-
const ns = 'http://www.w3.org/2000/svg';
|
| 22 |
-
|
| 23 |
-
function el(tag, attrs) {
|
| 24 |
-
const e = document.createElementNS(ns, tag);
|
| 25 |
-
if (attrs) for (const [k, v] of Object.entries(attrs)) e.setAttribute(k, v);
|
| 26 |
-
return e;
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
function makeShape(agent, x, y, color) {
|
| 30 |
-
const g = el('g');
|
| 31 |
-
const common = { fill: color, 'fill-opacity': '0.82', stroke: 'rgba(128,128,128,0.3)', 'stroke-width': '1.5' };
|
| 32 |
-
let shape;
|
| 33 |
-
|
| 34 |
-
if (agent === 'Claude_Code') {
|
| 35 |
-
const half = 9;
|
| 36 |
-
const pts = [];
|
| 37 |
-
for (let i = 0; i < 10; i++) {
|
| 38 |
-
const r = i % 2 === 0 ? half : half * 0.42;
|
| 39 |
-
const a = i * Math.PI / 5 - Math.PI / 2;
|
| 40 |
-
pts.push((x + r * Math.cos(a)).toFixed(1) + ',' + (y + r * Math.sin(a)).toFixed(1));
|
| 41 |
-
}
|
| 42 |
-
shape = el('polygon', { points: pts.join(' '), ...common });
|
| 43 |
-
} else if (agent === 'OpenAI_Solo') {
|
| 44 |
-
const half = 8;
|
| 45 |
-
shape = el('polygon', { points: `${x},${y - half} ${x + half},${y} ${x},${y + half} ${x - half},${y}`, ...common });
|
| 46 |
-
} else if (agent === 'Smolagent') {
|
| 47 |
-
const half = 8.5;
|
| 48 |
-
shape = el('polygon', { points: `${x},${y - half} ${x + half},${y + half} ${x - half},${y + half}`, ...common });
|
| 49 |
-
} else if (agent === 'React_+_Shortlisting') {
|
| 50 |
-
const half = 6.5;
|
| 51 |
-
shape = el('rect', { x: x - half, y: y - half, width: half * 2, height: half * 2, rx: 2, ...common });
|
| 52 |
-
} else {
|
| 53 |
-
const half = 7;
|
| 54 |
-
shape = el('circle', { cx: x, cy: y, r: half, ...common });
|
| 55 |
-
}
|
| 56 |
-
|
| 57 |
-
g.appendChild(shape);
|
| 58 |
-
// Invisible larger hit area for hover
|
| 59 |
-
g.appendChild(el('circle', { cx: x, cy: y, r: 20, fill: 'transparent', cursor: 'pointer' }));
|
| 60 |
-
return g;
|
| 61 |
-
}
|
| 62 |
-
|
| 63 |
-
function makeLegendShapeSVG(agent) {
|
| 64 |
-
const svg = document.createElementNS(ns, 'svg');
|
| 65 |
-
svg.setAttribute('width', '14');
|
| 66 |
-
svg.setAttribute('height', '14');
|
| 67 |
-
svg.setAttribute('viewBox', '0 0 14 14');
|
| 68 |
-
const common = { fill: 'var(--text-muted)', 'fill-opacity': '0.5', stroke: 'var(--text-muted)', 'stroke-width': '0.8' };
|
| 69 |
-
let shape;
|
| 70 |
-
if (agent === 'Claude_Code') {
|
| 71 |
-
const pts = [];
|
| 72 |
-
for (let i = 0; i < 10; i++) {
|
| 73 |
-
const r = i % 2 === 0 ? 6 : 6 * 0.42;
|
| 74 |
-
const a = i * Math.PI / 5 - Math.PI / 2;
|
| 75 |
-
pts.push((7 + r * Math.cos(a)).toFixed(1) + ',' + (7 + r * Math.sin(a)).toFixed(1));
|
| 76 |
-
}
|
| 77 |
-
shape = el('polygon', { points: pts.join(' '), ...common });
|
| 78 |
-
} else if (agent === 'OpenAI_Solo') {
|
| 79 |
-
shape = el('polygon', { points: '7,1 11,7 7,13 3,7', ...common });
|
| 80 |
-
} else if (agent === 'Smolagent') {
|
| 81 |
-
shape = el('polygon', { points: '7,2 12,12 2,12', ...common });
|
| 82 |
-
} else if (agent === 'React_+_Shortlisting') {
|
| 83 |
-
shape = el('rect', { x: 2, y: 2, width: 10, height: 10, rx: 1.5, ...common });
|
| 84 |
-
} else {
|
| 85 |
-
shape = el('circle', { cx: 7, cy: 7, r: 5, ...common });
|
| 86 |
-
}
|
| 87 |
-
svg.appendChild(shape);
|
| 88 |
-
return svg;
|
| 89 |
-
}
|
| 90 |
-
|
| 91 |
-
function renderParetoChart(data) {
|
| 92 |
-
const container = document.getElementById('paretoChartContainer');
|
| 93 |
-
if (!container) return;
|
| 94 |
-
container.innerHTML = '';
|
| 95 |
-
|
| 96 |
-
// Build chart data from processed leaderboard data
|
| 97 |
-
const points = data.map(d => ({
|
| 98 |
-
agent: d.agent,
|
| 99 |
-
model: d.model,
|
| 100 |
-
cost: d.avgCost,
|
| 101 |
-
success: d.avg
|
| 102 |
-
})).filter(d => d.cost > 0 && d.success > 0);
|
| 103 |
-
|
| 104 |
-
if (points.length === 0) return;
|
| 105 |
-
|
| 106 |
-
// Determine unique agents and models
|
| 107 |
-
const agents = [...new Set(points.map(p => p.agent))];
|
| 108 |
-
const models = [...new Set(points.map(p => p.model))];
|
| 109 |
-
|
| 110 |
-
// Determine axis ranges with padding
|
| 111 |
-
const costs = points.map(p => p.cost);
|
| 112 |
-
const successes = points.map(p => p.success);
|
| 113 |
-
const xMax = Math.ceil(Math.max(...costs) + 0.5);
|
| 114 |
-
const xMin = 0;
|
| 115 |
-
const yMinRaw = Math.min(...successes);
|
| 116 |
-
const yMaxRaw = Math.max(...successes);
|
| 117 |
-
const yMin = Math.floor(yMinRaw * 10) / 10 - 0.05;
|
| 118 |
-
const yMax = Math.ceil(yMaxRaw * 10) / 10 + 0.05;
|
| 119 |
-
|
| 120 |
-
// SVG dimensions
|
| 121 |
-
const W = 740, H = 420;
|
| 122 |
-
const ml = 62, mr = 20, mt = 14, mb = 52;
|
| 123 |
-
const pw = W - ml - mr, ph = H - mt - mb;
|
| 124 |
-
|
| 125 |
-
const sx = c => ml + ((c - xMin) / (xMax - xMin)) * pw;
|
| 126 |
-
const sy = s => mt + (1 - ((s - yMin) / (yMax - yMin))) * ph;
|
| 127 |
-
|
| 128 |
-
// Pareto frontier
|
| 129 |
-
const sorted = [...points].sort((a, b) => a.cost - b.cost);
|
| 130 |
-
const pareto = [];
|
| 131 |
-
let bestS = -Infinity;
|
| 132 |
-
for (const p of sorted) {
|
| 133 |
-
if (p.success > bestS) { pareto.push(p); bestS = p.success; }
|
| 134 |
-
}
|
| 135 |
-
|
| 136 |
-
// Build DOM
|
| 137 |
-
const root = document.createElement('div');
|
| 138 |
-
root.className = 'pareto-embed';
|
| 139 |
-
|
| 140 |
-
const chartWrap = document.createElement('div');
|
| 141 |
-
chartWrap.className = 'chart-wrap';
|
| 142 |
-
|
| 143 |
-
const svg = document.createElementNS(ns, 'svg');
|
| 144 |
-
svg.setAttribute('class', 'chart');
|
| 145 |
-
svg.setAttribute('viewBox', `0 0 ${W} ${H}`);
|
| 146 |
-
svg.setAttribute('aria-label', 'Pareto frontier: cost vs success rate for agent configurations');
|
| 147 |
-
|
| 148 |
-
// Gradient for pareto area
|
| 149 |
-
const defs = el('defs');
|
| 150 |
-
const grad = el('linearGradient', { id: 'pareto-fade-main', x1: '0', y1: '0', x2: '0', y2: '1' });
|
| 151 |
-
const stop1 = el('stop', { offset: '0%', 'stop-color': '#5ba8a0', 'stop-opacity': '0.08' });
|
| 152 |
-
const stop2 = el('stop', { offset: '100%', 'stop-color': '#5ba8a0', 'stop-opacity': '0' });
|
| 153 |
-
grad.appendChild(stop1);
|
| 154 |
-
grad.appendChild(stop2);
|
| 155 |
-
defs.appendChild(grad);
|
| 156 |
-
svg.appendChild(defs);
|
| 157 |
-
|
| 158 |
-
// Grid lines and ticks
|
| 159 |
-
const xStep = xMax <= 10 ? 1 : 2;
|
| 160 |
-
const xTicks = [];
|
| 161 |
-
for (let x = 0; x <= xMax; x += xStep) xTicks.push(x);
|
| 162 |
-
const yTicks = [];
|
| 163 |
-
for (let y = Math.ceil(yMin * 10) / 10; y <= yMax; y = Math.round((y + 0.1) * 10) / 10) yTicks.push(y);
|
| 164 |
-
|
| 165 |
-
for (const x of xTicks) {
|
| 166 |
-
const px = sx(x);
|
| 167 |
-
svg.appendChild(el('line', { x1: px, y1: mt, x2: px, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.15', 'stroke-width': 1 }));
|
| 168 |
-
const t = el('text', { x: px, y: mt + ph + 22, 'font-size': 11, fill: 'var(--text-muted)', 'text-anchor': 'middle', 'font-family': 'Inter, system-ui', 'font-weight': 400 });
|
| 169 |
-
t.textContent = '$' + x;
|
| 170 |
-
svg.appendChild(t);
|
| 171 |
-
}
|
| 172 |
-
|
| 173 |
-
for (const y of yTicks) {
|
| 174 |
-
const py = sy(y);
|
| 175 |
-
svg.appendChild(el('line', { x1: ml, y1: py, x2: ml + pw, y2: py, stroke: 'var(--text-muted)', 'stroke-opacity': '0.15', 'stroke-width': 1 }));
|
| 176 |
-
const t = el('text', { x: ml - 10, y: py + 4, 'font-size': 11, fill: 'var(--text-muted)', 'text-anchor': 'end', 'font-family': 'Inter, system-ui', 'font-weight': 400 });
|
| 177 |
-
t.textContent = Math.round(y * 100) + '%';
|
| 178 |
-
svg.appendChild(t);
|
| 179 |
-
}
|
| 180 |
-
|
| 181 |
-
// Axes
|
| 182 |
-
svg.appendChild(el('line', { x1: ml, y1: mt + ph, x2: ml + pw, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.25', 'stroke-width': 1 }));
|
| 183 |
-
svg.appendChild(el('line', { x1: ml, y1: mt, x2: ml, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.25', 'stroke-width': 1 }));
|
| 184 |
-
|
| 185 |
-
// Axis labels
|
| 186 |
-
const xLabel = el('text', { x: ml + pw / 2, y: H - 8, 'text-anchor': 'middle', 'font-size': 12, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-weight': 500 });
|
| 187 |
-
xLabel.textContent = 'Average cost per task (USD)';
|
| 188 |
-
svg.appendChild(xLabel);
|
| 189 |
-
|
| 190 |
-
const yLabel = el('text', { x: 14, y: mt + ph / 2, 'text-anchor': 'middle', 'font-size': 12, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-weight': 500, transform: 'rotate(-90 14 ' + (mt + ph / 2) + ')' });
|
| 191 |
-
yLabel.textContent = 'Success rate';
|
| 192 |
-
svg.appendChild(yLabel);
|
| 193 |
-
|
| 194 |
-
// Pareto area fill
|
| 195 |
-
if (pareto.length > 1) {
|
| 196 |
-
let areaD = 'M ' + sx(pareto[0].cost) + ' ' + sy(pareto[0].success);
|
| 197 |
-
for (let i = 1; i < pareto.length; i++) areaD += ' L ' + sx(pareto[i].cost) + ' ' + sy(pareto[i].success);
|
| 198 |
-
areaD += ' L ' + sx(pareto[pareto.length - 1].cost) + ' ' + (mt + ph);
|
| 199 |
-
areaD += ' L ' + sx(pareto[0].cost) + ' ' + (mt + ph) + ' Z';
|
| 200 |
-
svg.appendChild(el('path', { d: areaD, fill: 'url(#pareto-fade-main)' }));
|
| 201 |
-
}
|
| 202 |
-
|
| 203 |
-
// Pareto line
|
| 204 |
-
const paretoD = pareto.map((p, i) => (i === 0 ? 'M' : 'L') + ' ' + sx(p.cost) + ' ' + sy(p.success)).join(' ');
|
| 205 |
-
const paretoLine = el('path', { d: paretoD, fill: 'none', stroke: 'var(--text-muted)', 'stroke-opacity': '0.4', 'stroke-width': 1.6, 'stroke-dasharray': '6 4' });
|
| 206 |
-
svg.appendChild(paretoLine);
|
| 207 |
-
|
| 208 |
-
// Pareto label
|
| 209 |
-
const lastPareto = pareto[pareto.length - 1];
|
| 210 |
-
const paretoLabel = el('text', { x: sx(lastPareto.cost) + 8, y: sy(lastPareto.success) - 10, 'font-size': 10, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-style': 'italic' });
|
| 211 |
-
paretoLabel.textContent = 'Pareto frontier';
|
| 212 |
-
svg.appendChild(paretoLabel);
|
| 213 |
-
|
| 214 |
-
// Data points
|
| 215 |
-
const pointEls = [];
|
| 216 |
-
points.forEach((d, i) => {
|
| 217 |
-
const x = sx(d.cost);
|
| 218 |
-
const y = sy(d.success);
|
| 219 |
-
const color = MODEL_COLORS[d.model] || '#7A7A7A';
|
| 220 |
-
const g = makeShape(d.agent, x, y, color);
|
| 221 |
-
g.classList.add('chart-point');
|
| 222 |
-
g.dataset.agent = d.agent;
|
| 223 |
-
g.dataset.model = d.model;
|
| 224 |
-
g.dataset.idx = i;
|
| 225 |
-
svg.appendChild(g);
|
| 226 |
-
pointEls.push({ el: g, data: d, x, y });
|
| 227 |
-
});
|
| 228 |
-
|
| 229 |
-
chartWrap.appendChild(svg);
|
| 230 |
-
|
| 231 |
-
// Tooltip
|
| 232 |
-
const tooltip = document.createElement('div');
|
| 233 |
-
tooltip.className = 'tooltip';
|
| 234 |
-
chartWrap.appendChild(tooltip);
|
| 235 |
-
|
| 236 |
-
root.appendChild(chartWrap);
|
| 237 |
-
|
| 238 |
-
// Legend
|
| 239 |
-
const legendRow = document.createElement('div');
|
| 240 |
-
legendRow.className = 'legend-row';
|
| 241 |
-
|
| 242 |
-
// Agents legend group
|
| 243 |
-
const agentGroup = document.createElement('div');
|
| 244 |
-
agentGroup.className = 'legend-group';
|
| 245 |
-
const agentLabel = document.createElement('span');
|
| 246 |
-
agentLabel.className = 'legend-group-label';
|
| 247 |
-
agentLabel.textContent = 'Agents';
|
| 248 |
-
agentGroup.appendChild(agentLabel);
|
| 249 |
-
|
| 250 |
-
agents.forEach(agent => {
|
| 251 |
-
const item = document.createElement('div');
|
| 252 |
-
item.className = 'legend-item';
|
| 253 |
-
item.dataset.filter = 'agent';
|
| 254 |
-
item.dataset.value = agent;
|
| 255 |
-
item.appendChild(makeLegendShapeSVG(agent));
|
| 256 |
-
const span = document.createElement('span');
|
| 257 |
-
span.textContent = AGENT_DISPLAY[agent] || agent.replace(/_/g, ' ');
|
| 258 |
-
item.appendChild(span);
|
| 259 |
-
agentGroup.appendChild(item);
|
| 260 |
-
});
|
| 261 |
-
|
| 262 |
-
// Models legend group
|
| 263 |
-
const modelGroup = document.createElement('div');
|
| 264 |
-
modelGroup.className = 'legend-group';
|
| 265 |
-
const modelLabel = document.createElement('span');
|
| 266 |
-
modelLabel.className = 'legend-group-label';
|
| 267 |
-
modelLabel.textContent = 'Models';
|
| 268 |
-
modelGroup.appendChild(modelLabel);
|
| 269 |
-
|
| 270 |
-
models.forEach(model => {
|
| 271 |
-
const item = document.createElement('div');
|
| 272 |
-
item.className = 'legend-item';
|
| 273 |
-
item.dataset.filter = 'model';
|
| 274 |
-
item.dataset.value = model;
|
| 275 |
-
const dot = document.createElement('span');
|
| 276 |
-
dot.style.cssText = 'display:inline-block;width:11px;height:11px;border-radius:50%;background:' + (MODEL_COLORS[model] || '#666') + ';border:1px solid rgba(128,128,128,0.2);';
|
| 277 |
-
item.appendChild(dot);
|
| 278 |
-
const span = document.createElement('span');
|
| 279 |
-
span.textContent = MODEL_DISPLAY[model] || model;
|
| 280 |
-
item.appendChild(span);
|
| 281 |
-
modelGroup.appendChild(item);
|
| 282 |
-
});
|
| 283 |
-
|
| 284 |
-
legendRow.appendChild(agentGroup);
|
| 285 |
-
legendRow.appendChild(modelGroup);
|
| 286 |
-
root.appendChild(legendRow);
|
| 287 |
-
|
| 288 |
-
container.appendChild(root);
|
| 289 |
-
|
| 290 |
-
// Interaction
|
| 291 |
-
const activeFilters = { agent: null, model: null };
|
| 292 |
-
const legendItems = root.querySelectorAll('.legend-item[data-filter]');
|
| 293 |
-
|
| 294 |
-
function applyFilters() {
|
| 295 |
-
const hasFilter = activeFilters.agent || activeFilters.model;
|
| 296 |
-
legendItems.forEach(li => {
|
| 297 |
-
const t = li.dataset.filter;
|
| 298 |
-
const v = li.dataset.value;
|
| 299 |
-
if (activeFilters[t] === v) {
|
| 300 |
-
li.classList.add('active');
|
| 301 |
-
li.classList.remove('dimmed');
|
| 302 |
-
} else if (activeFilters[t] && activeFilters[t] !== v) {
|
| 303 |
-
li.classList.remove('active');
|
| 304 |
-
li.classList.add('dimmed');
|
| 305 |
-
} else {
|
| 306 |
-
li.classList.remove('active');
|
| 307 |
-
li.classList.remove('dimmed');
|
| 308 |
-
}
|
| 309 |
-
});
|
| 310 |
-
|
| 311 |
-
pointEls.forEach(({ el: g, data: d }) => {
|
| 312 |
-
const matchAgent = !activeFilters.agent || d.agent === activeFilters.agent;
|
| 313 |
-
const matchModel = !activeFilters.model || d.model === activeFilters.model;
|
| 314 |
-
const visible = !hasFilter || (matchAgent && matchModel);
|
| 315 |
-
g.style.opacity = visible ? '1' : '0.08';
|
| 316 |
-
g.style.filter = 'none';
|
| 317 |
-
});
|
| 318 |
-
|
| 319 |
-
paretoLine.style.opacity = hasFilter ? '0.1' : '1';
|
| 320 |
-
}
|
| 321 |
-
|
| 322 |
-
function showTooltip(d, x, y) {
|
| 323 |
-
pointEls.forEach(p => {
|
| 324 |
-
if (p.data === d) {
|
| 325 |
-
p.el.style.opacity = '1';
|
| 326 |
-
p.el.style.filter = 'drop-shadow(0 2px 8px rgba(0,0,0,0.25))';
|
| 327 |
-
} else {
|
| 328 |
-
p.el.style.opacity = '0.15';
|
| 329 |
-
p.el.style.filter = 'none';
|
| 330 |
-
}
|
| 331 |
-
});
|
| 332 |
-
paretoLine.style.opacity = '0.15';
|
| 333 |
-
|
| 334 |
-
const rect = chartWrap.getBoundingClientRect();
|
| 335 |
-
const svgRect = svg.getBoundingClientRect();
|
| 336 |
-
const scaleX = svgRect.width / W;
|
| 337 |
-
const scaleY = svgRect.height / H;
|
| 338 |
-
const tipX = x * scaleX + (svgRect.left - rect.left);
|
| 339 |
-
const tipY = y * scaleY + (svgRect.top - rect.top);
|
| 340 |
-
const successColor = MODEL_COLORS[d.model] || '#333';
|
| 341 |
-
const agentName = AGENT_DISPLAY[d.agent] || d.agent.replace(/_/g, ' ');
|
| 342 |
-
const modelName = MODEL_DISPLAY[d.model] || d.model;
|
| 343 |
-
|
| 344 |
-
tooltip.innerHTML =
|
| 345 |
-
'<div class="tooltip-agent">' + agentName + '</div>' +
|
| 346 |
-
'<div class="tooltip-model">' + modelName + '</div>' +
|
| 347 |
-
'<div class="tooltip-stats">' +
|
| 348 |
-
'<div><div class="tooltip-stat-label">Success</div><div class="tooltip-stat-value" style="color:' + successColor + '">' + (d.success * 100).toFixed(0) + '%</div></div>' +
|
| 349 |
-
'<div><div class="tooltip-stat-label">Cost/task</div><div class="tooltip-stat-value">$' + d.cost.toFixed(2) + '</div></div>' +
|
| 350 |
-
'</div>';
|
| 351 |
-
|
| 352 |
-
tooltip.classList.add('visible');
|
| 353 |
-
const tipW = tooltip.offsetWidth;
|
| 354 |
-
const tipH = tooltip.offsetHeight;
|
| 355 |
-
let left = tipX + 18;
|
| 356 |
-
let top = tipY - tipH / 2;
|
| 357 |
-
if (left + tipW > rect.width - 8) left = tipX - tipW - 18;
|
| 358 |
-
if (top < 4) top = 4;
|
| 359 |
-
if (top + tipH > rect.height - 4) top = rect.height - tipH - 4;
|
| 360 |
-
tooltip.style.left = left + 'px';
|
| 361 |
-
tooltip.style.top = top + 'px';
|
| 362 |
-
}
|
| 363 |
-
|
| 364 |
-
function hideTooltip() {
|
| 365 |
-
applyFilters();
|
| 366 |
-
tooltip.classList.remove('visible');
|
| 367 |
-
}
|
| 368 |
-
|
| 369 |
-
pointEls.forEach(({ el: g, data: d, x, y }) => {
|
| 370 |
-
g.addEventListener('mouseenter', () => showTooltip(d, x, y));
|
| 371 |
-
g.addEventListener('mouseleave', hideTooltip);
|
| 372 |
-
});
|
| 373 |
-
|
| 374 |
-
legendItems.forEach(item => {
|
| 375 |
-
item.addEventListener('click', () => {
|
| 376 |
-
const type = item.dataset.filter;
|
| 377 |
-
const value = item.dataset.value;
|
| 378 |
-
activeFilters[type] = (activeFilters[type] === value) ? null : value;
|
| 379 |
-
applyFilters();
|
| 380 |
-
});
|
| 381 |
-
});
|
| 382 |
-
}
|
| 383 |
-
|
| 384 |
-
// Expose globally so main.js can re-render on theme toggle
|
| 385 |
-
window.renderParetoChart = renderParetoChart;
|
| 386 |
-
})();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
styles.css
CHANGED
|
@@ -404,7 +404,9 @@ section{position:relative;z-index:1;padding:80px 0}
|
|
| 404 |
}
|
| 405 |
.filter-select:hover,.filter-select:focus{border-color:var(--cyan);outline:none;box-shadow:0 0 12px var(--cyan-dim)}
|
| 406 |
|
| 407 |
-
.table-wrap{overflow-x:auto;border-radius:12px;border:1px solid rgba(255,255,255,0.1);background:rgba(10,10,10,0.8);box-shadow:0 2px 16px rgba(0,0,0,0.3)}
|
|
|
|
|
|
|
| 408 |
.lb-table{width:100%;border-collapse:collapse;font-size:.78rem}
|
| 409 |
.lb-table thead{background:rgba(255,255,255,0.06)}
|
| 410 |
.lb-table th{
|
|
|
|
| 404 |
}
|
| 405 |
.filter-select:hover,.filter-select:focus{border-color:var(--cyan);outline:none;box-shadow:0 0 12px var(--cyan-dim)}
|
| 406 |
|
| 407 |
+
.table-wrap{overflow-x:auto;overflow-y:auto;max-height:600px;border-radius:12px;border:1px solid rgba(255,255,255,0.1);background:rgba(10,10,10,0.8);box-shadow:0 2px 16px rgba(0,0,0,0.3);position:relative}
|
| 408 |
+
.table-wrap.lb-can-scroll{-webkit-mask-image:linear-gradient(to bottom,#000 88%,transparent 100%);mask-image:linear-gradient(to bottom,#000 88%,transparent 100%)}
|
| 409 |
+
.lb-table thead{position:sticky;top:0;z-index:1;backdrop-filter:blur(8px);-webkit-backdrop-filter:blur(8px)}
|
| 410 |
.lb-table{width:100%;border-collapse:collapse;font-size:.78rem}
|
| 411 |
.lb-table thead{background:rgba(255,255,255,0.06)}
|
| 412 |
.lb-table th{
|