Elron committed on
Commit
a7dde89
·
verified ·
1 Parent(s): 22f581d

Auto-deploy from GitHub

Browse files
Files changed (7) hide show
  1. README.md +1 -3
  2. index.html +1 -2
  3. results.csv +148 -88
  4. results.csv.timestamp +1 -1
  5. scripts/leaderboard.js +27 -7
  6. scripts/pareto.js +0 -386
  7. styles.css +3 -1
README.md CHANGED
@@ -14,7 +14,5 @@ Interactive leaderboard and efficiency analysis for general-purpose AI agents ev
14
 
15
  - **Benchmarks**: AppWorld, BrowseComp+, SWE-bench, TauBench (Airline, Retail, Telecom)
16
  - **Paper**: [arXiv:2602.22953](https://arxiv.org/abs/2602.22953)
17
- - **Framework**: [Exgentic](https://github.com/Exgentic/exgentic)
18
- - **GitHub**: [Exgentic/open-agent-leaderboard](https://github.com/Exgentic/open-agent-leaderboard)
19
  - **Website**: [exgentic.github.io](https://exgentic.github.io)
20
- - **Submit Results**: [CONTRIBUTING.md](https://github.com/Exgentic/open-agent-leaderboard/blob/main/CONTRIBUTING.md)
 
14
 
15
  - **Benchmarks**: AppWorld, BrowseComp+, SWE-bench, TauBench (Airline, Retail, Telecom)
16
  - **Paper**: [arXiv:2602.22953](https://arxiv.org/abs/2602.22953)
17
+ - **GitHub**: [Exgentic/exgentic](https://github.com/Exgentic/exgentic)
 
18
  - **Website**: [exgentic.github.io](https://exgentic.github.io)
 
index.html CHANGED
@@ -1,6 +1,6 @@
1
  <!--
2
  SPDX-License-Identifier: Apache-2.0
3
- Copyright (C) 2025-2026, The Exgentic organization and its contributors.
4
  -->
5
  <!doctype html>
6
  <html lang="en">
@@ -82,7 +82,6 @@ const header=document.getElementById('header');
82
  window.addEventListener('scroll',()=>{header.classList.toggle('scrolled',window.scrollY>50)},{passive:true});
83
  document.getElementById('mobileToggle').addEventListener('click',()=>{document.getElementById('headerNav').classList.toggle('open')});
84
  </script>
85
- <script src="./scripts/pareto.js"></script>
86
  <script src="./scripts/leaderboard.js" data-base="./"></script>
87
  <script>
88
  // Theme toggle
 
1
  <!--
2
  SPDX-License-Identifier: Apache-2.0
3
+ Copyright (C) 2025, The Exgentic organization and its contributors.
4
  -->
5
  <!doctype html>
6
  <html lang="en">
 
82
  window.addEventListener('scroll',()=>{header.classList.toggle('scrolled',window.scrollY>50)},{passive:true});
83
  document.getElementById('mobileToggle').addEventListener('click',()=>{document.getElementById('headerNav').classList.toggle('open')});
84
  </script>
 
85
  <script src="./scripts/leaderboard.js" data-base="./"></script>
86
  <script>
87
  // Theme toggle
results.csv CHANGED
@@ -1,91 +1,151 @@
1
  agent,agent_normalized,visible_agent_name,agent_version,avg_cost,avg_steps,benchmark,finished_pct,model,model_normalized,num_tasks,score,total_cost
2
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,13.08379215,49.69,AppWorld,0.74,claude,claude-opus-4.5,100,0.66,1308.379215
3
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0,0,AppWorld,0,gpt52,gpt-5.2,100,0,0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.10550762,38.01,AppWorld,0.86,gemini,gemini-3-pro,100,0.36,310.550762
5
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,22.7647676,47.65,AppWorld,0.77,claude,claude-opus-4.5,100,0.68,2276.47676
6
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0,0,AppWorld,0,gpt52,gpt-5.2,100,0,0
7
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,8.69552842,33.49,AppWorld,0.98,gemini,gemini-3-pro,100,0.582,869.552842
8
- smolagent,smolagents,Smolagent,smolagents_1.24.0,5.5851091,41.07,AppWorld,0.82,claude,claude-opus-4.5,100,0.7,558.51091
9
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.5503114225,51.59,AppWorld,0.61,gpt52,gpt-5.2,100,0.071,55.03114225
10
- smolagent,smolagents,Smolagent,smolagents_1.24.0,2.54254602,49.13,AppWorld,0.71,gemini,gemini-3-pro,100,0.13,254.254602
11
- litellm,litellm-react,React,exgentic_0.1.0,11.32465995,21.99,AppWorld,0.83,claude,claude-opus-4.5,100,0.61,1132.465995
12
- litellm,litellm-react,React,exgentic_0.1.0,0,0,AppWorld,0,gpt52,gpt-5.2,100,0,0
13
- litellm,litellm-react,React,exgentic_0.1.0,1.88187702,21.76,AppWorld,0.99,gemini,gemini-3-pro,100,0.505,188.187702
14
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.43320175,20.06,AppWorld,0.82,claude,claude-opus-4.5,100,0.64,343.320175
15
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.3637131575,10.05,AppWorld,1,gpt52,gpt-5.2,100,0.22,36.37131575
16
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,1.30485292,22.59,AppWorld,1,gemini,gemini-3-pro,100,0.55,130.485292
17
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,11.65879637,31.03921569,BrowseComp+,0.8431372549,claude,claude-opus-4.5,51,0.5294117647,594.598615
18
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.4293807,8.97,BrowseComp+,1,gpt52,gpt-5.2,100,0.43,42.93807
19
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.84529442,22.88,BrowseComp+,0.7,gemini,gemini-3-pro,100,0.51,284.529442
20
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,7.59245735,27.18,BrowseComp+,1,claude,claude-opus-4.5,100,0.61,759.245735
21
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.38043726,14.27,BrowseComp+,1,gpt52,gpt-5.2,100,0.48,38.043726
22
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.6433071111,8.454545455,BrowseComp+,0.6060606061,gemini,gemini-3-pro,99,0.3333333333,63.687404
23
- smolagent,smolagents,Smolagent,smolagents_1.24.0,6.30372505,24.16,BrowseComp+,1,claude,claude-opus-4.5,100,0.61,630.372505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  smolagent,smolagents,Smolagent,smolagents_1.24.0,0.17156755,6.57,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.26,17.156755
25
- smolagent,smolagents,Smolagent,smolagents_1.24.0,2.38885126,29.63,BrowseComp+,0.69,gemini,gemini-3-pro,100,0.57,238.885126
26
- litellm,litellm-react,React,exgentic_0.1.0,7.09363285,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.363285
27
- litellm,litellm-react,React,exgentic_0.1.0,0.296172625,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.6172625
28
- litellm,litellm-react,React,exgentic_0.1.0,0.43943356,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.943356
29
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,7.09363285,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.363285
30
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.296172625,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.6172625
31
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.43943356,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.943356
32
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,5.604360979,31.7628866,SWE-bench,1,claude,claude-opus-4.5,97,0.7422680412,543.623015
33
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.939754725,23.99,SWE-bench,1,gpt52,gpt-5.2,100,0.58,93.9754725
34
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.67966876,43.72,SWE-bench,1,gemini,gemini-3-pro,100,0.67,367.966876
35
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,2.961261205,34.09638554,SWE-bench,1,claude,claude-opus-4.5,83,0.8072289157,245.78468
36
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.2590178359,20.44444444,SWE-bench,1,gpt52,gpt-5.2,99,0.5454545455,25.64276575
37
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.579177979,32.36170213,SWE-bench,1,gemini,gemini-3-pro,94,0.7234042553,148.44273
38
- smolagent,smolagents,Smolagent,smolagents_1.24.0,4.85218325,39.13,SWE-bench,1,claude,claude-opus-4.5,100,0.65,485.218325
39
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.4503179545,19.97979798,SWE-bench,1,gpt52,gpt-5.2,99,0.5252525253,44.5814775
40
- smolagent,smolagents,Smolagent,smolagents_1.24.0,2.209661838,38.1010101,SWE-bench,1,gemini,gemini-3-pro,99,0.7575757576,218.756522
41
- litellm,litellm-react,React,exgentic_0.1.0,3.971294949,43.44444444,SWE-bench,1,claude,claude-opus-4.5,99,0.6060606061,393.1582
42
- litellm,litellm-react,React,exgentic_0.1.0,0.247620555,20.47,SWE-bench,1,gpt52,gpt-5.2,100,0.57,24.7620555
43
- litellm,litellm-react,React,exgentic_0.1.0,0.69561342,32.55,SWE-bench,1,gemini,gemini-3-pro,100,0.71,69.561342
44
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.971294949,43.44444444,SWE-bench,1,claude,claude-opus-4.5,99,0.6060606061,393.1582
45
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.247620555,20.47,SWE-bench,1,gpt52,gpt-5.2,100,0.57,24.7620555
46
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.69561342,32.55,SWE-bench,1,gemini,gemini-3-pro,100,0.71,69.561342
47
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.2993549,11.5,TauBench-Airline,1,claude,claude-opus-4.5,50,0.66,64.967745
48
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21338296,10.18,TauBench-Airline,1,gpt52,gpt-5.2,50,0.48,10.669148
49
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.3394426,12.62,TauBench-Airline,1,gemini,gemini-3-pro,50,0.7,16.97213
50
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.7165842,12.22,TauBench-Airline,1,claude,claude-opus-4.5,50,0.74,35.82921
51
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.10711806,11.4,TauBench-Airline,1,gpt52,gpt-5.2,50,0.5,5.355903
52
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.21380212,10.9,TauBench-Airline,1,gemini,gemini-3-pro,50,0.62,10.690106
53
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.7801528,11.88,TauBench-Airline,1,claude,claude-opus-4.5,50,0.72,39.00764
54
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.293271755,10.68,TauBench-Airline,1,gpt52,gpt-5.2,50,0.6,14.66358775
55
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.19590488,12.28,TauBench-Airline,1,gemini,gemini-3-pro,50,0.68,9.795244
56
- litellm,litellm-react,React,exgentic_0.1.0,0.4692376,10,TauBench-Airline,1,claude,claude-opus-4.5,50,0.66,23.46188
57
- litellm,litellm-react,React,exgentic_0.1.0,0.125104735,11.22,TauBench-Airline,1,gpt52,gpt-5.2,50,0.54,6.25523675
58
- litellm,litellm-react,React,exgentic_0.1.0,0.15854968,10.14,TauBench-Airline,1,gemini,gemini-3-pro,50,0.7,7.927484
59
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.4692376,10,TauBench-Airline,1,claude,claude-opus-4.5,50,0.66,23.46188
60
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.125104735,11.22,TauBench-Airline,1,gpt52,gpt-5.2,50,0.54,6.25523675
61
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15854968,10.14,TauBench-Airline,1,gemini,gemini-3-pro,50,0.7,7.927484
62
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.59852325,12.54,TauBench-Retail,1,claude,claude-opus-4.5,100,0.83,159.852325
63
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.1200643675,9.92,TauBench-Retail,0.98,gpt52,gpt-5.2,100,0.51,12.00643675
64
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.18525812,11.18,TauBench-Retail,1,gemini,gemini-3-pro,100,0.7804878049,18.525812
65
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.5495812,12.54,TauBench-Retail,1,claude,claude-opus-4.5,100,0.85,54.95812
66
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.107783305,9.55,TauBench-Retail,0.99,gpt52,gpt-5.2,100,0.5353535354,10.7783305
67
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.266485,10.62,TauBench-Retail,1,gemini,gemini-3-pro,100,0.73,26.6485
68
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.6711861,11.71,TauBench-Retail,1,claude,claude-opus-4.5,100,0.78,67.11861
69
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2519468525,11.08,TauBench-Retail,1,gpt52,gpt-5.2,100,0.68,25.19468525
70
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2056864,11.3,TauBench-Retail,1,gemini,gemini-3-pro,100,0.7575757576,20.56864
71
- litellm,litellm-react,React,exgentic_0.1.0,0.46742525,11.33,TauBench-Retail,1,claude,claude-opus-4.5,100,0.78,46.742525
72
- litellm,litellm-react,React,exgentic_0.1.0,0.1109317125,10.33,TauBench-Retail,1,gpt52,gpt-5.2,100,0.73,11.09317125
73
- litellm,litellm-react,React,exgentic_0.1.0,0.15649692,11.25,TauBench-Retail,1,gemini,gemini-3-pro,100,0.82,15.649692
74
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.46742525,11.33,TauBench-Retail,1,claude,claude-opus-4.5,100,0.78,46.742525
75
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.1109317125,10.33,TauBench-Retail,1,gpt52,gpt-5.2,100,0.73,11.09317125
76
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15649692,11.25,TauBench-Retail,1,gemini,gemini-3-pro,100,0.82,15.649692
77
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.4488646,18.71,TauBench-Telecom,1,claude,claude-opus-4.5,100,0.76,244.88646
78
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.0980149625,9.36,TauBench-Telecom,1,gpt52,gpt-5.2,100,0.55,9.80149625
79
- claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21064224,9.9,TauBench-Telecom,1,gemini,gemini-3-pro,100,0.6851851852,21.064224
80
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.2459395,17.15,TauBench-Telecom,1,claude,claude-opus-4.5,100,0.84,124.59395
81
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.147175455,9.92,TauBench-Telecom,1,gpt52,gpt-5.2,100,0.53,14.7175455
82
- openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.53513152,10.82,TauBench-Telecom,0.89,gemini,gemini-3-pro,100,0.8876404494,53.513152
83
- smolagent,smolagents,Smolagent,smolagents_1.24.0,1.0643879,13.77,TauBench-Telecom,1,claude,claude-opus-4.5,100,0.58,106.43879
84
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.3035534775,10.11,TauBench-Telecom,1,gpt52,gpt-5.2,100,0.71,30.35534775
85
- smolagent,smolagents,Smolagent,smolagents_1.24.0,0.3461699,12.71,TauBench-Telecom,1,gemini,gemini-3-pro,100,0.88,34.61699
86
- litellm,litellm-react,React,exgentic_0.1.0,0.91578125,17.22,TauBench-Telecom,1,claude,claude-opus-4.5,100,0.76,91.578125
87
- litellm,litellm-react,React,exgentic_0.1.0,0.1408951075,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.5353535354,14.08951075
88
- litellm,litellm-react,React,exgentic_0.1.0,0.29797836,14.84,TauBench-Telecom,1,gemini,gemini-3-pro,100,0.73,29.797836
89
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.91578125,17.22,TauBench-Telecom,1,claude,claude-opus-4.5,100,0.76,91.578125
90
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.1408951075,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.5353535354,14.08951075
91
- litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.29797836,14.84,TauBench-Telecom,1,gemini,gemini-3-pro,100,0.73,29.797836
 
 
 
 
 
 
 
 
 
 
1
  agent,agent_normalized,visible_agent_name,agent_version,avg_cost,avg_steps,benchmark,finished_pct,model,model_normalized,num_tasks,score,total_cost
2
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.5026066430000001,18.94,AppWorld,0.46,deepseek,deepseek-v3.2,100,0.03,50.26066430000001
3
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.10630119119999998,20.82,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.48,10.630119119999998
4
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.20144125259999995,54.16,SWE-bench,0.89,deepseek,deepseek-v3.2,100,0.64,20.144125259999996
5
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.028655971600000007,12.06,TauBench-Airline,0.98,deepseek,deepseek-v3.2,50,0.28,1.4327985800000003
6
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.043596285400000004,12.51,TauBench-Retail,0.99,deepseek,deepseek-v3.2,100,0.65,4.35962854
7
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.07225669540000003,20.03,TauBench-Telecom,0.97,deepseek,deepseek-v3.2,100,0.61,7.225669540000003
8
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.8930315160000002,36.62,AppWorld,0.78,kimi,kimi-k2.5,100,0.08,89.30315160000002
9
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.7320507719999999,25.02,BrowseComp+,0.86,kimi,kimi-k2.5,100,0.56,73.20507719999999
10
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.7822629151515151,38.484848484848484,SWE-bench,0.7474747474747475,kimi,kimi-k2.5,100,0.5204081632653061,77.4440286
11
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.032654508000000006,2.58,TauBench-Airline,0.9,kimi,kimi-k2.5,50,0.12,1.6327254000000002
12
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.018570114000000006,0.56,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.03,1.8570114000000004
13
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.005500535999999998,0.0,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.0,0.5500535999999998
14
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
15
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.4293807000000001,8.97,BrowseComp+,1.0,gpt52,gpt-5.2,100,0.43,42.93807000000001
16
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.9397547250000001,23.99,SWE-bench,1.0,gpt52,gpt-5.2,100,0.58,93.97547250000001
17
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21338296,10.18,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.48,10.669148
18
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.1200643675,9.92,TauBench-Retail,0.98,gpt52,gpt-5.2,100,0.64,12.00643675
19
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.0980149625,9.36,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.55,9.80149625
20
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,13.08379215,49.69,AppWorld,0.74,claude,claude-opus-4.5,100,0.66,1308.3792150000002
21
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,11.658796372549022,31.03921568627451,BrowseComp+,0.8431372549019608,claude,claude-opus-4.5,51,0.5294117647058824,594.5986150000001
22
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,5.604360979381441,31.762886597938145,SWE-bench,1.0,claude,claude-opus-4.5,97,0.7422680412371134,543.6230149999998
23
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.2993549000000004,11.5,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,64.96774500000002
24
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,1.5985232500000006,12.54,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.83,159.85232500000006
25
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.448864600000001,18.71,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,244.8864600000001
26
  claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.10550762,38.01,AppWorld,0.86,gemini,gemini-3-pro,100,0.36,310.550762
27
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,2.8452944199999997,22.88,BrowseComp+,0.7,gemini,gemini-3-pro,100,0.51,284.52944199999996
28
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,3.679668759999999,43.72,SWE-bench,1.0,gemini,gemini-3-pro,100,0.67,367.9668759999999
29
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.3394426000000001,12.62,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,16.972130000000003
30
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.18525811999999994,11.18,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.71,18.525811999999995
31
+ claudecode,claude-code,Claude_Code,claude_code_2.1.7,0.21064223999999993,9.9,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.71,21.064223999999992
32
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3050881588000001,15.63,AppWorld,0.37,deepseek,deepseek-v3.2,100,0.06,30.50881588000001
33
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.09532471040000004,13.13,BrowseComp+,0.62,deepseek,deepseek-v3.2,100,0.3,9.532471040000004
34
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.11237704080000001,41.37,SWE-bench,0.38,deepseek,deepseek-v3.2,100,0.7368421052631579,11.23770408
35
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.016994613600000003,6.72,TauBench-Airline,0.94,deepseek,deepseek-v3.2,50,0.2,0.8497306800000002
36
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.011356187600000003,4.82,TauBench-Retail,0.97,deepseek,deepseek-v3.2,100,0.19,1.1356187600000003
37
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.023369379999999995,7.0,TauBench-Telecom,0.97,deepseek,deepseek-v3.2,100,0.18,2.3369379999999995
38
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3024222959999999,19.59,AppWorld,0.48,kimi,kimi-k2.5,100,0.08,30.242229599999995
39
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.25144734599999996,17.61,BrowseComp+,0.53,kimi,kimi-k2.5,100,0.35,25.144734599999996
40
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3341831760000001,38.74,SWE-bench,0.78,kimi,kimi-k2.5,100,0.5670103092783505,33.41831760000001
41
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.004579271999999998,0.5,TauBench-Airline,0.88,kimi,kimi-k2.5,50,0.0,0.2289635999999999
42
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0042811740000000004,0.41,TauBench-Retail,0.96,kimi,kimi-k2.5,100,0.01,0.42811740000000004
43
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0046396319999999994,0.0,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.0,0.46396319999999996
44
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
45
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.3804372600000001,14.27,BrowseComp+,1.0,gpt52,gpt-5.2,100,0.48,38.043726000000014
46
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.259017835858586,20.444444444444443,SWE-bench,1.0,gpt52,gpt-5.2,99,0.5454545454545454,25.642765750000013
47
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.10711806000000001,11.4,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.5,5.3559030000000005
48
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.10778330499999997,9.55,TauBench-Retail,0.99,gpt52,gpt-5.2,100,0.53,10.778330499999997
49
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.14717545500000007,9.92,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.53,14.717545500000005
50
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,22.764767600000006,47.65,AppWorld,0.77,claude,claude-opus-4.5,100,0.68,2276.4767600000005
51
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,7.592457349999998,27.18,BrowseComp+,1.0,claude,claude-opus-4.5,100,0.61,759.2457349999999
52
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,2.9612612048192783,34.096385542168676,SWE-bench,1.0,claude,claude-opus-4.5,83,0.8072289156626506,245.7846800000001
53
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.7165842000000003,12.22,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.74,35.82921000000002
54
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.5495812000000001,12.54,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.85,54.958120000000015
55
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.2459395,17.15,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.84,124.59395
56
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,8.695528419999997,33.49,AppWorld,0.98,gemini,gemini-3-pro,100,0.57,869.5528419999997
57
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.6433071111111112,8.454545454545455,BrowseComp+,0.6060606060606061,gemini,gemini-3-pro,99,0.3333333333333333,63.687404
58
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,1.5791779787234044,32.361702127659576,SWE-bench,1.0,gemini,gemini-3-pro,94,0.723404255319149,148.44273
59
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.21380211999999996,10.9,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.62,10.690105999999998
60
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.26648499999999986,10.62,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.73,26.648499999999988
61
+ openaimcp,openai-mcp,OpenAI_Solo,openai_sdk_0.7.0,0.5351315199999997,10.82,TauBench-Telecom,0.89,gemini,gemini-3-pro,100,0.79,53.51315199999998
62
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2420605992000001,54.69,AppWorld,0.81,deepseek,deepseek-v3.2,100,0.13,24.20605992000001
63
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.07768438359999996,12.01,BrowseComp+,1.0,deepseek,deepseek-v3.2,100,0.21,7.768438359999997
64
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.5519223772,69.43,SWE-bench,0.98,deepseek,deepseek-v3.2,100,0.56,55.19223772000001
65
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.055420856,18.94,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.6,2.7710428
66
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.05129110320000001,14.06,TauBench-Retail,0.98,deepseek,deepseek-v3.2,100,0.77,5.1291103200000006
67
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.06806756960000002,18.3,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.84,6.806756960000001
68
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.6740286120000001,50.63,AppWorld,0.91,kimi,kimi-k2.5,100,0.11,67.4028612
69
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.4948784999999996,29.79,BrowseComp+,0.95,kimi,kimi-k2.5,100,0.33,49.48784999999996
70
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,1.3741787032258057,65.91397849462365,SWE-bench,0.8709677419354839,kimi,kimi-k2.5,100,0.5760869565217391,127.79861939999994
71
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.09928738799999998,13.0,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.56,4.964369399999999
72
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.06753205714285712,12.887755102040817,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.7244897959183674,6.618141599999998
73
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.15341341818181825,19.141414141414142,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.7070707070707071,15.187928400000008
74
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.5503114225,51.59,AppWorld,0.61,gpt52,gpt-5.2,100,0.07,55.03114225
75
  smolagent,smolagents,Smolagent,smolagents_1.24.0,0.17156755,6.57,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.26,17.156755
76
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.45031795454545454,19.97979797979798,SWE-bench,1.0,gpt52,gpt-5.2,99,0.5252525252525253,44.5814775
77
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2932717550000001,10.68,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.6,14.663587750000003
78
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.25194685249999993,11.08,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.68,25.194685249999992
79
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.30355347749999995,10.11,TauBench-Telecom,1.0,gpt52,gpt-5.2,100,0.71,30.355347749999993
80
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,5.585109100000002,41.07,AppWorld,0.82,claude,claude-opus-4.5,100,0.7,558.5109100000002
81
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,6.303725049999997,24.16,BrowseComp+,1.0,claude,claude-opus-4.5,100,0.61,630.3725049999997
82
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,4.8521832499999995,39.13,SWE-bench,1.0,claude,claude-opus-4.5,100,0.65,485.218325
83
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.7801527999999999,11.88,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.72,39.007639999999995
84
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.6711861,11.71,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,67.11861
85
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,1.0643879000000003,13.77,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.58,106.43879000000003
86
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,2.542546020000001,49.13,AppWorld,0.71,gemini,gemini-3-pro,100,0.13,254.2546020000001
87
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,2.388851260000001,29.63,BrowseComp+,0.69,gemini,gemini-3-pro,100,0.57,238.88512600000007
88
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,2.209661838383839,38.101010101010104,SWE-bench,1.0,gemini,gemini-3-pro,99,0.7575757575757576,218.75652200000005
89
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.19590487999999995,12.28,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.68,9.795243999999997
90
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.2056864,11.3,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.75,20.56864
91
+ smolagent,smolagents,Smolagent,smolagents_1.24.0,0.3461698999999998,12.71,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.88,34.61698999999998
92
+ litellm,litellm-react,React,exgentic_0.1.0,0.8332616264000001,31.69,AppWorld,0.88,deepseek,deepseek-v3.2,100,0.09,83.32616264
93
+ litellm,litellm-react,React,exgentic_0.1.0,0.1679167768,34.19,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.36,16.79167768
94
+ litellm,litellm-react,React,exgentic_0.1.0,0.47889091680000007,77.44,SWE-bench,0.94,deepseek,deepseek-v3.2,100,0.6875,47.88909168000001
95
+ litellm,litellm-react,React,exgentic_0.1.0,0.033952516,15.28,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.56,1.6976258
96
+ litellm,litellm-react,React,exgentic_0.1.0,0.029598555599999986,14.44,TauBench-Retail,1.0,deepseek,deepseek-v3.2,100,0.82,2.9598555599999985
97
+ litellm,litellm-react,React,exgentic_0.1.0,0.04940463720000001,18.11,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.71,4.940463720000001
98
+ litellm,litellm-react,React,exgentic_0.1.0,0.8925547499999997,25.05,AppWorld,0.9,kimi,kimi-k2.5,100,0.09,89.25547499999998
99
+ litellm,litellm-react,React,exgentic_0.1.0,0.3791127780000001,26.27,BrowseComp+,0.74,kimi,kimi-k2.5,100,0.34,37.91127780000001
100
+ litellm,litellm-react,React,exgentic_0.1.0,0.590102075510204,48.765306122448976,SWE-bench,0.8775510204081632,kimi,kimi-k2.5,100,0.5714285714285714,57.830003399999995
101
+ litellm,litellm-react,React,exgentic_0.1.0,0.03922161599999999,9.6,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.62,1.9610807999999995
102
+ litellm,litellm-react,React,exgentic_0.1.0,0.04560821212121211,12.313131313131313,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.6464646464646465,4.515212999999999
103
+ litellm,litellm-react,React,exgentic_0.1.0,0.092763078,16.14,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.83,9.2763078
104
+ litellm,litellm-react,React,exgentic_0.1.0,0.0,0.0,AppWorld,0.0,gpt52,gpt-5.2,100,0.0,0.0
105
+ litellm,litellm-react,React,exgentic_0.1.0,0.2961726250000001,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.617262500000006
106
+ litellm,litellm-react,React,exgentic_0.1.0,0.247620555,20.47,SWE-bench,1.0,gpt52,gpt-5.2,100,0.57,24.7620555
107
+ litellm,litellm-react,React,exgentic_0.1.0,0.12510473500000002,11.22,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.54,6.255236750000001
108
+ litellm,litellm-react,React,exgentic_0.1.0,0.11093171249999992,10.33,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.73,11.093171249999992
109
+ litellm,litellm-react,React,exgentic_0.1.0,0.14089510749999992,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.53,14.089510749999993
110
+ litellm,litellm-react,React,exgentic_0.1.0,11.324659950000004,21.99,AppWorld,0.83,claude,claude-opus-4.5,100,0.61,1132.4659950000005
111
+ litellm,litellm-react,React,exgentic_0.1.0,7.093632849999999,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.3632849999999
112
+ litellm,litellm-react,React,exgentic_0.1.0,3.971294949494951,43.44444444444444,SWE-bench,1.0,claude,claude-opus-4.5,99,0.6060606060606061,393.15820000000014
113
+ litellm,litellm-react,React,exgentic_0.1.0,0.46923760000000003,10.0,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,23.46188
114
+ litellm,litellm-react,React,exgentic_0.1.0,0.46742524999999985,11.33,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,46.742524999999986
115
+ litellm,litellm-react,React,exgentic_0.1.0,0.9157812500000004,17.22,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,91.57812500000004
116
+ litellm,litellm-react,React,exgentic_0.1.0,1.8818770200000003,21.76,AppWorld,0.99,gemini,gemini-3-pro,100,0.5,188.18770200000003
117
+ litellm,litellm-react,React,exgentic_0.1.0,0.4394335600000001,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.94335600000001
118
+ litellm,litellm-react,React,exgentic_0.1.0,0.6956134199999998,32.55,SWE-bench,1.0,gemini,gemini-3-pro,100,0.71,69.56134199999998
119
+ litellm,litellm-react,React,exgentic_0.1.0,0.15854967999999997,10.14,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,7.927483999999998
120
+ litellm,litellm-react,React,exgentic_0.1.0,0.15649691999999996,11.25,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.82,15.649691999999995
121
+ litellm,litellm-react,React,exgentic_0.1.0,0.29797836000000005,14.84,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.73,29.797836000000004
122
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.23222570840000004,28.51,AppWorld,0.52,deepseek,deepseek-v3.2,100,0.04,23.222570840000003
123
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.1679167768,34.19,BrowseComp+,0.98,deepseek,deepseek-v3.2,100,0.36,16.79167768
124
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.47889091680000007,77.44,SWE-bench,0.94,deepseek,deepseek-v3.2,100,0.6875,47.88909168000001
125
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.033952516,15.28,TauBench-Airline,1.0,deepseek,deepseek-v3.2,50,0.56,1.6976258
126
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.029598555599999986,14.44,TauBench-Retail,1.0,deepseek,deepseek-v3.2,100,0.82,2.9598555599999985
127
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.04940463720000001,18.11,TauBench-Telecom,1.0,deepseek,deepseek-v3.2,100,0.71,4.940463720000001
128
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.6031619339999997,23.53,AppWorld,0.89,kimi,kimi-k2.5,100,0.1,60.316193399999975
129
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.3791127780000001,26.27,BrowseComp+,0.74,kimi,kimi-k2.5,100,0.34,37.91127780000001
130
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.590102075510204,48.765306122448976,SWE-bench,0.8775510204081632,kimi,kimi-k2.5,100,0.5714285714285714,57.830003399999995
131
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.03922161599999999,9.6,TauBench-Airline,1.0,kimi,kimi-k2.5,50,0.62,1.9610807999999995
132
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.04560821212121211,12.313131313131313,TauBench-Retail,1.0,kimi,kimi-k2.5,100,0.6464646464646465,4.515212999999999
133
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.092763078,16.14,TauBench-Telecom,1.0,kimi,kimi-k2.5,100,0.83,9.2763078
134
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.36371315749999994,10.05,AppWorld,1.0,gpt52,gpt-5.2,100,0.22,36.371315749999994
135
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.2961726250000001,8.14,BrowseComp+,0.99,gpt52,gpt-5.2,100,0.46,29.617262500000006
136
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.247620555,20.47,SWE-bench,1.0,gpt52,gpt-5.2,100,0.57,24.7620555
137
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.12510473500000002,11.22,TauBench-Airline,1.0,gpt52,gpt-5.2,50,0.54,6.255236750000001
138
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.11093171249999992,10.33,TauBench-Retail,1.0,gpt52,gpt-5.2,100,0.73,11.093171249999992
139
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.14089510749999992,10.18,TauBench-Telecom,0.99,gpt52,gpt-5.2,100,0.53,14.089510749999993
140
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.4332017499999994,20.06,AppWorld,0.82,claude,claude-opus-4.5,100,0.64,343.32017499999995
141
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,7.093632849999999,21.66,BrowseComp+,0.93,claude,claude-opus-4.5,100,0.49,709.3632849999999
142
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,3.971294949494951,43.44444444444444,SWE-bench,1.0,claude,claude-opus-4.5,99,0.6060606060606061,393.15820000000014
143
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.46923760000000003,10.0,TauBench-Airline,1.0,claude,claude-opus-4.5,50,0.66,23.46188
144
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.46742524999999985,11.33,TauBench-Retail,1.0,claude,claude-opus-4.5,100,0.78,46.742524999999986
145
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.9157812500000004,17.22,TauBench-Telecom,1.0,claude,claude-opus-4.5,100,0.76,91.57812500000004
146
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,1.30485292,22.59,AppWorld,1.0,gemini,gemini-3-pro,100,0.55,130.48529200000002
147
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.4394335600000001,7.85,BrowseComp+,0.99,gemini,gemini-3-pro,100,0.48,43.94335600000001
148
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.6956134199999998,32.55,SWE-bench,1.0,gemini,gemini-3-pro,100,0.71,69.56134199999998
149
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15854967999999997,10.14,TauBench-Airline,1.0,gemini,gemini-3-pro,50,0.7,7.927483999999998
150
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.15649691999999996,11.25,TauBench-Retail,1.0,gemini,gemini-3-pro,100,0.82,15.649691999999995
151
+ litellm-shortlist,litellm-shortlist,React_+_Shortlisting,exgentic_0.1.0_�_litellm_1.79.1,0.29797836000000005,14.84,TauBench-Telecom,1.0,gemini,gemini-3-pro,100,0.73,29.797836000000004
results.csv.timestamp CHANGED
@@ -1 +1 @@
1
- Mon Mar 30 14:50:35 UTC 2026
 
1
+ Wednesday, May 13, 2026 12:25:52 PM
scripts/leaderboard.js CHANGED
@@ -1,8 +1,8 @@
1
  // ===== 9. LEADERBOARD =====
2
  const BENCHMARKS=['AppWorld','BrowseComp+','SWE-bench','TauBench-Airline','TauBench-Retail','TauBench-Telecom'];
3
  const BENCH_SHORT={'AppWorld':'App','BrowseComp+':'Browse','SWE-bench':'SWE','TauBench-Airline':'Tau-Air','TauBench-Retail':'Tau-Ret','TauBench-Telecom':'Tau-Tel'};
4
- const MODEL_DISPLAY={'claude-opus-4.5':'Claude Opus 4.5','gpt-5.2':'GPT 5.2','gemini-3-pro':'Gemini Pro 3'};
5
- const MODEL_URLS={'claude-opus-4.5':'https://www.anthropic.com/claude','gpt-5.2':'https://openai.com/','gemini-3-pro':'https://deepmind.google/technologies/gemini/'};
6
  const AGENT_DISPLAY={'Claude_Code':'Claude Code','OpenAI_Solo':'OpenAI Solo','Smolagent':'Smolagent','React':'React','React_+_Shortlisting':'React + Shortlist'};
7
  const AGENT_URLS={'Claude_Code':'https://github.com/anthropics/claude-code','OpenAI_Solo':'https://github.com/openai/openai-agents-python','Smolagent':'https://github.com/huggingface/smolagents','React':'https://github.com/BerriAI/litellm','React_+_Shortlisting':'https://github.com/BerriAI/litellm'};
8
 
@@ -27,10 +27,12 @@ function processData(rows,modelFilter){
27
  const BENCH_WEIGHT={};
28
  BENCHMARKS.forEach(b=>{BENCH_WEIGHT[b]=b.startsWith('TauBench')?1/12:1/4});
29
  return Object.values(groups).map(g=>{
 
 
30
  const bs=BENCHMARKS.map(b=>g.benchmarks[b]||0);
31
- let wSum=0,wTotal=0;
32
- BENCHMARKS.forEach((b,i)=>{if(bs[i]>0){wSum+=bs[i]*BENCH_WEIGHT[b];wTotal+=BENCH_WEIGHT[b]}});
33
- const avg=wTotal?wSum/wTotal:0;
34
  const cs=Object.values(g.costs).filter(c=>c>0);
35
  const avgCost=cs.length?cs.reduce((a,b)=>a+b,0)/cs.length:0;
36
  return{...g,avg,avgCost,benchScores:bs};
@@ -81,8 +83,7 @@ function renderTable(data){
81
  html+=`<td class="score-cell ${scoreClass(row.avg)}"><div class="bar bar-cyan" style="width:${row.avg*100}%"></div><span class="val">${fmtPct(row.avg)}</span></td>`;
82
  html+=`<td class="cost-cell">$${row.avgCost.toFixed(2)}</td>`;
83
  row.benchScores.forEach(s=>{
84
- const barW=s>0?s*100:0;
85
- html+=`<td class="score-cell ${scoreClass(s)}"><div class="bar bar-purple" style="width:${barW}%"></div><span class="val">${s>0?fmtPct(s):'&mdash;'}</span></td>`;
86
  });
87
  html+=`</tr>`;return html;
88
  }).join('');
@@ -98,6 +99,25 @@ function renderTable(data){
98
  renderTable(data);
99
  });
100
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  }
102
 
103
  // ===== CHART =====
 
1
  // ===== 9. LEADERBOARD =====
2
  const BENCHMARKS=['AppWorld','BrowseComp+','SWE-bench','TauBench-Airline','TauBench-Retail','TauBench-Telecom'];
3
  const BENCH_SHORT={'AppWorld':'App','BrowseComp+':'Browse','SWE-bench':'SWE','TauBench-Airline':'Tau-Air','TauBench-Retail':'Tau-Ret','TauBench-Telecom':'Tau-Tel'};
4
+ const MODEL_DISPLAY={'claude-opus-4.5':'Claude Opus 4.5','gpt-5.2':'GPT 5.2','gemini-3-pro':'Gemini Pro 3','deepseek-v3.2':'DeepSeek V3.2','kimi-k2.5':'Kimi K2.5'};
5
+ const MODEL_URLS={'claude-opus-4.5':'https://www.anthropic.com/claude','gpt-5.2':'https://openai.com/','gemini-3-pro':'https://deepmind.google/technologies/gemini/','deepseek-v3.2':'https://www.deepseek.com/','kimi-k2.5':'https://www.moonshot.ai/'};
6
  const AGENT_DISPLAY={'Claude_Code':'Claude Code','OpenAI_Solo':'OpenAI Solo','Smolagent':'Smolagent','React':'React','React_+_Shortlisting':'React + Shortlist'};
7
  const AGENT_URLS={'Claude_Code':'https://github.com/anthropics/claude-code','OpenAI_Solo':'https://github.com/openai/openai-agents-python','Smolagent':'https://github.com/huggingface/smolagents','React':'https://github.com/BerriAI/litellm','React_+_Shortlisting':'https://github.com/BerriAI/litellm'};
8
 
 
27
  const BENCH_WEIGHT={};
28
  BENCHMARKS.forEach(b=>{BENCH_WEIGHT[b]=b.startsWith('TauBench')?1/12:1/4});
29
  return Object.values(groups).map(g=>{
30
+ // Every (model, agent, benchmark) cell is populated; zero is a real score
31
+ // (TauBench protocol failures, AppWorld tool-limit failures). Include all six.
32
  const bs=BENCHMARKS.map(b=>g.benchmarks[b]||0);
33
+ let wSum=0;
34
+ BENCHMARKS.forEach((b,i)=>{wSum+=bs[i]*BENCH_WEIGHT[b]});
35
+ const avg=wSum; // weights sum to 1
36
  const cs=Object.values(g.costs).filter(c=>c>0);
37
  const avgCost=cs.length?cs.reduce((a,b)=>a+b,0)/cs.length:0;
38
  return{...g,avg,avgCost,benchScores:bs};
 
83
  html+=`<td class="score-cell ${scoreClass(row.avg)}"><div class="bar bar-cyan" style="width:${row.avg*100}%"></div><span class="val">${fmtPct(row.avg)}</span></td>`;
84
  html+=`<td class="cost-cell">$${row.avgCost.toFixed(2)}</td>`;
85
  row.benchScores.forEach(s=>{
86
+ html+=`<td class="score-cell ${scoreClass(s)}"><div class="bar bar-purple" style="width:${s*100}%"></div><span class="val">${fmtPct(s)}</span></td>`;
 
87
  });
88
  html+=`</tr>`;return html;
89
  }).join('');
 
99
  renderTable(data);
100
  });
101
  });
102
+
103
+ // Remove any legacy "See all" button if it exists from a previous render
104
+ const oldBtn=document.getElementById('lbSeeAll');
105
+ if(oldBtn)oldBtn.remove();
106
+
107
+ // Toggle bottom-fade hint based on whether the table can still scroll down
108
+ const tableWrap=document.querySelector('.table-wrap');
109
+ const updateFade=()=>{
110
+ const atBottom=tableWrap.scrollTop+tableWrap.clientHeight>=tableWrap.scrollHeight-2;
111
+ const overflows=tableWrap.scrollHeight>tableWrap.clientHeight+2;
112
+ tableWrap.classList.toggle('lb-can-scroll',overflows&&!atBottom);
113
+ };
114
+ if(!tableWrap.dataset.scrollBound){
115
+ tableWrap.addEventListener('scroll',updateFade,{passive:true});
116
+ window.addEventListener('resize',updateFade);
117
+ tableWrap.dataset.scrollBound='1';
118
+ }
119
+ // Defer to next frame so layout is settled
120
+ requestAnimationFrame(updateFade);
121
  }
122
 
123
  // ===== CHART =====
scripts/pareto.js DELETED
@@ -1,386 +0,0 @@
1
- // SVG Pareto Chart — reads processed data from leaderboard.js via window._chartData
2
- (() => {
3
- const MODEL_DISPLAY = {
4
- 'claude-opus-4.5': 'Claude Opus 4.5',
5
- 'gpt-5.2': 'GPT 5.2',
6
- 'gemini-3-pro': 'Gemini Pro 3'
7
- };
8
- const MODEL_COLORS = {
9
- 'claude-opus-4.5': '#5ba8a0',
10
- 'gpt-5.2': '#9b8ec4',
11
- 'gemini-3-pro': '#6a8cbe'
12
- };
13
- const AGENT_DISPLAY = {
14
- 'Claude_Code': 'Claude Code',
15
- 'OpenAI_Solo': 'OpenAI Solo',
16
- 'Smolagent': 'Smolagent',
17
- 'React': 'React',
18
- 'React_+_Shortlisting': 'React + Shortlist'
19
- };
20
-
21
- const ns = 'http://www.w3.org/2000/svg';
22
-
23
- function el(tag, attrs) {
24
- const e = document.createElementNS(ns, tag);
25
- if (attrs) for (const [k, v] of Object.entries(attrs)) e.setAttribute(k, v);
26
- return e;
27
- }
28
-
29
- function makeShape(agent, x, y, color) {
30
- const g = el('g');
31
- const common = { fill: color, 'fill-opacity': '0.82', stroke: 'rgba(128,128,128,0.3)', 'stroke-width': '1.5' };
32
- let shape;
33
-
34
- if (agent === 'Claude_Code') {
35
- const half = 9;
36
- const pts = [];
37
- for (let i = 0; i < 10; i++) {
38
- const r = i % 2 === 0 ? half : half * 0.42;
39
- const a = i * Math.PI / 5 - Math.PI / 2;
40
- pts.push((x + r * Math.cos(a)).toFixed(1) + ',' + (y + r * Math.sin(a)).toFixed(1));
41
- }
42
- shape = el('polygon', { points: pts.join(' '), ...common });
43
- } else if (agent === 'OpenAI_Solo') {
44
- const half = 8;
45
- shape = el('polygon', { points: `${x},${y - half} ${x + half},${y} ${x},${y + half} ${x - half},${y}`, ...common });
46
- } else if (agent === 'Smolagent') {
47
- const half = 8.5;
48
- shape = el('polygon', { points: `${x},${y - half} ${x + half},${y + half} ${x - half},${y + half}`, ...common });
49
- } else if (agent === 'React_+_Shortlisting') {
50
- const half = 6.5;
51
- shape = el('rect', { x: x - half, y: y - half, width: half * 2, height: half * 2, rx: 2, ...common });
52
- } else {
53
- const half = 7;
54
- shape = el('circle', { cx: x, cy: y, r: half, ...common });
55
- }
56
-
57
- g.appendChild(shape);
58
- // Invisible larger hit area for hover
59
- g.appendChild(el('circle', { cx: x, cy: y, r: 20, fill: 'transparent', cursor: 'pointer' }));
60
- return g;
61
- }
62
-
63
- function makeLegendShapeSVG(agent) {
64
- const svg = document.createElementNS(ns, 'svg');
65
- svg.setAttribute('width', '14');
66
- svg.setAttribute('height', '14');
67
- svg.setAttribute('viewBox', '0 0 14 14');
68
- const common = { fill: 'var(--text-muted)', 'fill-opacity': '0.5', stroke: 'var(--text-muted)', 'stroke-width': '0.8' };
69
- let shape;
70
- if (agent === 'Claude_Code') {
71
- const pts = [];
72
- for (let i = 0; i < 10; i++) {
73
- const r = i % 2 === 0 ? 6 : 6 * 0.42;
74
- const a = i * Math.PI / 5 - Math.PI / 2;
75
- pts.push((7 + r * Math.cos(a)).toFixed(1) + ',' + (7 + r * Math.sin(a)).toFixed(1));
76
- }
77
- shape = el('polygon', { points: pts.join(' '), ...common });
78
- } else if (agent === 'OpenAI_Solo') {
79
- shape = el('polygon', { points: '7,1 11,7 7,13 3,7', ...common });
80
- } else if (agent === 'Smolagent') {
81
- shape = el('polygon', { points: '7,2 12,12 2,12', ...common });
82
- } else if (agent === 'React_+_Shortlisting') {
83
- shape = el('rect', { x: 2, y: 2, width: 10, height: 10, rx: 1.5, ...common });
84
- } else {
85
- shape = el('circle', { cx: 7, cy: 7, r: 5, ...common });
86
- }
87
- svg.appendChild(shape);
88
- return svg;
89
- }
90
-
91
- function renderParetoChart(data) {
92
- const container = document.getElementById('paretoChartContainer');
93
- if (!container) return;
94
- container.innerHTML = '';
95
-
96
- // Build chart data from processed leaderboard data
97
- const points = data.map(d => ({
98
- agent: d.agent,
99
- model: d.model,
100
- cost: d.avgCost,
101
- success: d.avg
102
- })).filter(d => d.cost > 0 && d.success > 0);
103
-
104
- if (points.length === 0) return;
105
-
106
- // Determine unique agents and models
107
- const agents = [...new Set(points.map(p => p.agent))];
108
- const models = [...new Set(points.map(p => p.model))];
109
-
110
- // Determine axis ranges with padding
111
- const costs = points.map(p => p.cost);
112
- const successes = points.map(p => p.success);
113
- const xMax = Math.ceil(Math.max(...costs) + 0.5);
114
- const xMin = 0;
115
- const yMinRaw = Math.min(...successes);
116
- const yMaxRaw = Math.max(...successes);
117
- const yMin = Math.floor(yMinRaw * 10) / 10 - 0.05;
118
- const yMax = Math.ceil(yMaxRaw * 10) / 10 + 0.05;
119
-
120
- // SVG dimensions
121
- const W = 740, H = 420;
122
- const ml = 62, mr = 20, mt = 14, mb = 52;
123
- const pw = W - ml - mr, ph = H - mt - mb;
124
-
125
- const sx = c => ml + ((c - xMin) / (xMax - xMin)) * pw;
126
- const sy = s => mt + (1 - ((s - yMin) / (yMax - yMin))) * ph;
127
-
128
- // Pareto frontier
129
- const sorted = [...points].sort((a, b) => a.cost - b.cost);
130
- const pareto = [];
131
- let bestS = -Infinity;
132
- for (const p of sorted) {
133
- if (p.success > bestS) { pareto.push(p); bestS = p.success; }
134
- }
135
-
136
- // Build DOM
137
- const root = document.createElement('div');
138
- root.className = 'pareto-embed';
139
-
140
- const chartWrap = document.createElement('div');
141
- chartWrap.className = 'chart-wrap';
142
-
143
- const svg = document.createElementNS(ns, 'svg');
144
- svg.setAttribute('class', 'chart');
145
- svg.setAttribute('viewBox', `0 0 ${W} ${H}`);
146
- svg.setAttribute('aria-label', 'Pareto frontier: cost vs success rate for agent configurations');
147
-
148
- // Gradient for pareto area
149
- const defs = el('defs');
150
- const grad = el('linearGradient', { id: 'pareto-fade-main', x1: '0', y1: '0', x2: '0', y2: '1' });
151
- const stop1 = el('stop', { offset: '0%', 'stop-color': '#5ba8a0', 'stop-opacity': '0.08' });
152
- const stop2 = el('stop', { offset: '100%', 'stop-color': '#5ba8a0', 'stop-opacity': '0' });
153
- grad.appendChild(stop1);
154
- grad.appendChild(stop2);
155
- defs.appendChild(grad);
156
- svg.appendChild(defs);
157
-
158
- // Grid lines and ticks
159
- const xStep = xMax <= 10 ? 1 : 2;
160
- const xTicks = [];
161
- for (let x = 0; x <= xMax; x += xStep) xTicks.push(x);
162
- const yTicks = [];
163
- for (let y = Math.ceil(yMin * 10) / 10; y <= yMax; y = Math.round((y + 0.1) * 10) / 10) yTicks.push(y);
164
-
165
- for (const x of xTicks) {
166
- const px = sx(x);
167
- svg.appendChild(el('line', { x1: px, y1: mt, x2: px, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.15', 'stroke-width': 1 }));
168
- const t = el('text', { x: px, y: mt + ph + 22, 'font-size': 11, fill: 'var(--text-muted)', 'text-anchor': 'middle', 'font-family': 'Inter, system-ui', 'font-weight': 400 });
169
- t.textContent = '$' + x;
170
- svg.appendChild(t);
171
- }
172
-
173
- for (const y of yTicks) {
174
- const py = sy(y);
175
- svg.appendChild(el('line', { x1: ml, y1: py, x2: ml + pw, y2: py, stroke: 'var(--text-muted)', 'stroke-opacity': '0.15', 'stroke-width': 1 }));
176
- const t = el('text', { x: ml - 10, y: py + 4, 'font-size': 11, fill: 'var(--text-muted)', 'text-anchor': 'end', 'font-family': 'Inter, system-ui', 'font-weight': 400 });
177
- t.textContent = Math.round(y * 100) + '%';
178
- svg.appendChild(t);
179
- }
180
-
181
- // Axes
182
- svg.appendChild(el('line', { x1: ml, y1: mt + ph, x2: ml + pw, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.25', 'stroke-width': 1 }));
183
- svg.appendChild(el('line', { x1: ml, y1: mt, x2: ml, y2: mt + ph, stroke: 'var(--text-muted)', 'stroke-opacity': '0.25', 'stroke-width': 1 }));
184
-
185
- // Axis labels
186
- const xLabel = el('text', { x: ml + pw / 2, y: H - 8, 'text-anchor': 'middle', 'font-size': 12, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-weight': 500 });
187
- xLabel.textContent = 'Average cost per task (USD)';
188
- svg.appendChild(xLabel);
189
-
190
- const yLabel = el('text', { x: 14, y: mt + ph / 2, 'text-anchor': 'middle', 'font-size': 12, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-weight': 500, transform: 'rotate(-90 14 ' + (mt + ph / 2) + ')' });
191
- yLabel.textContent = 'Success rate';
192
- svg.appendChild(yLabel);
193
-
194
- // Pareto area fill
195
- if (pareto.length > 1) {
196
- let areaD = 'M ' + sx(pareto[0].cost) + ' ' + sy(pareto[0].success);
197
- for (let i = 1; i < pareto.length; i++) areaD += ' L ' + sx(pareto[i].cost) + ' ' + sy(pareto[i].success);
198
- areaD += ' L ' + sx(pareto[pareto.length - 1].cost) + ' ' + (mt + ph);
199
- areaD += ' L ' + sx(pareto[0].cost) + ' ' + (mt + ph) + ' Z';
200
- svg.appendChild(el('path', { d: areaD, fill: 'url(#pareto-fade-main)' }));
201
- }
202
-
203
- // Pareto line
204
- const paretoD = pareto.map((p, i) => (i === 0 ? 'M' : 'L') + ' ' + sx(p.cost) + ' ' + sy(p.success)).join(' ');
205
- const paretoLine = el('path', { d: paretoD, fill: 'none', stroke: 'var(--text-muted)', 'stroke-opacity': '0.4', 'stroke-width': 1.6, 'stroke-dasharray': '6 4' });
206
- svg.appendChild(paretoLine);
207
-
208
- // Pareto label
209
- const lastPareto = pareto[pareto.length - 1];
210
- const paretoLabel = el('text', { x: sx(lastPareto.cost) + 8, y: sy(lastPareto.success) - 10, 'font-size': 10, fill: 'var(--text-muted)', 'font-family': 'Inter, system-ui', 'font-style': 'italic' });
211
- paretoLabel.textContent = 'Pareto frontier';
212
- svg.appendChild(paretoLabel);
213
-
214
- // Data points
215
- const pointEls = [];
216
- points.forEach((d, i) => {
217
- const x = sx(d.cost);
218
- const y = sy(d.success);
219
- const color = MODEL_COLORS[d.model] || '#7A7A7A';
220
- const g = makeShape(d.agent, x, y, color);
221
- g.classList.add('chart-point');
222
- g.dataset.agent = d.agent;
223
- g.dataset.model = d.model;
224
- g.dataset.idx = i;
225
- svg.appendChild(g);
226
- pointEls.push({ el: g, data: d, x, y });
227
- });
228
-
229
- chartWrap.appendChild(svg);
230
-
231
- // Tooltip
232
- const tooltip = document.createElement('div');
233
- tooltip.className = 'tooltip';
234
- chartWrap.appendChild(tooltip);
235
-
236
- root.appendChild(chartWrap);
237
-
238
- // Legend
239
- const legendRow = document.createElement('div');
240
- legendRow.className = 'legend-row';
241
-
242
- // Agents legend group
243
- const agentGroup = document.createElement('div');
244
- agentGroup.className = 'legend-group';
245
- const agentLabel = document.createElement('span');
246
- agentLabel.className = 'legend-group-label';
247
- agentLabel.textContent = 'Agents';
248
- agentGroup.appendChild(agentLabel);
249
-
250
- agents.forEach(agent => {
251
- const item = document.createElement('div');
252
- item.className = 'legend-item';
253
- item.dataset.filter = 'agent';
254
- item.dataset.value = agent;
255
- item.appendChild(makeLegendShapeSVG(agent));
256
- const span = document.createElement('span');
257
- span.textContent = AGENT_DISPLAY[agent] || agent.replace(/_/g, ' ');
258
- item.appendChild(span);
259
- agentGroup.appendChild(item);
260
- });
261
-
262
- // Models legend group
263
- const modelGroup = document.createElement('div');
264
- modelGroup.className = 'legend-group';
265
- const modelLabel = document.createElement('span');
266
- modelLabel.className = 'legend-group-label';
267
- modelLabel.textContent = 'Models';
268
- modelGroup.appendChild(modelLabel);
269
-
270
- models.forEach(model => {
271
- const item = document.createElement('div');
272
- item.className = 'legend-item';
273
- item.dataset.filter = 'model';
274
- item.dataset.value = model;
275
- const dot = document.createElement('span');
276
- dot.style.cssText = 'display:inline-block;width:11px;height:11px;border-radius:50%;background:' + (MODEL_COLORS[model] || '#666') + ';border:1px solid rgba(128,128,128,0.2);';
277
- item.appendChild(dot);
278
- const span = document.createElement('span');
279
- span.textContent = MODEL_DISPLAY[model] || model;
280
- item.appendChild(span);
281
- modelGroup.appendChild(item);
282
- });
283
-
284
- legendRow.appendChild(agentGroup);
285
- legendRow.appendChild(modelGroup);
286
- root.appendChild(legendRow);
287
-
288
- container.appendChild(root);
289
-
290
- // Interaction
291
- const activeFilters = { agent: null, model: null };
292
- const legendItems = root.querySelectorAll('.legend-item[data-filter]');
293
-
294
- function applyFilters() {
295
- const hasFilter = activeFilters.agent || activeFilters.model;
296
- legendItems.forEach(li => {
297
- const t = li.dataset.filter;
298
- const v = li.dataset.value;
299
- if (activeFilters[t] === v) {
300
- li.classList.add('active');
301
- li.classList.remove('dimmed');
302
- } else if (activeFilters[t] && activeFilters[t] !== v) {
303
- li.classList.remove('active');
304
- li.classList.add('dimmed');
305
- } else {
306
- li.classList.remove('active');
307
- li.classList.remove('dimmed');
308
- }
309
- });
310
-
311
- pointEls.forEach(({ el: g, data: d }) => {
312
- const matchAgent = !activeFilters.agent || d.agent === activeFilters.agent;
313
- const matchModel = !activeFilters.model || d.model === activeFilters.model;
314
- const visible = !hasFilter || (matchAgent && matchModel);
315
- g.style.opacity = visible ? '1' : '0.08';
316
- g.style.filter = 'none';
317
- });
318
-
319
- paretoLine.style.opacity = hasFilter ? '0.1' : '1';
320
- }
321
-
322
- function showTooltip(d, x, y) {
323
- pointEls.forEach(p => {
324
- if (p.data === d) {
325
- p.el.style.opacity = '1';
326
- p.el.style.filter = 'drop-shadow(0 2px 8px rgba(0,0,0,0.25))';
327
- } else {
328
- p.el.style.opacity = '0.15';
329
- p.el.style.filter = 'none';
330
- }
331
- });
332
- paretoLine.style.opacity = '0.15';
333
-
334
- const rect = chartWrap.getBoundingClientRect();
335
- const svgRect = svg.getBoundingClientRect();
336
- const scaleX = svgRect.width / W;
337
- const scaleY = svgRect.height / H;
338
- const tipX = x * scaleX + (svgRect.left - rect.left);
339
- const tipY = y * scaleY + (svgRect.top - rect.top);
340
- const successColor = MODEL_COLORS[d.model] || '#333';
341
- const agentName = AGENT_DISPLAY[d.agent] || d.agent.replace(/_/g, ' ');
342
- const modelName = MODEL_DISPLAY[d.model] || d.model;
343
-
344
- tooltip.innerHTML =
345
- '<div class="tooltip-agent">' + agentName + '</div>' +
346
- '<div class="tooltip-model">' + modelName + '</div>' +
347
- '<div class="tooltip-stats">' +
348
- '<div><div class="tooltip-stat-label">Success</div><div class="tooltip-stat-value" style="color:' + successColor + '">' + (d.success * 100).toFixed(0) + '%</div></div>' +
349
- '<div><div class="tooltip-stat-label">Cost/task</div><div class="tooltip-stat-value">$' + d.cost.toFixed(2) + '</div></div>' +
350
- '</div>';
351
-
352
- tooltip.classList.add('visible');
353
- const tipW = tooltip.offsetWidth;
354
- const tipH = tooltip.offsetHeight;
355
- let left = tipX + 18;
356
- let top = tipY - tipH / 2;
357
- if (left + tipW > rect.width - 8) left = tipX - tipW - 18;
358
- if (top < 4) top = 4;
359
- if (top + tipH > rect.height - 4) top = rect.height - tipH - 4;
360
- tooltip.style.left = left + 'px';
361
- tooltip.style.top = top + 'px';
362
- }
363
-
364
- function hideTooltip() {
365
- applyFilters();
366
- tooltip.classList.remove('visible');
367
- }
368
-
369
- pointEls.forEach(({ el: g, data: d, x, y }) => {
370
- g.addEventListener('mouseenter', () => showTooltip(d, x, y));
371
- g.addEventListener('mouseleave', hideTooltip);
372
- });
373
-
374
- legendItems.forEach(item => {
375
- item.addEventListener('click', () => {
376
- const type = item.dataset.filter;
377
- const value = item.dataset.value;
378
- activeFilters[type] = (activeFilters[type] === value) ? null : value;
379
- applyFilters();
380
- });
381
- });
382
- }
383
-
384
- // Expose globally so main.js can re-render on theme toggle
385
- window.renderParetoChart = renderParetoChart;
386
- })();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
styles.css CHANGED
@@ -404,7 +404,9 @@ section{position:relative;z-index:1;padding:80px 0}
404
  }
405
  .filter-select:hover,.filter-select:focus{border-color:var(--cyan);outline:none;box-shadow:0 0 12px var(--cyan-dim)}
406
 
407
- .table-wrap{overflow-x:auto;border-radius:12px;border:1px solid rgba(255,255,255,0.1);background:rgba(10,10,10,0.8);box-shadow:0 2px 16px rgba(0,0,0,0.3)}
 
 
408
  .lb-table{width:100%;border-collapse:collapse;font-size:.78rem}
409
  .lb-table thead{background:rgba(255,255,255,0.06)}
410
  .lb-table th{
 
404
  }
405
  .filter-select:hover,.filter-select:focus{border-color:var(--cyan);outline:none;box-shadow:0 0 12px var(--cyan-dim)}
406
 
407
+ .table-wrap{overflow-x:auto;overflow-y:auto;max-height:600px;border-radius:12px;border:1px solid rgba(255,255,255,0.1);background:rgba(10,10,10,0.8);box-shadow:0 2px 16px rgba(0,0,0,0.3);position:relative}
408
+ .table-wrap.lb-can-scroll{-webkit-mask-image:linear-gradient(to bottom,#000 88%,transparent 100%);mask-image:linear-gradient(to bottom,#000 88%,transparent 100%)}
409
+ .lb-table thead{position:sticky;top:0;z-index:1;backdrop-filter:blur(8px);-webkit-backdrop-filter:blur(8px)}
410
  .lb-table{width:100%;border-collapse:collapse;font-size:.78rem}
411
  .lb-table thead{background:rgba(255,255,255,0.06)}
412
  .lb-table th{