Skip to content

Commit 3d5747a

Browse files
author
jty1128
committed
init html
1 parent 453405c commit 3d5747a

File tree

2 files changed

+288
-0
lines changed

2 files changed

+288
-0
lines changed

omnigirl.html

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
<!doctype html>
2+
<html lang="en">
3+
<head>
  <!-- Encoding declaration comes first so it is seen before any other metadata. -->
  <meta charset="UTF-8">
  <title>OmniGIRL Leaderboard</title>

  <!-- JetBrains Mono web font: open the connections early, then load the font CSS.
       Resource links are metadata content, so they live inside <head> rather than
       between <html> and <head>. -->
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap" rel="stylesheet">

  <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
  <link rel="icon" href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f4da.png">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0/dist/css/bootstrap.min.css">

  <style>
    /* Base look: monospace text, black on white. */
    body { font-family: "JetBrains Mono", monospace; background: #fff; color: #000; }

    /* Main column; widened to the full viewport by the media query below. */
    #content { width: 85%; }

    /* Leaderboard table cells. */
    th { background: #f2f2f2; text-align: left; }
    td { text-align: left; vertical-align: middle; }

    /* Notes-style sections (heading plus numbered list). */
    #notes { font-size: 1em; }
    #notes h3 { margin-top: 1em; font-size: 2em; text-align: center; }
    #notes li { font-size: 1.2em; font-weight: 300; margin: 1em; }

    /* Zebra striping for odd body rows. */
    .table-striped tbody tr:nth-of-type(odd) { background: #f8f9fa; }

    /* Viewports up to 1400px wide: scale text with the viewport and use the full width. */
    @media (max-width: 1400px) {
      body { font-size: 1.6vw; }
      #content { width: 100%; }
      h1 { font-size: 2em; }
      h3 { font-size: 1.2em; }
      table { font-size: small; }
    }
  </style>
</head>
32+
33+
<body>
34+
<div id="content" class="container-fluid d-flex flex-column align-items-center gap-3">
35+
36+
<h1 class="text-nowrap mt-5">🏆 OmniGIRL Leaderboard 🏆</h1>
37+
<h3 class="fw-light text-nowrap">
38+
<small id="warning">A Multilingual &amp; Multimodal Benchmark for GitHub Issue Resolution<br></small>
39+
</h3>
40+
41+
<!-- Badges: GitHub / Paper / Hugging Face -->
<!-- NOTE(review): the three hrefs below look like template placeholders
     ("your-org/your-repo", "yourISSTA25paperID", "your-repo"); confirm the real URLs. -->
<div class="d-flex flex-row justify-content-center gap-3">
  <!-- Each badge opens in a new tab; rel="noopener noreferrer" keeps that tab from
       reaching back into this page through window.opener. The alt text names the
       link destination because the image is the only content of the link. -->
  <a href="https://github.com/your-org/your-repo" target="_blank" rel="noopener noreferrer">
    <img src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white"
         alt="OmniGIRL repository on GitHub" class="img-fluid">
  </a>
  <a href="https://openreview.net/forum?id=yourISSTA25paperID" target="_blank" rel="noopener noreferrer">
    <img src="https://img.shields.io/badge/Paper-ISSTA%2725-a55fed.svg?style=for-the-badge"
         alt="OmniGIRL paper (ISSTA'25)" class="img-fluid">
  </a>
  <a href="https://huggingface.co/your-repo" target="_blank" rel="noopener noreferrer">
    <img src="https://img.shields.io/badge/HuggingFace-%23f9ac00.svg?style=for-the-badge&logo=huggingface"
         alt="OmniGIRL on Hugging Face" class="img-fluid">
  </a>
</div>
56+
57+
<!-- Language filter: "Full" plus one toggle button per programming language -->
<!-- aria-label gives the group an accessible name. autocomplete="off" keeps the
     initially checked option ("Full") authoritative after a reload instead of a
     selection restored by the browser. -->
<div class="btn-group mt-3" role="group" id="Language" aria-label="Benchmark language">
  <input type="radio" class="btn-check" name="langradio" id="Full" value="full" autocomplete="off" checked>
  <label class="btn btn-outline-primary" for="Full">Full</label>

  <input type="radio" class="btn-check" name="langradio" id="Python" value="python" autocomplete="off">
  <label class="btn btn-outline-primary" for="Python">Python</label>

  <input type="radio" class="btn-check" name="langradio" id="Java" value="java" autocomplete="off">
  <label class="btn btn-outline-primary" for="Java">Java</label>

  <input type="radio" class="btn-check" name="langradio" id="JavaScript" value="javascript" autocomplete="off">
  <label class="btn btn-outline-primary" for="JavaScript">JavaScript</label>

  <input type="radio" class="btn-check" name="langradio" id="TypeScript" value="typescript" autocomplete="off">
  <label class="btn btn-outline-primary" for="TypeScript">TypeScript</label>
</div>
70+
71+
<!-- Leaderboard table. The body (#leaderboard-body) starts empty and is filled in
     by the script at the end of the page. -->
<table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100">
  <!-- Visually hidden caption: names the table for assistive technology without
       changing the visible layout. -->
  <caption class="visually-hidden">OmniGIRL leaderboard: percentage of issues resolved per method and model</caption>
  <thead>
    <tr>
      <!-- scope="col" marks each header as applying to its whole column. -->
      <th scope="col" style="width:50%">Method</th>
      <th scope="col" style="width:25%">Model</th>
      <th scope="col" style="width:10%" class="text-center">%Resolved</th>
      <th scope="col" style="width:15%" class="text-center">Date</th>
    </tr>
  </thead>
  <tbody id="leaderboard-body"></tbody>
</table>
83+
84+
<!-- Notes -->
85+
<div id="notes" class="w-100">
86+
<h3>📝 Notes</h3>
87+
<div class="inline-block mt-3">
88+
<ol>
89+
<li>
90+
<strong>OmniGIRL</strong> is a multilingual &amp; multimodal GitHub-issue-resolution benchmark
91+
with <strong>959 tasks</strong> spanning four programming languages.
92+
Inputs may include text, screenshots, rendered web pages and other modalities.
93+
</li>
94+
95+
<li>
96+
For realistic evaluation, <em>we recommend</em> that methods automatically examine each
97+
task’s raw input to detect available modalities (e.g., embedded webpages, images),
98+
retrieve the relevant content by themselves, and invoke the appropriate tools—
99+
instead of relying on manual hints.
100+
Doing so better assesses a solver’s <strong>general-purpose issue-resolution ability in real-world scenarios</strong>.
101+
</li>
102+
103+
<li>
104+
Our baseline system is released <em>for research purposes only</em>; please cite
105+
OmniGIRL if you use it.
106+
</li>
107+
</ol>
108+
</div>
109+
</div>
110+
111+
112+
<!-- More Leaderboards -->
113+
<div id="notes" class="w-100">
114+
<h3>🤗 More Leaderboards</h3>
115+
<div class="inline-block mt-3">
116+
<ol>
117+
<li><a href="https://bigcode-bench.github.io/">BigCodeBench</a></li>
118+
<li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">Big Code Models</a></li>
119+
<li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Chatbot Arena</a></li>
120+
<li><a href="https://github.com/amazon-science/cceval">CrossCodeEval</a></li>
121+
<li><a href="https://fudanselab-classeval.github.io/">ClassEval</a></li>
122+
<li><a href="https://crux-eval.github.io/leaderboard.html">CRUXEval</a></li>
123+
<li><a href="https://codetlingua.github.io/leaderboard.html">Code Lingua</a></li>
124+
<li><a href="https://evo-eval.github.io/">Evo-Eval</a></li>
125+
<li><a href="https://huggingface.co/spaces/EffiBench/effibench-leaderboard">EffiBench</a></li>
126+
<li><a href="https://github.com/01-ai/HumanEval.jl">HumanEval.jl</a></li>
127+
<li><a href="https://livecodebench.github.io/leaderboard.html">LiveCodeBench</a></li>
128+
<li><a href="https://sparksofagi.github.io/MHPP/">MHPP</a></li>
129+
<li><a href="https://github.com/THUDM/NaturalCodeBench">NaturalCodeBench</a></li>
130+
<li><a href="https://github.com/Leolty/repobench">RepoBench</a></li>
131+
<li><a href="https://www.swebench.com/">SWE-bench</a></li>
132+
<li><a href="https://leaderboard.tabbyml.com/">TabbyML</a></li>
133+
<li><a href="https://llm4softwaretesting.github.io/">TestEval</a></li>
134+
</ol>
135+
</div>
136+
</div>
137+
138+
<!-- Acknowledgements -->
139+
<!-- 🙏 Acknowledgements -->
140+
<!-- 🙏 Acknowledgements -->
141+
<div id="notes" class="w-100 mb-5">
142+
<h3>🙏 Acknowledgements</h3>
143+
<div class="inline-block mt-3">
144+
<ol>
145+
<li>
146+
We build on prior work — <strong><a href="https://arxiv.org/abs/2310.06770" target="_blank">SWE-bench</a></strong>,
147+
<strong><a href="https://arxiv.org/abs/2407.01489" target="_blank">Agentless</a></strong>, and
148+
<strong><a href="https://arxiv.org/abs/2404.05427" target="_blank">AutoCodeRover</a></strong> —
149+
which laid the groundwork for this study.
150+
</li>
151+
152+
<li>
153+
We thank the <strong><a href="https://github.com/evalplus/evalplus" target="_blank">EvalPlus leaderboard</a></strong>
154+
team for releasing the elegant page template that inspired this site.
155+
</li>
156+
157+
<li>
158+
Finally, we are grateful to the <strong>open-source developer community</strong> for their invaluable contributions.
159+
</li>
160+
</ol>
161+
</div>
162+
</div>
163+
164+
165+
</div><!-- /#content -->
166+
167+
<!-- Rendering script: same as before -->
168+
<script>
169+
// DOM handles used by the rest of the script: the table body that receives the
// rows, and the radio inputs of the language selector.
const tbody = document.getElementById("leaderboard-body");
const radios = document.querySelectorAll('input[name="langradio"]');

/**
 * Load the leaderboard entries from the first URL that yields usable JSON.
 *
 * The request is deliberately synchronous so that `raw` is populated before the
 * statements that follow in this script run.
 *
 * @param {string[]} urls Candidate locations, tried in order.
 * @returns {Object[]} The values of the parsed top-level JSON object, or an
 *     empty array (after alerting the user) when no candidate could be loaded.
 */
function loadResults(urls) {
  for (const url of urls) {
    try {
      const xhr = new XMLHttpRequest();
      xhr.open("GET", url, false);
      xhr.send();
      // 200 is the normal HTTP success. Status 0 with a body is accepted as well,
      // so a response delivered without an HTTP status is still parsed.
      const gotBody = xhr.status === 200 || (xhr.status === 0 && xhr.responseText);
      if (gotBody) {
        return Object.values(JSON.parse(xhr.responseText));
      }
    } catch (err) {
      // Network failure or malformed JSON: report it and try the next candidate.
      console.error("Could not load " + url, err);
    }
  }
  alert("Failed to load results.json");
  return [];
}

// "results.json" next to the page is tried first (the original location);
// "results/results.json" is the fallback.
// NOTE(review): the data file appears to live under results/ — confirm which path
// is the deployed one and drop the other.
const raw = loadResults(["results.json", "results/results.json"]);

// Maps a radio button value to the JSON field holding that language's resolved rate.
const keyMap = {
  full: "%resolved_full",
  python: "%resolved_python",
  java: "%resolved_java",
  javascript: "%resolved_javascript",
  typescript: "%resolved_typescript"
};
177+
/**
 * Rebuild the leaderboard rows for one language.
 *
 * Reads `raw`, `keyMap` and `tbody` from the enclosing script. Entries without a
 * score for the selected language are skipped; the rest are listed from highest
 * to lowest score, with medals on the first three rows.
 *
 * Cells are created as DOM nodes and filled through textContent, so characters
 * such as "<" or "&" in a method or model name are shown literally instead of
 * being parsed as markup.
 *
 * @param {string} lang A key of `keyMap` ("full", "python", ...). An unknown key
 *     leaves the table empty.
 */
function render(lang) {
  const k = keyMap[lang];
  const medals = ["🥇 ", "🥈 ", "🥉 "];

  const ranked = raw
    .filter(entry => entry[k] != null)
    .sort((a, b) => b[k] - a[k]);

  // Build all rows off-document and attach them in one step.
  const rows = document.createDocumentFragment();
  ranked.forEach((entry, i) => {
    const tr = document.createElement("tr");
    const addCell = (text, className) => {
      const td = document.createElement("td");
      if (className) td.className = className;
      td.textContent = String(text);
      tr.appendChild(td);
    };

    addCell((medals[i] || "") + entry.method);
    addCell(entry.model);
    // Scores are stored as fractions (0.086) and displayed as percentages (8.6%).
    addCell((entry[k] * 100).toFixed(1) + "%", "text-center");
    addCell(entry.date, "text-center");

    rows.appendChild(tr);
  });

  tbody.innerHTML = "";
  tbody.appendChild(rows);
}
189+
// Initial render: follow whichever radio is checked when the script runs, so the
// table matches the highlighted button even if the browser restored an earlier
// selection. Falls back to "full" when nothing is checked.
const selectedRadio = Array.from(radios).find(radio => radio.checked);
render(selectedRadio ? selectedRadio.value : "full");

// Re-render whenever the user picks another language.
radios.forEach(radio => {
  radio.addEventListener("change", () => {
    if (radio.checked) {
      render(radio.value);
    }
  });
});
191+
</script>
192+
</body>
193+
</html>

results/results.json

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
{
2+
"Oracle Retrieval (DeepSeek-V2.5)": {
3+
"method": "Oracle Retrieval (DeepSeek-V2.5)",
4+
"model": "DeepSeek-V2.5",
5+
"%resolved_full": 0.027,
6+
"%resolved_python": 0.013,
7+
"%resolved_java": 0.105,
8+
"%resolved_javascript": 0.022,
9+
"%resolved_typescript": 0.019,
10+
"date": "2025-04-27"
11+
},
12+
"Agentless-X (DeepSeek-V2.5)": {
13+
"method": "Agentless-v1-X (DeepSeek-V2.5)",
14+
"model": "DeepSeek-V2.5",
15+
"%resolved_full": 0.039,
16+
"%resolved_python": 0.061,
17+
"%resolved_java": 0.029,
18+
"%resolved_javascript": 0.026,
19+
"%resolved_typescript": 0.019,
20+
"date": "2025-04-27"
21+
},
22+
"AutoCodeRover-X (DeepSeek-V2.5)": {
23+
"method": "AutoCodeRover(v20240620)-X (DeepSeek-V2.5)",
24+
"model": "DeepSeek-V2.5",
25+
"%resolved_full": 0.060,
26+
"%resolved_python": 0.072,
27+
"%resolved_java": 0.114,
28+
"%resolved_javascript": 0.037,
29+
"%resolved_typescript": 0.043,
30+
"date": "2025-04-27"
31+
},
32+
33+
"Oracle Retrieval (GPT-4o-2024-08-06)": {
34+
"method": "Oracle Retrieval (GPT-4o-2024-08-06)",
35+
"model": "GPT-4o-2024-08-06",
36+
"%resolved_full": 0.027,
37+
"%resolved_python": 0.019,
38+
"%resolved_java": 0.067,
39+
"%resolved_javascript": 0.019,
40+
"%resolved_typescript": 0.033,
41+
"date": "2025-04-27"
42+
},
43+
"Agentless-X (GPT-4o-2024-08-06)": {
44+
"method": "Agentless-v1-X (GPT-4o-2024-08-06)",
45+
"model": "GPT-4o-2024-08-06",
46+
"%resolved_full": 0.086,
47+
"%resolved_python": 0.088,
48+
"%resolved_java": 0.181,
49+
"%resolved_javascript": 0.063,
50+
"%resolved_typescript": 0.062,
51+
"date": "2025-04-27"
52+
},
53+
"AutoCodeRover-X (GPT-4o-2024-08-06)": {
54+
"method": "AutoCodeRover(v20240620)-X (GPT-4o-2024-08-06)",
55+
"model": "GPT-4o-2024-08-06",
56+
"%resolved_full": 0.081,
57+
"%resolved_python": 0.099,
58+
"%resolved_java": 0.171,
59+
"%resolved_javascript": 0.037,
60+
"%resolved_typescript": 0.062,
61+
"date": "2025-04-27"
62+
},
63+
64+
"Oracle Retrieval (Claude-3.5-Sonnet-2024-06-25)": {
65+
"method": "Oracle Retrieval (Claude-3.5-Sonnet-2024-06-25)",
66+
"model": "Claude-3.5-Sonnet-2024-06-25",
67+
"%resolved_full": 0.078,
68+
"%resolved_python": 0.051,
69+
"%resolved_java": 0.181,
70+
"%resolved_javascript": 0.085,
71+
"%resolved_typescript": 0.067,
72+
"date": "2025-04-27"
73+
},
74+
"Agentless-X (Claude-3.5-Sonnet-2024-06-25)": {
75+
"method": "Agentless-v1-X (Claude-3.5-Sonnet-2024-06-25)",
76+
"model": "Claude-3.5-Sonnet-2024-06-25",
77+
"%resolved_full": 0.074,
78+
"%resolved_python": 0.078,
79+
"%resolved_java": 0.181,
80+
"%resolved_javascript": 0.044,
81+
"%resolved_typescript": 0.057,
82+
"date": "2025-04-27"
83+
},
84+
"AutoCodeRover-X (Claude-3.5-Sonnet-2024-06-25)": {
85+
"method": "AutoCodeRover(v20240620)-X (Claude-3.5-Sonnet-2024-06-25)",
86+
"model": "Claude-3.5-Sonnet-2024-06-25",
87+
"%resolved_full": 0.076,
88+
"%resolved_python": 0.088,
89+
"%resolved_java": 0.190,
90+
"%resolved_javascript": 0.041,
91+
"%resolved_typescript": 0.043,
92+
"date": "2025-04-27"
93+
}
94+
}
95+

0 commit comments

Comments
 (0)