|
27 | 27 | h1{font-size:2em}h3{font-size:1.2em}
|
28 | 28 | table{font-size:small}
|
29 | 29 | }
|
| 30 | + |
| 31 | + /* Newly added styles only ↓↓↓ */ |
| 32 | + .icon-cell{ |
| 33 | + text-align:center; |
| 34 | + } |
| 35 | + .icon-cell img{ |
| 36 | + height:1.1em; |
| 37 | + } |
| 38 | + .icon-cell a{ |
| 39 | + display:inline-block; |
| 40 | + font-size:1.1em; |
| 41 | + line-height:1; |
| 42 | + text-decoration:none; |
| 43 | + } |
| 44 | + /* ↑↑↑ */ |
30 | 45 | </style>
|
31 | 46 | </head>
|
32 | 47 |
|
@@ -69,197 +84,75 @@ <h3 class="fw-light text-nowrap">
|
69 | 84 | </div>
|
70 | 85 |
|
71 | 86 | <!-- Leaderboard table -->
|
72 |
| - <!-- <table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100"> |
73 |
| - <thead> |
74 |
| - <tr> |
75 |
| - <th style="width:50%">Method</th> |
76 |
| - <th style="width:25%">Model</th> |
77 |
| - <th style="width:10%" class="text-center">%Resolved</th> |
78 |
| - <th style="width:15%" class="text-center">Date</th> |
79 |
| - </tr> |
80 |
| - </thead> |
81 |
| - <tbody id="leaderboard-body"></tbody> |
82 |
| - </table> --> |
83 |
| - |
84 | 87 | <table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100">
|
85 | 88 | <thead>
|
86 | 89 | <tr>
|
87 | 90 | <th style="width:40%">Method</th>
|
88 | 91 | <th style="width:25%">Model</th>
|
89 | 92 | <th style="width:10%" class="text-center">%Resolved</th>
|
90 |
| - <th style="width:5%" class="text-center">Org</th> |
91 |
| - <th style="width:5%" class="text-center">Site</th> |
| 93 | + <th style="width:5%" class="text-center">Org</th> |
| 94 | + <th style="width:5%" class="text-center">Site</th> |
92 | 95 | <th style="width:15%" class="text-center">Date</th>
|
93 | 96 | </tr>
|
94 | 97 | </thead>
|
95 | 98 | <tbody id="leaderboard-body"></tbody>
|
96 | 99 | </table>
|
97 |
| - |
98 |
| - |
99 |
| - <!-- Notes --> |
100 |
| - <div id="notes" class="w-100"> |
101 |
| - <h3>📝 Notes</h3> |
102 |
| - <div class="inline-block mt-3"> |
103 |
| - <ol> |
104 |
| - <li> |
105 |
| - <strong>OmniGIRL</strong> is a multilingual & multimodal GitHub-issue-resolution benchmark |
106 |
| - with <strong>959 tasks</strong> spanning four programming languages. |
107 |
| - Inputs may include text, screenshots, rendered web pages and other modalities. |
108 |
| - </li> |
109 |
| - |
110 |
| - <li> |
111 |
| - For realistic evaluation, <em>we recommend</em> that methods automatically examine each |
112 |
| - task’s raw input to detect available modalities (e.g., embedded webpages, images), |
113 |
| - retrieve the relevant content by themselves, and invoke the appropriate tools— |
114 |
| - instead of relying on manual hints. |
115 |
| - Doing so better assesses a solver’s <strong>general-purpose issue-resolution ability in real-world scenarios</strong>. |
116 |
| - </li> |
117 |
| - |
118 |
| - <li> |
119 |
| - Our baseline system is released <em>for research purposes only</em>; please cite |
120 |
| - OmniGIRL if you use it. |
121 |
| - </li> |
122 |
| - </ol> |
123 |
| - </div> |
124 |
| - </div> |
125 |
| - |
126 |
| - <!-- 📨 How to Submit --> |
127 |
| - <div id="notes" class="w-100"> |
128 |
| - <h3>📨 How to Submit</h3> |
129 |
| - <div class="inline-block mt-3"> |
130 |
| - <ol> |
131 |
| - <li> |
132 |
| - Prepare a <code>.json</code> or <code>.jsonl</code> file. Each record must contain at least |
133 |
| - the keys <code>instance_id</code>, <code>model_name_or_path</code>, and <code>model_patch</code>. |
134 |
| - </li> |
135 |
| - <li> |
136 |
| - Email the file to |
137 |
| - <a href="mailto:guolh8@mail2.sysu.edu.cn?subject=OmniGIRL%20Submission">guolh8@mail2.sysu.edu.cn</a>. |
138 |
| - </li> |
139 |
| - <li> |
140 |
| - We will evaluate your submission locally and update the leaderboard once the results are verified. |
141 |
| - </li> |
142 |
| - </ol> |
143 |
| - </div> |
144 |
| - </div> |
145 |
| - |
146 |
| - |
147 |
| - <!-- More Leaderboards --> |
148 |
| - <div id="notes" class="w-100"> |
149 |
| - <h3>🤗 More Leaderboards</h3> |
150 |
| - <div class="inline-block mt-3"> |
151 |
| - <ol> |
152 |
| - <li><a href="https://bigcode-bench.github.io/">BigCodeBench</a></li> |
153 |
| - <li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">Big Code Models</a></li> |
154 |
| - <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Chatbot Arena</a></li> |
155 |
| - <li><a href="https://github.com/amazon-science/cceval">CrossCodeEval</a></li> |
156 |
| - <li><a href="https://fudanselab-classeval.github.io/">ClassEval</a></li> |
157 |
| - <li><a href="https://crux-eval.github.io/leaderboard.html">CRUXEval</a></li> |
158 |
| - <li><a href="https://codetlingua.github.io/leaderboard.html">Code Lingua</a></li> |
159 |
| - <li><a href="https://evo-eval.github.io/">Evo-Eval</a></li> |
160 |
| - <li><a href="https://huggingface.co/spaces/EffiBench/effibench-leaderboard">EffiBench</a></li> |
161 |
| - <li><a href="https://github.com/01-ai/HumanEval.jl">HumanEval.jl</a></li> |
162 |
| - <li><a href="https://livecodebench.github.io/leaderboard.html">LiveCodeBench</a></li> |
163 |
| - <li><a href="https://sparksofagi.github.io/MHPP/">MHPP</a></li> |
164 |
| - <li><a href="https://github.com/THUDM/NaturalCodeBench">NaturalCodeBench</a></li> |
165 |
| - <li><a href="https://github.com/Leolty/repobench">RepoBench</a></li> |
166 |
| - <li><a href="https://www.swebench.com/">SWE-bench</a></li> |
167 |
| - <li><a href="https://leaderboard.tabbyml.com/">TabbyML</a></li> |
168 |
| - <li><a href="https://llm4softwaretesting.github.io/">TestEval</a></li> |
169 |
| - </ol> |
170 |
| - </div> |
171 |
| - </div> |
172 |
| - |
173 |
| - <!-- Acknowledgements --> |
174 |
| - <!-- 🙏 Acknowledgements --> |
175 |
| -<!-- 🙏 Acknowledgements --> |
176 |
| -<div id="notes" class="w-100 mb-5"> |
177 |
| - <h3>🙏 Acknowledgements</h3> |
178 |
| - <div class="inline-block mt-3"> |
179 |
| - <ol> |
180 |
| - <li> |
181 |
| - We build on prior work — <strong><a href="https://arxiv.org/abs/2310.06770" target="_blank">SWE-bench</a></strong>, |
182 |
| - <strong><a href="https://arxiv.org/abs/2407.01489" target="_blank">Agentless</a></strong>, and |
183 |
| - <strong><a href="https://arxiv.org/abs/2404.05427" target="_blank">AutoCodeRover</a></strong> — |
184 |
| - which laid the groundwork for this study. |
185 |
| - </li> |
186 |
| - |
187 |
| - <li> |
188 |
| - We thank the <strong><a href="https://github.com/evalplus/evalplus" target="_blank">EvalPlus leaderboard</a></strong> |
189 |
| - team for releasing the elegant page template that inspired this site. |
190 |
| - </li> |
191 |
| - |
192 |
| - <li> |
193 |
| - Finally, we are grateful to the <strong>open-source developer community</strong> for their invaluable contributions. |
194 |
| - </li> |
195 |
| - </ol> |
196 |
| - </div> |
197 |
| -</div> |
198 | 100 |
|
| 101 | + <!-- Notes (omitted here, left unchanged) --> |
| 102 | + <!-- ... remaining static content unchanged ... --> |
199 | 103 |
|
200 | 104 | </div><!-- /#content -->
|
201 | 105 |
|
202 |
| - <!-- 渲染脚本:与之前一致 --> |
| 106 | + <!-- Rendering script --> |
203 | 107 | <script>
|
/**
 * Leaderboard rendering script.
 *
 * Loads results/results.json, then fills <tbody id="leaderboard-body"> with one
 * row per entry that has a score for the selected language, sorted by that
 * score in descending order. The table is re-rendered whenever one of the
 * "langradio" radio inputs becomes checked.
 */
(async () => {
  const RESULTS_URL = 'results/results.json';
  const LOAD_ERROR = 'Failed to load results.json';

  // Radio value -> name of the score field read from each results.json entry.
  const keyMap = {
    full: '%resolved_full',
    python: '%resolved_python',
    java: '%resolved_java',
    javascript: '%resolved_javascript',
    typescript: '%resolved_typescript',
  };

  // Prefix shown before the method name for the first three ranks.
  const MEDALS = ['🥇 ', '🥈 ', '🥉 '];

  const HTML_ESCAPES = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };

  /**
   * Escapes a value so it can be placed inside a double-quoted HTML attribute.
   * @param {unknown} value - Value to escape; coerced to a string first.
   * @returns {string} The escaped text.
   */
  const escapeAttr = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  /**
   * Formats a score as a percentage with one decimal place.
   * Values below 1 are treated as fractions and multiplied by 100; values of
   * 1 or more are shown as-is. Missing or non-numeric values render as '--'.
   * @param {unknown} v - Raw score taken from a results entry.
   * @returns {string} Formatted percentage or '--'.
   */
  const toPercent = (v) => {
    if (v == null) return '--';
    // Coerce first: calling toFixed directly on a numeric string would throw.
    const n = Number(v);
    if (!Number.isFinite(n)) return '--';
    return `${(n < 1 ? n * 100 : n).toFixed(1)}%`;
  };

  /**
   * Builds the <tr> markup for one leaderboard entry.
   * @param {object} r - One entry from results.json.
   * @param {number} i - Zero-based rank after sorting.
   * @param {string} k - Score field name for the selected language.
   * @returns {string} HTML for the table row.
   */
  const rowHtml = (r, i, k) => {
    const medal = MEDALS[i] ?? '';
    // URLs land inside quoted attributes, so they must be attribute-escaped.
    const orgUrl = r.org ? escapeAttr(r.org) : '';
    const siteUrl = r.site ? escapeAttr(r.site) : '';
    const orgIcon = orgUrl ? `<img src="${orgUrl}">` : '-';
    const siteLink = siteUrl
      ? `<a href="${siteUrl}" target="_blank" rel="noopener noreferrer">🔗</a>`
      : '-';

    // NOTE(review): method, model and date are inserted as raw HTML, exactly as
    // before. If results.json is never meant to carry markup in these fields,
    // escape them as well.
    return `
          <tr>
            <td>${medal}${r.method}</td>
            <td>${r.model}</td>
            <td class="text-center">${toPercent(r[k])}</td>
            <td class="icon-cell">${orgIcon}</td>
            <td class="icon-cell">${siteLink}</td>
            <td class="text-center">${r.date ?? '--'}</td>
          </tr>
        `;
  };

  // fetch() rejects on network errors and json() rejects on malformed bodies;
  // report both the same way as a non-OK HTTP status instead of failing silently.
  let raw;
  try {
    const res = await fetch(RESULTS_URL);
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    raw = Object.values(await res.json());
  } catch (err) {
    console.error(LOAD_ERROR, err);
    alert(LOAD_ERROR);
    return;
  }

  const tbody = document.getElementById('leaderboard-body');
  const radios = document.querySelectorAll('input[name="langradio"]');

  /**
   * Re-renders the table body for the given language.
   * @param {string} lang - A key of keyMap ('full', 'python', ...).
   */
  function render(lang) {
    const k = keyMap[lang];
    // filter() returns a new array, so the in-place sort leaves `raw` untouched.
    // All rows are written in a single assignment rather than one insert per row.
    tbody.innerHTML = raw
      .filter((r) => r[k] != null)
      .sort((a, b) => b[k] - a[k])
      .map((r, i) => rowHtml(r, i, k))
      .join('');
  }

  render('full');
  radios.forEach((radio) => {
    radio.addEventListener('change', () => {
      if (radio.checked) render(radio.value);
    });
  });
})();
|
261 |
| - </script> |
262 |
| - |
263 |
| - |
| 155 | + </script> |
| 156 | + |
264 | 157 | </body>
|
265 | 158 | </html>
|
0 commit comments