|
| 1 | +<!doctype html> |
| 2 | +<html lang="en"> |
| 3 | +<head> |
| 4 | + <meta charset="UTF-8"> |
| 5 | + <title>OmniGIRL Leaderboard</title> |
| 6 | + <!-- Web font (JetBrains Mono). These links belong inside <head>, after the charset declaration. --> |
| 7 | + <link rel="preconnect" href="https://fonts.googleapis.com"> |
| 8 | + <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| 9 | + <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap" rel="stylesheet"> |
| 10 | + <!-- NOTE(review): PapaParse is loaded here but the inline script at the end of this page never references it; drop it if nothing else needs it. --> |
| 11 | + <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script> |
| 12 | + <link rel="icon" href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f4da.png"> |
| 13 | + <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0/dist/css/bootstrap.min.css"> |
| 14 | + |
| 15 | + <style> |
| 16 | + body{font-family:"JetBrains Mono",monospace;background:#fff;color:#000} |
| 17 | + #content{width:85%} |
| 18 | + th{background:#f2f2f2;text-align:left} |
| 19 | + td{text-align:left;vertical-align:middle} |
| 20 | + #notes{font-size:1em} |
| 21 | + #notes h3{margin-top:1em;font-size:2em;text-align:center} |
| 22 | + #notes li{font-size:1.2em;font-weight:300;margin:1em} |
| 23 | + .table-striped tbody tr:nth-of-type(odd){background:#f8f9fa} |
| 24 | + @media(max-width:1400px){ |
| 25 | + body{font-size:1.6vw} |
| 26 | + #content{width:100%} |
| 27 | + h1{font-size:2em}h3{font-size:1.2em} |
| 28 | + table{font-size:small} |
| 29 | + } |
| 30 | + </style> |
| 31 | +</head> |
| 32 | + |
| 33 | +<body> |
| 34 | + <div id="content" class="container-fluid d-flex flex-column align-items-center gap-3"> |
| 35 | + |
| 36 | + <h1 class="text-nowrap mt-5">🏆 OmniGIRL Leaderboard 🏆</h1> |
| 37 | + <h3 class="fw-light text-nowrap"> |
| 38 | + <small id="warning">A Multilingual & Multimodal Benchmark for GitHub Issue Resolution<br></small> |
| 39 | + </h3> |
| 40 | + |
| 41 | + <!-- Badges: GitHub repository / ISSTA'25 paper / Hugging Face. NOTE(review): the hrefs look like placeholders (your-org/your-repo, yourISSTA25paperID, your-repo); confirm the real URLs. --> |
| 42 | + <div class="d-flex flex-row justify-content-center gap-3"> |
| 43 | + <a href="https://github.com/your-org/your-repo" target="_blank" rel="noopener"> |
| 44 | + <img src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white" |
| 45 | + alt="GitHub repository" class="img-fluid"> |
| 46 | + </a> |
| 47 | + <a href="https://openreview.net/forum?id=yourISSTA25paperID" target="_blank" rel="noopener"> |
| 48 | + <img src="https://img.shields.io/badge/Paper-ISSTA%2725-a55fed.svg?style=for-the-badge" |
| 49 | + alt="ISSTA'25 paper" class="img-fluid"> |
| 50 | + </a> |
| 51 | + <a href="https://huggingface.co/your-repo" target="_blank" rel="noopener"> |
| 52 | + <img src="https://img.shields.io/badge/HuggingFace-%23f9ac00.svg?style=for-the-badge&logo=huggingface" |
| 53 | + alt="Hugging Face" class="img-fluid"> |
| 54 | + </a> |
| 55 | + </div> |
| 56 | + |
| 57 | + <!-- Subset selector: the full benchmark plus one button per language (Python, Java, JavaScript, TypeScript). autocomplete="off" stops the browser from restoring an earlier choice on reload. --> |
| 58 | + <div class="btn-group mt-3" role="group" id="Language" aria-label="Benchmark subset"> |
| 59 | + <input type="radio" class="btn-check" name="langradio" id="Full" value="full" autocomplete="off" checked> |
| 60 | + <label class="btn btn-outline-primary" for="Full">Full</label> |
| 61 | + <input type="radio" class="btn-check" name="langradio" id="Python" value="python" autocomplete="off"> |
| 62 | + <label class="btn btn-outline-primary" for="Python">Python</label> |
| 63 | + <input type="radio" class="btn-check" name="langradio" id="Java" value="java" autocomplete="off"> |
| 64 | + <label class="btn btn-outline-primary" for="Java">Java</label> |
| 65 | + <input type="radio" class="btn-check" name="langradio" id="JavaScript" value="javascript" autocomplete="off"> |
| 66 | + <label class="btn btn-outline-primary" for="JavaScript">JavaScript</label> |
| 67 | + <input type="radio" class="btn-check" name="langradio" id="TypeScript" value="typescript" autocomplete="off"> |
| 68 | + <label class="btn btn-outline-primary" for="TypeScript">TypeScript</label> |
| 69 | + </div> |
| 70 | + |
| 71 | + <!-- Ranking table; its rows are generated into #leaderboard-body by the script at the end of the page. --> |
| 72 | + <table id="origin" class="table table-striped table-bordered border border-primary border-3 mt-4 w-100"> |
| 73 | + <thead> |
| 74 | + <tr> |
| 75 | + <th scope="col" style="width:50%">Method</th> |
| 76 | + <th scope="col" style="width:25%">Model</th> |
| 77 | + <th scope="col" style="width:10%" class="text-center">%Resolved</th> |
| 78 | + <th scope="col" style="width:15%" class="text-center">Date</th> |
| 79 | + </tr> |
| 80 | + </thead> |
| 81 | + <tbody id="leaderboard-body"></tbody> |
| 82 | + </table> |
| 83 | + |
| 84 | + <!-- Notes: what the benchmark contains, the recommended evaluation protocol, and usage terms for the baseline. --> |
| 85 | + <div id="notes" class="w-100"> |
| 86 | + <h3>📝 Notes</h3> |
| 87 | + <div class="inline-block mt-3"> |
| 88 | + <ol> |
| 89 | + <li> |
| 90 | + <strong>OmniGIRL</strong> is a multilingual & multimodal GitHub-issue-resolution benchmark |
| 91 | + with <strong>959 tasks</strong> spanning four programming languages. |
| 92 | + Inputs may include text, screenshots, rendered web pages and other modalities. |
| 93 | + </li> |
| 94 | + |
| 95 | + <li> |
| 96 | + For realistic evaluation, <em>we recommend</em> that methods automatically examine each |
| 97 | + task’s raw input to detect available modalities (e.g., embedded webpages, images), |
| 98 | + retrieve the relevant content by themselves, and invoke the appropriate tools— |
| 99 | + instead of relying on manual hints. |
| 100 | + Doing so better assesses a solver’s <strong>general-purpose issue-resolution ability in real-world scenarios</strong>. |
| 101 | + </li> |
| 102 | + |
| 103 | + <li> |
| 104 | + Our baseline system is released <em>for research purposes only</em>; please cite |
| 105 | + OmniGIRL if you use it. |
| 106 | + </li> |
| 107 | + </ol> |
| 108 | + </div> |
| 109 | + </div> |
| 110 | + |
| 111 | + |
| 112 | + <!-- More Leaderboards: links to other public leaderboards. NOTE(review): this wrapper reuses id="notes", which an earlier section already carries; ids must be unique per document, so the shared #notes styling should become a class. --> |
| 113 | + <div id="notes" class="w-100"> |
| 114 | + <h3>🤗 More Leaderboards</h3> |
| 115 | + <div class="inline-block mt-3"> |
| 116 | + <ol> |
| 117 | + <li><a href="https://bigcode-bench.github.io/">BigCodeBench</a></li> |
| 118 | + <li><a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">Big Code Models</a></li> |
| 119 | + <li><a href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Chatbot Arena</a></li> |
| 120 | + <li><a href="https://github.com/amazon-science/cceval">CrossCodeEval</a></li> |
| 121 | + <li><a href="https://fudanselab-classeval.github.io/">ClassEval</a></li> |
| 122 | + <li><a href="https://crux-eval.github.io/leaderboard.html">CRUXEval</a></li> |
| 123 | + <li><a href="https://codetlingua.github.io/leaderboard.html">Code Lingua</a></li> |
| 124 | + <li><a href="https://evo-eval.github.io/">Evo-Eval</a></li> |
| 125 | + <li><a href="https://huggingface.co/spaces/EffiBench/effibench-leaderboard">EffiBench</a></li> |
| 126 | + <li><a href="https://github.com/01-ai/HumanEval.jl">HumanEval.jl</a></li> |
| 127 | + <li><a href="https://livecodebench.github.io/leaderboard.html">LiveCodeBench</a></li> |
| 128 | + <li><a href="https://sparksofagi.github.io/MHPP/">MHPP</a></li> |
| 129 | + <li><a href="https://github.com/THUDM/NaturalCodeBench">NaturalCodeBench</a></li> |
| 130 | + <li><a href="https://github.com/Leolty/repobench">RepoBench</a></li> |
| 131 | + <li><a href="https://www.swebench.com/">SWE-bench</a></li> |
| 132 | + <li><a href="https://leaderboard.tabbyml.com/">TabbyML</a></li> |
| 133 | + <li><a href="https://llm4softwaretesting.github.io/">TestEval</a></li> |
| 134 | + </ol> |
| 135 | + </div> |
| 136 | + </div> |
| 137 | + |
| 138 | + <!-- Acknowledgements: the prior work and the page template this site builds on. |
| 139 | + NOTE(review): id="notes" is also carried by two earlier sections; ids must be unique per document, |
| 140 | + so the shared #notes styling should move to a class when the stylesheet is next touched. --> |
| 141 | +<div id="notes" class="w-100 mb-5"> |
| 142 | + <h3>🙏 Acknowledgements</h3> |
| 143 | + <div class="inline-block mt-3"> |
| 144 | + <ol> |
| 145 | + <li> |
| 146 | + We build on prior work — <strong><a href="https://arxiv.org/abs/2310.06770" target="_blank" rel="noopener">SWE-bench</a></strong>, |
| 147 | + <strong><a href="https://arxiv.org/abs/2407.01489" target="_blank" rel="noopener">Agentless</a></strong>, and |
| 148 | + <strong><a href="https://arxiv.org/abs/2404.05427" target="_blank" rel="noopener">AutoCodeRover</a></strong> — |
| 149 | + which laid the groundwork for this study. |
| 150 | + </li> |
| 151 | + |
| 152 | + <li> |
| 153 | + We thank the <strong><a href="https://github.com/evalplus/evalplus" target="_blank" rel="noopener">EvalPlus leaderboard</a></strong> |
| 154 | + team for releasing the elegant page template that inspired this site. |
| 155 | + </li> |
| 156 | + |
| 157 | + <li> |
| 158 | + Finally, we are grateful to the <strong>open-source developer community</strong> for their invaluable contributions. |
| 159 | + </li> |
| 160 | + </ol> |
| 161 | + </div> |
| 162 | +</div> |
| 163 | + |
| 164 | + |
| 165 | + </div><!-- /#content --> |
| 166 | + |
| 167 | + <!-- Rendering script: loads results.json and fills #leaderboard-body for the selected subset. --> |
| 168 | + <script> |
| 169 | + // Table body that receives one row per leaderboard entry. |
| 170 | + const tbody=document.getElementById("leaderboard-body"); |
| 171 | + // Radio inputs that choose which subset's score is displayed. |
| 172 | + const radios=document.querySelectorAll('input[name="langradio"]'); |
| 173 | + // Radio value -> field of a results.json entry that holds the score for that subset. |
| 174 | + const keyMap={full:"%resolved_full",python:"%resolved_python", |
| 175 | + java:"%resolved_java",javascript:"%resolved_javascript", |
| 176 | + typescript:"%resolved_typescript"}; |
| 177 | + // Prefixes for the first three rows after sorting, best first. |
| 178 | + const medals=["🥇 ","🥈 ","🥉 "]; |
| 179 | + // Entries parsed from results.json; stays empty until the request below succeeds. |
| 180 | + let raw=[]; |
| 181 | + |
| 182 | + /** |
| 183 | +  * Rebuild the table for one subset: keep the entries that have a score for it, |
| 184 | +  * sort them by that score in descending order and prefix the top three with a medal. |
| 185 | +  * The score is multiplied by 100 for display, so it is presumably stored as a fraction. |
| 186 | +  * @param {string} lang - a key of keyMap ("full", "python", ...); an unknown key yields an empty table. |
| 187 | +  */ |
| 188 | + function render(lang){ |
| 189 | +   tbody.innerHTML=""; |
| 190 | +   const k=keyMap[lang]; |
| 191 | +   // NOTE(review): method/model/date are inserted as raw HTML, so results.json must be trusted content; escape here if it can ever carry third-party text. |
| 192 | +   raw.filter(r=>r[k]!=null).sort((a,b)=>b[k]-a[k]).forEach((r,i)=>{ |
| 193 | +     const medal=i<medals.length?medals[i]:""; |
| 194 | +     tbody.insertAdjacentHTML("beforeend", |
| 195 | + `<tr><td>${medal}${r.method}</td> |
| 196 | + <td>${r.model}</td> |
| 197 | + <td class="text-center">${(r[k]*100).toFixed(1)}%</td> |
| 198 | + <td class="text-center">${r.date}</td></tr>`); |
| 199 | +   }); |
| 200 | + } |
| 201 | + |
| 202 | + /** Value of the radio that is checked right now; falls back to "full" when none is. */ |
| 203 | + function selectedLang(){ |
| 204 | +   const checked=document.querySelector('input[name="langradio"]:checked'); |
| 205 | +   return checked?checked.value:"full"; |
| 206 | + } |
| 207 | + |
| 208 | + // Load the data asynchronously (synchronous XMLHttpRequest on the main thread is deprecated and freezes the page). |
| 209 | + // On any failure the table shows a message instead of parsing an error body as JSON. |
| 210 | + fetch("results.json") |
| 211 | +   .then(res=>{ |
| 212 | +     if(!res.ok){throw new Error("HTTP "+res.status);} |
| 213 | +     return res.json(); |
| 214 | +   }) |
| 215 | +   .then(data=>{ |
| 216 | +     raw=Object.values(data); |
| 217 | +     // Render whichever subset is checked, since the browser may have restored an earlier choice on reload. |
| 218 | +     render(selectedLang()); |
| 219 | +   }) |
| 220 | +   .catch(err=>{ |
| 221 | +     console.error(err); |
| 222 | +     tbody.innerHTML='<tr><td colspan="4" class="text-center">Failed to load results.json</td></tr>'; |
| 223 | +   }); |
| 224 | + |
| 225 | + // Re-render whenever a different subset is selected. |
| 226 | + radios.forEach(r=>r.addEventListener("change",()=>r.checked&&render(r.value))); |
| 227 | + </script> |
| 228 | +</body> |
| 229 | +</html> |
0 commit comments