forked from mlfoundations/dclm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evalutaion.bib
429 lines (398 loc) · 16.6 KB
/
evalutaion.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
% AGIEval benchmark paper (arXiv preprint 2304.06364).
% journal/volume are kept so styles that ignore eprint still show the arXiv id.
@article{zhong2023agieval,
  author        = {Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
  title         = {{AGIEval}: A Human-Centric Benchmark for Evaluating Foundation Models},
  journal       = {ArXiv preprint},
  volume        = {abs/2304.06364},
  year          = {2023},
  eprint        = {2304.06364},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2304.06364}
}
% ARC (AI2 Reasoning Challenge) paper (arXiv preprint 1803.05457).
% Braces protect the acronyms and the word that starts the second sentence
% from being lowercased by sentence-casing styles.
@article{arc,
  author        = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
  title         = {Think you have Solved Question Answering? {Try} {ARC}, the {AI2} Reasoning Challenge},
  journal       = {ArXiv preprint},
  volume        = {abs/1803.05457},
  year          = {2018},
  eprint        = {1803.05457},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1803.05457}
}
% BIG-bench paper, published in Transactions on Machine Learning Research.
% TMLR is a journal, so the venue is stored in the journal field of an article entry.
% The collective author is double-braced so it is not split into first/last name.
@article{srivastava2023beyond,
  author  = {{BIG-bench authors}},
  title   = {Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models},
  journal = {Transactions on Machine Learning Research (TMLR)},
  year    = {2023},
  url     = {https://openreview.net/forum?id=uyTL5Bvosj}
}
% BoolQ yes/no question answering dataset paper (NAACL-HLT 2019).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{boolq,
  author    = {Clark, Christopher and
               Lee, Kenton and
               Chang, Ming-Wei and
               Kwiatkowski, Tom and
               Collins, Michael and
               Toutanova, Kristina},
  title     = {{BoolQ}: Exploring the Surprising Difficulty of Natural Yes/No Questions},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {2924--2936},
  doi       = {10.18653/v1/N19-1300},
  url       = {https://aclanthology.org/N19-1300}
}
% CommonsenseQA question answering dataset paper (NAACL-HLT 2019).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{talmor-etal-2019-commonsenseqa,
  author    = {Talmor, Alon and
               Herzig, Jonathan and
               Lourie, Nicholas and
               Berant, Jonathan},
  title     = {{CommonsenseQA}: A Question Answering Challenge Targeting Commonsense Knowledge},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {4149--4158},
  doi       = {10.18653/v1/N19-1421},
  url       = {https://aclanthology.org/N19-1421}
}
% COPA (Choice of Plausible Alternatives) paper, AAAI Spring Symposium 2011.
% Author names are separated by exactly one "and"; a doubled separator
% would create an empty author.
@inproceedings{copa,
  author    = {Melissa Roemmele and Cosmin Adrian Bejan and Andrew S. Gordon},
  title     = {Choice of plausible alternatives: An evaluation of commonsense causal reasoning},
  booktitle = {Association for the Advancement of Artificial Intelligence ({AAAI}) Spring Symposium},
  year      = {2011},
  url       = {https://people.ict.usc.edu/~gordon/copa.html}
}
% CoQA conversational question answering paper (TACL, volume 7, 2019).
% The dataset name is braced as a whole word to keep its casing.
@article{reddy-etal-2019-coqa,
  author    = {Reddy, Siva and
               Chen, Danqi and
               Manning, Christopher D.},
  title     = {{CoQA}: A Conversational Question Answering Challenge},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {7},
  year      = {2019},
  pages     = {249--266},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  doi       = {10.1162/tacl_a_00266},
  url       = {https://aclanthology.org/Q19-1016}
}
% HellaSwag sentence-completion benchmark paper (ACL 2019).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{hellaswag,
  author    = {Zellers, Rowan and
               Holtzman, Ari and
               Bisk, Yonatan and
               Farhadi, Ali and
               Choi, Yejin},
  title     = {{HellaSwag}: Can a Machine Really Finish Your Sentence?},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  pages     = {4791--4800},
  doi       = {10.18653/v1/P19-1472},
  url       = {https://aclanthology.org/P19-1472}
}
% Kaggle-hosted dataset of 200,000+ Jeopardy! questions (web resource).
% The hosting site is used as a corporate author (double-braced).
% NOTE(review): the dataset URL names the uploader account "tunguz"; confirm
% whether the uploader should be credited as author instead.
@misc{kaggle200000Jeopardy,
  author       = {{Kaggle}},
  title        = {200,000+ {Jeopardy!} Questions},
  howpublished = {\url{https://www.kaggle.com/datasets/tunguz/200000-jeopardy-questions}},
  year         = {2019}
}
% LAMBADA word-prediction dataset paper (ACL 2016, Volume 1: Long Papers).
@inproceedings{lambada,
  title     = {The {LAMBADA} dataset: Word prediction requiring a broad discourse context},
  author    = {Paperno, Denis and Kruszewski, Germ{\'a}n and Lazaridou, Angeliki and Pham, Ngoc Quan and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fern{\'a}ndez, Raquel},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2016},
  pages     = {1525--1534},
  publisher = {Association for Computational Linguistics},
  address   = {Berlin, Germany},
  url       = {https://aclanthology.org/P16-1144},
  doi       = {10.18653/v1/P16-1144}
}
% OpenBookQA dataset paper (EMNLP 2018).
% The word after the question mark is braced because sentence-casing styles
% only keep capitals after a colon, not after "?".
@inproceedings{OpenBookQA2018,
  author    = {Mihaylov, Todor and
               Clark, Peter and
               Khot, Tushar and
               Sabharwal, Ashish},
  title     = {Can a Suit of Armor Conduct Electricity? {A} New Dataset for Open Book Question Answering},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {2381--2391},
  doi       = {10.18653/v1/D18-1260},
  url       = {https://aclanthology.org/D18-1260}
}
% PIQA physical commonsense reasoning paper (AAAI 2020).
% The colon sits outside the braces so styles still see the subtitle boundary.
% The compound surname is given in "Last, First" form so it is not split.
@inproceedings{piqa,
  author    = {Bisk, Yonatan and
               Zellers, Rowan and
               Le Bras, Ronan and
               Gao, Jianfeng and
               Choi, Yejin},
  title     = {{PIQA}: Reasoning about Physical Commonsense in Natural Language},
  booktitle = {The Thirty-Fourth {AAAI} Conference on Artificial Intelligence, {AAAI}
               2020, The Thirty-Second Innovative Applications of Artificial Intelligence
               Conference, {IAAI} 2020, The Tenth {AAAI} Symposium on Educational
               Advances in Artificial Intelligence, {EAAI} 2020, New York, NY, USA,
               February 7-12, 2020},
  year      = {2020},
  publisher = {{AAAI} Press},
  pages     = {7432--7439},
  url       = {https://aaai.org/ojs/index.php/AAAI/article/view/6239},
  timestamp = {Thu, 04 Jun 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/aaai/BiskZLGC20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% SQuAD reading comprehension dataset paper (EMNLP 2016).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{squad,
  author    = {Rajpurkar, Pranav and
               Zhang, Jian and
               Lopyrev, Konstantin and
               Liang, Percy},
  title     = {{SQuAD}: 100,000+ Questions for Machine Comprehension of Text},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  year      = {2016},
  address   = {Austin, Texas},
  publisher = {Association for Computational Linguistics},
  pages     = {2383--2392},
  doi       = {10.18653/v1/D16-1264},
  url       = {https://aclanthology.org/D16-1264}
}
% Winograd Schema Challenge paper (KR conference, 2012).
% "Winograd" is a proper noun, so it is capitalised and brace-protected.
@inproceedings{winograd,
  author    = {Levesque, Hector and Davis, Ernest and Morgenstern, Leora},
  title     = {The {Winograd} Schema Challenge},
  booktitle = {International Conference on the Principles of Knowledge Representation and Reasoning},
  year      = {2012},
  url       = {https://aaai.org/papers/59-4492-the-winograd-schema-challenge}
}
% WinoGrande adversarial Winograd-schema benchmark paper (AAAI 2020).
% The compound surname is given in "Last, First" form; in "First Last" form
% only the final word would be taken as the surname.
@inproceedings{sakaguchi2019winogrande,
  author    = {Sakaguchi, Keisuke and
               Le Bras, Ronan and
               Bhagavatula, Chandra and
               Choi, Yejin},
  title     = {{WinoGrande}: An Adversarial {Winograd} Schema Challenge at Scale},
  booktitle = {The Thirty-Fourth {AAAI} Conference on Artificial Intelligence, {AAAI}
               2020, The Thirty-Second Innovative Applications of Artificial Intelligence
               Conference, {IAAI} 2020, The Tenth {AAAI} Symposium on Educational
               Advances in Artificial Intelligence, {EAAI} 2020, New York, NY, USA,
               February 7-12, 2020},
  year      = {2020},
  publisher = {{AAAI} Press},
  pages     = {8732--8740},
  url       = {https://aaai.org/ojs/index.php/AAAI/article/view/6399},
  timestamp = {Tue, 02 Feb 2021 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/conf/aaai/SakaguchiBBC20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% (Removed a second, identical definition of the key zhong2023agieval here.
% The key is already defined earlier in this file, and BibTeX reports a
% repeated-entry error when the same key is defined twice.)
% Paper on solving algebraic word problems via rationale generation
% (ACL 2017, Volume 1: Long Papers).
@inproceedings{ling2017program,
  title     = {Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems},
  author    = {Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2017},
  pages     = {158--167},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  url       = {https://aclanthology.org/P17-1015},
  doi       = {10.18653/v1/P17-1015}
}
% BBQ bias benchmark for question answering (Findings of ACL 2022).
@inproceedings{bbq,
  title     = {{BBQ}: A hand-built bias benchmark for question answering},
  author    = {Parrish, Alicia and Chen, Angelica and Nangia, Nikita and Padmakumar, Vishakh and Phang, Jason and Thompson, Jana and Htut, Phu Mon and Bowman, Samuel},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
  year      = {2022},
  pages     = {2086--2105},
  publisher = {Association for Computational Linguistics},
  address   = {Dublin, Ireland},
  url       = {https://aclanthology.org/2022.findings-acl.165},
  doi       = {10.18653/v1/2022.findings-acl.165}
}
% (Removed a second, identical definition of the key srivastava2023beyond here.
% The key is already defined earlier in this file, and BibTeX reports a
% repeated-entry error when the same key is defined twice.)
% Patronus AI announcement of the EnterprisePII dataset (web page).
% The title drops the scraped site-name prefix and suffix, uses an ASCII
% apostrophe, and braces whole names instead of single letters.
@misc{patronusPatronusPatronus,
  author       = {{Patronus AI}},
  title        = {{Patronus AI} launches {EnterprisePII}, the industry's first {LLM} dataset for detecting business-sensitive information},
  howpublished = {\url{https://www.patronus.ai/announcements/patronus-ai-launches-enterprisepii-the-industrys-first-llm-dataset-for-detecting-business-sensitive-information}},
  year         = {2023}
}
% GPQA graduate-level question answering benchmark (arXiv preprint 2311.12022).
% Capitalisation is restored and protected for the acronym and proper nouns.
@article{rein2023gpqa,
  author        = {Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R.},
  title         = {{GPQA}: A Graduate-Level {Google}-Proof {Q\&A} Benchmark},
  journal       = {ArXiv preprint},
  volume        = {abs/2311.12022},
  year          = {2023},
  eprint        = {2311.12022},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2311.12022}
}
% Paper cited under the key gsm8k: "Training Verifiers to Solve Math Word
% Problems" (arXiv preprint 2110.14168).
% journal/volume are kept so styles that ignore eprint still show the arXiv id.
@article{gsm8k,
  author        = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  title         = {Training Verifiers to Solve Math Word Problems},
  journal       = {ArXiv preprint},
  volume        = {abs/2110.14168},
  year          = {2021},
  eprint        = {2110.14168},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2110.14168}
}
% LogiQA logical-reasoning reading comprehension dataset paper (IJCAI 2020).
% The dataset name is braced so sentence-casing styles keep its capitals.
@inproceedings{Liu2020LogiQAAC,
  author    = {Jian Liu and
               Leyang Cui and
               Hanmeng Liu and
               Dandan Huang and
               Yile Wang and
               Yue Zhang},
  editor    = {Christian Bessiere},
  title     = {{LogiQA}: {A} Challenge Dataset for Machine Reading Comprehension with
               Logical Reasoning},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on
               Artificial Intelligence, {IJCAI} 2020},
  year      = {2020},
  publisher = {ijcai.org},
  pages     = {3622--3628},
  doi       = {10.24963/ijcai.2020/501},
  url       = {https://doi.org/10.24963/ijcai.2020/501},
  timestamp = {Mon, 19 Oct 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/ijcai/LiuCLHWZ20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% MathQA math word problem dataset paper (NAACL-HLT 2019).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{mathqa,
  author    = {Amini, Aida and
               Gabriel, Saadia and
               Lin, Shanchuan and
               Koncel-Kedziorski, Rik and
               Choi, Yejin and
               Hajishirzi, Hannaneh},
  title     = {{MathQA}: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {2357--2367},
  doi       = {10.18653/v1/N19-1245},
  url       = {https://aclanthology.org/N19-1245}
}
% Paper cited under the key mmlu: "Measuring Massive Multitask Language
% Understanding" (ICLR 2021).
@inproceedings{mmlu,
  title     = {Measuring Massive Multitask Language Understanding},
  author    = {Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
  booktitle = {9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, Austria, May 3-7, 2021},
  year      = {2021},
  publisher = {OpenReview.net},
  url       = {https://openreview.net/forum?id=d7KBjmI3GmQ},
  timestamp = {Wed, 23 Jun 2021 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% PubMedQA biomedical question answering dataset paper (EMNLP-IJCNLP 2019).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{pubmed,
  author    = {Jin, Qiao and
               Dhingra, Bhuwan and
               Liu, Zhengping and
               Cohen, William and
               Lu, Xinghua},
  title     = {{PubMedQA}: A Dataset for Biomedical Research Question Answering},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  pages     = {2567--2577},
  doi       = {10.18653/v1/D19-1259},
  url       = {https://aclanthology.org/D19-1259}
}
% MosaicML llm-foundry document EVAL_GAUNTLET.md (GitHub web resource).
% The title is the repository path of the document; it is braced as a unit
% because it is a case-sensitive file path.
@misc{mosailMLarithmetic,
  author       = {{MosaicML}},
  title        = {{llm-foundry/scripts/eval/local\_data/EVAL\_GAUNTLET.md}},
  howpublished = {\url{https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/EVAL_GAUNTLET.md}},
  year         = {2023}
}
% Social IQa commonsense reasoning dataset paper (EMNLP-IJCNLP 2019).
% The mixed-case word in the dataset name is braced as a whole.
@inproceedings{siqa,
  author    = {Sap, Maarten and
               Rashkin, Hannah and
               Chen, Derek and
               Le Bras, Ronan and
               Choi, Yejin},
  title     = {Social {IQa}: Commonsense Reasoning about Social Interactions},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  pages     = {4463--4473},
  doi       = {10.18653/v1/D19-1454},
  url       = {https://aclanthology.org/D19-1454}
}
% Paper asking whether NLP models can solve simple math word problems (NAACL-HLT 2021).
@inproceedings{patel2021nlp,
  title     = {Are {NLP} Models really able to Solve Simple Math Word Problems?},
  author    = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year      = {2021},
  pages     = {2080--2094},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  url       = {https://aclanthology.org/2021.naacl-main.168},
  doi       = {10.18653/v1/2021.naacl-main.168}
}
% TriviaQA reading comprehension dataset paper (ACL 2017, Volume 1: Long Papers).
% The dataset name is braced as a whole word to keep its casing.
@inproceedings{triviaqa,
  author    = {Joshi, Mandar and
               Choi, Eunsol and
               Weld, Daniel and
               Zettlemoyer, Luke},
  title     = {{TriviaQA}: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1601--1611},
  doi       = {10.18653/v1/P17-1147},
  url       = {https://aclanthology.org/P17-1147}
}
% Paper cited under the key winogender: "Gender Bias in Coreference Resolution"
% (NAACL-HLT 2018, Volume 2: Short Papers).
@inproceedings{winogender,
  title     = {Gender Bias in Coreference Resolution},
  author    = {Rudinger, Rachel and Naradowsky, Jason and Leonard, Brian and Van Durme, Benjamin},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)},
  year      = {2018},
  pages     = {8--14},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  url       = {https://aclanthology.org/N18-2002},
  doi       = {10.18653/v1/N18-2002}
}
% "Evaluating Large Language Models Trained on Code" (arXiv preprint 2107.03374).
% journal/volume are kept so styles that ignore eprint still show the arXiv id.
% NOTE(review): the author list looks like an automated export; verify the
% names against the arXiv listing before relying on them.
@article{Chen2021EvaluatingLL,
  author        = {Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde and Jared Kaplan and Harrison Edwards and Yura Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and David W. Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William H. Guss and Alex Nichol and Igor Babuschkin and Suchir Balaji and Shantanu Jain and Andrew Carr and Jan Leike and Joshua Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew M. Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  title         = {Evaluating Large Language Models Trained on Code},
  journal       = {ArXiv preprint},
  volume        = {abs/2107.03374},
  year          = {2021},
  eprint        = {2107.03374},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2107.03374}
}