bib/evalutaion.bib

% AGIEval benchmark paper (arXiv preprint). The benchmark name is braced so
% that sentence-casing styles do not render it as "Agieval".
@article{zhong2023agieval,
  author  = {Zhong, Wanjun and Cui, Ruixiang and Guo, Yiduo and Liang, Yaobo and
             Lu, Shuai and Wang, Yanlin and Saied, Amin and Chen, Weizhu and
             Duan, Nan},
  title   = {{AGIEval}: A Human-Centric Benchmark for Evaluating Foundation Models},
  journal = {ArXiv preprint},
  volume  = {abs/2304.06364},
  year    = {2023},
  url     = {https://arxiv.org/abs/2304.06364}
}

% ARC (AI2 Reasoning Challenge) paper (arXiv preprint). Acronyms are braced,
% and so is "Try": BibTeX keeps a capital only after a colon, not after "?".
@article{arc,
  author  = {Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and
             Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
  title   = {Think you have Solved Question Answering? {Try} {ARC}, the {AI2} Reasoning Challenge},
  journal = {ArXiv preprint},
  volume  = {abs/1803.05457},
  year    = {2018},
  url     = {https://arxiv.org/abs/1803.05457}
}

% BIG-bench paper. The venue is a journal, so the entry is an article with a
% journal field. The collective author is double-braced so it is treated as
% one indivisible name instead of being split into given and family parts.
@article{srivastava2023beyond,
  author  = {{BIG-bench authors}},
  title   = {Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models},
  journal = {Transactions on Machine Learning Research},
  year    = {2023},
  note    = {\url{https://openreview.net/forum?id=uyTL5Bvosj}}
}

% BoolQ: yes/no question answering dataset paper.
@inproceedings{boolq,
  author    = {Clark, Christopher and Lee, Kenton and Chang, Ming-Wei and
               Kwiatkowski, Tom and Collins, Michael and Toutanova, Kristina},
  title     = {{BoolQ}: Exploring the Surprising Difficulty of Natural Yes/No Questions},
  booktitle = {Proceedings of the 2019 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  pages     = {2924--2936},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/N19-1300},
  url       = {https://aclanthology.org/N19-1300}
}

% CommonsenseQA: question answering challenge targeting commonsense knowledge.
@inproceedings{talmor-etal-2019-commonsenseqa,
  author    = {Talmor, Alon and Herzig, Jonathan and Lourie, Nicholas and
               Berant, Jonathan},
  title     = {{CommonsenseQA}: A Question Answering Challenge Targeting Commonsense Knowledge},
  booktitle = {Proceedings of the 2019 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  pages     = {4149--4158},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/N19-1421},
  url       = {https://aclanthology.org/N19-1421}
}

% COPA (Choice of Plausible Alternatives) paper. The author list uses exactly
% one "and" between names (a doubled "and" yields an empty author), and the
% title carries no trailing period because the style adds its own.
@inproceedings{copa,
  author    = {Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S.},
  title     = {Choice of Plausible Alternatives: An Evaluation of Commonsense Causal Reasoning},
  booktitle = {Association for the Advancement of Artificial Intelligence ({AAAI}) Spring Symposium},
  year      = {2011},
  note      = {\url{https://people.ict.usc.edu/~gordon/copa.html}}
}

% CoQA: conversational question answering challenge (journal article).
@article{reddy-etal-2019-coqa,
  author    = {Reddy, Siva and Chen, Danqi and Manning, Christopher D.},
  title     = {{CoQA}: A Conversational Question Answering Challenge},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {7},
  pages     = {249--266},
  year      = {2019},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  doi       = {10.1162/tacl_a_00266},
  url       = {https://aclanthology.org/Q19-1016}
}

% HellaSwag: sentence-completion benchmark paper.
@inproceedings{hellaswag,
  author    = {Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and
               Farhadi, Ali and Choi, Yejin},
  title     = {{HellaSwag}: Can a Machine Really Finish Your Sentence?},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  year      = {2019},
  pages     = {4791--4800},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P19-1472},
  url       = {https://aclanthology.org/P19-1472}
}

% Jeopardy! questions dataset hosted on Kaggle, cited as a web resource.
% The author is the hosting organisation (double-braced as a corporate name)
% rather than a copy of the citation key, and the scraped site suffix is
% dropped from the title.
@misc{kaggle200000Jeopardy,
  author       = {{Kaggle}},
  title        = {200,000+ {Jeopardy!} Questions},
  howpublished = {\url{https://www.kaggle.com/datasets/tunguz/200000-jeopardy-questions}},
  year         = {2019}
}

% LAMBADA: word-prediction dataset requiring broad discourse context.
@inproceedings{lambada,
  author    = {Paperno, Denis and Kruszewski, Germ{\'a}n and Lazaridou, Angeliki and
               Pham, Ngoc Quan and Bernardi, Raffaella and Pezzelle, Sandro and
               Baroni, Marco and Boleda, Gemma and Fern{\'a}ndez, Raquel},
  title     = {The {LAMBADA} dataset: Word prediction requiring a broad discourse context},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2016},
  pages     = {1525--1534},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P16-1144},
  url       = {https://aclanthology.org/P16-1144}
}

% OpenBookQA dataset paper. The "A" that opens the subtitle is braced because
% BibTeX's sentence casing keeps a capital only after a colon, not after "?".
@inproceedings{OpenBookQA2018,
  author    = {Mihaylov, Todor and Clark, Peter and Khot, Tushar and
               Sabharwal, Ashish},
  title     = {Can a Suit of Armor Conduct Electricity? {A} New Dataset for Open Book Question Answering},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  year      = {2018},
  pages     = {2381--2391},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/D18-1260},
  url       = {https://aclanthology.org/D18-1260}
}

% PIQA: physical commonsense reasoning benchmark paper.
% The colon sits outside the braces so the word after it keeps its capital
% under sentence-casing styles. The surname "Le Bras" is written in comma form
% to match its spelling in the siqa entry, and the date range uses an en dash.
@inproceedings{piqa,
  author    = {Bisk, Yonatan and Zellers, Rowan and Le Bras, Ronan and
               Gao, Jianfeng and Choi, Yejin},
  title     = {{PIQA}: Reasoning about Physical Commonsense in Natural Language},
  booktitle = {The Thirty-Fourth {AAAI} Conference on Artificial Intelligence, {AAAI}
               2020, The Thirty-Second Innovative Applications of Artificial Intelligence
               Conference, {IAAI} 2020, The Tenth {AAAI} Symposium on Educational
               Advances in Artificial Intelligence, {EAAI} 2020, New York, NY, USA,
               February 7--12, 2020},
  year      = {2020},
  pages     = {7432--7439},
  publisher = {{AAAI} Press},
  url       = {https://aaai.org/ojs/index.php/AAAI/article/view/6239},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/aaai/BiskZLGC20.bib},
  timestamp = {Thu, 04 Jun 2020 01:00:00 +0200}
}

% SQuAD: machine comprehension question dataset paper.
@inproceedings{squad,
  author    = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and
               Liang, Percy},
  title     = {{SQuAD}: 100,000+ Questions for Machine Comprehension of Text},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  year      = {2016},
  pages     = {2383--2392},
  address   = {Austin, Texas},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/D16-1264},
  url       = {https://aclanthology.org/D16-1264}
}

% Winograd Schema Challenge paper. The proper noun "Winograd" is capitalised
% and braced so no style can lowercase it; the title and booktitle are stored
% in title case so styles can downcase them as needed.
@inproceedings{winograd,
  author    = {Levesque, Hector and Davis, Ernest and Morgenstern, Leora},
  title     = {The {Winograd} Schema Challenge},
  booktitle = {International Conference on the Principles of Knowledge Representation and Reasoning},
  year      = {2012},
  note      = {\url{https://aaai.org/papers/59-4492-the-winograd-schema-challenge}}
}

% WinoGrande: adversarial Winograd schema challenge paper.
% "Le Bras" is written in comma form; as "Ronan Le Bras" BibTeX would take
% only "Bras" as the surname. The camel-case name and the proper noun in the
% title are braced against sentence casing, and the date range uses an en dash.
@inproceedings{sakaguchi2019winogrande,
  author    = {Sakaguchi, Keisuke and Le Bras, Ronan and Bhagavatula, Chandra and
               Choi, Yejin},
  title     = {{WinoGrande}: An Adversarial {Winograd} Schema Challenge at Scale},
  booktitle = {The Thirty-Fourth {AAAI} Conference on Artificial Intelligence, {AAAI}
               2020, The Thirty-Second Innovative Applications of Artificial Intelligence
               Conference, {IAAI} 2020, The Tenth {AAAI} Symposium on Educational
               Advances in Artificial Intelligence, {EAAI} 2020, New York, NY, USA,
               February 7--12, 2020},
  year      = {2020},
  pages     = {8732--8740},
  publisher = {{AAAI} Press},
  url       = {https://aaai.org/ojs/index.php/AAAI/article/view/6399},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/aaai/SakaguchiBBC20.bib},
  timestamp = {Tue, 02 Feb 2021 00:00:00 +0100}
}

% (Removed a second, identical definition of the key zhong2023agieval.
% A repeated key makes BibTeX report "Repeated entry"; the definition
% earlier in this file remains the one that is used.)

% Program induction by rationale generation for algebraic word problems.
@inproceedings{ling2017program,
  author    = {Ling, Wang and Yogatama, Dani and Dyer, Chris and Blunsom, Phil},
  title     = {Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2017},
  pages     = {158--167},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P17-1015},
  url       = {https://aclanthology.org/P17-1015}
}

% BBQ: bias benchmark for question answering.
@inproceedings{bbq,
  author    = {Parrish, Alicia and Chen, Angelica and Nangia, Nikita and
               Padmakumar, Vishakh and Phang, Jason and Thompson, Jana and
               Htut, Phu Mon and Bowman, Samuel},
  title     = {{BBQ}: A hand-built bias benchmark for question answering},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
  year      = {2022},
  pages     = {2086--2105},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2022.findings-acl.165},
  url       = {https://aclanthology.org/2022.findings-acl.165}
}

% (Removed a second, identical definition of the key srivastava2023beyond.
% A repeated key makes BibTeX report "Repeated entry"; the definition
% earlier in this file remains the one that is used.)

% Patronus AI announcement of the EnterprisePII dataset, cited as a web page.
% The title drops the scraped site-name prefix and domain suffix, uses an
% ASCII apostrophe, and braces whole words instead of single letters.
@misc{patronusPatronusPatronus,
  author       = {{Patronus AI}},
  title        = {{Patronus AI} Launches {EnterprisePII}, the Industry's First {LLM} Dataset for Detecting Business-Sensitive Information},
  howpublished = {\url{https://www.patronus.ai/announcements/patronus-ai-launches-enterprisepii-the-industrys-first-llm-dataset-for-detecting-business-sensitive-information}},
  year         = {2023}
}

% GPQA benchmark paper (arXiv preprint). The title is stored in title case,
% with the acronym, the proper noun and "Q&A" braced so sentence-casing styles
% keep their capitals.
@article{rein2023gpqa,
  author  = {Rein, David and Hou, Betty Li and Stickland, Asa Cooper and
             Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and
             Michael, Julian and Bowman, Samuel R.},
  title   = {{GPQA}: A Graduate-Level {Google}-Proof {Q\&A} Benchmark},
  journal = {ArXiv preprint},
  volume  = {abs/2311.12022},
  year    = {2023},
  url     = {https://arxiv.org/abs/2311.12022}
}

% Verifier-training paper for math word problems (arXiv preprint), cited under the key gsm8k.
@article{gsm8k,
  author  = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and
             Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and
             Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and
             Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  title   = {Training Verifiers to Solve Math Word Problems},
  journal = {ArXiv preprint},
  volume  = {abs/2110.14168},
  year    = {2021},
  url     = {https://arxiv.org/abs/2110.14168}
}

% LogiQA: machine reading comprehension dataset with logical reasoning.
% The camel-case dataset name is braced so sentence-casing styles do not
% render it as "Logiqa".
@inproceedings{Liu2020LogiQAAC,
  author    = {Liu, Jian and Cui, Leyang and Liu, Hanmeng and Huang, Dandan and
               Wang, Yile and Zhang, Yue},
  title     = {{LogiQA}: A Challenge Dataset for Machine Reading Comprehension with
               Logical Reasoning},
  booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on
               Artificial Intelligence, {IJCAI} 2020},
  editor    = {Christian Bessiere},
  year      = {2020},
  pages     = {3622--3628},
  publisher = {ijcai.org},
  doi       = {10.24963/ijcai.2020/501},
  url       = {https://doi.org/10.24963/ijcai.2020/501},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/ijcai/LiuCLHWZ20.bib},
  timestamp = {Mon, 19 Oct 2020 01:00:00 +0200}
}

% MathQA: math word problem solving with operation-based formalisms.
@inproceedings{mathqa,
  author    = {Amini, Aida and Gabriel, Saadia and Lin, Shanchuan and
               Koncel-Kedziorski, Rik and Choi, Yejin and Hajishirzi, Hannaneh},
  title     = {{MathQA}: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms},
  booktitle = {Proceedings of the 2019 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  pages     = {2357--2367},
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/N19-1245},
  url       = {https://aclanthology.org/N19-1245}
}

% Massive multitask language understanding benchmark paper, cited under the key mmlu.
@inproceedings{mmlu,
  author    = {Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and
               Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  title     = {Measuring Massive Multitask Language Understanding},
  booktitle = {9th International Conference on Learning Representations, {ICLR} 2021,
               Virtual Event, Austria, May 3-7, 2021},
  year      = {2021},
  publisher = {OpenReview.net},
  url       = {https://openreview.net/forum?id=d7KBjmI3GmQ},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  biburl    = {https://dblp.org/rec/conf/iclr/HendrycksBBZMSS21.bib},
  timestamp = {Wed, 23 Jun 2021 01:00:00 +0200}
}

% PubMedQA: biomedical research question answering dataset.
@inproceedings{pubmed,
  author    = {Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and
               Cohen, William and Lu, Xinghua},
  title     = {{PubMedQA}: A Dataset for Biomedical Research Question Answering},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019},
  pages     = {2567--2577},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/D19-1259},
  url       = {https://aclanthology.org/D19-1259}
}

% MosaicML evaluation-gauntlet document in the llm-foundry repository, cited
% as a web resource. The title is reduced to the repository and file name;
% the scraped GitHub page decoration (non-ASCII separator, domain suffix,
% per-letter braces) is removed.
@misc{mosailMLarithmetic,
  author       = {{MosaicML}},
  title        = {{llm-foundry}: {EVAL\_GAUNTLET.md}},
  howpublished = {\url{https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/local_data/EVAL_GAUNTLET.md}},
  year         = {2023}
}

% Social IQa: commonsense reasoning about social interactions.
@inproceedings{siqa,
  author    = {Sap, Maarten and Rashkin, Hannah and Chen, Derek and
               Le Bras, Ronan and Choi, Yejin},
  title     = {Social {IQa}: Commonsense Reasoning about Social Interactions},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019},
  pages     = {4463--4473},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/D19-1454},
  url       = {https://aclanthology.org/D19-1454}
}

% Study of whether NLP models can solve simple math word problems.
@inproceedings{patel2021nlp,
  author    = {Patel, Arkil and Bhattamishra, Satwik and Goyal, Navin},
  title     = {Are {NLP} Models really able to Solve Simple Math Word Problems?},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year      = {2021},
  pages     = {2080--2094},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.naacl-main.168},
  url       = {https://aclanthology.org/2021.naacl-main.168}
}

% TriviaQA: distantly supervised reading comprehension challenge dataset.
@inproceedings{triviaqa,
  author    = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel and
               Zettlemoyer, Luke},
  title     = {{TriviaQA}: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2017},
  pages     = {1601--1611},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P17-1147},
  url       = {https://aclanthology.org/P17-1147}
}

% Gender bias in coreference resolution paper, cited under the key winogender.
@inproceedings{winogender,
  author    = {Rudinger, Rachel and Naradowsky, Jason and Leonard, Brian and
               Van Durme, Benjamin},
  title     = {Gender Bias in Coreference Resolution},
  booktitle = {Proceedings of the 2018 Conference of the North {American} Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)},
  year      = {2018},
  pages     = {8--14},
  address   = {New Orleans, Louisiana},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/N18-2002},
  url       = {https://aclanthology.org/N18-2002}
}

% Paper on evaluating large language models trained on code (arXiv preprint).
% The fifth author's name, previously truncated to "Henrique Ponde", is given
% in full and in comma form so the multi-part name is parsed unambiguously.
@article{Chen2021EvaluatingLL,
  author  = {Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Pinto, Henrique Ponde de Oliveira and Jared Kaplan and Harrison Edwards and Yura Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and David W. Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William H. Guss and Alex Nichol and Igor Babuschkin and Suchir Balaji and Shantanu Jain and Andrew Carr and Jan Leike and Joshua Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew M. Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
  title   = {Evaluating Large Language Models Trained on Code},
  journal = {ArXiv preprint},
  volume  = {abs/2107.03374},
  year    = {2021},
  url     = {https://arxiv.org/abs/2107.03374}
}