% bibliography.bib
@online{adamsBayesianOnlineChangepoint2007,
title = {Bayesian {{Online Changepoint Detection}}},
author = {Adams, Ryan Prescott and MacKay, David J. C.},
date = {2007-10-19},
eprint = {0710.3742},
eprinttype = {arXiv},
eprintclass = {stat},
doi = {10.48550/arXiv.0710.3742},
url = {http://arxiv.org/abs/0710.3742},
urldate = {2023-03-17},
abstract = {Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current ``run,'' or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets.},
pubstate = {prepublished},
keywords = {/unread,Statistics - Machine Learning},
file = {/Users/andrew/Zotero/storage/B9844L4J/Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf;/Users/andrew/Zotero/storage/36XZ5FQB/0710.html}
}
@online{adamsSparseDenseGPT42023,
title = {From {{Sparse}} to {{Dense}}: {{GPT-4 Summarization}} with {{Chain}} of {{Density Prompting}}},
shorttitle = {From {{Sparse}} to {{Dense}}},
author = {Adams, Griffin and Fabbri, Alexander and Ladhak, Faisal and Lehman, Eric and Elhadad, Noémie},
date = {2023-09-08},
eprint = {2309.04269},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2309.04269},
urldate = {2023-09-17},
abstract = {Selecting the ``right'' amount of information to include in a summary is a difficult task. A good summary should be detailed and entity-centric without being overly dense and hard to follow. To better understand this tradeoff, we solicit increasingly dense GPT-4 summaries with what we refer to as a ``Chain of Density'' (CoD) prompt. Specifically, GPT-4 generates an initial entity-sparse summary before iteratively incorporating missing salient entities without increasing the length. Summaries generated by CoD are more abstractive, exhibit more fusion, and have less of a lead bias than GPT-4 summaries generated by a vanilla prompt. We conduct a human preference study on 100 CNN DailyMail articles and find that that humans prefer GPT-4 summaries that are more dense than those generated by a vanilla prompt and almost as dense as human written summaries. Qualitative analysis supports the notion that there exists a tradeoff between informativeness and readability. 500 annotated CoD summaries, as well as an extra 5,000 unannotated summaries, are freely available on HuggingFace (https://huggingface.co/datasets/griffin/chain\_of\_density).},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/K7HZ73IP/Adams et al_2023_From Sparse to Dense.pdf;/Users/andrew/Zotero/storage/H8GLKSD9/2309.html}
}
@article{adorniRubricbasedLearnerModelling2023,
title = {Rubric-Based {{Learner Modelling}} via {{Noisy Gates Bayesian Networks}} for {{Computational Thinking Skills Assessment}}},
author = {Adorni, Giorgia and Mangili, Francesca and Piatti, Alberto and Bonesana, Claudio and Antonucci, Alessandro},
date = {2023},
journaltitle = {Journal of Communications Software and Systems},
shortjournal = {JCOMSS},
volume = {19},
number = {1},
pages = {52--64},
issn = {1845-6421, 1846-6079},
doi = {10.24138/jcomss-2022-0169},
url = {https://jcoms.fesb.unist.hr/10.24138/jcomss-2022-0169/},
urldate = {2024-09-04},
abstract = {In modern and personalised education, there is a growing interest in developing learners’ competencies and accurately assessing them. In a previous work, we proposed a procedure for deriving a learner model for automatic skill assessment from a task-specific competence rubric, thus simplifying the implementation of automated assessment tools. The previous approach, however, suffered two main limitations: (i) the ordering between competencies defined by the assessment rubric was only indirectly modelled; (ii) supplementary skills, not under assessment but necessary for accomplishing the task, were not included in the model. In this work, we address issue (i) by introducing dummy observed nodes, strictly enforcing the skills ordering without changing the network’s structure. In contrast, for point (ii), we design a network with two layers of gates, one performing disjunctive operations by noisy-OR gates and the other conjunctive operations through logical ANDs. Such changes improve the model outcomes’ coherence and the modelling tool’s flexibility without compromising the model’s compact parametrisation, interpretability and simple experts’ elicitation. We used this approach to develop a learner model for Computational Thinking (CT) skills assessment. The CT-cube skills assessment framework and the Cross Array Task (CAT) are used to exemplify it and demonstrate its feasibility.},
langid = {english},
file = {/Users/andrew/Zotero/storage/MGF59B53/Adorni et al. - 2023 - Rubric-based Learner Modelling via Noisy Gates Bayesian Networks for Computational Thinking Skills A.pdf}
}
@online{agarwalTransformersReinforcementLearning2023,
title = {Transformers in {{Reinforcement Learning}}: {{A Survey}}},
shorttitle = {Transformers in {{Reinforcement Learning}}},
author = {Agarwal, Pranav and Rahman, Aamer Abdul and St-Charles, Pierre-Luc and Prince, Simon J. D. and Kahou, Samira Ebrahimi},
date = {2023-07-12},
eprint = {2307.05979},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2307.05979},
urldate = {2023-08-02},
abstract = {Transformers have significantly impacted domains like natural language processing, computer vision, and robotics, where they improve performance compared to other neural networks. This survey explores how transformers are used in reinforcement learning (RL), where they are seen as a promising solution for addressing challenges such as unstable training, credit assignment, lack of interpretability, and partial observability. We begin by providing a brief domain overview of RL, followed by a discussion on the challenges of classical RL algorithms. Next, we delve into the properties of the transformer and its variants and discuss the characteristics that make them well-suited to address the challenges inherent in RL. We examine the application of transformers to various aspects of RL, including representation learning, transition and reward function modeling, and policy optimization. We also discuss recent research that aims to enhance the interpretability and efficiency of transformers in RL, using visualization techniques and efficient training strategies. Often, the transformer architecture must be tailored to the specific needs of a given application. We present a broad overview of how transformers have been adapted for several applications, including robotics, medicine, language modeling, cloud computing, and combinatorial optimization. We conclude by discussing the limitations of using transformers in RL and assess their potential for catalyzing future breakthroughs in this field. CCS Concepts: • Computing methodologies → Reinforcement learning; Neural networks; • General and reference → Surveys and overviews.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/W85GDSUV/Agarwal et al. - 2023 - Transformers in Reinforcement Learning A Survey.pdf}
}
@online{aksitovReSTMeetsReAct2023,
title = {{{ReST}} Meets {{ReAct}}: {{Self-Improvement}} for {{Multi-Step Reasoning LLM Agent}}},
shorttitle = {{{ReST}} Meets {{ReAct}}},
author = {Aksitov, Renat and Miryoosefi, Sobhan and Li, Zonglin and Li, Daliang and Babayan, Sheila and Kopparapu, Kavya and Fisher, Zachary and Guo, Ruiqi and Prakash, Sushant and Srinivasan, Pranesh and Zaheer, Manzil and Yu, Felix and Kumar, Sanjiv},
date = {2023-12-15},
eprint = {2312.10003},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2312.10003},
url = {http://arxiv.org/abs/2312.10003},
urldate = {2024-05-16},
abstract = {Answering complex natural language questions often necessitates multi-step reasoning and integrating external information. Several systems have combined knowledge retrieval with a large language model (LLM) to answer such questions. These systems, however, suffer from various failure cases, and we cannot directly train them end-to-end to fix such failures, as interaction with external knowledge is non-differentiable. To address these deficiencies, we define a ReAct-style LLM agent with the ability to reason and act upon external knowledge. We further refine the agent through a ReST-like method that iteratively trains on previous trajectories, employing growing-batch reinforcement learning with AI feedback for continuous self-improvement and self-distillation. Starting from a prompted large model and after just two iterations of the algorithm, we can produce a fine-tuned small model that achieves comparable performance on challenging compositional question-answering benchmarks with two orders of magnitude fewer parameters.},
pubstate = {prepublished},
keywords = {Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/D2H22N54/Aksitov et al_2023_ReST meets ReAct.pdf;/Users/andrew/Zotero/storage/GAVYYD6V/2312.html}
}
@online{albarracinDesigningExplainableArtificial2023,
title = {Designing Explainable Artificial Intelligence with Active Inference: {{A}} Framework for Transparent Introspection and Decision-Making},
shorttitle = {Designing Explainable Artificial Intelligence with Active Inference},
author = {Albarracin, Mahault and Hipólito, Inês and Tremblay, Safae Essafi and Fox, Jason G. and René, Gabriel and Friston, Karl and Ramstead, Maxwell J. D.},
date = {2023-06-06},
eprint = {2306.04025},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2306.04025},
urldate = {2023-11-10},
abstract = {This paper investigates the prospect of developing human-interpretable, explainable artificial intelligence (AI) systems based on active inference and the free energy principle. We first provide a brief overview of active inference, and in particular, of how it applies to the modeling of decision-making, introspection, as well as the generation of overt and covert actions. We then discuss how active inference can be leveraged to design explainable AI systems, namely, by allowing us to model core features of ``introspective'' processes and by generating useful, human-interpretable models of the processes involved in decision-making. We propose an architecture for explainable AI systems using active inference. This architecture foregrounds the role of an explicit hierarchical generative model, the operation of which enables the AI system to track and explain the factors that contribute to its own decisions, and whose structure is designed to be interpretable and auditable by human users. We outline how this architecture can integrate diverse sources of information to make informed decisions in an auditable manner, mimicking or reproducing aspects of human-like consciousness and introspection. Finally, we discuss the implications of our findings for future research in AI, and the potential ethical considerations of developing AI systems with (the appearance of) introspective capabilities.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence},
file = {/Users/andrew/Zotero/storage/7RVVJ5Z5/Albarracin et al. - 2023 - Designing explainable artificial intelligence with.pdf;/Users/andrew/Zotero/storage/NGI5TUX7/2306.html}
}
@book{alevenIntelligentTutoringSystems2010,
title = {Intelligent {{Tutoring Systems}}: 10th {{International Conference}}, {{ITS}} 2010, {{Pittsburgh}}, {{PA}}, {{USA}}, {{June}} 14-18, 2010, {{Proceedings}}, {{Part II}}},
shorttitle = {Intelligent {{Tutoring Systems}}},
editor = {Aleven, Vincent and Kay, Judy and Mostow, Jack},
editora = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard},
editoratype = {redactor},
date = {2010},
series = {Lecture {{Notes}} in {{Computer Science}}},
volume = {6095},
publisher = {Springer},
location = {Berlin, Heidelberg},
doi = {10.1007/978-3-642-13437-1},
url = {http://link.springer.com/10.1007/978-3-642-13437-1},
urldate = {2023-11-29},
isbn = {978-3-642-13436-4 978-3-642-13437-1},
langid = {english},
keywords = {adaptive mobile learning,adaptive systems,affect recognition,affective computing,agents,assessment,Augmented Reality,authoring tools,cognition,computer assisted learning,data mining,design patterns,e-learning,learning,Scaffolding},
file = {/Users/andrew/Zotero/storage/QGLK6IBM/Aleven et al_2010_Intelligent Tutoring Systems.pdf}
}
@article{alshurafatFactorsAffectingAccounting2023,
title = {Factors Affecting Accounting Students’ Misuse of {{ChatGPT}}: An Application of the Fraud Triangle Theory},
shorttitle = {Factors Affecting Accounting Students’ Misuse of {{ChatGPT}}},
author = {Alshurafat, Hashem and Al Shbail, Mohannad Obeid and Hamdan, Allam and Al-Dmour, Ahmad and Ensour, Waed},
date = {2023-01-01},
journaltitle = {Journal of Financial Reporting and Accounting},
volume = {ahead-of-print},
issn = {1985-2517},
doi = {10.1108/JFRA-04-2023-0182},
url = {https://doi.org/10.1108/JFRA-04-2023-0182},
urldate = {2024-01-29},
abstract = {Purpose This study aims to explore the factors that contribute to student academic dishonesty through an examination of the misuse of AI language models. Using the fraud triangle theory, which posits that opportunity, rationalization and pressure are key factors for fraudulent behavior, this study investigates how these elements interact and contribute to academic dishonesty among students. Design/methodology/approach In this study, data on how accounting students used ChatGPT to cheat was acquired from 279 accounting students in Jordanian public universities over the course of two months, from January 2023 to March 2023, through previously tested and validated questionnaires. The main tool for gathering data was a questionnaire distributed online using Microsoft Forms. Findings The results show that all of the fraud triangle factors are significant determinants of student academic dishonesty and student misuse of ChatGPT. The findings of this research can be used to guide the development of technology-based preventative measures. Originality/value This study provides valuable insights into the motivations and factors that drive students to engage in academic dishonesty and sheds light on the broader issue of technology-assisted academic dishonesty and its impact on the educational system. This study’s contribution is significant, as it sheds light on a pressing issue in education and provides valuable information for educators and policymakers to address the problem and improve academic standards.},
issue = {ahead-of-print},
keywords = {Academic dishonesty,Academic integrity,AI language models,ChatGPT,Fraud triangle theory,Technology-assisted cheating},
file = {/Users/andrew/Zotero/storage/C54TP6F5/html.html}
}
@online{amatriainTransformerModelsIntroduction2023,
title = {Transformer Models: An Introduction and Catalog},
shorttitle = {Transformer Models},
author = {Amatriain, Xavier},
date = {2023-02-11},
eprint = {2302.07730},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2302.07730},
url = {http://arxiv.org/abs/2302.07730},
urldate = {2023-02-16},
abstract = {In the past few years we have seen the meteoric appearance of dozens of models of the Transformer family, all of which have funny, but not self-explanatory, names. The goal of this paper is to offer a somewhat comprehensive but simple catalog and classification of the most popular Transformer models. The paper also includes an introduction to the most important aspects and innovation in Transformer models.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/94ENBB72/Amatriain_2023_Transformer models.pdf;/Users/andrew/Zotero/storage/XCL2MGKM/2302.html}
}
@online{andreasLanguageModelsAgent2022,
title = {Language {{Models}} as {{Agent Models}}},
author = {Andreas, Jacob},
date = {2022-12-03},
eprint = {2212.01681},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2212.01681},
url = {http://arxiv.org/abs/2212.01681},
urldate = {2024-09-04},
abstract = {Language models (LMs) are trained on collections of documents, written by individual human agents to achieve specific goals in an outside world. During training, LMs have access only to text of these documents, with no direct evidence of the internal states of the agents that produced them -- a fact often used to argue that LMs are incapable of modeling goal-directed aspects of human language production and comprehension. Can LMs trained on text learn anything at all about the relationship between language and use? I argue that LMs are models of intentional communication in a specific, narrow sense. When performing next word prediction given a textual context, an LM can infer and represent properties of an agent likely to have produced that context. These representations can in turn influence subsequent LM generation in the same way that agents' communicative intentions influence their language. I survey findings from the recent literature showing that -- even in today's non-robust and error-prone models -- LMs infer and use representations of fine-grained communicative intentions and more abstract beliefs and goals. Despite the limited nature of their training data, they can thus serve as building blocks for systems that communicate and act intentionally.},
pubstate = {prepublished},
keywords = {Computer Science - Computation and Language,Computer Science - Multiagent Systems},
file = {/Users/andrew/Zotero/storage/F4CTEGC3/Andreas - 2022 - Language Models as Agent Models.pdf;/Users/andrew/Zotero/storage/9Q9UNPL7/2212.html}
}
@online{andukuriSTaRGATETeachingLanguage2024,
title = {{{STaR-GATE}}: {{Teaching Language Models}} to {{Ask Clarifying Questions}}},
shorttitle = {{{STaR-GATE}}},
author = {Andukuri, Chinmaya and Fränken, Jan-Philipp and Gerstenberg, Tobias and Goodman, Noah D.},
date = {2024-08-07},
eprint = {2403.19154},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2403.19154},
url = {http://arxiv.org/abs/2403.19154},
urldate = {2024-08-16},
abstract = {When prompting language models to complete a task, users often leave important aspects unsaid. While asking questions could resolve this ambiguity (GATE; Li et al., 2023), models often struggle to ask good questions. We explore a language model's ability to self-improve (STaR; Zelikman et al., 2022) by rewarding the model for generating useful questions-a simple method we dub STaR-GATE. We generate a synthetic dataset of 25,500 unique persona-task prompts to simulate conversations between a pretrained language model-the Questioner-and a Roleplayer whose preferences are unknown to the Questioner. By asking questions, the Questioner elicits preferences from the Roleplayer. The Questioner is iteratively finetuned on questions that increase the probability of high-quality responses to the task, which are generated by an Oracle with access to the Roleplayer's latent preferences. After two iterations of self-improvement, the Questioner asks better questions, allowing it to generate responses that are preferred over responses from the initial model on 72\% of tasks. Our results indicate that teaching a language model to ask better questions leads to better personalized responses.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/MYCXAU5S/Andukuri et al. - 2024 - STaR-GATE Teaching Language Models to Ask Clarifying Questions.pdf;/Users/andrew/Zotero/storage/874GD5A3/2403.html}
}
@online{angelopoulosGentleIntroductionConformal2022,
title = {A {{Gentle Introduction}} to {{Conformal Prediction}} and {{Distribution-Free Uncertainty Quantification}}},
author = {Angelopoulos, Anastasios N. and Bates, Stephen},
date = {2022-12-07},
eprint = {2107.07511},
eprinttype = {arXiv},
eprintclass = {cs, math, stat},
url = {http://arxiv.org/abs/2107.07511},
urldate = {2023-07-28},
abstract = {Black-box machine learning models are now routinely used in high-risk settings, like medical diagnostics, which demand uncertainty quantification to avoid consequential model failures. Conformal prediction (a.k.a. conformal inference) is a user-friendly paradigm for creating statistically rigorous uncertainty sets/intervals for the predictions of such models. Critically, the sets are valid in a distribution-free sense: they possess explicit, non-asymptotic guarantees even without distributional assumptions or model assumptions. One can use conformal prediction with any pre-trained model, such as a neural network, to produce sets that are guaranteed to contain the ground truth with a user-specified probability, such as 90\%. It is easy-to-understand, easy-to-use, and general, applying naturally to problems arising in the fields of computer vision, natural language processing, deep reinforcement learning, and so on.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Statistics Theory,Statistics - Machine Learning,Statistics - Methodology},
file = {/Users/andrew/Zotero/storage/N9EX8LDW/Angelopoulos and Bates - 2022 - A Gentle Introduction to Conformal Prediction and .pdf}
}
@online{arkoudasGPT4CanReason2023,
title = {{{GPT-4 Can}}'t {{Reason}}},
author = {Arkoudas, Konstantine},
date = {2023-08-10},
eprint = {2308.03762},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2308.03762},
url = {http://arxiv.org/abs/2308.03762},
urldate = {2023-09-25},
abstract = {GPT-4 was released in March 2023 to wide acclaim, marking a very substantial improvement across the board over GPT-3.5 (OpenAI's previously best model, which had powered the initial release of ChatGPT). However, despite the genuinely impressive improvement, there are good reasons to be highly skeptical of GPT-4's ability to reason. This position paper discusses the nature of reasoning; criticizes the current formulation of reasoning problems in the NLP community, as well as the way in which LLM reasoning performance is currently evaluated; introduces a small collection of 21 diverse reasoning problems; and performs a detailed qualitative evaluation of GPT-4's performance on those problems. Based on this analysis, the paper concludes that, despite its occasional flashes of analytical brilliance, GPT-4 at present is utterly incapable of reasoning.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/3TSKKKLN/Arkoudas_2023_GPT-4 Can't Reason.pdf;/Users/andrew/Zotero/storage/SLBD9E4Z/2308.html}
}
@article{asadiRippleConceptBasedInterpretation,
title = {Ripple: {{Concept-Based Interpretation}} for {{Raw Time Series Models}} in {{Education}}},
author = {Asadi, Mohammad and Swamy, Vinitra and Frej, Jibril and Vignoud, Julien and Marras, Mirko and Kaser, Tanja},
abstract = {Time series is the most prevalent form of input data for educational prediction tasks. The vast majority of research using time series data focuses on hand-crafted features, designed by experts for predictive performance and interpretability. However, extracting these features is labor-intensive for humans and computers. In this paper, we propose an approach that utilizes irregular multivariate time series modeling with graph neural networks to achieve comparable or better accuracy with raw time series clickstreams in comparison to handcrafted features. Furthermore, we extend concept activation vectors for interpretability in raw time series models. We analyze these advances in the education domain, addressing the task of early student performance prediction for downstream targeted interventions and instructional support. Our experimental analysis on 23 MOOCs with millions of combined interactions over six behavioral dimensions show that models designed with our approach can (i) beat state-of-the-art educational time series baselines with no feature extraction and (ii) provide interpretable insights for personalized interventions. Source code: https://github.com/epfl-ml4ed/ripple/.},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/E4TQA4KF/Asadi et al. - Ripple Concept-Based Interpretation for Raw Time .pdf}
}
@online{azarGeneralTheoreticalParadigm2023,
title = {A {{General Theoretical Paradigm}} to {{Understand Learning}} from {{Human Preferences}}},
author = {Azar, Mohammad Gheshlaghi and Rowland, Mark and Piot, Bilal and Guo, Daniel and Calandriello, Daniele and Valko, Michal and Munos, Rémi},
date = {2023-11-21},
eprint = {2310.12036},
eprinttype = {arXiv},
eprintclass = {cs, stat},
doi = {10.48550/arXiv.2310.12036},
url = {http://arxiv.org/abs/2310.12036},
urldate = {2023-11-23},
abstract = {The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learn directly a policy from collected data without the reward modelling stage. However, this method still heavily relies on the first approximation. In this paper we try to gain a deeper theoretical understanding of these practical algorithms. In particular we derive a new general objective called \$\textbackslash Psi\$PO for learning from human preferences that is expressed in terms of pairwise preferences and therefore bypasses both approximations. This new general objective allows us to perform an in-depth analysis of the behavior of RLHF and DPO (as special cases of \$\textbackslash Psi\$PO) and to identify their potential pitfalls. We then consider another special case for \$\textbackslash Psi\$PO by setting \$\textbackslash Psi\$ simply to Identity, for which we can derive an efficient optimisation procedure, prove performance guarantees and demonstrate its empirical superiority to DPO on some illustrative examples.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/andrew/Zotero/storage/RNUU5HAU/Azar et al_2023_A General Theoretical Paradigm to Understand Learning from Human Preferences.pdf;/Users/andrew/Zotero/storage/T8USN539/2310.html}
}
@article{badrinathPyBKTAccessiblePython,
title = {{{pyBKT}}: {{An Accessible Python Library}} of {{Bayesian Knowledge Tracing Models}}},
author = {Badrinath, Anirudhan and Wang, Frederic and Pardos, Zachary},
abstract = {Bayesian Knowledge Tracing, a model used for cognitive mastery estimation, has been a hallmark of adaptive learning research and an integral component of deployed intelligent tutoring systems (ITS). In this paper, we provide a brief history of knowledge tracing model research and introduce pyBKT, an accessible and computationally efficient library of model extensions from the literature. The library provides data generation, fitting, prediction, and cross-validation routines, as well as a simple to use data helper interface to ingest typical tutor log dataset formats. We evaluate the runtime with various dataset sizes and compare to past implementations. Additionally, we conduct sanity checks of the model using experiments with simulated data to evaluate the accuracy of its EM parameter learning and use real-world data to validate its predictions, comparing pyBKT’s supported model variants with results from the papers in which they were originally introduced. The library is open source and open license for the purpose of making knowledge tracing more accessible to communities of research and practice and to facilitate progress in the field through easier replication of past approaches.},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/EX6WHJBD/Badrinath et al. - pyBKT An Accessible Python Library of Bayesian Kn.pdf}
}
@online{bastaniGenerativeAICan2024,
type = {SSRN Scholarly Paper},
title = {Generative {{AI Can Harm Learning}}},
author = {Bastani, Hamsa and Bastani, Osbert and Sungu, Alp and Ge, Haosen and Kabakcı, Özge and Mariman, Rei},
date = {2024-07-15},
number = {4895486},
location = {Rochester, NY},
doi = {10.2139/ssrn.4895486},
url = {https://papers.ssrn.com/abstract=4895486},
urldate = {2024-07-26},
abstract = {Generative artificial intelligence (AI) is poised to revolutionize how humans work, and has already demonstrated promise in significantly improving human productivity. However, a key remaining question is how generative AI affects learning, namely, how humans acquire new skills as they perform tasks. This kind of skill learning is critical to long-term productivity gains, especially in domains where generative AI is fallible and human experts must check its outputs. We study the impact of generative AI, specifically OpenAI's GPT-4, on human learning in the context of math classes at a high school. In a field experiment involving nearly a thousand students, we have deployed and evaluated two GPT based tutors, one that mimics a standard ChatGPT interface (called GPT Base) and one with prompts designed to safeguard learning (called GPT Tutor). These tutors comprise about 15\% of the curriculum in each of three grades. Consistent with prior work, our results show that access to GPT-4 significantly improves performance (48\% improvement for GPT Base and 127\% for GPT Tutor). However, we additionally find that when access is subsequently taken away, students actually perform worse than those who never had access (17\% reduction for GPT Base). That is, access to GPT-4 can harm educational outcomes. These negative learning effects are largely mitigated by the safeguards included in GPT Tutor. Our results suggest that students attempt to use GPT-4 as a "crutch" during practice problem sessions, and when successful, perform worse on their own. Thus, to maintain long-term productivity, we must be cautious when deploying generative AI to ensure humans continue to learn critical skills. * HB, OB, and AS contributed equally},
langid = {english},
pubstate = {prepublished},
keywords = {Education,Generative AI,Human Capital Development,Human-AI Collaboration,Large Language Models},
file = {/Users/andrew/Zotero/storage/KUWTQ4RR/Bastani et al. - 2024 - Generative AI Can Harm Learning.pdf}
}
@article{battagliaSimulationEnginePhysical2013,
title = {Simulation as an Engine of Physical Scene Understanding},
author = {Battaglia, Peter W. and Hamrick, Jessica B. and Tenenbaum, Joshua B.},
date = {2013-11-05},
journaltitle = {Proceedings of the National Academy of Sciences},
shortjournal = {PNAS},
volume = {110},
number = {45},
eprint = {24145417},
eprinttype = {pmid},
pages = {18327--18332},
issn = {0027-8424, 1091-6490},
doi = {10.1073/pnas.1306572110},
url = {https://www.pnas.org/content/110/45/18327},
urldate = {2019-11-23},
abstract = {In a glance, we can perceive whether a stack of dishes will topple, a branch will support a child’s weight, a grocery bag is poorly packed and liable to tear or crush its contents, or a tool is firmly attached to a table or free to be lifted. Such rapid physical inferences are central to how people interact with the world and with each other, yet their computational underpinnings are poorly understood. We propose a model based on an “intuitive physics engine,” a cognitive mechanism similar to computer engines that simulate rich physics in video games and graphics, but that uses approximate, probabilistic simulations to make robust and fast inferences in complex natural scenes where crucial information is unobserved. This single model fits data from five distinct psychophysical tasks, captures several illusions and biases, and explains core aspects of human mental models and common-sense reasoning that are instrumental to how humans understand their everyday world.},
langid = {english},
file = {/Users/andrew/Zotero/storage/GMGKJPQF/Battaglia et al. - 2013 - Simulation as an engine of physical scene understa.pdf;/Users/andrew/Zotero/storage/HRW9JRML/Battaglia et al. - 2013 - Simulation as an engine of physical scene understa.pdf;/Users/andrew/Zotero/storage/8M4WCSDG/18327.html}
}
@online{battleUnreasonableEffectivenessEccentric2024,
title = {The {{Unreasonable Effectiveness}} of {{Eccentric Automatic Prompts}}},
author = {Battle, Rick and Gollapudi, Teja},
date = {2024-02-20},
eprint = {2402.10949},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2402.10949},
url = {http://arxiv.org/abs/2402.10949},
urldate = {2024-04-25},
abstract = {Large Language Models (LLMs) have demonstrated remarkable problem-solving and basic mathematics abilities. However, their efficacy is highly contingent on the formulation of the prompt. This study endeavors to quantify the influence of incorporating "positive thinking" into the system message of the prompt, then compare that to systematic prompt optimization. We assess the performance of 60 combinations of system message snippets, tested with and without Chain of Thought prompting, across three models with parameters ranging from 7 to 70 billion on the GSM8K dataset. Our findings reveal that results do not universally generalize across models. In most instances, the inclusion of "positive thinking" prompts positively affected model performance. Notably, however, Llama2-70B exhibited an exception when not utilizing Chain of Thought, as the optimal system message was found to be none at all. Given the combinatorial complexity, and thus computation time, of experimenting with hand-tuning prompts for large black-box models, we then compared the performance of the best "positive thinking" prompt against the output of systematic prompt optimization. We show that employing an automated prompt optimizer emerges as the most effective method for enhancing performance, even when working with smaller open-source models. Additionally, our findings reveal that the highest-scoring, automatically-optimized prompt exhibits a degree of peculiarity far beyond expectations.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/DK5CF28L/Battle_Gollapudi_2024_The Unreasonable Effectiveness of Eccentric Automatic Prompts.pdf;/Users/andrew/Zotero/storage/TKJANXL2/2402.html}
}
@online{battleUnreasonableEffectivenessEccentric2024b,
title = {The {{Unreasonable Effectiveness}} of {{Eccentric Automatic Prompts}}},
author = {Battle, Rick and Gollapudi, Teja},
date = {2024-02-20},
eprint = {2402.10949},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2402.10949},
url = {http://arxiv.org/abs/2402.10949},
urldate = {2024-09-05},
abstract = {Large Language Models (LLMs) have demonstrated remarkable problem-solving and basic mathematics abilities. However, their efficacy is highly contingent on the formulation of the prompt. This study endeavors to quantify the influence of incorporating "positive thinking" into the system message of the prompt, then compare that to systematic prompt optimization. We assess the performance of 60 combinations of system message snippets, tested with and without Chain of Thought prompting, across three models with parameters ranging from 7 to 70 billion on the GSM8K dataset. Our findings reveal that results do not universally generalize across models. In most instances, the inclusion of "positive thinking" prompts positively affected model performance. Notably, however, Llama2-70B exhibited an exception when not utilizing Chain of Thought, as the optimal system message was found to be none at all. Given the combinatorial complexity, and thus computation time, of experimenting with hand-tuning prompts for large black-box models, we then compared the performance of the best "positive thinking" prompt against the output of systematic prompt optimization. We show that employing an automated prompt optimizer emerges as the most effective method for enhancing performance, even when working with smaller open-source models. Additionally, our findings reveal that the highest-scoring, automatically-optimized prompt exhibits a degree of peculiarity far beyond expectations.},
pubstate = {prepublished},
version = {2},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/NPCN4SRY/Battle and Gollapudi - 2024 - The Unreasonable Effectiveness of Eccentric Automatic Prompts.pdf}
}
@online{bellerCounterfactualSimulationModel2023,
title = {A Counterfactual Simulation Model of Causal Language},
author = {Beller, Ari and Gerstenberg, Tobias},
date = {2023-07-04T20:30:01},
doi = {10.31234/osf.io/xv8hf},
url = {https://psyarxiv.com/xv8hf/},
urldate = {2023-07-05},
abstract = {The words we use to describe what happened shape the story a listener imagines. How do speakers choose what causal expression to use? How does that impact what listeners infer about what happened? In this paper, we develop a computational model of how people use the causal expressions "caused", "enabled", "affected", and "made no difference". The model first builds a causal representation of what happened. By running counterfactual simulations, the model computes causal aspects that capture the different ways in which a candidate cause made a difference to the outcome. Logical combinations of these aspects define a semantics for the different causal expressions. The model then uses pragmatic inference favoring informative utterances to decide what word to use in context. We test our model in a series of experiments. In a set of psycholinguistic studies, we verify semantic and pragmatic assumptions of our model. We show that the causal expressions exist on a hierarchy of informativeness, and that participants draw informative pragmatic inferences in line with this scale. In the next two studies, we demonstrate that our model quantitatively fits participant behavior in a speaker task and a listener task involving dynamic physical scenarios. We compare our model to two lesioned alternatives, one which removes the pragmatic inference component, and another which additionally removes the semantics of the causal expressions. Our full model better accounts for participants' behavior than both alternatives, suggesting that causal knowledge, semantics, and pragmatics are all important for understanding how people produce and comprehend causal language.},
langid = {american},
pubstate = {prepublished},
keywords = {/unread,causality,Cognitive Psychology,Concepts and Categories,counterfactuals,intuitive physics,Judgment and Decision Making,Language,mental simulation,pragmatics,Reasoning,semantics,Social and Behavioral Sciences},
file = {/Users/andrew/Zotero/storage/ZYMRB99E/Beller_Gerstenberg_2023_A counterfactual simulation model of causal language.pdf}
}
@online{belyiLunaEvaluationFoundation2024,
title = {Luna: {{An Evaluation Foundation Model}} to {{Catch Language Model Hallucinations}} with {{High Accuracy}} and {{Low Cost}}},
shorttitle = {Luna},
author = {Belyi, Masha and Friel, Robert and Shao, Shuai and Sanyal, Atindriyo},
date = {2024-06-05},
eprint = {2406.00975},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2406.00975},
url = {http://arxiv.org/abs/2406.00975},
urldate = {2024-06-13},
abstract = {Retriever Augmented Generation (RAG) systems have become pivotal in enhancing the capabilities of language models by incorporating external knowledge retrieval mechanisms. However, a significant challenge in deploying these systems in industry applications is the detection and mitigation of hallucinations: instances where the model generates information that is not grounded in the retrieved context. Addressing this issue is crucial for ensuring the reliability and accuracy of responses generated by large language models (LLMs) in diverse industry settings. Current hallucination detection techniques fail to deliver accuracy, low latency, and low cost simultaneously. We introduce Luna: a DeBERTA-large (440M) encoder, finetuned for hallucination detection in RAG settings. We demonstrate that Luna outperforms GPT-3.5 and commercial evaluation frameworks on the hallucination detection task, with 97\% and 91\% reduction in cost and latency, respectively. Luna is lightweight and generalizes across multiple industry verticals and out-of-domain data, making it an ideal candidate for industry LLM applications.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/TTHKFSV6/Belyi et al_2024_Luna.pdf;/Users/andrew/Zotero/storage/X9BTI9U3/2406.html}
}
@online{bestaGraphThoughtsSolving2023,
title = {Graph of {{Thoughts}}: {{Solving Elaborate Problems}} with {{Large Language Models}}},
shorttitle = {Graph of {{Thoughts}}},
author = {Besta, Maciej and Blach, Nils and Kubicek, Ales and Gerstenberger, Robert and Gianinazzi, Lukas and Gajda, Joanna and Lehmann, Tomasz and Podstawski, Michal and Niewiadomski, Hubert and Nyczyk, Piotr and Hoefler, Torsten},
date = {2023-08-21},
eprint = {2308.09687},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2308.09687},
url = {http://arxiv.org/abs/2308.09687},
urldate = {2023-08-29},
abstract = {We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary advantage of GoT is the ability to model the information generated by an LLM as an arbitrary graph, where units of information ("LLM thoughts") are vertices, and edges correspond to dependencies between these vertices. This approach enables combining arbitrary LLM thoughts into synergistic outcomes, distilling the essence of whole networks of thoughts, or enhancing thoughts using feedback loops. We illustrate that GoT offers advantages over state of the art on different tasks, for example increasing the quality of sorting by 62\% over ToT, while simultaneously reducing costs by {$>$}31\%. We ensure that GoT is extensible with new thought transformations and thus can be used to spearhead new prompting schemes. This work brings the LLM reasoning closer to human thinking or brain mechanisms such as recurrence, both of which form complex networks.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/8Y9XLWYG/Besta et al_2023_Graph of Thoughts.pdf;/Users/andrew/Zotero/storage/LLTA6BCG/2308.html}
}
@article{bin-nashwanUseChatGPTAcademia2023,
title = {Use of {{ChatGPT}} in Academia: {{Academic}} Integrity Hangs in the Balance},
shorttitle = {Use of {{ChatGPT}} in Academia},
author = {Bin-Nashwan, Saeed Awadh and Sadallah, Mouad and Bouteraa, Mohamed},
date = {2023-11-01},
journaltitle = {Technology in Society},
shortjournal = {Technology in Society},
volume = {75},
pages = {102370},
issn = {0160-791X},
doi = {10.1016/j.techsoc.2023.102370},
url = {https://www.sciencedirect.com/science/article/pii/S0160791X23001756},
urldate = {2024-04-15},
abstract = {In today's academic world, some academicians, researchers and students have begun employing Artificial Intelligence (AI) language models, e.g., ChatGPT, in completing a variety of academic tasks, including generating ideas, summarising literature, and essay writing. However, the use of ChatGPT in academic settings is a controversial issue, leading to a severe concern about academic integrity and AI-assisted cheating, while scholarly communities still lack clear principles on using such innovation in academia. Accordingly, this study aims to understand the motivations driving academics and researchers to use ChatGPT in their work, and specifically the role of academic integrity in making up adoption behavior. Based on 702 responses retrieved from users of ResearchGate and Academia.edu, we found that ChatGPT usage is positively shaped by time-saving feature, e-word of mouth, academic self-efficacy, academic self-esteem, and perceived stress. In contrast, peer influence and academic integrity had a negative effect on usage. Intriguingly, academic integrity-moderated interactions of time-saving, self-esteem and perceived stress on ChatGPT usage are found to be significantly positive. Therefore, we suggest that stakeholders, including academic institutions, publishers and AI language models' programmers, should work together to specify necessary guidelines for the ethical use of AI chatbots in academic work and research.},
keywords = {Academia,Academic integrity,Artificial intelligence,ChatGPT,Plagiarism,Technology adoption},
file = {/Users/andrew/Zotero/storage/C6DGJBEB/S0160791X23001756.html}
}
@article{binzUsingCognitivePsychology2023,
title = {Using Cognitive Psychology to Understand {{GPT-3}}},
author = {Binz, Marcel and Schulz, Eric},
date = {2023-02-07},
journaltitle = {Proceedings of the National Academy of Sciences},
volume = {120},
number = {6},
pages = {e2218523120},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.2218523120},
url = {https://www.pnas.org/doi/10.1073/pnas.2218523120},
urldate = {2023-02-11},
abstract = {We study GPT-3, a recent large language model, using tools from cognitive psychology. More specifically, we assess GPT-3’s decision-making, information search, deliberation, and causal reasoning abilities on a battery of canonical experiments from the literature. We find that much of GPT-3’s behavior is impressive: It solves vignette-based tasks similarly or better than human subjects, is able to make decent decisions from descriptions, outperforms humans in a multiarmed bandit task, and shows signatures of model-based reinforcement learning. Yet, we also find that small perturbations to vignette-based tasks can lead GPT-3 vastly astray, that it shows no signatures of directed exploration, and that it fails miserably in a causal reasoning task. Taken together, these results enrich our understanding of current large language models and pave the way for future investigations using tools from cognitive psychology to study increasingly capable and opaque artificial agents.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/2DE7KRJ8/Binz_Schulz_2023_Using cognitive psychology to understand GPT-3.pdf}
}
@article{binzUsingCognitivePsychology2023a,
title = {Using Cognitive Psychology to Understand {{GPT-3}}},
author = {Binz, Marcel and Schulz, Eric},
date = {2023-02-07},
journaltitle = {Proceedings of the National Academy of Sciences},
volume = {120},
number = {6},
pages = {e2218523120},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.2218523120},
url = {https://www.pnas.org/doi/10.1073/pnas.2218523120},
urldate = {2023-03-25},
abstract = {We study GPT-3, a recent large language model, using tools from cognitive psychology. More specifically, we assess GPT-3’s decision-making, information search, deliberation, and causal reasoning abilities on a battery of canonical experiments from the literature. We find that much of GPT-3’s behavior is impressive: It solves vignette-based tasks similarly or better than human subjects, is able to make decent decisions from descriptions, outperforms humans in a multiarmed bandit task, and shows signatures of model-based reinforcement learning. Yet, we also find that small perturbations to vignette-based tasks can lead GPT-3 vastly astray, that it shows no signatures of directed exploration, and that it fails miserably in a causal reasoning task. Taken together, these results enrich our understanding of current large language models and pave the way for future investigations using tools from cognitive psychology to study increasingly capable and opaque artificial agents.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/J823HK9W/Binz_Schulz_2023_Using cognitive psychology to understand GPT-3.pdf}
}
@article{birhaneLargeModelsWhat2024,
title = {Large Models of What? {{Mistaking}} Engineering Achievements for Human Linguistic Agency},
shorttitle = {Large Models of What?},
author = {Birhane, Abeba and McGann, Marek},
date = {2024-11-01},
journaltitle = {Language Sciences},
shortjournal = {Language Sciences},
volume = {106},
pages = {101672},
issn = {0388-0001},
doi = {10.1016/j.langsci.2024.101672},
url = {https://www.sciencedirect.com/science/article/pii/S0388000124000615},
urldate = {2024-08-31},
abstract = {In this paper we argue that key, often sensational and misleading, claims regarding linguistic capabilities of Large Language Models (LLMs) are based on at least two unfounded assumptions: the assumption of language completeness and the assumption of data completeness. Language completeness assumes that a distinct and complete thing such as “a natural language” exists, the essential characteristics of which can be effectively and comprehensively modelled by an LLM. The assumption of data completeness relies on the belief that a language can be quantified and wholly captured by data. Work within the enactive approach to cognitive science makes clear that, rather than a distinct and complete thing, language is a means or way of acting. Languaging is not the kind of thing that can admit of a complete or comprehensive modelling. From an enactive perspective we identify three key characteristics of enacted language; embodiment, participation, and precariousness, that are absent in LLMs, and likely incompatible in principle with current architectures. We argue that these absences imply that LLMs are not now and cannot in their present form be linguistic agents the way humans are. We illustrate the point in particular through the phenomenon of “algospeak”, a recently described pattern of high-stakes human language activity in heavily controlled online environments. On the basis of these points, we conclude that sensational and misleading claims about LLM agency and capabilities emerge from a deep misconception of both what human language is and what LLMs are.},
keywords = {Agency,Algospeak,Embodiment,Enaction,Language,Large language models,Precariousness,Precarity},
file = {/Users/andrew/Zotero/storage/TQEGURUB/S0388000124000615.html}
}
@online{bommasaniOpportunitiesRisksFoundation2022,
title = {On the {{Opportunities}} and {{Risks}} of {{Foundation Models}}},
author = {Bommasani, Rishi and Hudson, Drew A. and Adeli, Ehsan and Altman, Russ and Arora, Simran and family=Arx, given=Sydney, prefix=von, useprefix=true and Bernstein, Michael S. and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and Brynjolfsson, Erik and Buch, Shyamal and Card, Dallas and Castellon, Rodrigo and Chatterji, Niladri and Chen, Annie and Creel, Kathleen and Davis, Jared Quincy and Demszky, Dora and Donahue, Chris and Doumbouya, Moussa and Durmus, Esin and Ermon, Stefano and Etchemendy, John and Ethayarajh, Kawin and Fei-Fei, Li and Finn, Chelsea and Gale, Trevor and Gillespie, Lauren and Goel, Karan and Goodman, Noah and Grossman, Shelby and Guha, Neel and Hashimoto, Tatsunori and Henderson, Peter and Hewitt, John and Ho, Daniel E. and Hong, Jenny and Hsu, Kyle and Huang, Jing and Icard, Thomas and Jain, Saahil and Jurafsky, Dan and Kalluri, Pratyusha and Karamcheti, Siddharth and Keeling, Geoff and Khani, Fereshte and Khattab, Omar and Koh, Pang Wei and Krass, Mark and Krishna, Ranjay and Kuditipudi, Rohith and Kumar, Ananya and Ladhak, Faisal and Lee, Mina and Lee, Tony and Leskovec, Jure and Levent, Isabelle and Li, Xiang Lisa and Li, Xuechen and Ma, Tengyu and Malik, Ali and Manning, Christopher D. and Mirchandani, Suvir and Mitchell, Eric and Munyikwa, Zanele and Nair, Suraj and Narayan, Avanika and Narayanan, Deepak and Newman, Ben and Nie, Allen and Niebles, Juan Carlos and Nilforoshan, Hamed and Nyarko, Julian and Ogut, Giray and Orr, Laurel and Papadimitriou, Isabel and Park, Joon Sung and Piech, Chris and Portelance, Eva and Potts, Christopher and Raghunathan, Aditi and Reich, Rob and Ren, Hongyu and Rong, Frieda and Roohani, Yusuf and Ruiz, Camilo and Ryan, Jack and Ré, Christopher and Sadigh, Dorsa and Sagawa, Shiori and Santhanam, Keshav and Shih, Andy and Srinivasan, Krishnan and Tamkin, Alex and Taori, Rohan and Thomas, Armin W. and Tramèr, Florian and Wang, Rose E. and Wang, William and Wu, Bohan and Wu, Jiajun and Wu, Yuhuai and Xie, Sang Michael and Yasunaga, Michihiro and You, Jiaxuan and Zaharia, Matei and Zhang, Michael and Zhang, Tianyi and Zhang, Xikun and Zhang, Yuhui and Zheng, Lucia and Zhou, Kaitlyn and Liang, Percy},
date = {2022-07-12},
eprint = {2108.07258},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2108.07258},
url = {http://arxiv.org/abs/2108.07258},
urldate = {2023-11-01},
abstract = {AI is undergoing a paradigm shift with the rise of models (e.g., BERT, DALL-E, GPT-3) that are trained on broad data at scale and are adaptable to a wide range of downstream tasks. We call these models foundation models to underscore their critically central yet incomplete character. This report provides a thorough account of the opportunities and risks of foundation models, ranging from their capabilities (e.g., language, vision, robotics, reasoning, human interaction) and technical principles (e.g., model architectures, training procedures, data, systems, security, evaluation, theory) to their applications (e.g., law, healthcare, education) and societal impact (e.g., inequity, misuse, economic and environmental impact, legal and ethical considerations). Though foundation models are based on standard deep learning and transfer learning, their scale results in new emergent capabilities, and their effectiveness across so many tasks incentivizes homogenization. Homogenization provides powerful leverage but demands caution, as the defects of the foundation model are inherited by all the adapted models downstream. Despite the impending widespread deployment of foundation models, we currently lack a clear understanding of how they work, when they fail, and what they are even capable of due to their emergent properties. To tackle these questions, we believe much of the critical research on foundation models will require deep interdisciplinary collaboration commensurate with their fundamentally sociotechnical nature.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/ER6APDWA/Bommasani et al_2022_On the Opportunities and Risks of Foundation Models.pdf;/Users/andrew/Zotero/storage/ZF3UG93I/2108.html}
}
@online{borazjanizadehReliableReasoningNatural2024,
title = {Reliable {{Reasoning Beyond Natural Language}}},
author = {Borazjanizadeh, Nasim and Piantadosi, Steven T.},
date = {2024-07-16},
eprint = {2407.11373},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2407.11373},
url = {http://arxiv.org/abs/2407.11373},
urldate = {2024-07-19},
abstract = {Despite their linguistic competence, Large Language models (LLMs) often exhibit limitations in their ability to reason reliably and flexibly. To address this, we propose a neurosymbolic approach that prompts LLMs to extract and encode all relevant information from a problem statement as logical code statements, and then use a logic programming language (Prolog) to conduct the iterative computations of explicit deductive reasoning. Our approach significantly enhances the performance of LLMs on the standard mathematical reasoning benchmark, GSM8k, and the Navigate dataset from the BIG-bench dataset. Additionally, we introduce a novel dataset, the Non-Linear Reasoning (NLR) dataset, consisting of 55 unique word problems that target the shortcomings of the next token prediction paradigm of LLMs and require complex non-linear reasoning but only basic arithmetic skills to solve. Our findings demonstrate that the integration of Prolog enables LLMs to achieve high performance on the NLR dataset, which even the most advanced language models (including GPT4) fail to solve using text only.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/86UZBDDJ/Borazjanizadeh and Piantadosi - 2024 - Reliable Reasoning Beyond Natural Language.pdf;/Users/andrew/Zotero/storage/JP3CL4CU/2407.html}
}
@online{borazjanizadehReliableReasoningNatural2024a,
title = {Reliable {{Reasoning Beyond Natural Language}}},
author = {Borazjanizadeh, Nasim and Piantadosi, Steven T.},
date = {2024-07-19},
eprint = {2407.11373},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2407.11373},
url = {http://arxiv.org/abs/2407.11373},
urldate = {2024-08-14},
abstract = {Despite their linguistic competence, Large Language models (LLMs) often exhibit limitations in their ability to reason reliably and flexibly. To address this, we propose a neurosymbolic approach that prompts LLMs to extract and encode all relevant information from a problem statement as logical code statements, and then use a logic programming language (Prolog) to conduct the iterative computations of explicit deductive reasoning. Our approach significantly enhances the performance of LLMs on the standard mathematical reasoning benchmark, GSM8k, and the Navigate dataset from the BIG-bench dataset. Additionally, we introduce a novel dataset, the Non-Linear Reasoning (NLR) dataset, consisting of 55 unique word problems that target the shortcomings of the next token prediction paradigm of LLMs and require complex non-linear reasoning but only basic arithmetic skills to solve. Our findings demonstrate that the integration of Prolog enables LLMs to achieve high performance on the NLR dataset, which even the most advanced language models (including GPT4) fail to solve using text only.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/5C36RKU7/Borazjanizadeh and Piantadosi - 2024 - Reliable Reasoning Beyond Natural Language.pdf;/Users/andrew/Zotero/storage/Z8RHXBYA/2407.html}
}
@online{bosselutCOMETCommonsenseTransformers2019,
title = {{{COMET}}: {{Commonsense Transformers}} for {{Automatic Knowledge Graph Construction}}},
shorttitle = {{{COMET}}},
author = {Bosselut, Antoine and Rashkin, Hannah and Sap, Maarten and Malaviya, Chaitanya and Celikyilmaz, Asli and Choi, Yejin},
date = {2019-06-14},
eprint = {1906.05317},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1906.05317},
url = {http://arxiv.org/abs/1906.05317},
urldate = {2024-08-26},
abstract = {We present the first comprehensive study on automatic knowledge base construction for two prevalent commonsense knowledge graphs: ATOMIC (Sap et al., 2019) and ConceptNet (Speer et al., 2017). Contrary to many conventional KBs that store knowledge with canonical templates, commonsense KBs only store loosely structured open-text descriptions of knowledge. We posit that an important step toward automatic commonsense completion is the development of generative models of commonsense knowledge, and propose COMmonsEnse Transformers (COMET) that learn to generate rich and diverse commonsense descriptions in natural language. Despite the challenges of commonsense modeling, our investigation reveals promising results when implicit knowledge from deep pre-trained language models is transferred to generate explicit knowledge in commonsense knowledge graphs. Empirical results demonstrate that COMET is able to generate novel knowledge that humans rate as high quality, with up to 77.5\% (ATOMIC) and 91.7\% (ConceptNet) precision at top 1, which approaches human performance for these resources. Our findings suggest that using generative commonsense models for automatic commonsense KB completion could soon be a plausible alternative to extractive methods.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/HZMSC8ZP/Bosselut et al. - 2019 - COMET Commonsense Transformers for Automatic Knowledge Graph Construction.pdf;/Users/andrew/Zotero/storage/49G7F25F/1906.html}
}
@book{bowenTeachingAI2024,
title = {Teaching with {{AI}}},
author = {Bowen, José Antonio and Watson, C. Edward},
date = {2024},
publisher = {Johns Hopkins University Press},
doi = {10.56021/9781421449227},
url = {https://www.press.jhu.edu/books/title/53869/teaching-ai},
urldate = {2024-06-26},
isbn = {978-1-4214-4922-7 978-1-4214-4923-4},
langid = {english}
}
@online{bowmanEightThingsKnow2023,
title = {Eight {{Things}} to {{Know}} about {{Large Language Models}}},
author = {Bowman, Samuel R.},
date = {2023-04-02},
eprint = {2304.00612},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2304.00612},
url = {http://arxiv.org/abs/2304.00612},
urldate = {2023-11-20},
abstract = {The widespread public deployment of large language models (LLMs) in recent months has prompted a wave of new attention and engagement from advocates, policymakers, and scholars from many fields. This attention is a timely response to the many urgent questions that this technology raises, but it can sometimes miss important considerations. This paper surveys the evidence for eight potentially surprising such points: 1. LLMs predictably get more capable with increasing investment, even without targeted innovation. 2. Many important LLM behaviors emerge unpredictably as a byproduct of increasing investment. 3. LLMs often appear to learn and use representations of the outside world. 4. There are no reliable techniques for steering the behavior of LLMs. 5. Experts are not yet able to interpret the inner workings of LLMs. 6. Human performance on a task isn't an upper bound on LLM performance. 7. LLMs need not express the values of their creators nor the values encoded in web text. 8. Brief interactions with LLMs are often misleading.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/6VI46I6I/Bowman_2023_Eight Things to Know about Large Language Models.pdf;/Users/andrew/Zotero/storage/EQDBZXT6/2304.html}
}
@article{bozkurta.SpeculativeFuturesChatGPT2023,
title = {Speculative Futures on {{ChatGPT}} and Generative Artificial Intelligence ({{AI}}): {{A}} Collective Reflection from the Educational Landscape},
shorttitle = {Speculative Futures on {{ChatGPT}} and Generative Artificial Intelligence ({{AI}})},
author = {Bozkurt, A. and Xiao, J. and Lambert, S. and Pazurek, A. and Crompton, H. and Koseoglu, S. and Farrow, R. and Bond, M. and Nerantzi, C. and Honeychurch, S. and Bali, M. and Dron, J. and Mir, K. and Stewart, B. and Costello, E. and Mason, J. and Stracke, C. M. and Romero-Hall, E. and Koutropoulos, A. and Toquero, C. M. and Singh, L. and Tlili, A. and Lee, K. and Nichols, M. and Ossiannilsson, E. and Brown, M. and Irvine, V. and Raffaghelli, J. E. and Santos-Hermosa, G. and Farrell, O. and Adam, T. and Thong, Y. L. and Sani-Bozkurt, S. and Sharma, R. C. and Hrastinski, S. and Jandrić, P.},
date = {2023-02-13},
publisher = {Zenodo},
doi = {10.5281/ZENODO.7636568},
url = {https://zenodo.org/record/7636568},
urldate = {2023-03-23},
abstract = {While ChatGPT has recently become very popular, AI has a long history and philosophy. This paper intends to explore the promises and pitfalls of the Generative Pre-trained Transformer (GPT) AI and potentially future technologies by adopting a speculative methodology. Speculative future narratives with a specific focus on educational contexts are provided in an attempt to identify emerging themes and discuss their implications for education in the 21st century. Affordances of (using) AI in Education (AIEd) and possible adverse effects are identified and discussed which emerge from the narratives. It is argued that now is the best of times to define human vs AI contribution to education because AI can accomplish more and more educational activities that used to be the prerogative of human educators. Therefore, it is imperative to rethink the respective roles of technology and human educators in education with a future-oriented mindset.},
langid = {english},
keywords = {/unread,artificial intelligence (AI),artificial intelligence in education (AIEd),future educational perspectives,generative pre-trained transformer (GPT),natural language processing,speculative methodology},
file = {/Users/andrew/Zotero/storage/B7H35EFB/Bozkurt, A. et al. - 2023 - Speculative futures on ChatGPT and generative arti.pdf}
}
@online{branwenScalingHypothesis2020,
title = {The {{Scaling Hypothesis}}},
author = {Branwen, Gwern},
date = {2020-05-28},
url = {https://gwern.net/scaling-hypothesis},
urldate = {2023-06-30},
abstract = {On GPT-3: meta-learning, scaling, implications, and deep theory. The scaling hypothesis: neural nets absorb data \& compute, generalizing and becoming more Bayesian as problems get harder, manifesting new abilities even at trivial-by-global-standards-scale. The deep learning revolution has begun as foretold.},
langid = {american},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/QHRZYSBL/scaling-hypothesis.html}
}
@online{broschinskiGrafikenErklaertFunktioniert2023,
title = {In 9 Grafiken erklärt – So funktioniert künstliche Intelligenz},
author = {Broschinski, Sebastian and Plattner, Titus and Meier, Patrick and Vögeli, Patrick},
date = {2023-06-10},
url = {https://www.derbund.ch/so-funktioniert-kuenstliche-intelligenz-599276436215},
urldate = {2023-06-13},
abstract = {Kann künstliche Intelligenz mehr als Äpfel und Birnen sortieren? Und warum lassen sich Computer immer noch leicht übertölpeln? Hier finden Sie alle Antworten.},
langid = {ngerman},
organization = {Der Bund},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/3G74I43I/so-funktioniert-kuenstliche-intelligenz-599276436215.html}
}
@online{brownLanguageModelsAre2020a,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
eprint = {2005.14165},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2005.14165},
urldate = {2023-03-01},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/SUCZVXXP/Brown et al_2020_Language Models are Few-Shot Learners.pdf;/Users/andrew/Zotero/storage/JW9ITXGI/2005.html}
}
@online{brownLanguageModelsAre2020b,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
eprint = {2005.14165},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2005.14165},
url = {http://arxiv.org/abs/2005.14165},
urldate = {2023-06-08},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/HSTN9QGG/Brown et al_2020_Language Models are Few-Shot Learners.pdf;/Users/andrew/Zotero/storage/VHPJRB6M/2005.html}
}
@online{bubeckSparksArtificialGeneral2023,
title = {Sparks of {{Artificial General Intelligence}}: {{Early}} Experiments with {{GPT-4}}},
shorttitle = {Sparks of {{Artificial General Intelligence}}},
author = {Bubeck, Sébastien and Chandrasekaran, Varun and Eldan, Ronen and Gehrke, Johannes and Horvitz, Eric and Kamar, Ece and Lee, Peter and Lee, Yin Tat and Li, Yuanzhi and Lundberg, Scott and Nori, Harsha and Palangi, Hamid and Ribeiro, Marco Tulio and Zhang, Yi},
date = {2023-04-13},
eprint = {2303.12712},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2303.12712},
urldate = {2023-05-29},
abstract = {Artificial intelligence (AI) researchers have been developing and refining large language models (LLMs) that exhibit remarkable capabilities across a variety of domains and tasks, challenging our understanding of learning and cognition. The latest model developed by OpenAI, GPT-4 [Ope23], was trained using an unprecedented scale of compute and data. In this paper, we report on our investigation of an early version of GPT-4, when it was still in active development by OpenAI. We contend that (this early version of) GPT4 is part of a new cohort of LLMs (along with ChatGPT and Google’s PaLM for example) that exhibit more general intelligence than previous AI models. We discuss the rising capabilities and implications of these models. We demonstrate that, beyond its mastery of language, GPT-4 can solve novel and difficult tasks that span mathematics, coding, vision, medicine, law, psychology and more, without needing any special prompting. Moreover, in all of these tasks, GPT-4’s performance is strikingly close to human-level performance, and often vastly surpasses prior models such as ChatGPT. Given the breadth and depth of GPT-4’s capabilities, we believe that it could reasonably be viewed as an early (yet still incomplete) version of an artificial general intelligence (AGI) system. In our exploration of GPT-4, we put special emphasis on discovering its limitations, and we discuss the challenges ahead for advancing towards deeper and more comprehensive versions of AGI, including the possible need for pursuing a new paradigm that moves beyond next-word prediction. We conclude with reflections on societal influences of the recent technological leap and future research directions.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/2QTBXBKP/Bubeck et al. - 2023 - Sparks of Artificial General Intelligence Early e.pdf}
}
@article{buckHochschulbildungVorHintergrund2023,
title = {Hochschulbildung vor dem Hintergrund von Natural Language Processing (KI-Schreibtools)},
author = {Buck, Isabella and Limburg, Anika},
date = {2023},
langid = {ngerman},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/ZYDESPAY/Buck and Limburg - 2023 - Hochschulbildung vor dem Hintergrund von Natural L.pdf}
}
@article{butlerMicrosoftNewFuture2023,
title = {Microsoft {{New Future}} of {{Work Report}} 2023},
author = {Butler, Jenna and Jaffe, Sonia and Baym, Nancy and Czerwinski, Mary and Iqbal, Shamsi and Nowak, Kate and Rintel, Sean and Sellen, Abigail and Vorvoreanu, Mihaela and Abdulhamid, Najeeb G. and Amores, Judith and Andersen, Reid and Awori, Kagonya and Axmed, Maxamed and Boyd, Danah and Brand, James and Buscher, Georg and Carignan, Dean and Chan, Martin and Coleman, Adam and Counts, Scott and Daepp, Madeleine and Fourney, Adam and Goldstein, Daniel G. and Gordon, Andy and Halfaker, Aaron L. and Hernandez, Javier and Hofman, Jake and Lay-Flurrie, Jenny and Liao, Vera and Lindley, Siân and Manivannan, Sathish and Mcilwain, Charlton and Nepal, Subigya and Neville, Jennifer and Nyairo, Stephanie and O'Neill, Jacki and Poznanski, Victor and Ramos, Gonzalo and Rangan, Nagu and Rosedale, Lacey and Rothschild, David and Safavi, Tara and Sarkar, Advait and Scott, Ava and Shah, Chirag and Shah, Neha Parikh and Shapiro, Teny and Shaw, Ryland and Simkute, Auste and Suh, Jina and Suri, Siddharth and Tanase, Ioana and Tankelevitch, Lev and Troy, Adam and Wan, Mengting and White, Ryen W. and Yang, Longqi and Hecht, Brent and Teevan, Jaime},
date = {2023-12-20},
url = {https://www.microsoft.com/en-us/research/publication/microsoft-new-future-of-work-report-2023/},
urldate = {2024-01-29},
abstract = {In the past three years, there have been not one but two generational shifts in how work gets done, both of which were only possible because of decades of research and development. The first shift occurred when COVID made us realize how powerful remote and hybrid work technologies had become, as well as how much […]},
langid = {american},
file = {/Users/andrew/Zotero/storage/W95FJGQ6/Butler et al_2023_Microsoft New Future of Work Report 2023.pdf}
}
@article{cantlonUniquelyHumanIntelligence2024,
title = {Uniquely Human Intelligence Arose from Expanded Information Capacity},
author = {Cantlon, Jessica F. and Piantadosi, Steven T.},
date = {2024-04-02},
journaltitle = {Nature Reviews Psychology},
shortjournal = {Nat Rev Psychol},
pages = {1--19},
publisher = {Nature Publishing Group},
issn = {2731-0574},
doi = {10.1038/s44159-024-00283-3},
url = {https://www.nature.com/articles/s44159-024-00283-3},
urldate = {2024-04-05},
abstract = {Most theories of how human cognition is unique propose specific representational capacities or biases, often thought to arise through evolutionary change. In this Perspective, we argue that the evidence that supports these domain-specific theories is confounded by general information-processing differences. We argue that human uniqueness arises through genetic quantitative increases in the global capacity to process information and share it among systems such as memory, attention and learning. This change explains regularities across numerous subdomains of cognition, behavioural comparisons between species and phenomena in child development. This strict evolutionary continuity theory of human intelligence is consistent with comparative evidence about neural evolution and computational constraints of memory on the ability to represent rules, patterns and abstract generalizations. We show how these differences in the degree of information processing capacity yield differences in kind for human cognition relative to other animals.},
langid = {english},
keywords = {Animal behaviour,Human behaviour,Intelligence,Psychology},
file = {/Users/andrew/Zotero/storage/U2F3MXGM/Cantlon_Piantadosi_2024_Uniquely human intelligence arose from expanded information capacity.pdf}
}
@report{cardonaArtificialIntelligenceFuture,
title = {Artificial {{Intelligence}} and the {{Future}} of {{Teaching}} and {{Learning}}},
author = {Cardona, Miguel A. and Rodríguez, Roberto J. and Ishmael, Kristina},
date = {2023},
institution = {{US Department of Education, Office of Educational Technology}},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/GZIMYF79/Cardona et al. - Artificial Intelligence and the Future of Teaching.pdf}
}
@online{changPromptingLargeLanguage2023,
title = {Prompting {{Large Language Models With}} the {{Socratic Method}}},
author = {Chang, Edward Y.},
date = {2023-03-15},
eprint = {2303.08769},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2303.08769},
url = {http://arxiv.org/abs/2303.08769},
urldate = {2024-02-07},
abstract = {This paper presents a systematic approach to using the Socratic method in developing prompt templates that effectively interact with large language models, including GPT-3. Various methods are examined, and those that yield precise answers and justifications while fostering creativity and imagination to enhance creative writing are identified. Techniques such as definition, elenchus, dialectic, maieutics, generalization, and counterfactual reasoning are discussed for their application in engineering prompt templates and their connections to inductive, deductive, and abductive reasoning. Through examples, the effectiveness of these dialogue and reasoning methods is demonstrated. An interesting observation is made that when the task's goal and user intent are conveyed to GPT-3 via ChatGPT before the start of a dialogue, the large language model seems to connect to the external context expressed in the intent and perform more effectively.},
pubstate = {prepublished},
keywords = {Computer Science - Machine Learning,I.2.7},
file = {/Users/andrew/Zotero/storage/ZJ69AGX2/Chang_2023_Prompting Large Language Models With the Socratic Method.pdf;/Users/andrew/Zotero/storage/PJJ9LIM7/2303.html}
}
@online{chengInductiveDeductiveRethinking2024,
title = {Inductive or {{Deductive}}? {{Rethinking}} the {{Fundamental Reasoning Abilities}} of {{LLMs}}},
shorttitle = {Inductive or {{Deductive}}?},
author = {Cheng, Kewei and Yang, Jingfeng and Jiang, Haoming and Wang, Zhengyang and Huang, Binxuan and Li, Ruirui and Li, Shiyang and Li, Zheng and Gao, Yifan and Li, Xian and Yin, Bing and Sun, Yizhou},
date = {2024-08-06},
eprint = {2408.00114},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2408.00114},
url = {http://arxiv.org/abs/2408.00114},
urldate = {2024-08-25},
abstract = {Reasoning encompasses two typical types: deductive reasoning and inductive reasoning. Despite extensive research into the reasoning capabilities of Large Language Models (LLMs), most studies have failed to rigorously differentiate between inductive and deductive reasoning, leading to a blending of the two. This raises an essential question: In LLM reasoning, which poses a greater challenge - deductive or inductive reasoning? While the deductive reasoning capabilities of LLMs, (i.e. their capacity to follow instructions in reasoning tasks), have received considerable attention, their abilities in true inductive reasoning remain largely unexplored. To investigate into the true inductive reasoning capabilities of LLMs, we propose a novel framework, SolverLearner. This framework enables LLMs to learn the underlying function (i.e., $y = f_w(x)$), that maps input data points (x) to their corresponding output values (y), using only in-context examples. By focusing on inductive reasoning and separating it from LLM-based deductive reasoning, we can isolate and investigate inductive reasoning of LLMs in its pure form via SolverLearner. Our observations reveal that LLMs demonstrate remarkable inductive reasoning capabilities through SolverLearner, achieving near-perfect performance with ACC of 1 in most cases. Surprisingly, despite their strong inductive reasoning abilities, LLMs tend to relatively lack deductive reasoning capabilities, particularly in tasks involving ``counterfactual'' reasoning.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence},
file = {/Users/andrew/Zotero/storage/YEHNTN3K/Cheng et al. - 2024 - Inductive or Deductive Rethinking the Fundamental Reasoning Abilities of LLMs.pdf;/Users/andrew/Zotero/storage/Q4SNB2NX/2408.html}
}
@article{chenHierarchicalBayesianModel2024,
title = {A {{Hierarchical Bayesian Model}} of {{Adaptive Teaching}}},
author = {Chen, Alicia M. and Palacci, Andrew and Vélez, Natalia and Hawkins, Robert D. and Gershman, Samuel J.},
date = {2024},
journaltitle = {Cognitive Science},
volume = {48},
number = {7},
pages = {e13477},
issn = {1551-6709},
doi = {10.1111/cogs.13477},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cogs.13477},
urldate = {2024-07-17},
abstract = {How do teachers learn about what learners already know? How do learners aid teachers by providing them with information about their background knowledge and what they find confusing? We formalize this collaborative reasoning process using a hierarchical Bayesian model of pedagogy. We then evaluate this model in two online behavioral experiments (N = 312 adults). In Experiment 1, we show that teachers select examples that account for learners' background knowledge, and adjust their examples based on learners' feedback. In Experiment 2, we show that learners strategically provide more feedback when teachers' examples deviate from their background knowledge. These findings provide a foundation for extending computational accounts of pedagogy to richer interactive settings.},
langid = {english},
keywords = {Bayesian modeling,Communication,Pedagogy,Social cognition,Theory of mind},
file = {/Users/andrew/Zotero/storage/XGKLBYPV/Chen et al. - 2024 - A Hierarchical Bayesian Model of Adaptive Teaching.pdf;/Users/andrew/Zotero/storage/XUUMCCC7/cogs.html}
}
@article{chiLearningHumanTutoring2001,
title = {Learning from Human Tutoring},
author = {Chi, Michelene and Siler, Stephanie and Jeong, Heisawn and Yamauchi, Takashi and Hausmann, Robert},
date = {2001-07-01},
journaltitle = {Cognitive Science},
shortjournal = {Cognitive Science},
volume = {25},
pages = {471--533},
doi = {10.1016/S0364-0213(01)00044-1},
abstract = {Human one-to-one tutoring has been shown to be a very effective form of instruction. Three contrasting hypotheses, a tutor-centered one, a student-centered one, and an interactive one could all potentially explain the effectiveness of tutoring. To test these hypotheses, analyses focused not only on the effectiveness of the tutors’ moves, but also on the effectiveness of the students’ construction on learning, as well as their interaction. The interaction hypothesis is further tested in the second study by manipulating the kind of tutoring tactics tutors were permitted to use. In order to promote a more interactive style of dialogue, rather than a didactic style, tutors were suppressed from giving explanations and feedback. Instead, tutors were encouraged to prompt the students. Surprisingly, students learned just as effectively even when tutors were suppressed from giving explanations and feedback. Their learning in the interactive style of tutoring is attributed to construction from deeper and a greater amount of scaffolding episodes, as well as their greater effort to take control of their own learning by reading more. What they learned from reading was limited, however, by their reading abilities.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/BWWCXV84/Chi et al_2001_Learning from human tutoring.pdf}
}
@online{choiMALADEOrchestrationLLMpowered2024,
title = {{{MALADE}}: {{Orchestration}} of {{LLM-powered Agents}} with {{Retrieval Augmented Generation}} for {{Pharmacovigilance}}},
shorttitle = {{{MALADE}}},
author = {Choi, Jihye and Palumbo, Nils and Chalasani, Prasad and Engelhard, Matthew M. and Jha, Somesh and Kumar, Anivarya and Page, David},
date = {2024-08-03},
eprint = {2408.01869},
eprinttype = {arXiv},
eprintclass = {cs, q-bio},
doi = {10.48550/arXiv.2408.01869},
url = {http://arxiv.org/abs/2408.01869},
urldate = {2024-08-16},
abstract = {In the era of Large Language Models (LLMs), given their remarkable text understanding and generation abilities, there is an unprecedented opportunity to develop new, LLM-based methods for trustworthy medical knowledge synthesis, extraction and summarization. This paper focuses on the problem of Pharmacovigilance (PhV), where the significance and challenges lie in identifying Adverse Drug Events (ADEs) from diverse text sources, such as medical literature, clinical notes, and drug labels. Unfortunately, this task is hindered by factors including variations in the terminologies of drugs and outcomes, and ADE descriptions often being buried in large amounts of narrative text. We present MALADE, the first effective collaborative multi-agent system powered by LLM with Retrieval Augmented Generation for ADE extraction from drug label data. This technique involves augmenting a query to an LLM with relevant information extracted from text resources, and instructing the LLM to compose a response consistent with the augmented data. MALADE is a general LLM-agnostic architecture, and its unique capabilities are: (1) leveraging a variety of external sources, such as medical literature, drug labels, and FDA tools (e.g., OpenFDA drug information API), (2) extracting drug-outcome association in a structured format along with the strength of the association, and (3) providing explanations for established associations. Instantiated with GPT-4 Turbo or GPT-4o, and FDA drug label data, MALADE demonstrates its efficacy with an Area Under ROC Curve of 0.90 against the OMOP Ground Truth table of ADEs. Our implementation leverages the Langroid multi-agent LLM framework and can be found at https://github.com/jihyechoi77/malade.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Information Retrieval,Computer Science - Machine Learning,Computer Science - Multiagent Systems,Quantitative Biology - Quantitative Methods},
file = {/Users/andrew/Zotero/storage/ZHBVB2K3/Choi et al. - 2024 - MALADE Orchestration of LLM-powered Agents with Retrieval Augmented Generation for Pharmacovigilanc.pdf;/Users/andrew/Zotero/storage/XAU7Y67T/2408.html}
}
@article{clarkExtendedMind1998,
title = {The {{Extended Mind}}},
author = {Clark, Andy and Chalmers, David},
date = {1998},
journaltitle = {Analysis},
volume = {58},
number = {1},
eprint = {3328150},
eprinttype = {jstor},
pages = {7--19},
publisher = {Analysis Committee, Oxford University Press},
issn = {0003-2638},
url = {https://www.jstor.org/stable/3328150},
urldate = {2024-06-26}
}
@book{clarkSupersizingMindEmbodiment2008,
title = {Supersizing the {{Mind}}: {{Embodiment}}, {{Action}}, and {{Cognitive Extension}}},
shorttitle = {Supersizing the {{Mind}}},
author = {Clark, Andy},
date = {2008},
location = {New York},
publisher = {Oxford University Press},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/UX96EM6A/CLASTM.html}
}
@online{collinsBuildingMachinesThat2024,
title = {Building {{Machines}} That {{Learn}} and {{Think}} with {{People}}},
author = {Collins, Katherine M. and Sucholutsky, Ilia and Bhatt, Umang and Chandra, Kartik and Wong, Lionel and Lee, Mina and Zhang, Cedegao E. and Zhi-Xuan, Tan and Ho, Mark and Mansinghka, Vikash and Weller, Adrian and Tenenbaum, Joshua B. and Griffiths, Thomas L.},
date = {2024-07-21},
eprint = {2408.03943},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2408.03943},
url = {http://arxiv.org/abs/2408.03943},
urldate = {2024-08-13},
abstract = {What do we want from machine intelligence? We envision machines that are not just tools for thought, but partners in thought: reasonable, insightful, knowledgeable, reliable, and trustworthy systems that think with us. Current artificial intelligence (AI) systems satisfy some of these criteria, some of the time. In this Perspective, we show how the science of collaborative cognition can be put to work to engineer systems that really can be called ``thought partners,'' systems built to meet our expectations and complement our limitations. We lay out several modes of collaborative thought in which humans and AI thought partners can engage and propose desiderata for human-compatible thought partnerships. Drawing on motifs from computational cognitive science, we motivate an alternative scaling path for the design of thought partners and ecosystems around their use through a Bayesian lens, whereby the partners we construct actively build and reason over models of the human and world.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/8XYVBJSF/Collins et al. - 2024 - Building Machines that Learn and Think with People.pdf;/Users/andrew/Zotero/storage/EKLB687T/2408.html}
}
@article{corbettKnowledgeTracingModeling1994,
title = {Knowledge Tracing: {{Modeling}} the Acquisition of Procedural Knowledge},
shorttitle = {Knowledge Tracing},
author = {Corbett, Albert T. and Anderson, John R.},
date = {1994-12-01},
journaltitle = {User Modeling and User-Adapted Interaction},
shortjournal = {User Model User-Adap Inter},
volume = {4},
number = {4},
pages = {253--278},
issn = {1573-1391},
doi = {10.1007/BF01099821},
url = {https://doi.org/10.1007/BF01099821},
urldate = {2023-03-31},
abstract = {This paper describes an effort to model students' changing knowledge state during skill acquisition. Students in this research are learning to write short programs with the ACT Programming Tutor (APT). APT is constructed around a production rule cognitive model of programming knowledge, called the ideal student model. This model allows the tutor to solve exercises along with the student and provide assistance as necessary. As the student works, the tutor also maintains an estimate of the probability that the student has learned each of the rules in the ideal model, in a process called knowledge tracing. The tutor presents an individualized sequence of exercises to the student based on these probability estimates until the student has ‘mastered’ each rule. The programming tutor, cognitive model and learning and performance assumptions are described. A series of studies is reviewed that examine the empirical validity of knowledge tracing and has led to modifications in the process. Currently the model is quite successful in predicting test performance. Further modifications in the modeling process are discussed that may improve performance levels.},
langid = {english},
keywords = {/unread,empirical validity,individual differences,intelligent tutoring systems,learning,mastery learning,procedural knowledge,Student modeling},
file = {/Users/andrew/Zotero/storage/BCTFGDNV/Corbett_Anderson_1994_Knowledge tracing.pdf}
}
@article{cottonChattingCheatingEnsuring2024,
title = {Chatting and Cheating: {{Ensuring}} Academic Integrity in the Era of {{ChatGPT}}},
shorttitle = {Chatting and Cheating},
author = {Cotton, Debby R. E. and Cotton, Peter A. and Shipway, J. Reuben},
date = {2024-03-03},
journaltitle = {Innovations in Education and Teaching International},
volume = {61},
number = {2},
pages = {228--239},
publisher = {Routledge},
issn = {1470-3297},
doi = {10.1080/14703297.2023.2190148},
url = {https://doi.org/10.1080/14703297.2023.2190148},
urldate = {2024-05-27},
abstract = {The use of artificial intelligence in academia is a hot topic in the education field. ChatGPT is an AI tool that offers a range of benefits, including increased student engagement, collaboration, and accessibility. However, it also raises concerns regarding academic honesty and plagiarism. This paper examines the opportunities and challenges of using ChatGPT in higher education, and discusses the potential risks and rewards of these tools. The paper also considers the difficulties of detecting and preventing academic dishonesty, and suggests strategies that universities can adopt to ensure ethical and responsible use of these tools. These strategies include developing policies and procedures, providing training and support, and using various methods to detect and prevent cheating. The paper concludes that while the use of AI in higher education presents both opportunities and challenges, universities can effectively address these concerns by taking a proactive and ethical approach to the use of these tools.},
keywords = {detection and prevention,higher education,Machine-generated writing,plagiarism},
file = {/Users/andrew/Zotero/storage/79TXHW5V/Cotton et al_2024_Chatting and cheating.pdf}
}
@online{craigAICopyrightTrap2024,
type = {SSRN Scholarly Paper},
title = {The {{AI-Copyright Trap}}},
author = {Craig, Carys J.},
date = {2024-07-15},
number = {4905118},
location = {Rochester, NY},
doi = {10.2139/ssrn.4905118},
url = {https://papers.ssrn.com/abstract=4905118},
urldate = {2024-09-06},
abstract = {As AI tools proliferate, policy makers are increasingly being called upon to protect creators and the cultural industries from the extractive, exploitative, and even existential threats posed by generative AI. In their haste to act, however, they risk running headlong into the Copyright Trap: the mistaken conviction that copyright law is the best tool to support human creators and culture in our new technological reality (when in fact it is likely to do more harm than good). It is a trap in the sense that it may satisfy the wants of a small group of powerful stakeholders, but it will harm the interests of the more vulnerable actors who are, perhaps, most drawn to it. Once entered, it will also prove practically impossible to escape. I identify three routes in to the copyright trap in current AI debates: first is the "if value, then (property) right" fallacy; second is the idea that unauthorized copying is inherently wrongful; and third is the resurrection of the starving artist trope to justify copyright's expansion. Ultimately, this article urges AI critics to sidestep the copyright trap, resisting the lure of its proprietary logic in favor of more appropriate routes towards addressing the risks and harms of generative AI.},
langid = {english},
pubstate = {prepublished},
keywords = {Artificial Intelligence,Copyright Infringement,Copyright Law,Generative AI,Intellectual Property,Law and Technology,Text and Data Mining},
file = {/Users/andrew/Zotero/storage/NY2F48XN/Craig - 2024 - The AI-Copyright Trap.pdf}
}
@online{creswellFaithfulReasoningUsing2022,
title = {Faithful {{Reasoning Using Large Language Models}}},
author = {Creswell, Antonia and Shanahan, Murray},
date = {2022-08-30},
eprint = {2208.14271},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2208.14271},
url = {http://arxiv.org/abs/2208.14271},
urldate = {2023-04-21},
abstract = {Although contemporary large language models (LMs) demonstrate impressive question-answering capabilities, their answers are typically the product of a single call to the model. This entails an unwelcome degree of opacity and compromises performance, especially on problems that are inherently multi-step. To address these limitations, we show how LMs can be made to perform faithful multi-step reasoning via a process whose causal structure mirrors the underlying logical structure of the problem. Our approach works by chaining together reasoning steps, where each step results from calls to two fine-tuned LMs, one for selection and one for inference, to produce a valid reasoning trace. Our method carries out a beam search through the space of reasoning traces to improve reasoning quality. We demonstrate the effectiveness of our model on multi-step logical deduction and scientific question-answering, showing that it outperforms baselines on final answer accuracy, and generates humanly interpretable reasoning traces whose validity can be checked by the user.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/JHWPXKW7/Creswell_Shanahan_2022_Faithful Reasoning Using Large Language Models.pdf;/Users/andrew/Zotero/storage/C9TB3MXG/2208.html}
}
@article{cuskleyLimitationsLargeLanguage2024,
title = {The {{Limitations}} of {{Large Language Models}} for {{Understanding Human Language}} and {{Cognition}}},
author = {Cuskley, Christine and Woods, Rebecca and Flaherty, Molly},
date = {2024-08-31},
journaltitle = {Open Mind},
shortjournal = {Open Mind},
volume = {8},
pages = {1058--1083},
issn = {2470-2986},
doi = {10.1162/opmi_a_00160},
url = {https://doi.org/10.1162/opmi_a_00160},
urldate = {2024-09-05},
abstract = {Researchers have recently argued that the capabilities of Large Language Models (LLMs) can provide new insights into longstanding debates about the role of learning and/or innateness in the development and evolution of human language. Here, we argue on two grounds that LLMs alone tell us very little about human language and cognition in terms of acquisition and evolution. First, any similarities between human language and the output of LLMs are purely functional. Borrowing the “four questions” framework from ethology, we argue that what LLMs do is superficially similar, but how they do it is not. In contrast to the rich multimodal data humans leverage in interactive language learning, LLMs rely on immersive exposure to vastly greater quantities of unimodal text data, with recent multimodal efforts built upon mappings between images and text. Second, turning to functional similarities between human language and LLM output, we show that human linguistic behavior is much broader. LLMs were designed to imitate the very specific behavior of human writing; while they do this impressively, the underlying mechanisms of these models limit their capacities for meaning and naturalistic interaction, and their potential for dealing with the diversity in human language. We conclude by emphasising that LLMs are not theories of language, but tools that may be used to study language, and that can only be effectively applied with specific hypotheses to motivate research.}
}
@online{daiCanLargeLanguage2023,
title = {Can {{Large Language Models Provide Feedback}} to {{Students}}? {{A Case Study}} on {{ChatGPT}}},
shorttitle = {Can {{Large Language Models Provide Feedback}} to {{Students}}?},
author = {Dai, Wei and Lin, Jionghao and Jin, Flora and Li, Tongguang and Tsai, Yi-Shan and Gasevic, Dragan and Chen, Guanliang},
date = {2023-04-13},
doi = {10.35542/osf.io/hcgzj},
url = {https://edarxiv.org/hcgzj/},
urldate = {2023-04-15},
abstract = {Educational feedback has been widely acknowledged as an effective approach to improving student learning. However, scaling effective practices can be laborious and costly, which motivated researchers to work on automated feedback systems (AFS). Inspired by the recent advancements in the pre-trained language models (e.g., ChatGPT), we posit that such models might advance the existing knowledge of textual feedback generation in AFS because of their capability to offer natural-sounding and detailed responses. Therefore, we aimed to investigate the feasibility of using ChatGPT to provide students with feedback to help them learn better. Specifically, we first examined the readability of ChatGPT-generated feedback. Then, we measured the agreement between ChatGPT and the instructor when assessing students' assignments according to the marking rubric. Finally, we used a well-known theoretical feedback framework to further investigate the effectiveness of the feedback generated by ChatGPT. Our results show that i) ChatGPT is capable of generating more detailed feedback that fluently and coherently summarizes students' performance than human instructors; ii) ChatGPT achieved high agreement with the instructor when assessing the topic of students' assignments; and iii) ChatGPT could provide feedback on the process of students completing the task, which benefits students developing learning skills.},
langid = {american},
pubstate = {prepublished},
keywords = {/unread,and Research,Automated Feedback,Education,Educational Assessment,Educational Methods,Evaluation,Feedback Effectiveness,Feedback Generation,Higher Education,Large Language Model},
file = {/Users/andrew/Zotero/storage/ICYKBJ87/Dai et al_2023_Can Large Language Models Provide Feedback to Students.pdf}
}
@online{dalalMatrixBayesianLearning2024,
title = {The {{Matrix}}: {{A Bayesian}} Learning Model for {{LLMs}}},
shorttitle = {The {{Matrix}}},
author = {Dalal, Siddhartha and Misra, Vishal},
date = {2024-02-05},
eprint = {2402.03175},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2402.03175},
urldate = {2024-05-29},
abstract = {In this paper, we introduce a Bayesian learning model to understand the behavior of Large Language Models (LLMs). We explore the optimization metric of LLMs, which is based on predicting the next token, and develop a novel model grounded in this principle. Our approach involves constructing an ideal generative text model represented by a multinomial transition probability matrix with a prior, and we examine how LLMs approximate this matrix. We discuss the continuity of the mapping between embeddings and multinomial distributions, and present the Dirichlet approximation theorem to approximate any prior. Additionally, we demonstrate how text generation by LLMs aligns with Bayesian learning principles and delve into the implications for in-context learning, specifically explaining why in-context learning emerges in larger models where prompts are considered as samples to be updated. Our findings indicate that the behavior of LLMs is consistent with Bayesian Learning, offering new insights into their functioning and potential applications.},
langid = {english},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,I.2.7},
file = {/Users/andrew/Zotero/storage/GY9L3K28/Dalal and Misra - 2024 - The Matrix A Bayesian learning model for LLMs.pdf}
}
@inproceedings{danryDonJustTell2023,
title = {Don’t {{Just Tell Me}}, {{Ask Me}}: {{AI Systems}} That {{Intelligently Frame Explanations}} as {{Questions Improve Human Logical Discernment Accuracy}} over {{Causal AI}} Explanations},
shorttitle = {Don’t {{Just Tell Me}}, {{Ask Me}}},
booktitle = {Proceedings of the 2023 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
author = {Danry, Valdemar and Pataranutaporn, Pat and Mao, Yaoli and Maes, Pattie},
date = {2023-04-19},
series = {{{CHI}} '23},
pages = {1--13},
publisher = {Association for Computing Machinery},
location = {New York, NY, USA},
doi = {10.1145/3544548.3580672},
url = {https://dl.acm.org/doi/10.1145/3544548.3580672},
urldate = {2024-05-17},
abstract = {Critical thinking is an essential human skill. Despite the importance of critical thinking, research reveals that our reasoning ability suffers from personal biases and cognitive resource limitations, leading to potentially dangerous outcomes. This paper presents the novel idea of AI-framed Questioning that turns information relevant to the AI classification into questions to actively engage users’ thinking and scaffold their reasoning process. We conducted a study with 204 participants comparing the effects of AI-framed Questioning on a critical thinking task; discernment of logical validity of socially divisive statements. Our results show that compared to no feedback and even causal AI explanations of an always correct system, AI-framed Questioning significantly increase human discernment of logically flawed statements. Our experiment exemplifies a future style of Human-AI co-reasoning system, where the AI becomes a critical thinking stimulator rather than an information teller.},
isbn = {978-1-4503-9421-5},
keywords = {AI,AI Explanation,Explainable AI,Human-AI Interaction,Language Model,Logic,Reasoning},
file = {/Users/andrew/Zotero/storage/XIQ6PZEK/Danry et al_2023_Don’t Just Tell Me, Ask Me.pdf}
}
@article{davidUseGenerativeAI2023,
title = {The Use of Generative {{AI}} Tools in {{Design Thinking}} Academic Makeathon},
author = {David, Yigal and Krebs, Assaf and Rosenbaum, Alon},
date = {2023-12-29},
journaltitle = {CERN IdeaSquare Journal of Experimental Innovation},
volume = {7},
number = {3},
pages = {43--49},
issn = {2413-9505},
doi = {10.23726/cij.2023.1470},
url = {https://e-publishing.cern.ch/index.php/CIJ/article/view/1470},
urldate = {2024-08-06},
abstract = {This paper examines the integration and influence of Generative Artificial Intelligence (GAI) tools in a Double Diamond Design Thinking (DDDT) academic makeathon. It analyzes students' interaction with these tools in problem-solving scenarios, offering insights into their perceptions and manner of use. The study reveals that text-based GAI, such as ChatGPT and visual tools such as Midjourney and Dall-E 2, are perceived to be supportive rather than solution-dictating. However, it appears that there is a significant difference between engineering and design students in their approach and their trust in these tools. Moreover, students often use tools like ChatGPT as search engines without fully exploring their capabilities. This paper aims to explore the potential of GAI in its deeper capacity within the DDDT methodology, and how to maximize its value.},
langid = {english},
keywords = {AI,AI in education,ChatGPT,Dall-E 2,Design Thinking,Double Diamond Design Thinking,Generative artificial intelligence,Human-AI collaboration,Makeathon,Midjourney,Shenkar Jamweek},
file = {/Users/andrew/Zotero/storage/R5BWIXRB/David et al. - 2023 - The use of generative AI tools in Design Thinking academic makeathon.pdf}
}
@article{degallier-rochatHumanAugmentationNot2022,
title = {Human Augmentation, Not Replacement: {{A}} Research Agenda for {{AI}} and Robotics in the Industry},
shorttitle = {Human Augmentation, Not Replacement},
author = {Dégallier-Rochat, Sarah and Kurpicz-Briki, Mascha and Endrissat, Nada and Yatsenko, Olena},
date = {2022},
journaltitle = {Frontiers in Robotics and AI},
volume = {9},
issn = {2296-9144},
doi = {10.3389/frobt.2022.997386},
url = {https://www.frontiersin.org/articles/10.3389/frobt.2022.997386},
urldate = {2023-03-03},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/9QEXUZ9N/Dégallier-Rochat et al_2022_Human augmentation, not replacement.pdf}
}
@article{degenRationalSpeechAct2023,
title = {The {{Rational Speech Act Framework}}},
author = {Degen, Judith},
date = {2023},
journaltitle = {Annual Review of Linguistics},
volume = {9},
number = {1},
pages = {519--540},
doi = {10.1146/annurev-linguistics-031220-010811},
url = {https://doi.org/10.1146/annurev-linguistics-031220-010811},
urldate = {2023-12-11},
abstract = {The past decade has seen the rapid development of a new approach to pragmatics that attempts to integrate insights from formal and experimental semantics and pragmatics, psycholinguistics, and computational cognitive science in the study of meaning: probabilistic pragmatics. The most influential probabilistic approach to pragmatics is the Rational Speech Act (RSA) framework. In this review, I demonstrate the basic mechanics and commitments of RSA as well as some of its standard extensions, highlighting the key features that have led to its success in accounting for a wide variety of pragmatic phenomena. Fundamentally, it treats language as probabilistic, informativeness as gradient, alternatives as context-dependent, and subjective prior beliefs (world knowledge) as a crucial facet of interpretation. It also provides an integrated account of the link between production and interpretation. I highlight key challenges for RSA, which include scalability, the treatment of the boundedness of cognition, and the incremental and compositional nature of language.},
keywords = {computational pragmatics,context,experimental pragmatics,experimental semantics,probabilistic pragmatics},
file = {/Users/andrew/Zotero/storage/6BTDFBQ6/Degen_2023_The Rational Speech Act Framework.pdf}
}
@article{demszkyUsingLargeLanguage2023,
title = {Using Large Language Models in Psychology},
author = {Demszky, Dorottya and Yang, Diyi and Yeager, David S. and Bryan, Christopher J. and Clapper, Margarett and Chandhok, Susannah and Eichstaedt, Johannes C. and Hecht, Cameron and Jamieson, Jeremy and Johnson, Meghann and Jones, Michaela and Krettek-Cobb, Danielle and Lai, Leslie and Jones Mitchell, Nirel and Ong, Desmond C. and Dweck, Carol S. and Gross, James J. and Pennebaker, James W.},
date = {2023-10-13},
journaltitle = {Nature Reviews Psychology},
shortjournal = {Nat Rev Psychol},
pages = {1--14},
publisher = {Nature Publishing Group},
issn = {2731-0574},
doi = {10.1038/s44159-023-00241-5},
url = {https://www.nature.com/articles/s44159-023-00241-5},
urldate = {2023-10-21},
abstract = {Large language models (LLMs), such as OpenAI’s GPT-4, Google’s Bard or Meta’s LLaMa, have created unprecedented opportunities for analysing and generating language data on a massive scale. Because language data have a central role in all areas of psychology, this new technology has the potential to transform the field. In this Perspective, we review the foundations of LLMs. We then explain how the way that LLMs are constructed enables them to effectively generate human-like linguistic output without the ability to think or feel like a human. We argue that although LLMs have the potential to advance psychological measurement, experimentation and practice, they are not yet ready for many of the most transformative psychological applications — but further research and development may enable such use. Next, we examine four major concerns about the application of LLMs to psychology, and how each might be overcome. Finally, we conclude with recommendations for investments that could help to address these concerns: field-initiated ‘keystone’ datasets; increased standardization of performance benchmarks; and shared computing and analysis infrastructure to ensure that the future of LLM-powered research is equitable.},
langid = {english},
keywords = {Human behaviour,Language and linguistics,Psychology,Science,technology and society},
file = {/Users/andrew/Zotero/storage/7BZWJTSP/Demszky et al_2023_Using large language models in psychology.pdf}
}
@incollection{deneckeAnalysisCriticalIncident2024,
title = {Analysis of {{Critical Incident Reports Using Natural Language Processing}}},