% bibliography.bib
@online{adamsBayesianOnlineChangepoint2007,
title = {Bayesian {{Online Changepoint Detection}}},
author = {Adams, Ryan Prescott and MacKay, David J. C.},
date = {2007-10-19},
eprint = {0710.3742},
eprinttype = {arXiv},
eprintclass = {stat},
doi = {10.48550/arXiv.0710.3742},
url = {http://arxiv.org/abs/0710.3742},
urldate = {2023-03-17},
abstract = {Changepoints are abrupt variations in the generative parameters of a data sequence. Online detection of changepoints is useful in modelling and prediction of time series in application areas such as finance, biometrics, and robotics. While frequentist methods have yielded online filtering and prediction techniques, most Bayesian papers have focused on the retrospective segmentation problem. Here we examine the case where the model parameters before and after the changepoint are independent and we derive an online algorithm for exact inference of the most recent changepoint. We compute the probability distribution of the length of the current ``run,'' or time since the last changepoint, using a simple message-passing algorithm. Our implementation is highly modular so that the algorithm may be applied to a variety of types of data. We illustrate this modularity by demonstrating the algorithm on three different real-world data sets.},
pubstate = {prepublished},
keywords = {/unread,Statistics - Machine Learning},
file = {/Users/andrew/Zotero/storage/B9844L4J/Adams_MacKay_2007_Bayesian Online Changepoint Detection.pdf;/Users/andrew/Zotero/storage/36XZ5FQB/0710.html}
}
@online{adamsSparseDenseGPT42023,
title = {From {{Sparse}} to {{Dense}}: {{GPT-4 Summarization}} with {{Chain}} of {{Density Prompting}}},
shorttitle = {From {{Sparse}} to {{Dense}}},
author = {Adams, Griffin and Fabbri, Alexander and Ladhak, Faisal and Lehman, Eric and Elhadad, Noémie},
date = {2023-09-08},
eprint = {2309.04269},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2309.04269},
urldate = {2023-09-17},
abstract = {Selecting the ``right'' amount of information to include in a summary is a difficult task. A good summary should be detailed and entity-centric without being overly dense and hard to follow. To better understand this tradeoff, we solicit increasingly dense GPT-4 summaries with what we refer to as a ``Chain of Density'' (CoD) prompt. Specifically, GPT-4 generates an initial entity-sparse summary before iteratively incorporating missing salient entities without increasing the length. Summaries generated by CoD are more abstractive, exhibit more fusion, and have less of a lead bias than GPT-4 summaries generated by a vanilla prompt. We conduct a human preference study on 100 CNN DailyMail articles and find that that humans prefer GPT-4 summaries that are more dense than those generated by a vanilla prompt and almost as dense as human written summaries. Qualitative analysis supports the notion that there exists a tradeoff between informativeness and readability. 500 annotated CoD summaries, as well as an extra 5,000 unannotated summaries, are freely available on HuggingFace (https://huggingface.co/datasets/griffin/chain\_of\_density).},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/K7HZ73IP/Adams et al_2023_From Sparse to Dense.pdf;/Users/andrew/Zotero/storage/H8GLKSD9/2309.html}
}
@article{adorniRubricbasedLearnerModelling2023,
title = {Rubric-Based {{Learner Modelling}} via {{Noisy Gates Bayesian Networks}} for {{Computational Thinking Skills Assessment}}},
author = {Adorni, Giorgia and Mangili, Francesca and Piatti, Alberto and Bonesana, Claudio and Antonucci, Alessandro},
date = {2023},
journaltitle = {Journal of Communications Software and Systems},
shortjournal = {JCOMSS},
volume = {19},
number = {1},
pages = {52--64},
issn = {1845-6421, 1846-6079},
doi = {10.24138/jcomss-2022-0169},
url = {https://jcoms.fesb.unist.hr/10.24138/jcomss-2022-0169/},
urldate = {2024-09-04},
abstract = {In modern and personalised education, there is a growing interest in developing learners’ competencies and accurately assessing them. In a previous work, we proposed a procedure for deriving a learner model for automatic skill assessment from a task-specific competence rubric, thus simplifying the implementation of automated assessment tools. The previous approach, however, suffered two main limitations: (i) the ordering between competencies defined by the assessment rubric was only indirectly modelled; (ii) supplementary skills, not under assessment but necessary for accomplishing the task, were not included in the model. In this work, we address issue (i) by introducing dummy observed nodes, strictly enforcing the skills ordering without changing the network’s structure. In contrast, for point (ii), we design a network with two layers of gates, one performing disjunctive operations by noisy-OR gates and the other conjunctive operations through logical ANDs. Such changes improve the model outcomes’ coherence and the modelling tool’s flexibility without compromising the model’s compact parametrisation, interpretability and simple experts’ elicitation. We used this approach to develop a learner model for Computational Thinking (CT) skills assessment. The CT-cube skills assessment framework and the Cross Array Task (CAT) are used to exemplify it and demonstrate its feasibility.},
langid = {english},
file = {/Users/andrew/Zotero/storage/MGF59B53/Adorni et al. - 2023 - Rubric-based Learner Modelling via Noisy Gates Bayesian Networks for Computational Thinking Skills A.pdf}
}
@online{agarwalTransformersReinforcementLearning2023,
title = {Transformers in {{Reinforcement Learning}}: {{A Survey}}},
shorttitle = {Transformers in {{Reinforcement Learning}}},
author = {Agarwal, Pranav and Rahman, Aamer Abdul and St-Charles, Pierre-Luc and Prince, Simon J. D. and Kahou, Samira Ebrahimi},
date = {2023-07-12},
eprint = {2307.05979},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2307.05979},
urldate = {2023-08-02},
abstract = {Transformers have significantly impacted domains like natural language processing, computer vision, and robotics, where they improve performance compared to other neural networks. This survey explores how transformers are used in reinforcement learning (RL), where they are seen as a promising solution for addressing challenges such as unstable training, credit assignment, lack of interpretability, and partial observability. We begin by providing a brief domain overview of RL, followed by a discussion on the challenges of classical RL algorithms. Next, we delve into the properties of the transformer and its variants and discuss the characteristics that make them well-suited to address the challenges inherent in RL. We examine the application of transformers to various aspects of RL, including representation learning, transition and reward function modeling, and policy optimization. We also discuss recent research that aims to enhance the interpretability and efficiency of transformers in RL, using visualization techniques and efficient training strategies. Often, the transformer architecture must be tailored to the specific needs of a given application. We present a broad overview of how transformers have been adapted for several applications, including robotics, medicine, language modeling, cloud computing, and combinatorial optimization. We conclude by discussing the limitations of using transformers in RL and assess their potential for catalyzing future breakthroughs in this field. CCS Concepts: • Computing methodologies → Reinforcement learning; Neural networks; • General and reference → Surveys and overviews.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/W85GDSUV/Agarwal et al. - 2023 - Transformers in Reinforcement Learning A Survey.pdf}
}
@online{aksitovReSTMeetsReAct2023,
title = {{{ReST}} Meets {{ReAct}}: {{Self-Improvement}} for {{Multi-Step Reasoning LLM Agent}}},
shorttitle = {{{ReST}} Meets {{ReAct}}},
author = {Aksitov, Renat and Miryoosefi, Sobhan and Li, Zonglin and Li, Daliang and Babayan, Sheila and Kopparapu, Kavya and Fisher, Zachary and Guo, Ruiqi and Prakash, Sushant and Srinivasan, Pranesh and Zaheer, Manzil and Yu, Felix and Kumar, Sanjiv},
date = {2023-12-15},
eprint = {2312.10003},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2312.10003},
url = {http://arxiv.org/abs/2312.10003},
urldate = {2024-05-16},
abstract = {Answering complex natural language questions often necessitates multi-step reasoning and integrating external information. Several systems have combined knowledge retrieval with a large language model (LLM) to answer such questions. These systems, however, suffer from various failure cases, and we cannot directly train them end-to-end to fix such failures, as interaction with external knowledge is non-differentiable. To address these deficiencies, we define a ReAct-style LLM agent with the ability to reason and act upon external knowledge. We further refine the agent through a ReST-like method that iteratively trains on previous trajectories, employing growing-batch reinforcement learning with AI feedback for continuous self-improvement and self-distillation. Starting from a prompted large model and after just two iterations of the algorithm, we can produce a fine-tuned small model that achieves comparable performance on challenging compositional question-answering benchmarks with two orders of magnitude fewer parameters.},
pubstate = {prepublished},
keywords = {Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/D2H22N54/Aksitov et al_2023_ReST meets ReAct.pdf;/Users/andrew/Zotero/storage/GAVYYD6V/2312.html}
}
@online{albarracinDesigningExplainableArtificial2023,
title = {Designing Explainable Artificial Intelligence with Active Inference: {{A}} Framework for Transparent Introspection and Decision-Making},
shorttitle = {Designing Explainable Artificial Intelligence with Active Inference},
author = {Albarracin, Mahault and Hipólito, Inês and Tremblay, Safae Essafi and Fox, Jason G. and René, Gabriel and Friston, Karl and Ramstead, Maxwell J. D.},
date = {2023-06-06},
eprint = {2306.04025},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2306.04025},
urldate = {2023-11-10},
abstract = {This paper investigates the prospect of developing human-interpretable, explainable artificial intelligence (AI) systems based on active inference and the free energy principle. We first provide a brief overview of active inference, and in particular, of how it applies to the modeling of decision-making, introspection, as well as the generation of overt and covert actions. We then discuss how active inference can be leveraged to design explainable AI systems, namely, by allowing us to model core features of ``introspective'' processes and by generating useful, human-interpretable models of the processes involved in decision-making. We propose an architecture for explainable AI systems using active inference. This architecture foregrounds the role of an explicit hierarchical generative model, the operation of which enables the AI system to track and explain the factors that contribute to its own decisions, and whose structure is designed to be interpretable and auditable by human users. We outline how this architecture can integrate diverse sources of information to make informed decisions in an auditable manner, mimicking or reproducing aspects of human-like consciousness and introspection. Finally, we discuss the implications of our findings for future research in AI, and the potential ethical considerations of developing AI systems with (the appearance of) introspective capabilities.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence},
file = {/Users/andrew/Zotero/storage/7RVVJ5Z5/Albarracin et al. - 2023 - Designing explainable artificial intelligence with.pdf;/Users/andrew/Zotero/storage/NGI5TUX7/2306.html}
}
@book{alevenIntelligentTutoringSystems2010,
title = {Intelligent {{Tutoring Systems}}: 10th {{International Conference}}, {{ITS}} 2010, {{Pittsburgh}}, {{PA}}, {{USA}}, {{June}} 14-18, 2010, {{Proceedings}}, {{Part II}}},
shorttitle = {Intelligent {{Tutoring Systems}}},
editor = {Aleven, Vincent and Kay, Judy and Mostow, Jack},
editora = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard},
editoratype = {redactor},
date = {2010},
series = {Lecture {{Notes}} in {{Computer Science}}},
volume = {6095},
publisher = {Springer},
location = {Berlin, Heidelberg},
doi = {10.1007/978-3-642-13437-1},
url = {http://link.springer.com/10.1007/978-3-642-13437-1},
urldate = {2023-11-29},
isbn = {978-3-642-13436-4 978-3-642-13437-1},
langid = {english},
keywords = {adaptive mobile learning,adaptive systems,affect recognition,affective computing,agents,assessment,Augmented Reality,authoring tools,cognition,computer assisted learning,data mining,design patterns,e-learning,learning,Scaffolding},
file = {/Users/andrew/Zotero/storage/QGLK6IBM/Aleven et al_2010_Intelligent Tutoring Systems.pdf}
}
@article{alshurafatFactorsAffectingAccounting2023,
title = {Factors Affecting Accounting Students’ Misuse of {{ChatGPT}}: An Application of the Fraud Triangle Theory},
shorttitle = {Factors Affecting Accounting Students’ Misuse of {{ChatGPT}}},
author = {Alshurafat, Hashem and Al Shbail, Mohannad Obeid and Hamdan, Allam and Al-Dmour, Ahmad and Ensour, Waed},
date = {2023-01-01},
journaltitle = {Journal of Financial Reporting and Accounting},
volume = {ahead-of-print},
issn = {1985-2517},
doi = {10.1108/JFRA-04-2023-0182},
url = {https://doi.org/10.1108/JFRA-04-2023-0182},
urldate = {2024-01-29},
abstract = {Purpose This study aims to explore the factors that contribute to student academic dishonesty through an examination of the misuse of AI language models. Using the fraud triangle theory, which posits that opportunity, rationalization and pressure are key factors for fraudulent behavior, this study investigates how these elements interact and contribute to academic dishonesty among students. Design/methodology/approach In this study, data on how accounting students used ChatGPT to cheat was acquired from 279 accounting students in Jordanian public universities over the course of two months, from January 2023 to March 2023, through previously tested and validated questionnaires. The main tool for gathering data was a questionnaire distributed online using Microsoft Forms. Findings The results show that all of the fraud triangle factors are significant determinants of student academic dishonesty and student misuse of ChatGPT. The findings of this research can be used to guide the development of technology-based preventative measures. Originality/value This study provides valuable insights into the motivations and factors that drive students to engage in academic dishonesty and sheds light on the broader issue of technology-assisted academic dishonesty and its impact on the educational system. This study’s contribution is significant, as it sheds light on a pressing issue in education and provides valuable information for educators and policymakers to address the problem and improve academic standards.},
issue = {ahead-of-print},
keywords = {Academic dishonesty,Academic integrity,AI language models,ChatGPT,Fraud triangle theory,Technology-assisted cheating},
file = {/Users/andrew/Zotero/storage/C54TP6F5/html.html}
}
@online{amatriainTransformerModelsIntroduction2023,
title = {Transformer Models: An Introduction and Catalog},
shorttitle = {Transformer Models},
author = {Amatriain, Xavier},
date = {2023-02-11},
eprint = {2302.07730},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2302.07730},
url = {http://arxiv.org/abs/2302.07730},
urldate = {2023-02-16},
abstract = {In the past few years we have seen the meteoric appearance of dozens of models of the Transformer family, all of which have funny, but not self-explanatory, names. The goal of this paper is to offer a somewhat comprehensive but simple catalog and classification of the most popular Transformer models. The paper also includes an introduction to the most important aspects and innovation in Transformer models.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/94ENBB72/Amatriain_2023_Transformer models.pdf;/Users/andrew/Zotero/storage/XCL2MGKM/2302.html}
}
@online{andreasLanguageModelsAgent2022,
title = {Language {{Models}} as {{Agent Models}}},
author = {Andreas, Jacob},
date = {2022-12-03},
eprint = {2212.01681},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2212.01681},
url = {http://arxiv.org/abs/2212.01681},
urldate = {2024-09-04},
abstract = {Language models (LMs) are trained on collections of documents, written by individual human agents to achieve specific goals in an outside world. During training, LMs have access only to text of these documents, with no direct evidence of the internal states of the agents that produced them -- a fact often used to argue that LMs are incapable of modeling goal-directed aspects of human language production and comprehension. Can LMs trained on text learn anything at all about the relationship between language and use? I argue that LMs are models of intentional communication in a specific, narrow sense. When performing next word prediction given a textual context, an LM can infer and represent properties of an agent likely to have produced that context. These representations can in turn influence subsequent LM generation in the same way that agents' communicative intentions influence their language. I survey findings from the recent literature showing that -- even in today's non-robust and error-prone models -- LMs infer and use representations of fine-grained communicative intentions and more abstract beliefs and goals. Despite the limited nature of their training data, they can thus serve as building blocks for systems that communicate and act intentionally.},
pubstate = {prepublished},
keywords = {Computer Science - Computation and Language,Computer Science - Multiagent Systems},
file = {/Users/andrew/Zotero/storage/F4CTEGC3/Andreas - 2022 - Language Models as Agent Models.pdf;/Users/andrew/Zotero/storage/9Q9UNPL7/2212.html}
}
@online{andukuriSTaRGATETeachingLanguage2024,
title = {{{STaR-GATE}}: {{Teaching Language Models}} to {{Ask Clarifying Questions}}},
shorttitle = {{{STaR-GATE}}},
author = {Andukuri, Chinmaya and Fränken, Jan-Philipp and Gerstenberg, Tobias and Goodman, Noah D.},
date = {2024-08-07},
eprint = {2403.19154},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2403.19154},
url = {http://arxiv.org/abs/2403.19154},
urldate = {2024-08-16},
abstract = {When prompting language models to complete a task, users often leave important aspects unsaid. While asking questions could resolve this ambiguity (GATE; Li et al., 2023), models often struggle to ask good questions. We explore a language model's ability to self-improve (STaR; Zelikman et al., 2022) by rewarding the model for generating useful questions-a simple method we dub STaR-GATE. We generate a synthetic dataset of 25,500 unique persona-task prompts to simulate conversations between a pretrained language model-the Questioner-and a Roleplayer whose preferences are unknown to the Questioner. By asking questions, the Questioner elicits preferences from the Roleplayer. The Questioner is iteratively finetuned on questions that increase the probability of high-quality responses to the task, which are generated by an Oracle with access to the Roleplayer's latent preferences. After two iterations of self-improvement, the Questioner asks better questions, allowing it to generate responses that are preferred over responses from the initial model on 72\% of tasks. Our results indicate that teaching a language model to ask better questions leads to better personalized responses.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/MYCXAU5S/Andukuri et al. - 2024 - STaR-GATE Teaching Language Models to Ask Clarifying Questions.pdf;/Users/andrew/Zotero/storage/874GD5A3/2403.html}
}
@online{angelopoulosGentleIntroductionConformal2022,
title = {A {{Gentle Introduction}} to {{Conformal Prediction}} and {{Distribution-Free Uncertainty Quantification}}},
author = {Angelopoulos, Anastasios N. and Bates, Stephen},
date = {2022-12-07},
eprint = {2107.07511},
eprinttype = {arXiv},
eprintclass = {cs, math, stat},
url = {http://arxiv.org/abs/2107.07511},
urldate = {2023-07-28},
abstract = {Black-box machine learning models are now routinely used in high-risk settings, like medical diagnostics, which demand uncertainty quantification to avoid consequential model failures. Conformal prediction (a.k.a. conformal inference) is a user-friendly paradigm for creating statistically rigorous uncertainty sets/intervals for the predictions of such models. Critically, the sets are valid in a distribution-free sense: they possess explicit, non-asymptotic guarantees even without distributional assumptions or model assumptions. One can use conformal prediction with any pre-trained model, such as a neural network, to produce sets that are guaranteed to contain the ground truth with a user-specified probability, such as 90\%. It is easy-to-understand, easy-to-use, and general, applying naturally to problems arising in the fields of computer vision, natural language processing, deep reinforcement learning, and so on.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Mathematics - Statistics Theory,Statistics - Machine Learning,Statistics - Methodology},
file = {/Users/andrew/Zotero/storage/N9EX8LDW/Angelopoulos and Bates - 2022 - A Gentle Introduction to Conformal Prediction and .pdf}
}
@online{arkoudasGPT4CanReason2023,
title = {{{GPT-4 Can}}'t {{Reason}}},
author = {Arkoudas, Konstantine},
date = {2023-08-10},
eprint = {2308.03762},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2308.03762},
url = {http://arxiv.org/abs/2308.03762},
urldate = {2023-09-25},
abstract = {GPT-4 was released in March 2023 to wide acclaim, marking a very substantial improvement across the board over GPT-3.5 (OpenAI's previously best model, which had powered the initial release of ChatGPT). However, despite the genuinely impressive improvement, there are good reasons to be highly skeptical of GPT-4's ability to reason. This position paper discusses the nature of reasoning; criticizes the current formulation of reasoning problems in the NLP community, as well as the way in which LLM reasoning performance is currently evaluated; introduces a small collection of 21 diverse reasoning problems; and performs a detailed qualitative evaluation of GPT-4's performance on those problems. Based on this analysis, the paper concludes that, despite its occasional flashes of analytical brilliance, GPT-4 at present is utterly incapable of reasoning.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/3TSKKKLN/Arkoudas_2023_GPT-4 Can't Reason.pdf;/Users/andrew/Zotero/storage/SLBD9E4Z/2308.html}
}
@article{asadiRippleConceptBasedInterpretation,
title = {Ripple: {{Concept-Based Interpretation}} for {{Raw Time Series Models}} in {{Education}}},
author = {Asadi, Mohammad and Swamy, Vinitra and Frej, Jibril and Vignoud, Julien and Marras, Mirko and Kaser, Tanja},
abstract = {Time series is the most prevalent form of input data for educational prediction tasks. The vast majority of research using time series data focuses on hand-crafted features, designed by experts for predictive performance and interpretability. However, extracting these features is labor-intensive for humans and computers. In this paper, we propose an approach that utilizes irregular multivariate time series modeling with graph neural networks to achieve comparable or better accuracy with raw time series clickstreams in comparison to handcrafted features. Furthermore, we extend concept activation vectors for interpretability in raw time series models. We analyze these advances in the education domain, addressing the task of early student performance prediction for downstream targeted interventions and instructional support. Our experimental analysis on 23 MOOCs with millions of combined interactions over six behavioral dimensions show that models designed with our approach can (i) beat state-of-the-art educational time series baselines with no feature extraction and (ii) provide interpretable insights for personalized interventions. Source code: https://github.com/epfl-ml4ed/ripple/.},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/E4TQA4KF/Asadi et al. - Ripple Concept-Based Interpretation for Raw Time .pdf}
}
@online{azarGeneralTheoreticalParadigm2023,
title = {A {{General Theoretical Paradigm}} to {{Understand Learning}} from {{Human Preferences}}},
author = {Azar, Mohammad Gheshlaghi and Rowland, Mark and Piot, Bilal and Guo, Daniel and Calandriello, Daniele and Valko, Michal and Munos, Rémi},
date = {2023-11-21},
eprint = {2310.12036},
eprinttype = {arXiv},
eprintclass = {cs, stat},
doi = {10.48550/arXiv.2310.12036},
url = {http://arxiv.org/abs/2310.12036},
urldate = {2023-11-23},
abstract = {The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learn directly a policy from collected data without the reward modelling stage. However, this method still heavily relies on the first approximation. In this paper we try to gain a deeper theoretical understanding of these practical algorithms. In particular we derive a new general objective called \$\textbackslash Psi\$PO for learning from human preferences that is expressed in terms of pairwise preferences and therefore bypasses both approximations. This new general objective allows us to perform an in-depth analysis of the behavior of RLHF and DPO (as special cases of \$\textbackslash Psi\$PO) and to identify their potential pitfalls. We then consider another special case for \$\textbackslash Psi\$PO by setting \$\textbackslash Psi\$ simply to Identity, for which we can derive an efficient optimisation procedure, prove performance guarantees and demonstrate its empirical superiority to DPO on some illustrative examples.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Statistics - Machine Learning},
file = {/Users/andrew/Zotero/storage/RNUU5HAU/Azar et al_2023_A General Theoretical Paradigm to Understand Learning from Human Preferences.pdf;/Users/andrew/Zotero/storage/T8USN539/2310.html}
}
@article{badrinathPyBKTAccessiblePython,
title = {{{pyBKT}}: {{An Accessible Python Library}} of {{Bayesian Knowledge Tracing Models}}},
author = {Badrinath, Anirudhan and Wang, Frederic and Pardos, Zachary},
abstract = {Bayesian Knowledge Tracing, a model used for cognitive mastery estimation, has been a hallmark of adaptive learning research and an integral component of deployed intelligent tutoring systems (ITS). In this paper, we provide a brief history of knowledge tracing model research and introduce pyBKT, an accessible and computationally efficient library of model extensions from the literature. The library provides data generation, fitting, prediction, and cross-validation routines, as well as a simple to use data helper interface to ingest typical tutor log dataset formats. We evaluate the runtime with various dataset sizes and compare to past implementations. Additionally, we conduct sanity checks of the model using experiments with simulated data to evaluate the accuracy of its EM parameter learning and use real-world data to validate its predictions, comparing pyBKT’s supported model variants with results from the papers in which they were originally introduced. The library is open source and open license for the purpose of making knowledge tracing more accessible to communities of research and practice and to facilitate progress in the field through easier replication of past approaches.},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/EX6WHJBD/Badrinath et al. - pyBKT An Accessible Python Library of Bayesian Kn.pdf}
}
@online{bastaniGenerativeAICan2024,
type = {SSRN Scholarly Paper},
title = {Generative {{AI Can Harm Learning}}},
author = {Bastani, Hamsa and Bastani, Osbert and Sungu, Alp and Ge, Haosen and Kabakcı, Özge and Mariman, Rei},
date = {2024-07-15},
number = {4895486},
location = {Rochester, NY},
doi = {10.2139/ssrn.4895486},
url = {https://papers.ssrn.com/abstract=4895486},
urldate = {2024-07-26},
abstract = {Generative artificial intelligence (AI) is poised to revolutionize how humans work, and has already demonstrated promise in significantly improving human productivity. However, a key remaining question is how generative AI affects learning, namely, how humans acquire new skills as they perform tasks. This kind of skill learning is critical to long-term productivity gains, especially in domains where generative AI is fallible and human experts must check its outputs. We study the impact of generative AI, specifically OpenAI's GPT-4, on human learning in the context of math classes at a high school. In a field experiment involving nearly a thousand students, we have deployed and evaluated two GPT based tutors, one that mimics a standard ChatGPT interface (called GPT Base) and one with prompts designed to safeguard learning (called GPT Tutor). These tutors comprise about 15\% of the curriculum in each of three grades. Consistent with prior work, our results show that access to GPT-4 significantly improves performance (48\% improvement for GPT Base and 127\% for GPT Tutor). However, we additionally find that when access is subsequently taken away, students actually perform worse than those who never had access (17\% reduction for GPT Base). That is, access to GPT-4 can harm educational outcomes. These negative learning effects are largely mitigated by the safeguards included in GPT Tutor. Our results suggest that students attempt to use GPT-4 as a "crutch" during practice problem sessions, and when successful, perform worse on their own. Thus, to maintain long-term productivity, we must be cautious when deploying generative AI to ensure humans continue to learn critical skills. * HB, OB, and AS contributed equally},
langid = {english},
pubstate = {prepublished},
keywords = {Education,Generative AI,Human Capital Development,Human-AI Collaboration,Large Language Models},
file = {/Users/andrew/Zotero/storage/KUWTQ4RR/Bastani et al. - 2024 - Generative AI Can Harm Learning.pdf}
}
@article{battagliaSimulationEnginePhysical2013,
title = {Simulation as an Engine of Physical Scene Understanding},
author = {Battaglia, Peter W. and Hamrick, Jessica B. and Tenenbaum, Joshua B.},
date = {2013-11-05},
journaltitle = {Proceedings of the National Academy of Sciences},
shortjournal = {PNAS},
volume = {110},
number = {45},
eprint = {24145417},
eprinttype = {pmid},
pages = {18327--18332},
issn = {0027-8424, 1091-6490},
doi = {10.1073/pnas.1306572110},
url = {https://www.pnas.org/content/110/45/18327},
urldate = {2019-11-23},
abstract = {In a glance, we can perceive whether a stack of dishes will topple, a branch will support a child’s weight, a grocery bag is poorly packed and liable to tear or crush its contents, or a tool is firmly attached to a table or free to be lifted. Such rapid physical inferences are central to how people interact with the world and with each other, yet their computational underpinnings are poorly understood. We propose a model based on an “intuitive physics engine,” a cognitive mechanism similar to computer engines that simulate rich physics in video games and graphics, but that uses approximate, probabilistic simulations to make robust and fast inferences in complex natural scenes where crucial information is unobserved. This single model fits data from five distinct psychophysical tasks, captures several illusions and biases, and explains core aspects of human mental models and common-sense reasoning that are instrumental to how humans understand their everyday world.},
langid = {english},
file = {/Users/andrew/Zotero/storage/GMGKJPQF/Battaglia et al. - 2013 - Simulation as an engine of physical scene understa.pdf;/Users/andrew/Zotero/storage/HRW9JRML/Battaglia et al. - 2013 - Simulation as an engine of physical scene understa.pdf;/Users/andrew/Zotero/storage/8M4WCSDG/18327.html}
}
@online{battleUnreasonableEffectivenessEccentric2024,
title = {The {{Unreasonable Effectiveness}} of {{Eccentric Automatic Prompts}}},
author = {Battle, Rick and Gollapudi, Teja},
date = {2024-02-20},
eprint = {2402.10949},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2402.10949},
url = {http://arxiv.org/abs/2402.10949},
urldate = {2024-04-25},
abstract = {Large Language Models (LLMs) have demonstrated remarkable problem-solving and basic mathematics abilities. However, their efficacy is highly contingent on the formulation of the prompt. This study endeavors to quantify the influence of incorporating "positive thinking" into the system message of the prompt, then compare that to systematic prompt optimization. We assess the performance of 60 combinations of system message snippets, tested with and without Chain of Thought prompting, across three models with parameters ranging from 7 to 70 billion on the GSM8K dataset. Our findings reveal that results do not universally generalize across models. In most instances, the inclusion of "positive thinking" prompts positively affected model performance. Notably, however, Llama2-70B exhibited an exception when not utilizing Chain of Thought, as the optimal system message was found to be none at all. Given the combinatorial complexity, and thus computation time, of experimenting with hand-tuning prompts for large black-box models, we then compared the performance of the best "positive thinking" prompt against the output of systematic prompt optimization. We show that employing an automated prompt optimizer emerges as the most effective method for enhancing performance, even when working with smaller open-source models. Additionally, our findings reveal that the highest-scoring, automatically-optimized prompt exhibits a degree of peculiarity far beyond expectations.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/DK5CF28L/Battle_Gollapudi_2024_The Unreasonable Effectiveness of Eccentric Automatic Prompts.pdf;/Users/andrew/Zotero/storage/TKJANXL2/2402.html}
}
@online{battleUnreasonableEffectivenessEccentric2024b,
title = {The {{Unreasonable Effectiveness}} of {{Eccentric Automatic Prompts}}},
author = {Battle, Rick and Gollapudi, Teja},
date = {2024-02-20},
eprint = {2402.10949},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2402.10949},
url = {http://arxiv.org/abs/2402.10949},
urldate = {2024-09-05},
abstract = {Large Language Models (LLMs) have demonstrated remarkable problem-solving and basic mathematics abilities. However, their efficacy is highly contingent on the formulation of the prompt. This study endeavors to quantify the influence of incorporating "positive thinking" into the system message of the prompt, then compare that to systematic prompt optimization. We assess the performance of 60 combinations of system message snippets, tested with and without Chain of Thought prompting, across three models with parameters ranging from 7 to 70 billion on the GSM8K dataset. Our findings reveal that results do not universally generalize across models. In most instances, the inclusion of "positive thinking" prompts positively affected model performance. Notably, however, Llama2-70B exhibited an exception when not utilizing Chain of Thought, as the optimal system message was found to be none at all. Given the combinatorial complexity, and thus computation time, of experimenting with hand-tuning prompts for large black-box models, we then compared the performance of the best "positive thinking" prompt against the output of systematic prompt optimization. We show that employing an automated prompt optimizer emerges as the most effective method for enhancing performance, even when working with smaller open-source models. Additionally, our findings reveal that the highest-scoring, automatically-optimized prompt exhibits a degree of peculiarity far beyond expectations.},
pubstate = {prepublished},
version = {2},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/NPCN4SRY/Battle and Gollapudi - 2024 - The Unreasonable Effectiveness of Eccentric Automatic Prompts.pdf}
}
@online{bellerCounterfactualSimulationModel2023,
title = {A Counterfactual Simulation Model of Causal Language},
author = {Beller, Ari and Gerstenberg, Tobias},
date = {2023-07-04T20:30:01},
doi = {10.31234/osf.io/xv8hf},
url = {https://psyarxiv.com/xv8hf/},
urldate = {2023-07-05},
abstract = {The words we use to describe what happened shape the story a listener imagines. How do speakers choose what causal expression to use? How does that impact what listeners infer about what happened? In this paper, we develop a computational model of how people use the causal expressions "caused", "enabled", "affected", and "made no difference". The model first builds a causal representation of what happened. By running counterfactual simulations, the model computes causal aspects that capture the different ways in which a candidate cause made a difference to the outcome. Logical combinations of these aspects define a semantics for the different causal expressions. The model then uses pragmatic inference favoring informative utterances to decide what word to use in context. We test our model in a series of experiments. In a set of psycholinguistic studies, we verify semantic and pragmatic assumptions of our model. We show that the causal expressions exist on a hierarchy of informativeness, and that participants draw informative pragmatic inferences in line with this scale. In the next two studies, we demonstrate that our model quantitatively fits participant behavior in a speaker task and a listener task involving dynamic physical scenarios. We compare our model to two lesioned alternatives, one which removes the pragmatic inference component, and another which additionally removes the semantics of the causal expressions. Our full model better accounts for participants' behavior than both alternatives, suggesting that causal knowledge, semantics, and pragmatics are all important for understanding how people produce and comprehend causal language.},
langid = {american},
pubstate = {prepublished},
keywords = {/unread,causality,Cognitive Psychology,Concepts and Categories,counterfactuals,intuitive physics,Judgment and Decision Making,Language,mental simulation,pragmatics,Reasoning,semantics,Social and Behavioral Sciences},
file = {/Users/andrew/Zotero/storage/ZYMRB99E/Beller_Gerstenberg_2023_A counterfactual simulation model of causal language.pdf}
}
@online{belyiLunaEvaluationFoundation2024,
title = {Luna: {{An Evaluation Foundation Model}} to {{Catch Language Model Hallucinations}} with {{High Accuracy}} and {{Low Cost}}},
shorttitle = {Luna},
author = {Belyi, Masha and Friel, Robert and Shao, Shuai and Sanyal, Atindriyo},
date = {2024-06-05},
eprint = {2406.00975},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2406.00975},
url = {http://arxiv.org/abs/2406.00975},
urldate = {2024-06-13},
abstract = {Retriever Augmented Generation (RAG) systems have become pivotal in enhancing the capabilities of language models by incorporating external knowledge retrieval mechanisms. However, a significant challenge in deploying these systems in industry applications is the detection and mitigation of hallucinations: instances where the model generates information that is not grounded in the retrieved context. Addressing this issue is crucial for ensuring the reliability and accuracy of responses generated by large language models (LLMs) in diverse industry settings. Current hallucination detection techniques fail to deliver accuracy, low latency, and low cost simultaneously. We introduce Luna: a DeBERTA-large (440M) encoder, finetuned for hallucination detection in RAG settings. We demonstrate that Luna outperforms GPT-3.5 and commercial evaluation frameworks on the hallucination detection task, with 97\% and 91\% reduction in cost and latency, respectively. Luna is lightweight and generalizes across multiple industry verticals and out-of-domain data, making it an ideal candidate for industry LLM applications.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/TTHKFSV6/Belyi et al_2024_Luna.pdf;/Users/andrew/Zotero/storage/X9BTI9U3/2406.html}
}
@online{bestaGraphThoughtsSolving2023,
title = {Graph of {{Thoughts}}: {{Solving Elaborate Problems}} with {{Large Language Models}}},
shorttitle = {Graph of {{Thoughts}}},
author = {Besta, Maciej and Blach, Nils and Kubicek, Ales and Gerstenberger, Robert and Gianinazzi, Lukas and Gajda, Joanna and Lehmann, Tomasz and Podstawski, Michal and Niewiadomski, Hubert and Nyczyk, Piotr and Hoefler, Torsten},
date = {2023-08-21},
eprint = {2308.09687},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2308.09687},
url = {http://arxiv.org/abs/2308.09687},
urldate = {2023-08-29},
abstract = {We introduce Graph of Thoughts (GoT): a framework that advances prompting capabilities in large language models (LLMs) beyond those offered by paradigms such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary advantage of GoT is the ability to model the information generated by an LLM as an arbitrary graph, where units of information ("LLM thoughts") are vertices, and edges correspond to dependencies between these vertices. This approach enables combining arbitrary LLM thoughts into synergistic outcomes, distilling the essence of whole networks of thoughts, or enhancing thoughts using feedback loops. We illustrate that GoT offers advantages over state of the art on different tasks, for example increasing the quality of sorting by 62\% over ToT, while simultaneously reducing costs by {$>$}31\%. We ensure that GoT is extensible with new thought transformations and thus can be used to spearhead new prompting schemes. This work brings the LLM reasoning closer to human thinking or brain mechanisms such as recurrence, both of which form complex networks.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/8Y9XLWYG/Besta et al_2023_Graph of Thoughts.pdf;/Users/andrew/Zotero/storage/LLTA6BCG/2308.html}
}
@article{bin-nashwanUseChatGPTAcademia2023,
title = {Use of {{ChatGPT}} in Academia: {{Academic}} Integrity Hangs in the Balance},
shorttitle = {Use of {{ChatGPT}} in Academia},
author = {Bin-Nashwan, Saeed Awadh and Sadallah, Mouad and Bouteraa, Mohamed},
date = {2023-11-01},
journaltitle = {Technology in Society},
shortjournal = {Technology in Society},
volume = {75},
pages = {102370},
issn = {0160-791X},
doi = {10.1016/j.techsoc.2023.102370},
url = {https://www.sciencedirect.com/science/article/pii/S0160791X23001756},
urldate = {2024-04-15},
abstract = {In today's academic world, some academicians, researchers and students have begun employing Artificial Intelligence (AI) language models, e.g., ChatGPT, in completing a variety of academic tasks, including generating ideas, summarising literature, and essay writing. However, the use of ChatGPT in academic settings is a controversial issue, leading to a severe concern about academic integrity and AI-assisted cheating, while scholarly communities still lack clear principles on using such innovation in academia. Accordingly, this study aims to understand the motivations driving academics and researchers to use ChatGPT in their work, and specifically the role of academic integrity in making up adoption behavior. Based on 702 responses retrieved from users of ResearchGate and Academia.edu, we found that ChatGPT usage is positively shaped by time-saving feature, e-word of mouth, academic self-efficacy, academic self-esteem, and perceived stress. In contrast, peer influence and academic integrity had a negative effect on usage. Intriguingly, academic integrity-moderated interactions of time-saving, self-esteem and perceived stress on ChatGPT usage are found to be significantly positive. Therefore, we suggest that stakeholders, including academic institutions, publishers and AI language models' programmers, should work together to specify necessary guidelines for the ethical use of AI chatbots in academic work and research.},
keywords = {Academia,Academic integrity,Artificial intelligence,ChatGPT,Plagiarism,Technology adoption},
file = {/Users/andrew/Zotero/storage/C6DGJBEB/S0160791X23001756.html}
}
@article{binzUsingCognitivePsychology2023,
title = {Using Cognitive Psychology to Understand {{GPT-3}}},
author = {Binz, Marcel and Schulz, Eric},
date = {2023-02-07},
journaltitle = {Proceedings of the National Academy of Sciences},
volume = {120},
number = {6},
pages = {e2218523120},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.2218523120},
url = {https://www.pnas.org/doi/10.1073/pnas.2218523120},
urldate = {2023-02-11},
abstract = {We study GPT-3, a recent large language model, using tools from cognitive psychology. More specifically, we assess GPT-3’s decision-making, information search, deliberation, and causal reasoning abilities on a battery of canonical experiments from the literature. We find that much of GPT-3’s behavior is impressive: It solves vignette-based tasks similarly or better than human subjects, is able to make decent decisions from descriptions, outperforms humans in a multiarmed bandit task, and shows signatures of model-based reinforcement learning. Yet, we also find that small perturbations to vignette-based tasks can lead GPT-3 vastly astray, that it shows no signatures of directed exploration, and that it fails miserably in a causal reasoning task. Taken together, these results enrich our understanding of current large language models and pave the way for future investigations using tools from cognitive psychology to study increasingly capable and opaque artificial agents.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/2DE7KRJ8/Binz_Schulz_2023_Using cognitive psychology to understand GPT-3.pdf}
}
@article{binzUsingCognitivePsychology2023a,
title = {Using Cognitive Psychology to Understand {{GPT-3}}},
author = {Binz, Marcel and Schulz, Eric},
date = {2023-02-07},
journaltitle = {Proceedings of the National Academy of Sciences},
volume = {120},
number = {6},
pages = {e2218523120},
publisher = {Proceedings of the National Academy of Sciences},
doi = {10.1073/pnas.2218523120},
url = {https://www.pnas.org/doi/10.1073/pnas.2218523120},
urldate = {2023-03-25},
abstract = {We study GPT-3, a recent large language model, using tools from cognitive psychology. More specifically, we assess GPT-3’s decision-making, information search, deliberation, and causal reasoning abilities on a battery of canonical experiments from the literature. We find that much of GPT-3’s behavior is impressive: It solves vignette-based tasks similarly or better than human subjects, is able to make decent decisions from descriptions, outperforms humans in a multiarmed bandit task, and shows signatures of model-based reinforcement learning. Yet, we also find that small perturbations to vignette-based tasks can lead GPT-3 vastly astray, that it shows no signatures of directed exploration, and that it fails miserably in a causal reasoning task. Taken together, these results enrich our understanding of current large language models and pave the way for future investigations using tools from cognitive psychology to study increasingly capable and opaque artificial agents.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/J823HK9W/Binz_Schulz_2023_Using cognitive psychology to understand GPT-3.pdf}
}
@article{birhaneLargeModelsWhat2024,
title = {Large Models of What? {{Mistaking}} Engineering Achievements for Human Linguistic Agency},
shorttitle = {Large Models of What?},
author = {Birhane, Abeba and McGann, Marek},
date = {2024-11-01},
journaltitle = {Language Sciences},
shortjournal = {Language Sciences},
volume = {106},
pages = {101672},
issn = {0388-0001},
doi = {10.1016/j.langsci.2024.101672},
url = {https://www.sciencedirect.com/science/article/pii/S0388000124000615},
urldate = {2024-08-31},
abstract = {In this paper we argue that key, often sensational and misleading, claims regarding linguistic capabilities of Large Language Models (LLMs) are based on at least two unfounded assumptions: the assumption of language completeness and the assumption of data completeness. Language completeness assumes that a distinct and complete thing such as “a natural language” exists, the essential characteristics of which can be effectively and comprehensively modelled by an LLM. The assumption of data completeness relies on the belief that a language can be quantified and wholly captured by data. Work within the enactive approach to cognitive science makes clear that, rather than a distinct and complete thing, language is a means or way of acting. Languaging is not the kind of thing that can admit of a complete or comprehensive modelling. From an enactive perspective we identify three key characteristics of enacted language; embodiment, participation, and precariousness, that are absent in LLMs, and likely incompatible in principle with current architectures. We argue that these absences imply that LLMs are not now and cannot in their present form be linguistic agents the way humans are. We illustrate the point in particular through the phenomenon of “algospeak”, a recently described pattern of high-stakes human language activity in heavily controlled online environments. On the basis of these points, we conclude that sensational and misleading claims about LLM agency and capabilities emerge from a deep misconception of both what human language is and what LLMs are.},
keywords = {Agency,Algospeak,Embodiment,Enaction,Language,Large language models,Precariousness,Precarity},
file = {/Users/andrew/Zotero/storage/TQEGURUB/S0388000124000615.html}
}
@online{bommasaniOpportunitiesRisksFoundation2022,
title = {On the {{Opportunities}} and {{Risks}} of {{Foundation Models}}},
author = {Bommasani, Rishi and Hudson, Drew A. and Adeli, Ehsan and Altman, Russ and Arora, Simran and family=Arx, given=Sydney, prefix=von, useprefix=true and Bernstein, Michael S. and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and Brynjolfsson, Erik and Buch, Shyamal and Card, Dallas and Castellon, Rodrigo and Chatterji, Niladri and Chen, Annie and Creel, Kathleen and Davis, Jared Quincy and Demszky, Dora and Donahue, Chris and Doumbouya, Moussa and Durmus, Esin and Ermon, Stefano and Etchemendy, John and Ethayarajh, Kawin and Fei-Fei, Li and Finn, Chelsea and Gale, Trevor and Gillespie, Lauren and Goel, Karan and Goodman, Noah and Grossman, Shelby and Guha, Neel and Hashimoto, Tatsunori and Henderson, Peter and Hewitt, John and Ho, Daniel E. and Hong, Jenny and Hsu, Kyle and Huang, Jing and Icard, Thomas and Jain, Saahil and Jurafsky, Dan and Kalluri, Pratyusha and Karamcheti, Siddharth and Keeling, Geoff and Khani, Fereshte and Khattab, Omar and Koh, Pang Wei and Krass, Mark and Krishna, Ranjay and Kuditipudi, Rohith and Kumar, Ananya and Ladhak, Faisal and Lee, Mina and Lee, Tony and Leskovec, Jure and Levent, Isabelle and Li, Xiang Lisa and Li, Xuechen and Ma, Tengyu and Malik, Ali and Manning, Christopher D. and Mirchandani, Suvir and Mitchell, Eric and Munyikwa, Zanele and Nair, Suraj and Narayan, Avanika and Narayanan, Deepak and Newman, Ben and Nie, Allen and Niebles, Juan Carlos and Nilforoshan, Hamed and Nyarko, Julian and Ogut, Giray and Orr, Laurel and Papadimitriou, Isabel and Park, Joon Sung and Piech, Chris and Portelance, Eva and Potts, Christopher and Raghunathan, Aditi and Reich, Rob and Ren, Hongyu and Rong, Frieda and Roohani, Yusuf and Ruiz, Camilo and Ryan, Jack and Ré, Christopher and Sadigh, Dorsa and Sagawa, Shiori and Santhanam, Keshav and Shih, Andy and Srinivasan, Krishnan and Tamkin, Alex and Taori, Rohan and Thomas, Armin W. and Tramèr, Florian and Wang, Rose E. and Wang, William and Wu, Bohan and Wu, Jiajun and Wu, Yuhuai and Xie, Sang Michael and Yasunaga, Michihiro and You, Jiaxuan and Zaharia, Matei and Zhang, Michael and Zhang, Tianyi and Zhang, Xikun and Zhang, Yuhui and Zheng, Lucia and Zhou, Kaitlyn and Liang, Percy},
date = {2022-07-12},
eprint = {2108.07258},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2108.07258},
url = {http://arxiv.org/abs/2108.07258},
urldate = {2023-11-01},
abstract = {AI is undergoing a paradigm shift with the rise of models (e.g., BERT, DALL-E, GPT-3) that are trained on broad data at scale and are adaptable to a wide range of downstream tasks. We call these models foundation models to underscore their critically central yet incomplete character. This report provides a thorough account of the opportunities and risks of foundation models, ranging from their capabilities (e.g., language, vision, robotics, reasoning, human interaction) and technical principles (e.g., model architectures, training procedures, data, systems, security, evaluation, theory) to their applications (e.g., law, healthcare, education) and societal impact (e.g., inequity, misuse, economic and environmental impact, legal and ethical considerations). Though foundation models are based on standard deep learning and transfer learning, their scale results in new emergent capabilities, and their effectiveness across so many tasks incentivizes homogenization. Homogenization provides powerful leverage but demands caution, as the defects of the foundation model are inherited by all the adapted models downstream. Despite the impending widespread deployment of foundation models, we currently lack a clear understanding of how they work, when they fail, and what they are even capable of due to their emergent properties. To tackle these questions, we believe much of the critical research on foundation models will require deep interdisciplinary collaboration commensurate with their fundamentally sociotechnical nature.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/ER6APDWA/Bommasani et al_2022_On the Opportunities and Risks of Foundation Models.pdf;/Users/andrew/Zotero/storage/ZF3UG93I/2108.html}
}
@online{borazjanizadehReliableReasoningNatural2024,
title = {Reliable {{Reasoning Beyond Natural Language}}},
author = {Borazjanizadeh, Nasim and Piantadosi, Steven T.},
date = {2024-07-16},
eprint = {2407.11373},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2407.11373},
url = {http://arxiv.org/abs/2407.11373},
urldate = {2024-07-19},
abstract = {Despite their linguistic competence, Large Language models (LLMs) often exhibit limitations in their ability to reason reliably and flexibly. To address this, we propose a neurosymbolic approach that prompts LLMs to extract and encode all relevant information from a problem statement as logical code statements, and then use a logic programming language (Prolog) to conduct the iterative computations of explicit deductive reasoning. Our approach significantly enhances the performance of LLMs on the standard mathematical reasoning benchmark, GSM8k, and the Navigate dataset from the BIG-bench dataset. Additionally, we introduce a novel dataset, the Non-Linear Reasoning (NLR) dataset, consisting of 55 unique word problems that target the shortcomings of the next token prediction paradigm of LLMs and require complex non-linear reasoning but only basic arithmetic skills to solve. Our findings demonstrate that the integration of Prolog enables LLMs to achieve high performance on the NLR dataset, which even the most advanced language models (including GPT4) fail to solve using text only.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/86UZBDDJ/Borazjanizadeh and Piantadosi - 2024 - Reliable Reasoning Beyond Natural Language.pdf;/Users/andrew/Zotero/storage/JP3CL4CU/2407.html}
}
@online{borazjanizadehReliableReasoningNatural2024a,
title = {Reliable {{Reasoning Beyond Natural Language}}},
author = {Borazjanizadeh, Nasim and Piantadosi, Steven T.},
date = {2024-07-19},
eprint = {2407.11373},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2407.11373},
url = {http://arxiv.org/abs/2407.11373},
urldate = {2024-08-14},
abstract = {Despite their linguistic competence, Large Language models (LLMs) often exhibit limitations in their ability to reason reliably and flexibly. To address this, we propose a neurosymbolic approach that prompts LLMs to extract and encode all relevant information from a problem statement as logical code statements, and then use a logic programming language (Prolog) to conduct the iterative computations of explicit deductive reasoning. Our approach significantly enhances the performance of LLMs on the standard mathematical reasoning benchmark, GSM8k, and the Navigate dataset from the BIG-bench dataset. Additionally, we introduce a novel dataset, the Non-Linear Reasoning (NLR) dataset, consisting of 55 unique word problems that target the shortcomings of the next token prediction paradigm of LLMs and require complex non-linear reasoning but only basic arithmetic skills to solve. Our findings demonstrate that the integration of Prolog enables LLMs to achieve high performance on the NLR dataset, which even the most advanced language models (including GPT4) fail to solve using text only.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/5C36RKU7/Borazjanizadeh and Piantadosi - 2024 - Reliable Reasoning Beyond Natural Language.pdf;/Users/andrew/Zotero/storage/Z8RHXBYA/2407.html}
}
@online{bosselutCOMETCommonsenseTransformers2019,
title = {{{COMET}}: {{Commonsense Transformers}} for {{Automatic Knowledge Graph Construction}}},
shorttitle = {{{COMET}}},
author = {Bosselut, Antoine and Rashkin, Hannah and Sap, Maarten and Malaviya, Chaitanya and Celikyilmaz, Asli and Choi, Yejin},
date = {2019-06-14},
eprint = {1906.05317},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.1906.05317},
url = {http://arxiv.org/abs/1906.05317},
urldate = {2024-08-26},
abstract = {We present the first comprehensive study on automatic knowledge base construction for two prevalent commonsense knowledge graphs: ATOMIC (Sap et al., 2019) and ConceptNet (Speer et al., 2017). Contrary to many conventional KBs that store knowledge with canonical templates, commonsense KBs only store loosely structured open-text descriptions of knowledge. We posit that an important step toward automatic commonsense completion is the development of generative models of commonsense knowledge, and propose COMmonsEnse Transformers (COMET) that learn to generate rich and diverse commonsense descriptions in natural language. Despite the challenges of commonsense modeling, our investigation reveals promising results when implicit knowledge from deep pre-trained language models is transferred to generate explicit knowledge in commonsense knowledge graphs. Empirical results demonstrate that COMET is able to generate novel knowledge that humans rate as high quality, with up to 77.5\% (ATOMIC) and 91.7\% (ConceptNet) precision at top 1, which approaches human performance for these resources. Our findings suggest that using generative commonsense models for automatic commonsense KB completion could soon be a plausible alternative to extractive methods.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/HZMSC8ZP/Bosselut et al. - 2019 - COMET Commonsense Transformers for Automatic Knowledge Graph Construction.pdf;/Users/andrew/Zotero/storage/49G7F25F/1906.html}
}
@book{bowenTeachingAI2024,
title = {Teaching with {{AI}}},
author = {Bowen, José Antonio and Watson, C. Edward},
date = {2024},
publisher = {Johns Hopkins University Press},
doi = {10.56021/9781421449227},
url = {https://www.press.jhu.edu/books/title/53869/teaching-ai},
urldate = {2024-06-26},
isbn = {978-1-4214-4922-7 978-1-4214-4923-4},
langid = {english}
}
@online{bowmanEightThingsKnow2023,
title = {Eight {{Things}} to {{Know}} about {{Large Language Models}}},
author = {Bowman, Samuel R.},
date = {2023-04-02},
eprint = {2304.00612},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2304.00612},
url = {http://arxiv.org/abs/2304.00612},
urldate = {2023-11-20},
abstract = {The widespread public deployment of large language models (LLMs) in recent months has prompted a wave of new attention and engagement from advocates, policymakers, and scholars from many fields. This attention is a timely response to the many urgent questions that this technology raises, but it can sometimes miss important considerations. This paper surveys the evidence for eight potentially surprising such points: 1. LLMs predictably get more capable with increasing investment, even without targeted innovation. 2. Many important LLM behaviors emerge unpredictably as a byproduct of increasing investment. 3. LLMs often appear to learn and use representations of the outside world. 4. There are no reliable techniques for steering the behavior of LLMs. 5. Experts are not yet able to interpret the inner workings of LLMs. 6. Human performance on a task isn't an upper bound on LLM performance. 7. LLMs need not express the values of their creators nor the values encoded in web text. 8. Brief interactions with LLMs are often misleading.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/6VI46I6I/Bowman_2023_Eight Things to Know about Large Language Models.pdf;/Users/andrew/Zotero/storage/EQDBZXT6/2304.html}
}
@article{bozkurta.SpeculativeFuturesChatGPT2023,
title = {Speculative Futures on {{ChatGPT}} and Generative Artificial Intelligence ({{AI}}): {{A}} Collective Reflection from the Educational Landscape},
shorttitle = {Speculative Futures on {{ChatGPT}} and Generative Artificial Intelligence ({{AI}})},
author = {Bozkurt, A. and Xiao, J. and Lambert, S. and Pazurek, A. and Crompton, H. and Koseoglu, S. and Farrow, R. and Bond, M. and Nerantzi, C. and Honeychurch, S. and Bali, M. and Dron, J. and Mir, K. and Stewart, B. and Costello, E. and Mason, J. and Stracke, C. M. and Romero-Hall, E. and Koutropoulos, A. and Toquero, C. M. and Singh, L. and Tlili, A. and Lee, K. and Nichols, M. and Ossiannilsson, E. and Brown, M. and Irvine, V. and Raffaghelli, J. E. and Santos-Hermosa, G. and Farrell, O. and Adam, T. and Thong, Y. L. and Sani-Bozkurt, S. and Sharma, R. C. and Hrastinski, S. and Jandrić, P.},
date = {2023-02-13},
publisher = {Zenodo},
doi = {10.5281/ZENODO.7636568},
url = {https://zenodo.org/record/7636568},
urldate = {2023-03-23},
abstract = {While ChatGPT has recently become very popular, AI has a long history and philosophy. This paper intends to explore the promises and pitfalls of the Generative Pre-trained Transformer (GPT) AI and potentially future technologies by adopting a speculative methodology. Speculative future narratives with a specific focus on educational contexts are provided in an attempt to identify emerging themes and discuss their implications for education in the 21st century. Affordances of (using) AI in Education (AIEd) and possible adverse effects are identified and discussed which emerge from the narratives. It is argued that now is the best of times to define human vs AI contribution to education because AI can accomplish more and more educational activities that used to be the prerogative of human educators. Therefore, it is imperative to rethink the respective roles of technology and human educators in education with a future-oriented mindset.},
langid = {english},
keywords = {/unread,artificial intelligence (AI),artificial intelligence in education (AIEd),future educational perspectives,generative pre-trained transformer (GPT),natural language processing,speculative methodology},
file = {/Users/andrew/Zotero/storage/B7H35EFB/Bozkurt, A. et al. - 2023 - Speculative futures on ChatGPT and generative arti.pdf}
}
@online{branwenScalingHypothesis2020,
title = {The {{Scaling Hypothesis}}},
author = {Branwen, Gwern},
date = {2020-05-28},
url = {https://gwern.net/scaling-hypothesis},
urldate = {2023-06-30},
abstract = {On GPT-3: meta-learning, scaling, implications, and deep theory. The scaling hypothesis: neural nets absorb data \& compute, generalizing and becoming more Bayesian as problems get harder, manifesting new abilities even at trivial-by-global-standards-scale. The deep learning revolution has begun as foretold.},
langid = {american},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/QHRZYSBL/scaling-hypothesis.html}
}
@online{broschinskiGrafikenErklaertFunktioniert2023,
title = {In 9 Grafiken erklärt – So funktioniert künstliche Intelligenz},
author = {Broschinski, Sebastian and Plattner, Titus and Meier, Patrick and Vögeli, Patrick},
date = {2023-06-10},
url = {https://www.derbund.ch/so-funktioniert-kuenstliche-intelligenz-599276436215},
urldate = {2023-06-13},
abstract = {Kann künstliche Intelligenz mehr als Äpfel und Birnen sortieren? Und warum lassen sich Computer immer noch leicht übertölpeln? Hier finden Sie alle Antworten.},
langid = {ngerman},
organization = {Der Bund},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/3G74I43I/so-funktioniert-kuenstliche-intelligenz-599276436215.html}
}
@online{brownLanguageModelsAre2020a,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
eprint = {2005.14165},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2005.14165},
urldate = {2023-03-01},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/SUCZVXXP/Brown et al_2020_Language Models are Few-Shot Learners.pdf;/Users/andrew/Zotero/storage/JW9ITXGI/2005.html}
}
@online{brownLanguageModelsAre2020b,
title = {Language {{Models}} Are {{Few-Shot Learners}}},
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
date = {2020-07-22},
eprint = {2005.14165},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2005.14165},
url = {http://arxiv.org/abs/2005.14165},
urldate = {2023-06-08},
abstract = {Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/HSTN9QGG/Brown et al_2020_Language Models are Few-Shot Learners.pdf;/Users/andrew/Zotero/storage/VHPJRB6M/2005.html}
}
@online{bubeckSparksArtificialGeneral2023,
title = {Sparks of {{Artificial General Intelligence}}: {{Early}} Experiments with {{GPT-4}}},
shorttitle = {Sparks of {{Artificial General Intelligence}}},
author = {Bubeck, Sébastien and Chandrasekaran, Varun and Eldan, Ronen and Gehrke, Johannes and Horvitz, Eric and Kamar, Ece and Lee, Peter and Lee, Yin Tat and Li, Yuanzhi and Lundberg, Scott and Nori, Harsha and Palangi, Hamid and Ribeiro, Marco Tulio and Zhang, Yi},
date = {2023-04-13},
eprint = {2303.12712},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2303.12712},
urldate = {2023-05-29},
abstract = {Artificial intelligence (AI) researchers have been developing and refining large language models (LLMs) that exhibit remarkable capabilities across a variety of domains and tasks, challenging our understanding of learning and cognition. The latest model developed by OpenAI, GPT-4 [Ope23], was trained using an unprecedented scale of compute and data. In this paper, we report on our investigation of an early version of GPT-4, when it was still in active development by OpenAI. We contend that (this early version of) GPT4 is part of a new cohort of LLMs (along with ChatGPT and Google’s PaLM for example) that exhibit more general intelligence than previous AI models. We discuss the rising capabilities and implications of these models. We demonstrate that, beyond its mastery of language, GPT-4 can solve novel and difficult tasks that span mathematics, coding, vision, medicine, law, psychology and more, without needing any special prompting. Moreover, in all of these tasks, GPT-4’s performance is strikingly close to human-level performance, and often vastly surpasses prior models such as ChatGPT. Given the breadth and depth of GPT-4’s capabilities, we believe that it could reasonably be viewed as an early (yet still incomplete) version of an artificial general intelligence (AGI) system. In our exploration of GPT-4, we put special emphasis on discovering its limitations, and we discuss the challenges ahead for advancing towards deeper and more comprehensive versions of AGI, including the possible need for pursuing a new paradigm that moves beyond next-word prediction. We conclude with reflections on societal influences of the recent technological leap and future research directions.},
langid = {english},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/2QTBXBKP/Bubeck et al. - 2023 - Sparks of Artificial General Intelligence Early e.pdf}
}
@article{buckHochschulbildungVorHintergrund2023,
title = {Hochschulbildung vor dem Hintergrund von Natural Language Processing (KI-Schreibtools)},
author = {Buck, Isabella and Limburg, Anika},
date = {2023},
langid = {ngerman},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/ZYDESPAY/Buck and Limburg - 2023 - Hochschulbildung vor dem Hintergrund von Natural L.pdf}
}
@article{butlerMicrosoftNewFuture2023,
title = {Microsoft {{New Future}} of {{Work Report}} 2023},
author = {Butler, Jenna and Jaffe, Sonia and Baym, Nancy and Czerwinski, Mary and Iqbal, Shamsi and Nowak, Kate and Rintel, Sean and Sellen, Abigail and Vorvoreanu, Mihaela and Abdulhamid, Najeeb G. and Amores, Judith and Andersen, Reid and Awori, Kagonya and Axmed, Maxamed and Boyd, Danah and Brand, James and Buscher, Georg and Carignan, Dean and Chan, Martin and Coleman, Adam and Counts, Scott and Daepp, Madeleine and Fourney, Adam and Goldstein, Daniel G. and Gordon, Andy and Halfaker, Aaron L. and Hernandez, Javier and Hofman, Jake and Lay-Flurrie, Jenny and Liao, Vera and Lindley, Siân and Manivannan, Sathish and Mcilwain, Charlton and Nepal, Subigya and Neville, Jennifer and Nyairo, Stephanie and O'Neill, Jacki and Poznanski, Victor and Ramos, Gonzalo and Rangan, Nagu and Rosedale, Lacey and Rothschild, David and Safavi, Tara and Sarkar, Advait and Scott, Ava and Shah, Chirag and Shah, Neha Parikh and Shapiro, Teny and Shaw, Ryland and Simkute, Auste and Suh, Jina and Suri, Siddharth and Tanase, Ioana and Tankelevitch, Lev and Troy, Adam and Wan, Mengting and White, Ryen W. and Yang, Longqi and Hecht, Brent and Teevan, Jaime},
date = {2023-12-20},
url = {https://www.microsoft.com/en-us/research/publication/microsoft-new-future-of-work-report-2023/},
urldate = {2024-01-29},
abstract = {In the past three years, there have been not one but two generational shifts in how work gets done, both of which were only possible because of decades of research and development. The first shift occurred when COVID made us realize how powerful remote and hybrid work technologies had become, as well as how much […]},
langid = {american},
file = {/Users/andrew/Zotero/storage/W95FJGQ6/Butler et al_2023_Microsoft New Future of Work Report 2023.pdf}
}
@article{cantlonUniquelyHumanIntelligence2024,
title = {Uniquely Human Intelligence Arose from Expanded Information Capacity},
author = {Cantlon, Jessica F. and Piantadosi, Steven T.},
date = {2024-04-02},
journaltitle = {Nature Reviews Psychology},
shortjournal = {Nat Rev Psychol},
pages = {1--19},
publisher = {Nature Publishing Group},
issn = {2731-0574},
doi = {10.1038/s44159-024-00283-3},
url = {https://www.nature.com/articles/s44159-024-00283-3},
urldate = {2024-04-05},
abstract = {Most theories of how human cognition is unique propose specific representational capacities or biases, often thought to arise through evolutionary change. In this Perspective, we argue that the evidence that supports these domain-specific theories is confounded by general information-processing differences. We argue that human uniqueness arises through genetic quantitative increases in the global capacity to process information and share it among systems such as memory, attention and learning. This change explains regularities across numerous subdomains of cognition, behavioural comparisons between species and phenomena in child development. This strict evolutionary continuity theory of human intelligence is consistent with comparative evidence about neural evolution and computational constraints of memory on the ability to represent rules, patterns and abstract generalizations. We show how these differences in the degree of information processing capacity yield differences in kind for human cognition relative to other animals.},
langid = {english},
keywords = {Animal behaviour,Human behaviour,Intelligence,Psychology},
file = {/Users/andrew/Zotero/storage/U2F3MXGM/Cantlon_Piantadosi_2024_Uniquely human intelligence arose from expanded information capacity.pdf}
}
@report{cardonaArtificialIntelligenceFuture,
title = {Artificial {{Intelligence}} and the {{Future}} of {{Teaching}} and {{Learning}}},
author = {Cardona, Miguel A. and Rodríguez, Roberto J. and Ishmael, Kristina},
date = {2023},
institution = {{US Department of Education, Office of Educational Technology}},
langid = {english},
keywords = {/unread,⛔ No DOI found},
file = {/Users/andrew/Zotero/storage/GZIMYF79/Cardona et al. - Artificial Intelligence and the Future of Teaching.pdf}
}
@online{changPromptingLargeLanguage2023,
title = {Prompting {{Large Language Models With}} the {{Socratic Method}}},
author = {Chang, Edward Y.},
date = {2023-03-15},
eprint = {2303.08769},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2303.08769},
url = {http://arxiv.org/abs/2303.08769},
urldate = {2024-02-07},
abstract = {This paper presents a systematic approach to using the Socratic method in developing prompt templates that effectively interact with large language models, including GPT-3. Various methods are examined, and those that yield precise answers and justifications while fostering creativity and imagination to enhance creative writing are identified. Techniques such as definition, elenchus, dialectic, maieutics, generalization, and counterfactual reasoning are discussed for their application in engineering prompt templates and their connections to inductive, deductive, and abductive reasoning. Through examples, the effectiveness of these dialogue and reasoning methods is demonstrated. An interesting observation is made that when the task's goal and user intent are conveyed to GPT-3 via ChatGPT before the start of a dialogue, the large language model seems to connect to the external context expressed in the intent and perform more effectively.},
pubstate = {prepublished},
keywords = {Computer Science - Machine Learning,I.2.7},
file = {/Users/andrew/Zotero/storage/ZJ69AGX2/Chang_2023_Prompting Large Language Models With the Socratic Method.pdf;/Users/andrew/Zotero/storage/PJJ9LIM7/2303.html}
}
@online{chengInductiveDeductiveRethinking2024,
title = {Inductive or {{Deductive}}? {{Rethinking}} the {{Fundamental Reasoning Abilities}} of {{LLMs}}},
shorttitle = {Inductive or {{Deductive}}?},
author = {Cheng, Kewei and Yang, Jingfeng and Jiang, Haoming and Wang, Zhengyang and Huang, Binxuan and Li, Ruirui and Li, Shiyang and Li, Zheng and Gao, Yifan and Li, Xian and Yin, Bing and Sun, Yizhou},
date = {2024-08-06},
eprint = {2408.00114},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2408.00114},
url = {http://arxiv.org/abs/2408.00114},
urldate = {2024-08-25},
abstract = {Reasoning encompasses two typical types: deductive reasoning and inductive reasoning. Despite extensive research into the reasoning capabilities of Large Language Models (LLMs), most studies have failed to rigorously differentiate between inductive and deductive reasoning, leading to a blending of the two. This raises an essential question: In LLM reasoning, which poses a greater challenge - deductive or inductive reasoning? While the deductive reasoning capabilities of LLMs, (i.e. their capacity to follow instructions in reasoning tasks), have received considerable attention, their abilities in true inductive reasoning remain largely unexplored. To investigate into the true inductive reasoning capabilities of LLMs, we propose a novel framework, SolverLearner. This framework enables LLMs to learn the underlying function (i.e., $y = f_w(x)$), that maps input data points (x) to their corresponding output values (y), using only in-context examples. By focusing on inductive reasoning and separating it from LLM-based deductive reasoning, we can isolate and investigate inductive reasoning of LLMs in its pure form via SolverLearner. Our observations reveal that LLMs demonstrate remarkable inductive reasoning capabilities through SolverLearner, achieving near-perfect performance with ACC of 1 in most cases. Surprisingly, despite their strong inductive reasoning abilities, LLMs tend to relatively lack deductive reasoning capabilities, particularly in tasks involving ``counterfactual'' reasoning.},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence},
file = {/Users/andrew/Zotero/storage/YEHNTN3K/Cheng et al. - 2024 - Inductive or Deductive Rethinking the Fundamental Reasoning Abilities of LLMs.pdf;/Users/andrew/Zotero/storage/Q4SNB2NX/2408.html}
}
@article{chenHierarchicalBayesianModel2024,
title = {A {{Hierarchical Bayesian Model}} of {{Adaptive Teaching}}},
author = {Chen, Alicia M. and Palacci, Andrew and Vélez, Natalia and Hawkins, Robert D. and Gershman, Samuel J.},
date = {2024},
journaltitle = {Cognitive Science},
volume = {48},
number = {7},
pages = {e13477},
issn = {1551-6709},
doi = {10.1111/cogs.13477},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cogs.13477},
urldate = {2024-07-17},
abstract = {How do teachers learn about what learners already know? How do learners aid teachers by providing them with information about their background knowledge and what they find confusing? We formalize this collaborative reasoning process using a hierarchical Bayesian model of pedagogy. We then evaluate this model in two online behavioral experiments (N = 312 adults). In Experiment 1, we show that teachers select examples that account for learners' background knowledge, and adjust their examples based on learners' feedback. In Experiment 2, we show that learners strategically provide more feedback when teachers' examples deviate from their background knowledge. These findings provide a foundation for extending computational accounts of pedagogy to richer interactive settings.},
langid = {english},
keywords = {Bayesian modeling,Communication,Pedagogy,Social cognition,Theory of mind},
file = {/Users/andrew/Zotero/storage/XGKLBYPV/Chen et al. - 2024 - A Hierarchical Bayesian Model of Adaptive Teaching.pdf;/Users/andrew/Zotero/storage/XUUMCCC7/cogs.html}
}
@article{chiLearningHumanTutoring2001,
title = {Learning from Human Tutoring},
author = {Chi, Michelene and Siler, Stephanie and Jeong, Heisawn and Yamauchi, Takashi and Hausmann, Robert},
date = {2001-07-01},
journaltitle = {Cognitive Science},
shortjournal = {Cognitive Science},
volume = {25},
pages = {471--533},
doi = {10.1016/S0364-0213(01)00044-1},
abstract = {Human one-to-one tutoring has been shown to be a very effective form of instruction. Three contrasting hypotheses, a tutor-centered one, a student-centered one, and an interactive one could all potentially explain the effectiveness of tutoring. To test these hypotheses, analyses focused not only on the effectiveness of the tutors’ moves, but also on the effectiveness of the students’ construction on learning, as well as their interaction. The interaction hypothesis is further tested in the second study by manipulating the kind of tutoring tactics tutors were permitted to use. In order to promote a more interactive style of dialogue, rather than a didactic style, tutors were suppressed from giving explanations and feedback. Instead, tutors were encouraged to prompt the students. Surprisingly, students learned just as effectively even when tutors were suppressed from giving explanations and feedback. Their learning in the interactive style of tutoring is attributed to construction from deeper and a greater amount of scaffolding episodes, as well as their greater effort to take control of their own learning by reading more. What they learned from reading was limited, however, by their reading abilities.},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/BWWCXV84/Chi et al_2001_Learning from human tutoring.pdf}
}
@online{choiMALADEOrchestrationLLMpowered2024,
title = {{{MALADE}}: {{Orchestration}} of {{LLM-powered Agents}} with {{Retrieval Augmented Generation}} for {{Pharmacovigilance}}},
shorttitle = {{{MALADE}}},
author = {Choi, Jihye and Palumbo, Nils and Chalasani, Prasad and Engelhard, Matthew M. and Jha, Somesh and Kumar, Anivarya and Page, David},
date = {2024-08-03},
eprint = {2408.01869},
eprinttype = {arXiv},
eprintclass = {cs, q-bio},
doi = {10.48550/arXiv.2408.01869},
url = {http://arxiv.org/abs/2408.01869},
urldate = {2024-08-16},
abstract = {In the era of Large Language Models (LLMs), given their remarkable text understanding and generation abilities, there is an unprecedented opportunity to develop new, LLM-based methods for trustworthy medical knowledge synthesis, extraction and summarization. This paper focuses on the problem of Pharmacovigilance (PhV), where the significance and challenges lie in identifying Adverse Drug Events (ADEs) from diverse text sources, such as medical literature, clinical notes, and drug labels. Unfortunately, this task is hindered by factors including variations in the terminologies of drugs and outcomes, and ADE descriptions often being buried in large amounts of narrative text. We present MALADE, the first effective collaborative multi-agent system powered by LLM with Retrieval Augmented Generation for ADE extraction from drug label data. This technique involves augmenting a query to an LLM with relevant information extracted from text resources, and instructing the LLM to compose a response consistent with the augmented data. MALADE is a general LLM-agnostic architecture, and its unique capabilities are: (1) leveraging a variety of external sources, such as medical literature, drug labels, and FDA tools (e.g., OpenFDA drug information API), (2) extracting drug-outcome association in a structured format along with the strength of the association, and (3) providing explanations for established associations. Instantiated with GPT-4 Turbo or GPT-4o, and FDA drug label data, MALADE demonstrates its efficacy with an Area Under ROC Curve of 0.90 against the OMOP Ground Truth table of ADEs. Our implementation leverages the Langroid multi-agent LLM framework and can be found at https://github.com/jihyechoi77/malade.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Information Retrieval,Computer Science - Machine Learning,Computer Science - Multiagent Systems,Quantitative Biology - Quantitative Methods},
file = {/Users/andrew/Zotero/storage/ZHBVB2K3/Choi et al. - 2024 - MALADE Orchestration of LLM-powered Agents with Retrieval Augmented Generation for Pharmacovigilanc.pdf;/Users/andrew/Zotero/storage/XAU7Y67T/2408.html}
}
@article{clarkExtendedMind1998,
title = {The {{Extended Mind}}},
author = {Clark, Andy and Chalmers, David},
date = {1998},
journaltitle = {Analysis},
volume = {58},
number = {1},
eprint = {3328150},
eprinttype = {jstor},
pages = {7--19},
publisher = {Analysis Committee, Oxford University Press},
issn = {0003-2638},
url = {https://www.jstor.org/stable/3328150},
urldate = {2024-06-26}
}
@book{clarkSupersizingMindEmbodiment2008,
title = {Supersizing the {{Mind}}: {{Embodiment}}, {{Action}}, and {{Cognitive Extension}}},
shorttitle = {Supersizing the {{Mind}}},
author = {Clark, Andy},
date = {2008},
location = {New York},
publisher = {Oxford University Press},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/UX96EM6A/CLASTM.html}
}
@online{collinsBuildingMachinesThat2024,
title = {Building {{Machines}} That {{Learn}} and {{Think}} with {{People}}},
author = {Collins, Katherine M. and Sucholutsky, Ilia and Bhatt, Umang and Chandra, Kartik and Wong, Lionel and Lee, Mina and Zhang, Cedegao E. and Zhi-Xuan, Tan and Ho, Mark and Mansinghka, Vikash and Weller, Adrian and Tenenbaum, Joshua B. and Griffiths, Thomas L.},
date = {2024-07-21},
eprint = {2408.03943},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2408.03943},
url = {http://arxiv.org/abs/2408.03943},
urldate = {2024-08-13},
abstract = {What do we want from machine intelligence? We envision machines that are not just tools for thought, but partners in thought: reasonable, insightful, knowledgeable, reliable, and trustworthy systems that think with us. Current artificial intelligence (AI) systems satisfy some of these criteria, some of the time. In this Perspective, we show how the science of collaborative cognition can be put to work to engineer systems that really can be called ``thought partners,'' systems built to meet our expectations and complement our limitations. We lay out several modes of collaborative thought in which humans and AI thought partners can engage and propose desiderata for human-compatible thought partnerships. Drawing on motifs from computational cognitive science, we motivate an alternative scaling path for the design of thought partners and ecosystems around their use through a Bayesian lens, whereby the partners we construct actively build and reason over models of the human and world.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Human-Computer Interaction,Computer Science - Machine Learning},
file = {/Users/andrew/Zotero/storage/8XYVBJSF/Collins et al. - 2024 - Building Machines that Learn and Think with People.pdf;/Users/andrew/Zotero/storage/EKLB687T/2408.html}
}
@article{corbettKnowledgeTracingModeling1994,
title = {Knowledge Tracing: {{Modeling}} the Acquisition of Procedural Knowledge},
shorttitle = {Knowledge Tracing},
author = {Corbett, Albert T. and Anderson, John R.},
date = {1994-12-01},
journaltitle = {User Modeling and User-Adapted Interaction},
shortjournal = {User Model User-Adap Inter},
volume = {4},
number = {4},
pages = {253--278},
issn = {1573-1391},
doi = {10.1007/BF01099821},
url = {https://doi.org/10.1007/BF01099821},
urldate = {2023-03-31},
abstract = {This paper describes an effort to model students' changing knowledge state during skill acquisition. Students in this research are learning to write short programs with the ACT Programming Tutor (APT). APT is constructed around a production rule cognitive model of programming knowledge, called the ideal student model. This model allows the tutor to solve exercises along with the student and provide assistance as necessary. As the student works, the tutor also maintains an estimate of the probability that the student has learned each of the rules in the ideal model, in a process called knowledge tracing. The tutor presents an individualized sequence of exercises to the student based on these probability estimates until the student has ‘mastered’ each rule. The programming tutor, cognitive model and learning and performance assumptions are described. A series of studies is reviewed that examine the empirical validity of knowledge tracing and has led to modifications in the process. Currently the model is quite successful in predicting test performance. Further modifications in the modeling process are discussed that may improve performance levels.},
langid = {english},
keywords = {/unread,empirical validity,individual differences,intelligent tutoring systems,learning,mastery learning,procedural knowledge,Student modeling},
file = {/Users/andrew/Zotero/storage/BCTFGDNV/Corbett_Anderson_1994_Knowledge tracing.pdf}
}
@article{cottonChattingCheatingEnsuring2024,
title = {Chatting and Cheating: {{Ensuring}} Academic Integrity in the Era of {{ChatGPT}}},
shorttitle = {Chatting and Cheating},
author = {Cotton, Debby R. E. and Cotton, Peter A. and Shipway, J. Reuben},
date = {2024-03-03},
journaltitle = {Innovations in Education and Teaching International},
volume = {61},
number = {2},
pages = {228--239},
publisher = {Routledge},
issn = {1470-3297},
doi = {10.1080/14703297.2023.2190148},
url = {https://doi.org/10.1080/14703297.2023.2190148},
urldate = {2024-05-27},
abstract = {The use of artificial intelligence in academia is a hot topic in the education field. ChatGPT is an AI tool that offers a range of benefits, including increased student engagement, collaboration, and accessibility. However, it also raises concerns regarding academic honesty and plagiarism. This paper examines the opportunities and challenges of using ChatGPT in higher education, and discusses the potential risks and rewards of these tools. The paper also considers the difficulties of detecting and preventing academic dishonesty, and suggests strategies that universities can adopt to ensure ethical and responsible use of these tools. These strategies include developing policies and procedures, providing training and support, and using various methods to detect and prevent cheating. The paper concludes that while the use of AI in higher education presents both opportunities and challenges, universities can effectively address these concerns by taking a proactive and ethical approach to the use of these tools.},
keywords = {detection and prevention,higher education,Machine-generated writing,plagiarism},
file = {/Users/andrew/Zotero/storage/79TXHW5V/Cotton et al_2024_Chatting and cheating.pdf}
}
@online{craigAICopyrightTrap2024,
type = {SSRN Scholarly Paper},
title = {The {{AI-Copyright Trap}}},
author = {Craig, Carys J.},
date = {2024-07-15},
number = {4905118},
location = {Rochester, NY},
doi = {10.2139/ssrn.4905118},
url = {https://papers.ssrn.com/abstract=4905118},
urldate = {2024-09-06},
abstract = {As AI tools proliferate, policy makers are increasingly being called upon to protect creators and the cultural industries from the extractive, exploitative, and even existential threats posed by generative AI. In their haste to act, however, they risk running headlong into the Copyright Trap: the mistaken conviction that copyright law is the best tool to support human creators and culture in our new technological reality (when in fact it is likely to do more harm than good). It is a trap in the sense that it may satisfy the wants of a small group of powerful stakeholders, but it will harm the interests of the more vulnerable actors who are, perhaps, most drawn to it. Once entered, it will also prove practically impossible to escape. I identify three routes in to the copyright trap in current AI debates: first is the "if value, then (property) right" fallacy; second is the idea that unauthorized copying is inherently wrongful; and third is the resurrection of the starving artist trope to justify copyright's expansion. Ultimately, this article urges AI critics to sidestep the copyright trap, resisting the lure of its proprietary logic in favor of more appropriate routes towards addressing the risks and harms of generative AI.},
langid = {english},
pubstate = {prepublished},
keywords = {Artificial Intelligence,Copyright Infringement,Copyright Law,Generative AI,Intellectual Property,Law and Technology,Text and Data Mining},
file = {/Users/andrew/Zotero/storage/NY2F48XN/Craig - 2024 - The AI-Copyright Trap.pdf}
}
@online{creswellFaithfulReasoningUsing2022,
title = {Faithful {{Reasoning Using Large Language Models}}},
author = {Creswell, Antonia and Shanahan, Murray},
date = {2022-08-30},
eprint = {2208.14271},
eprinttype = {arXiv},
eprintclass = {cs},
doi = {10.48550/arXiv.2208.14271},
url = {http://arxiv.org/abs/2208.14271},
urldate = {2023-04-21},
abstract = {Although contemporary large language models (LMs) demonstrate impressive question-answering capabilities, their answers are typically the product of a single call to the model. This entails an unwelcome degree of opacity and compromises performance, especially on problems that are inherently multi-step. To address these limitations, we show how LMs can be made to perform faithful multi-step reasoning via a process whose causal structure mirrors the underlying logical structure of the problem. Our approach works by chaining together reasoning steps, where each step results from calls to two fine-tuned LMs, one for selection and one for inference, to produce a valid reasoning trace. Our method carries out a beam search through the space of reasoning traces to improve reasoning quality. We demonstrate the effectiveness of our model on multi-step logical deduction and scientific question-answering, showing that it outperforms baselines on final answer accuracy, and generates humanly interpretable reasoning traces whose validity can be checked by the user.},
pubstate = {prepublished},
keywords = {/unread,Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/Users/andrew/Zotero/storage/JHWPXKW7/Creswell_Shanahan_2022_Faithful Reasoning Using Large Language Models.pdf;/Users/andrew/Zotero/storage/C9TB3MXG/2208.html}
}
@article{cuskleyLimitationsLargeLanguage2024,
title = {The {{Limitations}} of {{Large Language Models}} for {{Understanding Human Language}} and {{Cognition}}},
author = {Cuskley, Christine and Woods, Rebecca and Flaherty, Molly},
date = {2024-08-31},
journaltitle = {Open Mind},
shortjournal = {Open Mind},
volume = {8},
pages = {1058--1083},
issn = {2470-2986},
doi = {10.1162/opmi_a_00160},
url = {https://doi.org/10.1162/opmi_a_00160},
urldate = {2024-09-05},
abstract = {Researchers have recently argued that the capabilities of Large Language Models (LLMs) can provide new insights into longstanding debates about the role of learning and/or innateness in the development and evolution of human language. Here, we argue on two grounds that LLMs alone tell us very little about human language and cognition in terms of acquisition and evolution. First, any similarities between human language and the output of LLMs are purely functional. Borrowing the “four questions” framework from ethology, we argue that what LLMs do is superficially similar, but how they do it is not. In contrast to the rich multimodal data humans leverage in interactive language learning, LLMs rely on immersive exposure to vastly greater quantities of unimodal text data, with recent multimodal efforts built upon mappings between images and text. Second, turning to functional similarities between human language and LLM output, we show that human linguistic behavior is much broader. LLMs were designed to imitate the very specific behavior of human writing; while they do this impressively, the underlying mechanisms of these models limit their capacities for meaning and naturalistic interaction, and their potential for dealing with the diversity in human language. We conclude by emphasising that LLMs are not theories of language, but tools that may be used to study language, and that can only be effectively applied with specific hypotheses to motivate research.}
}
@online{daiCanLargeLanguage2023,
title = {Can {{Large Language Models Provide Feedback}} to {{Students}}? {{A Case Study}} on {{ChatGPT}}},
shorttitle = {Can {{Large Language Models Provide Feedback}} to {{Students}}?},
author = {Dai, Wei and Lin, Jionghao and Jin, Flora and Li, Tongguang and Tsai, Yi-Shan and Gasevic, Dragan and Chen, Guanliang},
date = {2023-04-13},
doi = {10.35542/osf.io/hcgzj},
url = {https://edarxiv.org/hcgzj/},
urldate = {2023-04-15},
abstract = {Educational feedback has been widely acknowledged as an effective approach to improving student learning. However, scaling effective practices can be laborious and costly, which motivated researchers to work on automated feedback systems (AFS). Inspired by the recent advancements in the pre-trained language models (e.g., ChatGPT), we posit that such models might advance the existing knowledge of textual feedback generation in AFS because of their capability to offer natural-sounding and detailed responses. Therefore, we aimed to investigate the feasibility of using ChatGPT to provide students with feedback to help them learn better. Specifically, we first examined the readability of ChatGPT-generated feedback. Then, we measured the agreement between ChatGPT and the instructor when assessing students' assignments according to the marking rubric. Finally, we used a well-known theoretical feedback framework to further investigate the effectiveness of the feedback generated by ChatGPT. Our results show that i) ChatGPT is capable of generating more detailed feedback that fluently and coherently summarizes students' performance than human instructors; ii) ChatGPT achieved high agreement with the instructor when assessing the topic of students' assignments; and iii) ChatGPT could provide feedback on the process of students completing the task, which benefits students developing learning skills.},
langid = {american},
pubstate = {prepublished},
keywords = {/unread,and Research,Automated Feedback,Education,Educational Assessment,Educational Methods,Evaluation,Feedback Effectiveness,Feedback Generation,Higher Education,Large Language Model},
file = {/Users/andrew/Zotero/storage/ICYKBJ87/Dai et al_2023_Can Large Language Models Provide Feedback to Students.pdf}
}
@online{dalalMatrixBayesianLearning2024,
title = {The {{Matrix}}: {{A Bayesian}} Learning Model for {{LLMs}}},
shorttitle = {The {{Matrix}}},
author = {Dalal, Siddhartha and Misra, Vishal},
date = {2024-02-05},
eprint = {2402.03175},
eprinttype = {arXiv},
eprintclass = {cs},
url = {http://arxiv.org/abs/2402.03175},
urldate = {2024-05-29},
abstract = {In this paper, we introduce a Bayesian learning model to understand the behavior of Large Language Models (LLMs). We explore the optimization metric of LLMs, which is based on predicting the next token, and develop a novel model grounded in this principle. Our approach involves constructing an ideal generative text model represented by a multinomial transition probability matrix with a prior, and we examine how LLMs approximate this matrix. We discuss the continuity of the mapping between embeddings and multinomial distributions, and present the Dirichlet approximation theorem to approximate any prior. Additionally, we demonstrate how text generation by LLMs aligns with Bayesian learning principles and delve into the implications for in-context learning, specifically explaining why in-context learning emerges in larger models where prompts are considered as samples to be updated. Our findings indicate that the behavior of LLMs is consistent with Bayesian Learning, offering new insights into their functioning and potential applications.},
langid = {english},
pubstate = {prepublished},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,I.2.7},
file = {/Users/andrew/Zotero/storage/GY9L3K28/Dalal and Misra - 2024 - The Matrix A Bayesian learning model for LLMs.pdf}
}
@inproceedings{danryDonJustTell2023,
title = {Don’t {{Just Tell Me}}, {{Ask Me}}: {{AI Systems}} That {{Intelligently Frame Explanations}} as {{Questions Improve Human Logical Discernment Accuracy}} over {{Causal AI}} Explanations},
shorttitle = {Don’t {{Just Tell Me}}, {{Ask Me}}},
booktitle = {Proceedings of the 2023 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
author = {Danry, Valdemar and Pataranutaporn, Pat and Mao, Yaoli and Maes, Pattie},
date = {2023-04-19},
series = {{{CHI}} '23},
pages = {1--13},
publisher = {Association for Computing Machinery},
location = {New York, NY, USA},
doi = {10.1145/3544548.3580672},
url = {https://dl.acm.org/doi/10.1145/3544548.3580672},
urldate = {2024-05-17},
abstract = {Critical thinking is an essential human skill. Despite the importance of critical thinking, research reveals that our reasoning ability suffers from personal biases and cognitive resource limitations, leading to potentially dangerous outcomes. This paper presents the novel idea of AI-framed Questioning that turns information relevant to the AI classification into questions to actively engage users’ thinking and scaffold their reasoning process. We conducted a study with 204 participants comparing the effects of AI-framed Questioning on a critical thinking task; discernment of logical validity of socially divisive statements. Our results show that compared to no feedback and even causal AI explanations of an always correct system, AI-framed Questioning significantly increase human discernment of logically flawed statements. Our experiment exemplifies a future style of Human-AI co-reasoning system, where the AI becomes a critical thinking stimulator rather than an information teller.},
isbn = {978-1-4503-9421-5},
keywords = {AI,AI Explanation,Explainable AI,Human-AI Interaction,Language Model,Logic,Reasoning},
file = {/Users/andrew/Zotero/storage/XIQ6PZEK/Danry et al_2023_Don’t Just Tell Me, Ask Me.pdf}
}
@article{davidUseGenerativeAI2023,
title = {The Use of Generative {{AI}} Tools in {{Design Thinking}} Academic Makeathon},
author = {David, Yigal and Krebs, Assaf and Rosenbaum, Alon},
date = {2023-12-29},
journaltitle = {CERN IdeaSquare Journal of Experimental Innovation},
volume = {7},
number = {3},
pages = {43--49},
issn = {2413-9505},
doi = {10.23726/cij.2023.1470},
url = {https://e-publishing.cern.ch/index.php/CIJ/article/view/1470},
urldate = {2024-08-06},
abstract = {This paper examines the integration and influence of Generative Artificial Intelligence (GAI) tools in a Double Diamond Design Thinking (DDDT) academic makeathon. It analyzes students' interaction with these tools in problem-solving scenarios, offering insights into their perceptions and manner of use. The study reveals that text-based GAI, such as ChatGPT and visual tools such as Midjourney and Dall-E 2, are perceived to be supportive rather than solution-dictating. However, it appears that there is a significant difference between engineering and design students in their approach and their trust in these tools. Moreover, students often use tools like ChatGPT as search engines without fully exploring their capabilities. This paper aims to explore the potential of GAI in its deeper capacity within the DDDT methodology, and how to maximize its value.},
langid = {english},
keywords = {AI,AI in education,ChatGPT,Dall-E 2,Design Thinking,Double Diamond Design Thinking,Generative artificial intelligence,Human-AI collaboration,Makeathon,Midjourney,Shenkar Jamweek},
file = {/Users/andrew/Zotero/storage/R5BWIXRB/David et al. - 2023 - The use of generative AI tools in Design Thinking academic makeathon.pdf}
}
@article{degallier-rochatHumanAugmentationNot2022,
title = {Human Augmentation, Not Replacement: {{A}} Research Agenda for {{AI}} and Robotics in the Industry},
shorttitle = {Human Augmentation, Not Replacement},
author = {Dégallier-Rochat, Sarah and Kurpicz-Briki, Mascha and Endrissat, Nada and Yatsenko, Olena},
date = {2022},
journaltitle = {Frontiers in Robotics and AI},
volume = {9},
issn = {2296-9144},
doi = {10.3389/frobt.2022.997386},
url = {https://www.frontiersin.org/articles/10.3389/frobt.2022.997386},
urldate = {2023-03-03},
keywords = {/unread},
file = {/Users/andrew/Zotero/storage/9QEXUZ9N/Dégallier-Rochat et al_2022_Human augmentation, not replacement.pdf}
}
@article{degenRationalSpeechAct2023,
title = {The {{Rational Speech Act Framework}}},
author = {Degen, Judith},
date = {2023},
journaltitle = {Annual Review of Linguistics},
volume = {9},
number = {1},
pages = {519--540},
doi = {10.1146/annurev-linguistics-031220-010811},
url = {https://doi.org/10.1146/annurev-linguistics-031220-010811},
urldate = {2023-12-11},
abstract = {The past decade has seen the rapid development of a new approach to pragmatics that attempts to integrate insights from formal and experimental semantics and pragmatics, psycholinguistics, and computational cognitive science in the study of meaning: probabilistic pragmatics. The most influential probabilistic approach to pragmatics is the Rational Speech Act (RSA) framework. In this review, I demonstrate the basic mechanics and commitments of RSA as well as some of its standard extensions, highlighting the key features that have led to its success in accounting for a wide variety of pragmatic phenomena. Fundamentally, it treats language as probabilistic, informativeness as gradient, alternatives as context-dependent, and subjective prior beliefs (world knowledge) as a crucial facet of interpretation. It also provides an integrated account of the link between production and interpretation. I highlight key challenges for RSA, which include scalability, the treatment of the boundedness of cognition, and the incremental and compositional nature of language.},
keywords = {computational pragmatics,context,experimental pragmatics,experimental semantics,probabilistic pragmatics},
file = {/Users/andrew/Zotero/storage/6BTDFBQ6/Degen_2023_The Rational Speech Act Framework.pdf}
}
@article{demszkyUsingLargeLanguage2023,
title = {Using Large Language Models in Psychology},
author = {Demszky, Dorottya and Yang, Diyi and Yeager, David S. and Bryan, Christopher J. and Clapper, Margarett and Chandhok, Susannah and Eichstaedt, Johannes C. and Hecht, Cameron and Jamieson, Jeremy and Johnson, Meghann and Jones, Michaela and Krettek-Cobb, Danielle and Lai, Leslie and Jones Mitchell, Nirel and Ong, Desmond C. and Dweck, Carol S. and Gross, James J. and Pennebaker, James W.},
date = {2023-10-13},
journaltitle = {Nature Reviews Psychology},
shortjournal = {Nat Rev Psychol},
pages = {1--14},
publisher = {Nature Publishing Group},
issn = {2731-0574},
doi = {10.1038/s44159-023-00241-5},
url = {https://www.nature.com/articles/s44159-023-00241-5},
urldate = {2023-10-21},
abstract = {Large language models (LLMs), such as OpenAI’s GPT-4, Google’s Bard or Meta’s LLaMa, have created unprecedented opportunities for analysing and generating language data on a massive scale. Because language data have a central role in all areas of psychology, this new technology has the potential to transform the field. In this Perspective, we review the foundations of LLMs. We then explain how the way that LLMs are constructed enables them to effectively generate human-like linguistic output without the ability to think or feel like a human. We argue that although LLMs have the potential to advance psychological measurement, experimentation and practice, they are not yet ready for many of the most transformative psychological applications — but further research and development may enable such use. Next, we examine four major concerns about the application of LLMs to psychology, and how each might be overcome. Finally, we conclude with recommendations for investments that could help to address these concerns: field-initiated ‘keystone’ datasets; increased standardization of performance benchmarks; and shared computing and analysis infrastructure to ensure that the future of LLM-powered research is equitable.},
langid = {english},
keywords = {Human behaviour,Language and linguistics,Psychology,Science,technology and society},
file = {/Users/andrew/Zotero/storage/7BZWJTSP/Demszky et al_2023_Using large language models in psychology.pdf}
}
@incollection{deneckeAnalysisCriticalIncident2024,
title = {Analysis of {{Critical Incident Reports Using Natural Language Processing}}},