Skip to content

Commit

Permalink
update talks, add token-bounds paper
Browse files Browse the repository at this point in the history
  • Loading branch information
bamos committed Jul 12, 2024
1 parent 0067b22 commit bb89d90
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
6 changes: 6 additions & 0 deletions cv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -507,10 +507,16 @@ repos:
desc: $\varheart$ Linux, xmonad, emacs, vim, zsh, tmux

talks:
- title: Amortized optimization for optimal transport and LLM attacks
location: ISMP
year: 2024
- title: Differentiable optimization for control and robotics
location: RSS Optimization for Robotics Workshop
url: https://sites.google.com/robotics.utias.utoronto.ca/frontiers-optimization-rss24/schedule
year: 2024
- title: Amortized optimization-based reasoning for AI
location: University of Amsterdam
year: 2024
- title: End-to-end learning geometries for graphs, dynamical systems, and regression
location: LoG New York
url: https://logmeetupnyc.github.io/
Expand Down
36 changes: 36 additions & 0 deletions publications/all.bib
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,42 @@ @inproceedings{atanackovic2024meta
}
}

@inproceedings{lotfi2024unlocking,
  title    = {Unlocking Tokens as Data Points for Generalization Bounds on Larger Language Models},
  author   = {Lotfi, Sanae and Kuang, Yilun and Finzi, Marc Anton and Amos, Brandon and Goldblum, Micah and Wilson, Andrew Gordon},
  _venue   = {ICML TF2M Workshop},
  year     = {2024},
  url      = {https://openreview.net/forum?id=cQWsTeTSkZ},
  abstract = {
    Large language models (LLMs) with billions of parameters excel at
    predicting the next token in a sequence. Recent work computes
    non-vacuous compression-based generalization bounds for LLMs, but
    these bounds are vacuous for large models at the billion-parameter
    scale. Moreover, these bounds are obtained through restrictive
    compression techniques, bounding compressed models that generate
    low-quality text. Additionally, the tightness of these existing
    bounds depends on the number of IID documents in a training set
    rather than the much larger number of non-IID constituent tokens,
    leaving untapped potential for tighter bounds. In this work, we
    instead use properties of martingales to derive generalization
    bounds that benefit from the vast number of tokens in LLM training
    sets. Since a dataset contains far more tokens than documents, our
    generalization bounds not only tolerate but actually benefit from
    far less restrictive compression schemes. With Monarch matrices,
    Kronecker factorizations, and post-training quantization, we
    achieve non-vacuous generalization bounds for LLMs as large as
    LLaMA2-70B. Unlike previous approaches, our work achieves the
    first non-vacuous bounds for models that are deployed in practice
    and generate high-quality text.
  }
}

@misc{amos2023tutorial,
title={Tutorial on amortized optimization},
author={Brandon Amos},
Expand Down

0 comments on commit bb89d90

Please sign in to comment.