4 changes: 3 additions & 1 deletion .github/workflows/deploy-docs.yml
@@ -21,7 +21,9 @@ jobs:

- name: Install dependencies
run: |
pip install sphinx sphinx-rtd-theme myst-parser
pip install --upgrade pip
pip install setuptools
pip install -e '.[docs]'

- name: Build Docs
run: |
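The editable install above (`pip install -e '.[docs]'`) assumes the project declares a `docs` extra in its packaging metadata. Below is a minimal sketch of such a declaration for a `setup.py`-based layout; the distribution name and the dependency list are assumptions inferred from the Sphinx extensions enabled in `docs/conf.py`, not taken from the repository:

```python
# setup.py -- hypothetical sketch; minOLMo's actual packaging metadata may differ.
from setuptools import setup, find_packages

setup(
    name="minolmo",  # assumed distribution name
    packages=find_packages(),
    extras_require={
        # Installed by `pip install -e '.[docs]'` in the workflow above.
        "docs": [
            "sphinx",
            "sphinx-rtd-theme",
            "myst-parser",
            "nbsphinx",
            "sphinx-copybutton",
            "sphinxcontrib-bibtex",
        ],
    },
)
```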
43 changes: 31 additions & 12 deletions CHANGELOG.md
@@ -1,14 +1,33 @@
# Changelog

## Added
- Sweep submission scripts and instructions

## Removed
- The low precision layer norm option.
- The `block_group_size` config parameter and related code.
- Support for `mlp_ratio`; the code now accepts only `mlp_hidden_size`.
- The SwiGLU activation from the OLMo model.
- `weight_tying` from the OLMo model.

## Changed
- Users can now enable flash attention from the config file ([197c38f](https://github.com/KempnerInstitute/min-olmo/commit/197c38fe9c288b65b5620f308cbcf2837b4c0f73)).
## Development

### Added
- Documentation infrastructure [#5](https://github.com/KempnerInstitute/minOLMo/pull/5)

### Removed
-

### Changed
-

## [v0.1.0]() (2024-08-06)

- Add attention function, model data flow, and OLMo sequential block figures. (@Naeemkh, e6e54f0)
- Add option for nsys profiling (@mbsabath, bf3f2a4)
- Add table of parameters to the logger (@Naeemkh, 12bf09c)
- Drop Llama Block (@Naeemkh, 22d0f45)
- Drop dropout layer (@Naeemkh, 4f2775d and a11ee8a)
- Add back-of-the-envelope computations (@Naeemkh, 6d83c07)
- Merge OLMoSequentialBlock into OLMoBlock (@Naeemkh, fff5955)
- Move flash attention settings to the config file (@Naeemkh, 197c38f)
- Add sweep generator scripts (@Naeemkh, def2931, e462a92, 1e7fb8c, 7e1c11e)
- Drop SwiGLU activation function (@Naeemkh, dd12e48, 1d5f0dc, 7c942be)
- Drop weight_tying (@Naeemkh, 544b0b6)
- Drop OLMoBlockGroup (@Naeemkh, ceff8f8, ba49aa6)
- Keep only PyTorch default LayerNorm (@Naeemkh, beb76cd, d988ea7)
- Clean up utility codes for submitting the checkpoints to the cloud (@Naeemkh, f8dbc80)
- Remove multi-query attention feature and related settings (@Naeemkh, 74eaf03)
- Drop effective key value heads and use the user-requested number of heads (@Naeemkh, 36f51b7)
- Fix a bug with conda environment setup (@amazloumi, e51c620, c1f3125)
- Drop output multiplier (@Naeemkh, 1b3eb2b)
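
One of the entries above moves flash attention settings to the config file (197c38f). For context, such a toggle typically switches between PyTorch's fused attention kernel and a naive reference path. The sketch below shows the general pattern; the function and flag names are illustrative assumptions, not minOLMo's actual API:

```python
# Hypothetical sketch of a config-driven flash attention toggle.
import torch
import torch.nn.functional as F

def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
              use_flash: bool = False) -> torch.Tensor:
    if use_flash:
        # Fused kernel (flash attention on supported CUDA hardware;
        # requires PyTorch >= 2.0).
        return F.scaled_dot_product_attention(q, k, v)
    # Naive reference path: explicit score matrix, O(n^2) memory.
    scores = q @ k.transpose(-2, -1) / (q.size(-1) ** 0.5)
    return scores.softmax(dim=-1) @ v
```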
19 changes: 19 additions & 0 deletions docs/about.rst
@@ -0,0 +1,19 @@
About
=====

The minOLMo package and its documentation present an analysis of the Open Language Model (OLMo), a deep learning model for natural language processing tasks. OLMo combines a transformer-based architecture with training strategies designed to improve performance across a range of linguistic benchmarks. This document describes the model's design, including its underlying architecture, data flow, and deployment across different environments, such as on-prem clusters and cloud platforms.
We detail the methodologies employed in training the model, including data preprocessing, model optimization, and evaluation metrics used to validate performance on tasks such as text classification, question answering, and language inference.


minOLMo has been forked from the original OLMo model, with the primary goal of removing extra complexity and distributed training capabilities. The streamlined version is a large language model with a simplified codebase that is easy to understand and follow. It is designed to be run and explored by researchers on a single GPU, making it accessible to those who want to delve into the workings of large language models without extensive computational resources.

The following shows the fork history of the minOLMo package:

- Apr 5, 2024: David Brandfonbrener forked the original OLMo repository to create the min-olmo repository. David removed the distributed training capabilities.
- Apr 22, 2024: Kempner Institute forked the min-olmo repository from David's fork to create the KempnerInstitute/min-olmo repository. Any code after this date is from Kempner Institute affiliated contributors.


Acknowledgements
----------------

The authors wish to express their gratitude to Eran Malach at the Kempner Institute for insightful discussions on the OLMo model and to David Brandfonbrener at the Kempner Institute for providing access to the min-olmo codebase.
35 changes: 34 additions & 1 deletion docs/conf.py
@@ -21,13 +21,46 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
"sphinx.ext.intersphinx",
'sphinx.ext.autodoc', # Automatically document from docstrings
'sphinx.ext.napoleon', # Support for Google and NumPy style docstrings
'sphinx.ext.viewcode', # Add links to highlighted source code
'sphinx.ext.autosummary', # Generate summary tables for modules/classes
'myst_parser', # Support for Markdown files (if using Markdown)
"sphinx.ext.doctest",
"sphinx.ext.mathjax",
"nbsphinx",
'sphinx.ext.autosectionlabel',
'sphinx.ext.githubpages',
'sphinx_copybutton',
'sphinxcontrib.bibtex',

]
myst_enable_extensions = [
"dollarmath",
"amsmath",
"colon_fence",
"smartquotes",
"linkify",
"substitution",
]

bibtex_bibfiles = ['refs.bib']


# Enable syntax highlighting
pygments_style = 'default'  # Syntax highlighting style; other Pygments styles also work


# Figure numbers
numfig = True

numfig_format = {
'figure': 'Figure %s',
'table': 'Table %s',
'code-block': 'Listing %s',
'section': 'Section %s',
}

templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
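
Because `sphinx.ext.autodoc` and `sphinx.ext.napoleon` are enabled above, Google-style docstrings are picked up automatically when modules are documented. A minimal sketch of the docstring format these extensions consume follows; the function itself is illustrative, not from the minOLMo codebase:

```python
def scaled_loss(loss: float, scale: float = 1.0) -> float:
    """Scale a loss value before logging.

    Args:
        loss: Raw loss value from a training step.
        scale: Multiplicative factor applied to ``loss``.

    Returns:
        The scaled loss, ``loss * scale``.
    """
    return loss * scale
```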
149 changes: 0 additions & 149 deletions docs/docs/source/minolmo.rst

This file was deleted.

7 changes: 0 additions & 7 deletions docs/docs/source/modules.rst

This file was deleted.

35 changes: 28 additions & 7 deletions docs/index.rst
@@ -3,18 +3,39 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.

minOLMo documentation
=====================
minOLMo
=======


.. toctree::
:maxdepth: 2
:caption: Introduction

about

.. toctree::
:maxdepth: 2
:caption: User Guide

source/users/users
source/users/single_submission
source/users/sweep_submission
source/users/train_data
source/users/data_loader
source/users/random_topics


.. toctree::
:maxdepth: 2
:caption: References

source/users/references

Add your content using ``reStructuredText`` syntax. See the
`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_
documentation for details.


.. toctree::
:maxdepth: 2
:caption: Contents:
:caption: API Reference

source/minolmo
source/modules

39 changes: 39 additions & 0 deletions docs/refs.bib
@@ -0,0 +1,39 @@
@Article{Groeneveld_2024_arxiv,
title={{OLMo}: Accelerating the Science of Language Models},
author={Groeneveld, Dirk and Beltagy, Iz and Walsh, Pete and Bhagia, Akshita and Kinney, Rodney and Tafjord, Oyvind and Jha, Ananya Harsh and Ivison, Hamish and Magnusson, Ian and Wang, Yizhong and others},
journal={arXiv preprint arXiv:2402.00838},
year={2024}
}

@misc{Soldaini_2024_arxiv,
title={Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research},
author={Luca Soldaini and Rodney Kinney and Akshita Bhagia and Dustin Schwenk and David Atkinson and Russell Authur and Ben Bogin and Khyathi Chandu and Jennifer Dumas and Yanai Elazar and Valentin Hofmann and Ananya Harsh Jha and Sachin Kumar and Li Lucy and Xinxi Lyu and Nathan Lambert and Ian Magnusson and Jacob Morrison and Niklas Muennighoff and Aakanksha Naik and Crystal Nam and Matthew E. Peters and Abhilasha Ravichander and Kyle Richardson and Zejiang Shen and Emma Strubell and Nishant Subramani and Oyvind Tafjord and Pete Walsh and Luke Zettlemoyer and Noah A. Smith and Hannaneh Hajishirzi and Iz Beltagy and Dirk Groeneveld and Jesse Dodge and Kyle Lo},
year={2024},
eprint={2402.00159},
archivePrefix={arXiv},
primaryClass={cs.CL}
}

@misc{Google_T5_2022_Github,
author = {{Google Research}},
title = {T5: Text-To-Text Transfer Transformer},
year = {2024},
howpublished = {\url{https://github.com/google-research/text-to-text-transfer-transformer#c4}},
note = {Accessed: May 28}
}

@misc{AllenAI_C4_2021_Github,
author = {AllenAI},
title = {Download the C4 dataset!},
year = {2024},
howpublished = {\url{https://github.com/allenai/allennlp/discussions/5056}},
note = {Accessed: May 28}
}

@misc{numpy_memmap,
author = {NumPy},
title = {Create a memory-map to an array stored in a binary file on disk},
year = {2024},
howpublished = {\url{https://numpy.org/doc/stable/reference/generated/numpy.memmap.html}},
note = {Accessed: May 31}
}