Code

Summary: Test Plan:
facebookresearch · Dec 12, 2024 · 783f280 · 783f280
1 parent 4988b1e
commit 783f280
Show file tree

Hide file tree

Showing 66 changed files with 9,889 additions and 5 deletions.
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -0,0 +1,12 @@
+name: Lint with Black
+
+on: [push, pull_request]  # runs on every push and on every pull request
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # check out the repository so the linter has files to inspect
+      - uses: psf/black@stable
+        with:
+          version: "24.8.0"  # quoted so the version stays a string; pins the Black release the action runs
diff --git a/.github/workflows/isort.yml b/.github/workflows/isort.yml
@@ -0,0 +1,10 @@
+name: Lint with isort
+
+on: [push, pull_request]  # runs on every push and on every pull request
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # check out the repository so the linter has files to inspect
+      - uses: isort/isort-action@v1  # pinned to the v1 release tag rather than the mutable master branch
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,167 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.out
+
+figures/
+.vscode/
+data/
diff --git a/.prettierrc b/.prettierrc
@@ -0,0 +1,8 @@
+{
+  "overrides": [
+    {
+      "files": "*.{yaml,yml}",
+      "options": { "tabWidth": 2 }
+    }
+  ]
+}
diff --git a/README.md b/README.md
@@ -13,6 +13,11 @@ Scaling trends reveal training and inference efficiency benefits from dynamicall
 patches on average, along with qualitative improvements with reasoning and long tail generalization
 from modeling byte-sequences.
 
+## Development Status
+
+We are actively updating the BLT code to make it easier to reproduce our results.
+Please file an issue and/or be patient while we add more features!
+
 ## Quick start
 
 The following commands launch a SLURM job that creates an environment for Meta Lingua.
@@ -30,7 +35,7 @@ sbatch setup/create_env.sh
 Once that is done you can activate the environment
 
 ```bash
-conda activate lingua_<date>
+conda activate blt_<date>
 ```
 
 Use the provided script to download and prepare data from huggingface (among `fineweb_edu`, `fineweb_edu_10bt`, or `dclm_baseline_1.0`).
@@ -76,10 +81,10 @@ bash dev/lint.sh
 The BLT is partially based on Meta Lingua, so consider citing it in addition to our BLT paper if you re-use our work.
 
 ```
-@misc{meta_lingua,
-  author = {Mathurin Videau, Badr Youbi Idrissi, Daniel Haziza, Luca Wehrstedt, Jade Copet, Olivier Teytaud, David Lopez-Paz},
-  title = {{Meta Lingua}: A minimal {PyTorch LLM} training library},
-  url = {https://github.com/facebookresearch/lingua},
+@misc{meta_blt,
+  author = {Artidoro Pagnoni and Ram Pasunuru and Pedro Rodriguez and John Nguyen and Benjamin Muller and Margaret Li and Chunting Zhou and Lili Yu and Jason Weston and Luke Zettlemoyer and Gargi Ghosh and Mike Lewis and Ari Holtzman† and Srinivasan Iyer},
+  title = {Byte Latent Transformer: Patches Scale Better Than Tokens},
+  url = {https://github.com/facebookresearch/blt},
   year = {2024}
 }
 ```

diff --git a/apps/__init__.py b/apps/__init__.py
diff --git a/apps/main/__init__.py b/apps/main/__init__.py
diff --git a/apps/main/configs/eval.yaml b/apps/main/configs/eval.yaml
@@ -0,0 +1,35 @@
+name: "debug_evals"
+# ckpt_dir: !!CHANGETHIS!!  <- uncomment and set; quote the value if it starts with `!` (YAML tag indicator)
+# dump_dir: !!CHANGETHIS!!  <- uncomment and set; quote the value if it starts with `!` (YAML tag indicator)
+generator:
+  max_tokens: 8192
+  dtype: bf16
+  temperature: 1.0
+  top_p: 0.95
+harness:
+  tasks:  # each entry is either a bare task name or a mapping with per-task options
+    - hellaswag
+    - task: boolq
+      dataset_kwargs:
+        trust_remote_code: true
+    - task: nq_open
+      num_fewshot: 5
+    - piqa
+    - task: social_iqa
+      dataset_kwargs:
+        trust_remote_code: true
+    - triviaqa
+    - winogrande
+    - openbookqa
+    - arc_easy
+    - arc_challenge
+    - race
+    - commonsense_qa
+    # - coqa
+    - copa
+    - gsm8k
+    - bbh
+    - mmlu
+    - mmlu_pro
+validation:
+  max_steps: 1000  # NOTE(review): presumably caps the number of validation steps -- confirm against the eval code
diff --git a/apps/main/configs/llama_1B.yaml b/apps/main/configs/llama_1B.yaml
@@ -0,0 +1,87 @@
+# dump_dir: !!!CHANGE_THIS!!!
+name: large_lm
+steps: 60000  # plain digits: `60_000` is an int only under YAML 1.1 loaders, a string under YAML 1.2
+probe_freq: null
+seed: 777
+
+optim:
+  lr: 3.0e-3  # decimal point added so YAML 1.1 loaders (e.g. plain PyYAML) read a float, not the string "3e-3"
+  weight_decay: 0.033
+  warmup: 5000
+  lr_min_ratio: 0.000001
+  clip: 1.0
+
+distributed:
+  fsdp_type: full_shard
+  compile: true
+  model_dtype: bf16
+  matmul_allow_tf32: false
+  selective_activation_checkpointing: false
+  tp_size: 1  # NOTE(review): presumably the tensor-parallel size, 1 meaning disabled -- confirm
+
+model:
+  dim: 2048
+  n_layers: 25
+  n_heads: 16
+
+data:
+  root_dir: data/shuffled  # relative path; the repository's .gitignore excludes data/
+  sources:
+    dclm_baseline_1.0: 100.0  # only source listed; NOTE(review): confirm how source weights are normalized
+  batch_size: 4
+  prefetch_size: 1024
+  seq_len: 4096
+  n_views: 2
+  load_async: true
+  add_bos: true
+  add_eos: true
+  tokenizer:
+    name: tiktoken
+    path: tokenizers/cl_toplang_128k.tiktoken  # relative path; NOTE(review): confirm the file exists where the job runs
+
+profiling:
+  run: true
+  mem_warmup: 0
+  mem_steps: 4
+  profile_warmup: 100
+  profile_steps: 4
+
+checkpoint:
+  dump:
+    every: 2500
+    keep: 3
+  eval:
+    every: 5000
+    keep: -1  # NOTE(review): -1 presumably means "keep all" -- confirm against the checkpoint code
+
+logging:
+  freq: 1  # NOTE(review): presumably logs every step -- confirm
+
+async_eval_gpus: 8
+eval:  # nests the same generator/harness/validation keys that apps/main/configs/eval.yaml has at top level
+  harness:
+    tasks:  # each entry is either a bare task name or a mapping with per-task options
+      - hellaswag
+      - task: boolq
+        dataset_kwargs:
+          trust_remote_code: true
+      - piqa
+      - task: social_iqa
+        dataset_kwargs:
+          trust_remote_code: true
+      - winogrande
+      - openbookqa
+      - arc_easy
+      - arc_challenge
+      - race
+      - commonsense_qa
+      - copa  # the commented-out entries below are disabled in this config
+      # - coqa
+      # - task: nq_open
+      #   num_fewshot: 5
+      # - triviaqa
+  validation:
+    max_steps: 1000
+  generator:
+    max_tokens: 16384  # twice the 8192 set for the generator in apps/main/configs/eval.yaml
+    dtype: bf16