diff --git a/README.md b/README.md index 3024f59..859019b 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This is a package to fit and test variational autoencoder (VAE) models for T cell receptor sequences. It is described in the paper [_Deep generative models for T cell receptor protein sequences_](https://elifesciences.org/articles/46935) by Kristian Davidsen, Branden J Olson, William S DeWitt III, Jean Feng, Elias Harkins, Philip Bradley and Frederick A Matsen IV. +This project continues to evolve, but the version in the paper is [tagged 0.2.0](https://github.com/matsengrp/vampire/tree/0.2.0). ## Install @@ -83,8 +84,7 @@ Please get in touch if anything isn't clear. ## Code styling -This project uses [YAPF](https://github.com/google/yapf) for code formatting with a format defined in `setup.cfg`. -You can easily run yapf on all the files with a call to `yapf -ir .` in the root directory. +This project uses [Black](https://black.readthedocs.io/en/stable/) and [docformatter](https://pypi.org/project/docformatter/) for code formatting. Code also checked using [flake8](http://flake8.pycqa.org/en/latest/). `# noqa` comments cause lines to be ignored by flake8. diff --git a/install/environment.yml b/install/environment.yml index 4d9b5e4..397013c 100644 --- a/install/environment.yml +++ b/install/environment.yml @@ -3,12 +3,8 @@ channels: - defaults - conda-forge dependencies: - - python=3.6 + - python>=3.6 - biopython - - click - - flake8 - - keras - - matplotlib - pandas - parallel - pip @@ -16,10 +12,14 @@ dependencies: - pytest - scikit-learn - scons - - seaborn - yapf - pip: + - black + - click - delegator.py + - docformatter - exrex + - flake8 - nestly + - tensorflow - versioneer diff --git a/setup.cfg b/setup.cfg index a9465b1..3d4a00d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,3 @@ -[yapf] -based_on_style = pep8 -column_limit = 120 - [flake8] max-line-length = 120 diff --git a/vampire/SConstruct b/vampire/SConstruct index 9ef2beb..b392816 100644 --- a/vampire/SConstruct +++ b/vampire/SConstruct @@ -11,7 +11,7 @@ from os.path import join import SCons.Script as sc # Command line options -sc.AddOption('--pipe', type='string', help="Which pipeline to run.", default='pipe_main') +sc.AddOption('--pipe', type='string', help="Which pipeline to run.", default='pipe_simple') sc.AddOption('--data', type='string', help="Path to JSON describing data set. Default means use in-repo data.", default='sample_data/sample.json') sc.AddOption( '--mode', type='string', help="The mode of the run, which is up to the SConscript to define.", default='default') diff --git a/vampire/__init__.py b/vampire/__init__.py index 74f4e66..80edaf0 100644 --- a/vampire/__init__.py +++ b/vampire/__init__.py @@ -1,4 +1,4 @@ - from ._version import get_versions -__version__ = get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/vampire/_version.py b/vampire/_version.py index 1521e77..1de6c23 100644 --- a/vampire/_version.py +++ b/vampire/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build @@ -58,17 +57,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -116,16 +119,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -181,7 +190,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -190,7 +199,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -198,19 +207,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -225,8 +241,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -234,10 +249,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -260,17 +284,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -279,10 +302,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -293,13 +318,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -330,8 +355,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -445,11 +469,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -469,9 +495,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -485,8 +515,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -495,13 +524,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -515,6 +547,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/vampire/common.py b/vampire/common.py index f815427..5558d69 100644 --- a/vampire/common.py +++ b/vampire/common.py @@ -26,7 +26,7 @@ def running_avg(x): running_avg = np.cumsum(x, axis=0) for i in range(x.shape[1]): running_avg[:, i] /= np.arange(1, len(x) + 1) - return (running_avg) + return running_avg def repeat_row(a, which_entry, n_repeats): @@ -68,7 +68,7 @@ def strip_extn(in_path): Strips the extension. """ (path, extn) = os.path.splitext(str(in_path)) - if extn in ['.gz', '.bz2']: + if extn in [".gz", ".bz2"]: # Perhaps there is more to the suffix. return os.path.splitext(path)[0] else: @@ -87,7 +87,7 @@ def path_split_tail(in_path): Give the farthest right object in a path, whether it be a directory ending with a `/` or a file. """ - return os.path.split(in_path.rstrip('/'))[1] + return os.path.split(in_path.rstrip("/"))[1] def cjoin(path, *paths): @@ -103,7 +103,9 @@ def read_data_csv(fname): """ Read a CSV from our data path. """ - return pd.read_csv(pkg_resources.resource_filename('vampire', os.path.join('data', fname))) + return pd.read_csv( + pkg_resources.resource_filename("vampire", os.path.join("data", fname)) + ) # ### Misc functions ### diff --git a/vampire/custom_keras.py b/vampire/custom_keras.py index a22a96f..c2983ba 100644 --- a/vampire/custom_keras.py +++ b/vampire/custom_keras.py @@ -1,8 +1,8 @@ -import keras -from keras import backend as K -from keras.callbacks import Callback -from keras.engine.topology import Layer import tensorflow as tf +import tensorflow.keras as keras +from tensorflow.keras import backend as K +from tensorflow.keras.callbacks import Callback +from tensorflow.keras.layers import Layer # ### Callbacks ### @@ -36,15 +36,20 @@ class EmbedViaMatrix(Layer): def __init__(self, embedding_dim, **kwargs): self.embedding_dim = embedding_dim - self.kernel_initializer = keras.initializers.get('uniform') + self.kernel_initializer = keras.initializers.get("uniform") super(EmbedViaMatrix, self).__init__(**kwargs) def build(self, input_shape): # Create a trainable weight variable for this layer. # The first component of input_shape is the batch size (see https://keras.io/layers/core/#dense).
self.kernel = self.add_weight( - name='kernel', shape=(input_shape[2], self.embedding_dim), initializer='uniform', trainable=True) - super(EmbedViaMatrix, self).build(input_shape) # Be sure to call this at the end + name="kernel", + shape=(input_shape[2], self.embedding_dim), + initializer="uniform", + trainable=True, + ) + # Be sure to call this at the end + super(EmbedViaMatrix, self).build(input_shape) def call(self, x): return K.dot(x, self.kernel) @@ -63,13 +68,14 @@ class RightTensordot(Layer): """ def __init__(self, right_np_tensor, axes, **kwargs): - self.right_tf_tensor = tf.convert_to_tensor(right_np_tensor, dtype=tf.float32) + self.right_tf_tensor = tf.convert_to_tensor(value=right_np_tensor, dtype=tf.float32) assert type(axes) == int self.axes = axes super(RightTensordot, self).__init__(**kwargs) def build(self, input_shape): - super(RightTensordot, self).build(input_shape) # Be sure to call this at the end + # Be sure to call this at the end + super(RightTensordot, self).build(input_shape) def call(self, x): # Tensordotting with sums over the last `axes` indices of the first argument @@ -111,8 +117,9 @@ def call(self, x): # 20 - argmax will be >= 1 for any site that doesn't have gap in it. # Then we clip so it's = 1 for any site that doesn't have gap in it. # Note argmax with axis-=1 means over the last (amino acid) axis. - tf.clip_by_value(20. - tf.to_float(tf.argmax(x, axis=-1)), 0., 1.), - axis=1) + tf.clip_by_value(20.0 - tf.cast(tf.argmax(input=x, axis=-1), dtype=tf.float32), 0.0, 1.0), + axis=1, + ) def compute_output_shape(self, input_shape): # Make sure we have 21 states for our amino acid space. @@ -164,7 +171,9 @@ def __init__(self, v_max_germline_aas, j_max_germline_aas, **kwargs): super(ContiguousMatch, self).__init__(**kwargs) def build(self, input_shape): - super(ContiguousMatch, self).build(input_shape) # Be sure to call this at the end + super(ContiguousMatch, self).build( + input_shape + ) # Be sure to call this at the end def call(self, inputs): (x, v_germline_aa_onehot, j_germline_aa_onehot) = inputs @@ -173,11 +182,20 @@ def call(self, inputs): # then sum is to get an indicator of the germline matches of x across # the germline genes. The cumprod_sum then returns the number of # contiguous matches for the V and J gene sides. - return tf.stack([ - cumprod_sum(K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), self.v_max_germline_aas), - cumprod_sum(K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), self.j_max_germline_aas, reverse=True) - ], - axis=1) + return tf.stack( + [ + cumprod_sum( + K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), + self.v_max_germline_aas, + ), + cumprod_sum( + K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), + self.j_max_germline_aas, + reverse=True, + ), + ], + axis=1, + ) def compute_output_shape(self, input_shape): # All input should be of shape (batch_size, max_cdr3_len, len(AA_LIST)). diff --git a/vampire/execute.py b/vampire/execute.py index 056784b..2748701 100644 --- a/vampire/execute.py +++ b/vampire/execute.py @@ -24,7 +24,9 @@ hostname conda activate py36 cd {dir} -""".format(dir=os.getcwd()) +""".format( + dir=os.getcwd() +) def translate_paths(in_paths, dest_dir): @@ -38,17 +40,19 @@ def aux(): for path in in_paths: fname = os.path.basename(path) dest_path = os.path.join(dest_dir, fname) - yield dest_path, f'cp {path} {dest_path}' + yield dest_path, f"cp {path} {dest_path}" return zip(*aux()) @click.command() -@click.option('--clusters', default='', help='Clusters to submit to. 
Default is local execution.') -@click.option('--script-prefix', default='job', help='Prefix for job script name.') -@click.argument('sources') -@click.argument('targets') -@click.argument('to_execute_f_string') +@click.option( + "--clusters", default="", help="Clusters to submit to. Default is local execution." +) +@click.option("--script-prefix", default="job", help="Prefix for job script name.") +@click.argument("sources") +@click.argument("targets") +@click.argument("to_execute_f_string") def cli(clusters, script_prefix, sources, targets, to_execute_f_string): """ Execute a command with certain sources and targets, perhaps on a SLURM @@ -64,10 +68,10 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): """ # Remove all quotes: they can get in the way with our basename noodling. - sources = re.sub('"*\'*', '', sources) - targets = re.sub('"*\'*', '', targets) + sources = re.sub("\"*'*", "", sources) + targets = re.sub("\"*'*", "", targets) - if clusters == '': + if clusters == "": # Local execution. to_execute = to_execute_f_string.format(sources=sources, targets=targets) click.echo("Executing locally:") @@ -76,32 +80,34 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): job_uuid = uuid.uuid4().hex cluster_directory_d = { - 'beagle': '/mnt/beagle/delete10/matsen_e/vampire/uuid', - 'koshu': '/fh/scratch/delete30/matsen_e/vampire/uuid' + "beagle": "/mnt/beagle/delete10/matsen_e/vampire/uuid", + "koshu": "/fh/scratch/delete30/matsen_e/vampire/uuid", } if clusters in cluster_directory_d: # Put the data where the cluster likes it. input_dir = os.path.join(cluster_directory_d[clusters], job_uuid) sources_l, cp_instructions = translate_paths(sources.split(), input_dir) - cp_instructions = [f'mkdir -p {input_dir}'] + list(cp_instructions) - sources = ' '.join(sources_l) + cp_instructions = [f"mkdir -p {input_dir}"] + list(cp_instructions) + sources = " ".join(sources_l) else: cp_instructions = [] # Put the batch script in the directory of the first target. execution_dir = os.path.dirname(targets.split()[0]) - sentinel_path = os.path.join(execution_dir, 'sentinel.' + job_uuid) - script_name = f'{script_prefix}.{job_uuid}.sh' - with open(os.path.join(execution_dir, script_name), 'w') as fp: + sentinel_path = os.path.join(execution_dir, "sentinel." + job_uuid) + script_name = f"{script_prefix}.{job_uuid}.sh" + with open(os.path.join(execution_dir, script_name), "w") as fp: fp.write(sbatch_prelude) for instruction in cp_instructions: - fp.write(instruction + '\n') - fp.write(to_execute_f_string.format(sources=sources, targets=targets) + '\n') - fp.write(f'touch {sentinel_path}\n') + fp.write(instruction + "\n") + fp.write(to_execute_f_string.format(sources=sources, targets=targets) + "\n") + fp.write(f"touch {sentinel_path}\n") - out = subprocess.check_output(f'cd {execution_dir}; sbatch --clusters {clusters} {script_name}', shell=True) - click.echo(out.decode('UTF-8')) + out = subprocess.check_output( + f"cd {execution_dir}; sbatch --clusters {clusters} {script_name}", shell=True + ) + click.echo(out.decode("UTF-8")) # Wait until the sentinel file appears, then clean up. 
while not os.path.exists(sentinel_path): @@ -111,5 +117,5 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): return out -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/gene_name_conversion.py b/vampire/gene_name_conversion.py index 30bfbcb..17dd092 100644 --- a/vampire/gene_name_conversion.py +++ b/vampire/gene_name_conversion.py @@ -3,17 +3,23 @@ import vampire.common as common -translation_csv = 'adaptive-olga-translation.csv' +translation_csv = "adaptive-olga-translation.csv" def adaptive_to_olga_dict(): - gb = common.read_data_csv(translation_csv).dropna().groupby('locus') - return {locus: {row['adaptive']: row['olga'] for _, row in df.iterrows()} for locus, df in gb} + gb = common.read_data_csv(translation_csv).dropna().groupby("locus") + return { + locus: {row["adaptive"]: row["olga"] for _, row in df.iterrows()} + for locus, df in gb + } def olga_to_adaptive_dict(): - gb = common.read_data_csv(translation_csv).dropna().groupby('locus') - return {locus: {row['olga']: row['adaptive'] for _, row in df.iterrows()} for locus, df in gb} + gb = common.read_data_csv(translation_csv).dropna().groupby("locus") + return { + locus: {row["olga"]: row["adaptive"] for _, row in df.iterrows()} + for locus, df in gb + } def filter_by_gene_names(df, conversion_dict): @@ -21,22 +27,28 @@ def filter_by_gene_names(df, conversion_dict): Only allow through gene names that are present for both programs. """ allowed = {locus: set(d.keys()) for locus, d in conversion_dict.items()} - return df.loc[df['v_gene'].isin(allowed['TRBV']) & df['j_gene'].isin(allowed['TRBJ']), :] + return df.loc[ + df["v_gene"].isin(allowed["TRBV"]) & df["j_gene"].isin(allowed["TRBJ"]), : + ] def convert_gene_names(df, conversion_dict): converted = df.copy() - converted['v_gene'] = df['v_gene'].map(conversion_dict['TRBV'].get) - converted['j_gene'] = df['j_gene'].map(conversion_dict['TRBJ'].get) + converted["v_gene"] = df["v_gene"].map(conversion_dict["TRBV"].get) + converted["j_gene"] = df["j_gene"].map(conversion_dict["TRBJ"].get) return converted def convert_and_filter(df, conversion_dict): orig_len = len(df) - converted = convert_gene_names(filter_by_gene_names(df, conversion_dict), conversion_dict) + converted = convert_gene_names( + filter_by_gene_names(df, conversion_dict), conversion_dict + ) n_trimmed = orig_len - len(converted) if n_trimmed > 0: - click.echo(f"Warning: couldn't convert {n_trimmed} sequences and trimmed them off.") + click.echo( + f"Warning: couldn't convert {n_trimmed} sequences and trimmed them off." 
+ ) return converted @@ -49,24 +61,26 @@ def cli(): @cli.command() -@click.argument('adaptive_csv', type=click.Path(exists=True)) -@click.argument('olga_tsv', type=click.Path(writable=True)) +@click.argument("adaptive_csv", type=click.Path(exists=True)) +@click.argument("olga_tsv", type=click.Path(writable=True)) def adaptive2olga(adaptive_csv, olga_tsv): - df = pd.read_csv(adaptive_csv, usecols=['amino_acid', 'v_gene', 'j_gene']) - convert_and_filter(df, adaptive_to_olga_dict()).to_csv(olga_tsv, sep='\t', index=False, header=False) + df = pd.read_csv(adaptive_csv, usecols=["amino_acid", "v_gene", "j_gene"]) + convert_and_filter(df, adaptive_to_olga_dict()).to_csv( + olga_tsv, sep="\t", index=False, header=False + ) @cli.command() -@click.argument('olga_tsv', type=click.Path(exists=True)) -@click.argument('adaptive_csv', type=click.Path(writable=True)) +@click.argument("olga_tsv", type=click.Path(exists=True)) +@click.argument("adaptive_csv", type=click.Path(writable=True)) def olga2adaptive(olga_tsv, adaptive_csv): - df = pd.read_csv(olga_tsv, sep='\t', header=None) + df = pd.read_csv(olga_tsv, sep="\t", header=None) if len(df.columns) == 4: df = df.iloc[:, 1:4] assert len(df.columns) == 3 - df.columns = ['amino_acid', 'v_gene', 'j_gene'] + df.columns = ["amino_acid", "v_gene", "j_gene"] convert_and_filter(df, olga_to_adaptive_dict()).to_csv(adaptive_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/germline_cdr3_aa_tensor.py b/vampire/germline_cdr3_aa_tensor.py index 3e88f4c..f5ca757 100644 --- a/vampire/germline_cdr3_aa_tensor.py +++ b/vampire/germline_cdr3_aa_tensor.py @@ -2,7 +2,9 @@ import numpy as np -def aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len): +def aa_encoding_tensors( + germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len +): """ Build tensors that one-hot-encode the germline sequences that extend into the CDR3. V genes are left-aligned, while J genes are right-aligned. @@ -16,25 +18,24 @@ def aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, m cdr3_aas = pd.read_csv(germline_cdr3_csv, keep_default_na=False) d = { - locus: {gene: seq['sequence'].iloc[0] - for gene, seq in df.groupby('gene')} - for locus, df in cdr3_aas.groupby(['locus']) + locus: {gene: seq["sequence"].iloc[0] for gene, seq in df.groupby("gene")} + for locus, df in cdr3_aas.groupby(["locus"]) } # Make sure that we have all of the desired genes in our dictionary. - assert set(v_gene_list).issubset(d['TRBV'].keys()) - assert set(j_gene_list).issubset(d['TRBJ'].keys()) + assert set(v_gene_list).issubset(d["TRBV"].keys()) + assert set(j_gene_list).issubset(d["TRBJ"].keys()) v_gene_encoding = np.zeros((len(v_gene_list), max_cdr3_len, len(aa_list))) for gene in v_gene_list: - seq = d['TRBV'][gene] + seq = d["TRBV"][gene] gene_index = v_gene_dict[gene] for i, c in enumerate(seq): v_gene_encoding[gene_index, i, aa_dict[c]] = 1 j_gene_encoding = np.zeros((len(j_gene_list), max_cdr3_len, len(aa_list))) for gene in j_gene_list: - seq = d['TRBJ'][gene] + seq = d["TRBJ"][gene] gene_index = j_gene_dict[gene] # Here's how the right-aligned indexing works: start = max_cdr3_len - len(seq) diff --git a/vampire/layers.py b/vampire/layers.py index 1181743..063d4dc 100644 --- a/vampire/layers.py +++ b/vampire/layers.py @@ -23,8 +23,14 @@ def build(self, input_shape): # Create a trainable weight variable for this layer. 
# The first component of input_shape is the batch size (see https://keras.io/layers/core/#dense). self.kernel = self.add_weight( - name='kernel', shape=(input_shape[2], self.embedding_dim), initializer='uniform', trainable=True) - super(EmbedViaMatrix, self).build(input_shape) # Be sure to call this at the end + name="kernel", + shape=(input_shape[2], self.embedding_dim), + initializer="uniform", + trainable=True, + ) + super(EmbedViaMatrix, self).build( + input_shape + ) # Be sure to call this at the end def call(self, x): return K.dot(x, self.kernel) @@ -43,13 +49,15 @@ class RightTensordot(Layer): """ def __init__(self, right_np_tensor, axes, **kwargs): - self.right_tf_tensor = tf.convert_to_tensor(right_np_tensor, dtype=tf.float32) + self.right_tf_tensor = tf.convert_to_tensor(value=right_np_tensor, dtype=tf.float32) assert type(axes) == int self.axes = axes super(RightTensordot, self).__init__(**kwargs) def build(self, input_shape): - super(RightTensordot, self).build(input_shape) # Be sure to call this at the end + super(RightTensordot, self).build( + input_shape + ) # Be sure to call this at the end def call(self, x): # Tensordotting with sums over the last `axes` indices of the first argument @@ -91,8 +99,9 @@ def call(self, x): # 20 - argmax will be >= 1 for any site that doesn't have gap in it. # Then we clip so it's = 1 for any site that doesn't have gap in it. # Note argmax with axis-=1 means over the last (amino acid) axis. - tf.clip_by_value(20. - tf.to_float(tf.argmax(x, axis=-1)), 0., 1.), - axis=1) + tf.clip_by_value(20.0 - tf.cast(tf.argmax(input=x, axis=-1), dtype=tf.float32), 0.0, 1.0), + axis=1, + ) def compute_output_shape(self, input_shape): # Make sure we have 21 states for our amino acid space. @@ -144,7 +153,9 @@ def __init__(self, v_max_germline_aas, j_max_germline_aas, **kwargs): super(ContiguousMatch, self).__init__(**kwargs) def build(self, input_shape): - super(ContiguousMatch, self).build(input_shape) # Be sure to call this at the end + super(ContiguousMatch, self).build( + input_shape + ) # Be sure to call this at the end def call(self, inputs): (x, v_germline_aa_onehot, j_germline_aa_onehot) = inputs @@ -153,10 +164,20 @@ def call(self, inputs): # then sum is to get an indicator of the germline matches of x across # the germline genes. The cumprod_sum then returns the number of # contiguous matches for the V and J gene sides. - return tf.stack([ - cumprod_sum(K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), self.v_max_germline_aas), - cumprod_sum(K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), self.j_max_germline_aas, reverse=True) - ], axis=1) + return tf.stack( + [ + cumprod_sum( + K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), + self.v_max_germline_aas, + ), + cumprod_sum( + K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), + self.j_max_germline_aas, + reverse=True, + ), + ], + axis=1, + ) def compute_output_shape(self, input_shape): # All input should be of shape (batch_size, max_cdr3_len, len(AA_LIST)). 
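The hunks above carry the same custom layers in `vampire/custom_keras.py` and `vampire/layers.py` through the move from standalone Keras to `tensorflow.keras`. As a reading aid, here is a minimal, self-contained sketch of such a layer in the `tensorflow.keras` style; the class name, shapes, and initializer string are illustrative rather than taken from the repository, and the `Layer` base class is imported from its public location, `tensorflow.keras.layers`:

```python
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer


class EmbedViaMatrixSketch(Layer):
    """Embed the one-hot amino-acid axis via a trainable matrix (illustrative)."""

    def __init__(self, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        # input_shape is (batch_size, max_cdr3_len, n_aas); we embed the last axis.
        self.kernel = self.add_weight(
            name="kernel",
            shape=(input_shape[2], self.embedding_dim),
            initializer="random_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        # (batch, len, n_aas) . (n_aas, embedding_dim) -> (batch, len, embedding_dim)
        return K.dot(x, self.kernel)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], self.embedding_dim)
```

Applied to a one-hot CDR3 tensor of shape `(batch, max_cdr3_len, n_aas)`, a layer like this yields a `(batch, max_cdr3_len, embedding_dim)` embedding, which is how `cdr3_embedding` is produced in the model definitions below.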
diff --git a/vampire/models/basic.py b/vampire/models/basic.py index 3b5a906..cd25bc9 100644 --- a/vampire/models/basic.py +++ b/vampire/models/basic.py @@ -7,11 +7,12 @@ import numpy as np -import keras -from keras.models import Model -from keras.layers import Activation, Dense, Lambda, Input, Reshape -from keras import backend as K -from keras import objectives +import tensorflow as tf +import tensorflow.keras as keras +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Activation, Dense, Lambda, Input, Reshape +from tensorflow.keras import backend as K +from tensorflow.keras import objectives import vampire.common as common from vampire.custom_keras import BetaWarmup, EmbedViaMatrix @@ -19,7 +20,7 @@ def build(params): - beta = K.variable(params['beta']) + beta = K.variable(params["beta"]) def sampling(args): """ @@ -27,9 +28,11 @@ def sampling(args): the latent variables. """ z_mean, z_log_var = args - epsilon = K.random_normal(shape=(params['batch_size'], params['latent_dim']), mean=0.0, stddev=1.0) + epsilon = K.random_normal( + shape=(params["batch_size"], params["latent_dim"]), mean=0.0, stddev=1.0 + ) # Reparameterization trick! - return (z_mean + K.exp(z_log_var / 2) * epsilon) + return z_mean + K.exp(z_log_var / 2) * epsilon def vae_cdr3_loss(io_encoder, io_decoder): """ @@ -38,63 +41,103 @@ def vae_cdr3_loss(io_encoder, io_decoder): """ # Here we multiply by the number of sites, so that we have a # total loss across the sites rather than a mean loss. - xent_loss = params['max_cdr3_len'] * K.mean(objectives.categorical_crossentropy(io_encoder, io_decoder)) - kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) + xent_loss = params["max_cdr3_len"] * K.mean( + objectives.categorical_crossentropy(io_encoder, io_decoder) + ) + kl_loss = -0.5 * K.sum( + 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 + ) kl_loss *= beta - return (xent_loss + kl_loss) + return xent_loss + kl_loss # Input: - cdr3_input_shape = (params['max_cdr3_len'], params['n_aas']) - cdr3_input = Input(shape=cdr3_input_shape, name='cdr3_input') - v_gene_input = Input(shape=(params['n_v_genes'], ), name='v_gene_input') - j_gene_input = Input(shape=(params['n_j_genes'], ), name='j_gene_input') + cdr3_input_shape = (params["max_cdr3_len"], params["n_aas"]) + cdr3_input = Input(shape=cdr3_input_shape, name="cdr3_input") + v_gene_input = Input(shape=(params["n_v_genes"],), name="v_gene_input") + j_gene_input = Input(shape=(params["n_j_genes"],), name="j_gene_input") # Encoding layers: - cdr3_embedding = EmbedViaMatrix(params['aa_embedding_dim'], name='cdr3_embedding')(cdr3_input) - cdr3_embedding_flat = Reshape([params['aa_embedding_dim'] * params['max_cdr3_len']], - name='cdr3_embedding_flat')(cdr3_embedding) - v_gene_embedding = Dense(params['v_gene_embedding_dim'], name='v_gene_embedding')(v_gene_input) - j_gene_embedding = Dense(params['j_gene_embedding_dim'], name='j_gene_embedding')(j_gene_input) - merged_embedding = keras.layers.concatenate([cdr3_embedding_flat, v_gene_embedding, j_gene_embedding], - name='merged_embedding') - encoder_dense_1 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_1')(merged_embedding) - encoder_dense_2 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_2')(encoder_dense_1) + cdr3_embedding = EmbedViaMatrix(params["aa_embedding_dim"], name="cdr3_embedding")( + cdr3_input + ) + cdr3_embedding_flat = Reshape( + [params["aa_embedding_dim"] * params["max_cdr3_len"]], + 
name="cdr3_embedding_flat", + )(cdr3_embedding) + v_gene_embedding = Dense(params["v_gene_embedding_dim"], name="v_gene_embedding")( + v_gene_input + ) + j_gene_embedding = Dense(params["j_gene_embedding_dim"], name="j_gene_embedding")( + j_gene_input + ) + merged_embedding = keras.layers.concatenate( + [cdr3_embedding_flat, v_gene_embedding, j_gene_embedding], + name="merged_embedding", + ) + encoder_dense_1 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_1" + )(merged_embedding) + encoder_dense_2 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_2" + )(encoder_dense_1) # Latent layers: - z_mean = Dense(params['latent_dim'], name='z_mean')(encoder_dense_2) - z_log_var = Dense(params['latent_dim'], name='z_log_var')(encoder_dense_2) + z_mean = Dense(params["latent_dim"], name="z_mean")(encoder_dense_2) + z_log_var = Dense(params["latent_dim"], name="z_log_var")(encoder_dense_2) # Decoding layers: - z_l = Lambda(sampling, output_shape=(params['latent_dim'], ), name='z') - decoder_dense_1_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_1') - decoder_dense_2_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_2') - cdr3_post_dense_flat_l = Dense(np.array(cdr3_input_shape).prod(), activation='linear', name='cdr3_post_dense_flat') - cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name='cdr3_post_dense') - cdr3_output_l = Activation(activation='softmax', name='cdr3_output') - v_gene_output_l = Dense(params['n_v_genes'], activation='softmax', name='v_gene_output') - j_gene_output_l = Dense(params['n_j_genes'], activation='softmax', name='j_gene_output') + z_l = Lambda(sampling, output_shape=(params["latent_dim"],), name="z") + decoder_dense_1_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_1" + ) + decoder_dense_2_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_2" + ) + cdr3_post_dense_flat_l = Dense( + np.array(cdr3_input_shape).prod(), + activation="linear", + name="cdr3_post_dense_flat", + ) + cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name="cdr3_post_dense") + cdr3_output_l = Activation(activation="softmax", name="cdr3_output") + v_gene_output_l = Dense( + params["n_v_genes"], activation="softmax", name="v_gene_output" + ) + j_gene_output_l = Dense( + params["n_j_genes"], activation="softmax", name="j_gene_output" + ) post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_l([z_mean, z_log_var]))) - cdr3_output = cdr3_output_l(cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder))) + cdr3_output = cdr3_output_l( + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)) + ) v_gene_output = v_gene_output_l(post_decoder) j_gene_output = j_gene_output_l(post_decoder) # Define the decoder components separately so we can have it as its own model. 
- z_mean_input = Input(shape=(params['latent_dim'], )) + z_mean_input = Input(shape=(params["latent_dim"],)) decoder_post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_mean_input)) - decoder_cdr3_output = cdr3_output_l(cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder))) + decoder_cdr3_output = cdr3_output_l( + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)) + ) decoder_v_gene_output = v_gene_output_l(decoder_post_decoder) decoder_j_gene_output = j_gene_output_l(decoder_post_decoder) encoder = Model([cdr3_input, v_gene_input, j_gene_input], [z_mean, z_log_var]) - decoder = Model(z_mean_input, [decoder_cdr3_output, decoder_v_gene_output, decoder_j_gene_output]) - vae = Model([cdr3_input, v_gene_input, j_gene_input], [cdr3_output, v_gene_output, j_gene_output]) + decoder = Model( + z_mean_input, + [decoder_cdr3_output, decoder_v_gene_output, decoder_j_gene_output], + ) + vae = Model( + [cdr3_input, v_gene_input, j_gene_input], + [cdr3_output, v_gene_output, j_gene_output], + ) vae.compile( optimizer="adam", loss={ - 'cdr3_output': vae_cdr3_loss, - 'v_gene_output': keras.losses.categorical_crossentropy, - 'j_gene_output': keras.losses.categorical_crossentropy, + "cdr3_output": vae_cdr3_loss, + "v_gene_output": keras.losses.categorical_crossentropy, + "j_gene_output": keras.losses.categorical_crossentropy, }, loss_weights={ # Keep the cdr3_output weight to be 1. The weights are relative @@ -103,12 +146,13 @@ def vae_cdr3_loss(io_encoder, io_decoder): # weight as 1 then we can interpret beta in a straightforward way. "cdr3_output": 1, "j_gene_output": 0.1305, - "v_gene_output": 0.8138 - }) + "v_gene_output": 0.8138, + }, + ) - callbacks = [BetaWarmup(beta, params['beta'], params['warmup_period'])] + callbacks = [BetaWarmup(beta, params["beta"], params["warmup_period"])] - return {'encoder': encoder, 'decoder': decoder, 'vae': vae, 'callbacks': callbacks} + return {"encoder": encoder, "decoder": decoder, "vae": vae, "callbacks": callbacks} def prepare_data(x_df): diff --git a/vampire/models/count_match.py b/vampire/models/count_match.py index c4451b6..e2ee43e 100644 --- a/vampire/models/count_match.py +++ b/vampire/models/count_match.py @@ -28,14 +28,20 @@ import vampire.common as common import vampire.xcr_vector_conversion as conversion -from vampire.custom_keras import BetaWarmup, CDR3Length, ContiguousMatch, EmbedViaMatrix, RightTensordot +from vampire.custom_keras import ( + BetaWarmup, + CDR3Length, + ContiguousMatch, + EmbedViaMatrix, + RightTensordot, +) from vampire.germline_cdr3_aa_tensor import max_germline_aas def build(params): - beta = K.variable(params['beta']) + beta = K.variable(params["beta"]) def sampling(args): """ @@ -43,9 +49,11 @@ def sampling(args): the latent variables. """ z_mean, z_log_var = args - epsilon = K.random_normal(shape=(params['batch_size'], params['latent_dim']), mean=0.0, stddev=1.0) + epsilon = K.random_normal( + shape=(params["batch_size"], params["latent_dim"]), mean=0.0, stddev=1.0 + ) # Reparameterization trick! - return (z_mean + K.exp(z_log_var / 2) * epsilon) + return z_mean + K.exp(z_log_var / 2) * epsilon def vae_cdr3_loss(io_encoder, io_decoder): """ @@ -54,11 +62,14 @@ def vae_cdr3_loss(io_encoder, io_decoder): """ # Here we multiply by the number of sites, so that we have a # total loss across the sites rather than a mean loss. 
- xent_loss = params['max_cdr3_len'] * K.mean( - objectives.categorical_crossentropy(io_encoder, io_decoder)) - kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) + xent_loss = params["max_cdr3_len"] * K.mean( + objectives.categorical_crossentropy(io_encoder, io_decoder) + ) + kl_loss = -0.5 * K.sum( + 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 + ) kl_loss *= beta - return (xent_loss + kl_loss) + return xent_loss + kl_loss def mean_squared_error_2d(io_encoder, io_decoder): def mse(i): @@ -67,93 +78,173 @@ def mse(i): return mse(0) + mse(1) # Input: - cdr3_input_shape = (params['max_cdr3_len'], params['n_aas']) - cdr3_input = Input(shape=cdr3_input_shape, name='cdr3_input') - cdr3_length_input = Input(shape=(1, ), name='cdr3_length_input') - v_gene_input = Input(shape=(params['n_v_genes'], ), name='v_gene_input') - j_gene_input = Input(shape=(params['n_j_genes'], ), name='j_gene_input') - contiguous_match_input = Input(shape=(2, ), name='contiguous_match_input') + cdr3_input_shape = (params["max_cdr3_len"], params["n_aas"]) + cdr3_input = Input(shape=cdr3_input_shape, name="cdr3_input") + cdr3_length_input = Input(shape=(1,), name="cdr3_length_input") + v_gene_input = Input(shape=(params["n_v_genes"],), name="v_gene_input") + j_gene_input = Input(shape=(params["n_j_genes"],), name="j_gene_input") + contiguous_match_input = Input(shape=(2,), name="contiguous_match_input") # Encoding layers: - cdr3_embedding = EmbedViaMatrix(params['aa_embedding_dim'], name='cdr3_embedding')(cdr3_input) - cdr3_embedding_flat = Reshape([params['aa_embedding_dim'] * params['max_cdr3_len']], - name='cdr3_embedding_flat')(cdr3_embedding) - v_gene_embedding = Dense(params['v_gene_embedding_dim'], name='v_gene_embedding')(v_gene_input) - j_gene_embedding = Dense(params['j_gene_embedding_dim'], name='j_gene_embedding')(j_gene_input) + cdr3_embedding = EmbedViaMatrix(params["aa_embedding_dim"], name="cdr3_embedding")( + cdr3_input + ) + cdr3_embedding_flat = Reshape( + [params["aa_embedding_dim"] * params["max_cdr3_len"]], + name="cdr3_embedding_flat", + )(cdr3_embedding) + v_gene_embedding = Dense(params["v_gene_embedding_dim"], name="v_gene_embedding")( + v_gene_input + ) + j_gene_embedding = Dense(params["j_gene_embedding_dim"], name="j_gene_embedding")( + j_gene_input + ) merged_embedding = keras.layers.concatenate( - [cdr3_embedding_flat, cdr3_length_input, v_gene_embedding, j_gene_embedding, contiguous_match_input], - name='merged_embedding') - encoder_dense_1 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_1')(merged_embedding) - encoder_dense_2 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_2')(encoder_dense_1) + [ + cdr3_embedding_flat, + cdr3_length_input, + v_gene_embedding, + j_gene_embedding, + contiguous_match_input, + ], + name="merged_embedding", + ) + encoder_dense_1 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_1" + )(merged_embedding) + encoder_dense_2 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_2" + )(encoder_dense_1) # Latent layers: - z_mean = Dense(params['latent_dim'], name='z_mean')(encoder_dense_2) - z_log_var = Dense(params['latent_dim'], name='z_log_var')(encoder_dense_2) + z_mean = Dense(params["latent_dim"], name="z_mean")(encoder_dense_2) + z_log_var = Dense(params["latent_dim"], name="z_log_var")(encoder_dense_2) # Decoding layers: - z_l = Lambda(sampling, output_shape=(params['latent_dim'], ), name='z') - decoder_dense_1_l = 
Dense(params['dense_nodes'], activation='elu', name='decoder_dense_1') - decoder_dense_2_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_2') - cdr3_post_dense_flat_l = Dense(np.array(cdr3_input_shape).prod(), activation='linear', name='cdr3_post_dense_flat') - cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name='cdr3_post_dense') - cdr3_output_l = Activation(activation='softmax', name='cdr3_output') - v_gene_output_l = Dense(params['n_v_genes'], activation='softmax', name='v_gene_output') - j_gene_output_l = Dense(params['n_j_genes'], activation='softmax', name='j_gene_output') + z_l = Lambda(sampling, output_shape=(params["latent_dim"],), name="z") + decoder_dense_1_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_1" + ) + decoder_dense_2_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_2" + ) + cdr3_post_dense_flat_l = Dense( + np.array(cdr3_input_shape).prod(), + activation="linear", + name="cdr3_post_dense_flat", + ) + cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name="cdr3_post_dense") + cdr3_output_l = Activation(activation="softmax", name="cdr3_output") + v_gene_output_l = Dense( + params["n_v_genes"], activation="softmax", name="v_gene_output" + ) + j_gene_output_l = Dense( + params["n_j_genes"], activation="softmax", name="j_gene_output" + ) post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_l([z_mean, z_log_var]))) v_gene_output = v_gene_output_l(post_decoder) j_gene_output = j_gene_output_l(post_decoder) # Here's where we incorporate germline amino acid sequences into the output. - germline_cdr3_tensors = conversion.adaptive_aa_encoding_tensors(params['max_cdr3_len']) + germline_cdr3_tensors = conversion.adaptive_aa_encoding_tensors( + params["max_cdr3_len"] + ) (v_germline_cdr3_tensor, j_germline_cdr3_tensor) = germline_cdr3_tensors - (v_max_germline_aas, j_max_germline_aas) = [max_germline_aas(g) for g in germline_cdr3_tensors] - v_germline_cdr3_l = RightTensordot(v_germline_cdr3_tensor, axes=1, name='v_germline_cdr3') - j_germline_cdr3_l = RightTensordot(j_germline_cdr3_tensor, axes=1, name='j_germline_cdr3') - cdr3_length_output_l = CDR3Length(name='cdr3_length_output') - contiguous_match_output_l = ContiguousMatch(v_max_germline_aas, j_max_germline_aas, name='contiguous_match_output') + (v_max_germline_aas, j_max_germline_aas) = [ + max_germline_aas(g) for g in germline_cdr3_tensors + ] + v_germline_cdr3_l = RightTensordot( + v_germline_cdr3_tensor, axes=1, name="v_germline_cdr3" + ) + j_germline_cdr3_l = RightTensordot( + j_germline_cdr3_tensor, axes=1, name="j_germline_cdr3" + ) + cdr3_length_output_l = CDR3Length(name="cdr3_length_output") + contiguous_match_output_l = ContiguousMatch( + v_max_germline_aas, j_max_germline_aas, name="contiguous_match_output" + ) v_germline_cdr3 = v_germline_cdr3_l(v_gene_output) j_germline_cdr3 = j_germline_cdr3_l(j_gene_output) cdr3_output = cdr3_output_l( - Add(name='cdr3_pre_activation')([ - cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)), - Add(name='germline_cdr3')([v_germline_cdr3, j_germline_cdr3]) - ])) + Add(name="cdr3_pre_activation")( + [ + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)), + Add(name="germline_cdr3")([v_germline_cdr3, j_germline_cdr3]), + ] + ) + ) cdr3_length_output = cdr3_length_output_l(cdr3_output) - contiguous_match_output = contiguous_match_output_l([cdr3_output, v_germline_cdr3, j_germline_cdr3]) + contiguous_match_output = contiguous_match_output_l( + [cdr3_output, v_germline_cdr3, 
j_germline_cdr3] + ) # Define the decoder components separately so we can have it as its own model. - z_mean_input = Input(shape=(params['latent_dim'], )) + z_mean_input = Input(shape=(params["latent_dim"],)) decoder_post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_mean_input)) decoder_v_gene_output = v_gene_output_l(decoder_post_decoder) decoder_j_gene_output = j_gene_output_l(decoder_post_decoder) decoder_v_germline_cdr3 = v_germline_cdr3_l(decoder_v_gene_output) decoder_j_germline_cdr3 = j_germline_cdr3_l(decoder_j_gene_output) decoder_cdr3_output = cdr3_output_l( - Add(name='cdr3_pre_activation')([ - cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)), - Add(name='germline_cdr3')([decoder_v_germline_cdr3, decoder_j_germline_cdr3]) - ])) + Add(name="cdr3_pre_activation")( + [ + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)), + Add(name="germline_cdr3")( + [decoder_v_germline_cdr3, decoder_j_germline_cdr3] + ), + ] + ) + ) decoder_cdr3_length_output = cdr3_length_output_l(decoder_cdr3_output) decoder_contiguous_match_output = contiguous_match_output_l( - [decoder_cdr3_output, decoder_v_germline_cdr3, decoder_j_germline_cdr3]) - - encoder = Model([cdr3_input, cdr3_length_input, v_gene_input, j_gene_input, contiguous_match_input], - [z_mean, z_log_var]) - decoder = Model(z_mean_input, [ - decoder_cdr3_output, decoder_cdr3_length_output, decoder_v_gene_output, decoder_j_gene_output, - decoder_contiguous_match_output - ]) - vae = Model([cdr3_input, cdr3_length_input, v_gene_input, j_gene_input, contiguous_match_input], - [cdr3_output, cdr3_length_output, v_gene_output, j_gene_output, contiguous_match_output]) + [decoder_cdr3_output, decoder_v_germline_cdr3, decoder_j_germline_cdr3] + ) + + encoder = Model( + [ + cdr3_input, + cdr3_length_input, + v_gene_input, + j_gene_input, + contiguous_match_input, + ], + [z_mean, z_log_var], + ) + decoder = Model( + z_mean_input, + [ + decoder_cdr3_output, + decoder_cdr3_length_output, + decoder_v_gene_output, + decoder_j_gene_output, + decoder_contiguous_match_output, + ], + ) + vae = Model( + [ + cdr3_input, + cdr3_length_input, + v_gene_input, + j_gene_input, + contiguous_match_input, + ], + [ + cdr3_output, + cdr3_length_output, + v_gene_output, + j_gene_output, + contiguous_match_output, + ], + ) vae.compile( optimizer="adam", loss={ - 'cdr3_output': vae_cdr3_loss, - 'cdr3_length_output': keras.losses.mean_squared_error, - 'v_gene_output': keras.losses.categorical_crossentropy, - 'j_gene_output': keras.losses.categorical_crossentropy, - 'contiguous_match_output': mean_squared_error_2d + "cdr3_output": vae_cdr3_loss, + "cdr3_length_output": keras.losses.mean_squared_error, + "v_gene_output": keras.losses.categorical_crossentropy, + "j_gene_output": keras.losses.categorical_crossentropy, + "contiguous_match_output": mean_squared_error_2d, }, loss_weights={ # Keep the cdr3_output weight to be 1. 
The weights are relative @@ -164,23 +255,40 @@ def mse(i): "cdr3_output": 1, "j_gene_output": 0.1596, "v_gene_output": 0.9282, - "contiguous_match_output": 0.05 - }) + "contiguous_match_output": 0.05, + }, + ) - callbacks = [BetaWarmup(beta, params['beta'], params['warmup_period'])] + callbacks = [BetaWarmup(beta, params["beta"], params["warmup_period"])] - return {'encoder': encoder, 'decoder': decoder, 'vae': vae, 'callbacks': callbacks} + return {"encoder": encoder, "decoder": decoder, "vae": vae, "callbacks": callbacks} def prepare_data(x_df): cdr3_data, v_gene_data, j_gene_data = common.cols_of_df(x_df) - cdr3_length_data = conversion.cdr3_length_of_onehots(x_df['amino_acid']) + cdr3_length_data = conversion.cdr3_length_of_onehots(x_df["amino_acid"]) max_cdr3_len = cdr3_data.shape[1] - v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors(max_cdr3_len) - contiguous_match_data = conversion.contiguous_match_counts_df(x_df, v_germline_tensor, j_germline_tensor) - return [cdr3_data, cdr3_length_data, v_gene_data, j_gene_data, contiguous_match_data] + v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors( + max_cdr3_len + ) + contiguous_match_data = conversion.contiguous_match_counts_df( + x_df, v_germline_tensor, j_germline_tensor + ) + return [ + cdr3_data, + cdr3_length_data, + v_gene_data, + j_gene_data, + contiguous_match_data, + ] def interpret_output(output): - (cdr3_output, cdr3_length_output, v_gene_output, j_gene_output, contiguous_match_output) = output + ( + cdr3_output, + cdr3_length_output, + v_gene_output, + j_gene_output, + contiguous_match_output, + ) = output return (cdr3_output, v_gene_output, j_gene_output) diff --git a/vampire/pipe_simple/SConscript b/vampire/pipe_simple/SConscript new file mode 100644 index 0000000..a045d7c --- /dev/null +++ b/vampire/pipe_simple/SConscript @@ -0,0 +1,361 @@ +import json +import os +from os.path import join +import re + +import numpy as np # noqa + +import nestly +import nestly.scons as ns +import SCons.Script as sc + +import common +import tcr_vae + +from common import cluster_execution_string + +sc.Import('env') +localenv = env.Clone() # noqa + + +def check_mode(): + if localenv['mode'] not in ['mini', 'default']: + raise Exception(f"Unknown mode '{localenv['mode']}'") + + +def apply_mode(l): + """ + Default mode runs everything, and mini mode just runs a single element from the list. + """ + check_mode() + if localenv['mode'] == 'mini': + return [l[0]] + else: + return l + + +def default_params_by_mode(): + """ + Mini mode doesn't train for long. + """ + params = tcr_vae.TCRVAE.default_params() + + if localenv['mode'] == 'mini': + params['pretrains'] = 2 + params['warmup_period'] = 3 + params['epochs'] = 10 + + return params + + +def base_dict(): + """ + The dictionary that will be shared by all the nests. + + nseqs: the number of sequences generated by the various programs, and for + which we evaluate Pvae for on the real data. + + train_size: the number of sequences to take for training. + + max_q: the level for truncating q_{lvj} in the thymic Q calculation. + """ + + d = {'nseqs': 10000, + 'train_size': 100000, + 'max_q': 100, + } + + if localenv['mode'] == 'mini': + d['nseqs'] = 100 + + # Slurp up all key-value pairs in the JSON file into the base_dict. 
+ with open(localenv['data']) as fp: + for k, v in json.load(fp).items(): + d[k] = v + + return d + + +def numerical_nest_add(nest_name, number_list): + """ + Add an nest for a list of non-negative numbers, with nice zero-padded + directory names. + """ + nest.add(nest_name, apply_mode(number_list), label_func=common.zero_pad_list_func(number_list)) + + +# ### Nests and targets ### + +nest = ns.SConsWrap(nestly.Nest(base_dict=base_dict()), alias_environment=localenv) + +# Nest: the data set choice, named via data name prepended with `_output_`. +nest.add('data_label', [localenv['data']], label_func=lambda p: '_output_' + common.strip_dirpath_extn(p)) + +# The test_set_agg allows us to process the test sets once in the things that don't depend on the VAE. +nest.add_aggregate('test_set_agg', list) +# The test_set_info_agg gathers information about the processed test sets so we can get at them later. +nest.add_aggregate('test_set_info_agg', dict) +# The summarized_agg gathers everything we want to summarize and then stack at the end. +nest.add_aggregate('summarized_agg', list) +# The merged gathers merged per-sequence information on the test set. +nest.add_aggregate('merged_agg', list) +summarized_agg_names = [] +# nest.add_aggregate('loss_regression_agg', list) + + +@nest.add_target_with_env(localenv) +def sconscript(env, outdir, c): + """ + Copy SConscript file into output directory. + """ + return env.Command( + join(outdir, 'SConscript'), + 'SConscript', + 'cp $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def json_data(env, outdir, c): + """ + Copy JSON data file into output directory. + """ + return env.Command( + join(outdir, os.path.basename(localenv['data'])), + localenv['data'], + 'cp $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def vampire_version(env, outdir, c): + """ + Echo vampire version to file. + """ + return env.Command( + join(outdir, 'vampire-version.txt'), + [], + 'python -c "import vampire; print(vampire.__version__)" > $TARGET')[0] + + +# Nest: initial processing of the test sets. +nest.add('test_data', lambda c: c['test_paths'], label_func=common.strip_dirpath_extn) + + +@nest.add_target_with_env(localenv) +def test_head(env, outdir, c): + """ + Run the preprocess_adaptive.py script on the test data, which incorporates + downsampling. + """ + test_head_path = join(outdir, common.strip_dirpath_extn(c['test_data'])+'.head.csv') + # Remove a pesky and meaningless leading `./` that was making trouble later. + c['test_set_agg'].append(re.sub('^\./', '', test_head_path)) + return env.Command( + test_head_path, + c['test_data'], + f"python3 preprocess_adaptive.py --sample {c['nseqs']} $SOURCE $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def test_pgen(env, outdir, c): + test_pgen_path = join(outdir, 'test-head.pgen.tsv') + c['test_set_info_agg'][(str(c['test_head']), 'test_pgen')] = test_pgen_path + return env.Command( + test_pgen_path, + c['test_head'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + + +nest.pop('test_data') + + +# Nest: the training data. +# By using a nest we retain the option of having multiple training sets later. 
+nest.add('train_data', lambda c: [c['train_tsv_path']], label_func=common.strip_dirpath_extn) +summarized_agg_names.append('train_data') + + +# Nest: training set size +# numerical_nest_add('train_size', np.floor(common.logspace(5000, 500000, 6))) +# summarized_agg_names.append('train_size') + + +@nest.add_target_with_env(localenv) +def preprocess(env, outdir, c): + """ + Run the preprocess_adaptive.py script on the training/validation data. + """ + in_path = c['train_data'] + # We take enough for training and validation_head, with nseqs to spare. + n_to_sample = c['train_size'] + 2 * c['nseqs'] + return env.Command( + join(outdir, 'train.processed.csv'), + in_path, + f'python3 preprocess_adaptive.py --sample {n_to_sample} $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv, 'split') +@ns.name_targets +def split(env, outdir, c): + """ + Split the training data into training and validation. + """ + in_path = c['preprocess'] + return 'training', 'validation', env.Command( + [join(outdir, 'training.csv'), join(outdir, 'validation.csv')], + in_path, + f"python3 util.py split --train-size {c['train_size']} $SOURCE $TARGETS") + +@nest.add_target_with_env(localenv) +def training_head(env, outdir, c): + return env.Command( + common.strip_extn(c['split']['training'])+'.head.csv', + c['split']['training'], + f"head -n {c['nseqs']+1} $SOURCE > $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def training_head_pgen(env, outdir, c): + """ + Get Pgen for some of the training data. + """ + return env.Command( + join(outdir, 'training.head.pgen.tsv'), + c['training_head'], + 'adaptive-pgen.sh $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def validation_head(env, outdir, c): + return env.Command( + common.strip_extn(c['split']['validation'])+'.head.csv', + c['split']['validation'], + f"head -n {c['nseqs']+1} $SOURCE > $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def validation_pgen(env, outdir, c): + return env.Command( + join(outdir, common.strip_dirpath_extn(c['validation_head'])+'.pgen.csv'), + c['validation_head'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + +# Nest: the dimension of the latent space. +# numerical_nest_add('warmup_period', [5*i for i in range(0,6)]) +# summarized_agg_names.append('warmup_period') + +# Nest: the dimension of the latent space. +# numerical_nest_add('latent_dim', [30, 35, 40]) +# summarized_agg_names.append('latent_dim') + +# Nest: the number of dense nodes. +# numerical_nest_add('dense_nodes', [50, 75, 100, 125, 150]) +# summarized_agg_names.append('dense_nodes') + +# Nest: the amino acid embedding dimension. +# numerical_nest_add('aa_embedding_dim', [10, 15, 20, 21]) +# summarized_agg_names.append('aa_embedding_dim') + +# Nest: the V gene embedding dimension. +# numerical_nest_add('v_gene_embedding_dim', [20, 30, 40]) +# summarized_agg_names.append('v_gene_embedding_dim') + +# Nest: the strength of the KL component of the VAE loss. +# numerical_nest_add('beta', np.linspace(0.625, 1, 7)) +numerical_nest_add('beta', [0.75]) +summarized_agg_names.append('beta') + +# Nest: monitor for EarlyStopping. +# nest.add('stopping_monitor', ['loss', 'val_loss']) +# summarized_agg_names.append('stopping_monitor') + +# Nest: EarlyStopping patience. +# numerical_nest_add('patience', [5*x for x in range(7)]) +# summarized_agg_names.append('patience') + + +# Nest: the model. +# This needs to appear in the innermost nest so that we can pop it and then do some OLGA work. 
+# In order to keep things happy when merging with OLGA results, don't comment +# this out if you are only interested in one model. Instead, make a +# single-model nest. +# nest.add('model', apply_mode([common.strip_dirpath_extn(m) for m in glob.glob('../models/*.py')])) +nest.add('model', ['basic', 'count_match']) +summarized_agg_names.append('model') + + +@nest.add_target_with_env(localenv) +def model_params(env, outdir, c): + """ + Write out a file with parameters from which we can build our VAE. + """ + + # Copy over any relevant parameters from c into the params dictionary. + params = default_params_by_mode() + for k, v in c.items(): + if k in params: + params[k] = v + + return env.Command( + join(outdir, 'model_params.json'), + [], + f"echo '{json.dumps(params)}' > $TARGET")[0] + + +@nest.add_target_with_env(localenv, 'trained') +@ns.name_targets +def trained(env, outdir, c): + return 'weights', 'diagnostics', env.Command( + [join(outdir, 'best_weights.h5'), join(outdir, 'diagnostics.csv')], + [c['model_params'], c['split']['training']], + cluster_execution_string('tcr-vae train {sources} {targets}', localenv)) + + +# @nest.add_target_with_env(localenv) +# def loss(env, outdir, c): +# loss = join(outdir, 'loss.csv') +# return env.Command( +# loss, +# [c['model_params'], c['trained']['weights'], c['split']['training'], +# c['split']['validation']], +# cluster_execution_string('tcr-vae loss {sources} {targets}'))[0] + + +@nest.add_target_with_env(localenv) +def training_pvae(env, outdir, c): + pvae_call = f"tcr-vae pvae --limit-input-to {c['nseqs']}" + return env.Command( + join(outdir, common.strip_dirpath_extn(c['split']['training'])+'.head.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['split']['training']], + cluster_execution_string(pvae_call + ' {sources} {targets}', localenv))[0] + + +@nest.add_target_with_env(localenv) +def validation_pvae(env, outdir, c): + return env.Command( + join(outdir, common.strip_dirpath_extn(c['validation_head'])+'.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['validation_head']], + cluster_execution_string('tcr-vae pvae {sources} {targets}', localenv))[0] + + +@nest.add_target_with_env(localenv) +def vae_generated(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.csv'), + [c['model_params'], c['trained']['weights']], + f"tcr-vae generate --nseqs {c['nseqs']} $SOURCES $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def vae_generated_pgen(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.pgen.tsv'), + c['vae_generated'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + +@nest.add_target_with_env(localenv) +def vae_generated_pvae(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['vae_generated']], + cluster_execution_string('tcr-vae pvae {sources} {targets}', localenv))[0] + diff --git a/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 b/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 new file mode 100644 index 0000000..25ff8f7 Binary files /dev/null and b/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 differ diff --git a/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv b/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv new file mode 100644 index 0000000..ed794c8 --- /dev/null +++ b/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv @@ -0,0 +1,10 @@ +rearrangement amino_acid frame_type rearrangement_type templates reads 
frequency productive_frequency cdr3_length v_family v_gene v_allele d_family d_gene d_allele j_family j_gene j_allele v_deletions d5_deletions d3_deletions j_deletions n2_insertions n1_insertions v_index n1_index n2_index d_index j_index v_family_ties v_gene_ties v_allele_ties d_family_ties d_gene_ties d_allele_ties j_family_ties j_gene_ties j_allele_ties sequence_tags v_shm_count v_shm_indexes antibody sample_name species locus product_subtype kit_pool total_templates productive_templates outofframe_templates stop_templates dj_templates total_rearrangements productive_rearrangements outofframe_rearrangements stop_rearrangements dj_rearrangements total_reads total_productive_reads total_outofframe_reads total_stop_reads total_dj_reads productive_clonality productive_entropy sample_clonality sample_entropy sample_amount_ng sample_cells_mass_estimate fraction_productive_of_cells_mass_estimate sample_cells fraction_productive_of_cells max_productive_frequency max_frequency counting_method primer_set release_date sample_tags fraction_productive order_name kit_id total_t_cells +CTCACTCTGGAGTCTGCTGCCTCCTCCCAGACATCTGTATATTTCTGCGCCAGCAGTGAGGGAAATCAGCCCCAGCATTTTGGTGAT CASSEGNQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 36 TCRBV10 TCRBV10-01 unresolved TCRBJ01 TCRBJ01-05 01 2 1 8 4 0 0 45 -1 -1 60 63 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 12 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +AAGATCCAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCCCGACGCGAAATCAGCCCCAGCATTTTGGTGAT CASSPTRNQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV11 TCRBV11-02 02 unresolved TCRBJ01 TCRBJ01-05 01 5 2 7 4 4 2 42 54 59 56 63 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 21.3 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +CTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGTATAGGGGACCAGCCCCAGCATTTTGGTGAT CASSIGDQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 36 TCRBV19 TCRBV19-01 unresolved TCRBJ01 TCRBJ01-05 01 1 0 7 7 0 0 45 -1 -1 61 66 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial 
Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGCCCTCGGGGGGAACAGCCCCAGCATTTTGGTGAT CASSPRGEQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV19 TCRBV19-01 TCRBD02 TCRBD02-01 01 TCRBJ01 TCRBJ01-05 01 6 8 1 7 2 4 42 53 64 57 66 01,02 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +TGCTGTACCCTCTCAGACATCTGTGTACTTCTGTGCCAGCAGTGATCGCCCCAGGGGAGACCAAACTATTGCTACACCTTCGGTTCG Out VDJ 1 null 8.64655478024781E-6 null 50 TCRBV06 TCRBV06-04 TCRBD01 TCRBD01-01 01 TCRBJ01 TCRBJ01-02 01 3 4 2 9 13 6 31 45 57 51 70 01,02 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGGGCACACCTAATGGGTGGCTACACCTTCGGTTCG CASRAHLMGGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV19 TCRBV19-01 unresolved TCRBJ01 TCRBJ01-02 01 6 0 9 7 0 12 42 53 -1 65 68 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +GAGTGCCTTGGAGCTGGGGGACTCGGCCCTGTATCTCTGTGCCAGAAGCTTGGAACTCTTCAATGGGGTGGCTACACCTTCGGTTCG Out VDJ 1 null 8.64655478024781E-6 null 44 TCRBV05 TCRBV05-03 unresolved TCRBJ01 TCRBJ01-02 01 0 6 2 7 0 11 37 53 -1 64 68 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) 
Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +CAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCACCCCAAAGGGCCCGCCGTCTGGCTACACCTTCGGTTCG CASSTPKGPPSGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 45 TCRBV11 TCRBV11-02 02 unresolved TCRBJ01 TCRBJ01-02 01 5 5 3 7 9 7 36 48 59 55 68 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 21.3 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCAGTTCGACCTCCTCAGGGGCTGGCTACACCTTCGGTTCG CASSSTSSGAGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 42 TCRBV12 unresolved TCRBD01 TCRBD01-01 01 TCRBJ01 TCRBJ01-02 01 4 4 2 7 1 9 39 52 67 61 68 TCRBV12-03,TCRBV12-04 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 diff --git a/vampire/pipe_simple/sample_data/sample.json b/vampire/pipe_simple/sample_data/sample.json new file mode 100644 index 0000000..f5218b8 --- /dev/null +++ b/vampire/pipe_simple/sample_data/sample.json @@ -0,0 +1,9 @@ +{ + "comment": "Just for the purposes of this sample JSON run specification file, the test path overlaps with the train path. Of course, don't do this for a real analysis.", + "test_paths" : [ + "sample_data/02-0249_TCRB.4000.tsv.bz2" + ], + "train_tsv_path" : "sample_data/02-0249_TCRB.4000.tsv.bz2", + "train_size" : 1000, + "nseqs" : 100 +} diff --git a/vampire/preprocess_adaptive.py b/vampire/preprocess_adaptive.py index d31d31a..da0b973 100644 --- a/vampire/preprocess_adaptive.py +++ b/vampire/preprocess_adaptive.py @@ -15,10 +15,10 @@ # Sometimes Adaptive uses one set of column names, and sometimes another. HEADER_TRANSLATION_DICT = { - 'sequenceStatus': 'frame_type', - 'aminoAcid': 'amino_acid', - 'vGeneName': 'v_gene', - 'jGeneName': 'j_gene' + "sequenceStatus": "frame_type", + "aminoAcid": "amino_acid", + "vGeneName": "v_gene", + "jGeneName": "j_gene", } @@ -26,7 +26,7 @@ def filter_and_drop_frame(df): """ Select in-frame sequences and then drop that column. """ - return df.query('frame_type == "In"').drop('frame_type', axis=1) + return df.query('frame_type == "In"').drop("frame_type", axis=1) def filter_on_cdr3_bounding_aas(df): @@ -37,21 +37,24 @@ def filter_on_cdr3_bounding_aas(df): Note that according to the Adaptive docs the `amino_acid` column is indeed the CDR3 amino acid. 
""" - return df[df['amino_acid'].str.contains('^C.*F$') | df['amino_acid'].str.contains('^C.*YV$')] + return df[ + df["amino_acid"].str.contains("^C.*F$") + | df["amino_acid"].str.contains("^C.*YV$") + ] def filter_on_cdr3_length(df, max_len): """ Only take sequences that have a CDR3 of at most `max_len` length. """ - return df[df['amino_acid'].apply(len) <= max_len] + return df[df["amino_acid"].apply(len) <= max_len] def filter_on_TCRB(df): """ Only take sequences that have a resolved TCRB gene for V and J. """ - return df[df['v_gene'].str.contains('^TCRB') & df['j_gene'].str.contains('^TCRB')] + return df[df["v_gene"].str.contains("^TCRB") & df["j_gene"].str.contains("^TCRB")] def filter_on_olga(df): @@ -64,8 +67,8 @@ def filter_on_olga(df): * exclude TCRBJ2-7, which appears to be problematic for OLGA. """ d = conversion.adaptive_to_olga_dict() - del d['TRBJ']['TCRBJ02-05'] - del d['TRBJ']['TCRBJ02-07'] + del d["TRBJ"]["TCRBJ02-05"] + del d["TRBJ"]["TCRBJ02-07"] return conversion.filter_by_gene_names(df, d) @@ -89,7 +92,9 @@ def apply_all_filters(df, max_len=30, fail_fraction_remaining=None): click.echo(f"Requiring genes that are also present in the OLGA set: {len(df)} rows") if fail_fraction_remaining: if len(df) / original_count < fail_fraction_remaining: - raise Exception(f"We started with {original_count} sequences and now we have {len(df)}. Failing.") + raise Exception( + f"We started with {original_count} sequences and now we have {len(df)}. Failing." + ) return df.reset_index(drop=True) @@ -106,8 +111,8 @@ def collect_vjcdr3_duplicates(df): for idx, row in df.iterrows(): # nan means no CDR3 sequence. We don't want to include those. - if row['amino_acid'] is not np.nan: - key = '_'.join([row['v_gene'], row['j_gene'], row['amino_acid']]) + if row["amino_acid"] is not np.nan: + key = "_".join([row["v_gene"], row["j_gene"], row["amino_acid"]]) d[key].append(idx) return d @@ -138,7 +143,7 @@ def read_adaptive_tsv(path): snake_case and the other that uses camelCase. """ - test_bite = pd.read_csv(path, delimiter='\t', nrows=1) + test_bite = pd.read_csv(path, delimiter="\t", nrows=1) camel_columns = set(HEADER_TRANSLATION_DICT.keys()) snake_columns = set(HEADER_TRANSLATION_DICT.values()) @@ -150,7 +155,7 @@ def read_adaptive_tsv(path): else: raise Exception("Unknown column names!") - df = pd.read_csv(path, delimiter='\t', usecols=take_columns) + df = pd.read_csv(path, delimiter="\t", usecols=take_columns) df.rename(columns=HEADER_TRANSLATION_DICT, inplace=True) return df @@ -159,34 +164,39 @@ def read_adaptive_tsv(path): # Below we use Path rather than File because we don't want to have to figure # out whether a file is compressed or not-- Pandas will figure that out for us. 
@click.option( - '--fail-fraction-remaining', + "--fail-fraction-remaining", show_default=True, default=0.25, - help="Fail if the post-filtration fraction is below this number.") + help="Fail if the post-filtration fraction is below this number.", +) @click.option( - '--sample', + "--sample", type=int, - metavar='N', - help="Sample N sequences without replacement from the preprocessed sequences for output.") -@click.argument('in_tsv', type=click.Path(exists=True)) -@click.argument('out_csv', type=click.File('w')) + metavar="N", + help="Sample N sequences without replacement from the preprocessed sequences for output.", +) +@click.argument("in_tsv", type=click.Path(exists=True)) +@click.argument("out_csv", type=click.File("w")) def preprocess_tsv(fail_fraction_remaining, sample, in_tsv, out_csv): """ Preprocess the Adaptive TSV at IN_TSV and output to OUT_CSV. This includes doing filters as well as deduplicating on vjcdr3s. """ - df = apply_all_filters(read_adaptive_tsv(in_tsv), fail_fraction_remaining=fail_fraction_remaining) + df = apply_all_filters( + read_adaptive_tsv(in_tsv), fail_fraction_remaining=fail_fraction_remaining + ) if sample: if len(df) < sample: raise Exception( - f"We only have {len(df)} sequences so we can't sample {sample} of them without replacement. Failing.") + f"We only have {len(df)} sequences so we can't sample {sample} of them without replacement. Failing." + ) df = df.sample(n=sample) click.echo(f"Sampling: {sample} rows") df.to_csv(out_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": preprocess_tsv() diff --git a/vampire/requirements.txt b/vampire/requirements.txt index dc1c21e..cc27c34 100644 --- a/vampire/requirements.txt +++ b/vampire/requirements.txt @@ -3,7 +3,6 @@ click delegator.py exrex flake8 -keras matplotlib nestly pandas diff --git a/vampire/tcr_vae.py b/vampire/tcr_vae.py index eb7c22b..83b6913 100644 --- a/vampire/tcr_vae.py +++ b/vampire/tcr_vae.py @@ -24,9 +24,9 @@ import numpy as np import pandas as pd -import keras -import keras.backend as K -from keras.callbacks import EarlyStopping, ModelCheckpoint +import tensorflow.keras as keras +import tensorflow.keras.backend as K +from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint import scipy.special as special import scipy.stats as stats @@ -56,7 +56,7 @@ def logprob_of_obs_vect(probs, obs): class TCRVAE: def __init__(self, params): self.params = params - model = importlib.import_module('vampire.models.' + params['model']) + model = importlib.import_module("vampire.models." + params["model"]) # Digest the dictionary returned by model.build into self attributes. for submodel_name, submodel in model.build(params).items(): setattr(self, submodel_name, submodel) @@ -75,7 +75,7 @@ def default_params(cls): """ return dict( # Models: - model='basic', + model="basic", # Model parameters. latent_dim=20, dense_nodes=75, @@ -89,12 +89,13 @@ def default_params(cls): n_v_genes=len(conversion.TCRB_V_GENE_LIST), n_j_genes=len(conversion.TCRB_J_GENE_LIST), # Training parameters. - stopping_monitor='val_loss', + stopping_monitor="val_loss", batch_size=100, pretrains=10, warmup_period=20, epochs=500, - patience=20) + patience=20, + ) @classmethod def default(cls): @@ -108,7 +109,7 @@ def of_json_file(cls, fname): """ Build a TCRVAE from a parameter dictionary dumped to JSON. 
""" - with open(fname, 'r') as fp: + with open(fname, "r") as fp: return cls(json.load(fp)) @classmethod @@ -120,21 +121,21 @@ def of_directory(cls, path): `model_params.json` and a weights file called `best_weights.h5`. Here we load that information in. """ - v = cls.of_json_file(os.path.join(path, 'model_params.json')) - v.vae.load_weights(os.path.join(path, 'best_weights.h5')) + v = cls.of_json_file(os.path.join(path, "model_params.json")) + v.vae.load_weights(os.path.join(path, "best_weights.h5")) return v def serialize_params(self, fname): """ Dump model parameters to a file. """ - with open(fname, 'w') as fp: + with open(fname, "w") as fp: json.dump(self.params, fp) def reinitialize_weights(self): session = K.get_session() for layer in self.vae.layers: - if hasattr(layer, 'kernel_initializer'): + if hasattr(layer, "kernel_initializer"): layer.kernel.initializer.run(session=session) def get_data(self, fname, data_chunk_size=0): @@ -142,16 +143,22 @@ def get_data(self, fname, data_chunk_size=0): Get data in the correct format from fname. If data_chunk_size is nonzero, trim so the data length is a multiple of data_chunk_size. """ - df = pd.read_csv(fname, usecols=['amino_acid', 'v_gene', 'j_gene']) + df = pd.read_csv(fname, usecols=["amino_acid", "v_gene", "j_gene"]) if data_chunk_size == 0: sub_df = df else: assert len(df) >= data_chunk_size n_to_take = len(df) - len(df) % data_chunk_size sub_df = df[:n_to_take] - return conversion.unpadded_tcrbs_to_onehot(sub_df, self.params['max_cdr3_len']) - - def fit(self, x_df: pd.DataFrame, validation_split: float, best_weights_fname: str, tensorboard_log_dir: str): + return conversion.unpadded_tcrbs_to_onehot(sub_df, self.params["max_cdr3_len"]) + + def fit( + self, + x_df: pd.DataFrame, + validation_split: float, + best_weights_fname: str, + tensorboard_log_dir: str, + ): """ Fit the vae with warmup and early stopping. """ @@ -160,46 +167,57 @@ def fit(self, x_df: pd.DataFrame, validation_split: float, best_weights_fname: s best_val_loss = np.inf # We pretrain a given number of times and take the best run for the full train. - for pretrain_idx in range(self.params['pretrains']): + for pretrain_idx in range(self.params["pretrains"]): self.reinitialize_weights() # In our first fitting phase we don't apply EarlyStopping so that # we get the number of specifed warmup epochs. # Below we apply the fact that right now the only thing in self.callbacks is the BetaSchedule callback. # If other callbacks appear we'll need to change this. if tensorboard_log_dir: - callbacks = [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir + '_warmup_' + str(pretrain_idx))] + callbacks = [ + keras.callbacks.TensorBoard( + log_dir=tensorboard_log_dir + "_warmup_" + str(pretrain_idx) + ) + ] else: callbacks = [] callbacks += self.callbacks # <- here re callbacks history = self.vae.fit( x=data, # y=X for a VAE. 
y=data, - epochs=1 + self.params['warmup_period'], - batch_size=self.params['batch_size'], + epochs=1 + self.params["warmup_period"], + batch_size=self.params["batch_size"], validation_split=validation_split, callbacks=callbacks, - verbose=2) - new_val_loss = history.history['val_loss'][-1] + verbose=2, + ) + new_val_loss = history.history["val_loss"][-1] if new_val_loss < best_val_loss: best_val_loss = new_val_loss self.vae.save_weights(best_weights_fname, overwrite=True) self.vae.load_weights(best_weights_fname) - checkpoint = ModelCheckpoint(best_weights_fname, save_best_only=True, mode='min') + checkpoint = ModelCheckpoint( + best_weights_fname, save_best_only=True, mode="min" + ) early_stopping = EarlyStopping( - monitor=self.params['stopping_monitor'], patience=self.params['patience'], mode='min') + monitor=self.params["stopping_monitor"], + patience=self.params["patience"], + mode="min", + ) callbacks = [checkpoint, early_stopping] if tensorboard_log_dir: callbacks += [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir)] self.vae.fit( x=data, # y=X for a VAE. y=data, - epochs=self.params['epochs'], - batch_size=self.params['batch_size'], + epochs=self.params["epochs"], + batch_size=self.params["batch_size"], validation_split=validation_split, callbacks=callbacks, - verbose=2) + verbose=2, + ) def evaluate(self, x_df, per_sequence=False): """ @@ -215,7 +233,9 @@ def evaluate(self, x_df, per_sequence=False): """ def our_evaluate(data): - return self.vae.evaluate(x=data, y=data, batch_size=self.params['batch_size'], verbose=0) + return self.vae.evaluate( + x=data, y=data, batch_size=self.params["batch_size"], verbose=0 + ) data = self.prepare_data(x_df) @@ -226,7 +246,12 @@ def our_evaluate(data): # data elements the minimum amount for evaluate to work (the # batch_size) and evaluate. return [ - our_evaluate([common.repeat_row(data_elt, i, self.params['batch_size']) for data_elt in data]) + our_evaluate( + [ + common.repeat_row(data_elt, i, self.params["batch_size"]) + for data_elt in data + ] + ) for i in range(len(x_df)) ] @@ -252,14 +277,16 @@ def generate(self, n_seqs): """ Generate a data frame of n_seqs sequences. """ - batch_size = self.params['batch_size'] + batch_size = self.params["batch_size"] # Increase the number of desired sequences as needed so it's divisible by batch_size. n_actual = batch_size * math.ceil(n_seqs / batch_size) # Sample from the latent space to generate sequences: - z_sample = np.random.normal(0, 1, size=(n_actual, self.params['latent_dim'])) + z_sample = np.random.normal(0, 1, size=(n_actual, self.params["latent_dim"])) amino_acid_arr, v_gene_arr, j_gene_arr = self.decode(z_sample) # Convert back, restricting to the desired number of sequences. - return conversion.onehot_to_tcrbs(amino_acid_arr[:n_seqs], v_gene_arr[:n_seqs], j_gene_arr[:n_seqs]) + return conversion.onehot_to_tcrbs( + amino_acid_arr[:n_seqs], v_gene_arr[:n_seqs], j_gene_arr[:n_seqs] + ) def log_pvae_importance_sample(self, x_df, out_ps): """ @@ -294,7 +321,7 @@ def log_pvae_importance_sample(self, x_df, out_ps): # We're going to be getting a one-sample estimate, so we want one slot # in our output array for each input sequence. - assert (len(x_df) == len(out_ps)) + assert len(x_df) == len(out_ps) # Get encoding of x's in the latent space. z_mean, z_sd = self.encode(x_df) @@ -310,10 +337,11 @@ def log_pvae_importance_sample(self, x_df, out_ps): # Loop over observations. 
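The comments above describe a one-sample importance estimate, which the loop below assembles for each sequence. A minimal standalone sketch of that estimate for a one-dimensional latent space (all numbers hypothetical):

    import numpy as np
    from scipy import stats

    z_mean, z_sd = 0.3, 0.9                    # encoder output for one sequence (made up)
    z = np.random.normal(z_mean, z_sd)         # sample from q(z|x)
    log_p_x_given_z = -12.0                    # decoder log-likelihood of x (made up)
    log_p_z = stats.norm.logpdf(z, 0.0, 1.0)   # standard-normal prior
    log_q_z_given_x = stats.norm.logpdf(z, z_mean, z_sd)
    log_p_x_estimate = log_p_x_given_z + log_p_z - log_q_z_given_x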
for i in range(len(x_df)): - log_p_x_given_z = \ - logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + \ - np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + \ - np.log(np.sum(j_gene_probs[i] * j_gene_obs[i])) + log_p_x_given_z = ( + logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + + np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + + np.log(np.sum(j_gene_probs[i] * j_gene_obs[i])) + ) # p(z) # Here we use that the PDF of a multivariate normal with # diagonal covariance is the product of the PDF of the @@ -336,11 +364,11 @@ def cli(): @cli.command() -@click.option('--tensorboard', is_flag=True, help="Record logs for TensorBoard.") -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('train_csv', type=click.File('r')) -@click.argument('best_weights_fname', type=click.Path(writable=True)) -@click.argument('diagnostics_fname', type=click.Path(writable=True)) +@click.option("--tensorboard", is_flag=True, help="Record logs for TensorBoard.") +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("train_csv", type=click.File("r")) +@click.argument("best_weights_fname", type=click.Path(writable=True)) +@click.argument("diagnostics_fname", type=click.Path(writable=True)) def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_fname): """ Train the model described in params_json using data in train_csv, saving @@ -355,11 +383,11 @@ def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_f # If this fails then we may have problems with chunks of the data being the # wrong length. assert sub_chunk_size == float(int(sub_chunk_size)) - min_data_size = validation_split_multiplier * v.params['batch_size'] + min_data_size = validation_split_multiplier * v.params["batch_size"] train_data = v.get_data(train_csv, min_data_size) if tensorboard: - tensorboard_log_dir = os.path.join(os.path.dirname(best_weights_fname), 'logs') + tensorboard_log_dir = os.path.join(os.path.dirname(best_weights_fname), "logs") else: tensorboard_log_dir = None v.fit(train_data, validation_split, best_weights_fname, tensorboard_log_dir) @@ -370,18 +398,21 @@ def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_f vp.vae.load_weights(best_weights_fname) df = pd.DataFrame( - OrderedDict([('train', v.evaluate(train_data)), ('vp_train', vp.evaluate(train_data))]), - index=v.vae.metrics_names) + OrderedDict( + [("train", v.evaluate(train_data)), ("vp_train", vp.evaluate(train_data))] + ), + index=v.vae.metrics_names, + ) df.to_csv(diagnostics_fname) return v @cli.command() -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('train_csv', type=click.File('r')) -@click.argument('validation_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("train_csv", type=click.File("r")) +@click.argument("validation_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def loss(params_json, model_weights, train_csv, validation_csv, out_csv): """ Record aggregate losses. 
@@ -391,17 +422,25 @@ def loss(params_json, model_weights, train_csv, validation_csv, out_csv): v.vae.load_weights(model_weights) df = pd.DataFrame( - OrderedDict([('train', v.evaluate(v.get_data(train_csv, v.params['batch_size']))), - ('validation', v.evaluate(v.get_data(validation_csv, v.params['batch_size'])))]), - index=v.vae.metrics_names) + OrderedDict( + [ + ("train", v.evaluate(v.get_data(train_csv, v.params["batch_size"]))), + ( + "validation", + v.evaluate(v.get_data(validation_csv, v.params["batch_size"])), + ), + ] + ), + index=v.vae.metrics_names, + ) df.to_csv(out_csv) @cli.command() -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('in_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("in_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def per_seq_loss(params_json, model_weights, in_csv, out_csv): """ Record per-sequence losses. @@ -411,18 +450,31 @@ def per_seq_loss(params_json, model_weights, in_csv, out_csv): v.vae.load_weights(model_weights) df = pd.DataFrame( - np.array(v.evaluate(v.get_data(in_csv, v.params['batch_size']), per_sequence=True)), - columns=v.vae.metrics_names) + np.array( + v.evaluate(v.get_data(in_csv, v.params["batch_size"]), per_sequence=True) + ), + columns=v.vae.metrics_names, + ) df.to_csv(out_csv, index=False) @cli.command() -@click.option('--limit-input-to', default=None, type=int, help='Only use the first input sequences.') -@click.option('--nsamples', default=500, show_default=True, help='Number of importance samples to use.') -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('test_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.option( + "--limit-input-to", + default=None, + type=int, + help="Only use the first input sequences.", +) +@click.option( + "--nsamples", + default=500, + show_default=True, + help="Number of importance samples to use.", +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("test_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv): """ Estimate Pvae of the sequences in test_csv for the VAE determined by @@ -437,10 +489,12 @@ def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv df_x = v.get_data(test_csv) if limit_input_to is not None: - df_x = df_x.iloc[:int(limit_input_to)] + df_x = df_x.iloc[: int(limit_input_to)] log_p_x = np.zeros((nsamples, len(df_x))) - click.echo("Calculating pvae for {} via importance sampling...".format(test_csv.name)) + click.echo( + "Calculating pvae for {} via importance sampling...".format(test_csv.name) + ) with click.progressbar(range(nsamples)) as bar: for i in bar: @@ -448,21 +502,52 @@ def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv # Calculate log of mean of numbers given in log space. 
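A tiny sketch of that log-space mean (the numbers are arbitrary): probabilities far too small to average as plain floats are averaged via logsumexp minus the log of the count, exactly as in the line below.

    import numpy as np
    from scipy import special

    log_p = np.log(np.array([1e-300, 2e-300, 3e-300]))   # tiny probabilities kept as logs
    log_mean = special.logsumexp(log_p) - np.log(len(log_p))
    # log_mean equals log(2e-300) up to floating-point error, with no underflow.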
avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) - pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False) + pd.DataFrame({"log_p_x": avg}).to_csv(out_csv, index=False) @cli.command() -@click.option('--nsamples', default=100, show_default=True, help="Number of importance samples to use.") -@click.option('--batch-size', default=100, show_default=True, help="Batch size for tcregex calculation.") -@click.option('--max-iters', default=100, show_default=True, help="The maximum number of batch iterations to use.") @click.option( - '--track-last', default=5, show_default=True, help="We want the SD of the last track-last to be less than tol.") -@click.option('--tol', default=0.005, show_default=True, help="Tolerance for tcregex accuracy.") -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('in_tcregex') -@click.argument('out_csv', type=click.File('w')) -def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, model_weights, in_tcregex, out_csv): + "--nsamples", + default=100, + show_default=True, + help="Number of importance samples to use.", +) +@click.option( + "--batch-size", + default=100, + show_default=True, + help="Batch size for tcregex calculation.", +) +@click.option( + "--max-iters", + default=100, + show_default=True, + help="The maximum number of batch iterations to use.", +) +@click.option( + "--track-last", + default=5, + show_default=True, + help="We want the SD of the last track-last to be less than tol.", +) +@click.option( + "--tol", default=0.005, show_default=True, help="Tolerance for tcregex accuracy." +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("in_tcregex") +@click.argument("out_csv", type=click.File("w")) +def tcregex_pvae( + nsamples, + batch_size, + max_iters, + track_last, + tol, + params_json, + model_weights, + in_tcregex, + out_csv, +): """ Calculate Pvae for a TCR specified by a tcregex. @@ -484,7 +569,9 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, for batch_i in range(max_iters): df_generated = tcregex.sample_tcregex(in_tcregex, batch_size) - df_x = conversion.unpadded_tcrbs_to_onehot(df_generated, v.params['max_cdr3_len']) + df_x = conversion.unpadded_tcrbs_to_onehot( + df_generated, v.params["max_cdr3_len"] + ) log_p_x = np.zeros((nsamples, len(df_x))) @@ -493,13 +580,17 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, # Calculate log of mean of numbers given in log space. # This calculates the per-sequence log_p_x estimate. 
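For reference, the "tcregex" mini-format consumed by this command is v_gene,j_gene,cdr3-regex, where '.' matches any amino acid and the ambiguity codes B and Z expand to [DN] and [EQ] (see tcregex.py later in this patch). A hedged usage sketch, assuming tcregex is importable as a plain module:

    import tcregex  # assumption: run from within the vampire/ directory

    df = tcregex.sample_tcregex("TCRBV19-01,TCRBJ01-02,CASS.GNQPQHF", 5)
    # df has columns amino_acid, v_gene, j_gene with five CDR3s sampled from the regex.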
- df_generated['log_p_x'] = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) + df_generated["log_p_x"] = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) generated_dfs.append(df_generated) catted = pd.concat(generated_dfs) - means.append(special.logsumexp(catted['log_p_x'], axis=0) - np.log(len(catted))) + means.append(special.logsumexp(catted["log_p_x"], axis=0) - np.log(len(catted))) if len(means) > track_last: recent_sd = np.std(np.array(means[-track_last:])) - click.echo("[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format(batch_i, means[-1], recent_sd, tol)) + click.echo( + "[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format( + batch_i, means[-1], recent_sd, tol + ) + ) if recent_sd < tol: break else: @@ -510,10 +601,16 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, @cli.command() -@click.option('-n', '--nseqs', default=100, show_default=True, help='Number of sequences to generate.') -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('out_csv', type=click.File('w')) +@click.option( + "-n", + "--nseqs", + default=100, + show_default=True, + help="Number of sequences to generate.", +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("out_csv", type=click.File("w")) def generate(nseqs, params_json, model_weights, out_csv): """ Generate some sequences and write them to a file. @@ -523,5 +620,5 @@ def generate(nseqs, params_json, model_weights, out_csv): v.generate(nseqs).to_csv(out_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/tcregex.py b/vampire/tcregex.py index b534ac0..81a0dea 100644 --- a/vampire/tcregex.py +++ b/vampire/tcregex.py @@ -9,12 +9,12 @@ # This is how we specify the "tcregex" format. replacement_dict = { # . means any amino acid - '\.': '[' + ''.join(aas) + ']', + "\.": "[" + "".join(aas) + "]", # Amino acid ambiguity codes as per # http://www.virology.wisc.edu/acp/Classes/DropFolders/Drop660_lectures/SingleLetterCode.html # https://febs.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1432-1033.1984.tb07877.x - 'B': '[DN]', - 'Z': '[EQ]' + "B": "[DN]", + "Z": "[EQ]", } @@ -40,15 +40,17 @@ def sample_split_tcregex(v_gene, j_gene, cdr3_tcregex, n): """ Sample from a tcregex that has been split into its components. """ - df = pd.DataFrame({'amino_acid': sample_cdr3_tcregex(cdr3_tcregex, n)}) - df['v_gene'] = v_gene - df['j_gene'] = j_gene - return (df) + df = pd.DataFrame({"amino_acid": sample_cdr3_tcregex(cdr3_tcregex, n)}) + df["v_gene"] = v_gene + df["j_gene"] = j_gene + return df def sample_tcregex(tcregex, n): - inputs = tcregex.split(',') + inputs = tcregex.split(",") if len(inputs) != 3: - raise Exception("tcregexes should be specified in the format `v_gene,j_gene,cdr3_tcregex`.") + raise Exception( + "tcregexes should be specified in the format `v_gene,j_gene,cdr3_tcregex`." 
+ ) v_gene, j_gene, cdr3_tcregex = inputs return sample_split_tcregex(v_gene, j_gene, cdr3_tcregex, n) diff --git a/vampire/tests/test_germline_cdr3_aa_tensor.py b/vampire/tests/test_germline_cdr3_aa_tensor.py index 63bad59..4a51180 100644 --- a/vampire/tests/test_germline_cdr3_aa_tensor.py +++ b/vampire/tests/test_germline_cdr3_aa_tensor.py @@ -5,23 +5,35 @@ def test_aa_encoding_tensors(): - aa_order = 'ABC' - v_gene_list = ['TCRBV01-01', 'TCRBV02-01'] - j_gene_list = ['TCRBJ01-01', 'TCRBJ01-02', 'TCRBJ01-03'] - germline_cdr3_csv = pkg_resources.resource_filename('vampire', 'data/germline-cdr3-aas.test.csv') + aa_order = "ABC" + v_gene_list = ["TCRBV01-01", "TCRBV02-01"] + j_gene_list = ["TCRBJ01-01", "TCRBJ01-02", "TCRBJ01-03"] + germline_cdr3_csv = pkg_resources.resource_filename( + "vampire", "data/germline-cdr3-aas.test.csv" + ) max_cdr3_len = 4 # B A # C - v_encoding_correct = np.array([[[0., 1., 0.], [1., 0., 0.], [0., 0., 0.], [0., 0., 0.]], - [[0., 0., 1.], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]]) + v_encoding_correct = np.array( + [ + [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ] + ) # A C # A # C B - j_encoding_correct = np.array([[[0., 0., 0.], [0., 0., 0.], [1., 0., 0.], [0., 0., 1.]], - [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [1., 0., 0.]], - [[0., 0., 0.], [0., 0., 0.], [0., 0., 1.], [0., 1., 0.]]]) + j_encoding_correct = np.array( + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]], + ] + ) - (v_enc, j_enc) = aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len) + (v_enc, j_enc) = aa_encoding_tensors( + germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len + ) assert np.array_equal(v_enc, v_encoding_correct) assert np.array_equal(j_enc, j_encoding_correct) diff --git a/vampire/tests/test_layers.py b/vampire/tests/test_layers.py index 7be7821..7b6c84f 100644 --- a/vampire/tests/test_layers.py +++ b/vampire/tests/test_layers.py @@ -5,8 +5,14 @@ def test_cumprod_sum(): - a = np.array([[[1, 1, 1, 0], [1, 0, 0, 0]], [[1, 1, 0, 1], [1, 0, 1, 1]], [[0, 1, 1, 1], [0, 0, 1, 1]]], - dtype=np.float32) + a = np.array( + [ + [[1, 1, 1, 0], [1, 0, 0, 0]], + [[1, 1, 0, 1], [1, 0, 1, 1]], + [[0, 1, 1, 1], [0, 0, 1, 1]], + ], + dtype=np.float32, + ) def np_cumprod_sum(a, length, reverse=False): if reverse: @@ -16,8 +22,10 @@ def np_cumprod_sum(a, length, reverse=False): for k in range(a.shape[0]): for length in range(1, 4): for reverse in [True, False]: - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_input = tf.constant(a[k, :, :], dtype=tf.float32) tf_result = layers.cumprod_sum(tf_input, length, reverse=reverse) tf_result = sess.run(tf_result) - assert np.allclose(np_cumprod_sum(a[k, :, :], length, reverse), tf_result) + assert np.allclose( + np_cumprod_sum(a[k, :, :], length, reverse), tf_result + ) diff --git a/vampire/tests/test_preprocess.py b/vampire/tests/test_preprocess.py index 2e47a9d..a8382bc 100644 --- a/vampire/tests/test_preprocess.py +++ b/vampire/tests/test_preprocess.py @@ -5,15 +5,15 @@ def test_filters(): - unfiltered = common.read_data_csv('adaptive-filter-test.csv') - correct = common.read_data_csv('adaptive-filter-test.correct.csv') + unfiltered = common.read_data_csv("adaptive-filter-test.csv") + correct = 
common.read_data_csv("adaptive-filter-test.correct.csv") filtered = pre.apply_all_filters(unfiltered) assert correct.equals(filtered) def test_dedup(): - original = common.read_data_csv('vjcdr3-dedup-test.csv') - correct = common.read_data_csv('vjcdr3-dedup-test.correct.csv') + original = common.read_data_csv("vjcdr3-dedup-test.csv") + correct = common.read_data_csv("vjcdr3-dedup-test.correct.csv") random.seed(1) deduped = pre.dedup_on_vjcdr3(original) assert correct.equals(deduped) diff --git a/vampire/tests/test_xcr_vector_conversion.py b/vampire/tests/test_xcr_vector_conversion.py index 8a4fd28..4d8bc12 100644 --- a/vampire/tests/test_xcr_vector_conversion.py +++ b/vampire/tests/test_xcr_vector_conversion.py @@ -7,11 +7,11 @@ def test_pad_middle(): with pytest.raises(AssertionError): - conversion.pad_middle('AAAAA', 2) - assert 'A---B' == conversion.pad_middle('AB', 5) - assert 'A--BC' == conversion.pad_middle('ABC', 5) - assert 'AB' == conversion.pad_middle('AB', 2) - assert '----' == conversion.pad_middle('---', 4) + conversion.pad_middle("AAAAA", 2) + assert "A---B" == conversion.pad_middle("AB", 5) + assert "A--BC" == conversion.pad_middle("ABC", 5) + assert "AB" == conversion.pad_middle("AB", 2) + assert "----" == conversion.pad_middle("---", 4) def test_gene_conversion(): @@ -22,17 +22,17 @@ def test_gene_conversion(): def test_aa_conversion(): - target = 'CASY' + target = "CASY" assert conversion.onehot_to_seq(conversion.seq_to_onehot(target)) == target - target = 'C-SY' + target = "C-SY" assert conversion.onehot_to_seq(conversion.seq_to_onehot(target)) == target def test_cdr3_length_of_onehots(): - data = common.read_data_csv('adaptive-filter-test.correct.csv') - lengths = data['amino_acid'].apply(len).apply(float) + data = common.read_data_csv("adaptive-filter-test.correct.csv") + lengths = data["amino_acid"].apply(len).apply(float) onehots = conversion.unpadded_tcrbs_to_onehot(data, 30) - assert lengths.equals(conversion.cdr3_length_of_onehots(onehots['amino_acid'])) + assert lengths.equals(conversion.cdr3_length_of_onehots(onehots["amino_acid"])) def test_contiguous_match_counts(): @@ -41,16 +41,25 @@ def test_contiguous_match_counts(): j01_02 = j[1] v01_v02_mixture = (v[0] + v[1]) / 2 - v01_j02_matching = conversion.seq_to_onehot('CTSSQ-------------------NYGYTF') - two_v01_match = conversion.seq_to_onehot('CTWSQ-------------------NYGYTF') - three_j02_match = conversion.seq_to_onehot('CTSSQ-------------------NYAYTF') + v01_j02_matching = conversion.seq_to_onehot("CTSSQ-------------------NYGYTF") + two_v01_match = conversion.seq_to_onehot("CTWSQ-------------------NYGYTF") + three_j02_match = conversion.seq_to_onehot("CTSSQ-------------------NYAYTF") # Here we have a complete match for both genes. - assert np.array_equal(conversion.contiguous_match_counts(v01_j02_matching, v01_01, j01_02), np.array([5., 6.])) + assert np.array_equal( + conversion.contiguous_match_counts(v01_j02_matching, v01_01, j01_02), + np.array([5.0, 6.0]), + ) # Here the V match is interrupted by a W instead of an S, and we can see the "contiguous" requirement working. - assert np.array_equal(conversion.contiguous_match_counts(two_v01_match, v01_01, j01_02), np.array([2., 6.])) + assert np.array_equal( + conversion.contiguous_match_counts(two_v01_match, v01_01, j01_02), + np.array([2.0, 6.0]), + ) # Equivalent test for J. 
- assert np.array_equal(conversion.contiguous_match_counts(three_j02_match, v01_01, j01_02), np.array([5., 3.])) + assert np.array_equal( + conversion.contiguous_match_counts(three_j02_match, v01_01, j01_02), + np.array([5.0, 3.0]), + ) # For the mixture, we have one residue that matches both, then one that # only matches V01, then another two that match both. You can see that in # the indicator decay from left to right. @@ -58,14 +67,20 @@ def test_contiguous_match_counts(): # 1., 0.5, 0.5, 0.5, 0.25, 0., 0., 0., 0., 0., 0., 0., 0., 0., # 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1. assert np.array_equal( - conversion.contiguous_match_counts(v01_j02_matching, v01_v02_mixture, j01_02), np.array([2.75, 6.])) + conversion.contiguous_match_counts(v01_j02_matching, v01_v02_mixture, j01_02), + np.array([2.75, 6.0]), + ) def test_contiguous_match_counts_df(): - test = conversion.unpadded_tcrbs_to_onehot(common.read_data_csv('adaptive-filter-test.correct.csv'), 30) + test = conversion.unpadded_tcrbs_to_onehot( + common.read_data_csv("adaptive-filter-test.correct.csv"), 30 + ) v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors(30) - result = conversion.contiguous_match_counts_df(test, v_germline_tensor, j_germline_tensor) + result = conversion.contiguous_match_counts_df( + test, v_germline_tensor, j_germline_tensor + ) - assert np.array_equal(result[0], np.array([4., 6.])) - assert np.array_equal(result[1], np.array([5., 5.])) - assert np.array_equal(result[5], np.array([5., 0.])) + assert np.array_equal(result[0], np.array([4.0, 6.0])) + assert np.array_equal(result[1], np.array([5.0, 5.0])) + assert np.array_equal(result[5], np.array([5.0, 0.0])) diff --git a/vampire/thymic_Q.py b/vampire/thymic_Q.py index bd8a5c7..e629c32 100644 --- a/vampire/thymic_Q.py +++ b/vampire/thymic_Q.py @@ -12,7 +12,7 @@ # These are our "lvj" columns, which are the most common indices for # probabilities and sequences in what follows. -lvj = ['length', 'v_gene', 'j_gene'] +lvj = ["length", "v_gene", "j_gene"] def bound(x, m): @@ -35,7 +35,7 @@ def add_pseudocount(df, col_name, pseudocount_multiplier): The pseudocount is pseudocount_multiplier times the smallest non-zero element. """ - zero_rows = (df[col_name] == 0) + zero_rows = df[col_name] == 0 pseudocount = min(df.loc[~zero_rows, col_name]) * pseudocount_multiplier df.loc[zero_rows, col_name] += pseudocount @@ -45,11 +45,11 @@ def read_olga_tsv(path): Read in a TSV in the format liked by OLGA. Four columns means that there is DNA data-- if so, drop it. """ - df = pd.read_csv(path, sep='\t', header=None) + df = pd.read_csv(path, sep="\t", header=None) if len(df.columns) == 4: df = df.iloc[:, 1:] assert len(df.columns) == 3 - df.columns = 'amino_acid v_gene j_gene'.split() + df.columns = "amino_acid v_gene j_gene".split() return df @@ -57,9 +57,9 @@ def read_olga_pgen_tsv(path): """ Read in a TSV output by OLGA's Pgen calculation. """ - df = pd.read_csv(path, sep='\t', header=None) + df = pd.read_csv(path, sep="\t", header=None) assert len(df.columns) == 4 - df.columns = 'amino_acid v_gene j_gene Pgen'.split() + df.columns = "amino_acid v_gene j_gene Pgen".split() return df @@ -69,9 +69,9 @@ def lvj_frequency_of_olga_tsv(path, col_name): contained in it. """ df = read_olga_tsv(path) - df['length'] = df['amino_acid'].apply(len) + df["length"] = df["amino_acid"].apply(len) df = df.loc[:, lvj] - df[col_name] = 1. 
+ df[col_name] = 1.0 df = df.groupby(lvj).sum() normalize_column(df, col_name) return df @@ -81,11 +81,11 @@ def set_lvj_index(df): """ Make an lvj index in place from the length, v_gene, and j_gene. """ - df['length'] = df['amino_acid'].apply(len) + df["length"] = df["amino_acid"].apply(len) df.set_index(lvj, inplace=True) -def merge_lvj_dfs(df1, df2, how='outer'): +def merge_lvj_dfs(df1, df2, how="outer"): """ Merge two data frames on lvj indices. @@ -96,20 +96,24 @@ def merge_lvj_dfs(df1, df2, how='outer'): return merged -def q_of_train_and_model_pgen(model_p_lvj_csv, train_tsv, max_q=None, pseudocount_multiplier=0.5): +def q_of_train_and_model_pgen( + model_p_lvj_csv, train_tsv, max_q=None, pseudocount_multiplier=0.5 +): """ Fit a q distribution, but truncating q at max_q. """ # Merge the p_lvj from the data and that from the model: df = merge_lvj_dfs( - lvj_frequency_of_olga_tsv(train_tsv, col_name='data_P_lvj'), pd.read_csv(model_p_lvj_csv, index_col=lvj)) + lvj_frequency_of_olga_tsv(train_tsv, col_name="data_P_lvj"), + pd.read_csv(model_p_lvj_csv, index_col=lvj), + ) # We need to do this merge so we can add a pseudocount: - add_pseudocount(df, 'model_P_lvj', pseudocount_multiplier) - normalize_column(df, 'model_P_lvj') - q = df['data_P_lvj'] / df['model_P_lvj'] + add_pseudocount(df, "model_P_lvj", pseudocount_multiplier) + normalize_column(df, "model_P_lvj") + q = df["data_P_lvj"] / df["model_P_lvj"] if max_q: q = bound(q, max_q) - return pd.DataFrame({'q': q}) + return pd.DataFrame({"q": q}) def calc_Ppost(q_csv, data_pgen_tsv, pseudocount_multiplier=0.5): @@ -120,17 +124,17 @@ def calc_Ppost(q_csv, data_pgen_tsv, pseudocount_multiplier=0.5): set_lvj_index(df) # Merging on left means that we only use only keys from the left data # frame, i.e. the sequences for which we are interested in computing Ppost. - df = merge_lvj_dfs(df, pd.read_csv(q_csv, index_col=lvj), how='left') + df = merge_lvj_dfs(df, pd.read_csv(q_csv, index_col=lvj), how="left") # Add a pseudocount to Pgen and q to prevent infs in whole-data log likelihoods. - add_pseudocount(df, 'q', pseudocount_multiplier) - add_pseudocount(df, 'Pgen', pseudocount_multiplier) - df['Ppost'] = df['Pgen'] * df['q'] + add_pseudocount(df, "q", pseudocount_multiplier) + add_pseudocount(df, "Pgen", pseudocount_multiplier) + df["Ppost"] = df["Pgen"] * df["q"] # Drop length from the index. df.reset_index(level=0, drop=True, inplace=True) # Turn the index columns into normal columns. df.reset_index(inplace=True) # Only keep our usual 3 columns. - df.set_index(['amino_acid', 'v_gene', 'j_gene'], inplace=True) + df.set_index(["amino_acid", "v_gene", "j_gene"], inplace=True) return df @@ -147,17 +151,17 @@ def rejection_sample_Ppost(q_df, df_Pgen_sample): df = df_Pgen_sample set_lvj_index(df) # This merge is how we get the q value corresponding to each sequence x. - df = merge_lvj_dfs(df, q_df, how='left') - max_q = np.max(df['q']) - df['acceptance_prob'] = df['q'] / max_q - df['random'] = np.random.uniform(size=len(df)) + df = merge_lvj_dfs(df, q_df, how="left") + max_q = np.max(df["q"]) + df["acceptance_prob"] = df["q"] / max_q + df["random"] = np.random.uniform(size=len(df)) # We subselect the rows that should be accepted, and keep only the # amino_acid column (the v_gene and j_gene columns are kept as part of the # index). - df = df.loc[df['random'] < df['acceptance_prob'], 'amino_acid'] + df = df.loc[df["random"] < df["acceptance_prob"], "amino_acid"] # Turn the index columns into normal columns, just keeping our usual 3 # columns. 
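The acceptance step above is ordinary rejection sampling: proposals drawn from Pgen are kept with probability q divided by the largest q, so the surviving sequences follow Pgen times q, i.e. Ppost. A self-contained sketch with made-up q values:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"q": [0.2, 1.5, 3.0]})          # hypothetical q_lvj value per proposal
    df["acceptance_prob"] = df["q"] / df["q"].max()
    df["random"] = np.random.uniform(size=len(df))
    accepted = df[df["random"] < df["acceptance_prob"]]
    # Rows survive in proportion to q, so the accepted sample is distributed as Ppost.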
- df = df.reset_index()[['amino_acid', 'v_gene', 'j_gene']] + df = df.reset_index()[["amino_acid", "v_gene", "j_gene"]] # Randomize the order of the sequences (they got sorted in the process of # merging with Q). df = df.sample(frac=1) @@ -173,7 +177,9 @@ def sample_Ppost(sample_size, q_csv, max_iter=100, proposal_size=1e6): with tempfile.TemporaryDirectory() as tmpdir: for iter_idx in range(max_iter): sample_path = os.path.join(tmpdir, str(iter_idx)) - c = delegator.run(' '.join(['olga-generate.sh', str(proposal_size), sample_path])) + c = delegator.run( + " ".join(["olga-generate.sh", str(proposal_size), sample_path]) + ) if c.return_code != 0: raise Exception("olga-generate.sh failed!") df_sample = rejection_sample_Ppost(q_df, read_olga_tsv(sample_path)) @@ -183,7 +189,9 @@ def sample_Ppost(sample_size, q_csv, max_iter=100, proposal_size=1e6): break if len(out_df) < sample_size: - raise Exception("Did not obtain desired number of samples in the specified maximumn number of iterations.") + raise Exception( + "Did not obtain desired number of samples in the specified maximumn number of iterations." + ) out_df = out_df.head(sample_size) return out_df @@ -195,9 +203,9 @@ def cli(): @cli.command() -@click.option('--col-name', required=True, help="Name for frequency column.") -@click.argument('in_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.option("--col-name", required=True, help="Name for frequency column.") +@click.argument("in_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def lvj_frequency(col_name, in_tsv, out_csv): """ Calculate frequency of lvj combinations in a given OLGA TSV. @@ -206,11 +214,17 @@ def lvj_frequency(col_name, in_tsv, out_csv): @cli.command() -@click.option('--max-q', type=int, default=None, show_default=True, help="Limit q to be at most this value.") +@click.option( + "--max-q", + type=int, + default=None, + show_default=True, + help="Limit q to be at most this value.", +) # Here we use a click.Path so we can pass a .bz2'd CSV. -@click.argument('model_p_lvj_csv', type=click.Path(exists=True)) -@click.argument('train_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("model_p_lvj_csv", type=click.Path(exists=True)) +@click.argument("train_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def q(max_q, model_p_lvj_csv, train_tsv, out_csv): """ Calculate q_lvj given training data and p_lvj obtained from the model. @@ -219,9 +233,9 @@ def q(max_q, model_p_lvj_csv, train_tsv, out_csv): @cli.command() -@click.argument('q_csv', type=click.File('r')) -@click.argument('data_pgen_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("q_csv", type=click.File("r")) +@click.argument("data_pgen_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def ppost(q_csv, data_pgen_tsv, out_csv): """ Calculate Ppost given q_lvj and Pgen calculated for a data set. 
@@ -231,23 +245,28 @@ def ppost(q_csv, data_pgen_tsv, out_csv): @cli.command() @click.option( - '--max-iter', + "--max-iter", default=100, show_default=True, - help="Maximum number of iterations to try to achieve the desired number of samples.") + help="Maximum number of iterations to try to achieve the desired number of samples.", +) @click.option( - '--proposal-size', default=1000000, show_default=True, help="Number of samples to take for proposal distribution.") -@click.argument('sample_size', type=int) -@click.argument('q_csv', type=click.File('r')) -@click.argument('out_tsv', type=click.File('w')) + "--proposal-size", + default=1000000, + show_default=True, + help="Number of samples to take for proposal distribution.", +) +@click.argument("sample_size", type=int) +@click.argument("q_csv", type=click.File("r")) +@click.argument("out_tsv", type=click.File("w")) def sample(max_iter, proposal_size, sample_size, q_csv, out_tsv): """ Sample from the Ppost distribution via rejection sampling. """ sample_Ppost( - sample_size, q_csv, max_iter=max_iter, proposal_size=proposal_size).to_csv( - out_tsv, sep='\t', index=False) + sample_size, q_csv, max_iter=max_iter, proposal_size=proposal_size + ).to_csv(out_tsv, sep="\t", index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/util.py b/vampire/util.py index 139c251..8164a94 100644 --- a/vampire/util.py +++ b/vampire/util.py @@ -29,12 +29,12 @@ def cli(): @cli.command() -@click.option('--idx', required=True, help='The row index for the summary output.') -@click.option('--idx-name', required=True, help='The row index name.') -@click.argument('seq_path', type=click.Path(exists=True)) -@click.argument('pvae_path', type=click.Path(exists=True)) -@click.argument('ppost_path', type=click.Path(exists=True)) -@click.argument('out_path', type=click.Path(writable=True)) +@click.option("--idx", required=True, help="The row index for the summary output.") +@click.option("--idx-name", required=True, help="The row index name.") +@click.argument("seq_path", type=click.Path(exists=True)) +@click.argument("pvae_path", type=click.Path(exists=True)) +@click.argument("ppost_path", type=click.Path(exists=True)) +@click.argument("out_path", type=click.Path(writable=True)) def merge_ps(idx, idx_name, seq_path, pvae_path, ppost_path, out_path): """ Merge probability estimates from Pvae and Ppost into a single data frame and write to an output CSV. @@ -44,11 +44,11 @@ def merge_ps(idx, idx_name, seq_path, pvae_path, ppost_path, out_path): """ def prep_index(df): - df.set_index(['amino_acid', 'v_gene', 'j_gene'], inplace=True) + df.set_index(["amino_acid", "v_gene", "j_gene"], inplace=True) df.sort_index(inplace=True) pvae_df = pd.read_csv(seq_path) - pvae_df['log_Pvae'] = pd.read_csv(pvae_path)['log_p_x'] + pvae_df["log_Pvae"] = pd.read_csv(pvae_path)["log_p_x"] prep_index(pvae_df) ppost_df = convert_gene_names(pd.read_csv(ppost_path), olga_to_adaptive_dict()) prep_index(ppost_df) @@ -56,8 +56,14 @@ def prep_index(df): # If we don't drop duplicates then merge will expand the number of rows. # See https://stackoverflow.com/questions/39019591/duplicated-rows-when-merging-dataframes-in-python # We deduplicate Ppost, which is guaranteed to be identical among repeated elements. 
- merged = pd.merge(pvae_df, ppost_df.drop_duplicates(), how='left', left_index=True, right_index=True) - merged['log_Ppost'] = np.log(merged['Ppost']) + merged = pd.merge( + pvae_df, + ppost_df.drop_duplicates(), + how="left", + left_index=True, + right_index=True, + ) + merged["log_Ppost"] = np.log(merged["Ppost"]) merged.reset_index(inplace=True) merged[idx_name] = idx merged.set_index(idx_name, inplace=True) @@ -65,10 +71,10 @@ def prep_index(df): @cli.command() -@click.option('--train-size', default=1000, help="Data count to use for train.") -@click.argument('in_csv', type=click.File('r')) -@click.argument('out1_csv', type=click.File('w')) -@click.argument('out2_csv', type=click.File('w')) +@click.option("--train-size", default=1000, help="Data count to use for train.") +@click.argument("in_csv", type=click.File("r")) +@click.argument("out1_csv", type=click.File("w")) +@click.argument("out2_csv", type=click.File("w")) def split(train_size, in_csv, out1_csv, out2_csv): """ Do a train/test split. @@ -80,36 +86,43 @@ def split(train_size, in_csv, out1_csv, out2_csv): @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.option('--idx', required=True, help='The row index for the summary output.') -@click.option('--idx-name', required=True, help='The row index name.') +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.option("--idx", required=True, help="The row index for the summary output.") +@click.option("--idx-name", required=True, help="The row index name.") @click.option( - '--colnames', default='', help='Comma-separated column identifier names corresponding to the files that follow.') -@click.argument('in_paths', nargs=-1) + "--colnames", + default="", + help="Comma-separated column identifier names corresponding to the files that follow.", +) +@click.argument("in_paths", nargs=-1) def summarize(out, idx, idx_name, colnames, in_paths): """ Summarize results of a run as a single-row CSV. The input is of flexible length: each input file is associated with an identifier specified using the --colnames flag. """ - colnames = colnames.split(',') + colnames = colnames.split(",") if len(colnames) != len(in_paths): - raise Exception("The number of colnames is not equal to the number of input files.") + raise Exception( + "The number of colnames is not equal to the number of input files." + ) input_d = {k: v for k, v in zip(colnames, in_paths)} index = pd.Index([idx], name=idx_name) - if 'loss' in input_d: - loss_df = pd.read_csv(input_d['loss'], index_col=0).reset_index() + if "loss" in input_d: + loss_df = pd.read_csv(input_d["loss"], index_col=0).reset_index() # The following 3 lines combine the data source and the metric into a # single id like `train_j_gene_output_loss`. - loss_df = pd.melt(loss_df, id_vars='index') - loss_df['id'] = loss_df['variable'] + '_' + loss_df['index'] - loss_df.set_index('id', inplace=True) - df = pd.DataFrame(dict(zip(loss_df.index, loss_df['value'].transpose())), index=index) + loss_df = pd.melt(loss_df, id_vars="index") + loss_df["id"] = loss_df["variable"] + "_" + loss_df["index"] + loss_df.set_index("id", inplace=True) + df = pd.DataFrame( + dict(zip(loss_df.index, loss_df["value"].transpose())), index=index + ) else: df = pd.DataFrame(index=index) - def slurp_cols(path, prefix='', suffix=''): + def slurp_cols(path, prefix="", suffix=""): """ Given a one-row CSV with summaries, add them to df with an optional prefix and suffix. 
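
The melt lines in `summarize` above flatten a per-metric loss table (metrics as rows, data sources as columns) into a single summary row keyed by ids like `train_j_gene_output_loss`. A standalone sketch of that reshaping follows; the loss values are invented, and `run-1` / `run_id` stand in for the `--idx` / `--idx-name` options:

```python
import pandas as pd

# Invented loss table in the same shape as the `loss` CSV: one row per metric,
# one column per data source.
loss_df = pd.DataFrame(
    {"train": [1.9, 0.4], "validation": [2.1, 0.5]},
    index=["loss", "j_gene_output_loss"],
).reset_index()

# Combine data source and metric into a single id, e.g. train_j_gene_output_loss.
loss_df = pd.melt(loss_df, id_vars="index")
loss_df["id"] = loss_df["variable"] + "_" + loss_df["index"]
loss_df.set_index("id", inplace=True)

# Spread the ids across the columns of a one-row summary frame.
index = pd.Index(["run-1"], name="run_id")
df = pd.DataFrame(dict(zip(loss_df.index, loss_df["value"])), index=index)
print(df.columns.tolist())
# ['train_loss', 'train_j_gene_output_loss',
#  'validation_loss', 'validation_j_gene_output_loss']
```
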
@@ -124,33 +137,38 @@ def add_p_summary(path, name): Add a summary of something like `validation_pvae` where `validation` is the prefix and `pvae` is the statistic. """ - prefix, statistic = name.split('_') - if statistic == 'pvae': - log_statistic = pd.read_csv(path)['log_p_x'] - elif statistic == 'ppost': - log_statistic = np.log(pd.read_csv(path)['Ppost']) + prefix, statistic = name.split("_") + if statistic == "pvae": + log_statistic = pd.read_csv(path)["log_p_x"] + elif statistic == "ppost": + log_statistic = np.log(pd.read_csv(path)["Ppost"]) else: raise Exception(f"Unknown statistic '{statistic}'") - df[prefix + '_median_log_p'] = np.median(log_statistic) - df[prefix + '_mean_log_p'] = np.mean(log_statistic) + df[prefix + "_median_log_p"] = np.median(log_statistic) + df[prefix + "_mean_log_p"] = np.mean(log_statistic) for name, path in input_d.items(): if name in [ - 'training_pvae', 'validation_pvae', 'test_pvae', 'training_ppost', 'validation_ppost', 'test_ppost' + "training_pvae", + "validation_pvae", + "test_pvae", + "training_ppost", + "validation_ppost", + "test_ppost", ]: add_p_summary(path, name) - elif re.search('sumrep_divergences', name): - slurp_cols(path, prefix='sumdiv_') - elif re.search('auc_', name): + elif re.search("sumrep_divergences", name): + slurp_cols(path, prefix="sumdiv_") + elif re.search("auc_", name): slurp_cols(path) df.to_csv(out) @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.argument('in_paths', nargs=-1) +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.argument("in_paths", nargs=-1) def csvstack(out, in_paths): """ Like csvkit's csvstack, but can deal with varying columns. @@ -158,12 +176,14 @@ def csvstack(out, in_paths): Note that this sorts the columns by name (part of merging columns). """ - pd.concat([pd.read_csv(path) for path in in_paths], sort=True).to_csv(out, index=False) + pd.concat([pd.read_csv(path) for path in in_paths], sort=True).to_csv( + out, index=False + ) @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.argument('in_paths', nargs=-1) +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.argument("in_paths", nargs=-1) def fancystack(out, in_paths): """ Like csvkit's csvstack, but fancy. @@ -175,15 +195,15 @@ def fancystack(out, in_paths): def read_df(path): df = pd.read_csv(path) - idx_names = df.columns[0].split(';') + idx_names = df.columns[0].split(";") idx_str = df.iloc[0, 0] # Make sure the index column is constant. assert (df.iloc[:, 0] == idx_str).all() - idx = idx_str.split(';') + idx = idx_str.split(";") df.drop(df.columns[0], axis=1, inplace=True) for k, v in zip(idx_names, idx): - if k in ['train_data', 'test_set']: + if k in ["train_data", "test_set"]: v = common.strip_dirpath_extn(v) df[k] = v @@ -195,29 +215,31 @@ def read_df(path): @cli.command() -@click.option('--dest-path', type=click.Path(writable=True), required=True) -@click.argument('loss_paths', nargs=-1) +@click.option("--dest-path", type=click.Path(writable=True), required=True) +@click.argument("loss_paths", nargs=-1) def copy_best_weights(dest_path, loss_paths): """ Find the best weights according to validation loss and copy them to the specified path. 
""" loss_paths = sorted(loss_paths) - losses = [pd.read_csv(path, index_col=0).loc['loss', 'validation'] for path in loss_paths] + losses = [ + pd.read_csv(path, index_col=0).loc["loss", "validation"] for path in loss_paths + ] smallest_loss_path = loss_paths[np.argmin(losses)] - best_weights = os.path.join(os.path.dirname(smallest_loss_path), 'best_weights.h5') + best_weights = os.path.join(os.path.dirname(smallest_loss_path), "best_weights.h5") shutil.copyfile(best_weights, dest_path) @cli.command() -@click.argument('l_csv_path', type=click.File('r')) -@click.argument('r_csv_path', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("l_csv_path", type=click.File("r")) +@click.argument("r_csv_path", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def sharedwith(l_csv_path, r_csv_path, out_csv): """ Write out a TCR dataframe identical to that in `l_csv_path` (perhaps with reordered columns) but with an extra column `present` indicating if that TCR is present in r_csv_path. """ - avj_columns = ['amino_acid', 'v_gene', 'j_gene'] + avj_columns = ["amino_acid", "v_gene", "j_gene"] def read_df(path): df = pd.read_csv(path) @@ -227,25 +249,40 @@ def read_df(path): l_df = read_df(l_csv_path) r_df = read_df(r_csv_path) - intersection = pd.merge(l_df, r_df, left_index=True, right_index=True, how='inner') - l_df['present'] = 0 - l_df.loc[intersection.index, 'present'] = 1 + intersection = pd.merge(l_df, r_df, left_index=True, right_index=True, how="inner") + l_df["present"] = 0 + l_df.loc[intersection.index, "present"] = 1 click.echo( - f"{sum(l_df['present'])} of {len(l_df)} sequences in {l_csv_path.name} are shared with {r_csv_path.name}") + f"{sum(l_df['present'])} of {len(l_df)} sequences in {l_csv_path.name} are shared with {r_csv_path.name}" + ) l_df.to_csv(out_csv) @cli.command() -@click.option('--out-prefix', metavar='PRE', type=click.Path(writable=True), help="Output prefix.", required=True) -@click.option('--test-size', default=1 / 3, show_default=True, help="Proportion of sample to hold out for testing.") -@click.option('--test-regex', help="A regular expression that is used to identify the test set.") @click.option( - '--limit-each', - metavar='N', + "--out-prefix", + metavar="PRE", + type=click.Path(writable=True), + help="Output prefix.", + required=True, +) +@click.option( + "--test-size", + default=1 / 3, + show_default=True, + help="Proportion of sample to hold out for testing.", +) +@click.option( + "--test-regex", help="A regular expression that is used to identify the test set." +) +@click.option( + "--limit-each", + metavar="N", type=int, help="Limit the contribution of each repertoire to the training set " - "to a random N sequences. Will fail if any repertoire has less than N seqs.") -@click.argument('in_paths', nargs=-1) + "to a random N sequences. Will fail if any repertoire has less than N seqs.", +) +@click.argument("in_paths", nargs=-1) def split_repertoires(out_prefix, test_size, test_regex, limit_each, in_paths): """ Do a test-train split on the level of repertoires. 
@@ -267,87 +304,87 @@ def split_repertoires(out_prefix, test_size, test_regex, limit_each, in_paths): else: train_paths, test_paths = train_test_split(in_paths, test_size=test_size) - train_tsv_path = out_prefix + '.train.tsv' + train_tsv_path = out_prefix + ".train.tsv" info = { - 'split_call': ' '.join(sys.argv), - 'split_date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), - 'train_paths': train_paths, - 'test_paths': test_paths, - 'train_tsv_path': train_tsv_path, + "split_call": " ".join(sys.argv), + "split_date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), + "train_paths": train_paths, + "test_paths": test_paths, + "train_tsv_path": train_tsv_path, } - json_path = out_prefix + '.json' - with open(json_path, 'w') as fp: + json_path = out_prefix + ".json" + with open(json_path, "w") as fp: fp.write(json.dumps(info, indent=4)) header_written = False - with open(train_tsv_path, 'w') as fp: + with open(train_tsv_path, "w") as fp: for path in train_paths: df = preprocess_adaptive.read_adaptive_tsv(path) if limit_each: if limit_each > len(df): - raise ValueError(f"--limit-each parameter is greater than the number of sequences in {path}") + raise ValueError( + f"--limit-each parameter is greater than the number of sequences in {path}" + ) df = df.sample(n=limit_each) if header_written: - df.to_csv(fp, sep='\t', header=False, index=False) + df.to_csv(fp, sep="\t", header=False, index=False) else: # This is our first file to write. header_written = True - df.to_csv(fp, sep='\t', index=False) + df.to_csv(fp, sep="\t", index=False) click.echo("Check JSON file with") click.echo(f"cat {json_path}") @cli.command() -@click.option('--train-size', default=0.5, help="Data fraction to use for train.") -@click.argument('in_csv') -@click.argument('out_train_csv_bz2') -@click.argument('out_test_csv_bz2') +@click.option("--train-size", default=0.5, help="Data fraction to use for train.") +@click.argument("in_csv") +@click.argument("out_train_csv_bz2") +@click.argument("out_test_csv_bz2") def split_rows(train_size, in_csv, out_train_csv_bz2, out_test_csv_bz2): df = pd.read_csv(in_csv, index_col=0) (df1, df2) = train_test_split(df, train_size=train_size) - df1.to_csv(out_train_csv_bz2, compression='bz2') - df2.to_csv(out_test_csv_bz2, compression='bz2') + df1.to_csv(out_train_csv_bz2, compression="bz2") + df2.to_csv(out_test_csv_bz2, compression="bz2") def to_fake_csv(seq_list, path, include_freq=False): """ Write a list of our favorite triples to a file. """ - with open(path, 'w') as fp: + with open(path, "w") as fp: if include_freq: - fp.write('amino_acid,v_gene,j_gene,count,frequency\n') + fp.write("amino_acid,v_gene,j_gene,count,frequency\n") else: - fp.write('amino_acid,v_gene,j_gene\n') + fp.write("amino_acid,v_gene,j_gene\n") for line in seq_list: - fp.write(line + '\n') + fp.write(line + "\n") @cli.command() @click.option( - '--include-freq', + "--include-freq", is_flag=True, - help="Include frequencies from 'count' and the counts themselves as columns in CSV.") -@click.option( - '--n-to-sample', default=100, help="Number of sequences to sample.") + help="Include frequencies from 'count' and the counts themselves as columns in CSV.", +) +@click.option("--n-to-sample", default=100, help="Number of sequences to sample.") @click.option( - '--min-count', + "--min-count", default=4, show_default=True, - help="Only include sequences that are found in at least this number of subjects." 
+ help="Only include sequences that are found in at least this number of subjects.", ) @click.option( - '--column', - default='count', - help="Counts column to use for sampling probabilities.") -@click.argument('in_csv') -@click.argument('out_csv') -def sample_data_set(include_freq, n_to_sample, min_count, column, in_csv, - out_csv): + "--column", default="count", help="Counts column to use for sampling probabilities." +) +@click.argument("in_csv") +@click.argument("out_csv") +def sample_data_set(include_freq, n_to_sample, min_count, column, in_csv, out_csv): """ Sample sequences according to the counts given in the specified column and then output in a CSV file. @@ -367,11 +404,19 @@ def seq_freqs_of_colname(colname, apply_min_count=False): raise ZeroDivisionError return seq_counts / count_sum - sampled_seq_v = np.random.multinomial(n_to_sample, seq_freqs_of_colname(column, apply_min_count=True)) + sampled_seq_v = np.random.multinomial( + n_to_sample, seq_freqs_of_colname(column, apply_min_count=True) + ) if include_freq: df.reset_index(inplace=True) - out_vect = df['index'] + ',' + df['count'].astype('str') + ',' + seq_freqs_of_colname('count').astype('str') + out_vect = ( + df["index"] + + "," + + df["count"].astype("str") + + "," + + seq_freqs_of_colname("count").astype("str") + ) else: out_vect = df.index @@ -384,5 +429,5 @@ def seq_freqs_of_colname(colname, apply_min_count=False): to_fake_csv(sampled_seqs, out_csv, include_freq) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/xcr_vector_conversion.py b/vampire/xcr_vector_conversion.py index a300b79..49f6ee7 100644 --- a/vampire/xcr_vector_conversion.py +++ b/vampire/xcr_vector_conversion.py @@ -20,12 +20,12 @@ # CDR3Length layer depends on this set and ordering of states. # So does tcregex. 
-AA_ORDER = 'ACDEFGHIKLMNPQRSTVWY-' +AA_ORDER = "ACDEFGHIKLMNPQRSTVWY-" AA_LIST = list(AA_ORDER) AA_DICT = {c: i for i, c in enumerate(AA_LIST)} AA_DICT_REV = {i: c for i, c in enumerate(AA_LIST)} AA_SET = set(AA_LIST) -AA_NONGAP = [float(c != '-') for c in AA_LIST] +AA_NONGAP = [float(c != "-") for c in AA_LIST] def seq_to_onehot(seq): @@ -36,21 +36,72 @@ def seq_to_onehot(seq): def onehot_to_seq(onehot): - return ''.join([AA_DICT_REV[v.argmax()] for v in onehot]) + return "".join([AA_DICT_REV[v.argmax()] for v in onehot]) # ### TCRB ### # V genes: TCRB_V_GENE_LIST = [ - 'TCRBV01-01', 'TCRBV02-01', 'TCRBV03-01', 'TCRBV03-02', 'TCRBV04-01', 'TCRBV04-02', 'TCRBV04-03', 'TCRBV05-01', - 'TCRBV05-02', 'TCRBV05-03', 'TCRBV05-04', 'TCRBV05-05', 'TCRBV05-06', 'TCRBV05-07', 'TCRBV05-08', 'TCRBV06-01', - 'TCRBV06-04', 'TCRBV06-05', 'TCRBV06-06', 'TCRBV06-07', 'TCRBV06-08', 'TCRBV06-09', 'TCRBV07-01', 'TCRBV07-02', - 'TCRBV07-03', 'TCRBV07-04', 'TCRBV07-05', 'TCRBV07-06', 'TCRBV07-07', 'TCRBV07-08', 'TCRBV07-09', 'TCRBV08-02', - 'TCRBV09-01', 'TCRBV10-01', 'TCRBV10-02', 'TCRBV10-03', 'TCRBV11-01', 'TCRBV11-02', 'TCRBV11-03', 'TCRBV12-01', - 'TCRBV12-02', 'TCRBV12-05', 'TCRBV13-01', 'TCRBV14-01', 'TCRBV15-01', 'TCRBV16-01', 'TCRBV18-01', 'TCRBV19-01', - 'TCRBV20-01', 'TCRBV21-01', 'TCRBV22-01', 'TCRBV23-01', 'TCRBV23-or09_02', 'TCRBV25-01', 'TCRBV27-01', 'TCRBV28-01', - 'TCRBV29-01', 'TCRBV30-01', 'TCRBVA-or09_02' + "TCRBV01-01", + "TCRBV02-01", + "TCRBV03-01", + "TCRBV03-02", + "TCRBV04-01", + "TCRBV04-02", + "TCRBV04-03", + "TCRBV05-01", + "TCRBV05-02", + "TCRBV05-03", + "TCRBV05-04", + "TCRBV05-05", + "TCRBV05-06", + "TCRBV05-07", + "TCRBV05-08", + "TCRBV06-01", + "TCRBV06-04", + "TCRBV06-05", + "TCRBV06-06", + "TCRBV06-07", + "TCRBV06-08", + "TCRBV06-09", + "TCRBV07-01", + "TCRBV07-02", + "TCRBV07-03", + "TCRBV07-04", + "TCRBV07-05", + "TCRBV07-06", + "TCRBV07-07", + "TCRBV07-08", + "TCRBV07-09", + "TCRBV08-02", + "TCRBV09-01", + "TCRBV10-01", + "TCRBV10-02", + "TCRBV10-03", + "TCRBV11-01", + "TCRBV11-02", + "TCRBV11-03", + "TCRBV12-01", + "TCRBV12-02", + "TCRBV12-05", + "TCRBV13-01", + "TCRBV14-01", + "TCRBV15-01", + "TCRBV16-01", + "TCRBV18-01", + "TCRBV19-01", + "TCRBV20-01", + "TCRBV21-01", + "TCRBV22-01", + "TCRBV23-01", + "TCRBV23-or09_02", + "TCRBV25-01", + "TCRBV27-01", + "TCRBV28-01", + "TCRBV29-01", + "TCRBV30-01", + "TCRBVA-or09_02", ] TCRB_V_GENE_DICT = {c: i for i, c in enumerate(TCRB_V_GENE_LIST)} TCRB_V_GENE_DICT_REV = {i: c for i, c in enumerate(TCRB_V_GENE_LIST)} @@ -69,8 +120,19 @@ def onehot_to_vgene(onehot): # J genes: TCRB_J_GENE_LIST = [ - 'TCRBJ01-01', 'TCRBJ01-02', 'TCRBJ01-03', 'TCRBJ01-04', 'TCRBJ01-05', 'TCRBJ01-06', 'TCRBJ02-01', 'TCRBJ02-02', - 'TCRBJ02-03', 'TCRBJ02-04', 'TCRBJ02-05', 'TCRBJ02-06', 'TCRBJ02-07' + "TCRBJ01-01", + "TCRBJ01-02", + "TCRBJ01-03", + "TCRBJ01-04", + "TCRBJ01-05", + "TCRBJ01-06", + "TCRBJ02-01", + "TCRBJ02-02", + "TCRBJ02-03", + "TCRBJ02-04", + "TCRBJ02-05", + "TCRBJ02-06", + "TCRBJ02-07", ] TCRB_J_GENE_DICT = {c: i for i, c in enumerate(TCRB_J_GENE_LIST)} TCRB_J_GENE_DICT_REV = {i: c for i, c in enumerate(TCRB_J_GENE_LIST)} @@ -100,30 +162,36 @@ def pad_middle(seq, desired_length): assert seq_len <= desired_length pad_start = seq_len // 2 pad_len = desired_length - seq_len - return seq[:pad_start] + '-' * pad_len + seq[pad_start:] + return seq[:pad_start] + "-" * pad_len + seq[pad_start:] def unpad(seq): """ Remove gap padding. 
""" - return seq.translate(seq.maketrans('', '', '-')) + return seq.translate(seq.maketrans("", "", "-")) def avj_triple_to_tcr_df(amino_acid, v_gene, j_gene): """ Put our TCR triple into an appropriate DataFrame. """ - return pd.DataFrame(OrderedDict([('amino_acid', amino_acid), ('v_gene', v_gene), ('j_gene', j_gene)])) + return pd.DataFrame( + OrderedDict( + [("amino_acid", amino_acid), ("v_gene", v_gene), ("j_gene", j_gene)] + ) + ) def avj_raw_triple_to_tcr_df(amino_acid, v_gene, j_gene): """ A "raw" triple here means as a big np array. """ - return avj_triple_to_tcr_df([amino_acid[i, :, :] for i in range(amino_acid.shape[0])], - [v_gene[i, :] for i in range(v_gene.shape[0])], - [j_gene[i, :] for i in range(j_gene.shape[0])]) + return avj_triple_to_tcr_df( + [amino_acid[i, :, :] for i in range(amino_acid.shape[0])], + [v_gene[i, :] for i in range(v_gene.shape[0])], + [j_gene[i, :] for i in range(j_gene.shape[0])], + ) def unpadded_tcrbs_to_onehot(df, desired_length): @@ -135,10 +203,10 @@ def unpadded_tcrbs_to_onehot(df, desired_length): """ return avj_triple_to_tcr_df( - df['amino_acid'].apply(lambda s: seq_to_onehot(pad_middle(s, desired_length))), - df['v_gene'].apply(vgene_to_onehot), - df['j_gene'].apply(jgene_to_onehot) - ) # yapf: disable + df["amino_acid"].apply(lambda s: seq_to_onehot(pad_middle(s, desired_length))), + df["v_gene"].apply(vgene_to_onehot), + df["j_gene"].apply(jgene_to_onehot), + ) # yapf: disable def onehot_to_padded_tcrbs(amino_acid_arr, v_gene_arr, j_gene_arr): @@ -163,16 +231,18 @@ def aux(f, a): """ nrows = a.shape[0] # Here's where the length-20 assumption lives. - out = np.empty((nrows, ), dtype=np.dtype('