diff --git a/README.md b/README.md index 3024f59..859019b 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This is a package to fit and test variational autoencoder (VAE) models for T cell receptor sequences. It is described in the paper [_Deep generative models for T cell receptor protein sequences_](https://elifesciences.org/articles/46935) by Kristian Davidsen, Branden J Olson, William S DeWitt III, Jean Feng, Elias Harkins, Philip Bradley and Frederick A Matsen IV. +This project continues to evolve, but the version in the paper is [tagged 0.2.0](https://github.com/matsengrp/vampire/tree/0.2.0). ## Install @@ -83,8 +84,7 @@ Please get in touch if anything isn't clear. ## Code styling -This project uses [YAPF](https://github.com/google/yapf) for code formatting with a format defined in `setup.cfg`. -You can easily run yapf on all the files with a call to `yapf -ir .` in the root directory. +This project uses [Black](https://black.readthedocs.io/en/stable/) and [docformatter](https://pypi.org/project/docformatter/) for code formatting. Code also checked using [flake8](http://flake8.pycqa.org/en/latest/). `# noqa` comments cause lines to be ignored by flake8. diff --git a/install/environment.yml b/install/environment.yml index 4d9b5e4..397013c 100644 --- a/install/environment.yml +++ b/install/environment.yml @@ -3,12 +3,8 @@ channels: - defaults - conda-forge dependencies: - - python=3.6 + - python>=3.6 - biopython - - click - - flake8 - - keras - - matplotlib - pandas - parallel - pip @@ -16,10 +12,14 @@ dependencies: - pytest - scikit-learn - scons - - seaborn - yapf - pip: + - black + - click - delegator.py + - docformatter - exrex + - flake8 - nestly + - tensorflow - versioneer diff --git a/setup.cfg b/setup.cfg index a9465b1..3d4a00d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,3 @@ -[yapf] -based_on_style = pep8 -column_limit = 120 - [flake8] max-line-length = 120 diff --git a/vampire/SConstruct b/vampire/SConstruct index 9ef2beb..b392816 100644 --- a/vampire/SConstruct +++ b/vampire/SConstruct @@ -11,7 +11,7 @@ from os.path import join import SCons.Script as sc # Command line options -sc.AddOption('--pipe', type='string', help="Which pipeline to run.", default='pipe_main') +sc.AddOption('--pipe', type='string', help="Which pipeline to run.", default='pipe_simple') sc.AddOption('--data', type='string', help="Path to JSON describing data set. Default means use in-repo data.", default='sample_data/sample.json') sc.AddOption( '--mode', type='string', help="The mode of the run, which is up to the SConscript to define.", default='default') diff --git a/vampire/__init__.py b/vampire/__init__.py index 74f4e66..80edaf0 100644 --- a/vampire/__init__.py +++ b/vampire/__init__.py @@ -1,4 +1,4 @@ - from ._version import get_versions -__version__ = get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/vampire/_version.py b/vampire/_version.py index 1521e77..1de6c23 100644 --- a/vampire/_version.py +++ b/vampire/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build @@ -58,17 +57,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -116,16 +119,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -181,7 +190,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -190,7 +199,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -198,19 +207,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -225,8 +241,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -234,10 +249,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -260,17 +284,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -279,10 +302,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -293,13 +318,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -330,8 +355,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -445,11 +469,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -469,9 +495,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -485,8 +515,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -495,13 +524,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -515,6 +547,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/vampire/common.py b/vampire/common.py index f815427..5558d69 100644 --- a/vampire/common.py +++ b/vampire/common.py @@ -26,7 +26,7 @@ def running_avg(x): running_avg = np.cumsum(x, axis=0) for i in range(x.shape[1]): running_avg[:, i] /= np.arange(1, len(x) + 1) - return (running_avg) + return running_avg def repeat_row(a, which_entry, n_repeats): @@ -68,7 +68,7 @@ def strip_extn(in_path): Strips the extension. """ (path, extn) = os.path.splitext(str(in_path)) - if extn in ['.gz', '.bz2']: + if extn in [".gz", ".bz2"]: # Perhaps there is more to the suffix. return os.path.splitext(path)[0] else: @@ -87,7 +87,7 @@ def path_split_tail(in_path): Give the farthest right object in a path, whether it be a directory ending with a `/` or a file. """ - return os.path.split(in_path.rstrip('/'))[1] + return os.path.split(in_path.rstrip("/"))[1] def cjoin(path, *paths): @@ -103,7 +103,9 @@ def read_data_csv(fname): """ Read a CSV from our data path. """ - return pd.read_csv(pkg_resources.resource_filename('vampire', os.path.join('data', fname))) + return pd.read_csv( + pkg_resources.resource_filename("vampire", os.path.join("data", fname)) + ) # ### Misc functions ### diff --git a/vampire/custom_keras.py b/vampire/custom_keras.py index a22a96f..c2983ba 100644 --- a/vampire/custom_keras.py +++ b/vampire/custom_keras.py @@ -1,8 +1,8 @@ -import keras -from keras import backend as K -from keras.callbacks import Callback -from keras.engine.topology import Layer import tensorflow as tf +import tensorflow.keras as keras +from tensorflow.keras import backend as K +from tensorflow.keras.callbacks import Callback +from tensorflow.keras.layers import Layer # ### Callbacks ### @@ -36,15 +36,20 @@ class EmbedViaMatrix(Layer): def __init__(self, embedding_dim, **kwargs): self.embedding_dim = embedding_dim - self.kernel_initializer = keras.initializers.get('uniform') + self.kernel_initializer = keras.initializers.get("uniform") super(EmbedViaMatrix, self).__init__(**kwargs) def build(self, input_shape): # Create a trainable weight variable for this layer. # The first component of input_shape is the batch size (see https://keras.io/layers/core/#dense).
self.kernel = self.add_weight( - name='kernel', shape=(input_shape[2], self.embedding_dim), initializer='uniform', trainable=True) - super(EmbedViaMatrix, self).build(input_shape) # Be sure to call this at the end + name="kernel", + shape=(input_shape[2], self.embedding_dim), + initializer="uniform", + trainable=True, + ) + # Be sure to call this at the end + super(EmbedViaMatrix, self).build(input_shape) def call(self, x): return K.dot(x, self.kernel) @@ -63,13 +68,14 @@ class RightTensordot(Layer): """ def __init__(self, right_np_tensor, axes, **kwargs): - self.right_tf_tensor = tf.convert_to_tensor(right_np_tensor, dtype=tf.float32) + self.right_tf_tensor = tf.convert_to_tensor(value=right_np_tensor, dtype=tf.float32) assert type(axes) == int self.axes = axes super(RightTensordot, self).__init__(**kwargs) def build(self, input_shape): - super(RightTensordot, self).build(input_shape) # Be sure to call this at the end + # Be sure to call this at the end + super(RightTensordot, self).build(input_shape) def call(self, x): # Tensordotting with sums over the last `axes` indices of the first argument @@ -111,8 +117,9 @@ def call(self, x): # 20 - argmax will be >= 1 for any site that doesn't have gap in it. # Then we clip so it's = 1 for any site that doesn't have gap in it. # Note argmax with axis-=1 means over the last (amino acid) axis. - tf.clip_by_value(20. - tf.to_float(tf.argmax(x, axis=-1)), 0., 1.), - axis=1) + tf.clip_by_value(20.0 - tf.cast(tf.argmax(input=x, axis=-1), dtype=tf.float32), 0.0, 1.0), + axis=1, + ) def compute_output_shape(self, input_shape): # Make sure we have 21 states for our amino acid space. @@ -164,7 +171,9 @@ def __init__(self, v_max_germline_aas, j_max_germline_aas, **kwargs): super(ContiguousMatch, self).__init__(**kwargs) def build(self, input_shape): - super(ContiguousMatch, self).build(input_shape) # Be sure to call this at the end + super(ContiguousMatch, self).build( + input_shape + ) # Be sure to call this at the end def call(self, inputs): (x, v_germline_aa_onehot, j_germline_aa_onehot) = inputs @@ -173,11 +182,20 @@ def call(self, inputs): # then sum is to get an indicator of the germline matches of x across # the germline genes. The cumprod_sum then returns the number of # contiguous matches for the V and J gene sides. - return tf.stack([ - cumprod_sum(K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), self.v_max_germline_aas), - cumprod_sum(K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), self.j_max_germline_aas, reverse=True) - ], - axis=1) + return tf.stack( + [ + cumprod_sum( + K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), + self.v_max_germline_aas, + ), + cumprod_sum( + K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), + self.j_max_germline_aas, + reverse=True, + ), + ], + axis=1, + ) def compute_output_shape(self, input_shape): # All input should be of shape (batch_size, max_cdr3_len, len(AA_LIST)). diff --git a/vampire/execute.py b/vampire/execute.py index 056784b..2748701 100644 --- a/vampire/execute.py +++ b/vampire/execute.py @@ -24,7 +24,9 @@ hostname conda activate py36 cd {dir} -""".format(dir=os.getcwd()) +""".format( + dir=os.getcwd() +) def translate_paths(in_paths, dest_dir): @@ -38,17 +40,19 @@ def aux(): for path in in_paths: fname = os.path.basename(path) dest_path = os.path.join(dest_dir, fname) - yield dest_path, f'cp {path} {dest_path}' + yield dest_path, f"cp {path} {dest_path}" return zip(*aux()) @click.command() -@click.option('--clusters', default='', help='Clusters to submit to. 
Default is local execution.') -@click.option('--script-prefix', default='job', help='Prefix for job script name.') -@click.argument('sources') -@click.argument('targets') -@click.argument('to_execute_f_string') +@click.option( + "--clusters", default="", help="Clusters to submit to. Default is local execution." +) +@click.option("--script-prefix", default="job", help="Prefix for job script name.") +@click.argument("sources") +@click.argument("targets") +@click.argument("to_execute_f_string") def cli(clusters, script_prefix, sources, targets, to_execute_f_string): """ Execute a command with certain sources and targets, perhaps on a SLURM @@ -64,10 +68,10 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): """ # Remove all quotes: they can get in the way with our basename noodling. - sources = re.sub('"*\'*', '', sources) - targets = re.sub('"*\'*', '', targets) + sources = re.sub("\"*'*", "", sources) + targets = re.sub("\"*'*", "", targets) - if clusters == '': + if clusters == "": # Local execution. to_execute = to_execute_f_string.format(sources=sources, targets=targets) click.echo("Executing locally:") @@ -76,32 +80,34 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): job_uuid = uuid.uuid4().hex cluster_directory_d = { - 'beagle': '/mnt/beagle/delete10/matsen_e/vampire/uuid', - 'koshu': '/fh/scratch/delete30/matsen_e/vampire/uuid' + "beagle": "/mnt/beagle/delete10/matsen_e/vampire/uuid", + "koshu": "/fh/scratch/delete30/matsen_e/vampire/uuid", } if clusters in cluster_directory_d: # Put the data where the cluster likes it. input_dir = os.path.join(cluster_directory_d[clusters], job_uuid) sources_l, cp_instructions = translate_paths(sources.split(), input_dir) - cp_instructions = [f'mkdir -p {input_dir}'] + list(cp_instructions) - sources = ' '.join(sources_l) + cp_instructions = [f"mkdir -p {input_dir}"] + list(cp_instructions) + sources = " ".join(sources_l) else: cp_instructions = [] # Put the batch script in the directory of the first target. execution_dir = os.path.dirname(targets.split()[0]) - sentinel_path = os.path.join(execution_dir, 'sentinel.' + job_uuid) - script_name = f'{script_prefix}.{job_uuid}.sh' - with open(os.path.join(execution_dir, script_name), 'w') as fp: + sentinel_path = os.path.join(execution_dir, "sentinel." + job_uuid) + script_name = f"{script_prefix}.{job_uuid}.sh" + with open(os.path.join(execution_dir, script_name), "w") as fp: fp.write(sbatch_prelude) for instruction in cp_instructions: - fp.write(instruction + '\n') - fp.write(to_execute_f_string.format(sources=sources, targets=targets) + '\n') - fp.write(f'touch {sentinel_path}\n') + fp.write(instruction + "\n") + fp.write(to_execute_f_string.format(sources=sources, targets=targets) + "\n") + fp.write(f"touch {sentinel_path}\n") - out = subprocess.check_output(f'cd {execution_dir}; sbatch --clusters {clusters} {script_name}', shell=True) - click.echo(out.decode('UTF-8')) + out = subprocess.check_output( + f"cd {execution_dir}; sbatch --clusters {clusters} {script_name}", shell=True + ) + click.echo(out.decode("UTF-8")) # Wait until the sentinel file appears, then clean up. 
while not os.path.exists(sentinel_path): @@ -111,5 +117,5 @@ def cli(clusters, script_prefix, sources, targets, to_execute_f_string): return out -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/gene_name_conversion.py b/vampire/gene_name_conversion.py index 30bfbcb..17dd092 100644 --- a/vampire/gene_name_conversion.py +++ b/vampire/gene_name_conversion.py @@ -3,17 +3,23 @@ import vampire.common as common -translation_csv = 'adaptive-olga-translation.csv' +translation_csv = "adaptive-olga-translation.csv" def adaptive_to_olga_dict(): - gb = common.read_data_csv(translation_csv).dropna().groupby('locus') - return {locus: {row['adaptive']: row['olga'] for _, row in df.iterrows()} for locus, df in gb} + gb = common.read_data_csv(translation_csv).dropna().groupby("locus") + return { + locus: {row["adaptive"]: row["olga"] for _, row in df.iterrows()} + for locus, df in gb + } def olga_to_adaptive_dict(): - gb = common.read_data_csv(translation_csv).dropna().groupby('locus') - return {locus: {row['olga']: row['adaptive'] for _, row in df.iterrows()} for locus, df in gb} + gb = common.read_data_csv(translation_csv).dropna().groupby("locus") + return { + locus: {row["olga"]: row["adaptive"] for _, row in df.iterrows()} + for locus, df in gb + } def filter_by_gene_names(df, conversion_dict): @@ -21,22 +27,28 @@ def filter_by_gene_names(df, conversion_dict): Only allow through gene names that are present for both programs. """ allowed = {locus: set(d.keys()) for locus, d in conversion_dict.items()} - return df.loc[df['v_gene'].isin(allowed['TRBV']) & df['j_gene'].isin(allowed['TRBJ']), :] + return df.loc[ + df["v_gene"].isin(allowed["TRBV"]) & df["j_gene"].isin(allowed["TRBJ"]), : + ] def convert_gene_names(df, conversion_dict): converted = df.copy() - converted['v_gene'] = df['v_gene'].map(conversion_dict['TRBV'].get) - converted['j_gene'] = df['j_gene'].map(conversion_dict['TRBJ'].get) + converted["v_gene"] = df["v_gene"].map(conversion_dict["TRBV"].get) + converted["j_gene"] = df["j_gene"].map(conversion_dict["TRBJ"].get) return converted def convert_and_filter(df, conversion_dict): orig_len = len(df) - converted = convert_gene_names(filter_by_gene_names(df, conversion_dict), conversion_dict) + converted = convert_gene_names( + filter_by_gene_names(df, conversion_dict), conversion_dict + ) n_trimmed = orig_len - len(converted) if n_trimmed > 0: - click.echo(f"Warning: couldn't convert {n_trimmed} sequences and trimmed them off.") + click.echo( + f"Warning: couldn't convert {n_trimmed} sequences and trimmed them off." 
+ ) return converted @@ -49,24 +61,26 @@ def cli(): @cli.command() -@click.argument('adaptive_csv', type=click.Path(exists=True)) -@click.argument('olga_tsv', type=click.Path(writable=True)) +@click.argument("adaptive_csv", type=click.Path(exists=True)) +@click.argument("olga_tsv", type=click.Path(writable=True)) def adaptive2olga(adaptive_csv, olga_tsv): - df = pd.read_csv(adaptive_csv, usecols=['amino_acid', 'v_gene', 'j_gene']) - convert_and_filter(df, adaptive_to_olga_dict()).to_csv(olga_tsv, sep='\t', index=False, header=False) + df = pd.read_csv(adaptive_csv, usecols=["amino_acid", "v_gene", "j_gene"]) + convert_and_filter(df, adaptive_to_olga_dict()).to_csv( + olga_tsv, sep="\t", index=False, header=False + ) @cli.command() -@click.argument('olga_tsv', type=click.Path(exists=True)) -@click.argument('adaptive_csv', type=click.Path(writable=True)) +@click.argument("olga_tsv", type=click.Path(exists=True)) +@click.argument("adaptive_csv", type=click.Path(writable=True)) def olga2adaptive(olga_tsv, adaptive_csv): - df = pd.read_csv(olga_tsv, sep='\t', header=None) + df = pd.read_csv(olga_tsv, sep="\t", header=None) if len(df.columns) == 4: df = df.iloc[:, 1:4] assert len(df.columns) == 3 - df.columns = ['amino_acid', 'v_gene', 'j_gene'] + df.columns = ["amino_acid", "v_gene", "j_gene"] convert_and_filter(df, olga_to_adaptive_dict()).to_csv(adaptive_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/germline_cdr3_aa_tensor.py b/vampire/germline_cdr3_aa_tensor.py index 3e88f4c..f5ca757 100644 --- a/vampire/germline_cdr3_aa_tensor.py +++ b/vampire/germline_cdr3_aa_tensor.py @@ -2,7 +2,9 @@ import numpy as np -def aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len): +def aa_encoding_tensors( + germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len +): """ Build tensors that one-hot-encode the germline sequences that extend into the CDR3. V genes are left-aligned, while J genes are right-aligned. @@ -16,25 +18,24 @@ def aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, m cdr3_aas = pd.read_csv(germline_cdr3_csv, keep_default_na=False) d = { - locus: {gene: seq['sequence'].iloc[0] - for gene, seq in df.groupby('gene')} - for locus, df in cdr3_aas.groupby(['locus']) + locus: {gene: seq["sequence"].iloc[0] for gene, seq in df.groupby("gene")} + for locus, df in cdr3_aas.groupby(["locus"]) } # Make sure that we have all of the desired genes in our dictionary. - assert set(v_gene_list).issubset(d['TRBV'].keys()) - assert set(j_gene_list).issubset(d['TRBJ'].keys()) + assert set(v_gene_list).issubset(d["TRBV"].keys()) + assert set(j_gene_list).issubset(d["TRBJ"].keys()) v_gene_encoding = np.zeros((len(v_gene_list), max_cdr3_len, len(aa_list))) for gene in v_gene_list: - seq = d['TRBV'][gene] + seq = d["TRBV"][gene] gene_index = v_gene_dict[gene] for i, c in enumerate(seq): v_gene_encoding[gene_index, i, aa_dict[c]] = 1 j_gene_encoding = np.zeros((len(j_gene_list), max_cdr3_len, len(aa_list))) for gene in j_gene_list: - seq = d['TRBJ'][gene] + seq = d["TRBJ"][gene] gene_index = j_gene_dict[gene] # Here's how the right-aligned indexing works: start = max_cdr3_len - len(seq) diff --git a/vampire/layers.py b/vampire/layers.py index 1181743..063d4dc 100644 --- a/vampire/layers.py +++ b/vampire/layers.py @@ -23,8 +23,14 @@ def build(self, input_shape): # Create a trainable weight variable for this layer. 
# The first component of input_shape is the batch size (see https://keras.io/layers/core/#dense). self.kernel = self.add_weight( - name='kernel', shape=(input_shape[2], self.embedding_dim), initializer='uniform', trainable=True) - super(EmbedViaMatrix, self).build(input_shape) # Be sure to call this at the end + name="kernel", + shape=(input_shape[2], self.embedding_dim), + initializer="uniform", + trainable=True, + ) + super(EmbedViaMatrix, self).build( + input_shape + ) # Be sure to call this at the end def call(self, x): return K.dot(x, self.kernel) @@ -43,13 +49,15 @@ class RightTensordot(Layer): """ def __init__(self, right_np_tensor, axes, **kwargs): - self.right_tf_tensor = tf.convert_to_tensor(right_np_tensor, dtype=tf.float32) + self.right_tf_tensor = tf.convert_to_tensor(value=right_np_tensor, dtype=tf.float32) assert type(axes) == int self.axes = axes super(RightTensordot, self).__init__(**kwargs) def build(self, input_shape): - super(RightTensordot, self).build(input_shape) # Be sure to call this at the end + super(RightTensordot, self).build( + input_shape + ) # Be sure to call this at the end def call(self, x): # Tensordotting with sums over the last `axes` indices of the first argument @@ -91,8 +99,9 @@ def call(self, x): # 20 - argmax will be >= 1 for any site that doesn't have gap in it. # Then we clip so it's = 1 for any site that doesn't have gap in it. # Note argmax with axis-=1 means over the last (amino acid) axis. - tf.clip_by_value(20. - tf.to_float(tf.argmax(x, axis=-1)), 0., 1.), - axis=1) + tf.clip_by_value(20.0 - tf.cast(tf.argmax(input=x, axis=-1), dtype=tf.float32), 0.0, 1.0), + axis=1, + ) def compute_output_shape(self, input_shape): # Make sure we have 21 states for our amino acid space. @@ -144,7 +153,9 @@ def __init__(self, v_max_germline_aas, j_max_germline_aas, **kwargs): super(ContiguousMatch, self).__init__(**kwargs) def build(self, input_shape): - super(ContiguousMatch, self).build(input_shape) # Be sure to call this at the end + super(ContiguousMatch, self).build( + input_shape + ) # Be sure to call this at the end def call(self, inputs): (x, v_germline_aa_onehot, j_germline_aa_onehot) = inputs @@ -153,10 +164,20 @@ def call(self, inputs): # then sum is to get an indicator of the germline matches of x across # the germline genes. The cumprod_sum then returns the number of # contiguous matches for the V and J gene sides. - return tf.stack([ - cumprod_sum(K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), self.v_max_germline_aas), - cumprod_sum(K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), self.j_max_germline_aas, reverse=True) - ], axis=1) + return tf.stack( + [ + cumprod_sum( + K.sum(tf.multiply(x, v_germline_aa_onehot), axis=2), + self.v_max_germline_aas, + ), + cumprod_sum( + K.sum(tf.multiply(x, j_germline_aa_onehot), axis=2), + self.j_max_germline_aas, + reverse=True, + ), + ], + axis=1, + ) def compute_output_shape(self, input_shape): # All input should be of shape (batch_size, max_cdr3_len, len(AA_LIST)). 
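The hunks above carry the same custom layers in `vampire/custom_keras.py` and `vampire/layers.py` through the move from standalone Keras to `tensorflow.keras`. As a reading aid, here is a minimal, self-contained sketch of such a layer in the `tensorflow.keras` style; the class name, shapes, and initializer string are illustrative rather than taken from the repository, and the `Layer` base class is imported from its public location, `tensorflow.keras.layers`:

```python
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer


class EmbedViaMatrixSketch(Layer):
    """Embed the one-hot amino-acid axis via a trainable matrix (illustrative)."""

    def __init__(self, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        # input_shape is (batch_size, max_cdr3_len, n_aas); we embed the last axis.
        self.kernel = self.add_weight(
            name="kernel",
            shape=(input_shape[2], self.embedding_dim),
            initializer="random_uniform",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        # (batch, len, n_aas) . (n_aas, embedding_dim) -> (batch, len, embedding_dim)
        return K.dot(x, self.kernel)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1], self.embedding_dim)
```

Applied to a one-hot CDR3 tensor of shape `(batch, max_cdr3_len, n_aas)`, a layer like this yields a `(batch, max_cdr3_len, embedding_dim)` embedding, which is how `cdr3_embedding` is produced in the model definitions below.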
diff --git a/vampire/models/basic.py b/vampire/models/basic.py index 3b5a906..cd25bc9 100644 --- a/vampire/models/basic.py +++ b/vampire/models/basic.py @@ -7,11 +7,12 @@ import numpy as np -import keras -from keras.models import Model -from keras.layers import Activation, Dense, Lambda, Input, Reshape -from keras import backend as K -from keras import objectives +import tensorflow as tf +import tensorflow.keras as keras +from tensorflow.keras.models import Model +from tensorflow.keras.layers import Activation, Dense, Lambda, Input, Reshape +from tensorflow.keras import backend as K +from tensorflow.keras import objectives import vampire.common as common from vampire.custom_keras import BetaWarmup, EmbedViaMatrix @@ -19,7 +20,7 @@ def build(params): - beta = K.variable(params['beta']) + beta = K.variable(params["beta"]) def sampling(args): """ @@ -27,9 +28,11 @@ def sampling(args): the latent variables. """ z_mean, z_log_var = args - epsilon = K.random_normal(shape=(params['batch_size'], params['latent_dim']), mean=0.0, stddev=1.0) + epsilon = K.random_normal( + shape=(params["batch_size"], params["latent_dim"]), mean=0.0, stddev=1.0 + ) # Reparameterization trick! - return (z_mean + K.exp(z_log_var / 2) * epsilon) + return z_mean + K.exp(z_log_var / 2) * epsilon def vae_cdr3_loss(io_encoder, io_decoder): """ @@ -38,63 +41,103 @@ def vae_cdr3_loss(io_encoder, io_decoder): """ # Here we multiply by the number of sites, so that we have a # total loss across the sites rather than a mean loss. - xent_loss = params['max_cdr3_len'] * K.mean(objectives.categorical_crossentropy(io_encoder, io_decoder)) - kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) + xent_loss = params["max_cdr3_len"] * K.mean( + objectives.categorical_crossentropy(io_encoder, io_decoder) + ) + kl_loss = -0.5 * K.sum( + 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 + ) kl_loss *= beta - return (xent_loss + kl_loss) + return xent_loss + kl_loss # Input: - cdr3_input_shape = (params['max_cdr3_len'], params['n_aas']) - cdr3_input = Input(shape=cdr3_input_shape, name='cdr3_input') - v_gene_input = Input(shape=(params['n_v_genes'], ), name='v_gene_input') - j_gene_input = Input(shape=(params['n_j_genes'], ), name='j_gene_input') + cdr3_input_shape = (params["max_cdr3_len"], params["n_aas"]) + cdr3_input = Input(shape=cdr3_input_shape, name="cdr3_input") + v_gene_input = Input(shape=(params["n_v_genes"],), name="v_gene_input") + j_gene_input = Input(shape=(params["n_j_genes"],), name="j_gene_input") # Encoding layers: - cdr3_embedding = EmbedViaMatrix(params['aa_embedding_dim'], name='cdr3_embedding')(cdr3_input) - cdr3_embedding_flat = Reshape([params['aa_embedding_dim'] * params['max_cdr3_len']], - name='cdr3_embedding_flat')(cdr3_embedding) - v_gene_embedding = Dense(params['v_gene_embedding_dim'], name='v_gene_embedding')(v_gene_input) - j_gene_embedding = Dense(params['j_gene_embedding_dim'], name='j_gene_embedding')(j_gene_input) - merged_embedding = keras.layers.concatenate([cdr3_embedding_flat, v_gene_embedding, j_gene_embedding], - name='merged_embedding') - encoder_dense_1 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_1')(merged_embedding) - encoder_dense_2 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_2')(encoder_dense_1) + cdr3_embedding = EmbedViaMatrix(params["aa_embedding_dim"], name="cdr3_embedding")( + cdr3_input + ) + cdr3_embedding_flat = Reshape( + [params["aa_embedding_dim"] * params["max_cdr3_len"]], + 
name="cdr3_embedding_flat", + )(cdr3_embedding) + v_gene_embedding = Dense(params["v_gene_embedding_dim"], name="v_gene_embedding")( + v_gene_input + ) + j_gene_embedding = Dense(params["j_gene_embedding_dim"], name="j_gene_embedding")( + j_gene_input + ) + merged_embedding = keras.layers.concatenate( + [cdr3_embedding_flat, v_gene_embedding, j_gene_embedding], + name="merged_embedding", + ) + encoder_dense_1 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_1" + )(merged_embedding) + encoder_dense_2 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_2" + )(encoder_dense_1) # Latent layers: - z_mean = Dense(params['latent_dim'], name='z_mean')(encoder_dense_2) - z_log_var = Dense(params['latent_dim'], name='z_log_var')(encoder_dense_2) + z_mean = Dense(params["latent_dim"], name="z_mean")(encoder_dense_2) + z_log_var = Dense(params["latent_dim"], name="z_log_var")(encoder_dense_2) # Decoding layers: - z_l = Lambda(sampling, output_shape=(params['latent_dim'], ), name='z') - decoder_dense_1_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_1') - decoder_dense_2_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_2') - cdr3_post_dense_flat_l = Dense(np.array(cdr3_input_shape).prod(), activation='linear', name='cdr3_post_dense_flat') - cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name='cdr3_post_dense') - cdr3_output_l = Activation(activation='softmax', name='cdr3_output') - v_gene_output_l = Dense(params['n_v_genes'], activation='softmax', name='v_gene_output') - j_gene_output_l = Dense(params['n_j_genes'], activation='softmax', name='j_gene_output') + z_l = Lambda(sampling, output_shape=(params["latent_dim"],), name="z") + decoder_dense_1_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_1" + ) + decoder_dense_2_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_2" + ) + cdr3_post_dense_flat_l = Dense( + np.array(cdr3_input_shape).prod(), + activation="linear", + name="cdr3_post_dense_flat", + ) + cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name="cdr3_post_dense") + cdr3_output_l = Activation(activation="softmax", name="cdr3_output") + v_gene_output_l = Dense( + params["n_v_genes"], activation="softmax", name="v_gene_output" + ) + j_gene_output_l = Dense( + params["n_j_genes"], activation="softmax", name="j_gene_output" + ) post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_l([z_mean, z_log_var]))) - cdr3_output = cdr3_output_l(cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder))) + cdr3_output = cdr3_output_l( + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)) + ) v_gene_output = v_gene_output_l(post_decoder) j_gene_output = j_gene_output_l(post_decoder) # Define the decoder components separately so we can have it as its own model. 
- z_mean_input = Input(shape=(params['latent_dim'], )) + z_mean_input = Input(shape=(params["latent_dim"],)) decoder_post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_mean_input)) - decoder_cdr3_output = cdr3_output_l(cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder))) + decoder_cdr3_output = cdr3_output_l( + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)) + ) decoder_v_gene_output = v_gene_output_l(decoder_post_decoder) decoder_j_gene_output = j_gene_output_l(decoder_post_decoder) encoder = Model([cdr3_input, v_gene_input, j_gene_input], [z_mean, z_log_var]) - decoder = Model(z_mean_input, [decoder_cdr3_output, decoder_v_gene_output, decoder_j_gene_output]) - vae = Model([cdr3_input, v_gene_input, j_gene_input], [cdr3_output, v_gene_output, j_gene_output]) + decoder = Model( + z_mean_input, + [decoder_cdr3_output, decoder_v_gene_output, decoder_j_gene_output], + ) + vae = Model( + [cdr3_input, v_gene_input, j_gene_input], + [cdr3_output, v_gene_output, j_gene_output], + ) vae.compile( optimizer="adam", loss={ - 'cdr3_output': vae_cdr3_loss, - 'v_gene_output': keras.losses.categorical_crossentropy, - 'j_gene_output': keras.losses.categorical_crossentropy, + "cdr3_output": vae_cdr3_loss, + "v_gene_output": keras.losses.categorical_crossentropy, + "j_gene_output": keras.losses.categorical_crossentropy, }, loss_weights={ # Keep the cdr3_output weight to be 1. The weights are relative @@ -103,12 +146,13 @@ def vae_cdr3_loss(io_encoder, io_decoder): # weight as 1 then we can interpret beta in a straightforward way. "cdr3_output": 1, "j_gene_output": 0.1305, - "v_gene_output": 0.8138 - }) + "v_gene_output": 0.8138, + }, + ) - callbacks = [BetaWarmup(beta, params['beta'], params['warmup_period'])] + callbacks = [BetaWarmup(beta, params["beta"], params["warmup_period"])] - return {'encoder': encoder, 'decoder': decoder, 'vae': vae, 'callbacks': callbacks} + return {"encoder": encoder, "decoder": decoder, "vae": vae, "callbacks": callbacks} def prepare_data(x_df): diff --git a/vampire/models/count_match.py b/vampire/models/count_match.py index c4451b6..e2ee43e 100644 --- a/vampire/models/count_match.py +++ b/vampire/models/count_match.py @@ -28,14 +28,20 @@ import vampire.common as common import vampire.xcr_vector_conversion as conversion -from vampire.custom_keras import BetaWarmup, CDR3Length, ContiguousMatch, EmbedViaMatrix, RightTensordot +from vampire.custom_keras import ( + BetaWarmup, + CDR3Length, + ContiguousMatch, + EmbedViaMatrix, + RightTensordot, +) from vampire.germline_cdr3_aa_tensor import max_germline_aas def build(params): - beta = K.variable(params['beta']) + beta = K.variable(params["beta"]) def sampling(args): """ @@ -43,9 +49,11 @@ def sampling(args): the latent variables. """ z_mean, z_log_var = args - epsilon = K.random_normal(shape=(params['batch_size'], params['latent_dim']), mean=0.0, stddev=1.0) + epsilon = K.random_normal( + shape=(params["batch_size"], params["latent_dim"]), mean=0.0, stddev=1.0 + ) # Reparameterization trick! - return (z_mean + K.exp(z_log_var / 2) * epsilon) + return z_mean + K.exp(z_log_var / 2) * epsilon def vae_cdr3_loss(io_encoder, io_decoder): """ @@ -54,11 +62,14 @@ def vae_cdr3_loss(io_encoder, io_decoder): """ # Here we multiply by the number of sites, so that we have a # total loss across the sites rather than a mean loss. 
- xent_loss = params['max_cdr3_len'] * K.mean( - objectives.categorical_crossentropy(io_encoder, io_decoder)) - kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) + xent_loss = params["max_cdr3_len"] * K.mean( + objectives.categorical_crossentropy(io_encoder, io_decoder) + ) + kl_loss = -0.5 * K.sum( + 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 + ) kl_loss *= beta - return (xent_loss + kl_loss) + return xent_loss + kl_loss def mean_squared_error_2d(io_encoder, io_decoder): def mse(i): @@ -67,93 +78,173 @@ def mse(i): return mse(0) + mse(1) # Input: - cdr3_input_shape = (params['max_cdr3_len'], params['n_aas']) - cdr3_input = Input(shape=cdr3_input_shape, name='cdr3_input') - cdr3_length_input = Input(shape=(1, ), name='cdr3_length_input') - v_gene_input = Input(shape=(params['n_v_genes'], ), name='v_gene_input') - j_gene_input = Input(shape=(params['n_j_genes'], ), name='j_gene_input') - contiguous_match_input = Input(shape=(2, ), name='contiguous_match_input') + cdr3_input_shape = (params["max_cdr3_len"], params["n_aas"]) + cdr3_input = Input(shape=cdr3_input_shape, name="cdr3_input") + cdr3_length_input = Input(shape=(1,), name="cdr3_length_input") + v_gene_input = Input(shape=(params["n_v_genes"],), name="v_gene_input") + j_gene_input = Input(shape=(params["n_j_genes"],), name="j_gene_input") + contiguous_match_input = Input(shape=(2,), name="contiguous_match_input") # Encoding layers: - cdr3_embedding = EmbedViaMatrix(params['aa_embedding_dim'], name='cdr3_embedding')(cdr3_input) - cdr3_embedding_flat = Reshape([params['aa_embedding_dim'] * params['max_cdr3_len']], - name='cdr3_embedding_flat')(cdr3_embedding) - v_gene_embedding = Dense(params['v_gene_embedding_dim'], name='v_gene_embedding')(v_gene_input) - j_gene_embedding = Dense(params['j_gene_embedding_dim'], name='j_gene_embedding')(j_gene_input) + cdr3_embedding = EmbedViaMatrix(params["aa_embedding_dim"], name="cdr3_embedding")( + cdr3_input + ) + cdr3_embedding_flat = Reshape( + [params["aa_embedding_dim"] * params["max_cdr3_len"]], + name="cdr3_embedding_flat", + )(cdr3_embedding) + v_gene_embedding = Dense(params["v_gene_embedding_dim"], name="v_gene_embedding")( + v_gene_input + ) + j_gene_embedding = Dense(params["j_gene_embedding_dim"], name="j_gene_embedding")( + j_gene_input + ) merged_embedding = keras.layers.concatenate( - [cdr3_embedding_flat, cdr3_length_input, v_gene_embedding, j_gene_embedding, contiguous_match_input], - name='merged_embedding') - encoder_dense_1 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_1')(merged_embedding) - encoder_dense_2 = Dense(params['dense_nodes'], activation='elu', name='encoder_dense_2')(encoder_dense_1) + [ + cdr3_embedding_flat, + cdr3_length_input, + v_gene_embedding, + j_gene_embedding, + contiguous_match_input, + ], + name="merged_embedding", + ) + encoder_dense_1 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_1" + )(merged_embedding) + encoder_dense_2 = Dense( + params["dense_nodes"], activation="elu", name="encoder_dense_2" + )(encoder_dense_1) # Latent layers: - z_mean = Dense(params['latent_dim'], name='z_mean')(encoder_dense_2) - z_log_var = Dense(params['latent_dim'], name='z_log_var')(encoder_dense_2) + z_mean = Dense(params["latent_dim"], name="z_mean")(encoder_dense_2) + z_log_var = Dense(params["latent_dim"], name="z_log_var")(encoder_dense_2) # Decoding layers: - z_l = Lambda(sampling, output_shape=(params['latent_dim'], ), name='z') - decoder_dense_1_l = 
Dense(params['dense_nodes'], activation='elu', name='decoder_dense_1') - decoder_dense_2_l = Dense(params['dense_nodes'], activation='elu', name='decoder_dense_2') - cdr3_post_dense_flat_l = Dense(np.array(cdr3_input_shape).prod(), activation='linear', name='cdr3_post_dense_flat') - cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name='cdr3_post_dense') - cdr3_output_l = Activation(activation='softmax', name='cdr3_output') - v_gene_output_l = Dense(params['n_v_genes'], activation='softmax', name='v_gene_output') - j_gene_output_l = Dense(params['n_j_genes'], activation='softmax', name='j_gene_output') + z_l = Lambda(sampling, output_shape=(params["latent_dim"],), name="z") + decoder_dense_1_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_1" + ) + decoder_dense_2_l = Dense( + params["dense_nodes"], activation="elu", name="decoder_dense_2" + ) + cdr3_post_dense_flat_l = Dense( + np.array(cdr3_input_shape).prod(), + activation="linear", + name="cdr3_post_dense_flat", + ) + cdr3_post_dense_reshape_l = Reshape(cdr3_input_shape, name="cdr3_post_dense") + cdr3_output_l = Activation(activation="softmax", name="cdr3_output") + v_gene_output_l = Dense( + params["n_v_genes"], activation="softmax", name="v_gene_output" + ) + j_gene_output_l = Dense( + params["n_j_genes"], activation="softmax", name="j_gene_output" + ) post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_l([z_mean, z_log_var]))) v_gene_output = v_gene_output_l(post_decoder) j_gene_output = j_gene_output_l(post_decoder) # Here's where we incorporate germline amino acid sequences into the output. - germline_cdr3_tensors = conversion.adaptive_aa_encoding_tensors(params['max_cdr3_len']) + germline_cdr3_tensors = conversion.adaptive_aa_encoding_tensors( + params["max_cdr3_len"] + ) (v_germline_cdr3_tensor, j_germline_cdr3_tensor) = germline_cdr3_tensors - (v_max_germline_aas, j_max_germline_aas) = [max_germline_aas(g) for g in germline_cdr3_tensors] - v_germline_cdr3_l = RightTensordot(v_germline_cdr3_tensor, axes=1, name='v_germline_cdr3') - j_germline_cdr3_l = RightTensordot(j_germline_cdr3_tensor, axes=1, name='j_germline_cdr3') - cdr3_length_output_l = CDR3Length(name='cdr3_length_output') - contiguous_match_output_l = ContiguousMatch(v_max_germline_aas, j_max_germline_aas, name='contiguous_match_output') + (v_max_germline_aas, j_max_germline_aas) = [ + max_germline_aas(g) for g in germline_cdr3_tensors + ] + v_germline_cdr3_l = RightTensordot( + v_germline_cdr3_tensor, axes=1, name="v_germline_cdr3" + ) + j_germline_cdr3_l = RightTensordot( + j_germline_cdr3_tensor, axes=1, name="j_germline_cdr3" + ) + cdr3_length_output_l = CDR3Length(name="cdr3_length_output") + contiguous_match_output_l = ContiguousMatch( + v_max_germline_aas, j_max_germline_aas, name="contiguous_match_output" + ) v_germline_cdr3 = v_germline_cdr3_l(v_gene_output) j_germline_cdr3 = j_germline_cdr3_l(j_gene_output) cdr3_output = cdr3_output_l( - Add(name='cdr3_pre_activation')([ - cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)), - Add(name='germline_cdr3')([v_germline_cdr3, j_germline_cdr3]) - ])) + Add(name="cdr3_pre_activation")( + [ + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(post_decoder)), + Add(name="germline_cdr3")([v_germline_cdr3, j_germline_cdr3]), + ] + ) + ) cdr3_length_output = cdr3_length_output_l(cdr3_output) - contiguous_match_output = contiguous_match_output_l([cdr3_output, v_germline_cdr3, j_germline_cdr3]) + contiguous_match_output = contiguous_match_output_l( + [cdr3_output, v_germline_cdr3, 
j_germline_cdr3] + ) # Define the decoder components separately so we can have it as its own model. - z_mean_input = Input(shape=(params['latent_dim'], )) + z_mean_input = Input(shape=(params["latent_dim"],)) decoder_post_decoder = decoder_dense_2_l(decoder_dense_1_l(z_mean_input)) decoder_v_gene_output = v_gene_output_l(decoder_post_decoder) decoder_j_gene_output = j_gene_output_l(decoder_post_decoder) decoder_v_germline_cdr3 = v_germline_cdr3_l(decoder_v_gene_output) decoder_j_germline_cdr3 = j_germline_cdr3_l(decoder_j_gene_output) decoder_cdr3_output = cdr3_output_l( - Add(name='cdr3_pre_activation')([ - cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)), - Add(name='germline_cdr3')([decoder_v_germline_cdr3, decoder_j_germline_cdr3]) - ])) + Add(name="cdr3_pre_activation")( + [ + cdr3_post_dense_reshape_l(cdr3_post_dense_flat_l(decoder_post_decoder)), + Add(name="germline_cdr3")( + [decoder_v_germline_cdr3, decoder_j_germline_cdr3] + ), + ] + ) + ) decoder_cdr3_length_output = cdr3_length_output_l(decoder_cdr3_output) decoder_contiguous_match_output = contiguous_match_output_l( - [decoder_cdr3_output, decoder_v_germline_cdr3, decoder_j_germline_cdr3]) - - encoder = Model([cdr3_input, cdr3_length_input, v_gene_input, j_gene_input, contiguous_match_input], - [z_mean, z_log_var]) - decoder = Model(z_mean_input, [ - decoder_cdr3_output, decoder_cdr3_length_output, decoder_v_gene_output, decoder_j_gene_output, - decoder_contiguous_match_output - ]) - vae = Model([cdr3_input, cdr3_length_input, v_gene_input, j_gene_input, contiguous_match_input], - [cdr3_output, cdr3_length_output, v_gene_output, j_gene_output, contiguous_match_output]) + [decoder_cdr3_output, decoder_v_germline_cdr3, decoder_j_germline_cdr3] + ) + + encoder = Model( + [ + cdr3_input, + cdr3_length_input, + v_gene_input, + j_gene_input, + contiguous_match_input, + ], + [z_mean, z_log_var], + ) + decoder = Model( + z_mean_input, + [ + decoder_cdr3_output, + decoder_cdr3_length_output, + decoder_v_gene_output, + decoder_j_gene_output, + decoder_contiguous_match_output, + ], + ) + vae = Model( + [ + cdr3_input, + cdr3_length_input, + v_gene_input, + j_gene_input, + contiguous_match_input, + ], + [ + cdr3_output, + cdr3_length_output, + v_gene_output, + j_gene_output, + contiguous_match_output, + ], + ) vae.compile( optimizer="adam", loss={ - 'cdr3_output': vae_cdr3_loss, - 'cdr3_length_output': keras.losses.mean_squared_error, - 'v_gene_output': keras.losses.categorical_crossentropy, - 'j_gene_output': keras.losses.categorical_crossentropy, - 'contiguous_match_output': mean_squared_error_2d + "cdr3_output": vae_cdr3_loss, + "cdr3_length_output": keras.losses.mean_squared_error, + "v_gene_output": keras.losses.categorical_crossentropy, + "j_gene_output": keras.losses.categorical_crossentropy, + "contiguous_match_output": mean_squared_error_2d, }, loss_weights={ # Keep the cdr3_output weight to be 1. 
The weights are relative @@ -164,23 +255,40 @@ def mse(i): "cdr3_output": 1, "j_gene_output": 0.1596, "v_gene_output": 0.9282, - "contiguous_match_output": 0.05 - }) + "contiguous_match_output": 0.05, + }, + ) - callbacks = [BetaWarmup(beta, params['beta'], params['warmup_period'])] + callbacks = [BetaWarmup(beta, params["beta"], params["warmup_period"])] - return {'encoder': encoder, 'decoder': decoder, 'vae': vae, 'callbacks': callbacks} + return {"encoder": encoder, "decoder": decoder, "vae": vae, "callbacks": callbacks} def prepare_data(x_df): cdr3_data, v_gene_data, j_gene_data = common.cols_of_df(x_df) - cdr3_length_data = conversion.cdr3_length_of_onehots(x_df['amino_acid']) + cdr3_length_data = conversion.cdr3_length_of_onehots(x_df["amino_acid"]) max_cdr3_len = cdr3_data.shape[1] - v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors(max_cdr3_len) - contiguous_match_data = conversion.contiguous_match_counts_df(x_df, v_germline_tensor, j_germline_tensor) - return [cdr3_data, cdr3_length_data, v_gene_data, j_gene_data, contiguous_match_data] + v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors( + max_cdr3_len + ) + contiguous_match_data = conversion.contiguous_match_counts_df( + x_df, v_germline_tensor, j_germline_tensor + ) + return [ + cdr3_data, + cdr3_length_data, + v_gene_data, + j_gene_data, + contiguous_match_data, + ] def interpret_output(output): - (cdr3_output, cdr3_length_output, v_gene_output, j_gene_output, contiguous_match_output) = output + ( + cdr3_output, + cdr3_length_output, + v_gene_output, + j_gene_output, + contiguous_match_output, + ) = output return (cdr3_output, v_gene_output, j_gene_output) diff --git a/vampire/pipe_simple/SConscript b/vampire/pipe_simple/SConscript new file mode 100644 index 0000000..a045d7c --- /dev/null +++ b/vampire/pipe_simple/SConscript @@ -0,0 +1,361 @@ +import json +import os +from os.path import join +import re + +import numpy as np # noqa + +import nestly +import nestly.scons as ns +import SCons.Script as sc + +import common +import tcr_vae + +from common import cluster_execution_string + +sc.Import('env') +localenv = env.Clone() # noqa + + +def check_mode(): + if localenv['mode'] not in ['mini', 'default']: + raise Exception(f"Unknown mode '{localenv['mode']}'") + + +def apply_mode(l): + """ + Default mode runs everything, and mini mode just runs a single element from the list. + """ + check_mode() + if localenv['mode'] == 'mini': + return [l[0]] + else: + return l + + +def default_params_by_mode(): + """ + Mini mode doesn't train for long. + """ + params = tcr_vae.TCRVAE.default_params() + + if localenv['mode'] == 'mini': + params['pretrains'] = 2 + params['warmup_period'] = 3 + params['epochs'] = 10 + + return params + + +def base_dict(): + """ + The dictionary that will be shared by all the nests. + + nseqs: the number of sequences generated by the various programs, and for + which we evaluate Pvae for on the real data. + + train_size: the number of sequences to take for training. + + max_q: the level for truncating q_{lvj} in the thymic Q calculation. + """ + + d = {'nseqs': 10000, + 'train_size': 100000, + 'max_q': 100, + } + + if localenv['mode'] == 'mini': + d['nseqs'] = 100 + + # Slurp up all key-value pairs in the JSON file into the base_dict. 
+ with open(localenv['data']) as fp: + for k, v in json.load(fp).items(): + d[k] = v + + return d + + +def numerical_nest_add(nest_name, number_list): + """ + Add an nest for a list of non-negative numbers, with nice zero-padded + directory names. + """ + nest.add(nest_name, apply_mode(number_list), label_func=common.zero_pad_list_func(number_list)) + + +# ### Nests and targets ### + +nest = ns.SConsWrap(nestly.Nest(base_dict=base_dict()), alias_environment=localenv) + +# Nest: the data set choice, named via data name prepended with `_output_`. +nest.add('data_label', [localenv['data']], label_func=lambda p: '_output_' + common.strip_dirpath_extn(p)) + +# The test_set_agg allows us to process the test sets once in the things that don't depend on the VAE. +nest.add_aggregate('test_set_agg', list) +# The test_set_info_agg gathers information about the processed test sets so we can get at them later. +nest.add_aggregate('test_set_info_agg', dict) +# The summarized_agg gathers everything we want to summarize and then stack at the end. +nest.add_aggregate('summarized_agg', list) +# The merged gathers merged per-sequence information on the test set. +nest.add_aggregate('merged_agg', list) +summarized_agg_names = [] +# nest.add_aggregate('loss_regression_agg', list) + + +@nest.add_target_with_env(localenv) +def sconscript(env, outdir, c): + """ + Copy SConscript file into output directory. + """ + return env.Command( + join(outdir, 'SConscript'), + 'SConscript', + 'cp $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def json_data(env, outdir, c): + """ + Copy JSON data file into output directory. + """ + return env.Command( + join(outdir, os.path.basename(localenv['data'])), + localenv['data'], + 'cp $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def vampire_version(env, outdir, c): + """ + Echo vampire version to file. + """ + return env.Command( + join(outdir, 'vampire-version.txt'), + [], + 'python -c "import vampire; print(vampire.__version__)" > $TARGET')[0] + + +# Nest: initial processing of the test sets. +nest.add('test_data', lambda c: c['test_paths'], label_func=common.strip_dirpath_extn) + + +@nest.add_target_with_env(localenv) +def test_head(env, outdir, c): + """ + Run the preprocess_adaptive.py script on the test data, which incorporates + downsampling. + """ + test_head_path = join(outdir, common.strip_dirpath_extn(c['test_data'])+'.head.csv') + # Remove a pesky and meaningless leading `./` that was making trouble later. + c['test_set_agg'].append(re.sub('^\./', '', test_head_path)) + return env.Command( + test_head_path, + c['test_data'], + f"python3 preprocess_adaptive.py --sample {c['nseqs']} $SOURCE $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def test_pgen(env, outdir, c): + test_pgen_path = join(outdir, 'test-head.pgen.tsv') + c['test_set_info_agg'][(str(c['test_head']), 'test_pgen')] = test_pgen_path + return env.Command( + test_pgen_path, + c['test_head'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + + +nest.pop('test_data') + + +# Nest: the training data. +# By using a nest we retain the option of having multiple training sets later. 
+nest.add('train_data', lambda c: [c['train_tsv_path']], label_func=common.strip_dirpath_extn) +summarized_agg_names.append('train_data') + + +# Nest: training set size +# numerical_nest_add('train_size', np.floor(common.logspace(5000, 500000, 6))) +# summarized_agg_names.append('train_size') + + +@nest.add_target_with_env(localenv) +def preprocess(env, outdir, c): + """ + Run the preprocess_adaptive.py script on the training/validation data. + """ + in_path = c['train_data'] + # We take enough for training and validation_head, with nseqs to spare. + n_to_sample = c['train_size'] + 2 * c['nseqs'] + return env.Command( + join(outdir, 'train.processed.csv'), + in_path, + f'python3 preprocess_adaptive.py --sample {n_to_sample} $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv, 'split') +@ns.name_targets +def split(env, outdir, c): + """ + Split the training data into training and validation. + """ + in_path = c['preprocess'] + return 'training', 'validation', env.Command( + [join(outdir, 'training.csv'), join(outdir, 'validation.csv')], + in_path, + f"python3 util.py split --train-size {c['train_size']} $SOURCE $TARGETS") + +@nest.add_target_with_env(localenv) +def training_head(env, outdir, c): + return env.Command( + common.strip_extn(c['split']['training'])+'.head.csv', + c['split']['training'], + f"head -n {c['nseqs']+1} $SOURCE > $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def training_head_pgen(env, outdir, c): + """ + Get Pgen for some of the training data. + """ + return env.Command( + join(outdir, 'training.head.pgen.tsv'), + c['training_head'], + 'adaptive-pgen.sh $SOURCE $TARGET')[0] + + +@nest.add_target_with_env(localenv) +def validation_head(env, outdir, c): + return env.Command( + common.strip_extn(c['split']['validation'])+'.head.csv', + c['split']['validation'], + f"head -n {c['nseqs']+1} $SOURCE > $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def validation_pgen(env, outdir, c): + return env.Command( + join(outdir, common.strip_dirpath_extn(c['validation_head'])+'.pgen.csv'), + c['validation_head'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + +# Nest: the dimension of the latent space. +# numerical_nest_add('warmup_period', [5*i for i in range(0,6)]) +# summarized_agg_names.append('warmup_period') + +# Nest: the dimension of the latent space. +# numerical_nest_add('latent_dim', [30, 35, 40]) +# summarized_agg_names.append('latent_dim') + +# Nest: the number of dense nodes. +# numerical_nest_add('dense_nodes', [50, 75, 100, 125, 150]) +# summarized_agg_names.append('dense_nodes') + +# Nest: the amino acid embedding dimension. +# numerical_nest_add('aa_embedding_dim', [10, 15, 20, 21]) +# summarized_agg_names.append('aa_embedding_dim') + +# Nest: the V gene embedding dimension. +# numerical_nest_add('v_gene_embedding_dim', [20, 30, 40]) +# summarized_agg_names.append('v_gene_embedding_dim') + +# Nest: the strength of the KL component of the VAE loss. +# numerical_nest_add('beta', np.linspace(0.625, 1, 7)) +numerical_nest_add('beta', [0.75]) +summarized_agg_names.append('beta') + +# Nest: monitor for EarlyStopping. +# nest.add('stopping_monitor', ['loss', 'val_loss']) +# summarized_agg_names.append('stopping_monitor') + +# Nest: EarlyStopping patience. +# numerical_nest_add('patience', [5*x for x in range(7)]) +# summarized_agg_names.append('patience') + + +# Nest: the model. +# This needs to appear in the innermost nest so that we can pop it and then do some OLGA work. 
+# In order to keep things happy when merging with OLGA results, don't comment +# this out if you are only interested in one model. Instead, make a +# single-model nest. +# nest.add('model', apply_mode([common.strip_dirpath_extn(m) for m in glob.glob('../models/*.py')])) +nest.add('model', ['basic', 'count_match']) +summarized_agg_names.append('model') + + +@nest.add_target_with_env(localenv) +def model_params(env, outdir, c): + """ + Write out a file with parameters from which we can build our VAE. + """ + + # Copy over any relevant parameters from c into the params dictionary. + params = default_params_by_mode() + for k, v in c.items(): + if k in params: + params[k] = v + + return env.Command( + join(outdir, 'model_params.json'), + [], + f"echo '{json.dumps(params)}' > $TARGET")[0] + + +@nest.add_target_with_env(localenv, 'trained') +@ns.name_targets +def trained(env, outdir, c): + return 'weights', 'diagnostics', env.Command( + [join(outdir, 'best_weights.h5'), join(outdir, 'diagnostics.csv')], + [c['model_params'], c['split']['training']], + cluster_execution_string('tcr-vae train {sources} {targets}', localenv)) + + +# @nest.add_target_with_env(localenv) +# def loss(env, outdir, c): +# loss = join(outdir, 'loss.csv') +# return env.Command( +# loss, +# [c['model_params'], c['trained']['weights'], c['split']['training'], +# c['split']['validation']], +# cluster_execution_string('tcr-vae loss {sources} {targets}'))[0] + + +@nest.add_target_with_env(localenv) +def training_pvae(env, outdir, c): + pvae_call = f"tcr-vae pvae --limit-input-to {c['nseqs']}" + return env.Command( + join(outdir, common.strip_dirpath_extn(c['split']['training'])+'.head.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['split']['training']], + cluster_execution_string(pvae_call + ' {sources} {targets}', localenv))[0] + + +@nest.add_target_with_env(localenv) +def validation_pvae(env, outdir, c): + return env.Command( + join(outdir, common.strip_dirpath_extn(c['validation_head'])+'.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['validation_head']], + cluster_execution_string('tcr-vae pvae {sources} {targets}', localenv))[0] + + +@nest.add_target_with_env(localenv) +def vae_generated(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.csv'), + [c['model_params'], c['trained']['weights']], + f"tcr-vae generate --nseqs {c['nseqs']} $SOURCES $TARGET")[0] + + +@nest.add_target_with_env(localenv) +def vae_generated_pgen(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.pgen.tsv'), + c['vae_generated'], + cluster_execution_string('adaptive-pgen.sh {sources} {targets}', localenv, 0))[0] + +@nest.add_target_with_env(localenv) +def vae_generated_pvae(env, outdir, c): + return env.Command( + join(outdir, 'vae-generated.pvae.csv'), + [c['model_params'], c['trained']['weights'], c['vae_generated']], + cluster_execution_string('tcr-vae pvae {sources} {targets}', localenv))[0] + diff --git a/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 b/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 new file mode 100644 index 0000000..25ff8f7 Binary files /dev/null and b/vampire/pipe_simple/sample_data/02-0249_TCRB.4000.tsv.bz2 differ diff --git a/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv b/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv new file mode 100644 index 0000000..ed794c8 --- /dev/null +++ b/vampire/pipe_simple/sample_data/02-0249_TCRB.head.tsv @@ -0,0 +1,10 @@ +rearrangement amino_acid frame_type rearrangement_type templates reads 
frequency productive_frequency cdr3_length v_family v_gene v_allele d_family d_gene d_allele j_family j_gene j_allele v_deletions d5_deletions d3_deletions j_deletions n2_insertions n1_insertions v_index n1_index n2_index d_index j_index v_family_ties v_gene_ties v_allele_ties d_family_ties d_gene_ties d_allele_ties j_family_ties j_gene_ties j_allele_ties sequence_tags v_shm_count v_shm_indexes antibody sample_name species locus product_subtype kit_pool total_templates productive_templates outofframe_templates stop_templates dj_templates total_rearrangements productive_rearrangements outofframe_rearrangements stop_rearrangements dj_rearrangements total_reads total_productive_reads total_outofframe_reads total_stop_reads total_dj_reads productive_clonality productive_entropy sample_clonality sample_entropy sample_amount_ng sample_cells_mass_estimate fraction_productive_of_cells_mass_estimate sample_cells fraction_productive_of_cells max_productive_frequency max_frequency counting_method primer_set release_date sample_tags fraction_productive order_name kit_id total_t_cells +CTCACTCTGGAGTCTGCTGCCTCCTCCCAGACATCTGTATATTTCTGCGCCAGCAGTGAGGGAAATCAGCCCCAGCATTTTGGTGAT CASSEGNQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 36 TCRBV10 TCRBV10-01 unresolved TCRBJ01 TCRBJ01-05 01 2 1 8 4 0 0 45 -1 -1 60 63 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 12 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +AAGATCCAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCCCGACGCGAAATCAGCCCCAGCATTTTGGTGAT CASSPTRNQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV11 TCRBV11-02 02 unresolved TCRBJ01 TCRBJ01-05 01 5 2 7 4 4 2 42 54 59 56 63 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 21.3 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +CTCACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGTATAGGGGACCAGCCCCAGCATTTTGGTGAT CASSIGDQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 36 TCRBV19 TCRBV19-01 unresolved TCRBJ01 TCRBJ01-05 01 1 0 7 7 0 0 45 -1 -1 61 66 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial 
Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGCCCTCGGGGGGAACAGCCCCAGCATTTTGGTGAT CASSPRGEQPQHF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV19 TCRBV19-01 TCRBD02 TCRBD02-01 01 TCRBJ01 TCRBJ01-05 01 6 8 1 7 2 4 42 53 64 57 66 01,02 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +TGCTGTACCCTCTCAGACATCTGTGTACTTCTGTGCCAGCAGTGATCGCCCCAGGGGAGACCAAACTATTGCTACACCTTCGGTTCG Out VDJ 1 null 8.64655478024781E-6 null 50 TCRBV06 TCRBV06-04 TCRBD01 TCRBD01-01 01 TCRBJ01 TCRBJ01-02 01 3 4 2 9 13 6 31 45 57 51 70 01,02 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ACTGTGACATCGGCCCAAAAGAACCCGACAGCTTTCTATCTCTGTGCCAGTAGGGCACACCTAATGGGTGGCTACACCTTCGGTTCG CASRAHLMGGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 39 TCRBV19 TCRBV19-01 unresolved TCRBJ01 TCRBJ01-02 01 6 0 9 7 0 12 42 53 -1 65 68 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 17 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +GAGTGCCTTGGAGCTGGGGGACTCGGCCCTGTATCTCTGTGCCAGAAGCTTGGAACTCTTCAATGGGGTGGCTACACCTTCGGTTCG Out VDJ 1 null 8.64655478024781E-6 null 44 TCRBV05 TCRBV05-03 unresolved TCRBJ01 TCRBJ01-02 01 0 6 2 7 0 11 37 53 -1 64 68 01,02 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) 
Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +CAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCACCCCAAAGGGCCCGCCGTCTGGCTACACCTTCGGTTCG CASSTPKGPPSGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 45 TCRBV11 TCRBV11-02 02 unresolved TCRBJ01 TCRBJ01-02 01 5 5 3 7 9 7 36 48 59 55 68 TCRBD01,TCRBD02 TCRBD01-01,TCRBD02-01 null null null Vb 21.3 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 +ATCCAGCCCTCAGAACCCAGGGACTCAGCTGTGTACTTCTGTGCCAGCAGTTCGACCTCCTCAGGGGCTGGCTACACCTTCGGTTCG CASSSTSSGAGYTF In VDJ 1 null 8.64655478024781E-6 1.0341261633919338E-5 42 TCRBV12 unresolved TCRBD01 TCRBD01-01 01 TCRBJ01 TCRBJ01-02 01 4 4 2 7 1 9 39 52 67 61 68 TCRBV12-03,TCRBV12-04 null null null null 02-0249_TCRB Human TCRB Deep null 115653 96700 17505 1448 0 90702 76251 13167 1284 0 null null null null null 0.0755023062 14.9939365 0.0734105557 15.2598591 1200.0 184615 0.523792757901579 0 0.0 0.0439400189 0.0367392115 v3 Human-TCRB-PD1x 2015-11-09 16:34:04.58 Age:14 Years,Age (Range):10-14 Years,Age (Range):12-18 Years (Adolescence),Biological Sex:Male,Racial Group:Mixed racial group,Subject:Interferon Gamma Release Assay (IGRA) Negative,Subject:Tuberculosis Skin Test Negative,Tissue Source:gDNA,Tissue Source:PBMC,Tissue Source:T cells 0.8361218472499633 UW-Seshadri-P0201 null 0 diff --git a/vampire/pipe_simple/sample_data/sample.json b/vampire/pipe_simple/sample_data/sample.json new file mode 100644 index 0000000..f5218b8 --- /dev/null +++ b/vampire/pipe_simple/sample_data/sample.json @@ -0,0 +1,9 @@ +{ + "comment": "Just for the purposes of this sample JSON run specification file, the test path overlaps with the train path. Of course, don't do this for a real analysis.", + "test_paths" : [ + "sample_data/02-0249_TCRB.4000.tsv.bz2" + ], + "train_tsv_path" : "sample_data/02-0249_TCRB.4000.tsv.bz2", + "train_size" : 1000, + "nseqs" : 100 +} diff --git a/vampire/preprocess_adaptive.py b/vampire/preprocess_adaptive.py index d31d31a..da0b973 100644 --- a/vampire/preprocess_adaptive.py +++ b/vampire/preprocess_adaptive.py @@ -15,10 +15,10 @@ # Sometimes Adaptive uses one set of column names, and sometimes another. HEADER_TRANSLATION_DICT = { - 'sequenceStatus': 'frame_type', - 'aminoAcid': 'amino_acid', - 'vGeneName': 'v_gene', - 'jGeneName': 'j_gene' + "sequenceStatus": "frame_type", + "aminoAcid": "amino_acid", + "vGeneName": "v_gene", + "jGeneName": "j_gene", } @@ -26,7 +26,7 @@ def filter_and_drop_frame(df): """ Select in-frame sequences and then drop that column. """ - return df.query('frame_type == "In"').drop('frame_type', axis=1) + return df.query('frame_type == "In"').drop("frame_type", axis=1) def filter_on_cdr3_bounding_aas(df): @@ -37,21 +37,24 @@ def filter_on_cdr3_bounding_aas(df): Note that according to the Adaptive docs the `amino_acid` column is indeed the CDR3 amino acid. 
""" - return df[df['amino_acid'].str.contains('^C.*F$') | df['amino_acid'].str.contains('^C.*YV$')] + return df[ + df["amino_acid"].str.contains("^C.*F$") + | df["amino_acid"].str.contains("^C.*YV$") + ] def filter_on_cdr3_length(df, max_len): """ Only take sequences that have a CDR3 of at most `max_len` length. """ - return df[df['amino_acid'].apply(len) <= max_len] + return df[df["amino_acid"].apply(len) <= max_len] def filter_on_TCRB(df): """ Only take sequences that have a resolved TCRB gene for V and J. """ - return df[df['v_gene'].str.contains('^TCRB') & df['j_gene'].str.contains('^TCRB')] + return df[df["v_gene"].str.contains("^TCRB") & df["j_gene"].str.contains("^TCRB")] def filter_on_olga(df): @@ -64,8 +67,8 @@ def filter_on_olga(df): * exclude TCRBJ2-7, which appears to be problematic for OLGA. """ d = conversion.adaptive_to_olga_dict() - del d['TRBJ']['TCRBJ02-05'] - del d['TRBJ']['TCRBJ02-07'] + del d["TRBJ"]["TCRBJ02-05"] + del d["TRBJ"]["TCRBJ02-07"] return conversion.filter_by_gene_names(df, d) @@ -89,7 +92,9 @@ def apply_all_filters(df, max_len=30, fail_fraction_remaining=None): click.echo(f"Requiring genes that are also present in the OLGA set: {len(df)} rows") if fail_fraction_remaining: if len(df) / original_count < fail_fraction_remaining: - raise Exception(f"We started with {original_count} sequences and now we have {len(df)}. Failing.") + raise Exception( + f"We started with {original_count} sequences and now we have {len(df)}. Failing." + ) return df.reset_index(drop=True) @@ -106,8 +111,8 @@ def collect_vjcdr3_duplicates(df): for idx, row in df.iterrows(): # nan means no CDR3 sequence. We don't want to include those. - if row['amino_acid'] is not np.nan: - key = '_'.join([row['v_gene'], row['j_gene'], row['amino_acid']]) + if row["amino_acid"] is not np.nan: + key = "_".join([row["v_gene"], row["j_gene"], row["amino_acid"]]) d[key].append(idx) return d @@ -138,7 +143,7 @@ def read_adaptive_tsv(path): snake_case and the other that uses camelCase. """ - test_bite = pd.read_csv(path, delimiter='\t', nrows=1) + test_bite = pd.read_csv(path, delimiter="\t", nrows=1) camel_columns = set(HEADER_TRANSLATION_DICT.keys()) snake_columns = set(HEADER_TRANSLATION_DICT.values()) @@ -150,7 +155,7 @@ def read_adaptive_tsv(path): else: raise Exception("Unknown column names!") - df = pd.read_csv(path, delimiter='\t', usecols=take_columns) + df = pd.read_csv(path, delimiter="\t", usecols=take_columns) df.rename(columns=HEADER_TRANSLATION_DICT, inplace=True) return df @@ -159,34 +164,39 @@ def read_adaptive_tsv(path): # Below we use Path rather than File because we don't want to have to figure # out whether a file is compressed or not-- Pandas will figure that out for us. 
@click.option( - '--fail-fraction-remaining', + "--fail-fraction-remaining", show_default=True, default=0.25, - help="Fail if the post-filtration fraction is below this number.") + help="Fail if the post-filtration fraction is below this number.", +) @click.option( - '--sample', + "--sample", type=int, - metavar='N', - help="Sample N sequences without replacement from the preprocessed sequences for output.") -@click.argument('in_tsv', type=click.Path(exists=True)) -@click.argument('out_csv', type=click.File('w')) + metavar="N", + help="Sample N sequences without replacement from the preprocessed sequences for output.", +) +@click.argument("in_tsv", type=click.Path(exists=True)) +@click.argument("out_csv", type=click.File("w")) def preprocess_tsv(fail_fraction_remaining, sample, in_tsv, out_csv): """ Preprocess the Adaptive TSV at IN_TSV and output to OUT_CSV. This includes doing filters as well as deduplicating on vjcdr3s. """ - df = apply_all_filters(read_adaptive_tsv(in_tsv), fail_fraction_remaining=fail_fraction_remaining) + df = apply_all_filters( + read_adaptive_tsv(in_tsv), fail_fraction_remaining=fail_fraction_remaining + ) if sample: if len(df) < sample: raise Exception( - f"We only have {len(df)} sequences so we can't sample {sample} of them without replacement. Failing.") + f"We only have {len(df)} sequences so we can't sample {sample} of them without replacement. Failing." + ) df = df.sample(n=sample) click.echo(f"Sampling: {sample} rows") df.to_csv(out_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": preprocess_tsv() diff --git a/vampire/requirements.txt b/vampire/requirements.txt index dc1c21e..cc27c34 100644 --- a/vampire/requirements.txt +++ b/vampire/requirements.txt @@ -3,7 +3,6 @@ click delegator.py exrex flake8 -keras matplotlib nestly pandas diff --git a/vampire/tcr_vae.py b/vampire/tcr_vae.py index eb7c22b..83b6913 100644 --- a/vampire/tcr_vae.py +++ b/vampire/tcr_vae.py @@ -24,9 +24,9 @@ import numpy as np import pandas as pd -import keras -import keras.backend as K -from keras.callbacks import EarlyStopping, ModelCheckpoint +import tensorflow.keras as keras +import tensorflow.keras.backend as K +from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint import scipy.special as special import scipy.stats as stats @@ -56,7 +56,7 @@ def logprob_of_obs_vect(probs, obs): class TCRVAE: def __init__(self, params): self.params = params - model = importlib.import_module('vampire.models.' + params['model']) + model = importlib.import_module("vampire.models." + params["model"]) # Digest the dictionary returned by model.build into self attributes. for submodel_name, submodel in model.build(params).items(): setattr(self, submodel_name, submodel) @@ -75,7 +75,7 @@ def default_params(cls): """ return dict( # Models: - model='basic', + model="basic", # Model parameters. latent_dim=20, dense_nodes=75, @@ -89,12 +89,13 @@ def default_params(cls): n_v_genes=len(conversion.TCRB_V_GENE_LIST), n_j_genes=len(conversion.TCRB_J_GENE_LIST), # Training parameters. - stopping_monitor='val_loss', + stopping_monitor="val_loss", batch_size=100, pretrains=10, warmup_period=20, epochs=500, - patience=20) + patience=20, + ) @classmethod def default(cls): @@ -108,7 +109,7 @@ def of_json_file(cls, fname): """ Build a TCRVAE from a parameter dictionary dumped to JSON. 
""" - with open(fname, 'r') as fp: + with open(fname, "r") as fp: return cls(json.load(fp)) @classmethod @@ -120,21 +121,21 @@ def of_directory(cls, path): `model_params.json` and a weights file called `best_weights.h5`. Here we load that information in. """ - v = cls.of_json_file(os.path.join(path, 'model_params.json')) - v.vae.load_weights(os.path.join(path, 'best_weights.h5')) + v = cls.of_json_file(os.path.join(path, "model_params.json")) + v.vae.load_weights(os.path.join(path, "best_weights.h5")) return v def serialize_params(self, fname): """ Dump model parameters to a file. """ - with open(fname, 'w') as fp: + with open(fname, "w") as fp: json.dump(self.params, fp) def reinitialize_weights(self): session = K.get_session() for layer in self.vae.layers: - if hasattr(layer, 'kernel_initializer'): + if hasattr(layer, "kernel_initializer"): layer.kernel.initializer.run(session=session) def get_data(self, fname, data_chunk_size=0): @@ -142,16 +143,22 @@ def get_data(self, fname, data_chunk_size=0): Get data in the correct format from fname. If data_chunk_size is nonzero, trim so the data length is a multiple of data_chunk_size. """ - df = pd.read_csv(fname, usecols=['amino_acid', 'v_gene', 'j_gene']) + df = pd.read_csv(fname, usecols=["amino_acid", "v_gene", "j_gene"]) if data_chunk_size == 0: sub_df = df else: assert len(df) >= data_chunk_size n_to_take = len(df) - len(df) % data_chunk_size sub_df = df[:n_to_take] - return conversion.unpadded_tcrbs_to_onehot(sub_df, self.params['max_cdr3_len']) - - def fit(self, x_df: pd.DataFrame, validation_split: float, best_weights_fname: str, tensorboard_log_dir: str): + return conversion.unpadded_tcrbs_to_onehot(sub_df, self.params["max_cdr3_len"]) + + def fit( + self, + x_df: pd.DataFrame, + validation_split: float, + best_weights_fname: str, + tensorboard_log_dir: str, + ): """ Fit the vae with warmup and early stopping. """ @@ -160,46 +167,57 @@ def fit(self, x_df: pd.DataFrame, validation_split: float, best_weights_fname: s best_val_loss = np.inf # We pretrain a given number of times and take the best run for the full train. - for pretrain_idx in range(self.params['pretrains']): + for pretrain_idx in range(self.params["pretrains"]): self.reinitialize_weights() # In our first fitting phase we don't apply EarlyStopping so that # we get the number of specifed warmup epochs. # Below we apply the fact that right now the only thing in self.callbacks is the BetaSchedule callback. # If other callbacks appear we'll need to change this. if tensorboard_log_dir: - callbacks = [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir + '_warmup_' + str(pretrain_idx))] + callbacks = [ + keras.callbacks.TensorBoard( + log_dir=tensorboard_log_dir + "_warmup_" + str(pretrain_idx) + ) + ] else: callbacks = [] callbacks += self.callbacks # <- here re callbacks history = self.vae.fit( x=data, # y=X for a VAE. 
y=data, - epochs=1 + self.params['warmup_period'], - batch_size=self.params['batch_size'], + epochs=1 + self.params["warmup_period"], + batch_size=self.params["batch_size"], validation_split=validation_split, callbacks=callbacks, - verbose=2) - new_val_loss = history.history['val_loss'][-1] + verbose=2, + ) + new_val_loss = history.history["val_loss"][-1] if new_val_loss < best_val_loss: best_val_loss = new_val_loss self.vae.save_weights(best_weights_fname, overwrite=True) self.vae.load_weights(best_weights_fname) - checkpoint = ModelCheckpoint(best_weights_fname, save_best_only=True, mode='min') + checkpoint = ModelCheckpoint( + best_weights_fname, save_best_only=True, mode="min" + ) early_stopping = EarlyStopping( - monitor=self.params['stopping_monitor'], patience=self.params['patience'], mode='min') + monitor=self.params["stopping_monitor"], + patience=self.params["patience"], + mode="min", + ) callbacks = [checkpoint, early_stopping] if tensorboard_log_dir: callbacks += [keras.callbacks.TensorBoard(log_dir=tensorboard_log_dir)] self.vae.fit( x=data, # y=X for a VAE. y=data, - epochs=self.params['epochs'], - batch_size=self.params['batch_size'], + epochs=self.params["epochs"], + batch_size=self.params["batch_size"], validation_split=validation_split, callbacks=callbacks, - verbose=2) + verbose=2, + ) def evaluate(self, x_df, per_sequence=False): """ @@ -215,7 +233,9 @@ def evaluate(self, x_df, per_sequence=False): """ def our_evaluate(data): - return self.vae.evaluate(x=data, y=data, batch_size=self.params['batch_size'], verbose=0) + return self.vae.evaluate( + x=data, y=data, batch_size=self.params["batch_size"], verbose=0 + ) data = self.prepare_data(x_df) @@ -226,7 +246,12 @@ def our_evaluate(data): # data elements the minimum amount for evaluate to work (the # batch_size) and evaluate. return [ - our_evaluate([common.repeat_row(data_elt, i, self.params['batch_size']) for data_elt in data]) + our_evaluate( + [ + common.repeat_row(data_elt, i, self.params["batch_size"]) + for data_elt in data + ] + ) for i in range(len(x_df)) ] @@ -252,14 +277,16 @@ def generate(self, n_seqs): """ Generate a data frame of n_seqs sequences. """ - batch_size = self.params['batch_size'] + batch_size = self.params["batch_size"] # Increase the number of desired sequences as needed so it's divisible by batch_size. n_actual = batch_size * math.ceil(n_seqs / batch_size) # Sample from the latent space to generate sequences: - z_sample = np.random.normal(0, 1, size=(n_actual, self.params['latent_dim'])) + z_sample = np.random.normal(0, 1, size=(n_actual, self.params["latent_dim"])) amino_acid_arr, v_gene_arr, j_gene_arr = self.decode(z_sample) # Convert back, restricting to the desired number of sequences. - return conversion.onehot_to_tcrbs(amino_acid_arr[:n_seqs], v_gene_arr[:n_seqs], j_gene_arr[:n_seqs]) + return conversion.onehot_to_tcrbs( + amino_acid_arr[:n_seqs], v_gene_arr[:n_seqs], j_gene_arr[:n_seqs] + ) def log_pvae_importance_sample(self, x_df, out_ps): """ @@ -294,7 +321,7 @@ def log_pvae_importance_sample(self, x_df, out_ps): # We're going to be getting a one-sample estimate, so we want one slot # in our output array for each input sequence. - assert (len(x_df) == len(out_ps)) + assert len(x_df) == len(out_ps) # Get encoding of x's in the latent space. z_mean, z_sd = self.encode(x_df) @@ -310,10 +337,11 @@ def log_pvae_importance_sample(self, x_df, out_ps): # Loop over observations. 
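The comments above describe a one-sample importance estimate, which the loop below assembles for each sequence. A minimal standalone sketch of that estimate for a one-dimensional latent space (all numbers hypothetical):

    import numpy as np
    from scipy import stats

    z_mean, z_sd = 0.3, 0.9                    # encoder output for one sequence (made up)
    z = np.random.normal(z_mean, z_sd)         # sample from q(z|x)
    log_p_x_given_z = -12.0                    # decoder log-likelihood of x (made up)
    log_p_z = stats.norm.logpdf(z, 0.0, 1.0)   # standard-normal prior
    log_q_z_given_x = stats.norm.logpdf(z, z_mean, z_sd)
    log_p_x_estimate = log_p_x_given_z + log_p_z - log_q_z_given_x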
for i in range(len(x_df)): - log_p_x_given_z = \ - logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + \ - np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + \ - np.log(np.sum(j_gene_probs[i] * j_gene_obs[i])) + log_p_x_given_z = ( + logprob_of_obs_vect(aa_probs[i], aa_obs[i]) + + np.log(np.sum(v_gene_probs[i] * v_gene_obs[i])) + + np.log(np.sum(j_gene_probs[i] * j_gene_obs[i])) + ) # p(z) # Here we use that the PDF of a multivariate normal with # diagonal covariance is the product of the PDF of the @@ -336,11 +364,11 @@ def cli(): @cli.command() -@click.option('--tensorboard', is_flag=True, help="Record logs for TensorBoard.") -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('train_csv', type=click.File('r')) -@click.argument('best_weights_fname', type=click.Path(writable=True)) -@click.argument('diagnostics_fname', type=click.Path(writable=True)) +@click.option("--tensorboard", is_flag=True, help="Record logs for TensorBoard.") +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("train_csv", type=click.File("r")) +@click.argument("best_weights_fname", type=click.Path(writable=True)) +@click.argument("diagnostics_fname", type=click.Path(writable=True)) def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_fname): """ Train the model described in params_json using data in train_csv, saving @@ -355,11 +383,11 @@ def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_f # If this fails then we may have problems with chunks of the data being the # wrong length. assert sub_chunk_size == float(int(sub_chunk_size)) - min_data_size = validation_split_multiplier * v.params['batch_size'] + min_data_size = validation_split_multiplier * v.params["batch_size"] train_data = v.get_data(train_csv, min_data_size) if tensorboard: - tensorboard_log_dir = os.path.join(os.path.dirname(best_weights_fname), 'logs') + tensorboard_log_dir = os.path.join(os.path.dirname(best_weights_fname), "logs") else: tensorboard_log_dir = None v.fit(train_data, validation_split, best_weights_fname, tensorboard_log_dir) @@ -370,18 +398,21 @@ def train(tensorboard, params_json, train_csv, best_weights_fname, diagnostics_f vp.vae.load_weights(best_weights_fname) df = pd.DataFrame( - OrderedDict([('train', v.evaluate(train_data)), ('vp_train', vp.evaluate(train_data))]), - index=v.vae.metrics_names) + OrderedDict( + [("train", v.evaluate(train_data)), ("vp_train", vp.evaluate(train_data))] + ), + index=v.vae.metrics_names, + ) df.to_csv(diagnostics_fname) return v @cli.command() -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('train_csv', type=click.File('r')) -@click.argument('validation_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("train_csv", type=click.File("r")) +@click.argument("validation_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def loss(params_json, model_weights, train_csv, validation_csv, out_csv): """ Record aggregate losses. 
@@ -391,17 +422,25 @@ def loss(params_json, model_weights, train_csv, validation_csv, out_csv): v.vae.load_weights(model_weights) df = pd.DataFrame( - OrderedDict([('train', v.evaluate(v.get_data(train_csv, v.params['batch_size']))), - ('validation', v.evaluate(v.get_data(validation_csv, v.params['batch_size'])))]), - index=v.vae.metrics_names) + OrderedDict( + [ + ("train", v.evaluate(v.get_data(train_csv, v.params["batch_size"]))), + ( + "validation", + v.evaluate(v.get_data(validation_csv, v.params["batch_size"])), + ), + ] + ), + index=v.vae.metrics_names, + ) df.to_csv(out_csv) @cli.command() -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('in_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("in_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def per_seq_loss(params_json, model_weights, in_csv, out_csv): """ Record per-sequence losses. @@ -411,18 +450,31 @@ def per_seq_loss(params_json, model_weights, in_csv, out_csv): v.vae.load_weights(model_weights) df = pd.DataFrame( - np.array(v.evaluate(v.get_data(in_csv, v.params['batch_size']), per_sequence=True)), - columns=v.vae.metrics_names) + np.array( + v.evaluate(v.get_data(in_csv, v.params["batch_size"]), per_sequence=True) + ), + columns=v.vae.metrics_names, + ) df.to_csv(out_csv, index=False) @cli.command() -@click.option('--limit-input-to', default=None, type=int, help='Only use the first input sequences.') -@click.option('--nsamples', default=500, show_default=True, help='Number of importance samples to use.') -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('test_csv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.option( + "--limit-input-to", + default=None, + type=int, + help="Only use the first input sequences.", +) +@click.option( + "--nsamples", + default=500, + show_default=True, + help="Number of importance samples to use.", +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("test_csv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv): """ Estimate Pvae of the sequences in test_csv for the VAE determined by @@ -437,10 +489,12 @@ def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv df_x = v.get_data(test_csv) if limit_input_to is not None: - df_x = df_x.iloc[:int(limit_input_to)] + df_x = df_x.iloc[: int(limit_input_to)] log_p_x = np.zeros((nsamples, len(df_x))) - click.echo("Calculating pvae for {} via importance sampling...".format(test_csv.name)) + click.echo( + "Calculating pvae for {} via importance sampling...".format(test_csv.name) + ) with click.progressbar(range(nsamples)) as bar: for i in bar: @@ -448,21 +502,52 @@ def pvae(limit_input_to, nsamples, params_json, model_weights, test_csv, out_csv # Calculate log of mean of numbers given in log space. 
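A tiny sketch of that log-space mean (the numbers are arbitrary): probabilities far too small to average as plain floats are averaged via logsumexp minus the log of the count, exactly as in the line below.

    import numpy as np
    from scipy import special

    log_p = np.log(np.array([1e-300, 2e-300, 3e-300]))   # tiny probabilities kept as logs
    log_mean = special.logsumexp(log_p) - np.log(len(log_p))
    # log_mean equals log(2e-300) up to floating-point error, with no underflow.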
avg = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) - pd.DataFrame({'log_p_x': avg}).to_csv(out_csv, index=False) + pd.DataFrame({"log_p_x": avg}).to_csv(out_csv, index=False) @cli.command() -@click.option('--nsamples', default=100, show_default=True, help="Number of importance samples to use.") -@click.option('--batch-size', default=100, show_default=True, help="Batch size for tcregex calculation.") -@click.option('--max-iters', default=100, show_default=True, help="The maximum number of batch iterations to use.") @click.option( - '--track-last', default=5, show_default=True, help="We want the SD of the last track-last to be less than tol.") -@click.option('--tol', default=0.005, show_default=True, help="Tolerance for tcregex accuracy.") -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('in_tcregex') -@click.argument('out_csv', type=click.File('w')) -def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, model_weights, in_tcregex, out_csv): + "--nsamples", + default=100, + show_default=True, + help="Number of importance samples to use.", +) +@click.option( + "--batch-size", + default=100, + show_default=True, + help="Batch size for tcregex calculation.", +) +@click.option( + "--max-iters", + default=100, + show_default=True, + help="The maximum number of batch iterations to use.", +) +@click.option( + "--track-last", + default=5, + show_default=True, + help="We want the SD of the last track-last to be less than tol.", +) +@click.option( + "--tol", default=0.005, show_default=True, help="Tolerance for tcregex accuracy." +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("in_tcregex") +@click.argument("out_csv", type=click.File("w")) +def tcregex_pvae( + nsamples, + batch_size, + max_iters, + track_last, + tol, + params_json, + model_weights, + in_tcregex, + out_csv, +): """ Calculate Pvae for a TCR specified by a tcregex. @@ -484,7 +569,9 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, for batch_i in range(max_iters): df_generated = tcregex.sample_tcregex(in_tcregex, batch_size) - df_x = conversion.unpadded_tcrbs_to_onehot(df_generated, v.params['max_cdr3_len']) + df_x = conversion.unpadded_tcrbs_to_onehot( + df_generated, v.params["max_cdr3_len"] + ) log_p_x = np.zeros((nsamples, len(df_x))) @@ -493,13 +580,17 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, # Calculate log of mean of numbers given in log space. # This calculates the per-sequence log_p_x estimate. 
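For reference, the "tcregex" mini-format consumed by this command is v_gene,j_gene,cdr3-regex, where '.' matches any amino acid and the ambiguity codes B and Z expand to [DN] and [EQ] (see tcregex.py later in this patch). A hedged usage sketch, assuming tcregex is importable as a plain module:

    import tcregex  # assumption: run from within the vampire/ directory

    df = tcregex.sample_tcregex("TCRBV19-01,TCRBJ01-02,CASS.GNQPQHF", 5)
    # df has columns amino_acid, v_gene, j_gene with five CDR3s sampled from the regex.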
- df_generated['log_p_x'] = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) + df_generated["log_p_x"] = special.logsumexp(log_p_x, axis=0) - np.log(nsamples) generated_dfs.append(df_generated) catted = pd.concat(generated_dfs) - means.append(special.logsumexp(catted['log_p_x'], axis=0) - np.log(len(catted))) + means.append(special.logsumexp(catted["log_p_x"], axis=0) - np.log(len(catted))) if len(means) > track_last: recent_sd = np.std(np.array(means[-track_last:])) - click.echo("[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format(batch_i, means[-1], recent_sd, tol)) + click.echo( + "[Iter {}]\tmean: {:.6}\trecent SD: {:.5}\ttol: {}".format( + batch_i, means[-1], recent_sd, tol + ) + ) if recent_sd < tol: break else: @@ -510,10 +601,16 @@ def tcregex_pvae(nsamples, batch_size, max_iters, track_last, tol, params_json, @cli.command() -@click.option('-n', '--nseqs', default=100, show_default=True, help='Number of sequences to generate.') -@click.argument('params_json', type=click.Path(exists=True)) -@click.argument('model_weights', type=click.Path(exists=True)) -@click.argument('out_csv', type=click.File('w')) +@click.option( + "-n", + "--nseqs", + default=100, + show_default=True, + help="Number of sequences to generate.", +) +@click.argument("params_json", type=click.Path(exists=True)) +@click.argument("model_weights", type=click.Path(exists=True)) +@click.argument("out_csv", type=click.File("w")) def generate(nseqs, params_json, model_weights, out_csv): """ Generate some sequences and write them to a file. @@ -523,5 +620,5 @@ def generate(nseqs, params_json, model_weights, out_csv): v.generate(nseqs).to_csv(out_csv, index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/tcregex.py b/vampire/tcregex.py index b534ac0..81a0dea 100644 --- a/vampire/tcregex.py +++ b/vampire/tcregex.py @@ -9,12 +9,12 @@ # This is how we specify the "tcregex" format. replacement_dict = { # . means any amino acid - '\.': '[' + ''.join(aas) + ']', + "\.": "[" + "".join(aas) + "]", # Amino acid ambiguity codes as per # http://www.virology.wisc.edu/acp/Classes/DropFolders/Drop660_lectures/SingleLetterCode.html # https://febs.onlinelibrary.wiley.com/doi/pdf/10.1111/j.1432-1033.1984.tb07877.x - 'B': '[DN]', - 'Z': '[EQ]' + "B": "[DN]", + "Z": "[EQ]", } @@ -40,15 +40,17 @@ def sample_split_tcregex(v_gene, j_gene, cdr3_tcregex, n): """ Sample from a tcregex that has been split into its components. """ - df = pd.DataFrame({'amino_acid': sample_cdr3_tcregex(cdr3_tcregex, n)}) - df['v_gene'] = v_gene - df['j_gene'] = j_gene - return (df) + df = pd.DataFrame({"amino_acid": sample_cdr3_tcregex(cdr3_tcregex, n)}) + df["v_gene"] = v_gene + df["j_gene"] = j_gene + return df def sample_tcregex(tcregex, n): - inputs = tcregex.split(',') + inputs = tcregex.split(",") if len(inputs) != 3: - raise Exception("tcregexes should be specified in the format `v_gene,j_gene,cdr3_tcregex`.") + raise Exception( + "tcregexes should be specified in the format `v_gene,j_gene,cdr3_tcregex`." 
+ ) v_gene, j_gene, cdr3_tcregex = inputs return sample_split_tcregex(v_gene, j_gene, cdr3_tcregex, n) diff --git a/vampire/tests/test_germline_cdr3_aa_tensor.py b/vampire/tests/test_germline_cdr3_aa_tensor.py index 63bad59..4a51180 100644 --- a/vampire/tests/test_germline_cdr3_aa_tensor.py +++ b/vampire/tests/test_germline_cdr3_aa_tensor.py @@ -5,23 +5,35 @@ def test_aa_encoding_tensors(): - aa_order = 'ABC' - v_gene_list = ['TCRBV01-01', 'TCRBV02-01'] - j_gene_list = ['TCRBJ01-01', 'TCRBJ01-02', 'TCRBJ01-03'] - germline_cdr3_csv = pkg_resources.resource_filename('vampire', 'data/germline-cdr3-aas.test.csv') + aa_order = "ABC" + v_gene_list = ["TCRBV01-01", "TCRBV02-01"] + j_gene_list = ["TCRBJ01-01", "TCRBJ01-02", "TCRBJ01-03"] + germline_cdr3_csv = pkg_resources.resource_filename( + "vampire", "data/germline-cdr3-aas.test.csv" + ) max_cdr3_len = 4 # B A # C - v_encoding_correct = np.array([[[0., 1., 0.], [1., 0., 0.], [0., 0., 0.], [0., 0., 0.]], - [[0., 0., 1.], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]]) + v_encoding_correct = np.array( + [ + [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ] + ) # A C # A # C B - j_encoding_correct = np.array([[[0., 0., 0.], [0., 0., 0.], [1., 0., 0.], [0., 0., 1.]], - [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [1., 0., 0.]], - [[0., 0., 0.], [0., 0., 0.], [0., 0., 1.], [0., 1., 0.]]]) + j_encoding_correct = np.array( + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]], + ] + ) - (v_enc, j_enc) = aa_encoding_tensors(germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len) + (v_enc, j_enc) = aa_encoding_tensors( + germline_cdr3_csv, aa_order, v_gene_list, j_gene_list, max_cdr3_len + ) assert np.array_equal(v_enc, v_encoding_correct) assert np.array_equal(j_enc, j_encoding_correct) diff --git a/vampire/tests/test_layers.py b/vampire/tests/test_layers.py index 7be7821..7b6c84f 100644 --- a/vampire/tests/test_layers.py +++ b/vampire/tests/test_layers.py @@ -5,8 +5,14 @@ def test_cumprod_sum(): - a = np.array([[[1, 1, 1, 0], [1, 0, 0, 0]], [[1, 1, 0, 1], [1, 0, 1, 1]], [[0, 1, 1, 1], [0, 0, 1, 1]]], - dtype=np.float32) + a = np.array( + [ + [[1, 1, 1, 0], [1, 0, 0, 0]], + [[1, 1, 0, 1], [1, 0, 1, 1]], + [[0, 1, 1, 1], [0, 0, 1, 1]], + ], + dtype=np.float32, + ) def np_cumprod_sum(a, length, reverse=False): if reverse: @@ -16,8 +22,10 @@ def np_cumprod_sum(a, length, reverse=False): for k in range(a.shape[0]): for length in range(1, 4): for reverse in [True, False]: - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: tf_input = tf.constant(a[k, :, :], dtype=tf.float32) tf_result = layers.cumprod_sum(tf_input, length, reverse=reverse) tf_result = sess.run(tf_result) - assert np.allclose(np_cumprod_sum(a[k, :, :], length, reverse), tf_result) + assert np.allclose( + np_cumprod_sum(a[k, :, :], length, reverse), tf_result + ) diff --git a/vampire/tests/test_preprocess.py b/vampire/tests/test_preprocess.py index 2e47a9d..a8382bc 100644 --- a/vampire/tests/test_preprocess.py +++ b/vampire/tests/test_preprocess.py @@ -5,15 +5,15 @@ def test_filters(): - unfiltered = common.read_data_csv('adaptive-filter-test.csv') - correct = common.read_data_csv('adaptive-filter-test.correct.csv') + unfiltered = common.read_data_csv("adaptive-filter-test.csv") + correct = 
common.read_data_csv("adaptive-filter-test.correct.csv") filtered = pre.apply_all_filters(unfiltered) assert correct.equals(filtered) def test_dedup(): - original = common.read_data_csv('vjcdr3-dedup-test.csv') - correct = common.read_data_csv('vjcdr3-dedup-test.correct.csv') + original = common.read_data_csv("vjcdr3-dedup-test.csv") + correct = common.read_data_csv("vjcdr3-dedup-test.correct.csv") random.seed(1) deduped = pre.dedup_on_vjcdr3(original) assert correct.equals(deduped) diff --git a/vampire/tests/test_xcr_vector_conversion.py b/vampire/tests/test_xcr_vector_conversion.py index 8a4fd28..4d8bc12 100644 --- a/vampire/tests/test_xcr_vector_conversion.py +++ b/vampire/tests/test_xcr_vector_conversion.py @@ -7,11 +7,11 @@ def test_pad_middle(): with pytest.raises(AssertionError): - conversion.pad_middle('AAAAA', 2) - assert 'A---B' == conversion.pad_middle('AB', 5) - assert 'A--BC' == conversion.pad_middle('ABC', 5) - assert 'AB' == conversion.pad_middle('AB', 2) - assert '----' == conversion.pad_middle('---', 4) + conversion.pad_middle("AAAAA", 2) + assert "A---B" == conversion.pad_middle("AB", 5) + assert "A--BC" == conversion.pad_middle("ABC", 5) + assert "AB" == conversion.pad_middle("AB", 2) + assert "----" == conversion.pad_middle("---", 4) def test_gene_conversion(): @@ -22,17 +22,17 @@ def test_gene_conversion(): def test_aa_conversion(): - target = 'CASY' + target = "CASY" assert conversion.onehot_to_seq(conversion.seq_to_onehot(target)) == target - target = 'C-SY' + target = "C-SY" assert conversion.onehot_to_seq(conversion.seq_to_onehot(target)) == target def test_cdr3_length_of_onehots(): - data = common.read_data_csv('adaptive-filter-test.correct.csv') - lengths = data['amino_acid'].apply(len).apply(float) + data = common.read_data_csv("adaptive-filter-test.correct.csv") + lengths = data["amino_acid"].apply(len).apply(float) onehots = conversion.unpadded_tcrbs_to_onehot(data, 30) - assert lengths.equals(conversion.cdr3_length_of_onehots(onehots['amino_acid'])) + assert lengths.equals(conversion.cdr3_length_of_onehots(onehots["amino_acid"])) def test_contiguous_match_counts(): @@ -41,16 +41,25 @@ def test_contiguous_match_counts(): j01_02 = j[1] v01_v02_mixture = (v[0] + v[1]) / 2 - v01_j02_matching = conversion.seq_to_onehot('CTSSQ-------------------NYGYTF') - two_v01_match = conversion.seq_to_onehot('CTWSQ-------------------NYGYTF') - three_j02_match = conversion.seq_to_onehot('CTSSQ-------------------NYAYTF') + v01_j02_matching = conversion.seq_to_onehot("CTSSQ-------------------NYGYTF") + two_v01_match = conversion.seq_to_onehot("CTWSQ-------------------NYGYTF") + three_j02_match = conversion.seq_to_onehot("CTSSQ-------------------NYAYTF") # Here we have a complete match for both genes. - assert np.array_equal(conversion.contiguous_match_counts(v01_j02_matching, v01_01, j01_02), np.array([5., 6.])) + assert np.array_equal( + conversion.contiguous_match_counts(v01_j02_matching, v01_01, j01_02), + np.array([5.0, 6.0]), + ) # Here the V match is interrupted by a W instead of an S, and we can see the "contiguous" requirement working. - assert np.array_equal(conversion.contiguous_match_counts(two_v01_match, v01_01, j01_02), np.array([2., 6.])) + assert np.array_equal( + conversion.contiguous_match_counts(two_v01_match, v01_01, j01_02), + np.array([2.0, 6.0]), + ) # Equivalent test for J. 
- assert np.array_equal(conversion.contiguous_match_counts(three_j02_match, v01_01, j01_02), np.array([5., 3.])) + assert np.array_equal( + conversion.contiguous_match_counts(three_j02_match, v01_01, j01_02), + np.array([5.0, 3.0]), + ) # For the mixture, we have one residue that matches both, then one that # only matches V01, then another two that match both. You can see that in # the indicator decay from left to right. @@ -58,14 +67,20 @@ def test_contiguous_match_counts(): # 1., 0.5, 0.5, 0.5, 0.25, 0., 0., 0., 0., 0., 0., 0., 0., 0., # 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1. assert np.array_equal( - conversion.contiguous_match_counts(v01_j02_matching, v01_v02_mixture, j01_02), np.array([2.75, 6.])) + conversion.contiguous_match_counts(v01_j02_matching, v01_v02_mixture, j01_02), + np.array([2.75, 6.0]), + ) def test_contiguous_match_counts_df(): - test = conversion.unpadded_tcrbs_to_onehot(common.read_data_csv('adaptive-filter-test.correct.csv'), 30) + test = conversion.unpadded_tcrbs_to_onehot( + common.read_data_csv("adaptive-filter-test.correct.csv"), 30 + ) v_germline_tensor, j_germline_tensor = conversion.adaptive_aa_encoding_tensors(30) - result = conversion.contiguous_match_counts_df(test, v_germline_tensor, j_germline_tensor) + result = conversion.contiguous_match_counts_df( + test, v_germline_tensor, j_germline_tensor + ) - assert np.array_equal(result[0], np.array([4., 6.])) - assert np.array_equal(result[1], np.array([5., 5.])) - assert np.array_equal(result[5], np.array([5., 0.])) + assert np.array_equal(result[0], np.array([4.0, 6.0])) + assert np.array_equal(result[1], np.array([5.0, 5.0])) + assert np.array_equal(result[5], np.array([5.0, 0.0])) diff --git a/vampire/thymic_Q.py b/vampire/thymic_Q.py index bd8a5c7..e629c32 100644 --- a/vampire/thymic_Q.py +++ b/vampire/thymic_Q.py @@ -12,7 +12,7 @@ # These are our "lvj" columns, which are the most common indices for # probabilities and sequences in what follows. -lvj = ['length', 'v_gene', 'j_gene'] +lvj = ["length", "v_gene", "j_gene"] def bound(x, m): @@ -35,7 +35,7 @@ def add_pseudocount(df, col_name, pseudocount_multiplier): The pseudocount is pseudocount_multiplier times the smallest non-zero element. """ - zero_rows = (df[col_name] == 0) + zero_rows = df[col_name] == 0 pseudocount = min(df.loc[~zero_rows, col_name]) * pseudocount_multiplier df.loc[zero_rows, col_name] += pseudocount @@ -45,11 +45,11 @@ def read_olga_tsv(path): Read in a TSV in the format liked by OLGA. Four columns means that there is DNA data-- if so, drop it. """ - df = pd.read_csv(path, sep='\t', header=None) + df = pd.read_csv(path, sep="\t", header=None) if len(df.columns) == 4: df = df.iloc[:, 1:] assert len(df.columns) == 3 - df.columns = 'amino_acid v_gene j_gene'.split() + df.columns = "amino_acid v_gene j_gene".split() return df @@ -57,9 +57,9 @@ def read_olga_pgen_tsv(path): """ Read in a TSV output by OLGA's Pgen calculation. """ - df = pd.read_csv(path, sep='\t', header=None) + df = pd.read_csv(path, sep="\t", header=None) assert len(df.columns) == 4 - df.columns = 'amino_acid v_gene j_gene Pgen'.split() + df.columns = "amino_acid v_gene j_gene Pgen".split() return df @@ -69,9 +69,9 @@ def lvj_frequency_of_olga_tsv(path, col_name): contained in it. """ df = read_olga_tsv(path) - df['length'] = df['amino_acid'].apply(len) + df["length"] = df["amino_acid"].apply(len) df = df.loc[:, lvj] - df[col_name] = 1. 
+ df[col_name] = 1.0 df = df.groupby(lvj).sum() normalize_column(df, col_name) return df @@ -81,11 +81,11 @@ def set_lvj_index(df): """ Make an lvj index in place from the length, v_gene, and j_gene. """ - df['length'] = df['amino_acid'].apply(len) + df["length"] = df["amino_acid"].apply(len) df.set_index(lvj, inplace=True) -def merge_lvj_dfs(df1, df2, how='outer'): +def merge_lvj_dfs(df1, df2, how="outer"): """ Merge two data frames on lvj indices. @@ -96,20 +96,24 @@ def merge_lvj_dfs(df1, df2, how='outer'): return merged -def q_of_train_and_model_pgen(model_p_lvj_csv, train_tsv, max_q=None, pseudocount_multiplier=0.5): +def q_of_train_and_model_pgen( + model_p_lvj_csv, train_tsv, max_q=None, pseudocount_multiplier=0.5 +): """ Fit a q distribution, but truncating q at max_q. """ # Merge the p_lvj from the data and that from the model: df = merge_lvj_dfs( - lvj_frequency_of_olga_tsv(train_tsv, col_name='data_P_lvj'), pd.read_csv(model_p_lvj_csv, index_col=lvj)) + lvj_frequency_of_olga_tsv(train_tsv, col_name="data_P_lvj"), + pd.read_csv(model_p_lvj_csv, index_col=lvj), + ) # We need to do this merge so we can add a pseudocount: - add_pseudocount(df, 'model_P_lvj', pseudocount_multiplier) - normalize_column(df, 'model_P_lvj') - q = df['data_P_lvj'] / df['model_P_lvj'] + add_pseudocount(df, "model_P_lvj", pseudocount_multiplier) + normalize_column(df, "model_P_lvj") + q = df["data_P_lvj"] / df["model_P_lvj"] if max_q: q = bound(q, max_q) - return pd.DataFrame({'q': q}) + return pd.DataFrame({"q": q}) def calc_Ppost(q_csv, data_pgen_tsv, pseudocount_multiplier=0.5): @@ -120,17 +124,17 @@ def calc_Ppost(q_csv, data_pgen_tsv, pseudocount_multiplier=0.5): set_lvj_index(df) # Merging on left means that we only use only keys from the left data # frame, i.e. the sequences for which we are interested in computing Ppost. - df = merge_lvj_dfs(df, pd.read_csv(q_csv, index_col=lvj), how='left') + df = merge_lvj_dfs(df, pd.read_csv(q_csv, index_col=lvj), how="left") # Add a pseudocount to Pgen and q to prevent infs in whole-data log likelihoods. - add_pseudocount(df, 'q', pseudocount_multiplier) - add_pseudocount(df, 'Pgen', pseudocount_multiplier) - df['Ppost'] = df['Pgen'] * df['q'] + add_pseudocount(df, "q", pseudocount_multiplier) + add_pseudocount(df, "Pgen", pseudocount_multiplier) + df["Ppost"] = df["Pgen"] * df["q"] # Drop length from the index. df.reset_index(level=0, drop=True, inplace=True) # Turn the index columns into normal columns. df.reset_index(inplace=True) # Only keep our usual 3 columns. - df.set_index(['amino_acid', 'v_gene', 'j_gene'], inplace=True) + df.set_index(["amino_acid", "v_gene", "j_gene"], inplace=True) return df @@ -147,17 +151,17 @@ def rejection_sample_Ppost(q_df, df_Pgen_sample): df = df_Pgen_sample set_lvj_index(df) # This merge is how we get the q value corresponding to each sequence x. - df = merge_lvj_dfs(df, q_df, how='left') - max_q = np.max(df['q']) - df['acceptance_prob'] = df['q'] / max_q - df['random'] = np.random.uniform(size=len(df)) + df = merge_lvj_dfs(df, q_df, how="left") + max_q = np.max(df["q"]) + df["acceptance_prob"] = df["q"] / max_q + df["random"] = np.random.uniform(size=len(df)) # We subselect the rows that should be accepted, and keep only the # amino_acid column (the v_gene and j_gene columns are kept as part of the # index). - df = df.loc[df['random'] < df['acceptance_prob'], 'amino_acid'] + df = df.loc[df["random"] < df["acceptance_prob"], "amino_acid"] # Turn the index columns into normal columns, just keeping our usual 3 # columns. 
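The acceptance step above is ordinary rejection sampling: proposals drawn from Pgen are kept with probability q divided by the largest q, so the surviving sequences follow Pgen times q, i.e. Ppost. A self-contained sketch with made-up q values:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"q": [0.2, 1.5, 3.0]})          # hypothetical q_lvj value per proposal
    df["acceptance_prob"] = df["q"] / df["q"].max()
    df["random"] = np.random.uniform(size=len(df))
    accepted = df[df["random"] < df["acceptance_prob"]]
    # Rows survive in proportion to q, so the accepted sample is distributed as Ppost.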
- df = df.reset_index()[['amino_acid', 'v_gene', 'j_gene']] + df = df.reset_index()[["amino_acid", "v_gene", "j_gene"]] # Randomize the order of the sequences (they got sorted in the process of # merging with Q). df = df.sample(frac=1) @@ -173,7 +177,9 @@ def sample_Ppost(sample_size, q_csv, max_iter=100, proposal_size=1e6): with tempfile.TemporaryDirectory() as tmpdir: for iter_idx in range(max_iter): sample_path = os.path.join(tmpdir, str(iter_idx)) - c = delegator.run(' '.join(['olga-generate.sh', str(proposal_size), sample_path])) + c = delegator.run( + " ".join(["olga-generate.sh", str(proposal_size), sample_path]) + ) if c.return_code != 0: raise Exception("olga-generate.sh failed!") df_sample = rejection_sample_Ppost(q_df, read_olga_tsv(sample_path)) @@ -183,7 +189,9 @@ def sample_Ppost(sample_size, q_csv, max_iter=100, proposal_size=1e6): break if len(out_df) < sample_size: - raise Exception("Did not obtain desired number of samples in the specified maximumn number of iterations.") + raise Exception( + "Did not obtain desired number of samples in the specified maximumn number of iterations." + ) out_df = out_df.head(sample_size) return out_df @@ -195,9 +203,9 @@ def cli(): @cli.command() -@click.option('--col-name', required=True, help="Name for frequency column.") -@click.argument('in_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.option("--col-name", required=True, help="Name for frequency column.") +@click.argument("in_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def lvj_frequency(col_name, in_tsv, out_csv): """ Calculate frequency of lvj combinations in a given OLGA TSV. @@ -206,11 +214,17 @@ def lvj_frequency(col_name, in_tsv, out_csv): @cli.command() -@click.option('--max-q', type=int, default=None, show_default=True, help="Limit q to be at most this value.") +@click.option( + "--max-q", + type=int, + default=None, + show_default=True, + help="Limit q to be at most this value.", +) # Here we use a click.Path so we can pass a .bz2'd CSV. -@click.argument('model_p_lvj_csv', type=click.Path(exists=True)) -@click.argument('train_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("model_p_lvj_csv", type=click.Path(exists=True)) +@click.argument("train_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def q(max_q, model_p_lvj_csv, train_tsv, out_csv): """ Calculate q_lvj given training data and p_lvj obtained from the model. @@ -219,9 +233,9 @@ def q(max_q, model_p_lvj_csv, train_tsv, out_csv): @cli.command() -@click.argument('q_csv', type=click.File('r')) -@click.argument('data_pgen_tsv', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("q_csv", type=click.File("r")) +@click.argument("data_pgen_tsv", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def ppost(q_csv, data_pgen_tsv, out_csv): """ Calculate Ppost given q_lvj and Pgen calculated for a data set. 
@@ -231,23 +245,28 @@ def ppost(q_csv, data_pgen_tsv, out_csv): @cli.command() @click.option( - '--max-iter', + "--max-iter", default=100, show_default=True, - help="Maximum number of iterations to try to achieve the desired number of samples.") + help="Maximum number of iterations to try to achieve the desired number of samples.", +) @click.option( - '--proposal-size', default=1000000, show_default=True, help="Number of samples to take for proposal distribution.") -@click.argument('sample_size', type=int) -@click.argument('q_csv', type=click.File('r')) -@click.argument('out_tsv', type=click.File('w')) + "--proposal-size", + default=1000000, + show_default=True, + help="Number of samples to take for proposal distribution.", +) +@click.argument("sample_size", type=int) +@click.argument("q_csv", type=click.File("r")) +@click.argument("out_tsv", type=click.File("w")) def sample(max_iter, proposal_size, sample_size, q_csv, out_tsv): """ Sample from the Ppost distribution via rejection sampling. """ sample_Ppost( - sample_size, q_csv, max_iter=max_iter, proposal_size=proposal_size).to_csv( - out_tsv, sep='\t', index=False) + sample_size, q_csv, max_iter=max_iter, proposal_size=proposal_size + ).to_csv(out_tsv, sep="\t", index=False) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/util.py b/vampire/util.py index 139c251..8164a94 100644 --- a/vampire/util.py +++ b/vampire/util.py @@ -29,12 +29,12 @@ def cli(): @cli.command() -@click.option('--idx', required=True, help='The row index for the summary output.') -@click.option('--idx-name', required=True, help='The row index name.') -@click.argument('seq_path', type=click.Path(exists=True)) -@click.argument('pvae_path', type=click.Path(exists=True)) -@click.argument('ppost_path', type=click.Path(exists=True)) -@click.argument('out_path', type=click.Path(writable=True)) +@click.option("--idx", required=True, help="The row index for the summary output.") +@click.option("--idx-name", required=True, help="The row index name.") +@click.argument("seq_path", type=click.Path(exists=True)) +@click.argument("pvae_path", type=click.Path(exists=True)) +@click.argument("ppost_path", type=click.Path(exists=True)) +@click.argument("out_path", type=click.Path(writable=True)) def merge_ps(idx, idx_name, seq_path, pvae_path, ppost_path, out_path): """ Merge probability estimates from Pvae and Ppost into a single data frame and write to an output CSV. @@ -44,11 +44,11 @@ def merge_ps(idx, idx_name, seq_path, pvae_path, ppost_path, out_path): """ def prep_index(df): - df.set_index(['amino_acid', 'v_gene', 'j_gene'], inplace=True) + df.set_index(["amino_acid", "v_gene", "j_gene"], inplace=True) df.sort_index(inplace=True) pvae_df = pd.read_csv(seq_path) - pvae_df['log_Pvae'] = pd.read_csv(pvae_path)['log_p_x'] + pvae_df["log_Pvae"] = pd.read_csv(pvae_path)["log_p_x"] prep_index(pvae_df) ppost_df = convert_gene_names(pd.read_csv(ppost_path), olga_to_adaptive_dict()) prep_index(ppost_df) @@ -56,8 +56,14 @@ def prep_index(df): # If we don't drop duplicates then merge will expand the number of rows. # See https://stackoverflow.com/questions/39019591/duplicated-rows-when-merging-dataframes-in-python # We deduplicate Ppost, which is guaranteed to be identical among repeated elements. 
- merged = pd.merge(pvae_df, ppost_df.drop_duplicates(), how='left', left_index=True, right_index=True) - merged['log_Ppost'] = np.log(merged['Ppost']) + merged = pd.merge( + pvae_df, + ppost_df.drop_duplicates(), + how="left", + left_index=True, + right_index=True, + ) + merged["log_Ppost"] = np.log(merged["Ppost"]) merged.reset_index(inplace=True) merged[idx_name] = idx merged.set_index(idx_name, inplace=True) @@ -65,10 +71,10 @@ def prep_index(df): @cli.command() -@click.option('--train-size', default=1000, help="Data count to use for train.") -@click.argument('in_csv', type=click.File('r')) -@click.argument('out1_csv', type=click.File('w')) -@click.argument('out2_csv', type=click.File('w')) +@click.option("--train-size", default=1000, help="Data count to use for train.") +@click.argument("in_csv", type=click.File("r")) +@click.argument("out1_csv", type=click.File("w")) +@click.argument("out2_csv", type=click.File("w")) def split(train_size, in_csv, out1_csv, out2_csv): """ Do a train/test split. @@ -80,36 +86,43 @@ def split(train_size, in_csv, out1_csv, out2_csv): @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.option('--idx', required=True, help='The row index for the summary output.') -@click.option('--idx-name', required=True, help='The row index name.') +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.option("--idx", required=True, help="The row index for the summary output.") +@click.option("--idx-name", required=True, help="The row index name.") @click.option( - '--colnames', default='', help='Comma-separated column identifier names corresponding to the files that follow.') -@click.argument('in_paths', nargs=-1) + "--colnames", + default="", + help="Comma-separated column identifier names corresponding to the files that follow.", +) +@click.argument("in_paths", nargs=-1) def summarize(out, idx, idx_name, colnames, in_paths): """ Summarize results of a run as a single-row CSV. The input is of flexible length: each input file is associated with an identifier specified using the --colnames flag. """ - colnames = colnames.split(',') + colnames = colnames.split(",") if len(colnames) != len(in_paths): - raise Exception("The number of colnames is not equal to the number of input files.") + raise Exception( + "The number of colnames is not equal to the number of input files." + ) input_d = {k: v for k, v in zip(colnames, in_paths)} index = pd.Index([idx], name=idx_name) - if 'loss' in input_d: - loss_df = pd.read_csv(input_d['loss'], index_col=0).reset_index() + if "loss" in input_d: + loss_df = pd.read_csv(input_d["loss"], index_col=0).reset_index() # The following 3 lines combine the data source and the metric into a # single id like `train_j_gene_output_loss`. - loss_df = pd.melt(loss_df, id_vars='index') - loss_df['id'] = loss_df['variable'] + '_' + loss_df['index'] - loss_df.set_index('id', inplace=True) - df = pd.DataFrame(dict(zip(loss_df.index, loss_df['value'].transpose())), index=index) + loss_df = pd.melt(loss_df, id_vars="index") + loss_df["id"] = loss_df["variable"] + "_" + loss_df["index"] + loss_df.set_index("id", inplace=True) + df = pd.DataFrame( + dict(zip(loss_df.index, loss_df["value"].transpose())), index=index + ) else: df = pd.DataFrame(index=index) - def slurp_cols(path, prefix='', suffix=''): + def slurp_cols(path, prefix="", suffix=""): """ Given a one-row CSV with summaries, add them to df with an optional prefix and suffix. 
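
The melt lines in `summarize` above flatten a per-metric loss table (metrics as rows, data sources as columns) into a single summary row keyed by ids like `train_j_gene_output_loss`. A standalone sketch of that reshaping follows; the loss values are invented, and `run-1` / `run_id` stand in for the `--idx` / `--idx-name` options:

```python
import pandas as pd

# Invented loss table in the same shape as the `loss` CSV: one row per metric,
# one column per data source.
loss_df = pd.DataFrame(
    {"train": [1.9, 0.4], "validation": [2.1, 0.5]},
    index=["loss", "j_gene_output_loss"],
).reset_index()

# Combine data source and metric into a single id, e.g. train_j_gene_output_loss.
loss_df = pd.melt(loss_df, id_vars="index")
loss_df["id"] = loss_df["variable"] + "_" + loss_df["index"]
loss_df.set_index("id", inplace=True)

# Spread the ids across the columns of a one-row summary frame.
index = pd.Index(["run-1"], name="run_id")
df = pd.DataFrame(dict(zip(loss_df.index, loss_df["value"])), index=index)
print(df.columns.tolist())
# ['train_loss', 'train_j_gene_output_loss',
#  'validation_loss', 'validation_j_gene_output_loss']
```
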
@@ -124,33 +137,38 @@ def add_p_summary(path, name): Add a summary of something like `validation_pvae` where `validation` is the prefix and `pvae` is the statistic. """ - prefix, statistic = name.split('_') - if statistic == 'pvae': - log_statistic = pd.read_csv(path)['log_p_x'] - elif statistic == 'ppost': - log_statistic = np.log(pd.read_csv(path)['Ppost']) + prefix, statistic = name.split("_") + if statistic == "pvae": + log_statistic = pd.read_csv(path)["log_p_x"] + elif statistic == "ppost": + log_statistic = np.log(pd.read_csv(path)["Ppost"]) else: raise Exception(f"Unknown statistic '{statistic}'") - df[prefix + '_median_log_p'] = np.median(log_statistic) - df[prefix + '_mean_log_p'] = np.mean(log_statistic) + df[prefix + "_median_log_p"] = np.median(log_statistic) + df[prefix + "_mean_log_p"] = np.mean(log_statistic) for name, path in input_d.items(): if name in [ - 'training_pvae', 'validation_pvae', 'test_pvae', 'training_ppost', 'validation_ppost', 'test_ppost' + "training_pvae", + "validation_pvae", + "test_pvae", + "training_ppost", + "validation_ppost", + "test_ppost", ]: add_p_summary(path, name) - elif re.search('sumrep_divergences', name): - slurp_cols(path, prefix='sumdiv_') - elif re.search('auc_', name): + elif re.search("sumrep_divergences", name): + slurp_cols(path, prefix="sumdiv_") + elif re.search("auc_", name): slurp_cols(path) df.to_csv(out) @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.argument('in_paths', nargs=-1) +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.argument("in_paths", nargs=-1) def csvstack(out, in_paths): """ Like csvkit's csvstack, but can deal with varying columns. @@ -158,12 +176,14 @@ def csvstack(out, in_paths): Note that this sorts the columns by name (part of merging columns). """ - pd.concat([pd.read_csv(path) for path in in_paths], sort=True).to_csv(out, index=False) + pd.concat([pd.read_csv(path) for path in in_paths], sort=True).to_csv( + out, index=False + ) @cli.command() -@click.option('--out', type=click.File('w'), help='Output file path.', required=True) -@click.argument('in_paths', nargs=-1) +@click.option("--out", type=click.File("w"), help="Output file path.", required=True) +@click.argument("in_paths", nargs=-1) def fancystack(out, in_paths): """ Like csvkit's csvstack, but fancy. @@ -175,15 +195,15 @@ def fancystack(out, in_paths): def read_df(path): df = pd.read_csv(path) - idx_names = df.columns[0].split(';') + idx_names = df.columns[0].split(";") idx_str = df.iloc[0, 0] # Make sure the index column is constant. assert (df.iloc[:, 0] == idx_str).all() - idx = idx_str.split(';') + idx = idx_str.split(";") df.drop(df.columns[0], axis=1, inplace=True) for k, v in zip(idx_names, idx): - if k in ['train_data', 'test_set']: + if k in ["train_data", "test_set"]: v = common.strip_dirpath_extn(v) df[k] = v @@ -195,29 +215,31 @@ def read_df(path): @cli.command() -@click.option('--dest-path', type=click.Path(writable=True), required=True) -@click.argument('loss_paths', nargs=-1) +@click.option("--dest-path", type=click.Path(writable=True), required=True) +@click.argument("loss_paths", nargs=-1) def copy_best_weights(dest_path, loss_paths): """ Find the best weights according to validation loss and copy them to the specified path. 
""" loss_paths = sorted(loss_paths) - losses = [pd.read_csv(path, index_col=0).loc['loss', 'validation'] for path in loss_paths] + losses = [ + pd.read_csv(path, index_col=0).loc["loss", "validation"] for path in loss_paths + ] smallest_loss_path = loss_paths[np.argmin(losses)] - best_weights = os.path.join(os.path.dirname(smallest_loss_path), 'best_weights.h5') + best_weights = os.path.join(os.path.dirname(smallest_loss_path), "best_weights.h5") shutil.copyfile(best_weights, dest_path) @cli.command() -@click.argument('l_csv_path', type=click.File('r')) -@click.argument('r_csv_path', type=click.File('r')) -@click.argument('out_csv', type=click.File('w')) +@click.argument("l_csv_path", type=click.File("r")) +@click.argument("r_csv_path", type=click.File("r")) +@click.argument("out_csv", type=click.File("w")) def sharedwith(l_csv_path, r_csv_path, out_csv): """ Write out a TCR dataframe identical to that in `l_csv_path` (perhaps with reordered columns) but with an extra column `present` indicating if that TCR is present in r_csv_path. """ - avj_columns = ['amino_acid', 'v_gene', 'j_gene'] + avj_columns = ["amino_acid", "v_gene", "j_gene"] def read_df(path): df = pd.read_csv(path) @@ -227,25 +249,40 @@ def read_df(path): l_df = read_df(l_csv_path) r_df = read_df(r_csv_path) - intersection = pd.merge(l_df, r_df, left_index=True, right_index=True, how='inner') - l_df['present'] = 0 - l_df.loc[intersection.index, 'present'] = 1 + intersection = pd.merge(l_df, r_df, left_index=True, right_index=True, how="inner") + l_df["present"] = 0 + l_df.loc[intersection.index, "present"] = 1 click.echo( - f"{sum(l_df['present'])} of {len(l_df)} sequences in {l_csv_path.name} are shared with {r_csv_path.name}") + f"{sum(l_df['present'])} of {len(l_df)} sequences in {l_csv_path.name} are shared with {r_csv_path.name}" + ) l_df.to_csv(out_csv) @cli.command() -@click.option('--out-prefix', metavar='PRE', type=click.Path(writable=True), help="Output prefix.", required=True) -@click.option('--test-size', default=1 / 3, show_default=True, help="Proportion of sample to hold out for testing.") -@click.option('--test-regex', help="A regular expression that is used to identify the test set.") @click.option( - '--limit-each', - metavar='N', + "--out-prefix", + metavar="PRE", + type=click.Path(writable=True), + help="Output prefix.", + required=True, +) +@click.option( + "--test-size", + default=1 / 3, + show_default=True, + help="Proportion of sample to hold out for testing.", +) +@click.option( + "--test-regex", help="A regular expression that is used to identify the test set." +) +@click.option( + "--limit-each", + metavar="N", type=int, help="Limit the contribution of each repertoire to the training set " - "to a random N sequences. Will fail if any repertoire has less than N seqs.") -@click.argument('in_paths', nargs=-1) + "to a random N sequences. Will fail if any repertoire has less than N seqs.", +) +@click.argument("in_paths", nargs=-1) def split_repertoires(out_prefix, test_size, test_regex, limit_each, in_paths): """ Do a test-train split on the level of repertoires. 
@@ -267,87 +304,87 @@ def split_repertoires(out_prefix, test_size, test_regex, limit_each, in_paths): else: train_paths, test_paths = train_test_split(in_paths, test_size=test_size) - train_tsv_path = out_prefix + '.train.tsv' + train_tsv_path = out_prefix + ".train.tsv" info = { - 'split_call': ' '.join(sys.argv), - 'split_date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), - 'train_paths': train_paths, - 'test_paths': test_paths, - 'train_tsv_path': train_tsv_path, + "split_call": " ".join(sys.argv), + "split_date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"), + "train_paths": train_paths, + "test_paths": test_paths, + "train_tsv_path": train_tsv_path, } - json_path = out_prefix + '.json' - with open(json_path, 'w') as fp: + json_path = out_prefix + ".json" + with open(json_path, "w") as fp: fp.write(json.dumps(info, indent=4)) header_written = False - with open(train_tsv_path, 'w') as fp: + with open(train_tsv_path, "w") as fp: for path in train_paths: df = preprocess_adaptive.read_adaptive_tsv(path) if limit_each: if limit_each > len(df): - raise ValueError(f"--limit-each parameter is greater than the number of sequences in {path}") + raise ValueError( + f"--limit-each parameter is greater than the number of sequences in {path}" + ) df = df.sample(n=limit_each) if header_written: - df.to_csv(fp, sep='\t', header=False, index=False) + df.to_csv(fp, sep="\t", header=False, index=False) else: # This is our first file to write. header_written = True - df.to_csv(fp, sep='\t', index=False) + df.to_csv(fp, sep="\t", index=False) click.echo("Check JSON file with") click.echo(f"cat {json_path}") @cli.command() -@click.option('--train-size', default=0.5, help="Data fraction to use for train.") -@click.argument('in_csv') -@click.argument('out_train_csv_bz2') -@click.argument('out_test_csv_bz2') +@click.option("--train-size", default=0.5, help="Data fraction to use for train.") +@click.argument("in_csv") +@click.argument("out_train_csv_bz2") +@click.argument("out_test_csv_bz2") def split_rows(train_size, in_csv, out_train_csv_bz2, out_test_csv_bz2): df = pd.read_csv(in_csv, index_col=0) (df1, df2) = train_test_split(df, train_size=train_size) - df1.to_csv(out_train_csv_bz2, compression='bz2') - df2.to_csv(out_test_csv_bz2, compression='bz2') + df1.to_csv(out_train_csv_bz2, compression="bz2") + df2.to_csv(out_test_csv_bz2, compression="bz2") def to_fake_csv(seq_list, path, include_freq=False): """ Write a list of our favorite triples to a file. """ - with open(path, 'w') as fp: + with open(path, "w") as fp: if include_freq: - fp.write('amino_acid,v_gene,j_gene,count,frequency\n') + fp.write("amino_acid,v_gene,j_gene,count,frequency\n") else: - fp.write('amino_acid,v_gene,j_gene\n') + fp.write("amino_acid,v_gene,j_gene\n") for line in seq_list: - fp.write(line + '\n') + fp.write(line + "\n") @cli.command() @click.option( - '--include-freq', + "--include-freq", is_flag=True, - help="Include frequencies from 'count' and the counts themselves as columns in CSV.") -@click.option( - '--n-to-sample', default=100, help="Number of sequences to sample.") + help="Include frequencies from 'count' and the counts themselves as columns in CSV.", +) +@click.option("--n-to-sample", default=100, help="Number of sequences to sample.") @click.option( - '--min-count', + "--min-count", default=4, show_default=True, - help="Only include sequences that are found in at least this number of subjects." 
+ help="Only include sequences that are found in at least this number of subjects.", ) @click.option( - '--column', - default='count', - help="Counts column to use for sampling probabilities.") -@click.argument('in_csv') -@click.argument('out_csv') -def sample_data_set(include_freq, n_to_sample, min_count, column, in_csv, - out_csv): + "--column", default="count", help="Counts column to use for sampling probabilities." +) +@click.argument("in_csv") +@click.argument("out_csv") +def sample_data_set(include_freq, n_to_sample, min_count, column, in_csv, out_csv): """ Sample sequences according to the counts given in the specified column and then output in a CSV file. @@ -367,11 +404,19 @@ def seq_freqs_of_colname(colname, apply_min_count=False): raise ZeroDivisionError return seq_counts / count_sum - sampled_seq_v = np.random.multinomial(n_to_sample, seq_freqs_of_colname(column, apply_min_count=True)) + sampled_seq_v = np.random.multinomial( + n_to_sample, seq_freqs_of_colname(column, apply_min_count=True) + ) if include_freq: df.reset_index(inplace=True) - out_vect = df['index'] + ',' + df['count'].astype('str') + ',' + seq_freqs_of_colname('count').astype('str') + out_vect = ( + df["index"] + + "," + + df["count"].astype("str") + + "," + + seq_freqs_of_colname("count").astype("str") + ) else: out_vect = df.index @@ -384,5 +429,5 @@ def seq_freqs_of_colname(colname, apply_min_count=False): to_fake_csv(sampled_seqs, out_csv, include_freq) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/vampire/xcr_vector_conversion.py b/vampire/xcr_vector_conversion.py index a300b79..49f6ee7 100644 --- a/vampire/xcr_vector_conversion.py +++ b/vampire/xcr_vector_conversion.py @@ -20,12 +20,12 @@ # CDR3Length layer depends on this set and ordering of states. # So does tcregex. 
-AA_ORDER = 'ACDEFGHIKLMNPQRSTVWY-' +AA_ORDER = "ACDEFGHIKLMNPQRSTVWY-" AA_LIST = list(AA_ORDER) AA_DICT = {c: i for i, c in enumerate(AA_LIST)} AA_DICT_REV = {i: c for i, c in enumerate(AA_LIST)} AA_SET = set(AA_LIST) -AA_NONGAP = [float(c != '-') for c in AA_LIST] +AA_NONGAP = [float(c != "-") for c in AA_LIST] def seq_to_onehot(seq): @@ -36,21 +36,72 @@ def seq_to_onehot(seq): def onehot_to_seq(onehot): - return ''.join([AA_DICT_REV[v.argmax()] for v in onehot]) + return "".join([AA_DICT_REV[v.argmax()] for v in onehot]) # ### TCRB ### # V genes: TCRB_V_GENE_LIST = [ - 'TCRBV01-01', 'TCRBV02-01', 'TCRBV03-01', 'TCRBV03-02', 'TCRBV04-01', 'TCRBV04-02', 'TCRBV04-03', 'TCRBV05-01', - 'TCRBV05-02', 'TCRBV05-03', 'TCRBV05-04', 'TCRBV05-05', 'TCRBV05-06', 'TCRBV05-07', 'TCRBV05-08', 'TCRBV06-01', - 'TCRBV06-04', 'TCRBV06-05', 'TCRBV06-06', 'TCRBV06-07', 'TCRBV06-08', 'TCRBV06-09', 'TCRBV07-01', 'TCRBV07-02', - 'TCRBV07-03', 'TCRBV07-04', 'TCRBV07-05', 'TCRBV07-06', 'TCRBV07-07', 'TCRBV07-08', 'TCRBV07-09', 'TCRBV08-02', - 'TCRBV09-01', 'TCRBV10-01', 'TCRBV10-02', 'TCRBV10-03', 'TCRBV11-01', 'TCRBV11-02', 'TCRBV11-03', 'TCRBV12-01', - 'TCRBV12-02', 'TCRBV12-05', 'TCRBV13-01', 'TCRBV14-01', 'TCRBV15-01', 'TCRBV16-01', 'TCRBV18-01', 'TCRBV19-01', - 'TCRBV20-01', 'TCRBV21-01', 'TCRBV22-01', 'TCRBV23-01', 'TCRBV23-or09_02', 'TCRBV25-01', 'TCRBV27-01', 'TCRBV28-01', - 'TCRBV29-01', 'TCRBV30-01', 'TCRBVA-or09_02' + "TCRBV01-01", + "TCRBV02-01", + "TCRBV03-01", + "TCRBV03-02", + "TCRBV04-01", + "TCRBV04-02", + "TCRBV04-03", + "TCRBV05-01", + "TCRBV05-02", + "TCRBV05-03", + "TCRBV05-04", + "TCRBV05-05", + "TCRBV05-06", + "TCRBV05-07", + "TCRBV05-08", + "TCRBV06-01", + "TCRBV06-04", + "TCRBV06-05", + "TCRBV06-06", + "TCRBV06-07", + "TCRBV06-08", + "TCRBV06-09", + "TCRBV07-01", + "TCRBV07-02", + "TCRBV07-03", + "TCRBV07-04", + "TCRBV07-05", + "TCRBV07-06", + "TCRBV07-07", + "TCRBV07-08", + "TCRBV07-09", + "TCRBV08-02", + "TCRBV09-01", + "TCRBV10-01", + "TCRBV10-02", + "TCRBV10-03", + "TCRBV11-01", + "TCRBV11-02", + "TCRBV11-03", + "TCRBV12-01", + "TCRBV12-02", + "TCRBV12-05", + "TCRBV13-01", + "TCRBV14-01", + "TCRBV15-01", + "TCRBV16-01", + "TCRBV18-01", + "TCRBV19-01", + "TCRBV20-01", + "TCRBV21-01", + "TCRBV22-01", + "TCRBV23-01", + "TCRBV23-or09_02", + "TCRBV25-01", + "TCRBV27-01", + "TCRBV28-01", + "TCRBV29-01", + "TCRBV30-01", + "TCRBVA-or09_02", ] TCRB_V_GENE_DICT = {c: i for i, c in enumerate(TCRB_V_GENE_LIST)} TCRB_V_GENE_DICT_REV = {i: c for i, c in enumerate(TCRB_V_GENE_LIST)} @@ -69,8 +120,19 @@ def onehot_to_vgene(onehot): # J genes: TCRB_J_GENE_LIST = [ - 'TCRBJ01-01', 'TCRBJ01-02', 'TCRBJ01-03', 'TCRBJ01-04', 'TCRBJ01-05', 'TCRBJ01-06', 'TCRBJ02-01', 'TCRBJ02-02', - 'TCRBJ02-03', 'TCRBJ02-04', 'TCRBJ02-05', 'TCRBJ02-06', 'TCRBJ02-07' + "TCRBJ01-01", + "TCRBJ01-02", + "TCRBJ01-03", + "TCRBJ01-04", + "TCRBJ01-05", + "TCRBJ01-06", + "TCRBJ02-01", + "TCRBJ02-02", + "TCRBJ02-03", + "TCRBJ02-04", + "TCRBJ02-05", + "TCRBJ02-06", + "TCRBJ02-07", ] TCRB_J_GENE_DICT = {c: i for i, c in enumerate(TCRB_J_GENE_LIST)} TCRB_J_GENE_DICT_REV = {i: c for i, c in enumerate(TCRB_J_GENE_LIST)} @@ -100,30 +162,36 @@ def pad_middle(seq, desired_length): assert seq_len <= desired_length pad_start = seq_len // 2 pad_len = desired_length - seq_len - return seq[:pad_start] + '-' * pad_len + seq[pad_start:] + return seq[:pad_start] + "-" * pad_len + seq[pad_start:] def unpad(seq): """ Remove gap padding. 
""" - return seq.translate(seq.maketrans('', '', '-')) + return seq.translate(seq.maketrans("", "", "-")) def avj_triple_to_tcr_df(amino_acid, v_gene, j_gene): """ Put our TCR triple into an appropriate DataFrame. """ - return pd.DataFrame(OrderedDict([('amino_acid', amino_acid), ('v_gene', v_gene), ('j_gene', j_gene)])) + return pd.DataFrame( + OrderedDict( + [("amino_acid", amino_acid), ("v_gene", v_gene), ("j_gene", j_gene)] + ) + ) def avj_raw_triple_to_tcr_df(amino_acid, v_gene, j_gene): """ A "raw" triple here means as a big np array. """ - return avj_triple_to_tcr_df([amino_acid[i, :, :] for i in range(amino_acid.shape[0])], - [v_gene[i, :] for i in range(v_gene.shape[0])], - [j_gene[i, :] for i in range(j_gene.shape[0])]) + return avj_triple_to_tcr_df( + [amino_acid[i, :, :] for i in range(amino_acid.shape[0])], + [v_gene[i, :] for i in range(v_gene.shape[0])], + [j_gene[i, :] for i in range(j_gene.shape[0])], + ) def unpadded_tcrbs_to_onehot(df, desired_length): @@ -135,10 +203,10 @@ def unpadded_tcrbs_to_onehot(df, desired_length): """ return avj_triple_to_tcr_df( - df['amino_acid'].apply(lambda s: seq_to_onehot(pad_middle(s, desired_length))), - df['v_gene'].apply(vgene_to_onehot), - df['j_gene'].apply(jgene_to_onehot) - ) # yapf: disable + df["amino_acid"].apply(lambda s: seq_to_onehot(pad_middle(s, desired_length))), + df["v_gene"].apply(vgene_to_onehot), + df["j_gene"].apply(jgene_to_onehot), + ) # yapf: disable def onehot_to_padded_tcrbs(amino_acid_arr, v_gene_arr, j_gene_arr): @@ -163,16 +231,18 @@ def aux(f, a): """ nrows = a.shape[0] # Here's where the length-20 assumption lives. - out = np.empty((nrows, ), dtype=np.dtype('