Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cli_vamb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
pip install -e .
- name: Run VAMB
run: |
vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 -t 10
vamb bin default --outdir outdir_vamb --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz -l 32 -e 10 -q 2 -o C --minfasta 200000 --compress -t 10
ls -la outdir_vamb
cat outdir_vamb/log.txt
- name: Run TaxVAMB
Expand Down
3 changes: 3 additions & 0 deletions doc/how_to_run.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ Each program in Vamb only has a subset of the following options.
* `--minfasta`: Output all bins with a total size (sum of contig lengths) greater than or equal to this
number. The bins will be output in a directory called `bins` under the output directory, and each bin
will be a FASTA file with the same name as the bin, suffixed by ".fna".
* `--compress`: When set, compress the FASTA files generated by the `--minfasta` option
with gzip, and change their extension from ".fna" to ".fna.gz".
`--compress` can only be used when `--minfasta` is also set.
* `-o`: Set the binsplit separator. See the section on binsplitting under "tips for running Vamb" for its meaning.
If not passed, defaults to `C` if 'C' is present in all identifiers.
To disable binsplitting, pass `-o` without an argument.
Expand Down
11 changes: 7 additions & 4 deletions src/create_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
parser.add_argument("clusterspath", help="Path to clusters.tsv")
parser.add_argument("minsize", help="Minimum size of bin in bp", type=int, default=0)
parser.add_argument("outdir", help="Directory to create")
# Compression is an optional switch. A positional argument with
# `action="store_true"` consumes no command-line token, so the user could
# never toggle it; expose it as `--compress` instead (still read back as
# `args.compress`).
parser.add_argument(
    "--compress",
    action="store_true",
    help="Compress the output FASTA files with gzip (extension '.fna.gz')",
)

if len(sys.argv) == 1:
parser.print_help()
Expand All @@ -31,11 +32,13 @@
with open(args.clusterspath) as file:
clusters = vamb.vambtools.read_clusters(file)

clusters = {
cluster: contigs
clusters = [
(cluster, contigs)
for (cluster, contigs) in clusters.items()
if sum(lens[c] for c in contigs) >= args.minsize
}
]

with vamb.vambtools.Reader(args.fastapath) as file:
vamb.vambtools.write_bins(pathlib.Path(args.outdir), clusters, file, maxbins=None)
vamb.vambtools.write_bins(
pathlib.Path(args.outdir), clusters, file, args.compress, maxbins=None
)
56 changes: 31 additions & 25 deletions test/test_vambtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ def test_bad_params(self):
# Too many bins for maxbins
with self.assertRaises(ValueError):
vamb.vambtools.write_bins(
self.dir, self.bins.items(), self.file, maxbins=self.N_BINS - 1
self.dir, self.bins.items(), self.file, False, maxbins=self.N_BINS - 1
)

# Parent does not exist
Expand All @@ -546,6 +546,7 @@ def test_bad_params(self):
pathlib.Path("svogew/foo"),
self.bins.items(),
self.file,
False,
maxbins=self.N_BINS + 1,
)

Expand All @@ -556,6 +557,7 @@ def test_bad_params(self):
pathlib.Path(file.name),
self.bins.items(),
self.file,
False,
maxbins=self.N_BINS + 1,
)

Expand All @@ -564,31 +566,35 @@ def test_bad_params(self):
bins = {k: v.copy() for k, v in self.bins.items()}
next(iter(bins.values())).add("a_new_bin_which_does_not_exist")
vamb.vambtools.write_bins(
self.dir, bins.items(), self.file, maxbins=self.N_BINS + 1
self.dir, bins.items(), self.file, False, maxbins=self.N_BINS + 1
)

def test_round_trip(self):
with tempfile.TemporaryDirectory() as dir:
vamb.vambtools.write_bins(
pathlib.Path(dir),
self.bins.items(),
self.file,
maxbins=self.N_BINS,
)
for opener, compress, suffix_len in [(open, False, 4), (gzip.open, True, 7)]:
self.file.seek(0)
with tempfile.TemporaryDirectory() as dir:
vamb.vambtools.write_bins(
pathlib.Path(dir),
self.bins.items(),
self.file,
compress,
maxbins=self.N_BINS,
)

reconstructed_bins: dict[str, set[str]] = dict()
for filename in os.listdir(dir):
with open(os.path.join(dir, filename), "rb") as file:
entries = list(vamb.vambtools.byte_iterfasta(file, None))
binname = filename[:-4]
reconstructed_bins[binname] = set()
for entry in entries:
reconstructed_bins[binname].add(entry.identifier)

# Same bins
self.assertEqual(len(self.bins), len(reconstructed_bins))
self.assertEqual(
sum(map(len, self.bins.values())),
sum(map(len, reconstructed_bins.values())),
)
self.assertEqual(self.bins, reconstructed_bins)
reconstructed_bins: dict[str, set[str]] = dict()
# Read every written bin back and collect the identifiers it contains.
for filename in os.listdir(dir):
    with opener(os.path.join(dir, filename), "rb") as file:
        entries = list(vamb.vambtools.byte_iterfasta(file, None))
    # Drop the ".fna" / ".fna.gz" extension to recover the bin name.
    binname = filename[:-suffix_len]
    reconstructed_bins[binname] = set()
    for entry in entries:
        reconstructed_bins[binname].add(entry.identifier)

# Same bins
self.assertEqual(len(self.bins), len(reconstructed_bins))
self.assertEqual(
sum(map(len, self.bins.values())),
sum(map(len, reconstructed_bins.values())),
)
self.assertEqual(self.bins, reconstructed_bins)
41 changes: 33 additions & 8 deletions vamb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ def __init__(


class BinOutputOptions:
__slots__ = ["binsplitter", "min_fasta_output_size"]
__slots__ = ["binsplitter", "min_fasta_output_size", "compress_fasta_output"]

# We take a composition as arguments, because if min_fasta_output_size is set,
# we need to guarantee that the composition is passed as fasta.
Expand All @@ -669,13 +669,15 @@ def from_args(cls, comp: CompositionOptions, args: argparse.Namespace):
comp,
typeasserted(args.binsplit_separator, (str, type(None))),
typeasserted(args.min_fasta_output_size, (int, type(None))),
args.compress_fasta_output,
)

def __init__(
self,
composition: CompositionOptions,
binsplit_separator: Optional[str],
min_fasta_output_size: Optional[int],
compress_fasta_output: bool,
):
self.binsplitter = vamb.vambtools.BinSplitter(binsplit_separator)
if min_fasta_output_size is not None:
Expand All @@ -687,7 +689,14 @@ def __init__(
raise argparse.ArgumentTypeError(
"Minimum FASTA output size must be nonnegative"
)
else:
if compress_fasta_output:
raise argparse.ArgumentError(
None, "If `--compress` is set, `minfasta` cannot be None"
)

self.min_fasta_output_size = min_fasta_output_size
self.compress_fasta_output = compress_fasta_output


@logger.catch(reraise=True)
Expand Down Expand Up @@ -1152,6 +1161,7 @@ class FastaOutput(NamedTuple):
existing_fasta_path: FASTAPath
bins_dir_to_populate: Path # (or to create, if not existing)
min_fasta_size: int
compress_output: bool

@classmethod
def try_from_common(cls, common: BinnerCommonOptions):
Expand All @@ -1162,6 +1172,7 @@ def try_from_common(cls, common: BinnerCommonOptions):
common.comp.path,
common.general.out_dir.joinpath("bins"),
common.output.min_fasta_output_size,
common.output.compress_fasta_output,
)
else:
return None
Expand Down Expand Up @@ -1234,6 +1245,7 @@ def export_clusters(
sequence_lens,
sequence_names,
fasta_output_struct.min_fasta_size,
fasta_output_struct.compress_output,
)

return None
Expand Down Expand Up @@ -1388,6 +1400,7 @@ def cluster_and_write_files(
cast(Sequence[int], sequence_lens),
sequence_names,
fasta_output.min_fasta_size,
fasta_output.compress_output,
)


Expand All @@ -1398,6 +1411,7 @@ def create_cluster_fasta_files(
sequence_lens: Sequence[int],
sequence_names: Sequence[str],
min_bin_size: int,
compress_output: bool,
) -> None:
begintime = time.time()
filtered_clusters: list[tuple[str, list[str]]] = []
Expand All @@ -1407,11 +1421,16 @@ def create_cluster_fasta_files(
if sum(sizeof[c] for c in contigs) >= min_bin_size:
filtered_clusters.append((binname, list(contigs)))

logger.opt(raw=True).info("\n")
logger.info("Writing clusters.")
logger.info(f"\tCompression: {compress_output}")

with vamb.vambtools.Reader(existing_fasta_path) as file:
vamb.vambtools.write_bins(
dir_to_populate,
filtered_clusters,
file,
compress_output,
None,
)
elapsed = round(time.time() - begintime, 2)
Expand Down Expand Up @@ -1718,7 +1737,7 @@ def train_and_predict_fold(
lengths: np.ndarray,
targets: np.ndarray,
nodes: list[str],
table_parent: np.ndarray,
table_parent: list[int],
taxonomy_options: TaxometerOptions,
cuda: bool,
) -> tuple[list[vamb.taxonomy.PredictedContigTaxonomy], float]:
Expand Down Expand Up @@ -1835,8 +1854,9 @@ def cross_validate_taxonomy(
classes_order.append(i.ranks[-1])
targets = np.array([ind_nodes[i] for i in classes_order])

fold_args = [
(
results: list[tuple[list[vamb.taxonomy.PredictedContigTaxonomy], float]] = []
for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(n_contigs))):
local_results = train_and_predict_fold(
fold,
train_idx,
test_idx,
Expand All @@ -1850,11 +1870,9 @@ def cross_validate_taxonomy(
taxonomy_options,
cuda,
)
for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(n_contigs)))
]
loss_tests = []
results.append(local_results)

results = [train_and_predict_fold(*args) for args in fold_args]
loss_tests = []

for fold_predicted_taxonomies, loss_test in results:
all_predicted_taxonomies.extend(fold_predicted_taxonomies)
Expand Down Expand Up @@ -2150,6 +2168,7 @@ def run_reclustering(opt: ReclusteringOptions):
opt.composition.path,
opt.general.out_dir.joinpath("bins"),
opt.output.min_fasta_output_size,
opt.output.compress_fasta_output,
)
fasta_output = (
fasta_output_struct,
Expand Down Expand Up @@ -2320,6 +2339,12 @@ def add_bin_output_arguments(subparser: argparse.ArgumentParser):
default=None,
help="Minimum bin size to output as fasta [None = no files]",
)
# Flag controlling gzip compression of the per-bin FASTA files.
# It is only meaningful together with --minfasta, which enables FASTA output.
bin_os.add_argument(
    "--compress",
    dest="compress_fasta_output",
    help="Compress FASTA output with gzip, using extension '.fna.gz' "
    "[requires --minfasta]",
    action="store_true",
)
bin_os.add_argument(
"-o",
dest="binsplit_separator",
Expand Down
2 changes: 1 addition & 1 deletion vamb/hloss_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
class Hierarchy:
"""Hierarchy of nodes 0, ..., n-1."""

def __init__(self, parents):
def __init__(self, parents: np.ndarray):
n = len(parents)
assert np.all(parents[1:] < np.arange(1, n))
self._parents = parents
Expand Down
6 changes: 3 additions & 3 deletions vamb/taxvamb_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def make_dataloader_concat_hloss(
lengths,
labels,
N: int,
table_parent,
table_parent: list[int],
no_filter: bool = True,
batchsize: int = 256,
destroy: bool = False,
Expand Down Expand Up @@ -775,7 +775,7 @@ def __init__(
nsamples: int,
nlabels: int,
nodes,
table_parent,
table_parent: list[int],
nhiddens: Optional[list[int]] = None,
alpha: Optional[float] = None,
beta: float = 200.0,
Expand Down Expand Up @@ -832,7 +832,7 @@ def __init__(
self.encoderlayers.append(_nn.Linear(nin, nout))
self.encodernorms.append(_nn.BatchNorm1d(nout))

self.tree = _hloss.Hierarchy(table_parent)
self.tree = _hloss.Hierarchy(_np.array(table_parent))
self.n_tree_nodes = nlabels

self.nodes = nodes
Expand Down
11 changes: 10 additions & 1 deletion vamb/vambtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ def write_bins(
directory: Path,
bins: Collection[tuple[str, Iterable[str]]],
fastaio: Iterable[bytes],
compress: bool,
maxbins: Optional[int] = 1000,
):
"""Writes bins as FASTA files in a directory, one file per bin.
Expand Down Expand Up @@ -702,7 +703,15 @@ def write_bins(
)

# Print bin to file
with open(directory.joinpath(binname + ".fna"), "wb") as file:
# Append the extension to the bin name rather than using `Path.with_suffix`:
# `with_suffix` replaces whatever follows the last "." in the name, so a bin
# called e.g. "sample.1" would be written as "sample.fna" and distinct bins
# could overwrite each other.
if compress:
    # compresslevel=1 is the fastest gzip compression level.
    context = _gzip.open(
        directory.joinpath(binname + ".fna.gz"), "wb", compresslevel=1
    )
else:
    context = open(directory.joinpath(binname + ".fna"), "wb")

with context as file:
for contig in contigs:
file.write(_gzip.decompress(bytes_by_id[contig]))
file.write(b"\n")
Expand Down