diff --git a/augur/filter/_run.py b/augur/filter/_run.py index db50b355d..971a41707 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -2,7 +2,7 @@ from tempfile import NamedTemporaryFile from . import constants from .dates import parse_dates -from .io import initialize_input_source_table, import_metadata, import_sequence_index, print_debug_report, write_outputs +from .io import check_disk_space, initialize_input_source_table, import_metadata, import_sequence_index, print_debug_report, write_outputs from .include_exclude_rules import apply_filters, construct_filters from .report import print_report from .subsample import apply_subsampling @@ -19,6 +19,8 @@ def run(args: Namespace): if args.debug: constants.RUNTIME_DEBUG = True + check_disk_space(args.metadata) + initialize_input_source_table() import_metadata(args.metadata, args.metadata_id_columns, args.metadata_delimiters) diff --git a/augur/filter/io.py b/augur/filter/io.py index 57e55cce1..54f45293f 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -1,6 +1,9 @@ +import io import os from typing import Iterable, Sequence, Set from tempfile import NamedTemporaryFile + +import xopen from augur.errors import AugurError from augur.filter.debug import add_debugging from augur.index import ( @@ -19,6 +22,33 @@ from . import constants +def check_disk_space(metadata_file: str): + """Check whether there is enough disk space for augur filter to run, and + raise an error if there is not enough. + """ + # The database file is expected to be about 2x the disk space used by + # uncompressed metadata. This is an estimation based on test runs. + + import shutil + total, used, free = shutil.disk_usage(__file__) + + metadata = Metadata(metadata_file, id_columns=("strain", "name")) + + try: + # Use threads=0 to force usage of Python libraries which implement seek and tell. + with metadata.open(threads=0) as f: + # This works for some streams, including uncompressed and gzip. 
+ f.seek(0, os.SEEK_END) + size = f.tell() + if free < size * 2: + raise AugurError(f"Not enough disk space. augur filter will need to write a database file with expected size {_human_readable_size(size * 2)}.") + return + except io.UnsupportedOperation: + print_err("WARNING: File size of metadata could not be determined. If the uncompressed data is large, you may run out of disk space.") + # FIXME: For ZSTD files, try using frame content sizes (if available). + # ¹ https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header + + def import_priorities_table(path): """Import a priorities file into the database.""" try: