From 0d6ad2e8a978da601f75ab98ca890a653831a87f Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 30 Dec 2022 20:41:29 -0800
Subject: [PATCH] Create an index on the metadata ID column

See docstring of db.import_.sqlite3.create_index for reasoning.
---
 augur/db/import_/sqlite3.py                    | 13 +++++++++++++
 augur/io/sqlite3.py                            |  8 ++++++++
 .../db/cram/import-metadata-duplicates-error.t | 18 ++++++++++++++++++
 3 files changed, 39 insertions(+)
 create mode 100644 tests/functional/db/cram/import-metadata-duplicates-error.t

diff --git a/augur/db/import_/sqlite3.py b/augur/db/import_/sqlite3.py
index 16de1936b..a3d606618 100644
--- a/augur/db/import_/sqlite3.py
+++ b/augur/db/import_/sqlite3.py
@@ -23,6 +23,7 @@ def import_(metadata_file:str, metadata_table_name:str,
         if bool(metadata_file and metadata_table_name):
             metadata = Metadata(metadata_file)
             import_metadata(metadata, metadata_table_name, database)
+            create_index(metadata.id_column, metadata_table_name, database)
 
         if sequences_file and sequences_table_name:
             # Get ID column name from metadata if available.
@@ -30,6 +31,7 @@ def import_(metadata_file:str, metadata_table_name:str,
 
             sequences = Sequences(sequences_file)
             import_sequences(sequences, id_column, sequences_table_name, database)
+            create_index(id_column, sequences_table_name, database)
     except Sqlite3ImportError as e:
         # Delete the database file if it was created for this import.
         if not db_file_existed:
@@ -80,3 +82,14 @@ def import_sequences(sequences:Sequences, id_column:str, table_name:str, databas
         database.executemany(insert_statement, records)
     except sqlite3.ProgrammingError as e:
         raise Sqlite3ImportError("Failed to import rows.") from e
+
+
+def create_index(column:str, table_name:str, database:Sqlite3Database):
+    """Create an index on a column.
+
+    This serves to detect duplicates and speed up strain-based queries.
+    """
+    try:
+        database.create_unique_index(table_name, column)
+    except sqlite3.IntegrityError:
+        raise Sqlite3ImportError(f"Duplicate found in column '{column}' of table '{table_name}'.")
diff --git a/augur/io/sqlite3.py b/augur/io/sqlite3.py
index 573c2f01b..5c5f13aee 100644
--- a/augur/io/sqlite3.py
+++ b/augur/io/sqlite3.py
@@ -56,6 +56,14 @@ def create_table(self, table_name:str, columns:List[str]):
             )
         """)
 
+    def create_unique_index(self, table_name:str, column:str):
+        """Create a unique index on a column in a table."""
+        index_name = f'index_{table_name}_{column}'
+        self.execute(f"""
+            CREATE UNIQUE INDEX {sanitize_identifier(index_name)}
+            ON {sanitize_identifier(table_name)} ({sanitize_identifier(column)})
+        """)
+
     def query_to_csv(self, query:str, path:str, header=True, chunksize:int=10000, **to_csv_kwargs):
         """Query the database and write results to a tabular file.
 
diff --git a/tests/functional/db/cram/import-metadata-duplicates-error.t b/tests/functional/db/cram/import-metadata-duplicates-error.t
new file mode 100644
index 000000000..c758a1133
--- /dev/null
+++ b/tests/functional/db/cram/import-metadata-duplicates-error.t
@@ -0,0 +1,18 @@
+Setup
+
+  $ export AUGUR="$TESTDIR/../../../../bin/augur"
+
+Metadata with a duplicate under the ID column will error.
+
+  $ cat >metadata.tsv <<~~
+  > strain	date
+  > SEQ1	2000-01-01
+  > SEQ1	2000-02-01
+  > SEQ3	2000-03-01
+  > ~~
+
+  $ ${AUGUR} db import \
+  >  --metadata metadata.tsv \
+  >  --output-sqlite3 data.sqlite3
+  ERROR: Duplicate found in column 'strain' of table 'metadata'.
+  [2]