Skip to content

Commit

Permalink
Create an index on the metadata ID column
Browse files Browse the repository at this point in the history
See docstring of db.import_.sqlite3.create_index for reasoning.
  • Loading branch information
victorlin committed Dec 31, 2022
1 parent 753b21d commit 0d6ad2e
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 0 deletions.
13 changes: 13 additions & 0 deletions augur/db/import_/sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ def import_(metadata_file:str, metadata_table_name:str,
if bool(metadata_file and metadata_table_name):
metadata = Metadata(metadata_file)
import_metadata(metadata, metadata_table_name, database)
create_index(metadata.id_column, metadata_table_name, database)

if sequences_file and sequences_table_name:
# Get ID column name from metadata if available.
id_column = metadata.id_column if bool(metadata_file and metadata_table_name) else DEFAULT_SEQUENCES_ID_COLUMN

sequences = Sequences(sequences_file)
import_sequences(sequences, id_column, sequences_table_name, database)
create_index(id_column, sequences_table_name, database)
except Sqlite3ImportError as e:
# Delete the database file if it was created for this import.
if not db_file_existed:
Expand Down Expand Up @@ -80,3 +82,14 @@ def import_sequences(sequences:Sequences, id_column:str, table_name:str, databas
database.executemany(insert_statement, records)
except sqlite3.ProgrammingError as e:
raise Sqlite3ImportError("Failed to import rows.") from e


def create_index(column:str, table_name:str, database:Sqlite3Database):
    """Create a unique index on a column.

    This serves to detect duplicates and speed up strain-based queries.

    Parameters
    ----------
    column
        Name of the column to index.
    table_name
        Name of the table that contains the column.
    database
        Database object; its ``create_unique_index`` method is called.

    Raises
    ------
    Sqlite3ImportError
        If creating the index raises ``sqlite3.IntegrityError``, which is
        reported as a duplicate value in the column.
    """
    try:
        database.create_unique_index(table_name, column)
    except sqlite3.IntegrityError as e:
        # Chain the underlying error explicitly, matching the other
        # Sqlite3ImportError re-raises in this module.
        raise Sqlite3ImportError(f"Duplicate found in column '{column}' of table '{table_name}'.") from e
8 changes: 8 additions & 0 deletions augur/io/sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ def create_table(self, table_name:str, columns:List[str]):
)
""")

def create_unique_index(self, table_name:str, column:str):
    """Add a unique index covering a single column of a table.

    The index is named ``index_<table_name>_<column>``. The index, table and
    column names are each passed through ``sanitize_identifier`` before being
    placed into the statement handed to ``self.execute``.
    """
    # Sanitize in the same order the statement uses them: index, table, column.
    quoted_index = sanitize_identifier(f'index_{table_name}_{column}')
    quoted_table = sanitize_identifier(table_name)
    quoted_column = sanitize_identifier(column)
    self.execute(f"""
CREATE UNIQUE INDEX {quoted_index}
ON {quoted_table} ({quoted_column})
""")

def query_to_csv(self, query:str, path:str, header=True, chunksize:int=10000, **to_csv_kwargs):
"""Query the database and write results to a tabular file.
Expand Down
18 changes: 18 additions & 0 deletions tests/functional/db/cram/import-metadata-duplicates-error.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ export AUGUR="$TESTDIR/../../../../bin/augur"

Importing metadata that contains a duplicate value in the ID column fails with an error.

$ cat >metadata.tsv <<~~
> strain date
> SEQ1 2000-01-01
> SEQ1 2000-02-01
> SEQ3 2000-03-01
> ~~

$ ${AUGUR} db import \
> --metadata metadata.tsv \
> --output-sqlite3 data.sqlite3
ERROR: Duplicate found in column 'strain' of table 'metadata'.
[2]

0 comments on commit 0d6ad2e

Please sign in to comment.