Skip to content

Commit

Permalink
Create an index on the metadata ID column
Browse files Browse the repository at this point in the history
See docstring of db.load.sqlite3.create_metadata_index for reasoning.
  • Loading branch information
victorlin committed Nov 8, 2022
1 parent 35ac009 commit dcc438c
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 0 deletions.
13 changes: 13 additions & 0 deletions augur/db/load/sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def load(metadata_file:str, metadata_table_name:str, db_file:str):
database = Sqlite3Database(db_file)

load_metadata(metadata, metadata_table_name, database)
create_metadata_index(metadata, metadata_table_name, database)


def load_metadata(metadata:Metadata, table_name:str, database:Sqlite3Database):
Expand All @@ -37,3 +38,15 @@ def load_metadata(metadata:Metadata, table_name:str, database:Sqlite3Database):
database.executemany(insert_statement, rows)
except sqlite3.ProgrammingError:
raise AugurError(f'Failed to load {metadata.file}.')


def create_metadata_index(metadata:Metadata, table_name:str, database:Sqlite3Database):
    """Build a unique index over the ID column of the metadata table.

    The uniqueness constraint doubles as a duplicate check on the ID column,
    and the index speeds up strain-based queries.

    When index creation raises ``sqlite3.IntegrityError``, ``database.delete()``
    is called and an ``AugurError`` naming the metadata file is raised.
    """
    id_column = metadata.id_column
    try:
        database.create_unique_index(table_name, id_column)
    except sqlite3.IntegrityError:
        # Call delete() on the database before reporting the duplicate.
        database.delete()
        raise AugurError(f"Duplicate found in '{metadata.file}'.")
8 changes: 8 additions & 0 deletions augur/io/sqlite3.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def executemany(self, *args):
with self.connect() as con:
con.executemany(*args)

def create_unique_index(self, table_name:str, column:str):
    """Add a unique index covering a single column of a table.

    The index is named ``index_<table_name>_<column>``. The index, table and
    column names are each passed through ``sanitize_identifier`` before being
    placed in the ``CREATE UNIQUE INDEX`` statement, which is run through
    ``self.execute``.
    """
    # Sanitize in the same order the identifiers appear in the statement.
    quoted_index = sanitize_identifier(f'index_{table_name}_{column}')
    quoted_table = sanitize_identifier(table_name)
    quoted_column = sanitize_identifier(column)

    statement = (
        "\n"
        f"CREATE UNIQUE INDEX {quoted_index}\n"
        f"ON {quoted_table} ({quoted_column})\n"
    )
    self.execute(statement)

def query_to_file(self, query:str, path:str, header=True, chunksize:int=10000, columns_to_exclude:List[str]=[], **to_csv_kwargs):
"""Query the database and write results to a tabular file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ Metadata with a duplicate under the ID column will error.
$ ${AUGUR} db load \
> --metadata metadata.tsv \
> --to-sqlite3 data.sqlite3
ERROR: Duplicate found in 'metadata.tsv'.
[2]

0 comments on commit dcc438c

Please sign in to comment.