Skip to content

Commit

Permalink
feat: build local antiSMASH database with duckdb (#354)
Browse files Browse the repository at this point in the history
* feat: build AntiSMASH database using DuckDB

* fix: automatically allow prefix for project

* chore: move prefix finder function to common.smk

* chore: add build antismash db in template config

* feat: upgrade metabase driver

* feat: integrate existing dbt database with antiSMASH database

* feat: build antismash db before running dbt
  • Loading branch information
matinnuhamunada authored Jul 16, 2024
1 parent b83cf8e commit 36def9e
Show file tree
Hide file tree
Showing 11 changed files with 410 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .examples/_config_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,4 @@ utility_parameters:
METABASE_MIN_MEMORY: "2g"
METABASE_MAX_MEMORY: "8g"
METABASE_VERSION: "v0.49.6"
METABASE_DUCKDB_PLUGIN_VERSION: "0.2.6"
METABASE_DUCKDB_PLUGIN_VERSION: "0.2.8"
2 changes: 2 additions & 0 deletions workflow/Database
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@ for name in PEP_PROJECTS.keys():
model_to_ignore.append(item)
models_to_ignore[name] = model_to_ignore
db.append(f"data/processed/{name}/dbt/antiSMASH_{version}/dbt_bgcflow.duckdb")
db.append(f"data/processed/{name}/antismash_database/antiSMASH_database_{version}")

# Default target rule: requests every path accumulated in `db` (the dbt DuckDB
# file and the antiSMASH database directory appended per project above).
rule all:
input:
db,

##### Modules #####
include: "rules/build-database.smk"
include: "rules/antismash-db-duckdb.smk"
2 changes: 1 addition & 1 deletion workflow/Metabase
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ metabase_config = {
"METABASE_MIN_MEMORY": "2g",
"METABASE_MAX_MEMORY": "8g",
"METABASE_VERSION": "v0.49.6",
"METABASE_DUCKDB_PLUGIN_VERSION": "0.2.6",
"METABASE_DUCKDB_PLUGIN_VERSION": "0.2.8",
"DMB_SETUP_TOKEN": "ad0fb086-351b-4fa5-a17e-76282d2c9753",
"METABASE_HTTP": "http://localhost:3000",
"MB_IS_METABOT_ENABLED" : "true"
Expand Down
39 changes: 39 additions & 0 deletions workflow/envs/antismash_db-duckdb.post-deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
set -e

# Post-deploy script for the antismash_db-duckdb conda environment.
# Fetches the schema and import repositories plus reference data, and installs
# the helper tools, under resources/antismash_db-schema_duckdb.
# stderr of every step is appended to $LOG; stdout is left untouched.

# Create a directory for resources if it doesn't already exist
mkdir -p resources
# Remove any previous checkout: git clone refuses to write into a non-empty directory
rm -rf resources/antismash_db-schema_duckdb

# create logs directory
mkdir -p logs/antismash_db-duckdb
LOG="logs/antismash_db-duckdb/antismash_db-duckdb_template.log"

# Clone the antismash_db-schema_duckdb repository into the resources directory
git clone https://github.com/NBChub/antismash_db-schema_duckdb.git resources/antismash_db-schema_duckdb 2>> "$LOG"

# Clone the db-schema repository into the antismash_db-schema_duckdb directory
git clone https://github.com/antismash/db-schema.git resources/antismash_db-schema_duckdb/db-schema 2>> "$LOG"

# Download the antiSMASH reference databases with the download-antismash-databases
# command (presumably provided by the antiSMASH package installed in this environment)
download-antismash-databases 2>> "$LOG"

# Initialize the DuckDB database with the schema from the db-schema directory
(cd resources/antismash_db-schema_duckdb && python init_duckdb.py db-schema duckdb-schema) 2>> "$LOG"

# Download the NCBI taxonomy dump into the ncbi-taxdump directory
# (-nc: do not re-download if the archive is already present)
(cd resources/antismash_db-schema_duckdb && wget -P ncbi-taxdump https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz -nc) 2>> "$LOG"

# Extract the downloaded NCBI taxonomy dump files
(cd resources/antismash_db-schema_duckdb/ncbi-taxdump && tar -xvf new_taxdump.tar.gz) 2>> "$LOG"

# Install the asdb-taxa tool using cargo (Rust's package manager)
cargo install asdb-taxa 2>> "$LOG"

# Add cargo's default bin directory to the PATH so asdb-taxa can be executed.
# NOTE(review): this export only affects this script's process and its children;
# confirm that the rules which call asdb-taxa later set up PATH themselves.
export PATH="$HOME/.cargo/bin:$PATH"

# Clone the db-import repository into the antismash_db-schema_duckdb directory.
# HTTPS is used (like the clones above) so that no GitHub SSH key is required.
git clone https://github.com/matinnuhamunada/db-import.git resources/antismash_db-schema_duckdb/db-import 2>> "$LOG"

# Create and switch to a local v4.0.0-duckdb branch starting at the v4.0.0-duckdb ref
(cd resources/antismash_db-schema_duckdb/db-import && git checkout -b v4.0.0-duckdb v4.0.0-duckdb) 2>> "$LOG"
298 changes: 298 additions & 0 deletions workflow/envs/antismash_db-duckdb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
name: antismash_db_env
channels:
- bioconda
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- anyio=4.2.0=pyhd8ed1ab_0
- argon2-cffi=23.1.0=pyhd8ed1ab_0
- argon2-cffi-bindings=21.2.0=py310h2372a71_4
- arrow=1.3.0=pyhd8ed1ab_0
- asttokens=2.4.1=pyhd8ed1ab_0
- async-lru=2.0.4=pyhd8ed1ab_0
- attrs=23.2.0=pyh71513ae_0
- babel=2.14.0=pyhd8ed1ab_0
- beautifulsoup4=4.12.3=pyha770c72_0
- binutils_impl_linux-64=2.40=hf600244_0
- blast=2.15.0=pl5321h6f7f691_1
- bleach=6.1.0=pyhd8ed1ab_0
- boost-cpp=1.74.0=h75c5d50_8
- brotli-python=1.1.0=py310hc6cd4ac_1
- bzip2=1.0.8=hd590300_5
- c-ares=1.26.0=hd590300_0
- ca-certificates=2024.2.2=hbcca054_0
- cached-property=1.5.2=hd8ed1ab_1
- cached_property=1.5.2=pyha770c72_1
- certifi=2024.2.2=pyhd8ed1ab_0
- cffi=1.16.0=py310h2fee648_0
- cfgv=3.3.1=pyhd8ed1ab_0
- charset-normalizer=3.3.2=pyhd8ed1ab_0
- comm=0.2.1=pyhd8ed1ab_0
- curl=8.5.0=hca28451_0
- debugpy=1.8.0=py310hc6cd4ac_1
- decorator=5.1.1=pyhd8ed1ab_0
- defusedxml=0.7.1=pyhd8ed1ab_0
- diamond=2.0.15=hb97b32f_1
- distlib=0.3.8=pyhd8ed1ab_0
- entrez-direct=16.2=he881be0_1
- entrypoints=0.4=pyhd8ed1ab_0
- exceptiongroup=1.2.0=pyhd8ed1ab_2
- executing=2.0.1=pyhd8ed1ab_0
- expat=2.5.0=hcb278e6_1
- fasttree=2.1.11=h031d066_2
- filelock=3.13.1=pyhd8ed1ab_0
- fqdn=1.5.1=pyhd8ed1ab_0
- gcc_impl_linux-64=13.2.0=h338b0a0_5
- gettext=0.21.1=h27087fc_0
- ghostscript=10.02.1=h59595ed_0
- glimmerhmm=3.0.4=pl5321h87f3376_5
- h11=0.14.0=pyhd8ed1ab_0
- h2=4.1.0=pyhd8ed1ab_0
- hmmer=3.4=hdbdd923_0
- hmmer2=2.3.2=h031d066_9
- hpack=4.0.0=pyh9f0ad1d_0
- httpcore=1.0.2=pyhd8ed1ab_0
- httpx=0.26.0=pyhd8ed1ab_0
- hyperframe=6.0.1=pyhd8ed1ab_0
- icu=70.1=h27087fc_0
- identify=2.5.33=pyhd8ed1ab_0
- idna=3.6=pyhd8ed1ab_0
- importlib-metadata=7.0.1=pyha770c72_0
- importlib_metadata=7.0.1=hd8ed1ab_0
- importlib_resources=6.1.1=pyhd8ed1ab_0
- ipykernel=6.29.0=pyhd33586a_0
- ipython=8.21.0=pyh707e725_0
- isoduration=20.11.0=pyhd8ed1ab_0
- jedi=0.19.1=pyhd8ed1ab_0
- json5=0.9.14=pyhd8ed1ab_0
- jsonpointer=2.4=py310hff52083_3
- jsonschema-specifications=2023.12.1=pyhd8ed1ab_0
- jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0
- jupyter-lsp=2.2.2=pyhd8ed1ab_0
- jupyter_client=8.6.0=pyhd8ed1ab_0
- jupyter_core=5.7.1=py310hff52083_0
- jupyter_events=0.9.0=pyhd8ed1ab_0
- jupyter_server=2.12.5=pyhd8ed1ab_0
- jupyter_server_terminals=0.5.2=pyhd8ed1ab_0
- jupyterlab=4.1.0=pyhd8ed1ab_0
- jupyterlab_pygments=0.3.0=pyhd8ed1ab_0
- jupyterlab_server=2.25.2=pyhd8ed1ab_0
- kernel-headers_linux-64=2.6.32=he073ed8_16
- keyutils=1.6.1=h166bdaf_0
- krb5=1.21.2=h659d440_0
- ld_impl_linux-64=2.40=h41732ed_0
- libcurl=8.5.0=hca28451_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=hd590300_2
- libexpat=2.5.0=hcb278e6_1
- libffi=3.4.2=h7f98852_5
- libgcc-devel_linux-64=13.2.0=ha9c7c90_105
- libgcc-ng=13.2.0=h807b86a_5
- libgomp=13.2.0=h807b86a_5
- libiconv=1.17=hd590300_2
- libidn2=2.3.7=hd590300_0
- libnghttp2=1.58.0=h47da74e_1
- libnsl=2.0.1=hd590300_0
- libsanitizer=13.2.0=h7e041cc_5
- libsodium=1.0.18=h36c2ea0_1
- libsqlite=3.44.2=h2797004_0
- libssh2=1.11.0=h0841786_0
- libstdcxx-ng=13.2.0=h7e041cc_5
- libunistring=0.9.10=h7f98852_0
- libuuid=2.38.1=h0b41bf4_0
- libxcrypt=4.4.36=hd590300_1
- libxml2=2.9.14=h22db469_4
- libxslt=1.1.35=h8affb1d_0
- libzlib=1.2.13=hd590300_5
- matplotlib-inline=0.1.6=pyhd8ed1ab_0
- meme=4.11.2=py310pl5321h2cd2fb3_8
- mistune=3.0.2=pyhd8ed1ab_0
- muscle=5.1=h4ac6f70_3
- nbclient=0.8.0=pyhd8ed1ab_0
- nbconvert-core=7.14.2=pyhd8ed1ab_0
- nbformat=5.9.2=pyhd8ed1ab_0
- ncbi-vdb=3.0.10=hdbdd923_0
- ncurses=6.4=h59595ed_2
- nest-asyncio=1.6.0=pyhd8ed1ab_0
- nodeenv=1.8.0=pyhd8ed1ab_0
- notebook-shim=0.2.3=pyhd8ed1ab_0
- openssl=3.2.1=hd590300_0
- ossuuid=1.6.2=hf484d3e_1000
- overrides=7.7.0=pyhd8ed1ab_0
- packaging=23.2=pyhd8ed1ab_0
- pandocfilters=1.5.0=pyhd8ed1ab_0
- parso=0.8.3=pyhd8ed1ab_0
- pcre=8.45=h9c3ff4c_0
- perl=5.32.1=7_hd590300_perl5
- perl-alien-build=2.48=pl5321hec16e2b_0
- perl-alien-libxml2=0.17=pl5321hec16e2b_0
- perl-archive-tar=2.40=pl5321hdfd78af_0
- perl-base=2.23=pl5321hd8ed1ab_0
- perl-business-isbn=3.007=pl5321hd8ed1ab_0
- perl-business-isbn-data=20210112.006=pl5321hd8ed1ab_0
- perl-capture-tiny=0.48=pl5321ha770c72_1
- perl-carp=1.50=pl5321hd8ed1ab_0
- perl-cgi=4.56=pl5321h031d066_1
- perl-common-sense=3.75=pl5321hd8ed1ab_0
- perl-compress-raw-bzip2=2.201=pl5321h166bdaf_0
- perl-compress-raw-zlib=2.202=pl5321h166bdaf_0
- perl-config-general=2.65=pl5321hdfd78af_0
- perl-constant=1.33=pl5321hd8ed1ab_0
- perl-dbi=1.643=pl5321h166bdaf_0
- perl-digest-md5=2.58=pl5321h166bdaf_0
- perl-encode=3.19=pl5321h166bdaf_0
- perl-encode-locale=1.05=pl5321hdfd78af_7
- perl-exporter=5.74=pl5321hd8ed1ab_0
- perl-exporter-tiny=1.002002=pl5321hd8ed1ab_0
- perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0
- perl-ffi-checklib=0.28=pl5321hdfd78af_0
- perl-file-chdir=0.1011=pl5321hd8ed1ab_0
- perl-file-path=2.18=pl5321hd8ed1ab_0
- perl-file-spec=3.48_01=pl5321hdfd78af_2
- perl-file-temp=0.2304=pl5321hd8ed1ab_0
- perl-file-which=1.24=pl5321hd8ed1ab_0
- perl-html-parser=3.81=pl5321h4ac6f70_1
- perl-html-tagset=3.20=pl5321hdfd78af_4
- perl-html-template=2.97=pl5321hdfd78af_2
- perl-html-tree=5.07=pl5321hdfd78af_2
- perl-http-date=6.06=pl5321hdfd78af_0
- perl-http-message=6.36=pl5321hdfd78af_0
- perl-importer=0.026=pl5321hd8ed1ab_0
- perl-inc-latest=0.500=pl5321ha770c72_0
- perl-io-compress=2.201=pl5321hdbdd923_2
- perl-io-html=1.004=pl5321hdfd78af_0
- perl-io-zlib=1.14=pl5321hdfd78af_0
- perl-json=4.10=pl5321hdfd78af_0
- perl-json-xs=2.34=pl5321h4ac6f70_6
- perl-list-moreutils=0.430=pl5321hdfd78af_0
- perl-list-moreutils-xs=0.430=pl5321h031d066_2
- perl-log-log4perl=1.56=pl5321hd8ed1ab_0
- perl-lwp-mediatypes=6.04=pl5321hdfd78af_1
- perl-math-cdf=0.1=pl5321h031d066_9
- perl-mime-base64=3.16=pl5321h166bdaf_0
- perl-module-build=0.4234=pl5321ha770c72_0
- perl-parent=0.241=pl5321hd8ed1ab_0
- perl-path-tiny=0.124=pl5321hd8ed1ab_0
- perl-pathtools=3.75=pl5321h166bdaf_0
- perl-scalar-list-utils=1.63=pl5321h166bdaf_0
- perl-scope-guard=0.21=pl5321hd8ed1ab_0
- perl-storable=3.15=pl5321h166bdaf_0
- perl-sub-info=0.002=pl5321hd8ed1ab_0
- perl-sys-info=0.7811=pl5321hdfd78af_1
- perl-sys-info-base=0.7807=pl5321hdfd78af_1
- perl-sys-info-driver-linux=0.7905=pl5321hdfd78af_1
- perl-term-table=0.016=pl5321hdfd78af_0
- perl-test-fatal=0.016=pl5321ha770c72_0
- perl-test-nowarnings=1.06=pl5321ha770c72_0
- perl-test-warnings=0.031=pl5321ha770c72_0
- perl-test2-suite=0.000145=pl5321hdfd78af_0
- perl-text-template-simple=0.91=pl5321hdfd78af_1
- perl-time-local=1.35=pl5321hdfd78af_0
- perl-timedate=2.33=pl5321hdfd78af_2
- perl-try-tiny=0.31=pl5321ha770c72_0
- perl-types-serialiser=1.01=pl5321hdfd78af_0
- perl-unix-processors=2.046=pl5321h7f98852_1001
- perl-uri=5.17=pl5321ha770c72_0
- perl-url-encode=0.03=pl5321h9ee0642_0
- perl-xml-libxml=2.0207=pl5321h661654b_0
- perl-xml-namespacesupport=1.12=pl5321hd8ed1ab_0
- perl-xml-parser=2.44_01=pl5321hc3e0081_1003
- perl-xml-sax=1.02=pl5321hd8ed1ab_0
- perl-xml-sax-base=1.09=pl5321hd8ed1ab_0
- perl-xml-sax-expat=0.51=pl5321hd8ed1ab_0
- perl-xml-simple=2.25=pl5321hdfd78af_2
- perl-yaml=1.30=pl5321hdfd78af_0
- pexpect=4.9.0=pyhd8ed1ab_0
- pickleshare=0.7.5=py_1003
- pip=24.0=pyhd8ed1ab_0
- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1
- platformdirs=4.2.0=pyhd8ed1ab_0
- pre-commit=3.6.0=pyha770c72_0
- prodigal=2.6.3=h031d066_7
- prometheus_client=0.19.0=pyhd8ed1ab_0
- prompt-toolkit=3.0.42=pyha770c72_0
- psutil=5.9.8=py310h2372a71_0
- ptyprocess=0.7.0=pyhd3deb0d_0
- pure_eval=0.2.2=pyhd8ed1ab_0
- pycparser=2.21=pyhd8ed1ab_0
- pygments=2.17.2=pyhd8ed1ab_0
- pysocks=1.7.1=pyha2e5f31_6
- python=3.10.13=hd12c33a_1_cpython
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python-fastjsonschema=2.19.1=pyhd8ed1ab_0
- python-json-logger=2.0.7=pyhd8ed1ab_0
- python_abi=3.10=4_cp310
- pytz=2024.1=pyhd8ed1ab_0
- pyyaml=6.0.1=py310h2372a71_1
- pyzmq=25.1.2=py310h795f18f_0
- readline=8.2=h8228510_1
- referencing=0.33.0=pyhd8ed1ab_0
- requests=2.31.0=pyhd8ed1ab_0
- rfc3339-validator=0.1.4=pyhd8ed1ab_0
- rfc3986-validator=0.1.1=pyh9f0ad1d_0
- rpds-py=0.17.1=py310hcb5633a_0
- rust=1.75.0=h70c747d_0
- rust-std-x86_64-unknown-linux-gnu=1.75.0=h2c6d0dc_0
- send2trash=1.8.2=pyh41d4057_0
- setuptools=69.0.3=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- sniffio=1.3.0=pyhd8ed1ab_0
- soupsieve=2.5=pyhd8ed1ab_1
- stack_data=0.6.2=pyhd8ed1ab_0
- sysroot_linux-64=2.12=he073ed8_16
- terminado=0.18.0=pyh0d859eb_0
- tinycss2=1.2.1=pyhd8ed1ab_0
- tk=8.6.13=noxft_h4845f30_101
- tomli=2.0.1=pyhd8ed1ab_0
- tornado=6.3.3=py310h2372a71_1
- traitlets=5.14.1=pyhd8ed1ab_0
- types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0
- typing-extensions=4.9.0=hd8ed1ab_0
- typing_extensions=4.9.0=pyha770c72_0
- typing_utils=0.1.0=pyhd8ed1ab_0
- tzdata=2024a=h0c530f3_0
- ukkonen=1.0.1=py310hd41b1e2_4
- uri-template=1.3.0=pyhd8ed1ab_0
- urllib3=2.2.0=pyhd8ed1ab_0
- virtualenv=20.25.0=pyhd8ed1ab_0
- wcwidth=0.2.13=pyhd8ed1ab_0
- webcolors=1.13=pyhd8ed1ab_0
- webencodings=0.5.1=pyhd8ed1ab_2
- websocket-client=1.7.0=pyhd8ed1ab_0
- wget=1.20.3=ha35d2d1_1
- wheel=0.42.0=pyhd8ed1ab_0
- xz=5.2.6=h166bdaf_0
- yaml=0.2.5=h7f98852_2
- zeromq=4.3.5=h59595ed_0
- zipp=3.17.0=pyhd8ed1ab_0
- zlib=1.2.13=hd590300_5
- zstd=1.5.5=hfc55251_0
- pip:
- git+https://github.com/antismash/antismash.git@7-1-0-1
- bcbio-gff==0.7.0
- biopython==1.78
- brawn==1.0.1
- contourpy==1.2.1
- cycler==0.12.1
- duckdb==1.0.0
- fonttools==4.53.0
- helperlibs==0.2.1
- jinja2==3.1.2
- joblib==1.3.2
- jsonschema==4.11.0
- kiwisolver==1.4.5
- libsass==0.22.0
- markupsafe==2.1.3
- matplotlib==3.8.1
- moods-python==1.9.4.1
- nrpys==0.1.1
- numpy==1.26.2
- pillow==10.3.0
- pyparsing==3.1.2
- pyrsistent==0.20.0
- pysvg-py3==0.2.2.post3
- scikit-learn==1.3.2
- scipy==1.11.3
- threadpoolctl==3.5.0
- sqlparse==0.5.0
1 change: 1 addition & 0 deletions workflow/envs/bgcflow_notes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ dependencies:
- jupyterlab-dash
- networkx
- alive_progress
- duckdb==1.0.0
6 changes: 3 additions & 3 deletions workflow/envs/dbt-duckdb.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ channels:
- defaults
dependencies:
- python==3.11
- python-duckdb==0.9.2
- python-duckdb==1.0.0
- unzip
- pip
- pip:
- dbt-duckdb==1.7.4
- dbt-metabase==1.3.0
- dbt-duckdb==1.8.1
- dbt-metabase==1.3.2
Loading

0 comments on commit 36def9e

Please sign in to comment.