# Conda post-deploy script for the antismash_db-duckdb environment.
#
# Prepares everything under resources/antismash_db-schema_duckdb: the DuckDB
# schema template, the antiSMASH databases, the NCBI taxonomy dump, the
# asdb-taxa tool and the db-import sources. stderr of every step is appended
# to $LOG.
set -e

# Create a directory for resources if it doesn't already exist, and start
# from a clean checkout of the schema repository.
mkdir -p resources
rm -rf resources/antismash_db-schema_duckdb

# Create logs directory
mkdir -p logs/antismash_db-duckdb
LOG="logs/antismash_db-duckdb/antismash_db-duckdb_template.log"

# Clone the antismash_db-schema_duckdb repository into the resources directory
git clone https://github.com/NBChub/antismash_db-schema_duckdb.git resources/antismash_db-schema_duckdb 2>> "$LOG"

# Clone the db-schema repository into the antismash_db-schema_duckdb directory
git clone https://github.com/antismash/db-schema.git resources/antismash_db-schema_duckdb/db-schema 2>> "$LOG"

# Download the antiSMASH databases using the command shipped with antiSMASH
download-antismash-databases 2>> "$LOG"

# Initialize the DuckDB database with the schema from the db-schema directory
(cd resources/antismash_db-schema_duckdb && python init_duckdb.py db-schema duckdb-schema) 2>> "$LOG"

# Download the NCBI taxonomy dump files into the ncbi-taxdump directory
# (-nc: do not re-download when the archive is already present)
(cd resources/antismash_db-schema_duckdb && wget -P ncbi-taxdump https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz -nc) 2>> "$LOG"

# Extract the downloaded NCBI taxonomy dump files
(cd resources/antismash_db-schema_duckdb/ncbi-taxdump && tar -xvf new_taxdump.tar.gz) 2>> "$LOG"

# Install the asdb-taxa tool using cargo (Rust's package manager)
cargo install asdb-taxa 2>> "$LOG"

# Add cargo's bin directory to the PATH to ensure asdb-taxa can be executed
export PATH="$HOME/.cargo/bin:$PATH"

# Clone the db-import repository into the antismash_db-schema_duckdb directory.
# Use the HTTPS URL like the other clones: the SSH form (git@github.com:...)
# needs a registered SSH key and a trusted host key, which are not available
# in a non-interactive environment deployment.
git clone https://github.com/matinnuhamunada/db-import.git resources/antismash_db-schema_duckdb/db-import 2>> "$LOG"

# Checkout a specific branch of the db-import repository
(cd resources/antismash_db-schema_duckdb/db-import && git checkout -b v4.0.0-duckdb v4.0.0-duckdb) 2>> "$LOG"
argon2-cffi=23.1.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=21.2.0=py310h2372a71_4 + - arrow=1.3.0=pyhd8ed1ab_0 + - asttokens=2.4.1=pyhd8ed1ab_0 + - async-lru=2.0.4=pyhd8ed1ab_0 + - attrs=23.2.0=pyh71513ae_0 + - babel=2.14.0=pyhd8ed1ab_0 + - beautifulsoup4=4.12.3=pyha770c72_0 + - binutils_impl_linux-64=2.40=hf600244_0 + - blast=2.15.0=pl5321h6f7f691_1 + - bleach=6.1.0=pyhd8ed1ab_0 + - boost-cpp=1.74.0=h75c5d50_8 + - brotli-python=1.1.0=py310hc6cd4ac_1 + - bzip2=1.0.8=hd590300_5 + - c-ares=1.26.0=hd590300_0 + - ca-certificates=2024.2.2=hbcca054_0 + - cached-property=1.5.2=hd8ed1ab_1 + - cached_property=1.5.2=pyha770c72_1 + - certifi=2024.2.2=pyhd8ed1ab_0 + - cffi=1.16.0=py310h2fee648_0 + - cfgv=3.3.1=pyhd8ed1ab_0 + - charset-normalizer=3.3.2=pyhd8ed1ab_0 + - comm=0.2.1=pyhd8ed1ab_0 + - curl=8.5.0=hca28451_0 + - debugpy=1.8.0=py310hc6cd4ac_1 + - decorator=5.1.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - diamond=2.0.15=hb97b32f_1 + - distlib=0.3.8=pyhd8ed1ab_0 + - entrez-direct=16.2=he881be0_1 + - entrypoints=0.4=pyhd8ed1ab_0 + - exceptiongroup=1.2.0=pyhd8ed1ab_2 + - executing=2.0.1=pyhd8ed1ab_0 + - expat=2.5.0=hcb278e6_1 + - fasttree=2.1.11=h031d066_2 + - filelock=3.13.1=pyhd8ed1ab_0 + - fqdn=1.5.1=pyhd8ed1ab_0 + - gcc_impl_linux-64=13.2.0=h338b0a0_5 + - gettext=0.21.1=h27087fc_0 + - ghostscript=10.02.1=h59595ed_0 + - glimmerhmm=3.0.4=pl5321h87f3376_5 + - h11=0.14.0=pyhd8ed1ab_0 + - h2=4.1.0=pyhd8ed1ab_0 + - hmmer=3.4=hdbdd923_0 + - hmmer2=2.3.2=h031d066_9 + - hpack=4.0.0=pyh9f0ad1d_0 + - httpcore=1.0.2=pyhd8ed1ab_0 + - httpx=0.26.0=pyhd8ed1ab_0 + - hyperframe=6.0.1=pyhd8ed1ab_0 + - icu=70.1=h27087fc_0 + - identify=2.5.33=pyhd8ed1ab_0 + - idna=3.6=pyhd8ed1ab_0 + - importlib-metadata=7.0.1=pyha770c72_0 + - importlib_metadata=7.0.1=hd8ed1ab_0 + - importlib_resources=6.1.1=pyhd8ed1ab_0 + - ipykernel=6.29.0=pyhd33586a_0 + - ipython=8.21.0=pyh707e725_0 + - isoduration=20.11.0=pyhd8ed1ab_0 + - jedi=0.19.1=pyhd8ed1ab_0 + - json5=0.9.14=pyhd8ed1ab_0 + - 
jsonpointer=2.4=py310hff52083_3 + - jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 + - jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 + - jupyter-lsp=2.2.2=pyhd8ed1ab_0 + - jupyter_client=8.6.0=pyhd8ed1ab_0 + - jupyter_core=5.7.1=py310hff52083_0 + - jupyter_events=0.9.0=pyhd8ed1ab_0 + - jupyter_server=2.12.5=pyhd8ed1ab_0 + - jupyter_server_terminals=0.5.2=pyhd8ed1ab_0 + - jupyterlab=4.1.0=pyhd8ed1ab_0 + - jupyterlab_pygments=0.3.0=pyhd8ed1ab_0 + - jupyterlab_server=2.25.2=pyhd8ed1ab_0 + - kernel-headers_linux-64=2.6.32=he073ed8_16 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.2=h659d440_0 + - ld_impl_linux-64=2.40=h41732ed_0 + - libcurl=8.5.0=hca28451_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=hd590300_2 + - libexpat=2.5.0=hcb278e6_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-devel_linux-64=13.2.0=ha9c7c90_105 + - libgcc-ng=13.2.0=h807b86a_5 + - libgomp=13.2.0=h807b86a_5 + - libiconv=1.17=hd590300_2 + - libidn2=2.3.7=hd590300_0 + - libnghttp2=1.58.0=h47da74e_1 + - libnsl=2.0.1=hd590300_0 + - libsanitizer=13.2.0=h7e041cc_5 + - libsodium=1.0.18=h36c2ea0_1 + - libsqlite=3.44.2=h2797004_0 + - libssh2=1.11.0=h0841786_0 + - libstdcxx-ng=13.2.0=h7e041cc_5 + - libunistring=0.9.10=h7f98852_0 + - libuuid=2.38.1=h0b41bf4_0 + - libxcrypt=4.4.36=hd590300_1 + - libxml2=2.9.14=h22db469_4 + - libxslt=1.1.35=h8affb1d_0 + - libzlib=1.2.13=hd590300_5 + - matplotlib-inline=0.1.6=pyhd8ed1ab_0 + - meme=4.11.2=py310pl5321h2cd2fb3_8 + - mistune=3.0.2=pyhd8ed1ab_0 + - muscle=5.1=h4ac6f70_3 + - nbclient=0.8.0=pyhd8ed1ab_0 + - nbconvert-core=7.14.2=pyhd8ed1ab_0 + - nbformat=5.9.2=pyhd8ed1ab_0 + - ncbi-vdb=3.0.10=hdbdd923_0 + - ncurses=6.4=h59595ed_2 + - nest-asyncio=1.6.0=pyhd8ed1ab_0 + - nodeenv=1.8.0=pyhd8ed1ab_0 + - notebook-shim=0.2.3=pyhd8ed1ab_0 + - openssl=3.2.1=hd590300_0 + - ossuuid=1.6.2=hf484d3e_1000 + - overrides=7.7.0=pyhd8ed1ab_0 + - packaging=23.2=pyhd8ed1ab_0 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.3=pyhd8ed1ab_0 + - pcre=8.45=h9c3ff4c_0 + - 
perl=5.32.1=7_hd590300_perl5 + - perl-alien-build=2.48=pl5321hec16e2b_0 + - perl-alien-libxml2=0.17=pl5321hec16e2b_0 + - perl-archive-tar=2.40=pl5321hdfd78af_0 + - perl-base=2.23=pl5321hd8ed1ab_0 + - perl-business-isbn=3.007=pl5321hd8ed1ab_0 + - perl-business-isbn-data=20210112.006=pl5321hd8ed1ab_0 + - perl-capture-tiny=0.48=pl5321ha770c72_1 + - perl-carp=1.50=pl5321hd8ed1ab_0 + - perl-cgi=4.56=pl5321h031d066_1 + - perl-common-sense=3.75=pl5321hd8ed1ab_0 + - perl-compress-raw-bzip2=2.201=pl5321h166bdaf_0 + - perl-compress-raw-zlib=2.202=pl5321h166bdaf_0 + - perl-config-general=2.65=pl5321hdfd78af_0 + - perl-constant=1.33=pl5321hd8ed1ab_0 + - perl-dbi=1.643=pl5321h166bdaf_0 + - perl-digest-md5=2.58=pl5321h166bdaf_0 + - perl-encode=3.19=pl5321h166bdaf_0 + - perl-encode-locale=1.05=pl5321hdfd78af_7 + - perl-exporter=5.74=pl5321hd8ed1ab_0 + - perl-exporter-tiny=1.002002=pl5321hd8ed1ab_0 + - perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0 + - perl-ffi-checklib=0.28=pl5321hdfd78af_0 + - perl-file-chdir=0.1011=pl5321hd8ed1ab_0 + - perl-file-path=2.18=pl5321hd8ed1ab_0 + - perl-file-spec=3.48_01=pl5321hdfd78af_2 + - perl-file-temp=0.2304=pl5321hd8ed1ab_0 + - perl-file-which=1.24=pl5321hd8ed1ab_0 + - perl-html-parser=3.81=pl5321h4ac6f70_1 + - perl-html-tagset=3.20=pl5321hdfd78af_4 + - perl-html-template=2.97=pl5321hdfd78af_2 + - perl-html-tree=5.07=pl5321hdfd78af_2 + - perl-http-date=6.06=pl5321hdfd78af_0 + - perl-http-message=6.36=pl5321hdfd78af_0 + - perl-importer=0.026=pl5321hd8ed1ab_0 + - perl-inc-latest=0.500=pl5321ha770c72_0 + - perl-io-compress=2.201=pl5321hdbdd923_2 + - perl-io-html=1.004=pl5321hdfd78af_0 + - perl-io-zlib=1.14=pl5321hdfd78af_0 + - perl-json=4.10=pl5321hdfd78af_0 + - perl-json-xs=2.34=pl5321h4ac6f70_6 + - perl-list-moreutils=0.430=pl5321hdfd78af_0 + - perl-list-moreutils-xs=0.430=pl5321h031d066_2 + - perl-log-log4perl=1.56=pl5321hd8ed1ab_0 + - perl-lwp-mediatypes=6.04=pl5321hdfd78af_1 + - perl-math-cdf=0.1=pl5321h031d066_9 + - 
perl-mime-base64=3.16=pl5321h166bdaf_0 + - perl-module-build=0.4234=pl5321ha770c72_0 + - perl-parent=0.241=pl5321hd8ed1ab_0 + - perl-path-tiny=0.124=pl5321hd8ed1ab_0 + - perl-pathtools=3.75=pl5321h166bdaf_0 + - perl-scalar-list-utils=1.63=pl5321h166bdaf_0 + - perl-scope-guard=0.21=pl5321hd8ed1ab_0 + - perl-storable=3.15=pl5321h166bdaf_0 + - perl-sub-info=0.002=pl5321hd8ed1ab_0 + - perl-sys-info=0.7811=pl5321hdfd78af_1 + - perl-sys-info-base=0.7807=pl5321hdfd78af_1 + - perl-sys-info-driver-linux=0.7905=pl5321hdfd78af_1 + - perl-term-table=0.016=pl5321hdfd78af_0 + - perl-test-fatal=0.016=pl5321ha770c72_0 + - perl-test-nowarnings=1.06=pl5321ha770c72_0 + - perl-test-warnings=0.031=pl5321ha770c72_0 + - perl-test2-suite=0.000145=pl5321hdfd78af_0 + - perl-text-template-simple=0.91=pl5321hdfd78af_1 + - perl-time-local=1.35=pl5321hdfd78af_0 + - perl-timedate=2.33=pl5321hdfd78af_2 + - perl-try-tiny=0.31=pl5321ha770c72_0 + - perl-types-serialiser=1.01=pl5321hdfd78af_0 + - perl-unix-processors=2.046=pl5321h7f98852_1001 + - perl-uri=5.17=pl5321ha770c72_0 + - perl-url-encode=0.03=pl5321h9ee0642_0 + - perl-xml-libxml=2.0207=pl5321h661654b_0 + - perl-xml-namespacesupport=1.12=pl5321hd8ed1ab_0 + - perl-xml-parser=2.44_01=pl5321hc3e0081_1003 + - perl-xml-sax=1.02=pl5321hd8ed1ab_0 + - perl-xml-sax-base=1.09=pl5321hd8ed1ab_0 + - perl-xml-sax-expat=0.51=pl5321hd8ed1ab_0 + - perl-xml-simple=2.25=pl5321hdfd78af_2 + - perl-yaml=1.30=pl5321hdfd78af_0 + - pexpect=4.9.0=pyhd8ed1ab_0 + - pickleshare=0.7.5=py_1003 + - pip=24.0=pyhd8ed1ab_0 + - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 + - platformdirs=4.2.0=pyhd8ed1ab_0 + - pre-commit=3.6.0=pyha770c72_0 + - prodigal=2.6.3=h031d066_7 + - prometheus_client=0.19.0=pyhd8ed1ab_0 + - prompt-toolkit=3.0.42=pyha770c72_0 + - psutil=5.9.8=py310h2372a71_0 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pure_eval=0.2.2=pyhd8ed1ab_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.17.2=pyhd8ed1ab_0 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.10.13=hd12c33a_1_cpython + 
- python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-fastjsonschema=2.19.1=pyhd8ed1ab_0 + - python-json-logger=2.0.7=pyhd8ed1ab_0 + - python_abi=3.10=4_cp310 + - pytz=2024.1=pyhd8ed1ab_0 + - pyyaml=6.0.1=py310h2372a71_1 + - pyzmq=25.1.2=py310h795f18f_0 + - readline=8.2=h8228510_1 + - referencing=0.33.0=pyhd8ed1ab_0 + - requests=2.31.0=pyhd8ed1ab_0 + - rfc3339-validator=0.1.4=pyhd8ed1ab_0 + - rfc3986-validator=0.1.1=pyh9f0ad1d_0 + - rpds-py=0.17.1=py310hcb5633a_0 + - rust=1.75.0=h70c747d_0 + - rust-std-x86_64-unknown-linux-gnu=1.75.0=h2c6d0dc_0 + - send2trash=1.8.2=pyh41d4057_0 + - setuptools=69.0.3=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sniffio=1.3.0=pyhd8ed1ab_0 + - soupsieve=2.5=pyhd8ed1ab_1 + - stack_data=0.6.2=pyhd8ed1ab_0 + - sysroot_linux-64=2.12=he073ed8_16 + - terminado=0.18.0=pyh0d859eb_0 + - tinycss2=1.2.1=pyhd8ed1ab_0 + - tk=8.6.13=noxft_h4845f30_101 + - tomli=2.0.1=pyhd8ed1ab_0 + - tornado=6.3.3=py310h2372a71_1 + - traitlets=5.14.1=pyhd8ed1ab_0 + - types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 + - typing-extensions=4.9.0=hd8ed1ab_0 + - typing_extensions=4.9.0=pyha770c72_0 + - typing_utils=0.1.0=pyhd8ed1ab_0 + - tzdata=2024a=h0c530f3_0 + - ukkonen=1.0.1=py310hd41b1e2_4 + - uri-template=1.3.0=pyhd8ed1ab_0 + - urllib3=2.2.0=pyhd8ed1ab_0 + - virtualenv=20.25.0=pyhd8ed1ab_0 + - wcwidth=0.2.13=pyhd8ed1ab_0 + - webcolors=1.13=pyhd8ed1ab_0 + - webencodings=0.5.1=pyhd8ed1ab_2 + - websocket-client=1.7.0=pyhd8ed1ab_0 + - wget=1.20.3=ha35d2d1_1 + - wheel=0.42.0=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 + - yaml=0.2.5=h7f98852_2 + - zeromq=4.3.5=h59595ed_0 + - zipp=3.17.0=pyhd8ed1ab_0 + - zlib=1.2.13=hd590300_5 + - zstd=1.5.5=hfc55251_0 + - pip: + - git+https://github.com/antismash/antismash.git@7-1-0-1 + - bcbio-gff==0.7.0 + - biopython==1.78 + - brawn==1.0.1 + - contourpy==1.2.1 + - cycler==0.12.1 + - duckdb==1.0.0 + - fonttools==4.53.0 + - helperlibs==0.2.1 + - jinja2==3.1.2 + - joblib==1.3.2 + - jsonschema==4.11.0 + - kiwisolver==1.4.5 + - libsass==0.22.0 + - 
markupsafe==2.1.3 + - matplotlib==3.8.1 + - moods-python==1.9.4.1 + - nrpys==0.1.1 + - numpy==1.26.2 + - pillow==10.3.0 + - pyparsing==3.1.2 + - pyrsistent==0.20.0 + - pysvg-py3==0.2.2.post3 + - scikit-learn==1.3.2 + - scipy==1.11.3 + - threadpoolctl==3.5.0 + - sqlparse==0.5.0 diff --git a/workflow/envs/bgcflow_notes.yaml b/workflow/envs/bgcflow_notes.yaml index df342010..5cd9d87b 100644 --- a/workflow/envs/bgcflow_notes.yaml +++ b/workflow/envs/bgcflow_notes.yaml @@ -30,3 +30,4 @@ dependencies: - jupyterlab-dash - networkx - alive_progress + - duckdb==1.0.0 diff --git a/workflow/envs/dbt-duckdb.yaml b/workflow/envs/dbt-duckdb.yaml index 4e717a54..fdeb800f 100644 --- a/workflow/envs/dbt-duckdb.yaml +++ b/workflow/envs/dbt-duckdb.yaml @@ -5,9 +5,9 @@ channels: - defaults dependencies: - python==3.11 - - python-duckdb==0.9.2 + - python-duckdb==1.0.0 - unzip - pip - pip: - - dbt-duckdb==1.7.4 - - dbt-metabase==1.3.0 + - dbt-duckdb==1.8.1 + - dbt-metabase==1.3.2 diff --git a/workflow/rules/antismash-db-duckdb.smk b/workflow/rules/antismash-db-duckdb.smk new file mode 100644 index 00000000..69042608 --- /dev/null +++ b/workflow/rules/antismash-db-duckdb.smk @@ -0,0 +1,34 @@ +rule antismash_db_duckdb: + input: + antismash=lambda wildcards: expand("data/processed/{name}/antismash/{version}/{strains}", + name=wildcards.name, + version=wildcards.version, + strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()], + ), + output: + database=directory("data/processed/{name}/antismash_database/antiSMASH_database_{version}"), + conda: + "../envs/antismash_db-duckdb.yaml" + threads: 4 + log: + "logs/antismash_db-duckdb/{name}_{version}.log", + params: + prefix=lambda wildcards: find_common_prefix(f"data/processed/{wildcards.name}/antismash/{wildcards.version}/"), + duckdb = "resources/antismash_db-schema_duckdb/duckdb-schema/antismash_db.duckdb", + shell: + """ + set -e + mkdir -p {output.database} + 
def find_common_prefix(input_dir):
    """Return the common filename prefixes of antiSMASH JSON files.

    All ``*.json`` files below ``input_dir`` are collected recursively and
    grouped by the first character of their stem (file name without the
    ``.json`` suffix). For every group the longest common prefix of the
    stems is computed. Used to define the allowed prefixes in
    antismash-db-duckdb.smk.

    Args:
        input_dir: Directory to search (``str`` or path-like).

    Returns:
        The prefixes joined with ``","``, ordered by their first character,
        or an empty string when no JSON file is found.
    """
    # Group the stems by their first character.
    stems_by_initial = defaultdict(list)
    for json_file in Path(input_dir).glob("**/*.json"):
        stem = json_file.stem
        if stem:  # an empty stem cannot contribute a prefix
            stems_by_initial[stem[0]].append(stem)

    # glob() yields files in filesystem order, which is not stable across
    # runs or machines. Iterate the groups in sorted order so the same set of
    # files always produces the same string.
    common_prefixes = [
        os.path.commonprefix(stems)
        for _, stems in sorted(stems_by_initial.items())
    ]

    # Every group shares at least its first character, so no prefix is empty.
    return ",".join(common_prefixes)