Make "databases" usable in sub-projects

soedinglab · Oct 1, 2021 · 5afd33c · 5afd33c
1 parent f651879
commit 5afd33c
Show file tree

Hide file tree

Showing 3 changed files with 184 additions and 169 deletions.
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
@@ -1,6 +1,7 @@
 #include "Command.h"
 #include "Parameters.h"
 #include "CommandDeclarations.h"
+#include "DownloadDatabase.h"
 
 Parameters& par = Parameters::getInstance();
 std::vector<Command> baseCommands = {
@@ -1236,3 +1237,155 @@ std::vector<Command> baseCommands = {
                 "",
                 CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}
 };
+
+#include "databases.sh.h"
+
+// Table of downloadable databases. Each entry is a positionally brace-initialized
+// DatabaseDownload (declared in DownloadDatabase.h); the values appear in member order:
+//   name, description, citation, url, hasTaxonomy, dbType, script, scriptLength, environment.
+// Every entry references the same script blob (databases_sh / databases_sh_len,
+// presumably provided by databases.sh.h -- confirm). Only SILVA supplies an
+// environment entry; all other entries pass an empty list.
+std::vector<DatabaseDownload> downloads = {{
+                                       "UniRef100",
+                                       "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+                                       "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+                                       "https://www.uniprot.org/help/uniref",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "UniRef90",
+                                       "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+                                       "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+                                       "https://www.uniprot.org/help/uniref",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "UniRef50",
+                                       "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+                                       "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+                                       "https://www.uniprot.org/help/uniref",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "UniProtKB",
+                                       "The UniProt Knowledgebase is the central hub for the collection of functional information on proteins, with accurate, consistent and rich annotation.",
+                                       "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+                                       "https://www.uniprot.org/help/uniprotkb",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "UniProtKB/TrEMBL",
+                                       "UniProtKB/TrEMBL (unreviewed) contains protein sequences associated with computationally generated annotation and large-scale functional characterization.",
+                                       "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+                                       "https://www.uniprot.org/help/uniprotkb",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "UniProtKB/Swiss-Prot",
+                                       "UniProtKB/Swiss-Prot (reviewed) is a high quality manually annotated and non-redundant protein sequence database, which brings together experimental results, computed features and scientific conclusions.",
+                                       "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+                                       "https://uniprot.org",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "NR",
+                                       "Non-redundant protein sequences from GenPept, Swissprot, PIR, PRF, PDB, and NCBI RefSeq.",
+                                       "NCBI Resource Coordinators: Database resources of the National Center for Biotechnology Information. Nucleic Acids Res 46(D1), D8-D13 (2018)",
+                                       "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "NT",
+                                       "Partially non-redundant nucleotide sequences from all traditional divisions of GenBank, EMBL, and DDBJ excluding GSS, STS, PAT, EST, HTG, and WGS.",
+                                       "NCBI Resource Coordinators: Database resources of the National Center for Biotechnology Information. Nucleic Acids Res 46(D1), D8-D13 (2018)",
+                                       "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA",
+                                       false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "GTDB",
+                                       "Genome Taxonomy Database is a phylogenetically consistent, genome-based taxonomy that provides rank-normalized classifications for ~150,000 bacterial and archaeal genomes from domain to genus.",
+                                       "Parks et al: A complete domain-to-species taxonomy for Bacteria and Archaea. Nat Biotechnol 38(9), 1079–1086 (2020)",
+                                       "https://gtdb.ecogenomic.org",
+                                       true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "PDB",
+                                       "The Protein Data Bank is the single worldwide archive of structural data of biological macromolecules.",
+                                       "Berman et al: The Protein Data Bank. Nucleic Acids Res 28(1), 235-242 (2000)",
+                                       "https://www.rcsb.org",
+                                       false, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "PDB70",
+                                       "PDB clustered to 70% sequence identity and enriched using HHblits with Uniclust sequences.",
+                                       "Steinegger et al: HH-suite3 for fast remote homology detection and deep protein annotation. BMC Bioinform 20(1), 473 (2019)",
+                                       "https://github.com/soedinglab/hh-suite",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "Pfam-A.full",
+                                       "The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and hidden Markov models.",
+                                       "El-Gebali and Mistry et al: The Pfam protein families database in 2019. Nucleic Acids Res 47(D1), D427-D432 (2019)",
+                                       "https://pfam.xfam.org",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "Pfam-A.seed",
+                                       "The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and hidden Markov models.",
+                                       "El-Gebali and Mistry et al: The Pfam protein families database in 2019. Nucleic Acids Res 47(D1), D427-D432 (2019)",
+                                       "https://pfam.xfam.org",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "Pfam-B",
+                                       "Pfam-B is a large automatically generated supplement to the Pfam database.",
+                                       "Sonnhammer et al: A new Pfam-B is released. Xfam Blog (2020)",
+                                       "https://xfam.wordpress.com/2020/06/30/a-new-pfam-b-is-released",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "CDD",
+                                       "Conserved Domain Database is a protein annotation resource consisting of well-annotated MSAs for ancient domains and full-length proteins.",
+                                       "Lu et al: CDD/SPARCLE: the conserved domain database in 2020. Nucleic Acids Res 48(D1), D265–D268 (2020)",
+                                       "https://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "eggNOG",
+                                       "eggNOG is a hierarchical, functionally and phylogenetically annotated orthology resource",
+                                       "Huerta-Cepas et al: eggNOG 5.0: a hierarchical, functionally and phylogenetically annotated orthology resource based on 5090 organisms and 2502 viruses. Nucleic Acids Res 47(D1), D309–D314 (2019)",
+                                       "http://eggnog5.embl.de",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "VOGDB",
+                                       "VOGDB is a continuously updated resource of Virus Orthologous Groups",
+                                       "Marz et al: Challenges in RNA virus bioinformatics. Bioinformatics 30, 1793–9 (2014)",
+                                       "https://vogdb.org",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "dbCAN2",
+                                       "dbCAN2 is a database of carbohydrate-active enzymes.",
+                                       "Zhang et al: dbCAN2: a meta server for automated carbohydrate-active enzyme annotation. Nucleic Acids Res 46(W1), W95-W101 (2018)",
+                                       "http://bcb.unl.edu/dbCAN2",
+                                       false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "SILVA",
+                                       "SILVA provides datasets of aligned small and large subunit ribosomal RNA sequences for all three domains of life.",
+                                       "Yilmaz et al: The SILVA and \"All-species Living Tree Project (LTP)\" taxonomic frameworks. Nucleic Acids Res 42(D1), D643-D648 (2014)",
+                                       "https://www.arb-silva.de",
+                                       true, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+                                       { { "SILVA_REL", "138" } }
+                               }, {
+                                       "Resfinder",
+                                       "ResFinder is a database that captures antimicrobial resistance genes from whole-genome data sets.",
+                                       "Zankari et al: Identification of acquired antimicrobial resistance genes. J Antimicrob Chemother 67(11), 2640-2644 (2012)",
+                                       "https://cge.cbs.dtu.dk/services/ResFinder",
+                                       false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+                                       { }
+                               }, {
+                                       "Kalamari",
+                                       "Kalamari contains over 250 genomes chosen to be representative of agents tracked by genome-based foodborne disease surveillance, common contaminants, and diverse phyla and bacterial genera.",
+                                       "Katz et al: Kraken with Kalamari: Contamination Detection. ASM Poster, 270 (2018)",
+                                       "https://github.com/lskatz/Kalamari",
+                                       true, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+                                       { }
+                               },
+};
diff --git a/src/commons/DownloadDatabase.h b/src/commons/DownloadDatabase.h
@@ -0,0 +1,29 @@
+//
+// Created by Martin Steinegger on 10/1/21.
+//
+
+#ifndef MMSEQS_DOWNLOADDATABASE_H
+#define MMSEQS_DOWNLOADDATABASE_H
+#include <vector>
+#include <string>
+
+
+// A single key/value pair of C strings; DatabaseDownload::environment holds a list of these.
+// The struct stores raw pointers only and does not manage the pointed-to memory.
+// NOTE(review): presumably exported as environment variables to the download script -- confirm against the caller.
+struct EnvironmentEntry {
+    const char* key;   // entry name, e.g. "SILVA_REL"
+    const char* value; // entry value, e.g. "138"
+};
+
+// Describes one entry of a database download table (e.g. the `downloads` list in MMseqsBase.cpp).
+// Plain aggregate: entries are brace-initialized positionally, so the member order below
+// must not change without updating every initializer.
+struct DatabaseDownload {
+    const char *name;        // database identifier, e.g. "UniRef100"
+    const char *description; // human-readable summary of the database
+    const char *citation;    // literature reference for the database
+    const char *url;         // web address associated with the database
+    bool hasTaxonomy;        // NOTE(review): presumably whether taxonomy information is available for this database -- confirm
+    int dbType;              // a Parameters::DBTYPE_* constant (e.g. Parameters::DBTYPE_AMINO_ACIDS)
+    const unsigned char *script; // script bytes for this entry; NOTE(review): may not be NUL-terminated -- confirm
+    size_t scriptLength;     // NOTE(review): presumably the byte length of `script` -- confirm
+    std::vector<EnvironmentEntry> environment; // extra key/value entries for this database; may be empty
+};
+
+
+#endif //MMSEQS_DOWNLOADDATABASE_H