diff --git a/lib/mmseqs/.cirrus.yml b/lib/mmseqs/.cirrus.yml index d36f557..28f0112 100644 --- a/lib/mmseqs/.cirrus.yml +++ b/lib/mmseqs/.cirrus.yml @@ -4,7 +4,7 @@ env: task: name: FreeBSD-13 freebsd_instance: - image_family: freebsd-13-0 + image_family: freebsd-13-2-snap install_script: pkg install -y cmake git samtools compile_script: | mkdir build && cd build diff --git a/lib/mmseqs/cmake/MMseqsSetupDerivedTarget.cmake b/lib/mmseqs/cmake/MMseqsSetupDerivedTarget.cmake index a5a85fa..f5248d8 100644 --- a/lib/mmseqs/cmake/MMseqsSetupDerivedTarget.cmake +++ b/lib/mmseqs/cmake/MMseqsSetupDerivedTarget.cmake @@ -1,12 +1,16 @@ include(AppendTargetProperty) function (mmseqs_setup_derived_target TARGET) - get_target_property(COMPILE_TMP mmseqs-framework COMPILE_FLAGS) - get_target_property(LINK_TMP mmseqs-framework LINK_FLAGS) - get_target_property(DEF_TMP mmseqs-framework COMPILE_DEFINITIONS) - get_target_property(INCL_TMP mmseqs-framework INCLUDE_DIRECTORIES) + set(SOURCE "${ARGN}") + if(NOT SOURCE) + set(SOURCE "mmseqs-framework") + endif() + get_target_property(COMPILE_TMP ${SOURCE} COMPILE_FLAGS) + get_target_property(LINK_TMP ${SOURCE} LINK_FLAGS) + get_target_property(DEF_TMP ${SOURCE} COMPILE_DEFINITIONS) + get_target_property(INCL_TMP ${SOURCE} INCLUDE_DIRECTORIES) - target_link_libraries(${TARGET} mmseqs-framework) + target_link_libraries(${TARGET} ${SOURCE}) append_target_property(${TARGET} COMPILE_FLAGS ${COMPILE_TMP}) append_target_property(${TARGET} LINK_FLAGS ${LINK_TMP}) set_property(TARGET ${TARGET} APPEND PROPERTY COMPILE_DEFINITIONS ${DEF_TMP}) diff --git a/lib/mmseqs/data/workflow/blastp.sh b/lib/mmseqs/data/workflow/blastp.sh index 5722cfb..5d587fd 100755 --- a/lib/mmseqs/data/workflow/blastp.sh +++ b/lib/mmseqs/data/workflow/blastp.sh @@ -5,6 +5,33 @@ fail() { exit 1 } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +fake_pref() { + QDB="$1" + TDB="$2" + RES="$3" + # create link to data file which contains a list of all targets that should be aligned + ln -s "$(abspath "${TDB}.index")" "${RES}" + # create new index repeatedly pointing to same entry + INDEX_SIZE="$(wc -c < "${TDB}.index")" + awk -v size="$INDEX_SIZE" '{ print $1"\t0\t"size; }' "${QDB}.index" > "${RES}.index" + # create dbtype (7) + awk 'BEGIN { printf("%c%c%c%c",7,0,0,0); exit; }' > "${RES}.dbtype" +} + notExists() { [ ! -f "$1" ] } @@ -27,14 +54,23 @@ ALN_RES_MERGE="$TMP_PATH/aln_0" while [ "$STEP" -lt "$STEPS" ]; do SENS_PARAM=SENSE_${STEP} eval SENS="\$$SENS_PARAM" - # call prefilter module + + # 1. Prefilter hits if notExists "$TMP_PATH/pref_$STEP.dbtype"; then - # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $PREFILTER_PAR -s "$SENS" \ - || fail "Prefilter died" + if [ "$PREFMODE" = "EXHAUSTIVE" ]; then + fake_pref "${INPUT}" "${TARGET}" "$TMP_PATH/pref_$STEP" + elif [ "$PREFMODE" = "UNGAPPED" ]; then + # shellcheck disable=SC2086 + $RUNNER "$MMSEQS" ungappedprefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $UNGAPPEDPREFILTER_PAR \ + || fail "Ungapped prefilter died" + else + # shellcheck disable=SC2086 + $RUNNER "$MMSEQS" prefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $PREFILTER_PAR -s "$SENS" \ + || fail "Prefilter died" + fi fi - # call alignment module + # 2. alignment module if [ "$STEPS" -eq 1 ]; then if notExists "$3.dbtype"; then # shellcheck disable=SC2086 diff --git a/lib/mmseqs/data/workflow/blastpgp.sh b/lib/mmseqs/data/workflow/blastpgp.sh index 95a1d44..806f680 100755 --- a/lib/mmseqs/data/workflow/blastpgp.sh +++ b/lib/mmseqs/data/workflow/blastpgp.sh @@ -5,6 +5,33 @@ fail() { exit 1 } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +fake_pref() { + QDB="$1" + TDB="$2" + RES="$3" + # create link to data file which contains a list of all targets that should be aligned + ln -s "$(abspath "${TDB}.index")" "${RES}" + # create new index repeatedly pointing to same entry + INDEX_SIZE="$(wc -c < "${TDB}.index")" + awk -v size="$INDEX_SIZE" '{ print $1"\t0\t"size; }' "${QDB}.index" > "${RES}.index" + # create dbtype (7) + awk 'BEGIN { printf("%c%c%c%c",7,0,0,0); exit; }' > "${RES}.dbtype" +} + notExists() { [ ! -f "$1" ] } @@ -28,15 +55,26 @@ STEP=0 while [ "$STEP" -lt "$NUM_IT" ]; do # call prefilter module if notExists "$TMP_PATH/pref_tmp_${STEP}.done"; then - PARAM="PREFILTER_PAR_$STEP" - eval TMP="\$$PARAM" + if [ "$PREFMODE" = "EXHAUSTIVE" ]; then + TMP="" + PREF="fake_pref" + elif [ "$PREFMODE" = "UNGAPPED" ]; then + PARAM="UNGAPPEDPREFILTER_PAR_$STEP" + eval TMP="\$$PARAM" + PREF="${MMSEQS} ungappedprefilter" + else + PARAM="PREFILTER_PAR_$STEP" + eval TMP="\$$PARAM" + PREF="${MMSEQS} prefilter" + fi + if [ "$STEP" -eq 0 ]; then # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$QUERYDB" "$2" "$TMP_PATH/pref_$STEP" ${TMP} \ + $RUNNER $PREF "$QUERYDB" "$2" "$TMP_PATH/pref_$STEP" ${TMP} \ || fail "Prefilter died" else # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$QUERYDB" "$2" "$TMP_PATH/pref_tmp_$STEP" ${TMP} \ + $RUNNER $PREF "$QUERYDB" "$2" "$TMP_PATH/pref_tmp_$STEP" ${TMP} \ || fail "Prefilter died" fi touch "$TMP_PATH/pref_tmp_${STEP}.done" diff --git a/lib/mmseqs/data/workflow/createtaxdb.sh b/lib/mmseqs/data/workflow/createtaxdb.sh index c1adce1..66fb7f9 100755 --- a/lib/mmseqs/data/workflow/createtaxdb.sh +++ b/lib/mmseqs/data/workflow/createtaxdb.sh @@ -59,7 +59,7 @@ if { [ "${DBMODE}" = "1" ] && notExists "${TAXDBNAME}_taxonomy"; } || { [ "${DBM # Download NCBI taxon information if notExists "${TMP_PATH}/ncbi_download.complete"; then echo "Download taxdump.tar.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz" tar -C "${TMP_PATH}" -xzf "${TMP_PATH}/taxdump.tar.gz" names.dmp nodes.dmp merged.dmp delnodes.dmp touch "${TMP_PATH}/ncbi_download.complete" rm -f "${TMP_PATH}/taxdump.tar.gz" diff --git a/lib/mmseqs/data/workflow/databases.sh b/lib/mmseqs/data/workflow/databases.sh index 2fc6685..61704ec 100644 --- a/lib/mmseqs/data/workflow/databases.sh +++ b/lib/mmseqs/data/workflow/databases.sh @@ -118,9 +118,9 @@ case "${SELECTION}" in if notExists "${TMP_PATH}/nr.gz"; then date "+%s" > "${TMP_PATH}/version" downloadFile "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz" "${TMP_PATH}/nr.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz" gunzip "${TMP_PATH}/prot.accession2taxid.gz" - downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz" gunzip "${TMP_PATH}/pdb.accession2taxid.gz" fi push_back "${TMP_PATH}/nr.gz" @@ -147,7 +147,7 @@ case "${SELECTION}" in "PDB") if notExists "${TMP_PATH}/pdb_seqres.txt.gz"; then date "+%s" > "${TMP_PATH}/version" - downloadFile "https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" "${TMP_PATH}/pdb_seqres.txt.gz" + downloadFile "https://files.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" "${TMP_PATH}/pdb_seqres.txt.gz" fi push_back "${TMP_PATH}/pdb_seqres.txt.gz" INPUT_TYPE="FASTA_LIST" @@ -212,8 +212,8 @@ case "${SELECTION}" in ;; "CDD") if notExists "${TMP_PATH}/msa.msa.gz"; then - downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version" - downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version" + downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz" fi INPUT_TYPE="FASTA_MSA" SED_FIX_LOOKUP='s|\.FASTA||g' diff --git a/lib/mmseqs/data/workflow/taxpercontig.sh b/lib/mmseqs/data/workflow/taxpercontig.sh index 299ac36..f350df2 100644 --- a/lib/mmseqs/data/workflow/taxpercontig.sh +++ b/lib/mmseqs/data/workflow/taxpercontig.sh @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then fi if notExists "${TMP_PATH}/orfs_aln.list"; then - awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list" + # shellcheck disable=SC2086 + "$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \ + || fail "recoverlongestorf died" + awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list" + cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list" fi if notExists "${TMP_PATH}/orfs_filter.dbtype"; then diff --git a/lib/mmseqs/data/workflow/tsv2exprofiledb.sh b/lib/mmseqs/data/workflow/tsv2exprofiledb.sh index 3323410..e2b8237 100644 --- a/lib/mmseqs/data/workflow/tsv2exprofiledb.sh +++ b/lib/mmseqs/data/workflow/tsv2exprofiledb.sh @@ -22,19 +22,19 @@ fi if notExists "${OUT}.dbtype"; then "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY} "$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY} fi if notExists "${OUT}_seq.dbtype"; then "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY} "$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY} fi if notExists "${OUT}_aln.dbtype"; then "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY} - MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY} + MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY} "$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY} fi diff --git a/lib/mmseqs/lib/alp/sls_pvalues.hpp b/lib/mmseqs/lib/alp/sls_pvalues.hpp index 023cd47..ab5d950 100755 --- a/lib/mmseqs/lib/alp/sls_pvalues.hpp +++ b/lib/mmseqs/lib/alp/sls_pvalues.hpp @@ -258,7 +258,7 @@ namespace Sls { return rand_C; }; - static inline double standard_normal()//generates standard normal random value using the Box–Muller transform + static inline double standard_normal()//generates standard normal random value using the Box-Muller transform { double r1=0; while(r1==0) diff --git a/lib/mmseqs/lib/ksw2/kseq.h b/lib/mmseqs/lib/ksw2/kseq.h index 759d06a..749bb46 100644 --- a/lib/mmseqs/lib/ksw2/kseq.h +++ b/lib/mmseqs/lib/ksw2/kseq.h @@ -113,9 +113,10 @@ typedef struct __kstring_t { if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } else break; \ } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') { ks->newline+=(append == 1); break; } \ + if (delimiter == KS_SEP_LINE) { \ + unsigned char *sep = (unsigned char*)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \ + i = sep != NULL ? sep - (unsigned char*)ks->buf : ks->end; \ + ks->newline += (sep != NULL && append == 1); \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ diff --git a/lib/mmseqs/src/CMakeLists.txt b/lib/mmseqs/src/CMakeLists.txt index 27f6a69..56715c6 100644 --- a/lib/mmseqs/src/CMakeLists.txt +++ b/lib/mmseqs/src/CMakeLists.txt @@ -76,8 +76,8 @@ if (NOT EMSCRIPTEN) endif() if (ENABLE_WERROR) - append_target_property(mmseqs-framework COMPILE_FLAGS -Werror) - append_target_property(mmseqs-framework LINK_FLAGS -Werror) + append_target_property(mmseqs-framework COMPILE_FLAGS -Werror -Wno-unused-command-line-argument) + append_target_property(mmseqs-framework LINK_FLAGS -Werror -Wno-unused-command-line-argument) endif() # needed for concat.h @@ -222,6 +222,7 @@ if (OPENMP_FOUND) if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) endif() + target_include_directories(mmseqs-framework PUBLIC ${OpenMP_CXX_INCLUDE_DIRS}) append_target_property(mmseqs-framework COMPILE_FLAGS ${OpenMP_CXX_FLAGS}) append_target_property(mmseqs-framework LINK_FLAGS ${OpenMP_CXX_FLAGS}) elseif (REQUIRE_OPENMP) diff --git a/lib/mmseqs/src/CommandDeclarations.h b/lib/mmseqs/src/CommandDeclarations.h index 5036634..3c4abb4 100644 --- a/lib/mmseqs/src/CommandDeclarations.h +++ b/lib/mmseqs/src/CommandDeclarations.h @@ -23,6 +23,7 @@ extern int convertkb(int argc, const char **argv, const Command& command); extern int convertmsa(int argc, const char **argv, const Command& command); extern int convertprofiledb(int argc, const char **argv, const Command& command); extern int createdb(int argc, const char **argv, const Command& command); +extern int makepaddedseqdb(int argc, const char **argv, const Command& command); extern int createindex(int argc, const char **argv, const Command& command); extern int createlinindex(int argc, const char **argv, const Command& command); extern int createseqfiledb(int argc, const char **argv, const Command& command); @@ -97,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command extern int gappedprefilter(int argc, const char **argv, const Command& command); extern int unpackdb(int argc, const char **argv, const Command& command); extern int rbh(int argc, const char **argv, const Command& command); +extern int recoverlongestorf(int argc, const char **argv, const Command& command); extern int result2flat(int argc, const char **argv, const Command& command); extern int result2msa(int argc, const char **argv, const Command& command); extern int result2dnamsa(int argc, const char **argv, const Command& command); diff --git a/lib/mmseqs/src/MMseqsBase.cpp b/lib/mmseqs/src/MMseqsBase.cpp index a8b8a84..720aeac 100644 --- a/lib/mmseqs/src/MMseqsBase.cpp +++ b/lib/mmseqs/src/MMseqsBase.cpp @@ -39,9 +39,9 @@ std::vector baseCommands = { "Slower, sensitive clustering", "mmseqs easy-cluster examples/DB.fasta result tmp\n" "# Cluster output\n" - "# - result_rep_seq.fasta: Representatives\n" - "# - result_all_seq.fasta: FASTA-like per cluster\n" - "# - result_cluster.tsv: Adjacency list\n\n" + "# - result_rep_seq.fasta: Representatives\n" + "# - result_all_seqs.fasta: FASTA-like per cluster\n" + "# - result_cluster.tsv: Adjacency list\n\n" "# Important parameter: --min-seq-id, --cov-mode and -c \n" "# --cov-mode \n" "# 0 1 2\n" @@ -62,9 +62,9 @@ std::vector baseCommands = { "Fast linear time cluster, less sensitive clustering", "mmseqs easy-linclust examples/DB.fasta result tmp\n\n" "# Linclust output\n" - "# - result_rep_seq.fasta: Representatives\n" - "# - result_all_seq.fasta: FASTA-like per cluster\n" - "# - result_cluster.tsv: Adjecency list\n\n" + "# - result_rep_seq.fasta: Representatives\n" + "# - result_all_seqs.fasta: FASTA-like per cluster\n" + "# - result_cluster.tsv: Adjecency list\n\n" "# Important parameter: --min-seq-id, --cov-mode and -c \n" "# --cov-mode \n" "# 0 1 2\n" @@ -130,6 +130,13 @@ std::vector baseCommands = { " ... | ", CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, + {"makepaddedseqdb", makepaddedseqdb, &par.onlyverbosity, COMMAND_HIDDEN, + "Generate a padded sequence DB", + "Generate a padded sequence DB", + "Martin Steinegger ", + " ", + CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb }, + {"sequenceIndexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}}, {"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN, NULL, NULL, @@ -137,7 +144,7 @@ std::vector baseCommands = { " ... ", CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb }, {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}}, - {"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN, + {"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN, NULL, NULL, "Martin Steinegger ", @@ -555,7 +562,7 @@ std::vector baseCommands = { {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"pvalDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, - {"mergeresultsbyset", mergeresultsbyset, &par.threadsandcompression,COMMAND_MULTIHIT, + {"mergeresultsbyset", mergeresultsbyset, &par.mergeresultsbyset, COMMAND_MULTIHIT, "Merge results from multiple ORFs back to their respective contig", NULL, "Ruoshi Zhang, Clovis Norroy & Milot Mirdita ", @@ -900,6 +907,14 @@ std::vector baseCommands = { "Eli Levy Karin ", " ", CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + {"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT, + "Recover longest ORF for taxonomy annotation after elimination", + NULL, + "Sung-eun Jang", + " ", + CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, + {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb}, + {"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE, "Reverse (without complement) sequences", NULL, diff --git a/lib/mmseqs/src/alignment/Matcher.cpp b/lib/mmseqs/src/alignment/Matcher.cpp index ae1d8c8..3b83bf5 100644 --- a/lib/mmseqs/src/alignment/Matcher.cpp +++ b/lib/mmseqs/src/alignment/Matcher.cpp @@ -282,7 +282,7 @@ size_t Matcher::resultToBuffer(char * buff1, const result_t &result, bool addBac *(tmpBuff-1) = '\t'; tmpBuff = Util::fastSeqIdToBuffer(result.seqId, tmpBuff); *(tmpBuff-1) = '\t'; - tmpBuff += sprintf(tmpBuff,"%.3E",result.eval); + tmpBuff += snprintf(tmpBuff, 32, "%.3E", result.eval); tmpBuff++; *(tmpBuff-1) = '\t'; tmpBuff = Itoa::i32toa_sse2(result.qStartPos, tmpBuff); diff --git a/lib/mmseqs/src/alignment/MultipleAlignment.cpp b/lib/mmseqs/src/alignment/MultipleAlignment.cpp index 7206f88..efe9575 100644 --- a/lib/mmseqs/src/alignment/MultipleAlignment.cpp +++ b/lib/mmseqs/src/alignment/MultipleAlignment.cpp @@ -47,17 +47,15 @@ void MultipleAlignment::computeQueryGaps(unsigned int *queryGaps, Sequence *cent for(size_t i = 0; i < alignmentResults.size(); i++) { const Matcher::result_t& alignment = alignmentResults[i]; const std::string& bt = alignment.backtrace; - size_t queryPos = 0; - size_t targetPos = 0; size_t currentQueryGapSize = 0; - queryPos = alignment.qStartPos; - targetPos = alignment.dbStartPos; + size_t queryPos = alignment.qStartPos; + // size_t targetPos = alignment.dbStartPos; // compute query gaps (deletions) for (size_t pos = 0; pos < bt.size(); ++pos) { char bt_letter = bt.at(pos); if (bt_letter == 'M') { // match state ++queryPos; - ++targetPos; + // ++targetPos; currentQueryGapSize = 0; } else { if (bt_letter == 'I') { // insertion @@ -65,7 +63,7 @@ void MultipleAlignment::computeQueryGaps(unsigned int *queryGaps, Sequence *cent currentQueryGapSize = 0; } else { // deletion - ++targetPos; + // ++targetPos; currentQueryGapSize += 1; size_t gapCount = queryGaps[queryPos]; queryGaps[queryPos] = std::max(gapCount, currentQueryGapSize); diff --git a/lib/mmseqs/src/alignment/PSSMCalculator.cpp b/lib/mmseqs/src/alignment/PSSMCalculator.cpp index ab550b5..79bafaf 100644 --- a/lib/mmseqs/src/alignment/PSSMCalculator.cpp +++ b/lib/mmseqs/src/alignment/PSSMCalculator.cpp @@ -92,7 +92,7 @@ PSSMCalculator::~PSSMCalculator() { // PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize, // size_t queryLength, // const char **msaSeqs, -// bool wg) { +// bool wg, 0.0) { // increaseSetSize(setSize); // // Quick and dirty calculation of the weight per sequence wg[k] // computeSequenceWeights(seqWeight, queryLength, setSize, msaSeqs); @@ -135,10 +135,10 @@ PSSMCalculator::~PSSMCalculator() { //// PSSMCalculator::printPSSM(queryLength); // } // this overload is only used in TestProfileAlignment.cpp and TestPSSM.cpp -PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, bool wg) { +PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, bool wg, float scoreBias) { #ifdef GAP_POS_SCORING std::vector dummy; - return computePSSMFromMSA(setSize, queryLength, msaSeqs, dummy, wg); + return computePSSMFromMSA(setSize, queryLength, msaSeqs, dummy, wg, scoreBias); } #endif @@ -146,7 +146,8 @@ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize, size_ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA( size_t setSize, size_t queryLength, const char **msaSeqs, const std::vector &alnResults, - bool wg + bool wg, + float scoreBias ) { #endif increaseSetSize(setSize); @@ -193,7 +194,7 @@ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA( // PSSMCalculator::printPSSM(queryLength); // create final Matrix - computeLogPSSM(subMat, pssm, profile, 8.0, queryLength, 0.0); + computeLogPSSM(subMat, pssm, profile, 8.0, queryLength, scoreBias); // PSSMCalculator::printProfile(queryLength); // PSSMCalculator::printPSSM(queryLength); #ifdef GAP_POS_SCORING @@ -244,7 +245,7 @@ void PSSMCalculator::computeLogPSSM(BaseMatrix *subMat, char *pssm, const float const float aaProb = profile[pos * Sequence::PROFILE_AA_SIZE + aa]; const unsigned int idx = pos * Sequence::PROFILE_AA_SIZE + aa; float logProb = MathUtil::flog2(aaProb / subMat->pBack[aa]); - float pssmVal = bitFactor * logProb + scoreBias; + float pssmVal = bitFactor * logProb + bitFactor * scoreBias; pssmVal = static_cast((pssmVal < 0.0) ? pssmVal - 0.5 : pssmVal + 0.5); float truncPssmVal = std::min(pssmVal, 127.0f); truncPssmVal = std::max(-128.0f, truncPssmVal); diff --git a/lib/mmseqs/src/alignment/PSSMCalculator.h b/lib/mmseqs/src/alignment/PSSMCalculator.h index 579433c..146475b 100644 --- a/lib/mmseqs/src/alignment/PSSMCalculator.h +++ b/lib/mmseqs/src/alignment/PSSMCalculator.h @@ -47,9 +47,9 @@ class PSSMCalculator { ~PSSMCalculator(); - Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, bool wg); + Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, bool wg, float scoreBias); #ifdef GAP_POS_SCORING - Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, const std::vector &alnResults, bool wg); + Profile computePSSMFromMSA(size_t setSize, size_t queryLength, const char **msaSeqs, const std::vector &alnResults, bool wg, float scoreBias); #endif void printProfile(size_t queryLength); diff --git a/lib/mmseqs/src/commons/Application.cpp b/lib/mmseqs/src/commons/Application.cpp index 9ff319f..bfa921b 100644 --- a/lib/mmseqs/src/commons/Application.cpp +++ b/lib/mmseqs/src/commons/Application.cpp @@ -16,26 +16,33 @@ extern const char *show_extended_help; extern const char *show_bash_info; extern bool hide_base_commands; -extern std::vector commands; -extern std::vector baseCommands; +extern std::vector*> commands; extern std::vector categories; extern void (*validatorUpdate)(void); - -Command *getCommandByName(const char *s) { - for (size_t i = 0; i < commands.size(); i++) { - Command &p = commands[i]; - if (!strcmp(s, p.cmd)) - return &p; - } - for (size_t i = 0; i < baseCommands.size(); i++) { - Command &p = baseCommands[i]; - if (!strcmp(s, p.cmd)) - return &p; +extern void (*initCommands)(void); + +const Command* getCommandByName(const char *s) { + // allow base commands also to be called with a prefix, e.g. "mmseqs base:createdb" + // this allows inheriting programs to find shadowed base modules + const char *prefix = "base:"; + const char *check = strncmp(s, prefix, strlen(prefix)) == 0 ? s + strlen(prefix) : s; + + for (size_t i = commands.size() - 1; true; i--) { + for (size_t j = 0; j < commands[i]->size(); j++) { + const Command &p = (*commands[i])[j]; + if (!strcmp(s, p.cmd)) + return &p; + if (i == 0 && !strcmp(check, p.cmd)) + return &p; + } + if (i == 0) { + break; + } } return NULL; } -int runCommand(Command *p, int argc, const char **argv) { +int runCommand(const Command *p, int argc, const char **argv) { Timer timer; int status = p->commandFunction(argc, argv, *p); Debug(Debug::INFO) << "Time for processing: " << timer.lap() << "\n"; @@ -52,20 +59,19 @@ void printUsage(bool showExtended) { std::vector showCategoryHeader(categories.size(), 0); for (size_t i = 0; i < categories.size(); ++i) { - for (size_t j = 0; j < commands.size(); j++) { - Command &p = commands[j]; - if (p.mode & categories[i].mode) { - showCategoryHeader[i] = 1; - break; - } - } + size_t start = 0; if (hide_base_commands) { - continue; - } - for (size_t j = 0; j < baseCommands.size(); j++) { - Command &p = baseCommands[j]; - if (p.mode & categories[i].mode) { - showCategoryHeader[i] = 1; + start = commands.size() - 1; + } + for (size_t j = commands.size() - 1; j >= start; j--) { + for (size_t k = 0; k < commands[j]->size(); k++) { + const Command &p = (*commands[j])[k]; + if (p.mode & categories[i].mode) { + showCategoryHeader[i] = 1; + break; + } + } + if (j == 0) { break; } } @@ -87,25 +93,22 @@ void printUsage(bool showExtended) { } usage << "\n" << std::setw(20) << categories[i].title << "\n"; - for (size_t j = 0; j < commands.size(); j++) { - struct Command &p = commands[j]; - if (showExtended == false && (p.mode & COMMAND_EXPERT) != 0) { - continue; - } - if (p.mode & categories[i].mode) { - usage << std::left << std::setw(20) << " " + std::string(p.cmd) << "\t" << p.description << "\n"; - } - } + size_t start = 0; if (hide_base_commands) { - continue; + start = commands.size() - 1; } - for (size_t j = 0; j < baseCommands.size(); j++) { - struct Command &p = baseCommands[j]; - if (showExtended == false && (p.mode & COMMAND_EXPERT) != 0) { - continue; + for (size_t j = commands.size() - 1; j >= start; j--) { + for (size_t k = 0; k < commands[j]->size(); k++) { + const Command &p = (*commands[j])[k]; + if (showExtended == false && (p.mode & COMMAND_EXPERT) != 0) { + continue; + } + if (p.mode & categories[i].mode) { + usage << std::left << std::setw(20) << " " + std::string(p.cmd) << "\t" << p.description << "\n"; + } } - if (p.mode & categories[i].mode) { - usage << std::left << std::setw(20) << " " + std::string(p.cmd) << "\t" << p.description << "\n"; + if (j == 0) { + break; } } } @@ -124,55 +127,48 @@ void printUsage(bool showExtended) { int shellcompletion(int argc, const char **argv) { // mmseqs programs if (argc == 0) { - for (size_t i = 0; i < commands.size(); i++) { - struct Command &p = commands[i]; - if (p.mode & COMMAND_HIDDEN) - continue; - Debug(Debug::INFO) << p.cmd << " "; - } - if (hide_base_commands == false) { - for (size_t i = 0; i < baseCommands.size(); i++) { - struct Command &p = baseCommands[i]; + size_t start = 0; + if (hide_base_commands) { + start = commands.size() - 1; + } + for (size_t i = commands.size() - 1; i >= start; i--) { + for (size_t j = 0; j < commands[i]->size(); j++) { + const Command &p = (*commands[i])[j]; if (p.mode & COMMAND_HIDDEN) continue; Debug(Debug::INFO) << p.cmd << " "; } + if (i == 0) { + break; + } } Debug(Debug::INFO) << "\n"; } // mmseqs parameters for given program if (argc == 1) { - for (size_t i = 0; i < commands.size(); i++) { - struct Command &p = commands[i]; - if (strcmp(p.cmd, argv[0]) != 0) { - continue; - } - if (p.params == NULL) { - continue; - } - for (std::vector::const_iterator it = p.params->begin(); it != p.params->end(); ++it) { - Debug(Debug::INFO) << (*it)->name << " "; - } - Debug(Debug::INFO) << "\n"; - break; + size_t start = 0; + if (hide_base_commands) { + start = commands.size() - 1; } - if (hide_base_commands == false) { - for (size_t i = 0; i < baseCommands.size(); i++) { - struct Command &p = baseCommands[i]; + for (size_t i = commands.size() - 1; i >= start; i--) { + for (size_t j = 0; j < commands[i]->size(); j++) { + const Command &p = (*commands[i])[j]; if (strcmp(p.cmd, argv[0]) != 0) { continue; } if (p.params == NULL) { continue; } - for (std::vector::const_iterator it = p.params->begin(); - it != p.params->end(); ++it) { + for (std::vector::const_iterator it = p.params->begin(); it != p.params->end(); ++it) { Debug(Debug::INFO) << (*it)->name << " "; } Debug(Debug::INFO) << "\n"; break; } + if (i == 0) { + break; + } } Debug(Debug::INFO) << "\n"; } @@ -180,6 +176,10 @@ int shellcompletion(int argc, const char **argv) { } int main(int argc, const char **argv) { + if (initCommands != NULL) { + initCommands(); + } + if (argc < 2) { printUsage(false); return EXIT_SUCCESS; @@ -196,7 +196,7 @@ int main(int argc, const char **argv) { FileUtil::fixRlimitNoFile(); setenv("MMSEQS", argv[0], true); - Command *c = NULL; + const Command *c = NULL; if (strncmp(argv[1], "shellcompletion", strlen("shellcompletion")) == 0) { return shellcompletion(argc - 2, argv + 2); } else if ((c = getCommandByName(argv[1])) != NULL) { @@ -206,37 +206,34 @@ int main(int argc, const char **argv) { Debug(Debug::INFO) << "\nInvalid Command: " << argv[1] << "\n"; // Suggest some command that the user might have meant - size_t index = SIZE_MAX; + size_t indexI = SIZE_MAX; + size_t indexJ = SIZE_MAX; int maxDistance = 0; - for (size_t i = 0; i < commands.size(); ++i) { - struct Command &p = commands[i]; - if (p.mode & COMMAND_HIDDEN) { - continue; - } - - int distance = DistanceCalculator::localLevenshteinDistance(argv[1], p.cmd); - if (distance > maxDistance) { - maxDistance = distance; - index = i; - } + size_t start = 0; + if (hide_base_commands) { + start = commands.size() - 1; } - - if (hide_base_commands == false) { - for (size_t i = 0; i < baseCommands.size(); ++i) { - struct Command &p = baseCommands[i]; - if (p.mode & COMMAND_HIDDEN) + for (size_t i = commands.size() - 1; i >= start; i--) { + for (size_t j = 0; j < commands[i]->size(); j++) { + const Command &p = (*commands[i])[j]; + if (p.mode & COMMAND_HIDDEN) { continue; + } int distance = DistanceCalculator::localLevenshteinDistance(argv[1], p.cmd); if (distance > maxDistance) { maxDistance = distance; - index = i; + indexI = i; + indexJ = j; } } + if (i == 0) { + break; + } } - if (index != SIZE_MAX) { - Debug(Debug::WARNING) << "Did you mean \"" << argv[0] << " " << baseCommands[index].cmd << "\"?\n"; + if (indexI != SIZE_MAX && indexJ != SIZE_MAX) { + Debug(Debug::WARNING) << "Did you mean \"" << argv[0] << " " << (*commands[indexI])[indexJ].cmd << "\"?\n"; } return EXIT_FAILURE; diff --git a/lib/mmseqs/src/commons/CSProfile.cpp b/lib/mmseqs/src/commons/CSProfile.cpp index 6acf8da..5b2fc6f 100644 --- a/lib/mmseqs/src/commons/CSProfile.cpp +++ b/lib/mmseqs/src/commons/CSProfile.cpp @@ -135,12 +135,12 @@ void ContextLibrary::readContextProfile(std::stringstream &in, LibraryReader &re } // Calculate maximum of pseudocount weights double max = -DBL_MAX; - double mean = 0.0; + // double mean = 0.0; for (size_t a = 0; a < 20; ++a) { - mean += pc_weight[a]; + // mean += pc_weight[a]; if (pc_weight[a] > max) max = pc_weight[a]; } - mean /= 20.0; + // mean /= 20.0; // Rescale pseudocount weights and calculate their sum in lin-space long double sum = 0.0; diff --git a/lib/mmseqs/src/commons/Command.cpp b/lib/mmseqs/src/commons/Command.cpp index 677f46a..ac6a8b1 100644 --- a/lib/mmseqs/src/commons/Command.cpp +++ b/lib/mmseqs/src/commons/Command.cpp @@ -1,6 +1,11 @@ #include "Command.h" #include "Parameters.h" +std::vector*> commands; +void registerCommands(std::vector* cmd) { + commands.emplace_back(cmd); +} + std::vector categories = { {"Easy workflows for plain text input/output", COMMAND_EASY}, {"Main workflows for database input/output", COMMAND_MAIN}, diff --git a/lib/mmseqs/src/commons/Command.h b/lib/mmseqs/src/commons/Command.h index 5c5fd1a..308309e 100644 --- a/lib/mmseqs/src/commons/Command.h +++ b/lib/mmseqs/src/commons/Command.h @@ -108,4 +108,6 @@ struct Categories { CommandMode mode; }; +void registerCommands(std::vector* cmd); + #endif diff --git a/lib/mmseqs/src/commons/Parameters.cpp b/lib/mmseqs/src/commons/Parameters.cpp index 30a8143..afc56ba 100644 --- a/lib/mmseqs/src/commons/Parameters.cpp +++ b/lib/mmseqs/src/commons/Parameters.cpp @@ -102,7 +102,7 @@ Parameters::Parameters(): PARAM_V(PARAM_V_ID, "-v", "Verbosity", "Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info", typeid(int), (void *) &verbosity, "^[0-3]{1}$", MMseqsParameter::COMMAND_COMMON), // convertalignments PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID, "--format-mode", "Alignment format", "Output format:\n0: BLAST-TAB\n1: SAM\n2: BLAST-TAB + query/db length\n3: Pretty HTML\n4: BLAST-TAB + column headers\nBLAST-TAB (0) and BLAST-TAB + column headers (4) support custom output formats (--format-output)", typeid(int), (void *) &formatAlignmentMode, "^[0-4]{1}$"), - PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID, "--format-output", "Format alignment output", "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,qorfstart,qorfend,torfstart,torfend", typeid(std::string), (void *) &outfmt, ""), + PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID, "--format-output", "Format alignment output", "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,fident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage,qorfstart,qorfend,torfstart,torfend,ppos", typeid(std::string), (void *) &outfmt, ""), PARAM_DB_OUTPUT(PARAM_DB_OUTPUT_ID, "--db-output", "Database output", "Return a result DB instead of a text file", typeid(bool), (void *) &dbOut, "", MMseqsParameter::COMMAND_EXPERT), // --include-only-extendablediagonal PARAM_RESCORE_MODE(PARAM_RESCORE_MODE_ID, "--rescore-mode", "Rescore mode", "Rescore diagonals with:\n0: Hamming distance\n1: local alignment (score only)\n2: local alignment\n3: global alignment\n4: longest alignment fulfilling window quality criterion", typeid(int), (void *) &rescoreMode, "^[0-4]{1}$"), @@ -163,6 +163,7 @@ Parameters::Parameters(): PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Search iterations", "Number of iterative profile search iterations", typeid(int), (void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE), PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity", "Start sensitivity", typeid(float), (void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps", "Number of search steps performed from --start-sens to -s", typeid(int), (void *) &sensSteps, "^[1-9]{1}$"), + PARAM_PREF_MODE(PARAM_PREF_MODE_ID,"--prefilter-mode", "Prefilter mode", "prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter",typeid(int), (void *) &prefMode, "^[0-2]{1}$"), PARAM_EXHAUSTIVE_SEARCH(PARAM_EXHAUSTIVE_SEARCH_ID, "--exhaustive-search", "Exhaustive search mode", "For bigger profile DB, run iteratively the search by greedily swapping the search results", typeid(bool), (void *) &exhaustiveSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_EXHAUSTIVE_SEARCH_FILTER(PARAM_EXHAUSTIVE_SEARCH_FILTER_ID, "--exhaustive-search-filter", "Filter results during exhaustive search", "Filter result during search: 0: do not filter, 1: filter", typeid(int), (void *) &exhaustiveFilterMsa, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), @@ -282,7 +283,7 @@ Parameters::Parameters(): // taxonomyreport PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"), // createtaxdb - PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""), + PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""), PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomy mapping file", "File to map sequence identifier to taxonomical identifier", typeid(std::string), (void *) &taxMappingFile, ""), PARAM_TAX_MAPPING_MODE(PARAM_TAX_MAPPING_MODE_ID, "--tax-mapping-mode", "Taxonomy mapping mode", "Map taxonomy based on sequence database 0: .lookup file 1: .source file", typeid(int), (void *) &taxMappingMode, "^[0-1]{1}$"), PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"), @@ -891,6 +892,11 @@ Parameters::Parameters(): combinepvalbyset.push_back(&PARAM_COMPRESSED); combinepvalbyset.push_back(&PARAM_V); + // mergeresultsbyset + mergeresultsbyset.push_back(&PARAM_PRELOAD_MODE); + mergeresultsbyset.push_back(&PARAM_THREADS); + mergeresultsbyset.push_back(&PARAM_COMPRESSED); + mergeresultsbyset.push_back(&PARAM_V); // offsetalignment offsetalignment.push_back(&PARAM_CHAIN_ALIGNMENT); @@ -1253,6 +1259,7 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_NUM_ITERATIONS); searchworkflow.push_back(&PARAM_START_SENS); searchworkflow.push_back(&PARAM_SENS_STEPS); + searchworkflow.push_back(&PARAM_PREF_MODE); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH_FILTER); searchworkflow.push_back(&PARAM_STRAND); @@ -1631,7 +1638,6 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c } } - size_t parametersFound = 0; for (int argIdx = 0; argIdx < argc; argIdx++) { // it is a parameter if it starts with - or -- const bool longParameter = (pargv[argIdx][0] == '-' && pargv[argIdx][1] == '-'); @@ -1848,8 +1854,6 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c EXIT(EXIT_FAILURE); } - - parametersFound++; } else { // parameter is actually a filename #ifdef __CYGWIN__ @@ -2263,6 +2267,7 @@ void Parameters::setDefaults() { // search workflow numIterations = 1; + prefMode = PREF_MODE_KMER; startSens = 4; sensSteps = 1; exhaustiveSearch = false; @@ -2573,6 +2578,9 @@ void Parameters::setDefaults() { taxonomySearchMode = Parameters::TAXONOMY_APPROX_2BLCA; taxonomyOutputMode = Parameters::TAXONOMY_OUTPUT_LCA; + // help + help = 0; + // substituion matrix substitutionMatrices = { {"nucleotide.out", nucleotide_out, nucleotide_out_len }, @@ -2831,6 +2839,7 @@ std::vector Parameters::getOutputFormat(int formatMode, const std::string & else if (outformatSplit[i].compare("qorfend") == 0){ code = Parameters::OUTFMT_QORFEND;} else if (outformatSplit[i].compare("torfstart") == 0){ code = Parameters::OUTFMT_TORFSTART;} else if (outformatSplit[i].compare("torfend") == 0){ code = Parameters::OUTFMT_TORFEND;} + else if (outformatSplit[i].compare("ppos") == 0){ needSequences = true; needBacktrace = true; code = Parameters::OUTFMT_PPOS;} else if (outformatSplit[i].compare("empty") == 0){ code = Parameters::OUTFMT_EMPTY;} else { Debug(Debug::ERROR) << "Format code " << outformatSplit[i] << " does not exist."; diff --git a/lib/mmseqs/src/commons/Parameters.h b/lib/mmseqs/src/commons/Parameters.h index 131ac58..745d2f8 100644 --- a/lib/mmseqs/src/commons/Parameters.h +++ b/lib/mmseqs/src/commons/Parameters.h @@ -59,6 +59,8 @@ struct MMseqsParameter { } }; +void initParameterSingleton(void); +#define DEFAULT_PARAMETER_SINGLETON_INIT void initParameterSingleton() { new Parameters; } class Parameters { public: @@ -182,6 +184,7 @@ class Parameters { static const int OUTFMT_TORFSTART = 37; static const int OUTFMT_TORFEND = 38; static const int OUTFMT_FIDENT = 39; + static const int OUTFMT_PPOS = 40; static const int INDEX_SUBSET_NORMAL = 0; static const int INDEX_SUBSET_NO_HEADERS = 1; @@ -305,6 +308,11 @@ class Parameters { static const int ID_MODE_KEYS = 0; static const int ID_MODE_LOOKUP = 1; + // prefilter mode + static const int PREF_MODE_KMER = 0; + static const int PREF_MODE_UNGAPPED = 1; + static const int PREF_MODE_EXHAUSTIVE = 2; + // unpackdb static const int UNPACK_NAME_KEY = 0; static const int UNPACK_NAME_ACCESSION = 1; @@ -449,6 +457,7 @@ class Parameters { // SEARCH WORKFLOW int numIterations; + int prefMode; float startSens; int sensSteps; bool exhaustiveSearch; @@ -705,13 +714,11 @@ class Parameters { static Parameters& getInstance() { if (instance == NULL) { - initInstance(); + initParameterSingleton(); } return *instance; } - static void initInstance() { - new Parameters; - } + friend void initParameterSingleton(void); void setDefaults(); void initMatrices(); @@ -871,6 +878,7 @@ class Parameters { PARAMETER(PARAM_NUM_ITERATIONS) PARAMETER(PARAM_START_SENS) PARAMETER(PARAM_SENS_STEPS) + PARAMETER(PARAM_PREF_MODE) PARAMETER(PARAM_EXHAUSTIVE_SEARCH) PARAMETER(PARAM_EXHAUSTIVE_SEARCH_FILTER) PARAMETER(PARAM_STRAND) @@ -1158,6 +1166,7 @@ class Parameters { std::vector profile2seq; std::vector besthitbyset; std::vector combinepvalbyset; + std::vector mergeresultsbyset; std::vector multihitdb; std::vector multihitsearch; std::vector expandaln; diff --git a/lib/mmseqs/src/commons/ProfileStates.cpp b/lib/mmseqs/src/commons/ProfileStates.cpp index c2d21cd..922fde3 100644 --- a/lib/mmseqs/src/commons/ProfileStates.cpp +++ b/lib/mmseqs/src/commons/ProfileStates.cpp @@ -135,7 +135,7 @@ int ProfileStates::readProfile(std::stringstream &in, float * profile, float * pos += Util::skipWhitespace(pos); pos += Util::skipNoneWhitespace(pos); pos += Util::skipWhitespace(pos); - float s = 0.0; + // float s = 0.0; // Store the probabilities for (int k = 0 ; k < nalph ; k++) @@ -143,7 +143,7 @@ int ProfileStates::readProfile(std::stringstream &in, float * profile, float * float score = std::strtod(pos, NULL); float prob = MathUtil::fpow2(-score/kScale); profile[ProfileStates::hh2mmseqsAAorder(k)] = prob;// /background[k]; - s+=prob; + // s+=prob; char* oldPos = pos; pos += Util::skipNoneWhitespace(pos); pos += Util::skipWhitespace(pos); diff --git a/lib/mmseqs/src/commons/Sequence.cpp b/lib/mmseqs/src/commons/Sequence.cpp index 8bbdbb9..6461efc 100644 --- a/lib/mmseqs/src/commons/Sequence.cpp +++ b/lib/mmseqs/src/commons/Sequence.cpp @@ -346,6 +346,8 @@ void Sequence::printProfile() const { } #ifdef GAP_POS_SCORING printf("%3d %3d %3d\n", gDel[i] & 0xF, gDel[i] >> 4, gIns[i]); +else + printf("\n"); #endif } } diff --git a/lib/mmseqs/src/commons/SubstitutionMatrix.cpp b/lib/mmseqs/src/commons/SubstitutionMatrix.cpp index c9d31df..f15bc99 100644 --- a/lib/mmseqs/src/commons/SubstitutionMatrix.cpp +++ b/lib/mmseqs/src/commons/SubstitutionMatrix.cpp @@ -166,7 +166,7 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(short *profileS } void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores, - int N, size_t alphabetSize, BaseMatrix *subMat) { + unsigned int N, size_t alphabetSize, BaseMatrix *subMat) { const int windowSize = 40; @@ -176,32 +176,33 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *prof ProfileStates ps(alphabetSize,subMat->pBack); - for (int pos = 0; pos < N; pos++) { + for (unsigned int pos = 0; pos < N; pos++) { for(size_t aa = 0; aa < alphabetSize; aa++) { pnul[pos] += *(profileScores + pos + N*aa) * ps.prior[aa]; } } - for (int i = 0; i < N; i++){ - const int minPos = std::max(0, (i - windowSize/2)); - const int maxPos = std::min(N, (i + windowSize/2)); + for (unsigned int i = 0; i < N; i++){ + const int minPos = std::max(0, ((int)i - windowSize/2)); + const unsigned int maxPos = std::min(N, (i + windowSize/2)); const int windowLength = maxPos - minPos; // negative score for the amino acids in the neighborhood of i memset(aaSum, 0, sizeof(float) * alphabetSize); - for (int j = minPos; j < maxPos; j++){ - if( i == j ) + for (unsigned int j = minPos; j < maxPos; j++){ + if (i == j) { continue; - for(size_t aa = 0; aa < alphabetSize; aa++){ + } + for (size_t aa = 0; aa < alphabetSize; aa++) { aaSum[aa] += *(profileScores + aa*N + j) - pnul[j]; } } - for(size_t aa = 0; aa < alphabetSize; aa++) { + for (size_t aa = 0; aa < alphabetSize; aa++) { profileScores[i + aa*N] = static_cast(*(profileScores + i + N*aa) - aaSum[aa]/windowLength); } } - delete [] aaSum; - delete [] pnul; + delete[] aaSum; + delete[] pnul; } diff --git a/lib/mmseqs/src/commons/SubstitutionMatrix.h b/lib/mmseqs/src/commons/SubstitutionMatrix.h index cb705a6..16c66d9 100644 --- a/lib/mmseqs/src/commons/SubstitutionMatrix.h +++ b/lib/mmseqs/src/commons/SubstitutionMatrix.h @@ -28,7 +28,7 @@ class SubstitutionMatrix : public BaseMatrix { const int N, size_t alphabetSize); static void calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores, - int N, + unsigned int N, size_t alphabetSize, BaseMatrix *subMat); static void calcGlobalAaBiasCorrection(const BaseMatrix * m, diff --git a/lib/mmseqs/src/linclust/LinsearchIndexReader.cpp b/lib/mmseqs/src/linclust/LinsearchIndexReader.cpp index e2ca3ca..8f303c8 100644 --- a/lib/mmseqs/src/linclust/LinsearchIndexReader.cpp +++ b/lib/mmseqs/src/linclust/LinsearchIndexReader.cpp @@ -25,7 +25,6 @@ size_t LinsearchIndexReader::pickCenterKmer(KmerPosition *hashSeqPair, si prevHash = BIT_SET(prevHash, 63); } size_t prevHashStart = 0; - size_t prevSetSize = 0; for (size_t elementIdx = 0; elementIdx < splitKmerCount + 1; elementIdx++) { size_t currKmer = hashSeqPair[elementIdx].kmer; if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { @@ -52,7 +51,6 @@ size_t LinsearchIndexReader::pickCenterKmer(KmerPosition *hashSeqPair, si if (hashSeqPair[elementIdx].kmer == SIZE_T_MAX) { break; } - prevSetSize++; prevHash = hashSeqPair[elementIdx].kmer; if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { prevHash = BIT_SET(prevHash, 63); diff --git a/lib/mmseqs/src/mmseqs.cpp b/lib/mmseqs/src/mmseqs.cpp index cdbb313..e7a76d7 100644 --- a/lib/mmseqs/src/mmseqs.cpp +++ b/lib/mmseqs/src/mmseqs.cpp @@ -1,6 +1,7 @@ #include "Command.h" #include "DownloadDatabase.h" #include "Prefiltering.h" +#include "Parameters.h" const char* binary_name = "mmseqs"; const char* tool_name = "MMseqs2"; @@ -12,7 +13,15 @@ extern const char* MMSEQS_CURRENT_INDEX_VERSION; const char* index_version_compatible = MMSEQS_CURRENT_INDEX_VERSION; bool hide_base_commands = false; void (*validatorUpdate)(void) = 0; -std::vector commands = {}; + +extern std::vector baseCommands; +void init() { + registerCommands(&baseCommands); +} +void (*initCommands)(void) = init; + +DEFAULT_PARAMETER_SINGLETON_INIT + std::vector externalDownloads = {}; std::vector externalThreshold = {}; diff --git a/lib/mmseqs/src/multihit/combinepvalperset.cpp b/lib/mmseqs/src/multihit/combinepvalperset.cpp index 30f0ee8..37f8e8e 100644 --- a/lib/mmseqs/src/multihit/combinepvalperset.cpp +++ b/lib/mmseqs/src/multihit/combinepvalperset.cpp @@ -105,13 +105,13 @@ class PvalueAggregator : public Aggregation { return buffer; } - size_t k = 0; + // size_t k = 0; double r = 0; const double logPvalThr = log(pvalThreshold); for (size_t i = 0; i < dataToAggregate.size(); ++i) { double logPvalue = std::strtod(dataToAggregate[i][1].c_str(), NULL); if (logPvalue < logPvalThr) { - k++; + // k++; r -= logPvalue - logPvalThr; } } diff --git a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp index 102362b..139c8ab 100644 --- a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp +++ b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp @@ -36,7 +36,7 @@ CacheFriendlyOperations::~CacheFriendlyOperations(){ template size_t CacheFriendlyOperations::findDuplicates(IndexEntryLocal **input, CounterResult *output, - size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore) { + size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore) { do { setupBinPointer(); CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1; @@ -58,12 +58,16 @@ size_t CacheFriendlyOperations::mergeElementsByScore(CounterResult *inp } template -size_t CacheFriendlyOperations::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N) { +size_t CacheFriendlyOperations::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N, const bool keepScoredHits) { do { setupBinPointer(); hashElements(inputOutputArray, N); } while(checkForOverflowAndResizeArray(false) == true); // overflowed occurred - return mergeDiagonalDuplicates(inputOutputArray); + if(keepScoredHits){ + return mergeDiagonalKeepScoredHitsDuplicates(inputOutputArray); + }else{ + return mergeDiagonalDuplicates(inputOutputArray); + } } template @@ -93,6 +97,7 @@ size_t CacheFriendlyOperations::mergeDiagonalDuplicates(CounterResult * --n; } // combine diagonals + // we keep only the last diagonal element for (size_t n = 0; n < currBinSize; n++) { const CounterResult &element = binStartPos[n]; const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT); @@ -109,6 +114,40 @@ size_t CacheFriendlyOperations::mergeDiagonalDuplicates(CounterResult * return doubleElementCount; } + +template +size_t CacheFriendlyOperations::mergeDiagonalKeepScoredHitsDuplicates(CounterResult *output) { + size_t doubleElementCount = 0; + const CounterResult *bin_ref_pointer = binDataFrame; + // duplicateBitArray is already zero'd from findDuplicates + + for (size_t bin = 0; bin < BINCOUNT; bin++) { + const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize); + const size_t currBinSize = (bins[bin] - binStartPos); + // write diagonals + 1 in reverse order in the byte array + for (size_t n = 0; n < currBinSize; n++) { + const unsigned int element = binStartPos[n].id >> (MASK_0_5_BIT); + duplicateBitArray[element] = static_cast(binStartPos[n].diagonal) + 1; + } + // combine diagonals + // we keep only the last diagonal element + size_t n = currBinSize - 1; + while (n != static_cast(-1)) { + const CounterResult &element = binStartPos[n]; + const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT); + output[doubleElementCount].id = element.id; + output[doubleElementCount].count = element.count; + output[doubleElementCount].diagonal = element.diagonal; +// std::cout << output[doubleElementCount].id << " " << (int)output[doubleElementCount].count << " " << (int)static_cast(output[doubleElementCount].diagonal) << std::endl; + // memory overflow can not happen since input array = output array + doubleElementCount += (output[doubleElementCount].count != 0 || duplicateBitArray[hashBinElement] != static_cast(binStartPos[n].diagonal)) ? 1 : 0; + duplicateBitArray[hashBinElement] = static_cast(element.diagonal); + --n; + } + } + return doubleElementCount; +} + template size_t CacheFriendlyOperations::mergeScoreDuplicates(CounterResult *output) { size_t doubleElementCount = 0; @@ -211,12 +250,12 @@ size_t CacheFriendlyOperations::findDuplicates(CounterResult *output, s output[doubleElementCount].id = element; output[doubleElementCount].count = 0; output[doubleElementCount].diagonal = tmpElementBuffer[n].diagonal; - // const unsigned char diagonal = static_cast(tmpElementBuffer[n].diagonal); + // const unsigned char diagonal = static_cast(tmpElementBuffer[n].diagonal); // memory overflow can not happen since input array = output array - // if(duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal){ - // std::cout << "seq="<< output[doubleElementCount].id << "\tDiag=" << (int) output[doubleElementCount].diagonal - // << " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl; - // } + // if(duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal){ + // std::cout << "seq="<< output[doubleElementCount].id << "\tDiag=" << (int) output[doubleElementCount].diagonal + // << " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl; + // } doubleElementCount += (duplicateBitArray[hashBinElement] != static_cast(tmpElementBuffer[n].diagonal)) ? 1 : 0; duplicateBitArray[hashBinElement] = static_cast(tmpElementBuffer[n].diagonal); } diff --git a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.h b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.h index 1206e1f..efbfa75 100644 --- a/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.h +++ b/lib/mmseqs/src/prefiltering/CacheFriendlyOperations.h @@ -81,7 +81,7 @@ class CacheFriendlyOperations { size_t mergeElementsByScore(CounterResult *inputOutputArray, const size_t N); // merge elements in CounterResult by diagonal, combines elements with same ids that occur after each other - size_t mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N); + size_t mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N, const bool keepScoredHits = false); size_t keepMaxScoreElementOnly(CounterResult *inputOutputArray, const size_t N); @@ -124,6 +124,8 @@ class CacheFriendlyOperations { size_t mergeDiagonalDuplicates(CounterResult *output); + size_t mergeDiagonalKeepScoredHitsDuplicates(CounterResult *output); + size_t keepMaxElement(CounterResult *output); }; diff --git a/lib/mmseqs/src/prefiltering/IndexTable.h b/lib/mmseqs/src/prefiltering/IndexTable.h index 83fe285..cea7e26 100644 --- a/lib/mmseqs/src/prefiltering/IndexTable.h +++ b/lib/mmseqs/src/prefiltering/IndexTable.h @@ -264,14 +264,14 @@ class IndexTable { size_t entrySize = 0; size_t minKmer = 0; - size_t emptyKmer = 0; + // size_t emptyKmer = 0; for (size_t i = 0; i < tableSize; i++) { const ptrdiff_t size = offsets[i + 1] - offsets[i]; minKmer = std::min(minKmer, (size_t) size); entrySize += size; - if (size == 0) { - emptyKmer++; - } + // if (size == 0) { + // emptyKmer++; + // } if (((size_t) size) < topElements[top_N - 1].first) continue; for (size_t j = 0; j < top_N; j++) { diff --git a/lib/mmseqs/src/prefiltering/Prefiltering.cpp b/lib/mmseqs/src/prefiltering/Prefiltering.cpp index 9efabdb..14e8992 100644 --- a/lib/mmseqs/src/prefiltering/Prefiltering.cpp +++ b/lib/mmseqs/src/prefiltering/Prefiltering.cpp @@ -226,7 +226,7 @@ Prefiltering::Prefiltering(const std::string &queryDB, if (par.taxonList.length() > 0) { - taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList); + taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList, par.threads); } else { taxonomyHook = NULL; } diff --git a/lib/mmseqs/src/prefiltering/QueryMatcher.cpp b/lib/mmseqs/src/prefiltering/QueryMatcher.cpp index 395faba..4666bcf 100644 --- a/lib/mmseqs/src/prefiltering/QueryMatcher.cpp +++ b/lib/mmseqs/src/prefiltering/QueryMatcher.cpp @@ -97,7 +97,9 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned } else { memset(compositionBias, 0, sizeof(float) * querySeq->L); } - + if(diagonalScoring == true){ + ungappedAlignment->createProfile(querySeq, compositionBias); + } size_t resultSize = match(querySeq, compositionBias); if (hook != NULL) { resultSize = hook->afterDiagonalMatchingHook(*this, resultSize); @@ -105,7 +107,7 @@ std::pair QueryMatcher::matchQuery(Sequence *querySeq, unsigned std::pair queryResult; if (diagonalScoring) { // write diagonal scores in count value - ungappedAlignment->processQuery(querySeq, compositionBias, foundDiagonals, resultSize); + ungappedAlignment->align(foundDiagonals, resultSize); memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int)); CounterResult * resultReadPos = foundDiagonals; CounterResult * resultWritePos = foundDiagonals + resultSize; @@ -267,24 +269,17 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) { //std::cout << seq->getDbKey() << std::endl; //idx.printKmer(index[kmerPos], kmerSize, kmerSubMat->num2aa); //std::cout << "\t" << current_i << "\t"<< index[kmerPos] << std::endl; - //for (size_t i = 0; i < seqListSize; i++) { - // char diag = entries[i].position_j - current_i; - // std::cout << "(" << entries[i].seqId << " " << (int) diag << ")\t"; - //} +// for (size_t i = 0; i < seqListSize; i++) { +// if(23865 == entries[i].seqId ){ +// char diag = entries[i].position_j - current_i; +// std::cout << "(" << entries[i].seqId << " " << (int) diag << ")\t"; +// } +// } //std::cout << std::endl; // detected overflow while matching if ((sequenceHits + seqListSize) >= lastSequenceHit) { stats->diagonalOverflow = true; - // realloc foundDiagonals if only 10% of memory left - if((foundDiagonalsSize - overflowHitCount) < 0.1 * foundDiagonalsSize){ - foundDiagonalsSize *= 1.5; - foundDiagonals = (CounterResult*) realloc(foundDiagonals, foundDiagonalsSize * sizeof(CounterResult)); - if(foundDiagonals == NULL){ - Debug(Debug::ERROR) << "Out of memory in QueryMatcher::match\n"; - EXIT(EXIT_FAILURE); - } - } // last pointer indexPointer[current_i + 1] = sequenceHits; //std::cout << "Overflow in i=" << indexStart << std::endl; @@ -292,10 +287,19 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) { foundDiagonals + overflowHitCount, foundDiagonalsSize - overflowHitCount, indexStart, current_i, (diagonalScoring == false)); - + // this happens only if we have two overflows in a row if (overflowHitCount != 0) { - // merge lists, hitCount is max. dbSize so there can be no overflow in mergeElements - overflowHitCount = mergeElements(foundDiagonals, hitCount + overflowHitCount); + if(diagonalScoring == true){ + overflowHitCount = mergeElements(foundDiagonals, hitCount + overflowHitCount, true); + // align the new diaognals + ungappedAlignment->align(foundDiagonals, overflowHitCount); + // We keep only the maximal diagonal scoring hit, so the max number of hits is DBsize + overflowHitCount = keepMaxScoreElementOnly(foundDiagonals, overflowHitCount); + } else { + // in case of scoring we just sum up in mergeElements, so the max number of hits is DBsize + // merge lists, hitCount is max. dbSize so there can be no overflow in mergeElements + overflowHitCount = mergeElements(foundDiagonals, hitCount + overflowHitCount); + } } else { overflowHitCount = hitCount; } @@ -463,11 +467,11 @@ size_t QueryMatcher::findDuplicates(IndexEntryLocal **hitsByIndex, return localResultSize; } -size_t QueryMatcher::mergeElements(CounterResult *foundDiagonals, size_t hitCounter) { +size_t QueryMatcher::mergeElements(CounterResult *foundDiagonals, size_t hitCounter, bool keepScoredHits) { size_t overflowHitCount = 0; #define MERGE_CASE(x) \ case x: overflowHitCount = diagonalScoring ? \ - cachedOperation##x->mergeElementsByDiagonal(foundDiagonals,hitCounter) : \ + cachedOperation##x->mergeElementsByDiagonal(foundDiagonals,hitCounter, keepScoredHits) : \ cachedOperation##x->mergeElementsByScore(foundDiagonals,hitCounter); \ break; diff --git a/lib/mmseqs/src/prefiltering/QueryMatcher.h b/lib/mmseqs/src/prefiltering/QueryMatcher.h index 7731413..08b9ebd 100644 --- a/lib/mmseqs/src/prefiltering/QueryMatcher.h +++ b/lib/mmseqs/src/prefiltering/QueryMatcher.h @@ -258,8 +258,7 @@ class QueryMatcher { size_t findDuplicates(IndexEntryLocal **hitsByIndex, CounterResult *output, size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore); - - size_t mergeElements(CounterResult *foundDiagonals, size_t hitCounter); + size_t mergeElements(CounterResult *foundDiagonals, size_t hitCounter, bool keepHitsWithCounts = false); size_t keepMaxScoreElementOnly(CounterResult *foundDiagonals, size_t resultSize); diff --git a/lib/mmseqs/src/prefiltering/QueryMatcherTaxonomyHook.h b/lib/mmseqs/src/prefiltering/QueryMatcherTaxonomyHook.h index c8a86b0..de32c33 100644 --- a/lib/mmseqs/src/prefiltering/QueryMatcherTaxonomyHook.h +++ b/lib/mmseqs/src/prefiltering/QueryMatcherTaxonomyHook.h @@ -7,20 +7,30 @@ #include "DBReader.h" #include "TaxonomyExpression.h" +#ifdef OPENMP +#include +#endif + class QueryMatcherTaxonomyHook : public QueryMatcherHook { public: - QueryMatcherTaxonomyHook(std::string targetPath, DBReader* targetReader, const std::string& expressionString) - : targetReader(targetReader), dbFrom(0) { + QueryMatcherTaxonomyHook(std::string targetPath, DBReader* targetReader, const std::string& expressionString, unsigned int threads) + : targetReader(targetReader), dbFrom(0), threads(threads) { std::string targetName = dbPathWithoutIndex(targetPath); taxonomy = NcbiTaxonomy::openTaxonomy(targetName); taxonomyMapping = new MappingReader(targetName); - expression = new TaxonomyExpression(expressionString, *taxonomy); + expression = new TaxonomyExpression*[threads]; + for (unsigned int i = 0; i < threads; i++) { + expression[i] = new TaxonomyExpression(expressionString, *taxonomy); + } } ~QueryMatcherTaxonomyHook() { delete taxonomy; delete taxonomyMapping; - delete expression; + for (unsigned int i = 0; i < threads; i++) { + delete expression[i]; + } + delete[] expression; } void setDbFrom(unsigned int from) { @@ -28,12 +38,16 @@ class QueryMatcherTaxonomyHook : public QueryMatcherHook { } size_t afterDiagonalMatchingHook(QueryMatcher& matcher, size_t resultSize) { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); +#endif size_t writePos = 0; for (size_t i = 0; i < resultSize; i++) { unsigned int currId = matcher.foundDiagonals[i].id; unsigned int key = targetReader->getDbKey(dbFrom + currId); TaxID currTax = taxonomyMapping->lookup(key); - if (expression->isAncestor(currTax)) { + if (expression[thread_idx]->isAncestor(currTax)) { if (i != writePos) { matcher.foundDiagonals[writePos] = matcher.foundDiagonals[i]; } @@ -63,9 +77,10 @@ class QueryMatcherTaxonomyHook : public QueryMatcherHook { NcbiTaxonomy* taxonomy; MappingReader* taxonomyMapping; DBReader* targetReader; - TaxonomyExpression* expression; + TaxonomyExpression** expression; unsigned int dbFrom; + unsigned int threads; }; #endif \ No newline at end of file diff --git a/lib/mmseqs/src/prefiltering/UngappedAlignment.cpp b/lib/mmseqs/src/prefiltering/UngappedAlignment.cpp index 01e4365..54cef6f 100644 --- a/lib/mmseqs/src/prefiltering/UngappedAlignment.cpp +++ b/lib/mmseqs/src/prefiltering/UngappedAlignment.cpp @@ -22,13 +22,8 @@ UngappedAlignment::~UngappedAlignment() { delete [] score_arr; } -void UngappedAlignment::processQuery(Sequence *seq, - float *biasCorrection, - CounterResult *results, - size_t resultSize) { - createProfile(seq, biasCorrection, subMatrix->subMatrix); - queryLen = seq->L; - computeScores(queryProfile, seq->L, results, resultSize); +void UngappedAlignment::align(CounterResult *results, size_t resultSize) { + computeScores(queryProfile, queryLen, results, resultSize); } @@ -290,7 +285,7 @@ void UngappedAlignment::scoreDiagonalAndUpdateHits(const char * queryProfile, // update score for(size_t hitIdx = 0; hitIdx < hitSize; hitIdx++){ hits[seqs[hitIdx].id]->count = static_cast(std::min(static_cast(255), - score_arr[hitIdx])); + score_arr[hitIdx])); if(seqs[hitIdx].seqLen == 1){ std::pair dbSeq = sequenceLookup->getSequence(hits[hitIdx]->id); if(dbSeq.second >= 32768){ @@ -344,6 +339,10 @@ void UngappedAlignment::computeScores(const char *queryProfile, // continue; // } const unsigned short currDiag = results[i].diagonal; + // skip results that already have a diagonal score + if(results[i].count != 0){ + continue; + } diagonalMatches[currDiag * DIAGONALBINSIZE + diagonalCounter[currDiag]] = &results[i]; diagonalCounter[currDiag]++; if(diagonalCounter[currDiag] == DIAGONALBINSIZE) { @@ -384,9 +383,8 @@ void UngappedAlignment::extractScores(unsigned int *score_arr, simd_int score) { void UngappedAlignment::createProfile(Sequence *seq, - float * biasCorrection, - short **subMat) { - + float * biasCorrection) { + queryLen = seq->L; if(Parameters::isEqualDbtype(seq->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE)) { memset(queryProfile, 0, (Sequence::PROFILE_AA_SIZE + 1) * seq->L); }else{ @@ -409,7 +407,7 @@ void UngappedAlignment::createProfile(Sequence *seq, for (int pos = 0; pos < seq->L; pos++) { unsigned int aaIdx = seq->numSequence[pos]; for (int i = 0; i < subMatrix->alphabetSize; i++) { - queryProfile[pos * (Sequence::PROFILE_AA_SIZE + 1) + i] = (subMat[aaIdx][i] + aaCorrectionScore[pos]); + queryProfile[pos * (Sequence::PROFILE_AA_SIZE + 1) + i] = (subMatrix->subMatrix[aaIdx][i] + aaCorrectionScore[pos]); } } } diff --git a/lib/mmseqs/src/prefiltering/UngappedAlignment.h b/lib/mmseqs/src/prefiltering/UngappedAlignment.h index d350674..79e5e9d 100644 --- a/lib/mmseqs/src/prefiltering/UngappedAlignment.h +++ b/lib/mmseqs/src/prefiltering/UngappedAlignment.h @@ -18,10 +18,12 @@ class UngappedAlignment { ~UngappedAlignment(); + void createProfile(Sequence *seq, float *biasCorrection); + // This function computes the diagonal score for each CounterResult object // it assigns the diagonal score to the CounterResult object - void processQuery(Sequence *seq, float *compositionBias, CounterResult *results, - size_t resultSize); + void align(CounterResult *results, + size_t resultSize); int scoreSingelSequenceByCounterResult(CounterResult &result); @@ -90,8 +92,6 @@ class UngappedAlignment { void extractScores(unsigned int *score_arr, simd_int score); - void createProfile(Sequence *seq, float *biasCorrection, short **subMat); - int computeSingelSequenceScores(const char *queryProfile, const unsigned int queryLen, std::pair &dbSeq, int diagonal, unsigned int minDistToDiagonal); diff --git a/lib/mmseqs/src/prefiltering/ungappedprefilter.cpp b/lib/mmseqs/src/prefiltering/ungappedprefilter.cpp index 09e2f51..84bc0a9 100644 --- a/lib/mmseqs/src/prefiltering/ungappedprefilter.cpp +++ b/lib/mmseqs/src/prefiltering/ungappedprefilter.cpp @@ -77,8 +77,8 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m QueryMatcherTaxonomyHook * taxonomyHook = NULL; - if(par.PARAM_TAXON_LIST.wasSet){ - taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList); + if (par.PARAM_TAXON_LIST.wasSet) { + taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList, par.threads); } Debug::Progress progress(qdbr->getSize()); @@ -121,7 +121,7 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m unsigned int targetKey = tdbr->getDbKey(tId); if(taxonomyHook != NULL){ TaxID currTax = taxonomyHook->taxonomyMapping->lookup(targetKey); - if (taxonomyHook->expression->isAncestor(currTax) == false) { + if (taxonomyHook->expression[thread_idx]->isAncestor(currTax) == false) { continue; } } diff --git a/lib/mmseqs/src/test/TestAlignment.cpp b/lib/mmseqs/src/test/TestAlignment.cpp index 62f528a..394278e 100644 --- a/lib/mmseqs/src/test/TestAlignment.cpp +++ b/lib/mmseqs/src/test/TestAlignment.cpp @@ -20,6 +20,7 @@ #include "Matcher.h" const char* binary_name = "test_alignment"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const size_t kmer_size=6; diff --git a/lib/mmseqs/src/test/TestAlignmentPerformance.cpp b/lib/mmseqs/src/test/TestAlignmentPerformance.cpp index 225bd2b..d2b59e9 100644 --- a/lib/mmseqs/src/test/TestAlignmentPerformance.cpp +++ b/lib/mmseqs/src/test/TestAlignmentPerformance.cpp @@ -24,6 +24,7 @@ #include "StripedSmithWaterman.h" const char* binary_name = "test_alignmentperformance"; +DEFAULT_PARAMETER_SINGLETON_INIT #define MAX_FILENAME_LIST_FILES 4096 diff --git a/lib/mmseqs/src/test/TestAlignmentTraceback.cpp b/lib/mmseqs/src/test/TestAlignmentTraceback.cpp index beddd5a..e002d18 100644 --- a/lib/mmseqs/src/test/TestAlignmentTraceback.cpp +++ b/lib/mmseqs/src/test/TestAlignmentTraceback.cpp @@ -18,6 +18,7 @@ #include "Parameters.h" const char* binary_name = "test_alignmenttraceback"; +DEFAULT_PARAMETER_SINGLETON_INIT struct scores{ short H; @@ -87,16 +88,16 @@ void sw( // backtrace int i=target_length - 1; int j=query_length - 1; - int step = 0; + // int step = 0; int state = get_val(bt, i, j); - int matched_cols = 0; + // int matched_cols = 0; while (state!=-1) // while (state!=STOP) because STOP=0 { - step++; + // step++; //std::cout << step<< " " << i << " " << j << " " << state << std::endl; switch (state) { case M: // current state is MM, previous state is bMM[i][j] - matched_cols++; + // matched_cols++; fprintf(stdout,"%c", subMat.num2aa[db_sequence[target_start+i]]); if(query_sequence[query_start + j] == db_sequence[target_start + i]){ fprintf(stdout, "|"); diff --git a/lib/mmseqs/src/test/TestAlp.cpp b/lib/mmseqs/src/test/TestAlp.cpp index d3b98a4..ab9c13a 100644 --- a/lib/mmseqs/src/test/TestAlp.cpp +++ b/lib/mmseqs/src/test/TestAlp.cpp @@ -43,6 +43,7 @@ Contents: pairwise alignment algorithms #include "sls_alignment_evaluer.hpp" const char* binary_name = "test_alp"; +DEFAULT_PARAMETER_SINGLETON_INIT using namespace Sls; using namespace std; diff --git a/lib/mmseqs/src/test/TestBacktraceTranslator.cpp b/lib/mmseqs/src/test/TestBacktraceTranslator.cpp index 39328d0..b8a7bfd 100644 --- a/lib/mmseqs/src/test/TestBacktraceTranslator.cpp +++ b/lib/mmseqs/src/test/TestBacktraceTranslator.cpp @@ -1,5 +1,4 @@ #include "BacktraceTranslator.h" -#include "Parameters.h" const char* binary_name = "test_backtracetranslator"; diff --git a/lib/mmseqs/src/test/TestBestAlphabet.cpp b/lib/mmseqs/src/test/TestBestAlphabet.cpp index fb432c5..0289775 100644 --- a/lib/mmseqs/src/test/TestBestAlphabet.cpp +++ b/lib/mmseqs/src/test/TestBestAlphabet.cpp @@ -14,7 +14,6 @@ #include "DBReader.h" #include "Sequence.h" #include "Indexer.h" -#include "Parameters.h" const char* binary_name = "test_bestalphabet"; diff --git a/lib/mmseqs/src/test/TestCompositionBias.cpp b/lib/mmseqs/src/test/TestCompositionBias.cpp index 1a3e699..5fb33d5 100644 --- a/lib/mmseqs/src/test/TestCompositionBias.cpp +++ b/lib/mmseqs/src/test/TestCompositionBias.cpp @@ -9,6 +9,7 @@ #include "Parameters.h" const char* binary_name = "test_compositionbias"; +DEFAULT_PARAMETER_SINGLETON_INIT void calcLocalAaBiasCorrection(Sequence* seq, SubstitutionMatrix * m){ const int windowSize = 40; diff --git a/lib/mmseqs/src/test/TestDiagonalScoring.cpp b/lib/mmseqs/src/test/TestDiagonalScoring.cpp index cdb5b45..1640740 100644 --- a/lib/mmseqs/src/test/TestDiagonalScoring.cpp +++ b/lib/mmseqs/src/test/TestDiagonalScoring.cpp @@ -14,6 +14,7 @@ #include "Parameters.h" const char* binary_name = "test_diagonalscoring"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { size_t kmer_size = 6; @@ -106,11 +107,12 @@ int main (int, const char**) { // std::cout << compositionBias[i] << " "; // } // std::cout << std::endl; +// matcher.createProfile(&s5, compositionBias); // for(size_t i = 0; i < s6.L; i++){ // hits[0].id = s6.getId(); // hits[0].diagonal = 0; // hits[0].count = 0; -// matcher.processQuery(&s5, compositionBias, hits, 1, 0); +// matcher.align(hits, 1, 0); // std::cout << hits[0].diagonal << " " << (int)hits[0].count << std::endl; // } @@ -120,75 +122,83 @@ int main (int, const char**) { hits[0].id = s1.getId(); hits[0].diagonal = 0; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s1.getId(); hits[i].diagonal = 0; } - matcher.processQuery(&s1, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; + matcher.createProfile(&s2, compositionBias); hits[0].id = s1.getId(); hits[0].diagonal = 9; - matcher.processQuery(&s2, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s1.getId(); hits[i].diagonal = 9; } - matcher.processQuery(&s2, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s2.getId(); hits[i].diagonal = -9; } - matcher.processQuery(&s1, compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s2.getId(); hits[i].diagonal = -9; } - matcher.processQuery(&s3, compositionBias, hits, 16); + matcher.createProfile(&s3, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; - matcher.processQuery(&s3, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s4.getId(); hits[0].diagonal = -256; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s1.getId(); hits[0].diagonal = 256; - matcher.processQuery(&s4,compositionBias, hits, 1); + matcher.createProfile(&s4, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s7.getId(); hits[0].diagonal = -512; - matcher.processQuery(&s1,compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s1.getId(); hits[0].diagonal = 512; - matcher.processQuery(&s7,compositionBias, hits, 16); + matcher.createProfile(&s7, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s7.getId(); hits[0].diagonal = 0; - matcher.processQuery(&s7, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; delete [] compositionBias; diff --git a/lib/mmseqs/src/test/TestDiagonalScoringPerformance.cpp b/lib/mmseqs/src/test/TestDiagonalScoringPerformance.cpp index 3f224d2..ed8d386 100644 --- a/lib/mmseqs/src/test/TestDiagonalScoringPerformance.cpp +++ b/lib/mmseqs/src/test/TestDiagonalScoringPerformance.cpp @@ -20,6 +20,7 @@ KSEQ_INIT(int, read) #include "Parameters.h" const char* binary_name = "test_diagonalscoringperformance"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { size_t kmer_size = 6; @@ -91,17 +92,17 @@ int main (int, const char**) { SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias, 1.0); - - matcher.processQuery(&s1,compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << (int)hits[0].count << " "; std::cout << (int)hits[1].count << " "; std::cout << (int)hits[2].count << " "; std::cout << (int)hits[3].count << std::endl; - matcher.processQuery(&s1, compositionBias, hits, 1); - matcher.processQuery(&s1, compositionBias, hits + 1, 1); - matcher.processQuery(&s1, compositionBias, hits + 2, 1); - matcher.processQuery(&s1, compositionBias, hits + 3, 1); + matcher.align(hits, 1); + matcher.align(hits + 1, 1); + matcher.align(hits + 2, 1); + matcher.align(hits + 3, 1); std::cout << (int)hits[0].count<< " "; std::cout << (int)hits[1].count<< " "; @@ -113,7 +114,7 @@ int main (int, const char**) { hits[j].diagonal = rand()%s1.L; } // std::reverse(hits, hits+1000); - matcher.processQuery(&s1, compositionBias, hits, 16000); + matcher.align(hits, 16000); } // std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore << std::endl; // std::cout << (int)hits[0].diagonalScore << std::endl; diff --git a/lib/mmseqs/src/test/TestKmerGenerator.cpp b/lib/mmseqs/src/test/TestKmerGenerator.cpp index a745689..5fea731 100644 --- a/lib/mmseqs/src/test/TestKmerGenerator.cpp +++ b/lib/mmseqs/src/test/TestKmerGenerator.cpp @@ -15,6 +15,7 @@ #include "Parameters.h" const char* binary_name = "test_kmergenerator"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const size_t kmer_size=6; diff --git a/lib/mmseqs/src/test/TestKmerNucl.cpp b/lib/mmseqs/src/test/TestKmerNucl.cpp index 7456159..384d875 100644 --- a/lib/mmseqs/src/test/TestKmerNucl.cpp +++ b/lib/mmseqs/src/test/TestKmerNucl.cpp @@ -6,6 +6,7 @@ #include "Orf.h" const char* binary_name = "test_kmernucl"; +DEFAULT_PARAMETER_SINGLETON_INIT std::string kmerToSting(size_t idx, int size) { char output[32]; diff --git a/lib/mmseqs/src/test/TestKmerScore.cpp b/lib/mmseqs/src/test/TestKmerScore.cpp index 77a6bf9..66b4be2 100644 --- a/lib/mmseqs/src/test/TestKmerScore.cpp +++ b/lib/mmseqs/src/test/TestKmerScore.cpp @@ -8,6 +8,7 @@ #include "Parameters.h" const char* binary_name = "test_kmerscore"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const size_t kmer_size = 6; diff --git a/lib/mmseqs/src/test/TestKsw2.cpp b/lib/mmseqs/src/test/TestKsw2.cpp index 8f36780..21f130d 100644 --- a/lib/mmseqs/src/test/TestKsw2.cpp +++ b/lib/mmseqs/src/test/TestKsw2.cpp @@ -255,7 +255,7 @@ void align(const char * qSeqAscii, uint8_t *qseq, uint8_t *qseqrev, const int qL std::string middleAln; int queryPos = qStartPos; int targetPos = tStartPos; - int aaIds = 0; + // int aaIds = 0; for(int i = 0; i < ezAlign.n_cigar; i++){ int len = ezAlign.cigar[i]>>4; switch("MID"[ezAlign.cigar[i]&0xf]){ @@ -264,7 +264,7 @@ void align(const char * qSeqAscii, uint8_t *qseq, uint8_t *qseqrev, const int qL queryAln.push_back(qSeqAscii[queryPos]); if (qSeqAscii[queryPos] == tSeqAscii[targetPos]) { middleAln.push_back('|'); - aaIds++; + // aaIds++; } else { middleAln.push_back('*'); } diff --git a/lib/mmseqs/src/test/TestKwayMerge.cpp b/lib/mmseqs/src/test/TestKwayMerge.cpp index 864ce52..d62619d 100644 --- a/lib/mmseqs/src/test/TestKwayMerge.cpp +++ b/lib/mmseqs/src/test/TestKwayMerge.cpp @@ -8,7 +8,6 @@ #include "SubstitutionMatrix.h" #include "Sequence.h" -#include "Parameters.h" const char* binary_name = "test_kwaymerge"; struct KmerEntry{ diff --git a/lib/mmseqs/src/test/TestMultipleAlignment.cpp b/lib/mmseqs/src/test/TestMultipleAlignment.cpp index 4957397..c16c26e 100644 --- a/lib/mmseqs/src/test/TestMultipleAlignment.cpp +++ b/lib/mmseqs/src/test/TestMultipleAlignment.cpp @@ -14,6 +14,7 @@ #include "Parameters.h" const char* binary_name = "test_multiplealignment"; +DEFAULT_PARAMETER_SINGLETON_INIT int main(int, const char**) { Parameters& par = Parameters::getInstance(); @@ -88,7 +89,7 @@ int main(int, const char**) { #ifdef GAP_POS_SCORING , alnResults #endif - , false + , false, 0.0 ); pssm.printProfile(res.centerLength); pssm.printPSSM(res.centerLength); diff --git a/lib/mmseqs/src/test/TestPSSM.cpp b/lib/mmseqs/src/test/TestPSSM.cpp index 7feb3ef..4a469c5 100644 --- a/lib/mmseqs/src/test/TestPSSM.cpp +++ b/lib/mmseqs/src/test/TestPSSM.cpp @@ -15,6 +15,7 @@ #include "MultipleAlignment.h" const char* binary_name = "test_pssm"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { Parameters& par = Parameters::getInstance(); @@ -1611,7 +1612,7 @@ int main (int, const char**) { , par.gapPseudoCount #endif ); - pssm.computePSSMFromMSA(filteredSetSize, res.centerLength, (const char**) res.msaSequence, false); + pssm.computePSSMFromMSA(filteredSetSize, res.centerLength, (const char**) res.msaSequence, false, 0.0); //pssm.printProfile(res.centerLength); pssm.printPSSM(res.centerLength); MultipleAlignment::deleteMSA(&res); diff --git a/lib/mmseqs/src/test/TestPSSMPrune.cpp b/lib/mmseqs/src/test/TestPSSMPrune.cpp index e3428f5..3212042 100644 --- a/lib/mmseqs/src/test/TestPSSMPrune.cpp +++ b/lib/mmseqs/src/test/TestPSSMPrune.cpp @@ -16,6 +16,7 @@ #include const char* binary_name = "test_pssmprune"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { Parameters& par = Parameters::getInstance(); @@ -90,7 +91,7 @@ int main (int, const char**) { //seqSet.push_back(s5); // PSSMCalculator pssm(&subMat, counter, 1.0, 1.5); -// pssm.computePSSMFromMSA(filterResult.setSize, res.centerLength, filterResult.filteredMsaSequence, false); +// pssm.computePSSMFromMSA(filterResult.setSize, res.centerLength, filterResult.filteredMsaSequence, false, 0.0); // //pssm.printProfile(res.centerLength); // pssm.printPSSM(res.centerLength); MultipleAlignment::deleteMSA(&res); diff --git a/lib/mmseqs/src/test/TestProfileAlignment.cpp b/lib/mmseqs/src/test/TestProfileAlignment.cpp index ad63383..74a3779 100644 --- a/lib/mmseqs/src/test/TestProfileAlignment.cpp +++ b/lib/mmseqs/src/test/TestProfileAlignment.cpp @@ -19,6 +19,7 @@ #include "Parameters.h" const char* binary_name = "test_profilealignment"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const size_t kmer_size=6; @@ -774,7 +775,7 @@ int main (int, const char**) { 21 : subMat.aa2num[(int) msaSeq[k][pos]]; } } - PSSMCalculator::Profile pssmRet = pssmCalculator.computePSSMFromMSA(setSize,centerSeqSize, (const char **) msaSequence, false); + PSSMCalculator::Profile pssmRet = pssmCalculator.computePSSMFromMSA(setSize,centerSeqSize, (const char **) msaSequence, false, 0.0); const char * sequence = pssmRet.pssm; char * data = new char[centerSeqSize*20+1]; for (size_t i = 0; i < centerSeqSize*20; i++) { diff --git a/lib/mmseqs/src/test/TestReduceMatrix.cpp b/lib/mmseqs/src/test/TestReduceMatrix.cpp index b75975d..7c6f922 100644 --- a/lib/mmseqs/src/test/TestReduceMatrix.cpp +++ b/lib/mmseqs/src/test/TestReduceMatrix.cpp @@ -12,6 +12,7 @@ #include "Parameters.h" const char* binary_name = "test_reducematrix"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const int reductionAlphabetSize = 17; diff --git a/lib/mmseqs/src/test/TestScoreMatrixSerialization.cpp b/lib/mmseqs/src/test/TestScoreMatrixSerialization.cpp index 0f383c4..bfcb2ca 100644 --- a/lib/mmseqs/src/test/TestScoreMatrixSerialization.cpp +++ b/lib/mmseqs/src/test/TestScoreMatrixSerialization.cpp @@ -5,6 +5,7 @@ #include "Debug.h" const char* binary_name = "test_scorematrixserialization"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { Parameters& par = Parameters::getInstance(); diff --git a/lib/mmseqs/src/test/TestSequenceIndex.cpp b/lib/mmseqs/src/test/TestSequenceIndex.cpp index ffdbebf..aa67c65 100644 --- a/lib/mmseqs/src/test/TestSequenceIndex.cpp +++ b/lib/mmseqs/src/test/TestSequenceIndex.cpp @@ -11,6 +11,7 @@ #include "Parameters.h" const char* binary_name = "test_sequenceindex"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { size_t kmer_size = 6; diff --git a/lib/mmseqs/src/test/TestTanTan.cpp b/lib/mmseqs/src/test/TestTanTan.cpp index fe1fc0b..0567069 100644 --- a/lib/mmseqs/src/test/TestTanTan.cpp +++ b/lib/mmseqs/src/test/TestTanTan.cpp @@ -11,6 +11,7 @@ #include "Parameters.h" const char* binary_name = "test_tantan"; +DEFAULT_PARAMETER_SINGLETON_INIT int main (int, const char**) { const size_t kmer_size = 6; diff --git a/lib/mmseqs/src/util/CMakeLists.txt b/lib/mmseqs/src/util/CMakeLists.txt index 3b2638f..c43c335 100644 --- a/lib/mmseqs/src/util/CMakeLists.txt +++ b/lib/mmseqs/src/util/CMakeLists.txt @@ -31,6 +31,7 @@ set(util_source_files util/filterdb.cpp util/gff2db.cpp util/renamedbkeys.cpp + util/makepaddedseqdb.cpp util/masksequence.cpp util/maskbygff.cpp util/mergeclusters.cpp @@ -46,6 +47,7 @@ set(util_source_files util/profile2pssm.cpp util/profile2neff.cpp util/profile2seq.cpp + util/recoverlongestorf.cpp util/result2dnamsa.cpp util/result2flat.cpp util/result2msa.cpp diff --git a/lib/mmseqs/src/util/convertalignments.cpp b/lib/mmseqs/src/util/convertalignments.cpp index f0d8893..637e179 100644 --- a/lib/mmseqs/src/util/convertalignments.cpp +++ b/lib/mmseqs/src/util/convertalignments.cpp @@ -349,6 +349,9 @@ int convertalignments(int argc, const char **argv, const Command &command) { std::string newBacktrace; newBacktrace.reserve(1024); + std::string complementBuffer; + complementBuffer.reserve(1024); + const TaxonNode * taxonNode = NULL; #pragma omp for schedule(dynamic, 10) @@ -636,6 +639,56 @@ int convertalignments(int argc, const char **argv, const Command &command) { case Parameters::OUTFMT_TORFEND: result.append(SSTR(res.dbOrfEndPos)); break; + case Parameters::OUTFMT_PPOS: { + float pPositive = 0; + int matchCount = 0; + if (res.backtrace.empty() == false) { + int qPos = 0; + int tPos = 0; + for (size_t pos = 0; pos < res.backtrace.size(); pos++) { + switch (res.backtrace[pos]) { + case 'M': { + char qRes = queryProfile ? queryProfData[qPos] : querySeqData[qPos]; + char tRes = targetProfile ? targetProfData[tPos] : targetSeqData[tPos]; + pPositive += (subMat->subMatrix[subMat->aa2num[(int)qRes]][subMat->aa2num[(int)tRes]] > 0); + matchCount += 1; + qPos++; + tPos++; + break; + } + case 'D': + qPos++; + break; + case 'I': + tPos++; + break; + } + } + pPositive /= matchCount; + } + result.append(SSTR(pPositive)); + break; + } + case Parameters::OUTFMT_QFRAME: { + int frame; + if (res.qStartPos <= res.qEndPos) { + frame = (res.qStartPos - 1) % 3 + 1; + } else { + frame = -1 * ((res.qLen - res.qStartPos) % 3 + 1); + } + result.append(SSTR(frame)); + break; + } + case Parameters::OUTFMT_TFRAME: { + int frame; + if (res.dbStartPos <= res.dbEndPos) { + frame = (res.dbStartPos - 1) % 3 + 1; + } else { + frame = -1 * ((res.dbLen - res.dbStartPos) % 3 + 1); + } + result.append(SSTR(frame)); + break; + } } if (i < outcodes.size() - 1) { result.push_back('\t'); @@ -664,32 +717,45 @@ int convertalignments(int argc, const char **argv, const Command &command) { break; } case Parameters::FORMAT_ALIGNMENT_SAM: { - bool strand = res.qEndPos > res.qStartPos; + bool forward = res.qEndPos > res.qStartPos; int rawScore = static_cast(evaluer->computeRawScoreFromBitScore(res.score) + 0.5); uint32_t mapq = -4.343 * log(exp(static_cast(-rawScore))); mapq = (uint32_t) (mapq + 4.99); mapq = mapq < 254 ? mapq : 254; - int count = snprintf(buffer, sizeof(buffer), "%s\t%d\t%s\t%d\t%d\t", queryId.c_str(), (strand) ? 16: 0, targetId.c_str(), res.dbStartPos + 1, mapq); + int count = snprintf(buffer, sizeof(buffer), "%s\t%d\t%s\t%d\t%d\t", queryId.c_str(), (forward) ? 0 : 16, targetId.c_str(), res.dbStartPos + 1, mapq); if (count < 0 || static_cast(count) >= sizeof(buffer)) { Debug(Debug::WARNING) << "Truncated line in entry" << i << "!\n"; continue; } result.append(buffer, count); - if (isTranslatedSearch == true && targetNucs == true && queryNucs == true) { + if (isTranslatedSearch == true && queryNucs == true) { Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); result.append(newBacktrace); newBacktrace.clear(); - } else { result.append(res.backtrace); } result.append("\t*\t0\t0\t"); int start = std::min(res.qStartPos, res.qEndPos); int end = std::max(res.qStartPos, res.qEndPos); - if (queryProfile) { - result.append(queryProfData.c_str() + start, (end + 1) - start); + if (forward == false && queryNucs == true) { + if (queryProfile) { + complementBuffer.assign(queryProfData.c_str() + start, (end + 1) - start); + } else { + complementBuffer.assign(querySeqData + start, (end + 1) - start); + } + std::reverse(complementBuffer.begin(), complementBuffer.end()); + for (size_t i = 0; i < complementBuffer.size(); i++) { + complementBuffer[i] = Orf::complement(complementBuffer[i]); + } + result.append(complementBuffer); + complementBuffer.clear(); } else { - result.append(querySeqData + start, (end + 1) - start); + if (queryProfile) { + result.append(queryProfData.c_str() + start, (end + 1) - start); + } else { + result.append(querySeqData + start, (end + 1) - start); + } } count = snprintf(buffer, sizeof(buffer), "\t*\tAS:i:%d\tNM:i:%d\n", rawScore, missMatchCount); if (count < 0 || static_cast(count) >= sizeof(buffer)) { diff --git a/lib/mmseqs/src/util/expandaln.cpp b/lib/mmseqs/src/util/expandaln.cpp index b43630f..cbb767c 100644 --- a/lib/mmseqs/src/util/expandaln.cpp +++ b/lib/mmseqs/src/util/expandaln.cpp @@ -321,8 +321,8 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl continue; } } else { - size_t cSeqId = cReader->getId(cSeqKey); if (returnAlnRes == false || par.expansionMode == Parameters::EXPAND_RESCORE_BACKTRACE) { + size_t cSeqId = cReader->getId(cSeqKey); cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId)); } //rescoreResultByBacktrace(resultAc, aSeq, cSeq, subMat, compositionBias, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); @@ -393,7 +393,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl (int)(par.filterMaxSeqId * 100), par.Ndiff, par.filterMinEnable, (const char **) res.msaSequence, true) : res.setSize; - PSSMCalculator::Profile pssmRes = calculator->computePSSMFromMSA(filteredSetSize, aSeq.L, (const char **) res.msaSequence, par.wg); + PSSMCalculator::Profile pssmRes = calculator->computePSSMFromMSA(filteredSetSize, aSeq.L, (const char **) res.msaSequence, par.wg, 0.0); if (par.maskProfile == true) { masker->mask(aSeq, par.maskProb, pssmRes); } diff --git a/lib/mmseqs/src/util/extractframes.cpp b/lib/mmseqs/src/util/extractframes.cpp index 94c4236..348d40b 100644 --- a/lib/mmseqs/src/util/extractframes.cpp +++ b/lib/mmseqs/src/util/extractframes.cpp @@ -78,12 +78,12 @@ int extractframes(int argc, const char **argv, const Command& command) { if(reverseFrames != 0){ size_t sequenceLength = dataLength -2; - bool hasWrongChar = false; + // bool hasWrongChar = false; for(size_t pos = 0; pos < sequenceLength; ++pos) { char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]); reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement; reverseComplementStr.push_back(reverseComplement); - hasWrongChar |= (reverseComplement == '.'); + // hasWrongChar |= (reverseComplement == '.'); } // if(hasWrongChar == true){ // continue; diff --git a/lib/mmseqs/src/util/gff2db.cpp b/lib/mmseqs/src/util/gff2db.cpp index f112646..6266e08 100644 --- a/lib/mmseqs/src/util/gff2db.cpp +++ b/lib/mmseqs/src/util/gff2db.cpp @@ -33,7 +33,7 @@ int gff2db(int argc, const char **argv, const Command &command) { headerWriter.open(); std::string outLookup = outDb + ".lookup"; std::string outLookupIndex = outDb + ".lookup.index"; - DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_OMIT_FILE); + DBWriter lookupWriter(outLookup.c_str(), outLookupIndex.c_str(), par.threads, 0, Parameters::DBTYPE_OMIT_FILE); lookupWriter.open(); FILE *source = FileUtil::openAndDelete((outDb + ".source").c_str(), "w"); diff --git a/lib/mmseqs/src/util/indexdb.cpp b/lib/mmseqs/src/util/indexdb.cpp index 61fbabd..c9c6470 100644 --- a/lib/mmseqs/src/util/indexdb.cpp +++ b/lib/mmseqs/src/util/indexdb.cpp @@ -13,7 +13,7 @@ void setIndexDbDefaults(Parameters *p) { p->sensitivity = 5.7; } -std::string findIncompatibleParameter(DBReader& index, const Parameters& par, const int dbtype) { +std::string findIncompatibleParameter(DBReader& index, const Parameters& par, int kmerScore, const int dbtype) { PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index); if (meta.compBiasCorr != par.compBiasCorrection) return "compBiasCorrection"; @@ -27,13 +27,8 @@ std::string findIncompatibleParameter(DBReader& index, const Param return "kmerSize"; if (meta.mask != par.maskMode) return "maskMode"; - if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE)) { - if (meta.kmerThr != par.kmerScore.values.profile()) - return "kmerScore"; - } else { - if (meta.kmerThr != par.kmerScore.values.sequence()) - return "kmerScore"; - } + if (meta.kmerThr != kmerScore) + return "kmerScore"; if (meta.spacedKmer != par.spacedKmer) return "spacedKmer"; if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) && @@ -139,7 +134,7 @@ int indexdb(int argc, const char **argv, const Command &command) { } std::string check; - const bool compatible = PrefilteringIndexReader::checkIfIndexFile(&index) && (check = findIncompatibleParameter(index, par, dbr.getDbtype())) == ""; + const bool compatible = PrefilteringIndexReader::checkIfIndexFile(&index) && (check = findIncompatibleParameter(index, par, kmerScore, dbr.getDbtype())) == ""; index.close(); if (compatible) { Debug(Debug::INFO) << "Index is up to date and compatible. Force recreation with --check-compatibility 0 parameter.\n"; diff --git a/lib/mmseqs/src/util/makepaddedseqdb.cpp b/lib/mmseqs/src/util/makepaddedseqdb.cpp new file mode 100644 index 0000000..6239fe2 --- /dev/null +++ b/lib/mmseqs/src/util/makepaddedseqdb.cpp @@ -0,0 +1,54 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" + +int makepaddedseqdb(int argc, const char **argv, const Command &command) { + const char AA_TO_20[256] = { + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 0, 20, 4, 3, 6, 13, 7, 8, 9, 20, 11, 10, 12, 2, 20, + 14, 5, 1, 15, 16, 20, 19, 17, 20, 18, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }; + + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + DBReader dbr(par.db1.c_str(), par.db1Index.c_str(), 1, + DBReader::USE_INDEX | DBReader::USE_DATA); + dbr.open(DBReader::SORT_BY_LENGTH); + DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), 1, false, dbr.getDbtype()); + writer.open(); + std::string result; + const int ALIGN = 4; + for (long id = dbr.getSize() - 1; id >= 0; id--) { + unsigned int key = dbr.getDbKey(id); + char *data = dbr.getData(id, 0); + size_t seqLen = dbr.getSeqLen(id); + const size_t sequencepadding = (seqLen % ALIGN == 0) ? 0 : ALIGN - seqLen % ALIGN; + for (size_t i = 0; i < seqLen; i++) { + result.append(1, AA_TO_20[(unsigned char)data[i]]); + } + result.append(sequencepadding, static_cast(20)); + writer.writeData(result.c_str(), result.size(), key, 0, false, false); + writer.writeIndexEntry(key, writer.getStart(0), seqLen, 0); + result.clear(); + } + writer.close(true, false); + DBReader::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY); + + dbr.close(); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/lib/mmseqs/src/util/msa2profile.cpp b/lib/mmseqs/src/util/msa2profile.cpp index e6fb0a2..0189cfd 100644 --- a/lib/mmseqs/src/util/msa2profile.cpp +++ b/lib/mmseqs/src/util/msa2profile.cpp @@ -413,7 +413,7 @@ int msa2profile(int argc, const char **argv, const Command &command) { #ifdef GAP_POS_SCORING alnResults, #endif - par.wg); + par.wg, 0.0); if (par.compBiasCorrection == true) { SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer, Sequence::PROFILE_AA_SIZE, diff --git a/lib/mmseqs/src/util/msa2result.cpp b/lib/mmseqs/src/util/msa2result.cpp index 5ed6521..4dbc93c 100644 --- a/lib/mmseqs/src/util/msa2result.cpp +++ b/lib/mmseqs/src/util/msa2result.cpp @@ -384,7 +384,7 @@ int msa2result(int argc, const char **argv, const Command &command) { static_cast(par.filterMaxSeqId * 100), par.Ndiff, par.filterMinEnable, (const char **) msaSequences, true); } - PSSMCalculator::Profile pssmRes = calculator.computePSSMFromMSA(filteredSetSize, centerLength, (const char **) msaSequences, par.wg); + PSSMCalculator::Profile pssmRes = calculator.computePSSMFromMSA(filteredSetSize, centerLength, (const char **) msaSequences, par.wg, 0.0); resultWriter.writeStart(thread_idx); for (size_t i = 0; i < setSize; ++i) { diff --git a/lib/mmseqs/src/util/recoverlongestorf.cpp b/lib/mmseqs/src/util/recoverlongestorf.cpp new file mode 100644 index 0000000..b25d894 --- /dev/null +++ b/lib/mmseqs/src/util/recoverlongestorf.cpp @@ -0,0 +1,148 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" +#include "Orf.h" + +#include +#include + +#ifdef OPENMP +#include +#endif + +int recoverlongestorf(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + DBReader headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + headerReader.open(DBReader::LINEAR_ACCCESS); + + DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + resultReader.open(DBReader::LINEAR_ACCCESS); + + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE); + writer.open(); + + Debug::Progress progress(resultReader.getSize()); + std::unordered_map> contigToLongest; +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); + +#endif + std::unordered_map> localContigToLongest; + +#pragma omp for schedule(dynamic, 100) + for (size_t id = 0; id < headerReader.getSize(); ++id) { + progress.updateProgress(); + unsigned int orfKey = headerReader.getDbKey(id); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1; + std::unordered_map>::iterator it = localContigToLongest.find(contigKey); + if (it != localContigToLongest.end()) { + std::pair orfKeyToLength = it->second; + if (orfLen > orfKeyToLength.second) { + it->second = std::make_pair(orfKey, orfLen); + } + } else { + localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen)); + } + } + +#pragma omp critical + { + for (auto &entry : localContigToLongest) { + auto &contigKey = entry.first; + auto &localData = entry.second; + auto it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + if (localData.second > it->second.second) { + it->second = localData; + } + } else { + contigToLongest[contigKey] = localData; + } + } + } + } + + progress.reset(resultReader.getSize()); + std::unordered_set acceptedContigs; + std::unordered_set eliminatedContigs; +#pragma omp parallel + { + int thread_idx = 0; +#ifdef OPENMP + thread_idx = omp_get_thread_num(); +#endif + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + + std::unordered_set localAcceptedContigs; + std::unordered_set localEliminatedContigs; +#pragma omp for schedule(dynamic, 5) + for (size_t i = 0; i < resultReader.getSize(); ++i) { + progress.updateProgress(); + + unsigned int key = resultReader.getDbKey(i); + size_t entryLength = resultReader.getEntryLen(i); + if (entryLength > 1) { + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localAcceptedContigs.emplace(contigKey); + } + + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localEliminatedContigs.emplace(contigKey); + } + +#pragma omp critical + { + acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end()); + eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end()); + } + } + + for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) { + if (acceptedContigs.find(*it) != acceptedContigs.end()) { + it = eliminatedContigs.erase(it); + } else { + ++it; + } + } + + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) { + unsigned int contigKey = *contigIt; + std::unordered_map>::iterator it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + unsigned int orfKey = it->second.first; + resultBuffer.append(SSTR(orfKey)); + resultBuffer.append(1, '\n'); + writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false); + resultBuffer.clear(); + } else { + Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n"; + EXIT(EXIT_FAILURE); + } + } + + writer.close(true); + FileUtil::remove(writer.getIndexFileName()); + headerReader.close(); + resultReader.close(); + + return EXIT_SUCCESS; +} diff --git a/lib/mmseqs/src/util/result2msa.cpp b/lib/mmseqs/src/util/result2msa.cpp index 47689b4..b6d969e 100644 --- a/lib/mmseqs/src/util/result2msa.cpp +++ b/lib/mmseqs/src/util/result2msa.cpp @@ -38,35 +38,20 @@ int result2msa(int argc, const char **argv, const Command &command) { IndexReader *targetHeaderReaderIdx = NULL; const bool sameDatabase = (par.db1.compare(par.db2) == 0) ? true : false; - if (Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_INDEX_DB)) { - if (isCA3M == true) { - Debug(Debug::ERROR) << "Cannot use result2msa with indexed target database for CA3M output\n"; - return EXIT_FAILURE; - } - uint16_t extended = DBReader::getExtendedDbtype(FileUtil::parseDbType(par.db3.c_str())); - bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - tDbrIdx = new IndexReader(par.db2, par.threads, - extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_SEQUENCES : IndexReader::SEQUENCES, - (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); - tDbr = tDbrIdx->sequenceReader; - targetHeaderReaderIdx = new IndexReader(par.db2, par.threads, - extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_HEADERS : IndexReader::HEADERS, - (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); - targetHeaderReader = targetHeaderReaderIdx->sequenceReader; - } else { - tDbr = new DBReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); - tDbr->open(DBReader::NOSORT); - if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { - tDbr->readMmapedDataInMemory(); - } - if (isCA3M == false || sameDatabase) { - targetHeaderReader = new DBReader(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); - targetHeaderReader->open(DBReader::NOSORT); - if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { - targetHeaderReader->readMmapedDataInMemory(); - } - } + if (isCA3M == true) { + Debug(Debug::ERROR) << "Cannot use result2msa with indexed target database for CA3M output\n"; + return EXIT_FAILURE; } + uint16_t extended = DBReader::getExtendedDbtype(FileUtil::parseDbType(par.db3.c_str())); + bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + tDbrIdx = new IndexReader(par.db2, par.threads, + extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_SEQUENCES : IndexReader::SEQUENCES, + (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + tDbr = tDbrIdx->sequenceReader; + targetHeaderReaderIdx = new IndexReader(par.db2, par.threads, + extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_HEADERS : IndexReader::HEADERS, + (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); + targetHeaderReader = targetHeaderReaderIdx->sequenceReader; DBReader *qDbr = NULL; DBReader *queryHeaderReader = NULL; @@ -503,7 +488,7 @@ int result2msa(int argc, const char **argv, const Command &command) { #ifdef GAP_POS_SCORING alnResults, #endif - par.wg); + par.wg, 0.0); result.append(">consensus_"); result.append(centerSequenceHeader, centerHeaderLength); for (int pos = 0; pos < centerSequence.L; pos++) { diff --git a/lib/mmseqs/src/util/result2profile.cpp b/lib/mmseqs/src/util/result2profile.cpp index a00dabc..34759b1 100644 --- a/lib/mmseqs/src/util/result2profile.cpp +++ b/lib/mmseqs/src/util/result2profile.cpp @@ -260,7 +260,7 @@ int result2profile(int argc, const char **argv, const Command &command, bool ret #ifdef GAP_POS_SCORING alnResults, #endif - par.wg); + par.wg, 0.0); if (par.compBiasCorrection == true){ SubstitutionMatrix::calcGlobalAaBiasCorrection(&subMat, pssmRes.pssm, pNullBuffer, Sequence::PROFILE_AA_SIZE, diff --git a/lib/mmseqs/src/util/result2stats.cpp b/lib/mmseqs/src/util/result2stats.cpp index f678d68..c6b67cd 100644 --- a/lib/mmseqs/src/util/result2stats.cpp +++ b/lib/mmseqs/src/util/result2stats.cpp @@ -383,7 +383,6 @@ int StatsComputer::sequenceWise(typename PerSequence::type call, bool onlyRes buffer.append("\n"); } else { // for every hit - int cnt = 0; while (*results != '\0') { Util::parseKey(results, dbKey); char *rest; @@ -402,7 +401,6 @@ int StatsComputer::sequenceWise(typename PerSequence::type call, bool onlyRes buffer.append("\n"); results = Util::skipLine(results); - cnt++; } } statWriter->writeData(buffer.c_str(), buffer.length(), resultReader->getDbKey(id), thread_idx); diff --git a/lib/mmseqs/src/workflow/Cluster.cpp b/lib/mmseqs/src/workflow/Cluster.cpp index 1e9c629..6293eb4 100644 --- a/lib/mmseqs/src/workflow/Cluster.cpp +++ b/lib/mmseqs/src/workflow/Cluster.cpp @@ -94,7 +94,7 @@ void setClusterAutomagicParameters(Parameters& par) { } if (par.singleStepClustering == false && par.clusteringMode == Parameters::CONNECTED_COMPONENT) { Debug(Debug::WARNING) << "Connected component clustering produces less clusters in a single step clustering.\n" - << "Please use --single-step-cluster"; + << "Please use --single-step-clustering"; } if (par.PARAM_CLUSTER_STEPS.wasSet == false) { par.clusterSteps = setAutomaticIterations(par.sensitivity); diff --git a/lib/mmseqs/src/workflow/Search.cpp b/lib/mmseqs/src/workflow/Search.cpp index 750535d..62ec132 100644 --- a/lib/mmseqs/src/workflow/Search.cpp +++ b/lib/mmseqs/src/workflow/Search.cpp @@ -256,6 +256,8 @@ int search(int argc, const char **argv, const Command& command) { } } + + const bool isUngappedMode = par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED; if (isUngappedMode && (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_PROFILE |Parameters::SEARCH_MODE_FLAG_TARGET_PROFILE ))) { par.printUsageMessage(command, MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER); @@ -316,6 +318,19 @@ int search(int argc, const char **argv, const Command& command) { } else { cmd.addVariable("ALIGN_MODULE", "align"); } + + switch(par.prefMode){ + case Parameters::PREF_MODE_KMER: + cmd.addVariable("PREFMODE", "KMER"); + break; + case Parameters::PREF_MODE_UNGAPPED: + cmd.addVariable("PREFMODE", "UNGAPPED"); + break; + case Parameters::PREF_MODE_EXHAUSTIVE: + cmd.addVariable("PREFMODE", "EXHAUSTIVE"); + break; + } + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); std::string program; cmd.addVariable("RUNNER", par.runner.c_str()); @@ -334,7 +349,11 @@ int search(int argc, const char **argv, const Command& command) { par.covMode = Util::swapCoverageMode(par.covMode); size_t maxResListLen = par.maxResListLen; par.maxResListLen = std::max((size_t)300, queryDbSize); - cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str()); + if(par.prefMode == Parameters::PREF_MODE_KMER){ + cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str()); + } par.maxResListLen = maxResListLen; double originalEvalThr = par.evalThr; par.evalThr = std::numeric_limits::max(); @@ -385,8 +404,13 @@ int search(int argc, const char **argv, const Command& command) { if (i == (par.numIterations - 1)) { par.evalThr = originalEval; } - cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), - par.createParameterString(par.prefilter).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable(std::string("UNGAPPEDPREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable(std::string("ALIGNMENT_PAR_" + SSTR(i)).c_str(), @@ -439,8 +463,13 @@ int search(int argc, const char **argv, const Command& command) { par.evalThr = originalEval; } - cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), - par.createParameterString(par.prefilter).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable(std::string("UNGAPPEDPREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable(std::string("ALIGNMENT_PAR_" + SSTR(i)).c_str(), @@ -487,7 +516,11 @@ int search(int argc, const char **argv, const Command& command) { prefilterWithoutS.push_back(par.prefilter[i]); } } - cmd.addVariable("PREFILTER_PAR", par.createParameterString(prefilterWithoutS).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable("PREFILTER_PAR", par.createParameterString(prefilterWithoutS).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str()); diff --git a/lib/mmseqs/src/workflow/Taxonomy.cpp b/lib/mmseqs/src/workflow/Taxonomy.cpp index 4dbd34c..78bb760 100644 --- a/lib/mmseqs/src/workflow/Taxonomy.cpp +++ b/lib/mmseqs/src/workflow/Taxonomy.cpp @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) { cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) { diff --git a/lib/mmseqs/util/regression b/lib/mmseqs/util/regression index ac1fc17..5e3bc17 160000 --- a/lib/mmseqs/util/regression +++ b/lib/mmseqs/util/regression @@ -1 +1 @@ -Subproject commit ac1fc179cc06139452da92de754411df780573fa +Subproject commit 5e3bc17e17fb7d34e459bc8ad2bedbf7ded2a038