From 07c9618d07e9d0081250e0c2a8ac92da6dc30239 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Tue, 26 May 2020 08:20:11 -0400 Subject: [PATCH 01/36] Add github binary artifact to ccpp build action --- .github/workflows/ccpp.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 4585ca0..0f8469b 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -23,3 +23,10 @@ jobs: run: ./amrfinder -u - name: make test run: make test + - name: make gitnub_binaries + run: make github_binaries + - uses: actions/upload-artifact@v2 + with: + name: release-binary + path: amrfinder_binaries_v*.tar.gz + From 84b81e6f2a1e7d1def5f9c36fdb58085a28f1b36 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Tue, 7 Jul 2020 09:57:33 -0400 Subject: [PATCH 02/36] Updated text for -l option --- amrfinder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index bf7e181..d81d0c6 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -392,7 +392,7 @@ struct ThisApplication : ShellApplication if (list_organisms) { const StringVector organisms (db2organisms ()); - cout << "Possible organisms: " + organisms. toString (", ") << endl; + cout << endl << "Available --organism options: " + organisms. toString (", ") << endl; return; } From a8f8f6bb43055c2fbe769a7ab4ff9b5a56872a51 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Fri, 10 Jul 2020 14:19:06 -0400 Subject: [PATCH 03/36] Updated text for -i option (PD-3482) Also commented out commit trigger for mac conda test. --- .github/workflows/mac_conda.yml | 6 +++--- amrfinder.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mac_conda.yml b/.github/workflows/mac_conda.yml index e77e150..1eefce6 100644 --- a/.github/workflows/mac_conda.yml +++ b/.github/workflows/mac_conda.yml @@ -1,8 +1,8 @@ name: Mac bioconda on: - push: - branches: - - master + # push: + # branches: + # - master schedule: - cron: '30 3 * * *' # 3:30am everyday repository_dispatch: diff --git a/amrfinder.cpp b/amrfinder.cpp index d81d0c6..e6f5488 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -186,7 +186,7 @@ struct ThisApplication : ShellApplication addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); - addKey ("ident_min", "Minimum identity for nucleotide hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); + addKey ("ident_min", "Minimum proportion of identical residues in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); // "nucleotide hit" --> "reference protein" ?? addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); addKey ("organism", "Taxonomy group. To see all possible taxonomy groups use the --list_organisms flag", "", 'O', "ORGANISM"); From f5e0fdf7b45c642fb32a87752d04d65b4b92af9e Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Mon, 13 Jul 2020 16:51:24 -0400 Subject: [PATCH 04/36] Revise --ident_min message slightly --- amrfinder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index e6f5488..08440c7 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -186,7 +186,7 @@ struct ThisApplication : ShellApplication addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); - addKey ("ident_min", "Minimum proportion of identical residues in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); + addKey ("ident_min", "Minimum proportion of identical amino acids in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); // "nucleotide hit" --> "reference protein" ?? addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); addKey ("organism", "Taxonomy group. To see all possible taxonomy groups use the --list_organisms flag", "", 'O', "ORGANISM"); From 8030d91ba792687d937c84ab09ac5350173e6462 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Tue, 14 Jul 2020 10:29:40 -0400 Subject: [PATCH 05/36] --ident_min instruction; -l for old database versions --- amrfinder.cpp | 26 ++++++++++++--------- common.cpp | 7 ++---- common.hpp | 64 ++++++++++++++++++++++++++++++++++++++------------- version.txt | 2 +- 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index 08440c7..f1cc03b 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,8 @@ * cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 07/13/2020 PD-3484 -l for old database versions +* 3.8.5 07/10/2020 PD-3482 --ident_min instruction * 3.8.4 05/13/2020 PD-3447 Custom point mutation does not match the reference sequence * Text "*** ERROR ***" is not repeated twice * 3.8.3 05/01/2020 WILDTYPE mutations were reported as 0-based @@ -187,7 +189,7 @@ struct ThisApplication : ShellApplication addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); addKey ("ident_min", "Minimum proportion of identical amino acids in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); - // "nucleotide hit" --> "reference protein" ?? + // "PD-3482 addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); addKey ("organism", "Taxonomy group. To see all possible taxonomy groups use the --list_organisms flag", "", 'O', "ORGANISM"); addFlag ("list_organisms", "Print the list of all possible taxonomy groups for mutations identification and exit", 'l'); @@ -248,8 +250,10 @@ struct ThisApplication : ShellApplication StringVector db2organisms () const { - exec ("tail -n +2 " + tmp + ".db/AMRProt-mutation.tab" + " | cut -f 1 > " + tmp + ".prot_org"); + checkFile (tmp + ".db/taxgroup.tab"); + checkFile (tmp + ".db/AMRProt-mutation.tab"); exec ("tail -n +2 " + tmp + ".db/taxgroup.tab" + " | cut -f 1 > " + tmp + ".tax_org"); + exec ("tail -n +2 " + tmp + ".db/AMRProt-mutation.tab" + " | cut -f 1 > " + tmp + ".prot_org"); exec ("cat " + tmp + ".prot_org " + tmp + ".tax_org | sort -u > " + tmp + ".org"); return StringVector (tmp + ".org", (size_t) 100); // PAR } @@ -389,14 +393,6 @@ struct ThisApplication : ShellApplication exec ("ln -s " + shellQuote (path2canonical (db)) + " " + tmp + ".db"); - if (list_organisms) - { - const StringVector organisms (db2organisms ()); - cout << endl << "Available --organism options: " + organisms. toString (", ") << endl; - return; - } - - // PD-3051 try { @@ -419,6 +415,14 @@ struct ThisApplication : ShellApplication } + if (list_organisms) + { + const StringVector organisms (db2organisms ()); + cout << endl << "Available --organism options: " + organisms. toString (", ") << endl; + return; + } + + { string searchMode; StringVector includes; @@ -523,7 +527,7 @@ struct ThisApplication : ShellApplication { const StringVector organisms (db2organisms ()); if (! organisms. contains (organism1)) - throw runtime_error ("Possible organisms: " + organisms. toString (", ")); + throw runtime_error ("Possible organisms: " + organisms. toString (", ")); } } if (! organism1. empty ()) diff --git a/common.cpp b/common.cpp index 48bc1c0..edd836c 100644 --- a/common.cpp +++ b/common.cpp @@ -1215,7 +1215,7 @@ StringVector::StringVector (const string &fName, } catch (const exception &e) { - throw runtime_error ("Loading file " + shellQuote (fName) + "\n" + e. what ()); + throw runtime_error ("Reading file " + shellQuote (fName) + "\n" + e. what ()); } } @@ -1563,10 +1563,7 @@ void Token::readInput (CharInput &in) qc (); if (verbose ()) - { - cout << type2str (type) << ' '; - cout << *this << ' ' << charNum << endl; - } + cout << type2str (type) << ' ' << *this << ' ' << charNum + 1 << endl; } diff --git a/common.hpp b/common.hpp index aa29c6e..34eead5 100644 --- a/common.hpp +++ b/common.hpp @@ -788,10 +788,14 @@ struct List : list prefixIt++; } } - List& operator<< (T t) + List& operator<< (const T &t) { P::push_back (t); return *this; } + List& operator<< (T &&t) + { P::push_back (move (t)); + return *this; + } template */> List& operator<< (const list &other) { P::insert (P::end (), other. begin (), other. end ()); @@ -803,12 +807,16 @@ struct List : list return *this; } T popFront () - { const T t = P::front (); + { if (P::empty ()) + throw range_error ("popFront() empty list"); + const T t = P::front (); P::pop_front (); return t; } T popBack () - { const T t = P::back (); + { if (P::empty ()) + throw range_error ("popBack() empty list"); + const T t = P::back (); P::pop_back (); return t; } @@ -1062,6 +1070,11 @@ inline string shellQuote (string s) bool fileExists (const string &fName); +inline void checkFile (const string &fName) + { if (! fileExists (fName)) + throw runtime_error ("File " + strQuote (fName) + " does not exist"); + } + streampos getFileSize (const string &fName); inline void removeFile (const string &fName) @@ -1380,7 +1393,7 @@ struct Root virtual bool empty () const { return true; } virtual void clear () - {} + { throw logic_error ("Root::clear() is not implemented"); } // Postcondition: empty() virtual void read (istream &/*is*/) { throw logic_error ("Root::read() is not implemented"); } @@ -1498,20 +1511,22 @@ struct Vector : vector {} - typename P::reference operator[] (size_t index) +private: + void checkIndex (const string &operation, + size_t index) const { #ifndef NDEBUG if (index >= P::size ()) - throw range_error ("Vector assignment to index = " + to_string (index) + ", but size = " + to_string (P::size ())); + throw range_error ("Vector " + operation + " operation: index = " + to_string (index) + ", but size = " + to_string (P::size ())); #endif + } +public: + typename P::reference operator[] (size_t index) + { checkIndex ("assignment", index); return P::operator[] (index); } typename P::const_reference operator[] (size_t index) const - { - #ifndef NDEBUG - if (index >= P::size ()) - throw range_error ("Vector reading of index = " + to_string (index) + ", but size = " + to_string (P::size ())); - #endif + { checkIndex ("get", index); return P::operator[] (index); } void reserveInc (size_t inc) @@ -1593,9 +1608,16 @@ struct Vector : vector { eraseMany (index, index + 1); } void eraseMany (size_t from, size_t to) - { P::erase ( P::begin () + (ptrdiff_t) from - , P::begin () + (ptrdiff_t) to - ); + { if (to < from) + throw logic_error ("Vector::eraseMany(): to < from"); + if (to == from) + return; + checkIndex ("eraseMany", to - 1); + auto it1 = P::begin (); + std::advance (it1, from); + auto it2 = it1; + std::advance (it2, to - from); + P::erase (it1, it2); } void wipe () { P::clear (); @@ -1618,6 +1640,14 @@ struct Vector : vector std::swap (t, (*this) [(size_t) rand. get ((ulong) P::size ())]); searchSorted = false; } + void pop_back () + { + #ifndef NDEBUG + if (P::empty ()) + throw range_error ("Empty vector pop_back"); + #endif + P::pop_back (); + } T pop (size_t n = 1) { T t = T (); while (n) @@ -2399,6 +2429,7 @@ template template struct RandomSet +// Set stored in a vector for a random access { private: Vector vec; @@ -2444,7 +2475,6 @@ struct RandomSet } const Vector& getVec () const { return vec; } - // For a random access }; @@ -2742,6 +2772,8 @@ struct Token : Root void saveText (ostream &os) const override; bool empty () const override { return type == eDelimiter && name. empty (); } + void clear () override + { *this = Token (); } static string type2str (Type type) @@ -3393,7 +3425,7 @@ struct Application : Singleton, Root const string description; const bool needsArg; const bool gnu; - string version {"1.0.0"}; + string version {"0.0.0"}; static constexpr const char* helpS {"help"}; static constexpr const char* versionS {"version"}; diff --git a/version.txt b/version.txt index ff313b8..0cbfaed 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.4 +3.8.5 From 5733cc698a0ab3bc683cd831d509ef871f085d19 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Wed, 29 Jul 2020 17:22:50 -0400 Subject: [PATCH 06/36] PD-3468 -name --- alignment.cpp | 10 +++++----- amr_report.cpp | 8 ++++++++ amrfinder.cpp | 15 +++++++++------ common.cpp | 6 +++--- common.hpp | 48 +++++++++++++++++++++++++----------------------- dna_mutation.cpp | 7 +++++++ fasta_check.cpp | 2 +- version.txt | 2 +- 8 files changed, 59 insertions(+), 39 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 87efb16..700c3ab 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -414,9 +414,9 @@ bool normalizeSeq (const string &seq1, ASSERT (seq1. size () == seq2. size ()); bool changed = false; - size_t start = NO_INDEX; + size_t start = no_index; FFOR (size_t, i, seq1. size ()) - if (start == NO_INDEX) + if (start == no_index) { if (seq2 [i] == '-') start = i; @@ -432,7 +432,7 @@ bool normalizeSeq (const string &seq1, changed = true; } else - start = NO_INDEX; + start = no_index; } ASSERT (seq1. size () == seq2. size ()); @@ -694,13 +694,13 @@ void Alignment::setSeqChanges (const Vector &refMutations, } // SeqChange::mutation - size_t start_ref_prev = NO_INDEX; + size_t start_ref_prev = no_index; for (SeqChange& seqChange : seqChanges) { seqChange. qc (); if (verbose ()) seqChange. saveText (cout); - IMPLY (start_ref_prev != NO_INDEX, start_ref_prev <= seqChange. start_ref); + IMPLY (start_ref_prev != no_index, start_ref_prev <= seqChange. start_ref); while (j < refMutations. size ()) { const Mutation& mut = refMutations [j]; diff --git a/amr_report.cpp b/amr_report.cpp index b490f66..7f249eb 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -296,6 +296,8 @@ const string frameShiftS ("[frameshift]"); unique_ptr mutation_all; +string input_name; + struct BlastAlignment : Alignment @@ -517,6 +519,8 @@ struct BlastAlignment : Alignment const string method (empty () ? na : getMethod (cds)); ASSERT (isMutation () == ! (seqChange. empty () && ! mut)); TabDel td (2, false); + if (! input_name. empty ()) + td << input_name;; td << (targetProt ? nvl(targetName, na) : na); // PD-2534 if (cdsExist) td << (empty () ? na : (cds. contig. empty () ? nvl (targetName,na) : cds. contig)) @@ -1357,6 +1361,8 @@ struct Batch { // Cf. BlastAlignment::saveText() TabDel td; + if (! input_name. empty ()) + td << "Name"; td << "Protein identifier"; // 1 // targetName // PD-2534 if (cdsExist) // Contig @@ -1534,6 +1540,7 @@ struct ThisApplication : Application addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist"); addFlag ("non_reportable", "Report non-reportable families"); addFlag ("core", "Report only core reportale families"); + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report"); // Testing addFlag ("nosame", "Exclude the same reference protein accessions from the BLAST output (for testing)"); addFlag ("noblast", "Exclude the BLAST output (for testing)"); @@ -1572,6 +1579,7 @@ struct ThisApplication : Application const bool force_cds_report = getFlag ("force_cds_report"); const bool non_reportable = getFlag ("non_reportable"); const bool report_core_only = getFlag ("core"); + input_name = getArg ("name"); const bool nosame = getFlag ("nosame"); const bool noblast = getFlag ("noblast"); const bool nohmm = getFlag ("nohmm"); diff --git a/amrfinder.cpp b/amrfinder.cpp index f1cc03b..6ea9fb3 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.6 07/29/2020 PD-3468 -name option * 07/13/2020 PD-3484 -l for old database versions * 3.8.5 07/10/2020 PD-3482 --ident_min instruction * 3.8.4 05/13/2020 PD-3447 Custom point mutation does not match the reference sequence @@ -199,10 +200,10 @@ struct ThisApplication : ShellApplication addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); //addKey ("hmmer_bin" ?? + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report, for example it can be an assembly name", "", '\0', "NAME"); addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); addFlag ("quiet", "Suppress messages to STDERR", 'q'); addFlag ("gpipe_org", "NCBI internal GPipe organism names"); - //addKey ("sample", "Sample name to be adde as the first column of the report", ""); addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); version = SVN_REV; // threads_max: do not include blast/hmmsearch's threads ?? @@ -277,6 +278,7 @@ struct ThisApplication : ShellApplication const bool report_common = getFlag ("report_common"); const string mutation_all = shellQuote (getArg ("mutation_all")); string blast_bin = getArg ("blast_bin"); + const string input_name = shellQuote (getArg ("name")); const string parm = getArg ("parm"); const string output = shellQuote (getArg ("output")); const bool quiet = getFlag ("quiet"); @@ -588,7 +590,7 @@ struct ThisApplication : ShellApplication { findProg ("blastp"); findProg ("hmmsearch"); - exec (fullProg ("fasta_check") + prot + " -aa -hyphen " + qcS + " -log " + logFName, logFName); + exec (fullProg ("fasta_check") + prot + " -aa -hyphen" + qcS + " -log " + logFName, logFName); if (! emptyArg (gff) && ! contains (parm, "-bed")) { @@ -660,13 +662,13 @@ struct ThisApplication : ShellApplication { stderr << "Running blastx...\n"; findProg ("blastx"); - exec (fullProg ("fasta_check") + dna + " -hyphen -len "+ tmp + ".len " + qcS + " -log " + logFName, logFName); + exec (fullProg ("fasta_check") + dna + " -hyphen -len "+ tmp + ".len" + qcS + " -log " + logFName, logFName); const size_t threadsAvailable = th. getAvailable (); //ASSERT (threadsAvailable); if (threadsAvailable >= 2) { exec ("mkdir " + tmp + ".chunk"); - exec (fullProg ("fasta2parts") + dna + " " + to_string (threadsAvailable) + " " + tmp + ".chunk " + qcS + " -log " + logFName, logFName); // PAR + exec (fullProg ("fasta2parts") + dna + " " + to_string (threadsAvailable) + " " + tmp + ".chunk" + qcS + " -log " + logFName, logFName); // PAR exec ("mkdir " + tmp + ".blastx_dir"); FileItemGenerator fig (false, true, tmp + ".chunk"); string item; @@ -729,6 +731,7 @@ struct ThisApplication : ShellApplication // ".amr" + const string nameS (emptyArg (input_name) ? "" : " -name " + input_name); { const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all")); const string coreS (add_plus ? "" : " -core"); @@ -738,7 +741,7 @@ struct ThisApplication : ShellApplication + (ident == -1 ? string () : " -ident_min " + toString (ident)) + " -coverage_min " + toString (cov) + ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot") + pgapS - + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); + + nameS + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); } if ( ! emptyArg (dna) && ! organism1. empty () @@ -747,7 +750,7 @@ struct ThisApplication : ShellApplication { const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all.dna")); exec (fullProg ("dna_mutation") + tmp + ".blastn " + shellQuote (db + "/AMR_DNA-" + organism1 + ".tab") + " " + strQuote (organism1) + " " + mutation_allS - + " " + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName); + + nameS + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName); exec ("tail -n +2 " + tmp + ".amr-snp >> " + tmp + ".amr"); if (! emptyArg (mutation_all)) exec ("tail -n +2 " + tmp + ".mutation_all.dna >> " + tmp + ".mutation_all"); diff --git a/common.cpp b/common.cpp index edd836c..4bc2e1d 100644 --- a/common.cpp +++ b/common.cpp @@ -854,7 +854,7 @@ size_t strMonth2num (const string& month) return (size_t) (m - 1); } - size_t i = NO_INDEX; + size_t i = no_index; if (month == "Jan") i = 0; else if (month == "Feb") i = 1; else if (month == "Mar") i = 2; @@ -1830,7 +1830,7 @@ string Csv::getWord () QC_ASSERT (goodPos ()); size_t start = pos; - size_t stop = NO_INDEX; + size_t stop = no_index; if (s [pos] == '\"') { pos++; @@ -1841,7 +1841,7 @@ string Csv::getWord () } findChar (','); - if (stop == NO_INDEX) + if (stop == no_index) stop = pos; pos++; diff --git a/common.hpp b/common.hpp index 34eead5..cc4c9a5 100644 --- a/common.hpp +++ b/common.hpp @@ -306,12 +306,12 @@ inline bool boolPow (bool x, bool power) // ebool - extended bool -enum ebool {EFALSE = false, - ETRUE = true, - UBOOL = true + 1}; +enum ebool {efalse = false, + etrue = true, + enull = true + 1}; inline ebool toEbool (bool b) - { return b ? ETRUE : EFALSE; } + { return b ? etrue : efalse; } inline bool operator<= (ebool a, ebool b) @@ -320,10 +320,10 @@ inline bool operator<= (ebool a, ebool b) } inline void toggle (ebool &b) - { if (b == ETRUE) - b = EFALSE; - else if (b == EFALSE) - b = ETRUE; + { if (b == etrue) + b = efalse; + else if (b == efalse) + b = etrue; } @@ -393,7 +393,7 @@ uint powInt (uint a, -const size_t NO_INDEX = SIZE_MAX; +constexpr size_t no_index = numeric_limits::max (); @@ -772,7 +772,7 @@ struct List : list return i; else i++; - return NO_INDEX; + return no_index; } bool isPrefix (const List &prefix) const { typename List::const_iterator wholeIt = P::begin (); @@ -1295,7 +1295,7 @@ template ASSERT (threads_max >= 1); results. clear (); results. reserve (threads_max); - if (threads_max == 1) + if (threads_max == 1 || i_max <= 1) { results. push_back (Res ()); func (0, i_max, results. front (), forward(args)...); @@ -1566,7 +1566,7 @@ struct Vector : vector return n; else n++; - return NO_INDEX; + return no_index; } size_t countValue (const T &value) const { size_t n = 0; @@ -1728,17 +1728,17 @@ struct Vector : vector size_t binSearch (const T &value, bool exact = true) const - // Return: if exact then NO_INDEX or vec[Return] = value else min {i : vec[i] >= value} + // Return: if exact then no_index or vec[Return] = value else min {i : vec[i] >= value} { if (P::empty ()) - return NO_INDEX; + return no_index; checkSorted (); size_t lo = 0; // vec.at(lo) <= value size_t hi = P::size () - 1; // lo <= hi if (value < (*this) [lo]) - return exact ? NO_INDEX : lo; + return exact ? no_index : lo; if ((*this) [hi] < value) - return NO_INDEX; + return no_index; // at(lo) <= value <= at(hi) for (;;) { @@ -1757,11 +1757,11 @@ struct Vector : vector return lo; if (! exact || (*this) [hi] == value) return hi; - return NO_INDEX; + return no_index; } template bool containsFast (const U &value) const - { return binSearch (value) != NO_INDEX; } + { return binSearch (value) != no_index; } template bool containsFastAll (const Vector &other) const { if (other. size () > P::size ()) @@ -1838,14 +1838,14 @@ struct Vector : vector size_t findDuplicate () const { if (P::size () <= 1) - return NO_INDEX; + return no_index; FOR_START (size_t, i, 1, P::size ()) if ((*this) [i] == (*this) [i - 1]) return i; - return NO_INDEX; + return no_index; } bool isUniq () const - { return findDuplicate () == NO_INDEX; } + { return findDuplicate () == no_index; } template void uniq (const Equal &equal) { if (P::size () <= 1) @@ -2095,6 +2095,8 @@ struct Heap : Root, Nocopy bool empty () const final { return arr. empty (); } + size_t size () const + { return arr. size (); } Heap& operator<< (T item) { arr << item; increaseKey (arr. size () - 1); @@ -2497,11 +2499,11 @@ struct Enumerate size_t find (const T &t) const { if (const size_t* num = Common_sp::findPtr (elem2num, t)) return *num; - return NO_INDEX; + return no_index; } size_t add (const T &t) { size_t num = find (t); - if (num == NO_INDEX) + if (num == no_index) { num2elem << t; num = num2elem. size () - 1; elem2num [t] = num; diff --git a/dna_mutation.cpp b/dna_mutation.cpp index d5353ac..77e197f 100644 --- a/dna_mutation.cpp +++ b/dna_mutation.cpp @@ -50,6 +50,7 @@ namespace map > accession2mutations; unique_ptr mutation_all; +string input_name; @@ -109,6 +110,8 @@ struct BlastnAlignment : Alignment { ASSERT (! (seqChange. empty () && ! seqChange. mutation)); TabDel td (2, false); + if (! input_name. empty ()) + td << input_name;; td << na // PD-2534 << nvl (targetName, na) << (empty () ? 0 : targetStart + 1) @@ -211,6 +214,8 @@ struct Batch { // Cf. BlastnAlignment::saveText() TabDel td; + if (! input_name. empty ()) + td << "Name"; td << "Protein identifier" // targetName // PD-2534 // Contig << "Contig id" @@ -267,6 +272,7 @@ struct ThisApplication : Application addPositional ("mutation", "Mutations table"); addPositional ("organism", "Organism name"); addKey ("mutation_all", "File to report all mutations"); + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report"); version = SVN_REV; } @@ -278,6 +284,7 @@ struct ThisApplication : Application const string mutation_tab = getArg ("mutation"); const string organism = getArg ("organism"); const string mutation_all_FName = getArg ("mutation_all"); + input_name = getArg ("name"); if (! mutation_all_FName. empty ()) diff --git a/fasta_check.cpp b/fasta_check.cpp index 8cdcc5d..ff9e726 100644 --- a/fasta_check.cpp +++ b/fasta_check.cpp @@ -148,7 +148,7 @@ struct ThisApplication : Application ids. sort (); const size_t index = ids. findDuplicate (); - if (index != NO_INDEX) + if (index != no_index) throw runtime_error ("Duplicate identifier: " + ids [index]); } }; diff --git a/version.txt b/version.txt index 0cbfaed..2e14a95 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.5 +3.8.6 From 6a0c79fbc8b555327f660178edc8c4fe4d4720ff Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 3 Aug 2020 16:10:07 -0400 Subject: [PATCH 07/36] PD-3504 --protein_output, --nucleotide_output --- .gitignore | 1 + Makefile | 7 +- amrfinder.cpp | 49 +++++++--- fasta_extract.cpp | 242 ++++++++++++++++++++++++++++++++++++++++++++++ version.txt | 2 +- 5 files changed, 287 insertions(+), 14 deletions(-) create mode 100644 fasta_extract.cpp diff --git a/.gitignore b/.gitignore index aa4d7f3..44dc746 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ data/ amr_report amrfinder fasta_check +fasta_extract dna_mutation gff_check amrfinder_update diff --git a/Makefile b/Makefile index e92ff6a..b86b44c 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) -c .PHONY: all clean install release -BINARIES= amr_report amrfinder amrfinder_update fasta_check fasta2parts gff_check dna_mutation +BINARIES= amr_report amrfinder amrfinder_update fasta_check fasta_extract fasta2parts gff_check dna_mutation all: $(BINARIES) @@ -81,6 +81,11 @@ fasta_checkOBJS=fasta_check.o common.o fasta_check: $(fasta_checkOBJS) $(CXX) -o $@ $(fasta_checkOBJS) +fasta_extract.o: common.hpp common.inc +fasta_extractOBJS=fasta_extract.o common.o +fasta_extract: $(fasta_extractOBJS) + $(CXX) -o $@ $(fasta_extractOBJS) + fasta2parts.o: common.hpp common.inc fasta2partsOBJS=fasta2parts.o common.o fasta2parts: $(fasta2partsOBJS) diff --git a/amrfinder.cpp b/amrfinder.cpp index 6ea9fb3..3591116 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: -* 3.8.6 07/29/2020 PD-3468 -name option +* 3.8.7 08/03/2020 PD-3504 --protein_output, --nucleotide_output options by fasta_extract.cpp +* 3.8.6 07/29/2020 PD-3468 --name option * 07/13/2020 PD-3484 -l for old database versions * 3.8.5 07/10/2020 PD-3482 --ident_min instruction * 3.8.4 05/13/2020 PD-3447 Custom point mutation does not match the reference sequence @@ -184,8 +185,8 @@ struct ThisApplication : ShellApplication : ShellApplication (HELP, true, true, true) { addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 - addKey ("protein", "Protein FASTA file to search", "", 'p', "PROT_FASTA"); - addKey ("nucleotide", "Nucleotide FASTA file to search", "", 'n', "NUC_FASTA"); + addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA"); + addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA"); addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); @@ -202,6 +203,8 @@ struct ThisApplication : ShellApplication //addKey ("hmmer_bin" ?? addKey ("name", "Text to be added as the first column \"name\" to all rows of the report, for example it can be an assembly name", "", '\0', "NAME"); addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); + addKey ("protein_output", "Output protein FASTA file of reported proteins", "", '\0', "PROT_FASTA_OUT"); + addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT"); addFlag ("quiet", "Suppress messages to STDERR", 'q'); addFlag ("gpipe_org", "NCBI internal GPipe organism names"); addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); @@ -281,6 +284,8 @@ struct ThisApplication : ShellApplication const string input_name = shellQuote (getArg ("name")); const string parm = getArg ("parm"); const string output = shellQuote (getArg ("output")); + const string prot_out = shellQuote (getArg ("protein_output")); + const string dna_out = shellQuote (getArg ("nucleotide_output")); const bool quiet = getFlag ("quiet"); const bool gpipe_org = getFlag ("gpipe_org"); @@ -429,11 +434,12 @@ struct ThisApplication : ShellApplication string searchMode; StringVector includes; if (emptyArg (prot)) + { if (emptyArg (dna)) { if (update) return; - throw runtime_error ("Parameter --prot or --nucleotide must be present"); + throw runtime_error ("Parameter --protein or --nucleotide must be present"); } else { @@ -441,6 +447,7 @@ struct ThisApplication : ShellApplication throw runtime_error ("Parameter --gff is redundant"); searchMode = "translated nucleotide"; } + } else { searchMode = "protein"; @@ -452,10 +459,14 @@ struct ThisApplication : ShellApplication else { if (emptyArg (gff)) - throw runtime_error ("If parameters --prot and --nucleotide are present then parameter --gff must be present"); + throw runtime_error ("If parameters --protein and --nucleotide are present then parameter --gff must be present"); searchMode = "combined translated and protein"; } } + if (emptyArg (prot) && ! emptyArg (prot_out)) + throw runtime_error ("Parameter --protein must be present for --protein_out"); + if (emptyArg (dna) && ! emptyArg (dna_out)) + throw runtime_error ("Parameter --nucleotide must be present for --nucleotide_out"); ASSERT (! searchMode. empty ()); if (emptyArg (organism)) includes << key2shortHelp ("organism") + " option to add mutation searches and suppress common proteins"; @@ -542,9 +553,10 @@ struct ThisApplication : ShellApplication const string force_cds_report (! emptyArg (dna) && ! organism1. empty () ? "-force_cds_report" : ""); // Needed for dna_mutation - prog2dir ["fasta_check"] = execDir; - prog2dir ["fasta2parts"] = execDir; - prog2dir ["amr_report"] = execDir; + prog2dir ["fasta_check"] = execDir; + prog2dir ["fasta2parts"] = execDir; + prog2dir ["amr_report"] = execDir; + prog2dir ["fasta_extract"] = execDir; bool lcl = false; @@ -769,16 +781,29 @@ struct ThisApplication : ShellApplication exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort " + sortS + " | uniq >> " + tmp + ".mutation_all-out"); exec ("mv " + tmp + ".mutation_all-out " + mutation_all); } - - // timing the run - const time_t end = time (NULL); - stderr << "AMRFinder took " << end - start << " seconds to complete\n"; if (emptyArg (output)) exec ("cat " + tmp + ".amr"); else exec ("cp " + tmp + ".amr " + output); + + + if (! emptyArg (prot_out)) + { + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$1 != \"NA\" {print $1, $6, $7};' | sort -u > " + tmp + ".prot_out"); + exec (fullProg ("fasta_extract") + prot + " " + tmp + ".prot_out -aa" + qcS + " -log " + logFName + " > " + prot_out, logFName); + } + if (! emptyArg (dna_out)) + { + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$1 == \"NA\" {print $2, $3, $4, $5, $6, $7};' | sort -u > " + tmp + ".dna_out"); + exec (fullProg ("fasta_extract") + dna + " " + tmp + ".dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName); + } + + + // timing the run + const time_t end = time (NULL); + stderr << "AMRFinder took " << end - start << " seconds to complete\n"; } }; diff --git a/fasta_extract.cpp b/fasta_extract.cpp new file mode 100644 index 0000000..0f2397e --- /dev/null +++ b/fasta_extract.cpp @@ -0,0 +1,242 @@ +// fasta_check.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Extract sequences out of a FASTA file +* +*/ + + +#undef NDEBUG +#include "common.inc" + +#include "common.hpp" +using namespace Common_sp; + + + +namespace +{ + + + +struct Segment +{ + size_t start {0}; + size_t stop {0}; + bool strand {true}; + // false <=> negative + string genesymbol; + string name; + + + bool isDna () const + { return stop; } + size_t size () const + { return stop - start; } + void saveText (ostream &os) const + { TabDel td; + td << start << stop << strand << genesymbol << name; + os << td. str (); + } +}; + + + +char complementaryNucleotide (char wildNucleotide) +{ + char r = ' '; + switch (toLower (wildNucleotide)) + { + case 'a': r = 't'; break; + case 'c': r = 'g'; break; + case 'g': r = 'c'; break; + case 't': r = 'a'; break; + case 'm': r = 'k'; break; + case 'r': r = 'y'; break; + case 'w': r = 'w'; break; + case 's': r = 's'; break; + case 'y': r = 'r'; break; + case 'k': r = 'm'; break; + case 'v': r = 'b'; break; + case 'h': r = 'd'; break; + case 'd': r = 'h'; break; + case 'b': r = 'v'; break; + case 'n': r = 'n'; break; + case '-': r = '-'; break; + default: + throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide)); + } + if (isupper (wildNucleotide)) + r = toUpper (r); + + return r; +} + + + +void process (const string &id, + string &seq, + const map> &id2segments) +{ + if (id. empty ()) + return; + const Vector* segments = findPtr (id2segments, id); + if (! segments) + return; + + replaceStr (seq, "-", ""); + QC_ASSERT (! seq. empty ()); + + for (const Segment& seg : *segments) + { + cout << '>' << id; + if (seg. isDna ()) + cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); + cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; + if (seg. isDna ()) + { + QC_ASSERT (seg. stop <= seq. size ()); + seq = seq. substr (seg. start, seg. size ()); + if (! seg. strand) + { + reverse (seq); + for (char &c : seq) + c = complementaryNucleotide (c); + } + } + constexpr size_t line_len = 60; // PAR + for (size_t i = 0; i < seq. size (); i += line_len) + cout << seq. substr (i, line_len) << endl; + } +} + + + +struct ThisApplication : Application +{ + ThisApplication () + : Application ("Extract sequences out of a FASTA file") + { + addPositional ("fasta", "FASTA file"); + addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ +Line format for amino acid sequences : \n\ +Line format for nucleotide sequences : =1)> = start)> \ +"); + addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); + version = SVN_REV; + } + + + + void body () const final + { + const string fName = getArg ("fasta"); + const string targetFName = getArg ("target"); + const bool aa = getFlag ("aa"); + + + map> id2segments; + { + LineInput f (targetFName); + string id; + Istringstream iss; + while (f. nextLine ()) + { + iss. reset (f. line); + Segment seg; + iss >> id; + char strand; + if (! aa) + { + iss >> seg. start >> seg. stop >> strand; + QC_ASSERT (seg. start); + QC_ASSERT (seg. start <= seg. stop); + seg. start--; + seg. strand = (strand == '+'); + } + iss >> seg. genesymbol; + seg. name = f. line. substr ((size_t) iss. tellg ()); + trim (seg. name); + QC_ASSERT (aa == ! seg. isDna ()); + id2segments [id] << move (seg); + } + } + if (verbose ()) + for (const auto& it : id2segments) + { + cout << it. first << ": " << endl; + for (const Segment& seg : it. second) + { + cout << " "; + seg. saveText (cout); + cout << endl; + } + } + if (id2segments. empty ()) + return; + + + LineInput f (fName); + string id; + string seq; + while (f. nextLine ()) + { + if (f. line. empty ()) + continue; + if (f. line [0] == '>') + { + process (id, seq, id2segments); + size_t pos = 1; + while (pos < f. line. size () && ! isspace (f. line [pos])) + pos++; + id = f. line. substr (1, pos - 1); + seq. clear (); + } + else + seq += f. line; + } + process (id, seq, id2segments); + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/version.txt b/version.txt index 2e14a95..4351a7e 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.6 +3.8.7 From 0bdabe2a6f4bd106a7b3ee9be45a3aabcce09fb0 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Tue, 4 Aug 2020 18:00:28 -0400 Subject: [PATCH 08/36] PD-3504 bug in fasta_extract.cpp, more output in --nucleotide_output --- amr_report.cpp | 4 ++-- amrfinder.cpp | 43 +++++++++++++++++++++++++++++-------------- common.cpp | 14 ++++++++++++++ common.hpp | 5 +++++ fasta_extract.cpp | 13 +++++++------ version.txt | 2 +- 6 files changed, 58 insertions(+), 23 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 7f249eb..dc877cb 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -524,8 +524,8 @@ struct BlastAlignment : Alignment td << (targetProt ? nvl(targetName, na) : na); // PD-2534 if (cdsExist) td << (empty () ? na : (cds. contig. empty () ? nvl (targetName,na) : cds. contig)) - << (empty () ? 0 : (cds. contig. empty () ? targetStart : cds. start) + 1) - << (empty () ? 0 : (cds. contig. empty () ? targetEnd : cds. stop)) + << (empty () ? 0 : (cds. contig. empty () ? targetStart : cds. start) + 1) + << (empty () ? 0 : (cds. contig. empty () ? targetEnd : cds. stop)) << (empty () ? na : (cds. contig. empty () ? (targetStrand ? "+" : "-") : (cds. strand ? "+" : "-"))); td << (isMutation () ? mut diff --git a/amrfinder.cpp b/amrfinder.cpp index 3591116..a6deaa3 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,10 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which +* awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.8 08/04/2020 bug in fasta_extract.cpp, more output in --nucleotide_output * 3.8.7 08/03/2020 PD-3504 --protein_output, --nucleotide_output options by fasta_extract.cpp * 3.8.6 07/29/2020 PD-3468 --name option * 07/13/2020 PD-3484 -l for old database versions @@ -243,10 +244,7 @@ struct ThisApplication : ShellApplication // stderr << "Compiled for Linux" << "\n"; //exec ("nproc --all > " + tmp + ".nproc"); //exec ("grep processor /proc/cpuinfo | tail -1 | awk '{print $3}' > " + tmp + ".nproc"); - exec ("grep -c processor /proc/cpuinfo > " + tmp + ".nproc"); - const StringVector vec (tmp + ".nproc", (size_t) 1); - QC_ASSERT (vec. size () == 1); - return str2 (vec [0]); + return str2 (exec2str ("grep -c processor /proc/cpuinfo", "nproc")); #endif } @@ -261,6 +259,13 @@ struct ThisApplication : ShellApplication exec ("cat " + tmp + ".prot_org " + tmp + ".tax_org | sort -u > " + tmp + ".org"); return StringVector (tmp + ".org", (size_t) 100); // PAR } + + + + string col2num (const string &colName) const + { + return exec2str ("head -1 " + tmp + ".amr | tr '\t' '\n' | grep -n \"" + colName + "\" | cut -f 1 -d ':'", "col"); + } @@ -712,7 +717,7 @@ struct ThisApplication : ShellApplication prog2dir ["dna_mutation"] = execDir; stderr << "Running blastn...\n"; exec (fullProg ("blastn") + " -query " + dna + " -db " + tmp + ".db/AMR_DNA-" + organism1 + " -evalue 1e-20 -dust no " - BLAST_FMT " -out " + tmp + ".blastn > " + logFName + " 2> " + logFName, logFName); + BLAST_FMT " -out " + tmp + ".blastn > " + logFName + " 2> " + logFName, logFName); } else exec ("cp /dev/null " + tmp + ".blastn"); @@ -748,12 +753,12 @@ struct ThisApplication : ShellApplication const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all")); const string coreS (add_plus ? "" : " -core"); exec (fullProg ("amr_report") + " -fam " + shellQuote (db + "/fam.tab") + " " + amr_report_blastp + " " + amr_report_blastx - + " -organism " + strQuote (organism1) + " -mutation " + shellQuote (db + "/AMRProt-mutation.tab") + " " + mutation_allS + " " - + force_cds_report + " -pseudo" + coreS - + (ident == -1 ? string () : " -ident_min " + toString (ident)) - + " -coverage_min " + toString (cov) - + ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot") + pgapS - + nameS + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); + + " -organism " + strQuote (organism1) + " -mutation " + shellQuote (db + "/AMRProt-mutation.tab") + " " + mutation_allS + " " + + force_cds_report + " -pseudo" + coreS + + (ident == -1 ? string () : " -ident_min " + toString (ident)) + + " -coverage_min " + toString (cov) + + ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot") + pgapS + + nameS + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); } if ( ! emptyArg (dna) && ! organism1. empty () @@ -789,14 +794,24 @@ struct ThisApplication : ShellApplication exec ("cp " + tmp + ".amr " + output); + // Column names are from amr_report.cpp if (! emptyArg (prot_out)) { - exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$1 != \"NA\" {print $1, $6, $7};' | sort -u > " + tmp + ".prot_out"); + const string protCol (col2num ("Protein identifier")); + const string geneSymbolCol (col2num ("Gene symbol")); + const string seqNameCol (col2num ("Sequence name")); + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$" + protCol + " != \"NA\" {print $" + protCol + ", $" + geneSymbolCol + ", $" + seqNameCol + "};' | sort -u > " + tmp + ".prot_out"); exec (fullProg ("fasta_extract") + prot + " " + tmp + ".prot_out -aa" + qcS + " -log " + logFName + " > " + prot_out, logFName); } if (! emptyArg (dna_out)) { - exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$1 == \"NA\" {print $2, $3, $4, $5, $6, $7};' | sort -u > " + tmp + ".dna_out"); + const string contigCol (col2num ("Contig id")); + const string startCol (col2num ("Start")); + const string stopCol (col2num ("Stop")); + const string strandCol (col2num ("Strand")); + const string geneSymbolCol (col2num ("Gene symbol")); + const string seqNameCol (col2num ("Sequence name")); + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$" + contigCol + " != \"NA\" {print $" + contigCol + ", $" + startCol + ", $" + stopCol + ", $" + strandCol + ", $" + geneSymbolCol + ", $" + seqNameCol + "};' | sort -u > " + tmp + ".dna_out"); exec (fullProg ("fasta_extract") + dna + " " + tmp + ".dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName); } diff --git a/common.cpp b/common.cpp index 4bc2e1d..7c2aca7 100644 --- a/common.cpp +++ b/common.cpp @@ -3026,6 +3026,20 @@ string ShellApplication::fullProg (const string &progName) const +string ShellApplication::exec2str (const string &cmd, + const string &tmpPrefix, + const string &logFName) const +{ + ASSERT (! contains (tmpPrefix, ' ')); + const string out (tmp + "." + tmpPrefix); + exec (cmd + " > " + out, logFName); + const StringVector vec (out, (size_t) 1); + QC_ASSERT (vec. size () == 1); + return vec [0]; +} + + + } diff --git a/common.hpp b/common.hpp index cc4c9a5..39b44e5 100644 --- a/common.hpp +++ b/common.hpp @@ -3623,6 +3623,11 @@ struct ShellApplication : Application string fullProg (const string &progName) const; // Return: shellQuote (directory + progName) + ' ' // Requires: After findProg(progName) + string exec2str (const string &cmd, + const string &tmpPrefix, + const string &logFName = string()) const; + // Return: `cmd > .tmpPredix && cat .tmpPredix` + // Requires: cmd produces one line }; #endif diff --git a/fasta_extract.cpp b/fasta_extract.cpp index 0f2397e..e3464e4 100644 --- a/fasta_extract.cpp +++ b/fasta_extract.cpp @@ -119,20 +119,21 @@ void process (const string &id, if (seg. isDna ()) cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; + string seq1 (seq); if (seg. isDna ()) { - QC_ASSERT (seg. stop <= seq. size ()); - seq = seq. substr (seg. start, seg. size ()); + QC_ASSERT (seg. stop <= seq1. size ()); + seq1 = seq1. substr (seg. start, seg. size ()); if (! seg. strand) { - reverse (seq); - for (char &c : seq) + reverse (seq1); + for (char &c : seq1) c = complementaryNucleotide (c); } } constexpr size_t line_len = 60; // PAR - for (size_t i = 0; i < seq. size (); i += line_len) - cout << seq. substr (i, line_len) << endl; + for (size_t i = 0; i < seq1. size (); i += line_len) + cout << seq1. substr (i, line_len) << endl; } } diff --git a/version.txt b/version.txt index 4351a7e..d7d8e42 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.7 +3.8.8 From d23be7c0133c71f5cde336f7696e247bc692703d Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Mon, 10 Aug 2020 08:32:36 -0400 Subject: [PATCH 09/36] Add workflow_dispatch manual trigger for actions (#38) * Add workflow_dispatch manual trigger for actions * Add workflow_dispatch manual trigger for actions * Add workflow_dispatch manual trigger for actions * Add workflow_dispatch manual trigger for actions --- .github/workflows/binary.yml | 1 + .github/workflows/ccpp.yml | 1 + .github/workflows/conda.yml | 1 + .github/workflows/mac_conda.yml | 4 +--- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/binary.yml b/.github/workflows/binary.yml index 7324782..6501c56 100644 --- a/.github/workflows/binary.yml +++ b/.github/workflows/binary.yml @@ -1,6 +1,7 @@ name: binary tarball on: + workflow_dispatch: release: repository_dispatch: types: [linux-binary-test, install-test] diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 0f8469b..8e098a9 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -1,6 +1,7 @@ name: C++ CI on: + workflow_dispatch: push: repository_dispatch: types: [linux-compile-test, install-test] diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 70056cf..980998e 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -1,6 +1,7 @@ name: Linux bioconda on: + workflow_dispatch: schedule: - cron: '15 4 * * *' # 4:15am everyday repository_dispatch: diff --git a/.github/workflows/mac_conda.yml b/.github/workflows/mac_conda.yml index 1eefce6..ddbd5c2 100644 --- a/.github/workflows/mac_conda.yml +++ b/.github/workflows/mac_conda.yml @@ -1,8 +1,6 @@ name: Mac bioconda on: - # push: - # branches: - # - master + workflow_dispatch: schedule: - cron: '30 3 * * *' # 3:30am everyday repository_dispatch: From 04f73562ec6691d59d3278f3902506bcd57cae7a Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Thu, 20 Aug 2020 12:58:25 -0400 Subject: [PATCH 10/36] PD-3469 -u does not overwrite the same version unless --force_update --- alignment.hpp | 2 ++ amr_report.cpp | 81 +++++++++++++++++++++++++------------------- amrfinder.cpp | 11 ++++-- amrfinder_update.cpp | 30 +++++++++++++--- common.cpp | 17 +++++----- common.hpp | 10 ++++-- common.inc | 19 ++++++----- dna_mutation.cpp | 4 ++- version.txt | 2 +- 9 files changed, 114 insertions(+), 62 deletions(-) diff --git a/alignment.hpp b/alignment.hpp index 42ee9de..f9a0845 100644 --- a/alignment.hpp +++ b/alignment.hpp @@ -182,6 +182,7 @@ struct SeqChange : Root bool finish (const string &refSeq, size_t flankingLen); // Return: good match + // Input: flankingLen: valid if > 0 private: void setSeq (); void setStartStopRef (); @@ -252,6 +253,7 @@ struct Alignment : Root void setSeqChanges (const Vector &refMutations, size_t flankingLen/*, bool allMutationsP*/); + // Input: flankingLen: valid if > 0 public: bool empty () const override { return targetName. empty (); } diff --git a/amr_report.cpp b/amr_report.cpp index dc877cb..c5eedaa 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -371,6 +371,7 @@ struct BlastAlignment : Alignment throw runtime_error (string ("Bad AMRFinder database\n") + e. what ()); } QC_ASSERT (! refAccession. empty ()); + refName = refAccession; replace (product, '_', ' '); @@ -429,9 +430,11 @@ struct BlastAlignment : Alignment if (const Vector* refMutations = findPtr (accession2mutations, refAccession)) { if (verbose ()) - cout << "Mutation protein found: " << refName << endl; + cout << "Mutation protein found: " << refAccession << endl << line << endl; QC_ASSERT (isMutation ()); setSeqChanges (*refMutations, 0 /*flankingLen*/ /*, mutation_all. get ()*/); + if (verbose ()) + cout << endl; } } catch (...) @@ -462,7 +465,7 @@ struct BlastAlignment : Alignment seqChanges << SeqChange (this, & mut); } #endif - void qc () const + void qc () const override { if (! qc_on) return; @@ -496,7 +499,7 @@ struct BlastAlignment : Alignment #endif QC_IMPLY (! seqChanges. empty (), isMutation ()); } - void saveText (ostream& os) const + void saveText (ostream& os) const override { // PD-736, PD-774, PD-780, PD-799 const string na ("NA"); const string proteinName (isMutation () @@ -517,7 +520,7 @@ struct BlastAlignment : Alignment { const Mutation* mut = seqChange. mutation; const string method (empty () ? na : getMethod (cds)); - ASSERT (isMutation () == ! (seqChange. empty () && ! mut)); + IMPLY (! verbose (), isMutation () == ! (seqChange. empty () && ! mut)); TabDel td (2, false); if (! input_name. empty ()) td << input_name;; @@ -628,7 +631,9 @@ struct BlastAlignment : Alignment Set getMutationSymbols () const { Set mutationSymbols; for (const SeqChange& seqChange : seqChanges) - if (seqChange. mutation) + if ( ! seqChange. empty () + && seqChange. mutation + ) mutationSymbols << seqChange. mutation->geneMutation; return mutationSymbols; } @@ -659,7 +664,7 @@ struct BlastAlignment : Alignment ); } bool truncated (const Locus &cds) const - { return (missedDnaStart (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStart ()) : targetStart <= Locus::end_delta)) + { return (missedDnaStart (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStart ()) : targetStart <= Locus::end_delta)) || (missedDnaStop (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStop ()) : targetLen - targetEnd <= Locus::end_delta)); } bool truncatedCds () const @@ -746,7 +751,7 @@ struct BlastAlignment : Alignment { ASSERT (targetProt == other. targetProt); return targetStrand == other. targetStrand && targetStart + mismatchTailTarget >= other. targetStart - && targetEnd <= other. targetEnd + mismatchTailTarget; + && targetEnd <= other. targetEnd + mismatchTailTarget; } // Requires: same targetName bool matchesCds (const BlastAlignment &other) const @@ -806,18 +811,6 @@ struct BlastAlignment : Alignment { if (targetName != other. targetName) return false; - // PD-807 - if ( ! (targetProt && famId == other. famId) // PD-2441 - //&& ! sameTarget (other) - && ! other. insideEq (*this) - && ! insideEq (other) - ) - return false; - //if (targetProt) - //{ LESS_PART (other, *this, isMutation ()); } - LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 - LESS_PART (other, *this, nident); - LESS_PART (*this, other, refEffectiveLen ()); } else { // PD-1902, PD-2139, PD-2313, PD-2320 @@ -825,27 +818,45 @@ struct BlastAlignment : Alignment return false; if (! targetProt && ! other. matchesCds (*this)) return false; - if (isMutation ()) - { - if (! other. isMutation ()) - return true; - const Set mutationSymbols ( getMutationSymbols ()); - const Set otherMutationSymbols (other. getMutationSymbols ()); - if (mutationSymbols == otherMutationSymbols) - return targetProt; - if (mutationSymbols. containsAll (otherMutationSymbols)) - return true; - if (otherMutationSymbols. containsAll (mutationSymbols)) - return false; - } - else - { + } + if (isMutation ()) + { + if (! other. isMutation ()) + return true; + const Set mutationSymbols ( getMutationSymbols ()); + const Set otherMutationSymbols (other. getMutationSymbols ()); + if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) + return targetProt; + if (mutationSymbols. containsAll (otherMutationSymbols)) + return true; + if (otherMutationSymbols. containsAll (mutationSymbols)) + return false; + } + else + { + if (targetProt == other. targetProt) + { + // PD-807 + if ( ! (targetProt && famId == other. famId) // PD-2441 + //&& ! sameTarget (other) + && ! other. insideEq (*this) + && ! insideEq (other) + ) + return false; + //if (targetProt) + //{ LESS_PART (other, *this, isMutation ()); } + LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 + LESS_PART (other, *this, nident); + LESS_PART (*this, other, refEffectiveLen ()); + } + else + { LESS_PART (other, *this, refExactlyMatched ()); //LESS_PART (other, *this, allele ()); // PD-2352 LESS_PART (other, *this, alleleReported ()); LESS_PART (other, *this, targetProt); } - } + } return true; } public: diff --git a/amrfinder.cpp b/amrfinder.cpp index a6deaa3..e861ad6 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,8 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.10 08/20/2020 PD-3469 --force_update +* 3.8.9 08/13/2020 BLAST -show_gis parameter is removed, more mutations are reported for --mutation_all * 3.8.8 08/04/2020 bug in fasta_extract.cpp, more output in --nucleotide_output * 3.8.7 08/03/2020 PD-3504 --protein_output, --nucleotide_output options by fasta_extract.cpp * 3.8.6 07/29/2020 PD-3468 --name option @@ -186,6 +188,7 @@ struct ThisApplication : ShellApplication : ShellApplication (HELP, true, true, true) { addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 + addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469 addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA"); addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA"); addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); @@ -275,6 +278,7 @@ struct ThisApplication : ShellApplication const string dna = shellQuote (getArg ("nucleotide")); string db = getArg ("database"); const bool update = getFlag ("update"); + const bool force_update = getFlag ("force_update"); const string gff = shellQuote (getArg ("gff")); const bool pgap = getFlag ("pgap"); const double ident = arg2double ("ident_min"); @@ -314,6 +318,8 @@ struct ThisApplication : ShellApplication if (report_common && emptyArg (organism)) throw runtime_error ("--report_common requires --organism"); + if (report_common && ! add_plus) + throw runtime_error ("--report_common requires --plus"); // PD-3437 if (! emptyArg (mutation_all) && emptyArg (organism)) @@ -384,7 +390,8 @@ struct ThisApplication : ShellApplication if (! dbDir. items. empty () && dbDir. items. back () == "latest") { prog2dir ["amrfinder_update"] = execDir; - exec (fullProg ("amrfinder_update") + " -d " + shellQuote (dbDir. getParent ()) + ifS (quiet, " -q") + ifS (qc_on, " --debug") + " > " + logFName, logFName); + exec (fullProg ("amrfinder_update") + " -d " + shellQuote (dbDir. getParent ()) + ifS (force_update, " --force_update") + + ifS (quiet, " -q") + ifS (qc_on, " --debug") + " > " + logFName, logFName); } else { @@ -598,7 +605,7 @@ struct ThisApplication : ShellApplication // PD-2967 - const string blastp_par ("-show_gis -comp_based_stats 0 -evalue 1e-10 "); + const string blastp_par ("-comp_based_stats 0 -evalue 1e-10 "); // was: -culling_limit 20 // PD-2967 if (! emptyArg (prot)) { diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp index 332eb36..6b59f66 100644 --- a/amrfinder_update.cpp +++ b/amrfinder_update.cpp @@ -266,6 +266,7 @@ Requirements:\n\ { addKey ("database", "Directory for all versions of AMRFinder databases", "$BASE/data", 'd', "DATABASE_DIR"); // Symbolic link ?? + addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469 addFlag ("quiet", "Suppress messages to STDERR", 'q'); version = SVN_REV; @@ -281,8 +282,9 @@ Requirements:\n\ void shellBody () const final { - const string mainDirOrig = getArg ("database"); - const bool quiet = getFlag ("quiet"); + const string mainDirOrig = getArg ("database"); + const bool force_update = getFlag ("force_update"); + const bool quiet = getFlag ("quiet"); Stderr stderr (quiet); @@ -333,15 +335,33 @@ Requirements:\n\ if (! directoryExists (mainDirS)) exec ("mkdir -p " + shellQuote (mainDirS)); + const string versionFName ("version.txt"); + const string urlDir (URL + curMinor + "/" + latest_version + "/"); + const string latestDir (mainDirS + latest_version + "/"); if (directoryExists (latestDir)) + { + if (! force_update) + { + curl. download (urlDir + versionFName, tmp); + const StringVector version_old (latestDir + versionFName, (size_t) 100); + const StringVector version_new (tmp, (size_t) 100); + if ( ! version_old. empty () + && ! version_new. empty () + && version_old == version_new + ) + { + stderr << shellQuote (latestDir) << " contains the latest version: " << version_old. front () << '\n'; + stderr << "No update is done\n"; + return; + } + } stderr << shellQuote (latestDir) << " already exists, overwriting what was there\n"; + } else exec ("mkdir -p " + shellQuote (latestDir)); - stderr << "Downloading AMRFinder database version " << latest_version << " into " << shellQuote (latestDir) << "\n"; - const string urlDir (URL + curMinor + "/" + latest_version + "/"); fetchAMRFile (curl, urlDir, latestDir, "AMR.LIB"); fetchAMRFile (curl, urlDir, latestDir, "AMRProt"); fetchAMRFile (curl, urlDir, latestDir, "AMRProt-mutation.tab"); @@ -350,7 +370,7 @@ Requirements:\n\ fetchAMRFile (curl, urlDir, latestDir, "database_format_version.txt"); // PD-3051 fetchAMRFile (curl, urlDir, latestDir, "fam.tab"); fetchAMRFile (curl, urlDir, latestDir, "taxgroup.tab"); - fetchAMRFile (curl, urlDir, latestDir, "version.txt"); + fetchAMRFile (curl, urlDir, latestDir, versionFName); StringVector dnaPointMuts; { diff --git a/common.cpp b/common.cpp index 7c2aca7..a39b86b 100644 --- a/common.cpp +++ b/common.cpp @@ -49,19 +49,16 @@ -[[noreturn]] void errorThrow (const string &msg) -{ - throw std::logic_error (msg); -} - - - - namespace Common_sp { +[[noreturn]] void errorThrow (const string &msg) + { throw std::logic_error (msg); } + + + vector programArgs; string programName; @@ -1106,6 +1103,8 @@ void exec (const string &cmd, //Chronometer_OnePass cop (cmd); if (verbose ()) cout << cmd << endl; + if (logPtr) + *logPtr << cmd << endl; const int status = system (cmd. c_str ()); // pipefail's are not caught if (status) @@ -2901,11 +2900,13 @@ int Application::run (int argc, jRoot = nullptr; } + #if 0 if (! logFName. empty ()) { delete logPtr; logPtr = nullptr; } + #endif } catch (const std::exception &e) { diff --git a/common.hpp b/common.hpp index 39b44e5..d521f8d 100644 --- a/common.hpp +++ b/common.hpp @@ -3545,8 +3545,12 @@ struct Application : Singleton, Root void setPositional (List::iterator &posIt, const string &value); public: - virtual ~Application () - {} + ~Application () + { if (logPtr) + { delete logPtr; + logPtr = nullptr; + } + } protected: @@ -3592,6 +3596,8 @@ struct ShellApplication : Application // Environment const bool useTmp; string tmp; + // Temporary file prefix + // If log is used then tmp is printed in the log file and the temporary files are not deleted string execDir; // Ends with '/' // Physically real directory of the software diff --git a/common.inc b/common.inc index 3ab49b2..63175a1 100644 --- a/common.inc +++ b/common.inc @@ -80,14 +80,17 @@ #endif -[[noreturn]] void errorThrow (const std::string &msg); - -#define ERROR_MSG(msg) \ - { if (! std::uncaught_exception ()) \ - { std::ostringstream oss_; \ - oss_ << WHERE << ": " << (msg); \ - errorThrow (oss_. str ()); \ - } else {} \ +namespace Common_sp +{ + [[noreturn]] void errorThrow (const std::string &msg); +} + +#define ERROR_MSG(msg) \ + { if (! std::uncaught_exception ()) \ + { std::ostringstream oss_; \ + oss_ << WHERE << ": " << (msg); \ + Common_sp::errorThrow (oss_. str ()); \ + } else {} \ } #define ERROR ERROR_MSG ("") #define NOT_IMPLEMENTED ERROR_MSG ("Not implemented: ") diff --git a/dna_mutation.cpp b/dna_mutation.cpp index 77e197f..5889d2c 100644 --- a/dna_mutation.cpp +++ b/dna_mutation.cpp @@ -166,6 +166,7 @@ struct BlastnAlignment : Alignment bool good () const { return targetSeq. size () >= min (refLen, 2 * flankingLen + 1); } +#if 0 bool operator< (const BlastnAlignment &other) const { LESS_PART (*this, other, targetName); LESS_PART (other, *this, pIdentity ()); @@ -173,6 +174,7 @@ struct BlastnAlignment : Alignment LESS_PART (*this, other, refName); return false; } +#endif }; @@ -347,7 +349,7 @@ struct ThisApplication : Application SeqChange& seqChange2 = *iter; ASSERT (seqChange2. al == blastAl2); //ASSERT (seqChange2. mutation); - if ( seqChange1. start_target == seqChange2. start_target + if ( seqChange1. start_target == seqChange2. start_target && seqChange1. better (seqChange2) ) iter. erase (); diff --git a/version.txt b/version.txt index d7d8e42..d20cc2b 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.8 +3.8.10 From 953aa89a0683f6d016f0babb47bcb68a49dea52c Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Fri, 21 Aug 2020 18:44:45 -0400 Subject: [PATCH 11/36] PD-2407 --type parameter --- amrfinder.cpp | 56 ++++++++++++++++++++++++++++++++++++++++---- amrfinder_update.cpp | 2 +- common.inc | 3 ++- version.txt | 2 +- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index e861ad6..1086c78 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.11 08/21/2020 PD-2407 --type * 3.8.10 08/20/2020 PD-3469 --force_update * 3.8.9 08/13/2020 BLAST -show_gis parameter is removed, more mutations are reported for --mutation_all * 3.8.8 08/04/2020 bug in fasta_extract.cpp, more output in --nucleotide_output @@ -178,6 +179,13 @@ struct Warning : Singleton { stderr << Color::code () << "\n"; } }; + + +const StringVector all_types {"AMR", "STRESS", "VIRULENCE"}; + // select id from FAM where [type] = 1 + + + // ThisApplication @@ -203,6 +211,8 @@ struct ThisApplication : ShellApplication addFlag ("plus", "Add the plus genes to the report"); // PD-2789 addFlag ("report_common", "Report proteins common to a taxonomy group"); // PD-2756 addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); + addKey ("type", "Limit search to specific element types: " + all_types. toString (",") + ". A comma delimited list, case-insensitive", "", '\0', "TYPE"); + // "Element type" is a column name in the report addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); //addKey ("hmmer_bin" ?? addKey ("name", "Text to be added as the first column \"name\" to all rows of the report, for example it can be an assembly name", "", '\0', "NAME"); @@ -289,6 +299,7 @@ struct ThisApplication : ShellApplication const bool add_plus = getFlag ("plus"); const bool report_common = getFlag ("report_common"); const string mutation_all = shellQuote (getArg ("mutation_all")); + const string type = getArg ("type"); string blast_bin = getArg ("blast_bin"); const string input_name = shellQuote (getArg ("name")); const string parm = getArg ("parm"); @@ -321,12 +332,31 @@ struct ThisApplication : ShellApplication if (report_common && ! add_plus) throw runtime_error ("--report_common requires --plus"); + if (force_update && ! update) + throw runtime_error ("--force_update requires --update"); + // PD-3437 if (! emptyArg (mutation_all) && emptyArg (organism)) { Warning warning (stderr); stderr << "--mutation_all option used without -O/--organism option. No point mutations will be screened"; } + + StringVector typeVec; + if (! type. empty ()) + { + const List typeList (str2list (type, ',')); + for (string s: typeList) + { + trim (s); + if (s. empty ()) + continue; + strUpper (s); + if (! all_types. contains (s)) + throw runtime_error ("Unknown element type " + strQuote (s)); + typeVec << s; + } + } if (! emptyArg (output)) try { OFStream f (unQuote (output)); } @@ -780,18 +810,35 @@ struct ThisApplication : ShellApplication exec ("tail -n +2 " + tmp + ".mutation_all.dna >> " + tmp + ".mutation_all"); } + // Column names are from amr_report.cpp + string typeFilter; + if (! typeVec. empty ()) + { + const string typeCol (col2num ("Element type")); + for (const string& t : typeVec) + typeFilter += " || $" + typeCol + " == \"" + t + "\""; + } + // PD-2244, PD-3230 const string sortS (emptyArg (dna) && emptyArg (gff) ? "-k1,1 -k2,2" : "-k2,2 -k3,3n -k4,4n -k5,5 -k1,1 -k6,6"); // Sorting AMR report exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " | uniq >> " + tmp + ".amr-out"); - exec ("mv " + tmp + ".amr-out " + tmp + ".amr"); + + if (! typeFilter. empty ()) + exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".amr-out > " + tmp + ".amr"); + else + exec ("mv " + tmp + ".amr-out " + tmp + ".amr"); + // Sorting mutation_all if (! emptyArg (mutation_all)) { exec ("head -1 " + tmp + ".mutation_all > " + tmp + ".mutation_all-out"); exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort " + sortS + " | uniq >> " + tmp + ".mutation_all-out"); - exec ("mv " + tmp + ".mutation_all-out " + mutation_all); + if (! typeFilter. empty ()) + exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".mutation_all-out > " + mutation_all); + else + exec ("mv " + tmp + ".mutation_all-out " + mutation_all); } @@ -799,9 +846,8 @@ struct ThisApplication : ShellApplication exec ("cat " + tmp + ".amr"); else exec ("cp " + tmp + ".amr " + output); - - - // Column names are from amr_report.cpp + + if (! emptyArg (prot_out)) { const string protCol (col2num ("Protein identifier")); diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp index 6b59f66..2350dda 100644 --- a/amrfinder_update.cpp +++ b/amrfinder_update.cpp @@ -348,7 +348,7 @@ Requirements:\n\ const StringVector version_new (tmp, (size_t) 100); if ( ! version_old. empty () && ! version_new. empty () - && version_old == version_new + && version_old. front () == version_new. front () ) { stderr << shellQuote (latestDir) << " contains the latest version: " << version_old. front () << '\n'; diff --git a/common.inc b/common.inc index 63175a1..aea3900 100644 --- a/common.inc +++ b/common.inc @@ -91,13 +91,14 @@ namespace Common_sp oss_ << WHERE << ": " << (msg); \ Common_sp::errorThrow (oss_. str ()); \ } else {} \ + exit (1); \ } #define ERROR ERROR_MSG ("") #define NOT_IMPLEMENTED ERROR_MSG ("Not implemented: ") #define NEVER_CALL ERROR_MSG ("Never call: ") -#define QC_ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} +#define QC_ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} // Must be the last statement in a function body // Logic errors diff --git a/version.txt b/version.txt index d20cc2b..73bb444 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.10 +3.8.11 From 04aa90b3ed580ef277e6fbe97c3c09433398464e Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 24 Aug 2020 15:03:57 -0400 Subject: [PATCH 12/36] PD-2394 fusion genes are reported to include both gene symbols on each line --- amr_report.cpp | 95 +++++++++++++++++++++++++++++++++++++++++--------- amrfinder.cpp | 1 + common.cpp | 1 - common.hpp | 10 +++--- version.txt | 2 +- 5 files changed, 87 insertions(+), 22 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index c5eedaa..dd158fc 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -298,6 +298,8 @@ unique_ptr mutation_all; string input_name; +const string na ("NA"); + struct BlastAlignment : Alignment @@ -320,6 +322,7 @@ struct BlastAlignment : Alignment // <= parts size_t parts {1}; // >= 1 + const BlastAlignment* fusion {nullptr}; // Table FAM string famId; string gene; @@ -501,7 +504,6 @@ struct BlastAlignment : Alignment } void saveText (ostream& os) const override { // PD-736, PD-774, PD-780, PD-799 - const string na ("NA"); const string proteinName (isMutation () ? product /*string ()*/ : refExactlyMatched () || parts >= 2 || refAccession. empty () // PD-3187, PD-3192 @@ -538,7 +540,7 @@ struct BlastAlignment : Alignment : gene + "_" + seqChange. getMutationStr () : print_fam ? famId - : (isLeft (method, "ALLELE") ? famId : nvl (getFam () -> genesymbol, na)) + : getGeneSymbols () ) << (isMutation () ? mut @@ -675,6 +677,22 @@ struct BlastAlignment : Alignment } bool alleleReported () const { return refExactlyMatched () && allele () && (! targetProt || refLen == targetLen); } + string getGeneSymbol () const + { return alleleReported () /*isLeft (method, "ALLELE")*/ + ? famId + : nvl (getFam () -> genesymbol, na); + } + string getGeneSymbols () const + { if (fusion) + { + string s1 ( getGeneSymbol ()); + string s2 (fusion->getGeneSymbol ()); + if (part == 2) + swap (s1, s2); + return s1 + "/" + s2; + } + return getGeneSymbol (); + } string getMethod (const Locus &cds) const { //IMPLY (refExactlyMatched () && ! mutation_all. get (), ! isMutation ()) string method (refExactlyMatched () @@ -893,9 +911,22 @@ struct BlastAlignment : Alignment ; } size_t getCdsStart () const - { return cdss. empty () - ? 0 - : cdss. front (). start; + { if (cdss. empty ()) + return 0; + size_t start = numeric_limits::max (); + for (const Locus& cds : cdss) + minimize (start, cds. start); + ASSERT (start != numeric_limits::max ()); + return start; + } + size_t getCdsStop () const + { if (cdss. empty ()) + return 0; + size_t stop = 0; + for (const Locus& cds : cdss) + maximize (stop, cds. stop); + ASSERT (stop); + return stop; } void setTargetAlign () { targetAlign = targetEnd - targetStart; @@ -940,16 +971,29 @@ struct BlastAlignment : Alignment } static bool less (const BlastAlignment* a, const BlastAlignment* b) - { ASSERT (a); - ASSERT (b); - LESS_PART (*a, *b, targetName); - LESS_PART (*a, *b, targetStart); - LESS_PART (*a, *b, getCdsStart ()); - LESS_PART (*a, *b, famId); - LESS_PART (*a, *b, refAccession); - LESS_PART (*b, *a, pIdentity ()); - return false; - } + { ASSERT (a); + ASSERT (b); + LESS_PART (*a, *b, targetName); + LESS_PART (*a, *b, targetStart); + LESS_PART (*a, *b, targetEnd); + LESS_PART (*a, *b, getCdsStart ()); + LESS_PART (*a, *b, getCdsStop ()); + LESS_PART (*a, *b, refAccession); + LESS_PART (*a, *b, part); + //LESS_PART (*a, *b, famId); + //LESS_PART (*b, *a, pIdentity ()); + return false; + } + bool sameMatch (const BlastAlignment* other) const + // Cf. less() + { ASSERT (other); + return targetName == other->targetName + && targetStart == other->targetStart + && targetEnd == other->targetEnd + && getCdsStart () == other->getCdsStart () + && getCdsStop () == other->getCdsStop () + && refAccession == other->refAccession; + } }; @@ -1351,7 +1395,26 @@ struct Batch } #endif - goodBlastAls. sort (BlastAlignment::less); + // PD-2394 + // BlastAlignment::fusion + goodBlastAls. sort (BlastAlignment::less); + { + const BlastAlignment* prev = nullptr; + for (const BlastAlignment* blastAl : goodBlastAls) + { + if ( prev + && prev->sameMatch (blastAl) + && prev ->part == 1 + && blastAl->part == 2 + ) + { + QC_ASSERT (blastAl->parts == 2); + var_cast (prev) -> fusion = blastAl; + var_cast (blastAl) -> fusion = prev; + } + prev = blastAl; + } + } if (verbose ()) { diff --git a/amrfinder.cpp b/amrfinder.cpp index 1086c78..214702e 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.12 08/24/2020 PD-2394 fusion genes are reported to include both gene symbols on each line * 3.8.11 08/21/2020 PD-2407 --type * 3.8.10 08/20/2020 PD-3469 --force_update * 3.8.9 08/13/2020 BLAST -show_gis parameter is removed, more mutations are reported for --mutation_all diff --git a/common.cpp b/common.cpp index a39b86b..3726bc7 100644 --- a/common.cpp +++ b/common.cpp @@ -1474,7 +1474,6 @@ string CharInput::getLine () - // Token void Token::readInput (CharInput &in) diff --git a/common.hpp b/common.hpp index d521f8d..99598e7 100644 --- a/common.hpp +++ b/common.hpp @@ -2699,12 +2699,14 @@ struct CharInput : Input : runtime_error (what_arg) {} }; + string errorText (const string &what, + bool expected = true) const + { return "Error at line " + to_string (lineNum + 1) + ", pos. " + to_string (charNum + 1) + + (what. empty () ? string () : (": " + what + ifS (expected, " is expected"))); + } [[noreturn]] void error (const string &what, bool expected = true) const - { throw Error ("Error at line " + to_string (lineNum + 1) + ", pos. " + to_string (charNum + 1) - + (what. empty () ? string () : (": " + what + ifS (expected, " is expected"))) - ); - } + { throw Error (errorText (what, expected)); } }; diff --git a/version.txt b/version.txt index 73bb444..89a1ad7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.11 +3.8.12 From 2a090499e102b377c17aa44ba43ba7e9cadd7b14 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Tue, 25 Aug 2020 17:28:56 -0400 Subject: [PATCH 13/36] PD-2322 a complete nucleotide hit is preferred to a partial protein hit --- amr_report.cpp | 4 +++- amrfinder.cpp | 1 + version.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index dd158fc..507daf8 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -718,7 +718,8 @@ struct BlastAlignment : Alignment && ! targetProt ) method = "INTERNAL_STOP"; - else if (method != "HMM") + else + if (method != "HMM") method += (targetProt ? "P" : "X"); return method; } @@ -872,6 +873,7 @@ struct BlastAlignment : Alignment LESS_PART (other, *this, refExactlyMatched ()); //LESS_PART (other, *this, allele ()); // PD-2352 LESS_PART (other, *this, alleleReported ()); + LESS_PART (*this, other, partial ()); // PD-2322 LESS_PART (other, *this, targetProt); } } diff --git a/amrfinder.cpp b/amrfinder.cpp index 214702e..d490482 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit should be preferred to a partial protein hit * 3.8.12 08/24/2020 PD-2394 fusion genes are reported to include both gene symbols on each line * 3.8.11 08/21/2020 PD-2407 --type * 3.8.10 08/20/2020 PD-3469 --force_update diff --git a/version.txt b/version.txt index 89a1ad7..6a801ce 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.12 +3.8.13 From 82cf9ae3e2c088b78f2ffb8b637189e49fb19b09 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Thu, 27 Aug 2020 15:45:21 -0400 Subject: [PATCH 14/36] PD-3470 method FRAME_SHIFT, amr_report is faster --- alignment.cpp | 49 ++++++ alignment.hpp | 13 ++ amr_report.cpp | 402 +++++++++++++++++++++++++++++-------------------- amrfinder.cpp | 3 +- version.txt | 2 +- 5 files changed, 302 insertions(+), 167 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 700c3ab..6cb6c53 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -829,6 +829,55 @@ void Alignment::qc () const +bool Alignment::getFrameShift_right (const Alignment &rightPart, + size_t diff_max) const +{ + ASSERT (! targetProt); + ASSERT (refProt); + ASSERT (rightPart. refProt); + + if (this == & rightPart) + return false; + + if (rightPart. targetProt) + return false; + + if ( targetName != rightPart. targetName + || refName != rightPart. refName + || targetStrand != rightPart. targetStrand + ) + return false; + + if ( refStart >= rightPart. refStart + || refEnd >= rightPart. refEnd + || difference (rightPart. refStart, refEnd) > diff_max / 3 + ) + return false; + + if (targetStrand) + { + if ( targetStart >= rightPart. targetStart + || targetEnd >= rightPart. targetEnd + || difference (rightPart. targetStart, targetEnd) > diff_max + ) + return false; + } + else + { + if ( targetStart <= rightPart. targetStart + || targetEnd <= rightPart. targetEnd + || difference (rightPart. targetEnd, targetStart) > diff_max + ) + return false; + } + + if (targetStart % 3 == rightPart. targetStart % 3) + return false; + + return true; +} + + } // namespace diff --git a/alignment.hpp b/alignment.hpp index f9a0845..2ecb2fa 100644 --- a/alignment.hpp +++ b/alignment.hpp @@ -290,6 +290,19 @@ struct Alignment : Root && nident == refLen && nident == targetSeq. size (); } + bool getFrameShift (const Alignment &other, + size_t diff_max) const + // Return: success + // Input: diff_max: in bp + // Requires: !targetProt, refProt, rightPart.refProt + { return nident >= other. nident + && ( getFrameShift_right (other, diff_max) + || other. getFrameShift_right (*this, diff_max) + ); + } +private: + bool getFrameShift_right (const Alignment &rightPart, + size_t diff_max) const; }; diff --git a/amr_report.cpp b/amr_report.cpp index 507daf8..0bd1aa4 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -291,8 +291,8 @@ bool cdsExist = false; bool print_fam = false; bool reportPseudo = false; -const string stopCodonS ("[stop]"); -const string frameShiftS ("[frameshift]"); +//const string stopCodonS ("[stop]"); +//const string frameShiftS ("[frameshift]"); unique_ptr mutation_all; @@ -410,8 +410,7 @@ struct BlastAlignment : Alignment if (contains (targetSeq, "*")) stopCodon = true; //frameShift = contains (targetSeq, "/"); // Needs "blastall -p blastx ... " - if (! targetProt && (targetEnd - targetStart) % 3 != 0) - frameShift = true; + IMPLY (! targetProt, (targetEnd - targetStart) % 3 == 0); // redundant ?? // For BLASTX // PD-1280 @@ -550,7 +549,7 @@ struct BlastAlignment : Alignment : proteinName + " [UNKNOWN]" : proteinName ) - + ifS (reportPseudo, ifS (frameShift, " " + frameShiftS)) + //+ ifS (reportPseudo, ifS (frameShift, " " + frameShiftS)) << (isMutation () || getFam () -> reportable >= 2 ? "core" : "plus"); // PD-2825 // PD-1856 if (isMutation ()) @@ -710,16 +709,26 @@ struct BlastAlignment : Alignment : "BLAST" ); // PD-2088, PD-2320 + bool suffix = true; if ( ( method == "BLAST" || method == "PARTIAL" || method == "PARTIAL_CONTIG_END" ) - && stopCodon && ! targetProt ) - method = "INTERNAL_STOP"; - else - if (method != "HMM") + { + if (stopCodon) + { + method = "INTERNAL_STOP"; + suffix = false; + } + else if (frameShift) + { + method = "FRAME_SHIFT"; + suffix = false; + } + } + if (suffix && method != "HMM") method += (targetProt ? "P" : "X"); return method; } @@ -744,6 +753,8 @@ struct BlastAlignment : Alignment ) return false; } + if (frameShift) + return true; // PD-1032 if (partial ()) if ( parts > 1 @@ -989,8 +1000,9 @@ struct BlastAlignment : Alignment bool sameMatch (const BlastAlignment* other) const // Cf. less() { ASSERT (other); - return targetName == other->targetName - && targetStart == other->targetStart + ASSERT (targetName == other->targetName); + return /*targetName == other->targetName + &&*/ targetStart == other->targetStart && targetEnd == other->targetEnd && getCdsStart () == other->getCdsStart () && getCdsStop () == other->getCdsStop () @@ -1018,6 +1030,29 @@ bool HmmAlignment::better (const BlastAlignment& other) const +typedef map> BlastAlignmentsOwn; + // first = second::targetName +size_t getSize (const BlastAlignmentsOwn &ba) +{ + size_t n = 0; + for (const auto& it : ba) + n += it. second. size (); + return n; +} + + +typedef map> BlastAlignments; + // first = second::targetName +size_t getSize (const BlastAlignments &ba) +{ + size_t n = 0; + for (const auto& it : ba) + n += it. second. size (); + return n; +} + + + // Batch struct Batch @@ -1028,13 +1063,13 @@ struct Batch StringVector suppress_prots; // Target input - VectorOwn blastAls; + BlastAlignmentsOwn blastAls; VectorOwn hmmAls; bool hmmExist {false}; map domains; // Best domain // Output - VectorPtr goodBlastAls; + BlastAlignments goodBlastAls; Batch (const string &famFName, @@ -1211,26 +1246,34 @@ struct Batch // Output: goodBlastAls { ASSERT (goodBlastAls. empty ()); - for (const BlastAlignment* blastAl : blastAls) - { - ASSERT (blastAl); - ASSERT (blastAl->good ()); - bool found = false; - for (const BlastAlignment* goodBlastAl : goodBlastAls) - if (goodBlastAl->better (*blastAl)) - { - found = true; - break; - } - if (found) - continue; - for (Iter> goodIter (goodBlastAls); goodIter. next ();) - if (blastAl->better (**goodIter)) - goodIter. erase (); - goodBlastAls << blastAl; - } + for (const auto& it1 : blastAls) + for (const BlastAlignment* blastAl : it1. second) + { + ASSERT (blastAl); + ASSERT (blastAl->good ()); + bool found = false; + for (const auto& it2 : goodBlastAls) // efficiency ?? + { + for (const BlastAlignment* goodBlastAl : it2. second) + if (goodBlastAl->better (*blastAl)) + { + found = true; + break; + } + if (found) + break; + } + if (found) + continue; + for (auto& it2 : goodBlastAls) // efficiency ?? + for (Iter> goodIter (it2. second); goodIter. next ();) + if (blastAl->better (**goodIter)) + goodIter. erase (); + ASSERT (! blastAl->targetName. empty ()); + goodBlastAls [blastAl->targetName] << blastAl; + } if (verbose ()) - cout << "# Best Blasts: " << goodBlastAls. size () << endl; + cout << "# Best Blasts: " << getSize (goodBlastAls) << endl; } public: @@ -1240,40 +1283,60 @@ struct Batch // Input: blastAls, domains, hmmAls // Output: goodBlastAls { - // Use BlastAlignment.cdss - for (Iter> iter (blastAls); iter. next ();) - if (! (*iter)->good ()) - delete iter. erase (); - - // Group by targetName and process each targetName separately for speed ?? + // BlastAlign::frameShift + for (auto& it : blastAls) + for (Iter> iter (it. second); iter. next ();) + if (! (*iter)->targetProt) + for (const BlastAlignment* blastAl : it. second) + if (! blastAl->targetProt && blastAl->getFrameShift (**iter, 60)) // PAR + { + iter. erase (); + var_cast (blastAl) -> frameShift = true; + //blastAl->saveText (cerr); + break; + } + for (auto& it : blastAls) + for (Iter> iter (it. second); iter. next ();) + if (! (*iter)->good ()) + delete iter. erase (); + if (verbose ()) + cout << "# Good Blasts: " << getSize (blastAls) << endl; + if (retainBlasts) - goodBlastAls = blastAls; + { + ASSERT (goodBlastAls. empty ()); + for (const auto& it : blastAls) + goodBlastAls [it. first] = it. second; + } else blastParetoBetter (); // Cf. dna_mutation.cpp - for (const BlastAlignment* blastAl1 : goodBlastAls) - for (const SeqChange& seqChange1 : blastAl1->seqChanges) - { - ASSERT (seqChange1. al == blastAl1); - //ASSERT (seqChange1. mutation); - for (const BlastAlignment* blastAl2 : goodBlastAls) - if ( blastAl2->targetName == blastAl1->targetName - && blastAl2->targetStrand == blastAl1->targetStrand - && blastAl2 != blastAl1 - ) - for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) - { - SeqChange& seqChange2 = *iter; - //ASSERT (seqChange2. mutation); - ASSERT (seqChange2. al == blastAl2); - if ( seqChange1. start_target == seqChange2. start_target - && seqChange1. better (seqChange2) - ) - iter. erase (); - } - } + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl1 : it. second) + for (const SeqChange& seqChange1 : blastAl1->seqChanges) + { + ASSERT (seqChange1. al == blastAl1); + //ASSERT (seqChange1. mutation); + for (const BlastAlignment* blastAl2 : it. second) + { + ASSERT (blastAl2->targetName == blastAl1->targetName); + if ( blastAl2->targetStrand == blastAl1->targetStrand + && blastAl2 != blastAl1 + ) + for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + { + SeqChange& seqChange2 = *iter; + //ASSERT (seqChange2. mutation); + ASSERT (seqChange2. al == blastAl2); + if ( seqChange1. start_target == seqChange2. start_target + && seqChange1. better (seqChange2) + ) + iter. erase (); + } + } + } // HMM: Pareto-better() VectorPtr goodHmmAls; goodHmmAls. reserve (hmmAls. size ()); @@ -1318,63 +1381,74 @@ struct Batch // PD-741 if (hmmExist && ! skip_hmm_check) - for (Iter> iter (goodBlastAls); iter. next ();) - if ( /*! iter->refExactlyMatched () */ - ! (*iter) -> isMutation () - && (*iter) -> targetProt - && (*iter) -> pIdentity () < 0.98 - frac_delta // PAR // PD-1673 - && ! (*iter) -> partial () - ) - if (const Fam* fam = (*iter) -> getFam () -> getHmmFam ()) - { - bool found = false; - for (const HmmAlignment* hmmAl : goodHmmAls) - if ( (*iter) -> targetName == hmmAl->sseqid - && fam == hmmAl->fam - ) - { - found = true; - break; - } - if (! found) // BLAST is wrong - { - if (verbose ()) - { - cout << "HMM-wrong: "; - (*iter) -> saveText (cout); - } - iter. erase (); - } - } + for (auto& it : goodBlastAls) + for (Iter> iter (it. second); iter. next ();) + if ( /*! iter->refExactlyMatched () */ + ! (*iter) -> isMutation () + && (*iter) -> targetProt + && (*iter) -> pIdentity () < 0.98 - frac_delta // PAR // PD-1673 + && ! (*iter) -> partial () + ) + if (const Fam* fam = (*iter) -> getFam () -> getHmmFam ()) + { + bool found = false; + for (const HmmAlignment* hmmAl : goodHmmAls) + if ( (*iter) -> targetName == hmmAl->sseqid + && fam == hmmAl->fam + ) + { + found = true; + break; + } + if (! found) // BLAST is wrong + { + if (verbose ()) + { + cout << "HMM-wrong: "; + (*iter) -> saveText (cout); + } + iter. erase (); + } + } if (verbose ()) - cout << "# Best Blasts left: " << goodBlastAls. size () << endl; + cout << "# Best Blasts left: " << getSize (goodBlastAls) << endl; for (Iter> hmmIt (goodHmmAls); hmmIt. next ();) - for (const BlastAlignment* blastAl : goodBlastAls) - if (blastAl->better (**hmmIt)) - { - ASSERT (! blastAl->hmmAl); - var_cast (blastAl) -> hmmAl = *hmmIt; - hmmIt. erase (); - break; - } + for (const auto& it : goodBlastAls) + { + bool found = false; + for (const BlastAlignment* blastAl : it. second) + if (blastAl->better (**hmmIt)) + { + ASSERT (! blastAl->hmmAl); + var_cast (blastAl) -> hmmAl = *hmmIt; + hmmIt. erase (); + found = true; + break; + } + if (found) + break; + } // PD-2783 - for (Iter> blastIt (goodBlastAls); blastIt. next ();) - for (const HmmAlignment* hmmAl : goodHmmAls) - if (hmmAl->better (**blastIt)) - { - blastIt. erase (); - break; - } + for (auto& it : goodBlastAls) + for (Iter> blastIt (it. second); blastIt. next ();) + for (const HmmAlignment* hmmAl : goodHmmAls) + if (hmmAl->better (**blastIt)) + { + blastIt. erase (); + break; + } // Output // goodHmmAls --> goodBlastAls for (const HmmAlignment* hmmAl : goodHmmAls) { - ASSERT (hmmAl->blastAl. get ()); - goodBlastAls << hmmAl->blastAl. get (); + const BlastAlignment* blastAl = hmmAl->blastAl. get (); + ASSERT (blastAl); + ASSERT (! blastAl->targetName. empty ()); + goodBlastAls [blastAl->targetName] << blastAl; } #if 0 @@ -1399,33 +1473,38 @@ struct Batch // PD-2394 // BlastAlignment::fusion - goodBlastAls. sort (BlastAlignment::less); - { - const BlastAlignment* prev = nullptr; - for (const BlastAlignment* blastAl : goodBlastAls) + for (auto& it : goodBlastAls) + { + auto& goodBlastAls_ = it. second; + goodBlastAls_. sort (BlastAlignment::less); { - if ( prev - && prev->sameMatch (blastAl) - && prev ->part == 1 - && blastAl->part == 2 - ) + const BlastAlignment* prev = nullptr; + for (const BlastAlignment* blastAl : goodBlastAls_) { - QC_ASSERT (blastAl->parts == 2); - var_cast (prev) -> fusion = blastAl; - var_cast (blastAl) -> fusion = prev; + if ( prev + && prev->sameMatch (blastAl) + && prev ->part == 1 + && blastAl->part == 2 + ) + { + QC_ASSERT (blastAl->parts == 2); + var_cast (prev) -> fusion = blastAl; + var_cast (blastAl) -> fusion = prev; + } + prev = blastAl; } - prev = blastAl; } } if (verbose ()) { cout << endl << "After process():" << endl; - for (const BlastAlignment* blastAl : goodBlastAls) - { - blastAl->saveText (cout); - cout << "# Mutations: " << blastAl->seqChanges. size () << endl; - } + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + { + blastAl->saveText (cout); + cout << "# Mutations: " << blastAl->seqChanges. size () << endl; + } } } @@ -1476,25 +1555,26 @@ struct Batch *mutation_all << td. str () << endl; } - for (const BlastAlignment* blastAl : goodBlastAls) - { - ASSERT (blastAl); - if (blastAl->isMutation ()) - if (blastAl->seqChanges. empty ()) - ; - else - { + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + { + ASSERT (blastAl); + if (blastAl->isMutation ()) + if (blastAl->seqChanges. empty ()) + ; + else + { + blastAl->saveText (os); + blastAl->qc (); + } + else if ( blastAl->getFam () -> reportable >= reportable_min + && ! suppress_prots. containsFast (blastAl->refAccession) + ) + { blastAl->saveText (os); blastAl->qc (); } - else if ( blastAl->getFam () -> reportable >= reportable_min - && ! suppress_prots. containsFast (blastAl->refAccession) - ) - { - blastAl->saveText (os); - blastAl->qc (); - } - } + } } @@ -1502,12 +1582,13 @@ struct Batch void printTargetIds (ostream &os) const { ASSERT (os. good ()); - for (const BlastAlignment* blastAl : goodBlastAls) - if ( blastAl->targetProt - && ! blastAl->isMutation () - && blastAl->getFam () -> reportable >= reportable_min - ) - os << blastAl->targetName << endl; + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + if ( blastAl->targetProt + && ! blastAl->isMutation () + && blastAl->getFam () -> reportable >= reportable_min + ) + os << blastAl->targetName << endl; } }; @@ -1612,7 +1693,7 @@ struct ThisApplication : Application // Output addKey ("out", "Identifiers of the reported input proteins"); addFlag ("print_fam", "Print the FAM.id instead of gene symbol"); - addFlag ("pseudo", "Indicate pseudo-genes in the protein name as " + strQuote (stopCodonS) + " or " + strQuote (frameShiftS)); + addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP or FRAME_SHIFT"); addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist"); addFlag ("non_reportable", "Report non-reportable families"); addFlag ("core", "Report only core reportale families"); @@ -1720,13 +1801,8 @@ struct ThisApplication : Application al->qc (); if (nosame && al->refAccession == al->targetName) continue; - if (al->good ()) - batch. blastAls << al; - else - { - ASSERT (! al->refExactlyMatched ()); - delete al; - } + ASSERT (! al->targetName. empty ()); + batch. blastAls [al->targetName] << al; } } @@ -1744,18 +1820,13 @@ struct ThisApplication : Application al->qc (); if (nosame && al->refAccession == al->targetName) continue; - if (al->good ()) - batch. blastAls << al; - else - { - ASSERT (! al->refExactlyMatched ()); - delete al; - } + ASSERT (! al->targetName. empty ()); + batch. blastAls [al->targetName] << al; } } } if (verbose ()) - cout << "# Good Blasts: " << batch. blastAls. size () << endl; + cout << "# Blasts: " << getSize (batch. blastAls) << endl; if (! nohmm) @@ -1877,9 +1948,10 @@ struct ThisApplication : Application var_cast (locus). contigLen = contig2len [locus.contig]; } } - for (const BlastAlignment* al : batch. blastAls) - if (al->targetProt) - var_cast (al) -> setCdss (seqId2locusTag, * annot. get ()); + for (const auto& it : batch. blastAls) + for (const BlastAlignment* al : it. second) + if (al->targetProt) + var_cast (al) -> setCdss (seqId2locusTag, * annot. get ()); for (const HmmAlignment* hmmAl : batch. hmmAls) var_cast (hmmAl->blastAl. get ()) -> setCdss (seqId2locusTag, * annot. get ());; } diff --git a/amrfinder.cpp b/amrfinder.cpp index d490482..e2a6d44 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: -* 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit should be preferred to a partial protein hit +* 3.8.14 08/26/2020 PD-3470 method FRAME_SHIFT, amr_report is faster +* 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit is preferred to a partial protein hit * 3.8.12 08/24/2020 PD-2394 fusion genes are reported to include both gene symbols on each line * 3.8.11 08/21/2020 PD-2407 --type * 3.8.10 08/20/2020 PD-3469 --force_update diff --git a/version.txt b/version.txt index 6a801ce..c7413b8 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.13 +3.8.14 From 4ea0d7c142a3e0d7865d6d4e1a5425be17ebb252 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Fri, 28 Aug 2020 13:31:24 -0400 Subject: [PATCH 15/36] PD-3475 Return BLAST alignment parameters for HMM-only hits where available --- amr_report.cpp | 110 +++++++++++++++++++++++++++++-------------------- amrfinder.cpp | 3 +- version.txt | 2 +- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 0bd1aa4..5d6d8ad 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -313,8 +313,7 @@ struct BlastAlignment : Alignment bool frameShift {false}; // Reference protein -//long gi {0}; - // 0 <=> HMM method + bool fromHmm {false}; string refAccession; // empty() <=> HMM method size_t part {1}; @@ -445,28 +444,39 @@ struct BlastAlignment : Alignment throw; } } - explicit BlastAlignment (const HmmAlignment& other) - : famId (other. fam->id) - , gene (other. fam->id) - , product (other. fam->familyName) - , hmmAl (& other) - { targetName = other. sseqid; + BlastAlignment (const HmmAlignment& hmmAl_arg, + const BlastAlignment* best) + : fromHmm (true) + , famId (hmmAl_arg. fam->id) + , gene (hmmAl_arg. fam->id) + , product (hmmAl_arg. fam->familyName) + , hmmAl (& hmmAl_arg) + { ASSERT (hmmAl_arg. good ()); + targetName = hmmAl_arg. sseqid; targetProt = true; alProt = true; if (allele ()) ERROR_MSG (famId + " " + gene); - ASSERT (other. good ()); - } -#if 0 - explicit BlastAlignment (const Mutation& mut) - : famId (mut. gene) - , gene (mut. gene) - , resistance ("mutation") - { targetProt = true; - alProt = true; - seqChanges << SeqChange (this, & mut); + if (best) + { + targetSeq = best->targetSeq; + targetStart = best->targetStart; + targetEnd = best->targetEnd; + targetLen = best->targetLen; + targetStrand = best->targetStrand; + refProt = true; + refName = best->refName; + refSeq = best->refSeq; + refStart = best->refStart; + refEnd = best->refEnd; + refLen = best->refLen; + alProt = true; + nident = best->nident; + targetAlign = best->targetAlign; + targetAlign_aa = best->targetAlign_aa; + refAccession = best->refAccession; + } } -#endif void qc () const override { if (! qc_on) @@ -479,6 +489,7 @@ struct BlastAlignment : Alignment QC_ASSERT (part >= 1); QC_ASSERT (part <= parts); QC_ASSERT (! product. empty ()); + QC_IMPLY (! fromHmm, ! refAccession. empty ()); QC_ASSERT (refAccession. empty () == targetSeq. empty ()); QC_ASSERT (! refAccession. empty () == (bool) refLen); QC_ASSERT (! refAccession. empty () == (bool) nident); @@ -694,19 +705,19 @@ struct BlastAlignment : Alignment } string getMethod (const Locus &cds) const { //IMPLY (refExactlyMatched () && ! mutation_all. get (), ! isMutation ()) - string method (refExactlyMatched () - ? alleleReported () - ? "ALLELE" - : "EXACT" // PD-776 - : refAccession. empty () - ? "HMM" - : partial () - ? truncated (cds) - ? "PARTIAL_CONTIG_END" // PD-2267 - : "PARTIAL" - : isMutation () - ? "POINT" - : "BLAST" + string method (fromHmm + ? "HMM" + : refExactlyMatched () + ? alleleReported () + ? "ALLELE" + : "EXACT" // PD-776 + : partial () + ? truncated (cds) + ? "PARTIAL_CONTIG_END" // PD-2267 + : "PARTIAL" + : isMutation () + ? "POINT" + : "BLAST" ); // PD-2088, PD-2320 bool suffix = true; @@ -1241,7 +1252,6 @@ struct Batch void blastParetoBetter () - // BLAST: Pareto-better() // Input: blastAls // Output: goodBlastAls { @@ -1283,7 +1293,7 @@ struct Batch // Input: blastAls, domains, hmmAls // Output: goodBlastAls { - // BlastAlign::frameShift + // BlastAlignment::frameShift for (auto& it : blastAls) for (Iter> iter (it. second); iter. next ();) if (! (*iter)->targetProt) @@ -1296,6 +1306,7 @@ struct Batch break; } + // BlastAlignment::good() for (auto& it : blastAls) for (Iter> iter (it. second); iter. next ();) if (! (*iter)->good ()) @@ -1869,22 +1880,33 @@ struct ThisApplication : Application delete hmmAl; continue; } - auto al = new BlastAlignment (*hmmAl); + const BlastAlignment* bestBlastAl = nullptr; // PD-3475 + if (const VectorOwn* blastAls_ = findPtr (batch. blastAls, hmmAl->sseqid)) + { + size_t nident = 0; + for (const BlastAlignment* blastAl : *blastAls_) + if (maximize (nident, blastAl->nident)) + bestBlastAl = blastAl; + } + auto al = new BlastAlignment (*hmmAl, bestBlastAl); hmmAl->blastAl. reset (al); if (verbose ()) cout << al->targetName << " " << al->gene << endl; const HmmAlignment::Domain domain = batch. domains [HmmAlignment::Pair (al->targetName, al->gene)]; if (! domain. hmmLen) continue; // domain does not exist - /*al->refLen = domain. hmmLen; - al->refStart = domain. hmmStart; - al->refEnd = domain. hmmStop; */ - al->targetLen = domain. seqLen; - al->targetStart = domain. seqStart; - al->targetEnd = domain. seqStop; - al->setTargetAlign (); - ASSERT (! al->refExactlyMatched ()); - ASSERT (! al->partial ()); + if (! bestBlastAl) + { + /*al->refLen = domain. hmmLen; + al->refStart = domain. hmmStart; + al->refEnd = domain. hmmStop; */ + al->targetLen = domain. seqLen; + al->targetStart = domain. seqStart; + al->targetEnd = domain. seqStop; + al->setTargetAlign (); + //ASSERT (! al->refExactlyMatched ()); + //ASSERT (! al->partial ()); + } al->qc (); batch. hmmAls << hmmAl; } diff --git a/amrfinder.cpp b/amrfinder.cpp index e2a6d44..70a58d5 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: -* 3.8.14 08/26/2020 PD-3470 method FRAME_SHIFT, amr_report is faster +* 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available +* 3.8.14 08/27/2020 PD-3470 method FRAME_SHIFT, amr_report is faster * 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit is preferred to a partial protein hit * 3.8.12 08/24/2020 PD-2394 fusion genes are reported to include both gene symbols on each line * 3.8.11 08/21/2020 PD-2407 --type diff --git a/version.txt b/version.txt index c7413b8..84691dd 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.14 +3.8.15 From 454307c75675b4a74ae9bca379334f92f3f552ab Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Tue, 1 Sep 2020 17:50:39 -0400 Subject: [PATCH 16/36] PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP --- amr_report.cpp | 42 ++++++++++++++++++++++++++++++++++++++---- amrfinder.cpp | 2 ++ version.txt | 2 +- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 5d6d8ad..01ead40 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -560,7 +560,7 @@ struct BlastAlignment : Alignment : proteinName + " [UNKNOWN]" : proteinName ) - //+ ifS (reportPseudo, ifS (frameShift, " " + frameShiftS)) + //+ ifS (reportPseudo, ifS (stopCodon, " [STOP_CODON]")) << (isMutation () || getFam () -> reportable >= 2 ? "core" : "plus"); // PD-2825 // PD-1856 if (isMutation ()) @@ -629,6 +629,8 @@ struct BlastAlignment : Alignment << '\t' << targetProt << '\t' << nident << '\t' << refMutation + << '\t' << stopCodon + << '\t' << frameShift << '\t'; os << td. str () << endl; } @@ -725,7 +727,7 @@ struct BlastAlignment : Alignment || method == "PARTIAL" || method == "PARTIAL_CONTIG_END" ) - && ! targetProt + //&& ! targetProt ) { if (stopCodon) @@ -895,7 +897,7 @@ struct BlastAlignment : Alignment LESS_PART (other, *this, refExactlyMatched ()); //LESS_PART (other, *this, allele ()); // PD-2352 LESS_PART (other, *this, alleleReported ()); - LESS_PART (*this, other, partial ()); // PD-2322 + //LESS_PART (*this, other, partial ()); LESS_PART (other, *this, targetProt); } } @@ -1285,6 +1287,20 @@ struct Batch if (verbose ()) cout << "# Best Blasts: " << getSize (goodBlastAls) << endl; } + + + void setStopCodon (BlastAlignment &blastAlP) + { ASSERT (blastAlP. targetProt); + for (const Locus& cds : blastAlP. cdss) + if (const VectorOwn* blastAlsX = findPtr (blastAls, cds. contig)) + for (const BlastAlignment* blastAlX : *blastAlsX) + if ( ! blastAlX->targetProt + && blastAlX->stopCodon + && blastAlP. better (*blastAlX) + ) + blastAlP. stopCodon = true; + } + public: @@ -1302,7 +1318,6 @@ struct Batch { iter. erase (); var_cast (blastAl) -> frameShift = true; - //blastAl->saveText (cerr); break; } @@ -1314,6 +1329,25 @@ struct Batch if (verbose ()) cout << "# Good Blasts: " << getSize (blastAls) << endl; + // PD-2322 + for (const auto& it : blastAls) + for (const BlastAlignment* blastAlP : it. second) + if (blastAlP->targetProt) + setStopCodon (* var_cast (blastAlP)); + for (const HmmAlignment* hmmAl : hmmAls) + { + const BlastAlignment* blastAl = hmmAl->blastAl. get (); + ASSERT (blastAl); + setStopCodon (* var_cast (blastAl)); + } + if (verbose (-1)) + { + cout << "After setStopCodon():" << endl; + for (const auto& it : blastAls) + for (const BlastAlignment* blastAl : it. second) + blastAl->saveText (cout); + } + if (retainBlasts) { ASSERT (goodBlastAls. empty ()); diff --git a/amrfinder.cpp b/amrfinder.cpp index 70a58d5..3a7be7e 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which * * Release changes: +* 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP * 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available * 3.8.14 08/27/2020 PD-3470 method FRAME_SHIFT, amr_report is faster * 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit is preferred to a partial protein hit @@ -824,6 +825,7 @@ struct ThisApplication : ShellApplication } // PD-2244, PD-3230 + // use col2num() ?? const string sortS (emptyArg (dna) && emptyArg (gff) ? "-k1,1 -k2,2" : "-k2,2 -k3,3n -k4,4n -k5,5 -k1,1 -k6,6"); // Sorting AMR report exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); diff --git a/version.txt b/version.txt index 84691dd..eee6392 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.15 +3.8.16 From 24374ab2559a8f2d5ffd602297b3c57db8430674 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Wed, 2 Sep 2020 16:57:32 -0400 Subject: [PATCH 17/36] PD-3528 ordering of rows in the report is broken with parameter --name --- amrfinder.cpp | 53 +++++++++++++++++++++++++++++++++++++------- amrfinder_update.cpp | 2 +- common.cpp | 9 ++++---- common.hpp | 4 ++-- version.txt | 2 +- 5 files changed, 54 insertions(+), 16 deletions(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index 3a7be7e..d1ff2f7 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,10 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* awk, cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which +* awk, cat, cp, cut, grep, head, ln, mkdir, mv, sort, tail, uniq, which * * Release changes: +* 3.8.17 09/02/2020 PD-3528 ordering of rows in the report is broken with parameter --name * 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP * 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available * 3.8.14 08/27/2020 PD-3470 method FRAME_SHIFT, amr_report is faster @@ -281,9 +282,31 @@ struct ThisApplication : ShellApplication string col2num (const string &colName) const + // Return: number + // Input: tmp + ".amr": must have the header line { return exec2str ("head -1 " + tmp + ".amr | tr '\t' '\n' | grep -n \"" + colName + "\" | cut -f 1 -d ':'", "col"); } + + + + struct SortField : Named + { + bool numeric {false}; + + explicit SortField (const string &name_arg, + bool numeric_arg = false) + : Named (name_arg) + , numeric (numeric_arg) + { + ASSERT (str2 (name) > 0); + } + void saveText (ostream &os) const + { os << "-k" << name << ',' << name; + if (numeric) + os << 'n'; + } + }; @@ -741,10 +764,13 @@ struct ThisApplication : ShellApplication th. exec (fullProg ("blastx") + " -query " + dna + " -db " + tmp + ".db/AMRProt" + " " + blastx_par + to_string (gencode) + " " BLAST_FMT " -out " + tmp + ".blastx > /dev/null 2> /dev/null", threadsAvailable); - amr_report_blastx = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len"; } else + { exec ("cp /dev/null " + tmp + ".blastx"); + exec ("cp /dev/null " + tmp + ".len"); + } + amr_report_blastx = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len"; } @@ -789,7 +815,7 @@ struct ThisApplication : ShellApplication } - // ".amr" + // tmp + ".amr", tmp + ".mutation_all" const string nameS (emptyArg (input_name) ? "" : " -name " + input_name); { const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all")); @@ -824,12 +850,23 @@ struct ThisApplication : ShellApplication typeFilter += " || $" + typeCol + " == \"" + t + "\""; } - // PD-2244, PD-3230 - // use col2num() ?? - const string sortS (emptyArg (dna) && emptyArg (gff) ? "-k1,1 -k2,2" : "-k2,2 -k3,3n -k4,4n -k5,5 -k1,1 -k6,6"); // Sorting AMR report - exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); - exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " | uniq >> " + tmp + ".amr-out"); + // PD-2244, PD-3230 + string sortS; + { + Vector sortFields; + if (! (emptyArg (dna) && emptyArg (gff))) + sortFields << SortField (col2num ("Contig id")) + << SortField (col2num ("Start"), true) + << SortField (col2num ("Stop"), true) + << SortField (col2num ("Strand")); + sortFields << SortField (col2num ("Protein identifier")) + << SortField (col2num ("Gene symbol")); + for (const SortField& sf : sortFields) + sortS += " " + sf. str (); + } + exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); + exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " >> " + tmp + ".amr-out"); if (! typeFilter. empty ()) exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".amr-out > " + tmp + ".amr"); diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp index 2350dda..52a3183 100644 --- a/amrfinder_update.cpp +++ b/amrfinder_update.cpp @@ -30,7 +30,7 @@ * Updating of AMRFinder data * * Dependencies: NCBI BLAST, HMMer -* mkdir, ln +* ln, mkdir * curl.{h,c} * * Release changes: see amrfinder.cpp diff --git a/common.cpp b/common.cpp index 3726bc7..145cccf 100644 --- a/common.cpp +++ b/common.cpp @@ -3027,14 +3027,15 @@ string ShellApplication::fullProg (const string &progName) const string ShellApplication::exec2str (const string &cmd, - const string &tmpPrefix, + const string &tmpName, const string &logFName) const { - ASSERT (! contains (tmpPrefix, ' ')); - const string out (tmp + "." + tmpPrefix); + ASSERT (! contains (tmpName, ' ')); + const string out (tmp + "." + tmpName); exec (cmd + " > " + out, logFName); const StringVector vec (out, (size_t) 1); - QC_ASSERT (vec. size () == 1); + if (vec. size () != 1) + throw runtime_error (cmd + "\nOne line is expected"); return vec [0]; } diff --git a/common.hpp b/common.hpp index 99598e7..849b9ef 100644 --- a/common.hpp +++ b/common.hpp @@ -3632,9 +3632,9 @@ struct ShellApplication : Application // Return: shellQuote (directory + progName) + ' ' // Requires: After findProg(progName) string exec2str (const string &cmd, - const string &tmpPrefix, + const string &tmpName, const string &logFName = string()) const; - // Return: `cmd > .tmpPredix && cat .tmpPredix` + // Return: `cmd > .tmpName && cat .tmpName` // Requires: cmd produces one line }; #endif diff --git a/version.txt b/version.txt index eee6392..e29d809 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.16 +3.8.17 From 713fee451f5b9f3b3d4526e612e947aa3aa16011 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Thu, 3 Sep 2020 09:46:41 -0400 Subject: [PATCH 18/36] Update test data for new features --- test_both.expected | 4 ++-- test_prot.expected | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_both.expected b/test_both.expected index 48131fb..ca7bdd2 100644 --- a/test_both.expected +++ b/test_both.expected @@ -8,7 +8,7 @@ NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA NA contig10 486 1307 + blaOXA class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NA NA -blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA +blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA NA contig14 1 1089 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA @@ -17,4 +17,4 @@ NA contig15 1 2905 + 23S_A2058T Escherichia azithromycin/erythromycin/telithromy NA contig16 1 720 + nfsA_K141STOP Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA NA contig16 1 720 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA NA contig17 1 247 + ampC_T-14TGT Escherichia cephalosporin resistant ampC core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family nitroimidazole resistance protein NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_prot.expected b/test_prot.expected index d78cb36..b0f4e73 100644 --- a/test_prot.expected +++ b/test_prot.expected @@ -10,4 +10,4 @@ qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regu emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN PARTIALP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family nitroimidazole resistance protein NF000262.1 NimIJ family nitroimidazole resistance protein From 812508eaf748c025370a40aab72ba460c98832b6 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Thu, 3 Sep 2020 14:46:10 -0400 Subject: [PATCH 19/36] PD-3292 'which' dependence is removed --- amrfinder.cpp | 3 ++- common.cpp | 39 ++++++++++++++++++++------------------- common.hpp | 9 +++++++-- version.txt | 2 +- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index d1ff2f7..838bf0a 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,10 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* awk, cat, cp, cut, grep, head, ln, mkdir, mv, sort, tail, uniq, which +* awk, cat, cp, cut, grep, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* 3.8.18 09/03/2020 PD-3292 removed the dependece on "which" * 3.8.17 09/02/2020 PD-3528 ordering of rows in the report is broken with parameter --name * 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP * 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available diff --git a/common.cpp b/common.cpp index 145cccf..65b0d53 100644 --- a/common.cpp +++ b/common.cpp @@ -185,7 +185,8 @@ namespace // time ?? #ifndef _MSC_VER - const char* hostname = getenv ("HOSTNAME");//const char* shell = getenv ("SHELL"); + const char* hostname = getenv ("HOSTNAME"); +//const char* shell = getenv ("SHELL"); const char* shell = getenv ("SHELL"); const char* pwd = getenv ("PWD"); #endif @@ -1120,6 +1121,22 @@ void exec (const string &cmd, +#ifndef _MSC_VER +string which (const string &progName) +{ + ASSERT (! progName. empty ()); + + const List paths (str2list (getenv ("PATH"), ':')); + for (const string& path : paths) + if (fileExists (path + "/" + progName)) + return path + "/"; + + return string (); +} +#endif + + + // Threads @@ -2947,7 +2964,7 @@ void ShellApplication::initEnvironment () // execDir, programName execDir = getProgramDirName (); if (execDir. empty ()) - execDir = which (programArgs. front ()); + execDir = this->which (programArgs. front ()); if (! isRight (execDir, "/")) throw logic_error ("Cannot identify the directory of the executable"); { @@ -2974,22 +2991,6 @@ void ShellApplication::body () const -string ShellApplication::which (const string &progName) const -{ - if (tmp. empty ()) - throw runtime_error ("Temporary file is needed"); - - try { exec ("which " + shellQuote (progName) + " 1> " + tmp + ".src 2> /dev/null"); } - catch (const runtime_error &) - { return string (); } - - const StringVector vec (tmp + ".src", (size_t) 1); - QC_ASSERT (vec. size () == 1); - return getDirName (vec [0]); -} - - - void ShellApplication::findProg (const string &progName) const { ASSERT (! progName. empty ()); @@ -3001,7 +3002,7 @@ void ShellApplication::findProg (const string &progName) const { dir = fileExists (execDir + progName) ? execDir - : which (progName); + : this->which (progName); if (dir. empty ()) throw runtime_error ("Binary " + shellQuote (progName) + " is not found.\nPlease make sure that " + shellQuote (progName) + " is in the same directory as " + shellQuote (Common_sp::programName) + " or is in your $PATH.");; diff --git a/common.hpp b/common.hpp index 849b9ef..ff41c02 100644 --- a/common.hpp +++ b/common.hpp @@ -1241,6 +1241,11 @@ int getVerbosity (); void exec (const string &cmd, const string &logFName = string()); +#ifndef _MSC_VER + string which (const string &progName); + // Return: isRight(,"/") or empty() if there is no path +#endif + // Threads @@ -3624,8 +3629,8 @@ struct ShellApplication : Application protected: static bool emptyArg (const string &s) { return s. empty () || s == "\'\'"; } - string which (const string &progName) const; - // Return: isRight(,"/") or empty() + string which (const string &progName) const + { return Common_sp::which (progName); } void findProg (const string &progName) const; // Output: prog2dir string fullProg (const string &progName) const; diff --git a/version.txt b/version.txt index e29d809..9ad6380 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.17 +3.8.18 From b27088153614cbbabe5c5206e053fb020ee9da58 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Fri, 4 Sep 2020 13:37:13 -0400 Subject: [PATCH 20/36] PD-3292 removed the dependence on 'grep' --- amrfinder.cpp | 74 ++++++++++++++++++++++++++++++++------------------- common.cpp | 37 +++++++++++++++++++++++--- common.hpp | 9 +++---- version.txt | 2 +- 4 files changed, 85 insertions(+), 37 deletions(-) diff --git a/amrfinder.cpp b/amrfinder.cpp index 838bf0a..4c0d997 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,10 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* awk, cat, cp, cut, grep, head, ln, mkdir, mv, sort, tail, uniq +* awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* 3.8.19 09/04/2020 PD-3292 removed the dependece on "grep" * 3.8.18 09/03/2020 PD-3292 removed the dependece on "which" * 3.8.17 09/02/2020 PD-3528 ordering of rows in the report is broken with parameter --name * 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP @@ -246,26 +247,15 @@ struct ThisApplication : ShellApplication bool blastThreadable (const string &blast) const { exec (fullProg (blast) + " -help > " + tmp + ".blast_help"); - return ! system (("grep '^ *\\-num_threads' " + tmp + ".blast_help > /dev/null 2> /dev/null"). c_str ()); - } - - - - size_t get_threads_max_max () const - { - #if __APPLE__ - // stderr << "Compiled for MacOS" << "\n"; - int count; - size_t count_len = sizeof(count); - sysctlbyname("hw.logicalcpu", &count, &count_len, NULL, 0); - // fprintf(stderr,"you have %i cpu cores", count); - return count; - #else - // stderr << "Compiled for Linux" << "\n"; - //exec ("nproc --all > " + tmp + ".nproc"); - //exec ("grep processor /proc/cpuinfo | tail -1 | awk '{print $3}' > " + tmp + ".nproc"); - return str2 (exec2str ("grep -c processor /proc/cpuinfo", "nproc")); - #endif + LineInput f (tmp + ".blast_help"); + while (f. nextLine ()) + { + trim (f. line); + if (contains (f. line, "-num_threads")) + return true; + } + return false; + //return ! system (("grep '^ *\\-num_threads' " + tmp + ".blast_help > /dev/null 2> /dev/null"). c_str ()); } @@ -286,7 +276,17 @@ struct ThisApplication : ShellApplication // Return: number // Input: tmp + ".amr": must have the header line { - return exec2str ("head -1 " + tmp + ".amr | tr '\t' '\n' | grep -n \"" + colName + "\" | cut -f 1 -d ':'", "col"); + LineInput f (tmp + ".amr"); + EXEC_ASSERT (f. nextLine ()); + const List columns (str2list (f. line, '\t')); + size_t n = 1; + for (const string& column : columns) + if (column == colName) + return to_string (n); + else + n++; + throw runtime_error ("Column " + strQuote (colName) + " not found in " + tmp + ".amr"); + //return exec2str ("head -1 " + tmp + ".amr | tr '\t' '\n' | grep -n \"" + colName + "\" | cut -f 1 -d ':'", "col"); } @@ -678,13 +678,31 @@ struct ThisApplication : ShellApplication if (! emptyArg (gff) && ! contains (parm, "-bed")) { string locus_tag; - const int status = system (("grep '^>.*\\[locus_tag=' " + prot + " > /dev/null"). c_str ()); - const bool locus_tagP = (status == 0); - if (locus_tagP /*|| gpipe*/) { - locus_tag = " -locus_tag " + tmp + ".match"; - gff_match = " -gff_match " + tmp + ".match"; - } + #if 0 + const int status = system (("grep '^>.*\\[locus_tag=' " + prot + " > /dev/null"). c_str ()); + const bool locus_tagP = (status == 0); + #else + bool locus_tagP = false; + { + LineInput f (unQuote (prot)); + while (f. nextLine ()) + if ( ! f. line. empty () + && f. line [0] == '>' + && contains (f. line, "[locus_tag=") + ) + { + locus_tagP = true; + break; + } + } + #endif + if (locus_tagP /*|| gpipe*/) + { + locus_tag = " -locus_tag " + tmp + ".match"; + gff_match = " -gff_match " + tmp + ".match"; + } + } prog2dir ["gff_check"] = execDir; string dnaPar; if (! emptyArg (dna)) diff --git a/common.cpp b/common.cpp index 65b0d53..e798134 100644 --- a/common.cpp +++ b/common.cpp @@ -40,11 +40,15 @@ #include #include #include +#include + #ifndef _MSC_VER #include -//#include + #ifdef __APPLE__ + #include + #include + #endif #endif -#include @@ -116,6 +120,31 @@ bool isMainThread () { return threads_max == 1 || get_thread_id () == main_thread_id; } + +#ifndef _MSC_VER +size_t get_threads_max_max () +{ +#if __APPLE__ + // stderr << "Compiled for MacOS" << "\n"; + int count; + size_t count_len = sizeof(count); + sysctlbyname ("hw.logicalcpu", &count, &count_len, NULL, 0); + // fprintf(stderr,"you have %i cpu cores", count); + return count; +#else + LineInput f ("/proc/cpuinfo"); + size_t n = 0; + while (f. nextLine ()) + if (isLeft (f. line, "processor")) + n++; + return n; +#endif +} +#endif + + + + bool Chronometer::enabled = false; @@ -1128,7 +1157,9 @@ string which (const string &progName) const List paths (str2list (getenv ("PATH"), ':')); for (const string& path : paths) - if (fileExists (path + "/" + progName)) + if ( ! path. empty () + && fileExists (path + "/" + progName) + ) return path + "/"; return string (); diff --git a/common.hpp b/common.hpp index ff41c02..074f092 100644 --- a/common.hpp +++ b/common.hpp @@ -58,11 +58,6 @@ #pragma warning (default : 4005) #endif -#ifdef __APPLE__ - #include - #include -#endif - #include #include #include @@ -129,6 +124,10 @@ extern size_t threads_max; // >= 1 extern thread::id main_thread_id; bool isMainThread (); +#ifndef _MSC_VER + size_t get_threads_max_max (); +#endif + [[noreturn]] void errorExit (const char* msg, diff --git a/version.txt b/version.txt index 9ad6380..143c2f5 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.18 +3.8.19 From 2803b0ca611febac4b33b247b994d8e0eb1095e5 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 14 Sep 2020 14:44:12 -0400 Subject: [PATCH 21/36] PD-3531 "--parm -print_fam" bug --- amr_report.cpp | 2 +- amrfinder.cpp | 21 ++++++++------------- version.txt | 2 +- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 01ead40..96cd6f7 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -1570,7 +1570,7 @@ struct Batch << "Start" // targetStart // 3 << "Stop" // targetEnd // 4 << "Strand"; // targetStrand // 5 - td << (print_fam ? "FAM.id" : "Gene symbol") // 6 or 2 + td << "Gene symbol" // 6 or 2 << "Sequence name" // 7 or 3 << "Scope" // PD-2825 // 8 or 4 // PD-1856 diff --git a/amrfinder.cpp b/amrfinder.cpp index 4c0d997..61a2d77 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,8 +33,10 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: -* 3.8.19 09/04/2020 PD-3292 removed the dependece on "grep" -* 3.8.18 09/03/2020 PD-3292 removed the dependece on "which" +* --force_update implies --update; -U +* 3.8.20 09/14/2020 PD-3531 "--parm -print_fam" bug +* 3.8.19 09/04/2020 PD-3292 removed the dependence on "grep" +* 3.8.18 09/03/2020 PD-3292 removed the dependence on "which" * 3.8.17 09/02/2020 PD-3528 ordering of rows in the report is broken with parameter --name * 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP * 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available @@ -204,7 +206,7 @@ struct ThisApplication : ShellApplication : ShellApplication (HELP, true, true, true) { addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 - addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469 + addFlag ("force_update", "Force updating the AMRFinder database", 'U'); // PD-3469 addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA"); addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA"); addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); @@ -255,7 +257,6 @@ struct ThisApplication : ShellApplication return true; } return false; - //return ! system (("grep '^ *\\-num_threads' " + tmp + ".blast_help > /dev/null 2> /dev/null"). c_str ()); } @@ -286,7 +287,6 @@ struct ThisApplication : ShellApplication else n++; throw runtime_error ("Column " + strQuote (colName) + " not found in " + tmp + ".amr"); - //return exec2str ("head -1 " + tmp + ".amr | tr '\t' '\n' | grep -n \"" + colName + "\" | cut -f 1 -d ':'", "col"); } @@ -316,8 +316,8 @@ struct ThisApplication : ShellApplication const string prot = shellQuote (getArg ("protein")); const string dna = shellQuote (getArg ("nucleotide")); string db = getArg ("database"); - const bool update = getFlag ("update"); const bool force_update = getFlag ("force_update"); + const bool update = getFlag ("update") || force_update; const string gff = shellQuote (getArg ("gff")); const bool pgap = getFlag ("pgap"); const double ident = arg2double ("ident_min"); @@ -361,8 +361,8 @@ struct ThisApplication : ShellApplication if (report_common && ! add_plus) throw runtime_error ("--report_common requires --plus"); - if (force_update && ! update) - throw runtime_error ("--force_update requires --update"); + //if (force_update && ! update) + //throw runtime_error ("--force_update requires --update"); // PD-3437 if (! emptyArg (mutation_all) && emptyArg (organism)) @@ -679,10 +679,6 @@ struct ThisApplication : ShellApplication { string locus_tag; { - #if 0 - const int status = system (("grep '^>.*\\[locus_tag=' " + prot + " > /dev/null"). c_str ()); - const bool locus_tagP = (status == 0); - #else bool locus_tagP = false; { LineInput f (unQuote (prot)); @@ -696,7 +692,6 @@ struct ThisApplication : ShellApplication break; } } - #endif if (locus_tagP /*|| gpipe*/) { locus_tag = " -locus_tag " + tmp + ".match"; diff --git a/version.txt b/version.txt index 143c2f5..7671800 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.19 +3.8.20 From 509de198f786a9e6ddb09cad48046ffe631ff781 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 14 Sep 2020 18:23:55 -0400 Subject: [PATCH 22/36] PD-3536 point mutations merging bug --- amr_report.cpp | 58 ++++++++++++++++++++++---------------------------- amrfinder.cpp | 3 ++- version.txt | 2 +- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 96cd6f7..f98c9fa 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -862,45 +862,39 @@ struct BlastAlignment : Alignment if (! targetProt && ! other. matchesCds (*this)) return false; } - if (isMutation ()) - { - if (! other. isMutation ()) - return true; + if (isMutation () && other. isMutation ()) + { const Set mutationSymbols ( getMutationSymbols ()); const Set otherMutationSymbols (other. getMutationSymbols ()); if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) return targetProt; if (mutationSymbols. containsAll (otherMutationSymbols)) return true; - if (otherMutationSymbols. containsAll (mutationSymbols)) - return false; + return false; // PD-3536 } - else - { - if (targetProt == other. targetProt) - { - // PD-807 - if ( ! (targetProt && famId == other. famId) // PD-2441 - //&& ! sameTarget (other) - && ! other. insideEq (*this) - && ! insideEq (other) - ) - return false; - //if (targetProt) - //{ LESS_PART (other, *this, isMutation ()); } - LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 - LESS_PART (other, *this, nident); - LESS_PART (*this, other, refEffectiveLen ()); - } - else - { - LESS_PART (other, *this, refExactlyMatched ()); - //LESS_PART (other, *this, allele ()); // PD-2352 - LESS_PART (other, *this, alleleReported ()); - //LESS_PART (*this, other, partial ()); - LESS_PART (other, *this, targetProt); - } - } + if (targetProt == other. targetProt) + { + // PD-807 + if ( ! (targetProt && famId == other. famId) // PD-2441 + //&& ! sameTarget (other) + && ! other. insideEq (*this) + && ! insideEq (other) + ) + return false; + //if (targetProt) + //{ LESS_PART (other, *this, isMutation ()); } + LESS_PART (other, *this, refExactlyMatched ()); // PD-1261, PD-1678 + LESS_PART (other, *this, nident); + LESS_PART (*this, other, refEffectiveLen ()); + } + else + { + LESS_PART (other, *this, refExactlyMatched ()); + //LESS_PART (other, *this, allele ()); // PD-2352 + LESS_PART (other, *this, alleleReported ()); + //LESS_PART (*this, other, partial ()); + LESS_PART (other, *this, targetProt); + } return true; } public: diff --git a/amrfinder.cpp b/amrfinder.cpp index 61a2d77..a91f22e 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: -* --force_update implies --update; -U +* 3.8.21 09/14/2020 PD-3536 point mutation merging bug +* PD-3469 --force_update implies --update; -U * 3.8.20 09/14/2020 PD-3531 "--parm -print_fam" bug * 3.8.19 09/04/2020 PD-3292 removed the dependence on "grep" * 3.8.18 09/03/2020 PD-3292 removed the dependence on "which" diff --git a/version.txt b/version.txt index 7671800..335ca69 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.20 +3.8.21 From b06f4c210c23e237169602dcc7e3843fc5300272 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Tue, 15 Sep 2020 13:06:44 -0400 Subject: [PATCH 23/36] Changed message for --force_update --- amrfinder_update.cpp | 2 +- test.expected | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 test.expected diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp index 52a3183..c156ff3 100644 --- a/amrfinder_update.cpp +++ b/amrfinder_update.cpp @@ -352,7 +352,7 @@ Requirements:\n\ ) { stderr << shellQuote (latestDir) << " contains the latest version: " << version_old. front () << '\n'; - stderr << "No update is done\n"; + stderr << "Skipping update, use amrfinder --force_update to overwrite the existing database\n"; return; } } diff --git a/test.expected b/test.expected deleted file mode 100644 index 3854391..0000000 --- a/test.expected +++ /dev/null @@ -1,10 +0,0 @@ -Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description -blaTEM-156 contig1 101 961 + blaTEM-156 class A beta-lactamase TEM-156 AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -NA contig10 486 1307 + blaOXA class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NF000270.1 class D beta-lactamase -blaPDC-114_blast contig2 1 1191 + blaPDC PDC family class C beta-lactamase AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase -blaOXA-436_partial contig3 101 802 + blaOXA OXA-48 family class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase -vanG contig4 101 1147 + vanG D-alanine--D-serine ligase VanG AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG -NA contig8 101 700 + blaTEM TEM family class A beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -NA contig9 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDX 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib -NA contig9 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDX 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein From f46c3e9041650c7ff4e61638047d93948ed8f0c9 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Tue, 15 Sep 2020 18:21:17 -0400 Subject: [PATCH 24/36] PD-3470, PD-3526 bugs with frame shifts and point mutation reference preference --- alignment.cpp | 6 ++-- amr_report.cpp | 83 +++++++++++++++++++++++++++++--------------------- amrfinder.cpp | 7 +++-- version.txt | 2 +- 4 files changed, 56 insertions(+), 42 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 6cb6c53..d807454 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -850,7 +850,7 @@ bool Alignment::getFrameShift_right (const Alignment &rightPart, if ( refStart >= rightPart. refStart || refEnd >= rightPart. refEnd - || difference (rightPart. refStart, refEnd) > diff_max / 3 + || refEnd + diff_max / 3 < rightPart. refStart ) return false; @@ -858,7 +858,7 @@ bool Alignment::getFrameShift_right (const Alignment &rightPart, { if ( targetStart >= rightPart. targetStart || targetEnd >= rightPart. targetEnd - || difference (rightPart. targetStart, targetEnd) > diff_max + || targetEnd + diff_max < rightPart. targetStart ) return false; } @@ -866,7 +866,7 @@ bool Alignment::getFrameShift_right (const Alignment &rightPart, { if ( targetStart <= rightPart. targetStart || targetEnd <= rightPart. targetEnd - || difference (rightPart. targetEnd, targetStart) > diff_max + || targetStart > rightPart. targetEnd + diff_max ) return false; } diff --git a/amr_report.cpp b/amr_report.cpp index f98c9fa..1b7badc 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -334,7 +334,6 @@ struct BlastAlignment : Alignment string product; Vector cdss; static constexpr size_t mismatchTail_aa = 10; // PAR - size_t mismatchTailTarget {0}; const HmmAlignment* hmmAl {nullptr}; @@ -420,10 +419,6 @@ struct BlastAlignment : Alignment ) nident++; - mismatchTailTarget = mismatchTail_aa; - if (! targetProt) - mismatchTailTarget *= 3; - if (! targetProt) cdss << move (Locus (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0)); @@ -623,14 +618,14 @@ struct BlastAlignment : Alignment ) { if (verbose ()) - os << refExactlyMatched () - << '\t' << allele () - << '\t' << alleleReported () - << '\t' << targetProt - << '\t' << nident - << '\t' << refMutation - << '\t' << stopCodon - << '\t' << frameShift + os << refExactlyMatched () // 1 + << '\t' << allele () // 2 + << '\t' << alleleReported () // 3 + << '\t' << targetProt // 4 + << '\t' << nident // 5 + << '\t' << refMutation // 6 + << '\t' << stopCodon // 7 + << '\t' << frameShift // 8 << '\t'; os << td. str () << endl; } @@ -780,21 +775,13 @@ struct BlastAlignment : Alignment return passBlastRule (completeBR); } private: -#if 0 - bool sameTarget (const BlastAlignment &other) const - // Symmetric - { ASSERT (targetProt == other. targetProt); - return targetStrand == other. targetStrand - && difference (targetStart, other. targetStart) <= mismatchTailTarget - && difference (targetEnd, other. targetEnd) <= mismatchTailTarget; - } - // Requires: same targetName -#endif + size_t mismatchTailTarget () const + { return (mismatchTail_aa + (frameShift ? 20 : 0)) * (targetProt ? 1 : 3); } bool insideEq (const BlastAlignment &other) const { ASSERT (targetProt == other. targetProt); - return targetStrand == other. targetStrand - && targetStart + mismatchTailTarget >= other. targetStart - && targetEnd <= other. targetEnd + mismatchTailTarget; + return targetStrand == other. targetStrand + && targetStart + mismatchTailTarget () >= other. targetStart + && targetEnd <= other. targetEnd + mismatchTailTarget (); } // Requires: same targetName bool matchesCds (const BlastAlignment &other) const @@ -868,9 +855,10 @@ struct BlastAlignment : Alignment const Set otherMutationSymbols (other. getMutationSymbols ()); if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) return targetProt; - if (mutationSymbols. containsAll (otherMutationSymbols)) + if ( mutationSymbols. containsAll (otherMutationSymbols) + && ! otherMutationSymbols. containsAll (mutationSymbols) + ) return true; - return false; // PD-3536 } if (targetProt == other. targetProt) { @@ -1305,15 +1293,34 @@ struct Batch { // BlastAlignment::frameShift for (auto& it : blastAls) - for (Iter> iter (it. second); iter. next ();) - if (! (*iter)->targetProt) - for (const BlastAlignment* blastAl : it. second) - if (! blastAl->targetProt && blastAl->getFrameShift (**iter, 60)) // PAR + { + for (const BlastAlignment* &blastAl1 : it. second) + if (! blastAl1->targetProt) + { + bool found = false; + for (const BlastAlignment* blastAl2 : it. second) + { + if ( blastAl2 + && ! blastAl2->targetProt + && blastAl2->getFrameShift (*blastAl1, 60) // PAR + ) { - iter. erase (); - var_cast (blastAl) -> frameShift = true; - break; + ASSERT (blastAl1 != blastAl2); + var_cast (blastAl2) -> frameShift = true; + if (verbose (-1)) + blastAl2->saveText (cout); + found = true; } + } + if (found) + { + const BlastAlignment* blastAl_ = blastAl1; + delete blastAl_; + blastAl1 = nullptr; + } + } + it. second. filterValue ([] (const BlastAlignment* blastAl) { return ! blastAl; }); + } // BlastAlignment::good() for (auto& it : blastAls) @@ -1321,7 +1328,13 @@ struct Batch if (! (*iter)->good ()) delete iter. erase (); if (verbose ()) + { cout << "# Good Blasts: " << getSize (blastAls) << endl; + if (verbose (-1)) + for (const auto& it : blastAls) + for (const BlastAlignment* blastAl : it. second) + blastAl->saveText (cout); + } // PD-2322 for (const auto& it : blastAls) diff --git a/amrfinder.cpp b/amrfinder.cpp index a91f22e..79339e7 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: -* 3.8.21 09/14/2020 PD-3536 point mutation merging bug +* 3.8.22 09/15/2020 PD-3470 frameshift detection bug; preference of point mutation reference proteins +* 3.8.21 09/14/2020 PD-3536 point mutations merging bug * PD-3469 --force_update implies --update; -U * 3.8.20 09/14/2020 PD-3531 "--parm -print_fam" bug * 3.8.19 09/04/2020 PD-3292 removed the dependence on "grep" @@ -665,7 +666,7 @@ struct ThisApplication : ShellApplication // PD-2967 - const string blastp_par ("-comp_based_stats 0 -evalue 1e-10 "); + const string blastp_par ("-comp_based_stats 0 -evalue 1e-10"); // was: -culling_limit 20 // PD-2967 if (! emptyArg (prot)) { @@ -729,7 +730,7 @@ struct ThisApplication : ShellApplication // " -task blastp-fast -word_size 6 -threshold 21 " // PD-2303 string num_threads; if (blastThreadable ("blastp") && prot_threads > 1) - num_threads = " -num_threads " + to_string (prot_threads); + num_threads = " -num_threads " + to_string (prot_threads); th. exec (fullProg ("blastp") + " -query " + prot + " -db " + tmp + ".db/AMRProt" +" " + blastp_par + num_threads + " " BLAST_FMT " -out " + tmp + ".blastp > /dev/null 2> /dev/null", prot_threads); diff --git a/version.txt b/version.txt index 335ca69..5bde6cb 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.21 +3.8.22 From aed519ce865db6148252babdae401a09b8bdbc4d Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Wed, 16 Sep 2020 18:16:13 -0400 Subject: [PATCH 25/36] PD-3536 simplifying point mutations preference --- amr_report.cpp | 39 +++++++++++++++++++++++++-------------- amrfinder.cpp | 1 + version.txt | 2 +- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 1b7badc..edbf4e9 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -428,7 +428,7 @@ struct BlastAlignment : Alignment if (verbose ()) cout << "Mutation protein found: " << refAccession << endl << line << endl; QC_ASSERT (isMutation ()); - setSeqChanges (*refMutations, 0 /*flankingLen*/ /*, mutation_all. get ()*/); + setSeqChanges (*refMutations, 0); if (verbose ()) cout << endl; } @@ -523,7 +523,7 @@ struct BlastAlignment : Alignment if (seqChanges_. empty ()) seqChanges_ << SeqChange (); for (const Locus& cds : cdss_) - for (SeqChange& seqChange : seqChanges_) + for (const SeqChange& seqChange : seqChanges_) { const Mutation* mut = seqChange. mutation; const string method (empty () ? na : getMethod (cds)); @@ -539,20 +539,20 @@ struct BlastAlignment : Alignment << (empty () ? na : (cds. contig. empty () ? (targetStrand ? "+" : "-") : (cds. strand ? "+" : "-"))); td << (isMutation () ? mut - ? seqChange. empty () - ? mut->wildtype () - : mut->geneMutation - : gene + "_" + seqChange. getMutationStr () + ? seqChange. empty () + ? mut->wildtype () + : mut->geneMutation + : gene + "_" + seqChange. getMutationStr () : print_fam - ? famId - : getGeneSymbols () + ? famId + : getGeneSymbols () ) << (isMutation () ? mut - ? seqChange. empty () - ? proteinName + " [WILDTYPE]" - : mut->name - : proteinName + " [UNKNOWN]" + ? seqChange. empty () + ? proteinName + " [WILDTYPE]" + : mut->name + : proteinName + " [UNKNOWN]" : proteinName ) //+ ifS (reportPseudo, ifS (stopCodon, " [STOP_CODON]")) @@ -849,7 +849,8 @@ struct BlastAlignment : Alignment if (! targetProt && ! other. matchesCds (*this)) return false; } - if (isMutation () && other. isMutation ()) + #if 0 + if (isMutation () && other. isMutation ()) { const Set mutationSymbols ( getMutationSymbols ()); const Set otherMutationSymbols (other. getMutationSymbols ()); @@ -860,6 +861,7 @@ struct BlastAlignment : Alignment ) return true; } + #endif if (targetProt == other. targetProt) { // PD-807 @@ -883,6 +885,13 @@ struct BlastAlignment : Alignment //LESS_PART (*this, other, partial ()); LESS_PART (other, *this, targetProt); } + if (isMutation () && other. isMutation ()) + { + const Set mutationSymbols ( getMutationSymbols ()); + const Set otherMutationSymbols (other. getMutationSymbols ()); + if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) + return targetProt; + } return true; } public: @@ -1373,7 +1382,7 @@ struct Batch //ASSERT (seqChange1. mutation); for (const BlastAlignment* blastAl2 : it. second) { - ASSERT (blastAl2->targetName == blastAl1->targetName); + ASSERT (blastAl2->targetName == blastAl1->targetName); if ( blastAl2->targetStrand == blastAl1->targetStrand && blastAl2 != blastAl1 ) @@ -1551,6 +1560,8 @@ struct Batch if (verbose ()) { cout << endl << "After process():" << endl; + if (mutation_all. get ()) + *mutation_all << endl << "After process():" << endl; for (const auto& it : goodBlastAls) for (const BlastAlignment* blastAl : it. second) { diff --git a/amrfinder.cpp b/amrfinder.cpp index 79339e7..77104ca 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* 3.8.23 09/16/2020 PD-3536 simplifying point mutations preference * 3.8.22 09/15/2020 PD-3470 frameshift detection bug; preference of point mutation reference proteins * 3.8.21 09/14/2020 PD-3536 point mutations merging bug * PD-3469 --force_update implies --update; -U diff --git a/version.txt b/version.txt index 5bde6cb..d7d3ae9 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.22 +3.8.23 From 3fa81eb8606f900797ab32196d452240afb712cd Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 21 Sep 2020 16:46:02 -0400 Subject: [PATCH 26/36] PD-3536 --pointmut_all reports all SNPs in a reference gene repetition; POINTX method with more SNPs is preferred over POINTP method --- alignment.hpp | 2 ++ amr_report.cpp | 32 +++++++++++++++++++++++++++----- amrfinder.cpp | 2 ++ dna_mutation.cpp | 11 ++++++----- version.txt | 2 +- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/alignment.hpp b/alignment.hpp index 2ecb2fa..c3730a3 100644 --- a/alignment.hpp +++ b/alignment.hpp @@ -137,6 +137,8 @@ struct SeqChange : Root char prev {'\0'}; const Mutation* mutation {nullptr}; + + const SeqChange* replacement {nullptr}; SeqChange () = default; diff --git a/amr_report.cpp b/amr_report.cpp index edbf4e9..2c374c4 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -614,7 +614,7 @@ struct BlastAlignment : Alignment } } if ( ! isMutation () - || (! seqChange. empty () && mut) // resistant mutation + || (! seqChange. empty () && mut && ! seqChange. replacement) // resistant mutation ) { if (verbose ()) @@ -708,6 +708,7 @@ struct BlastAlignment : Alignment ? alleleReported () ? "ALLELE" : "EXACT" // PD-776 + #if 0 : partial () ? truncated (cds) ? "PARTIAL_CONTIG_END" // PD-2267 @@ -715,6 +716,15 @@ struct BlastAlignment : Alignment : isMutation () ? "POINT" : "BLAST" + #else + : isMutation () + ? "POINT" + : partial () + ? truncated (cds) + ? "PARTIAL_CONTIG_END" // PD-2267 + : "PARTIAL" + : "BLAST" + #endif ); // PD-2088, PD-2320 bool suffix = true; @@ -850,6 +860,7 @@ struct BlastAlignment : Alignment return false; } #if 0 + // Moved below if (isMutation () && other. isMutation ()) { const Set mutationSymbols ( getMutationSymbols ()); @@ -883,7 +894,7 @@ struct BlastAlignment : Alignment //LESS_PART (other, *this, allele ()); // PD-2352 LESS_PART (other, *this, alleleReported ()); //LESS_PART (*this, other, partial ()); - LESS_PART (other, *this, targetProt); + //LESS_PART (other, *this, targetProt); // moved below } if (isMutation () && other. isMutation ()) { @@ -891,7 +902,16 @@ struct BlastAlignment : Alignment const Set otherMutationSymbols (other. getMutationSymbols ()); if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) return targetProt; + if ( mutationSymbols. containsAll (otherMutationSymbols) + && ! otherMutationSymbols. containsAll (mutationSymbols) + ) + return true; + if ( otherMutationSymbols. containsAll (mutationSymbols) + && ! mutationSymbols. containsAll (otherMutationSymbols) + ) + return false; } + LESS_PART (other, *this, targetProt); return true; } public: @@ -1386,15 +1406,17 @@ struct Batch if ( blastAl2->targetStrand == blastAl1->targetStrand && blastAl2 != blastAl1 ) - for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + //for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + for (SeqChange& seqChange2 : var_cast (blastAl2) -> seqChanges) { - SeqChange& seqChange2 = *iter; + //SeqChange& seqChange2 = *iter; //ASSERT (seqChange2. mutation); ASSERT (seqChange2. al == blastAl2); if ( seqChange1. start_target == seqChange2. start_target && seqChange1. better (seqChange2) ) - iter. erase (); + //iter. erase (); + seqChange2. replacement = & seqChange1; } } } diff --git a/amrfinder.cpp b/amrfinder.cpp index 77104ca..6bc80e7 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,8 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* POINTX method with more SNPs is preferred over POINTP method +* 3.8.24 09/21/2020 PD-3536 --pointmut_all reports all SNPs in a reference gene repetition * 3.8.23 09/16/2020 PD-3536 simplifying point mutations preference * 3.8.22 09/15/2020 PD-3470 frameshift detection bug; preference of point mutation reference proteins * 3.8.21 09/14/2020 PD-3536 point mutations merging bug diff --git a/dna_mutation.cpp b/dna_mutation.cpp index 5889d2c..d8ff87e 100644 --- a/dna_mutation.cpp +++ b/dna_mutation.cpp @@ -156,7 +156,7 @@ struct BlastnAlignment : Alignment // HMM td << na << na; - if (! seqChange. empty () && seqChange. mutation) // resistant mutation + if (! seqChange. empty () && seqChange. mutation && ! seqChange. replacement) // resistant mutation os << td. str () << endl; if (mutation_all. get ()) *mutation_all << td. str () << endl; @@ -343,16 +343,17 @@ struct ThisApplication : Application && blastAl2->targetStrand == blastAl1->targetStrand && blastAl2 != blastAl1 ) - //for (SeqChange& seqChange2 : blastAl2. seqChanges) - for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + //for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + for (SeqChange& seqChange2 : var_cast (blastAl2) -> seqChanges) { - SeqChange& seqChange2 = *iter; + //SeqChange& seqChange2 = *iter; ASSERT (seqChange2. al == blastAl2); //ASSERT (seqChange2. mutation); if ( seqChange1. start_target == seqChange2. start_target && seqChange1. better (seqChange2) ) - iter. erase (); + //iter. erase (); + seqChange2. replacement = & seqChange1; } } diff --git a/version.txt b/version.txt index d7d3ae9..2bc5cfc 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.23 +3.8.24 From cba46eddf06bc2dd9e6c3a9919c245af3ff2dccf Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Wed, 23 Sep 2020 15:27:45 -0400 Subject: [PATCH 27/36] Corrected test_prot.expected --- test_prot.expected | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_prot.expected b/test_prot.expected index b0f4e73..e090745 100644 --- a/test_prot.expected +++ b/test_prot.expected @@ -9,5 +9,5 @@ blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA -nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN PARTIALP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA +nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family nitroimidazole resistance protein NF000262.1 NimIJ family nitroimidazole resistance protein From c62d6745e7462e037707d0f100b670b5b849078a Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Fri, 25 Sep 2020 15:49:50 -0400 Subject: [PATCH 28/36] Add test for non-overwrite with -u option --- .github/workflows/ccpp.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 8e098a9..00a1ae7 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -24,7 +24,9 @@ jobs: run: ./amrfinder -u - name: make test run: make test - - name: make gitnub_binaries + - name: test for no-overwrite upload (PD-3469 / https://github.com/ncbi/amr/issues/16) + run: ./amrfinder -u | fgrep 'Skipping update, use amrfinder --force_update to overwrite the existing database' + - name: make github_binaries run: make github_binaries - uses: actions/upload-artifact@v2 with: From 040b098cdf21b5ad81d1bbe8991666116c4328f6 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Fri, 25 Sep 2020 15:52:04 -0400 Subject: [PATCH 29/36] PD-3547 identification of frameshifts is disabpled --- amr_report.cpp | 10 ++++++---- amrfinder.cpp | 1 + version.txt | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 2c374c4..cf73459 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -771,7 +771,7 @@ struct BlastAlignment : Alignment ) return false; } - if (frameShift) + if (frameShift) // ?? return true; // PD-1032 if (partial ()) @@ -1321,6 +1321,7 @@ struct Batch // Output: goodBlastAls { // BlastAlignment::frameShift + #if 0 // PD-3547 for (auto& it : blastAls) { for (const BlastAlignment* &blastAl1 : it. second) @@ -1350,6 +1351,7 @@ struct Batch } it. second. filterValue ([] (const BlastAlignment* blastAl) { return ! blastAl; }); } + #endif // BlastAlignment::good() for (auto& it : blastAls) @@ -1778,7 +1780,7 @@ struct ThisApplication : Application // Output addKey ("out", "Identifiers of the reported input proteins"); addFlag ("print_fam", "Print the FAM.id instead of gene symbol"); - addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP or FRAME_SHIFT"); + addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP"); // or FRAME_SHIFT addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist"); addFlag ("non_reportable", "Report non-reportable families"); addFlag ("core", "Report only core reportale families"); @@ -1957,9 +1959,9 @@ struct ThisApplication : Application const BlastAlignment* bestBlastAl = nullptr; // PD-3475 if (const VectorOwn* blastAls_ = findPtr (batch. blastAls, hmmAl->sseqid)) { - size_t nident = 0; + size_t nident_max = 0; for (const BlastAlignment* blastAl : *blastAls_) - if (maximize (nident, blastAl->nident)) + if (maximize (nident_max, blastAl->nident)) bestBlastAl = blastAl; } auto al = new BlastAlignment (*hmmAl, bestBlastAl); diff --git a/amrfinder.cpp b/amrfinder.cpp index 6bc80e7..8ce4fef 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabpled * POINTX method with more SNPs is preferred over POINTP method * 3.8.24 09/21/2020 PD-3536 --pointmut_all reports all SNPs in a reference gene repetition * 3.8.23 09/16/2020 PD-3536 simplifying point mutations preference diff --git a/version.txt b/version.txt index 2bc5cfc..965749c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.24 +3.8.25 From 596895ca00b8677ae7018812b01ec90106a47d9e Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Fri, 25 Sep 2020 15:56:44 -0400 Subject: [PATCH 30/36] Fix bug in database update test --- .github/workflows/ccpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 00a1ae7..e1b4a8f 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -25,7 +25,7 @@ jobs: - name: make test run: make test - name: test for no-overwrite upload (PD-3469 / https://github.com/ncbi/amr/issues/16) - run: ./amrfinder -u | fgrep 'Skipping update, use amrfinder --force_update to overwrite the existing database' + run: ./amrfinder -u 2>&1 | fgrep 'Skipping update, use amrfinder --force_update to overwrite the existing database' - name: make github_binaries run: make github_binaries - uses: actions/upload-artifact@v2 From f0a9b32cc6f4a14d370c24382e0253f16c5f9e17 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Fri, 25 Sep 2020 17:07:19 -0400 Subject: [PATCH 31/36] PD-2381 proteins with non-standard start codons that are extended in the N-terminal direction are EXACTP --- amr_report.cpp | 5 ++--- amrfinder.cpp | 3 ++- version.txt | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index cf73459..35688d6 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -410,10 +410,9 @@ struct BlastAlignment : Alignment //frameShift = contains (targetSeq, "/"); // Needs "blastall -p blastx ... " IMPLY (! targetProt, (targetEnd - targetStart) % 3 == 0); // redundant ?? - // For BLASTX // PD-1280 - if ( ! targetProt - && refStart == 0 + if ( /*! targetProt // PD-2381 + &&*/ refStart == 0 && charInSet (targetSeq [0], "LIV") && nident < targetAlign_aa ) diff --git a/amrfinder.cpp b/amrfinder.cpp index 8ce4fef..02c0f30 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,7 +33,8 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: -* 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabpled +* 3.8.26 09/25/2020 PD-2381 proteins with non-standard start codons that are extended in the N-terminal direction are EXACTP +* 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabled * POINTX method with more SNPs is preferred over POINTP method * 3.8.24 09/21/2020 PD-3536 --pointmut_all reports all SNPs in a reference gene repetition * 3.8.23 09/16/2020 PD-3536 simplifying point mutations preference diff --git a/version.txt b/version.txt index 965749c..cf387a7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.25 +3.8.26 From 0f3086bef22d907964a898dbd5e6930b081eab30 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Mon, 28 Sep 2020 12:17:58 -0400 Subject: [PATCH 32/36] PD-3281 non-standard start codons are not changed in fusion proteins --- amr_report.cpp | 9 +++++---- amrfinder.cpp | 1 + version.txt | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index 35688d6..af76b63 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -53,7 +53,8 @@ namespace { -constexpr static double frac_delta = 1e-5; // PAR +constexpr double frac_delta = 1e-5; // PAR +constexpr size_t domain_min = 20; // aa // PAR constexpr double ident_min_def = 0.9; @@ -411,8 +412,8 @@ struct BlastAlignment : Alignment IMPLY (! targetProt, (targetEnd - targetStart) % 3 == 0); // redundant ?? // PD-1280 - if ( /*! targetProt // PD-2381 - &&*/ refStart == 0 + if ( (! targetProt || targetStart < domain_min) // PD-2381 + && refStart == 0 && charInSet (targetSeq [0], "LIV") && nident < targetAlign_aa ) @@ -785,7 +786,7 @@ struct BlastAlignment : Alignment } private: size_t mismatchTailTarget () const - { return (mismatchTail_aa + (frameShift ? 20 : 0)) * (targetProt ? 1 : 3); } + { return (mismatchTail_aa + (frameShift ? domain_min : 0)) * (targetProt ? 1 : 3); } bool insideEq (const BlastAlignment &other) const { ASSERT (targetProt == other. targetProt); return targetStrand == other. targetStrand diff --git a/amrfinder.cpp b/amrfinder.cpp index 02c0f30..2888355 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,7 @@ * awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq * * Release changes: +* 3.8.27 09/28/2020 PD-2381 non-standard start codons are not changed in fusion proteins * 3.8.26 09/25/2020 PD-2381 proteins with non-standard start codons that are extended in the N-terminal direction are EXACTP * 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabled * POINTX method with more SNPs is preferred over POINTP method diff --git a/version.txt b/version.txt index cf387a7..ca84d17 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.26 +3.8.27 From af84fc95d703f0bf0563ae9f4ecdea81a1c959d8 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Mon, 28 Sep 2020 13:23:58 -0400 Subject: [PATCH 33/36] Typo fix --- .github/workflows/ccpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index e1b4a8f..ae6e964 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -24,7 +24,7 @@ jobs: run: ./amrfinder -u - name: make test run: make test - - name: test for no-overwrite upload (PD-3469 / https://github.com/ncbi/amr/issues/16) + - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) run: ./amrfinder -u 2>&1 | fgrep 'Skipping update, use amrfinder --force_update to overwrite the existing database' - name: make github_binaries run: make github_binaries From 7c7d793fcaf6e9fb9bbd351f8f6e3758d0b65294 Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Mon, 28 Sep 2020 13:38:05 -0400 Subject: [PATCH 34/36] Update test data for non-standard start codon --- test_both.expected | 2 +- test_prot.expected | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_both.expected b/test_both.expected index ca7bdd2..c1961a5 100644 --- a/test_both.expected +++ b/test_both.expected @@ -5,7 +5,7 @@ blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamas vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG NA contig04 1261 2391 + blaEC BlaEC family class C beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM BLASTX 377 377 100.00 98.14 377 WP_063610930.1 class C extended-spectrum beta-lactamase EC-15 NA NA NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NA NA -aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA NA contig10 486 1307 + blaOXA class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NA NA blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA diff --git a/test_prot.expected b/test_prot.expected index e090745..ea47025 100644 --- a/test_prot.expected +++ b/test_prot.expected @@ -3,7 +3,7 @@ blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG -aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIALP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA From 3f883d03e102597189e436921dc3b5dac2c2d012 Mon Sep 17 00:00:00 2001 From: Slava Brover Date: Wed, 30 Sep 2020 11:16:07 -0400 Subject: [PATCH 35/36] PD-2407 --test is removed --- amr_report.cpp | 17 ++++++++--------- amrfinder.cpp | 26 +++++++++++++++++++------- fasta_extract.cpp | 1 + version.txt | 2 +- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/amr_report.cpp b/amr_report.cpp index af76b63..5d14b45 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -547,15 +547,14 @@ struct BlastAlignment : Alignment ? famId : getGeneSymbols () ) - << (isMutation () - ? mut - ? seqChange. empty () - ? proteinName + " [WILDTYPE]" - : mut->name - : proteinName + " [UNKNOWN]" - : proteinName - ) - //+ ifS (reportPseudo, ifS (stopCodon, " [STOP_CODON]")) + << (isMutation () + ? mut + ? seqChange. empty () + ? proteinName + " [WILDTYPE]" + : mut->name + : proteinName + " [UNKNOWN]" + : proteinName + ) << (isMutation () || getFam () -> reportable >= 2 ? "core" : "plus"); // PD-2825 // PD-1856 if (isMutation ()) diff --git a/amrfinder.cpp b/amrfinder.cpp index 2888355..8bf1af4 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,11 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail, uniq +* awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail * * Release changes: +* 09/30/2020 PD-2407 option --type is removed +* 3.8.28 09/29/2020 PD-3292 dependence on "uniq" is removed * 3.8.27 09/28/2020 PD-2381 non-standard start codons are not changed in fusion proteins * 3.8.26 09/25/2020 PD-2381 proteins with non-standard start codons that are extended in the N-terminal direction are EXACTP * 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabled @@ -199,7 +201,7 @@ struct Warning : Singleton -const StringVector all_types {"AMR", "STRESS", "VIRULENCE"}; +//const StringVector all_types {"AMR", "STRESS", "VIRULENCE"}; // select id from FAM where [type] = 1 @@ -229,7 +231,7 @@ struct ThisApplication : ShellApplication addFlag ("plus", "Add the plus genes to the report"); // PD-2789 addFlag ("report_common", "Report proteins common to a taxonomy group"); // PD-2756 addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); - addKey ("type", "Limit search to specific element types: " + all_types. toString (",") + ". A comma delimited list, case-insensitive", "", '\0', "TYPE"); + //addKey ("type", "Limit search to specific element types: " + all_types. toString (",") + ". A comma delimited list, case-insensitive", "", '\0', "TYPE"); // "Element type" is a column name in the report addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); //addKey ("hmmer_bin" ?? @@ -336,7 +338,7 @@ struct ThisApplication : ShellApplication const bool add_plus = getFlag ("plus"); const bool report_common = getFlag ("report_common"); const string mutation_all = shellQuote (getArg ("mutation_all")); - const string type = getArg ("type"); + //const string type = getArg ("type"); string blast_bin = getArg ("blast_bin"); const string input_name = shellQuote (getArg ("name")); const string parm = getArg ("parm"); @@ -378,7 +380,8 @@ struct ThisApplication : ShellApplication Warning warning (stderr); stderr << "--mutation_all option used without -O/--organism option. No point mutations will be screened"; } - + + #if 0 StringVector typeVec; if (! type. empty ()) { @@ -394,6 +397,7 @@ struct ThisApplication : ShellApplication typeVec << s; } } + #endif if (! emptyArg (output)) try { OFStream f (unQuote (output)); } @@ -864,6 +868,8 @@ struct ThisApplication : ShellApplication } // Column names are from amr_report.cpp + + #if 0 string typeFilter; if (! typeVec. empty ()) { @@ -871,6 +877,7 @@ struct ThisApplication : ShellApplication for (const string& t : typeVec) typeFilter += " || $" + typeCol + " == \"" + t + "\""; } + #endif // Sorting AMR report // PD-2244, PD-3230 @@ -890,19 +897,24 @@ struct ThisApplication : ShellApplication exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " >> " + tmp + ".amr-out"); + #if 0 if (! typeFilter. empty ()) exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".amr-out > " + tmp + ".amr"); else + #endif exec ("mv " + tmp + ".amr-out " + tmp + ".amr"); // Sorting mutation_all if (! emptyArg (mutation_all)) { - exec ("head -1 " + tmp + ".mutation_all > " + tmp + ".mutation_all-out"); - exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort " + sortS + " | uniq >> " + tmp + ".mutation_all-out"); + exec ("head -1 " + tmp + ".mutation_all > " + tmp + ".mutation_all-out"); + exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort -u | sort " + sortS + " >> " + tmp + ".mutation_all-out"); + // "sort -u | sort " replaces "sort | uniq" + #if 0 if (! typeFilter. empty ()) exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".mutation_all-out > " + mutation_all); else + #endif exec ("mv " + tmp + ".mutation_all-out " + mutation_all); } diff --git a/fasta_extract.cpp b/fasta_extract.cpp index e3464e4..7e1c1a7 100644 --- a/fasta_extract.cpp +++ b/fasta_extract.cpp @@ -46,6 +46,7 @@ namespace struct Segment +// not circular { size_t start {0}; size_t stop {0}; diff --git a/version.txt b/version.txt index ca84d17..2529892 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.27 +3.8.28 From 86cf27edd4b1362dfbf0ccba5d08d8af40699a8e Mon Sep 17 00:00:00 2001 From: Arjun Prasad Date: Thu, 1 Oct 2020 14:14:28 -0400 Subject: [PATCH 36/36] Minor updates to README.md text --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f60b438..41785a6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This software and the accompanying database are designed to find acquired antimicrobial resistance genes and some point mutations in protein or assembled nucleotide sequences. We have also added "plus" stress, head, and biocide resistance as well as some virulence factors and E. coli antigens. ## See [the wiki for documentation](https://github.com/ncbi/amr/wiki) -[Subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database or software updates. +[Please subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database and software updates. ---- # Licenses