diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 1fa535c..ae6e964 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -24,3 +24,12 @@ jobs: run: ./amrfinder -u - name: make test run: make test + - name: test for no-overwrite database update (PD-3469 / https://github.com/ncbi/amr/issues/16) + run: ./amrfinder -u 2>&1 | fgrep 'Skipping update, use amrfinder --force_update to overwrite the existing database' + - name: make github_binaries + run: make github_binaries + - uses: actions/upload-artifact@v2 + with: + name: release-binary + path: amrfinder_binaries_v*.tar.gz + diff --git a/.gitignore b/.gitignore index aa4d7f3..44dc746 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ data/ amr_report amrfinder fasta_check +fasta_extract dna_mutation gff_check amrfinder_update diff --git a/Makefile b/Makefile index 126a6f5..5b722c4 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ COMPILE.cpp= $(CXX) $(CPPFLAGS) $(SVNREV) $(DBDIR) -c .PHONY: all clean install release -BINARIES= amr_report amrfinder amrfinder_update fasta_check fasta2parts gff_check dna_mutation +BINARIES= amr_report amrfinder amrfinder_update fasta_check fasta_extract fasta2parts gff_check dna_mutation all: $(BINARIES) @@ -89,6 +89,11 @@ fasta_checkOBJS=fasta_check.o common.o fasta_check: $(fasta_checkOBJS) $(CXX) -o $@ $(fasta_checkOBJS) +fasta_extract.o: common.hpp common.inc +fasta_extractOBJS=fasta_extract.o common.o +fasta_extract: $(fasta_extractOBJS) + $(CXX) -o $@ $(fasta_extractOBJS) + fasta2parts.o: common.hpp common.inc fasta2partsOBJS=fasta2parts.o common.o fasta2parts: $(fasta2partsOBJS) diff --git a/README.md b/README.md index f60b438..41785a6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ This software and the accompanying database are designed to find acquired antimicrobial resistance genes and some point mutations in protein or assembled nucleotide sequences. We have also added "plus" stress, head, and biocide resistance as well as some virulence factors and E. coli antigens. ## See [the wiki for documentation](https://github.com/ncbi/amr/wiki) -[Subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database or software updates. +[Please subscribe to our announce list](https://www.ncbi.nlm.nih.gov/mailman/listinfo/amrfinder-announce) for announcements of database and software updates. ---- # Licenses diff --git a/alignment.cpp b/alignment.cpp index 87efb16..d807454 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -414,9 +414,9 @@ bool normalizeSeq (const string &seq1, ASSERT (seq1. size () == seq2. size ()); bool changed = false; - size_t start = NO_INDEX; + size_t start = no_index; FFOR (size_t, i, seq1. size ()) - if (start == NO_INDEX) + if (start == no_index) { if (seq2 [i] == '-') start = i; @@ -432,7 +432,7 @@ bool normalizeSeq (const string &seq1, changed = true; } else - start = NO_INDEX; + start = no_index; } ASSERT (seq1. size () == seq2. size ()); @@ -694,13 +694,13 @@ void Alignment::setSeqChanges (const Vector &refMutations, } // SeqChange::mutation - size_t start_ref_prev = NO_INDEX; + size_t start_ref_prev = no_index; for (SeqChange& seqChange : seqChanges) { seqChange. qc (); if (verbose ()) seqChange. saveText (cout); - IMPLY (start_ref_prev != NO_INDEX, start_ref_prev <= seqChange. start_ref); + IMPLY (start_ref_prev != no_index, start_ref_prev <= seqChange. start_ref); while (j < refMutations. size ()) { const Mutation& mut = refMutations [j]; @@ -829,6 +829,55 @@ void Alignment::qc () const +bool Alignment::getFrameShift_right (const Alignment &rightPart, + size_t diff_max) const +{ + ASSERT (! targetProt); + ASSERT (refProt); + ASSERT (rightPart. refProt); + + if (this == & rightPart) + return false; + + if (rightPart. targetProt) + return false; + + if ( targetName != rightPart. targetName + || refName != rightPart. refName + || targetStrand != rightPart. targetStrand + ) + return false; + + if ( refStart >= rightPart. refStart + || refEnd >= rightPart. refEnd + || refEnd + diff_max / 3 < rightPart. refStart + ) + return false; + + if (targetStrand) + { + if ( targetStart >= rightPart. targetStart + || targetEnd >= rightPart. targetEnd + || targetEnd + diff_max < rightPart. targetStart + ) + return false; + } + else + { + if ( targetStart <= rightPart. targetStart + || targetEnd <= rightPart. targetEnd + || targetStart > rightPart. targetEnd + diff_max + ) + return false; + } + + if (targetStart % 3 == rightPart. targetStart % 3) + return false; + + return true; +} + + } // namespace diff --git a/alignment.hpp b/alignment.hpp index 42ee9de..c3730a3 100644 --- a/alignment.hpp +++ b/alignment.hpp @@ -137,6 +137,8 @@ struct SeqChange : Root char prev {'\0'}; const Mutation* mutation {nullptr}; + + const SeqChange* replacement {nullptr}; SeqChange () = default; @@ -182,6 +184,7 @@ struct SeqChange : Root bool finish (const string &refSeq, size_t flankingLen); // Return: good match + // Input: flankingLen: valid if > 0 private: void setSeq (); void setStartStopRef (); @@ -252,6 +255,7 @@ struct Alignment : Root void setSeqChanges (const Vector &refMutations, size_t flankingLen/*, bool allMutationsP*/); + // Input: flankingLen: valid if > 0 public: bool empty () const override { return targetName. empty (); } @@ -288,6 +292,19 @@ struct Alignment : Root && nident == refLen && nident == targetSeq. size (); } + bool getFrameShift (const Alignment &other, + size_t diff_max) const + // Return: success + // Input: diff_max: in bp + // Requires: !targetProt, refProt, rightPart.refProt + { return nident >= other. nident + && ( getFrameShift_right (other, diff_max) + || other. getFrameShift_right (*this, diff_max) + ); + } +private: + bool getFrameShift_right (const Alignment &rightPart, + size_t diff_max) const; }; diff --git a/amr_report.cpp b/amr_report.cpp index b490f66..5d14b45 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -53,7 +53,8 @@ namespace { -constexpr static double frac_delta = 1e-5; // PAR +constexpr double frac_delta = 1e-5; // PAR +constexpr size_t domain_min = 20; // aa // PAR constexpr double ident_min_def = 0.9; @@ -291,11 +292,15 @@ bool cdsExist = false; bool print_fam = false; bool reportPseudo = false; -const string stopCodonS ("[stop]"); -const string frameShiftS ("[frameshift]"); +//const string stopCodonS ("[stop]"); +//const string frameShiftS ("[frameshift]"); unique_ptr mutation_all; +string input_name; + +const string na ("NA"); + struct BlastAlignment : Alignment @@ -309,8 +314,7 @@ struct BlastAlignment : Alignment bool frameShift {false}; // Reference protein -//long gi {0}; - // 0 <=> HMM method + bool fromHmm {false}; string refAccession; // empty() <=> HMM method size_t part {1}; @@ -318,6 +322,7 @@ struct BlastAlignment : Alignment // <= parts size_t parts {1}; // >= 1 + const BlastAlignment* fusion {nullptr}; // Table FAM string famId; string gene; @@ -330,7 +335,6 @@ struct BlastAlignment : Alignment string product; Vector cdss; static constexpr size_t mismatchTail_aa = 10; // PAR - size_t mismatchTailTarget {0}; const HmmAlignment* hmmAl {nullptr}; @@ -369,6 +373,7 @@ struct BlastAlignment : Alignment throw runtime_error (string ("Bad AMRFinder database\n") + e. what ()); } QC_ASSERT (! refAccession. empty ()); + refName = refAccession; replace (product, '_', ' '); @@ -404,22 +409,16 @@ struct BlastAlignment : Alignment if (contains (targetSeq, "*")) stopCodon = true; //frameShift = contains (targetSeq, "/"); // Needs "blastall -p blastx ... " - if (! targetProt && (targetEnd - targetStart) % 3 != 0) - frameShift = true; + IMPLY (! targetProt, (targetEnd - targetStart) % 3 == 0); // redundant ?? - // For BLASTX // PD-1280 - if ( ! targetProt + if ( (! targetProt || targetStart < domain_min) // PD-2381 && refStart == 0 && charInSet (targetSeq [0], "LIV") && nident < targetAlign_aa ) nident++; - mismatchTailTarget = mismatchTail_aa; - if (! targetProt) - mismatchTailTarget *= 3; - if (! targetProt) cdss << move (Locus (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0)); @@ -427,9 +426,11 @@ struct BlastAlignment : Alignment if (const Vector* refMutations = findPtr (accession2mutations, refAccession)) { if (verbose ()) - cout << "Mutation protein found: " << refName << endl; + cout << "Mutation protein found: " << refAccession << endl << line << endl; QC_ASSERT (isMutation ()); - setSeqChanges (*refMutations, 0 /*flankingLen*/ /*, mutation_all. get ()*/); + setSeqChanges (*refMutations, 0); + if (verbose ()) + cout << endl; } } catch (...) @@ -438,29 +439,40 @@ struct BlastAlignment : Alignment throw; } } - explicit BlastAlignment (const HmmAlignment& other) - : famId (other. fam->id) - , gene (other. fam->id) - , product (other. fam->familyName) - , hmmAl (& other) - { targetName = other. sseqid; + BlastAlignment (const HmmAlignment& hmmAl_arg, + const BlastAlignment* best) + : fromHmm (true) + , famId (hmmAl_arg. fam->id) + , gene (hmmAl_arg. fam->id) + , product (hmmAl_arg. fam->familyName) + , hmmAl (& hmmAl_arg) + { ASSERT (hmmAl_arg. good ()); + targetName = hmmAl_arg. sseqid; targetProt = true; alProt = true; if (allele ()) ERROR_MSG (famId + " " + gene); - ASSERT (other. good ()); - } -#if 0 - explicit BlastAlignment (const Mutation& mut) - : famId (mut. gene) - , gene (mut. gene) - , resistance ("mutation") - { targetProt = true; - alProt = true; - seqChanges << SeqChange (this, & mut); + if (best) + { + targetSeq = best->targetSeq; + targetStart = best->targetStart; + targetEnd = best->targetEnd; + targetLen = best->targetLen; + targetStrand = best->targetStrand; + refProt = true; + refName = best->refName; + refSeq = best->refSeq; + refStart = best->refStart; + refEnd = best->refEnd; + refLen = best->refLen; + alProt = true; + nident = best->nident; + targetAlign = best->targetAlign; + targetAlign_aa = best->targetAlign_aa; + refAccession = best->refAccession; + } } -#endif - void qc () const + void qc () const override { if (! qc_on) return; @@ -472,6 +484,7 @@ struct BlastAlignment : Alignment QC_ASSERT (part >= 1); QC_ASSERT (part <= parts); QC_ASSERT (! product. empty ()); + QC_IMPLY (! fromHmm, ! refAccession. empty ()); QC_ASSERT (refAccession. empty () == targetSeq. empty ()); QC_ASSERT (! refAccession. empty () == (bool) refLen); QC_ASSERT (! refAccession. empty () == (bool) nident); @@ -494,9 +507,8 @@ struct BlastAlignment : Alignment #endif QC_IMPLY (! seqChanges. empty (), isMutation ()); } - void saveText (ostream& os) const + void saveText (ostream& os) const override { // PD-736, PD-774, PD-780, PD-799 - const string na ("NA"); const string proteinName (isMutation () ? product /*string ()*/ : refExactlyMatched () || parts >= 2 || refAccession. empty () // PD-3187, PD-3192 @@ -511,37 +523,38 @@ struct BlastAlignment : Alignment if (seqChanges_. empty ()) seqChanges_ << SeqChange (); for (const Locus& cds : cdss_) - for (SeqChange& seqChange : seqChanges_) + for (const SeqChange& seqChange : seqChanges_) { const Mutation* mut = seqChange. mutation; const string method (empty () ? na : getMethod (cds)); - ASSERT (isMutation () == ! (seqChange. empty () && ! mut)); + IMPLY (! verbose (), isMutation () == ! (seqChange. empty () && ! mut)); TabDel td (2, false); + if (! input_name. empty ()) + td << input_name;; td << (targetProt ? nvl(targetName, na) : na); // PD-2534 if (cdsExist) td << (empty () ? na : (cds. contig. empty () ? nvl (targetName,na) : cds. contig)) - << (empty () ? 0 : (cds. contig. empty () ? targetStart : cds. start) + 1) - << (empty () ? 0 : (cds. contig. empty () ? targetEnd : cds. stop)) + << (empty () ? 0 : (cds. contig. empty () ? targetStart : cds. start) + 1) + << (empty () ? 0 : (cds. contig. empty () ? targetEnd : cds. stop)) << (empty () ? na : (cds. contig. empty () ? (targetStrand ? "+" : "-") : (cds. strand ? "+" : "-"))); td << (isMutation () ? mut - ? seqChange. empty () - ? mut->wildtype () - : mut->geneMutation - : gene + "_" + seqChange. getMutationStr () + ? seqChange. empty () + ? mut->wildtype () + : mut->geneMutation + : gene + "_" + seqChange. getMutationStr () : print_fam - ? famId - : (isLeft (method, "ALLELE") ? famId : nvl (getFam () -> genesymbol, na)) + ? famId + : getGeneSymbols () + ) + << (isMutation () + ? mut + ? seqChange. empty () + ? proteinName + " [WILDTYPE]" + : mut->name + : proteinName + " [UNKNOWN]" + : proteinName ) - << (isMutation () - ? mut - ? seqChange. empty () - ? proteinName + " [WILDTYPE]" - : mut->name - : proteinName + " [UNKNOWN]" - : proteinName - ) - + ifS (reportPseudo, ifS (frameShift, " " + frameShiftS)) << (isMutation () || getFam () -> reportable >= 2 ? "core" : "plus"); // PD-2825 // PD-1856 if (isMutation ()) @@ -600,16 +613,18 @@ struct BlastAlignment : Alignment } } if ( ! isMutation () - || (! seqChange. empty () && mut) // resistant mutation + || (! seqChange. empty () && mut && ! seqChange. replacement) // resistant mutation ) { if (verbose ()) - os << refExactlyMatched () - << '\t' << allele () - << '\t' << alleleReported () - << '\t' << targetProt - << '\t' << nident - << '\t' << refMutation + os << refExactlyMatched () // 1 + << '\t' << allele () // 2 + << '\t' << alleleReported () // 3 + << '\t' << targetProt // 4 + << '\t' << nident // 5 + << '\t' << refMutation // 6 + << '\t' << stopCodon // 7 + << '\t' << frameShift // 8 << '\t'; os << td. str () << endl; } @@ -624,7 +639,9 @@ struct BlastAlignment : Alignment Set getMutationSymbols () const { Set mutationSymbols; for (const SeqChange& seqChange : seqChanges) - if (seqChange. mutation) + if ( ! seqChange. empty () + && seqChange. mutation + ) mutationSymbols << seqChange. mutation->geneMutation; return mutationSymbols; } @@ -655,7 +672,7 @@ struct BlastAlignment : Alignment ); } bool truncated (const Locus &cds) const - { return (missedDnaStart (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStart ()) : targetStart <= Locus::end_delta)) + { return (missedDnaStart (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStart ()) : targetStart <= Locus::end_delta)) || (missedDnaStop (cds) > 0 && (targetProt ? (cds. empty () ? false : cds. atContigStop ()) : targetLen - targetEnd <= Locus::end_delta)); } bool truncatedCds () const @@ -666,32 +683,69 @@ struct BlastAlignment : Alignment } bool alleleReported () const { return refExactlyMatched () && allele () && (! targetProt || refLen == targetLen); } + string getGeneSymbol () const + { return alleleReported () /*isLeft (method, "ALLELE")*/ + ? famId + : nvl (getFam () -> genesymbol, na); + } + string getGeneSymbols () const + { if (fusion) + { + string s1 ( getGeneSymbol ()); + string s2 (fusion->getGeneSymbol ()); + if (part == 2) + swap (s1, s2); + return s1 + "/" + s2; + } + return getGeneSymbol (); + } string getMethod (const Locus &cds) const { //IMPLY (refExactlyMatched () && ! mutation_all. get (), ! isMutation ()) - string method (refExactlyMatched () - ? alleleReported () - ? "ALLELE" - : "EXACT" // PD-776 - : refAccession. empty () - ? "HMM" - : partial () - ? truncated (cds) - ? "PARTIAL_CONTIG_END" // PD-2267 - : "PARTIAL" - : isMutation () - ? "POINT" - : "BLAST" + string method (fromHmm + ? "HMM" + : refExactlyMatched () + ? alleleReported () + ? "ALLELE" + : "EXACT" // PD-776 + #if 0 + : partial () + ? truncated (cds) + ? "PARTIAL_CONTIG_END" // PD-2267 + : "PARTIAL" + : isMutation () + ? "POINT" + : "BLAST" + #else + : isMutation () + ? "POINT" + : partial () + ? truncated (cds) + ? "PARTIAL_CONTIG_END" // PD-2267 + : "PARTIAL" + : "BLAST" + #endif ); // PD-2088, PD-2320 + bool suffix = true; if ( ( method == "BLAST" || method == "PARTIAL" || method == "PARTIAL_CONTIG_END" ) - && stopCodon - && ! targetProt + //&& ! targetProt ) - method = "INTERNAL_STOP"; - else if (method != "HMM") + { + if (stopCodon) + { + method = "INTERNAL_STOP"; + suffix = false; + } + else if (frameShift) + { + method = "FRAME_SHIFT"; + suffix = false; + } + } + if (suffix && method != "HMM") method += (targetProt ? "P" : "X"); return method; } @@ -716,6 +770,8 @@ struct BlastAlignment : Alignment ) return false; } + if (frameShift) // ?? + return true; // PD-1032 if (partial ()) if ( parts > 1 @@ -728,21 +784,13 @@ struct BlastAlignment : Alignment return passBlastRule (completeBR); } private: -#if 0 - bool sameTarget (const BlastAlignment &other) const - // Symmetric - { ASSERT (targetProt == other. targetProt); - return targetStrand == other. targetStrand - && difference (targetStart, other. targetStart) <= mismatchTailTarget - && difference (targetEnd, other. targetEnd) <= mismatchTailTarget; - } - // Requires: same targetName -#endif + size_t mismatchTailTarget () const + { return (mismatchTail_aa + (frameShift ? domain_min : 0)) * (targetProt ? 1 : 3); } bool insideEq (const BlastAlignment &other) const { ASSERT (targetProt == other. targetProt); - return targetStrand == other. targetStrand - && targetStart + mismatchTailTarget >= other. targetStart - && targetEnd <= other. targetEnd + mismatchTailTarget; + return targetStrand == other. targetStrand + && targetStart + mismatchTailTarget () >= other. targetStart + && targetEnd <= other. targetEnd + mismatchTailTarget (); } // Requires: same targetName bool matchesCds (const BlastAlignment &other) const @@ -802,6 +850,30 @@ struct BlastAlignment : Alignment { if (targetName != other. targetName) return false; + } + else + { // PD-1902, PD-2139, PD-2313, PD-2320 + if (targetProt && ! matchesCds (other)) + return false; + if (! targetProt && ! other. matchesCds (*this)) + return false; + } + #if 0 + // Moved below + if (isMutation () && other. isMutation ()) + { + const Set mutationSymbols ( getMutationSymbols ()); + const Set otherMutationSymbols (other. getMutationSymbols ()); + if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) + return targetProt; + if ( mutationSymbols. containsAll (otherMutationSymbols) + && ! otherMutationSymbols. containsAll (mutationSymbols) + ) + return true; + } + #endif + if (targetProt == other. targetProt) + { // PD-807 if ( ! (targetProt && famId == other. famId) // PD-2441 //&& ! sameTarget (other) @@ -816,32 +888,29 @@ struct BlastAlignment : Alignment LESS_PART (*this, other, refEffectiveLen ()); } else - { // PD-1902, PD-2139, PD-2313, PD-2320 - if (targetProt && ! matchesCds (other)) - return false; - if (! targetProt && ! other. matchesCds (*this)) - return false; - if (isMutation ()) - { - if (! other. isMutation ()) - return true; - const Set mutationSymbols ( getMutationSymbols ()); - const Set otherMutationSymbols (other. getMutationSymbols ()); - if (mutationSymbols == otherMutationSymbols) - return targetProt; - if (mutationSymbols. containsAll (otherMutationSymbols)) - return true; - if (otherMutationSymbols. containsAll (mutationSymbols)) - return false; - } - else - { - LESS_PART (other, *this, refExactlyMatched ()); - //LESS_PART (other, *this, allele ()); // PD-2352 - LESS_PART (other, *this, alleleReported ()); - LESS_PART (other, *this, targetProt); - } + { + LESS_PART (other, *this, refExactlyMatched ()); + //LESS_PART (other, *this, allele ()); // PD-2352 + LESS_PART (other, *this, alleleReported ()); + //LESS_PART (*this, other, partial ()); + //LESS_PART (other, *this, targetProt); // moved below } + if (isMutation () && other. isMutation ()) + { + const Set mutationSymbols ( getMutationSymbols ()); + const Set otherMutationSymbols (other. getMutationSymbols ()); + if (mutationSymbols == otherMutationSymbols && targetProt != other. targetProt) + return targetProt; + if ( mutationSymbols. containsAll (otherMutationSymbols) + && ! otherMutationSymbols. containsAll (mutationSymbols) + ) + return true; + if ( otherMutationSymbols. containsAll (mutationSymbols) + && ! mutationSymbols. containsAll (otherMutationSymbols) + ) + return false; + } + LESS_PART (other, *this, targetProt); return true; } public: @@ -878,9 +947,22 @@ struct BlastAlignment : Alignment ; } size_t getCdsStart () const - { return cdss. empty () - ? 0 - : cdss. front (). start; + { if (cdss. empty ()) + return 0; + size_t start = numeric_limits::max (); + for (const Locus& cds : cdss) + minimize (start, cds. start); + ASSERT (start != numeric_limits::max ()); + return start; + } + size_t getCdsStop () const + { if (cdss. empty ()) + return 0; + size_t stop = 0; + for (const Locus& cds : cdss) + maximize (stop, cds. stop); + ASSERT (stop); + return stop; } void setTargetAlign () { targetAlign = targetEnd - targetStart; @@ -925,16 +1007,30 @@ struct BlastAlignment : Alignment } static bool less (const BlastAlignment* a, const BlastAlignment* b) - { ASSERT (a); - ASSERT (b); - LESS_PART (*a, *b, targetName); - LESS_PART (*a, *b, targetStart); - LESS_PART (*a, *b, getCdsStart ()); - LESS_PART (*a, *b, famId); - LESS_PART (*a, *b, refAccession); - LESS_PART (*b, *a, pIdentity ()); - return false; - } + { ASSERT (a); + ASSERT (b); + LESS_PART (*a, *b, targetName); + LESS_PART (*a, *b, targetStart); + LESS_PART (*a, *b, targetEnd); + LESS_PART (*a, *b, getCdsStart ()); + LESS_PART (*a, *b, getCdsStop ()); + LESS_PART (*a, *b, refAccession); + LESS_PART (*a, *b, part); + //LESS_PART (*a, *b, famId); + //LESS_PART (*b, *a, pIdentity ()); + return false; + } + bool sameMatch (const BlastAlignment* other) const + // Cf. less() + { ASSERT (other); + ASSERT (targetName == other->targetName); + return /*targetName == other->targetName + &&*/ targetStart == other->targetStart + && targetEnd == other->targetEnd + && getCdsStart () == other->getCdsStart () + && getCdsStop () == other->getCdsStop () + && refAccession == other->refAccession; + } }; @@ -957,6 +1053,29 @@ bool HmmAlignment::better (const BlastAlignment& other) const +typedef map> BlastAlignmentsOwn; + // first = second::targetName +size_t getSize (const BlastAlignmentsOwn &ba) +{ + size_t n = 0; + for (const auto& it : ba) + n += it. second. size (); + return n; +} + + +typedef map> BlastAlignments; + // first = second::targetName +size_t getSize (const BlastAlignments &ba) +{ + size_t n = 0; + for (const auto& it : ba) + n += it. second. size (); + return n; +} + + + // Batch struct Batch @@ -967,13 +1086,13 @@ struct Batch StringVector suppress_prots; // Target input - VectorOwn blastAls; + BlastAlignmentsOwn blastAls; VectorOwn hmmAls; bool hmmExist {false}; map domains; // Best domain // Output - VectorPtr goodBlastAls; + BlastAlignments goodBlastAls; Batch (const string &famFName, @@ -1145,32 +1264,53 @@ struct Batch void blastParetoBetter () - // BLAST: Pareto-better() // Input: blastAls // Output: goodBlastAls { ASSERT (goodBlastAls. empty ()); - for (const BlastAlignment* blastAl : blastAls) - { - ASSERT (blastAl); - ASSERT (blastAl->good ()); - bool found = false; - for (const BlastAlignment* goodBlastAl : goodBlastAls) - if (goodBlastAl->better (*blastAl)) - { - found = true; - break; - } - if (found) - continue; - for (Iter> goodIter (goodBlastAls); goodIter. next ();) - if (blastAl->better (**goodIter)) - goodIter. erase (); - goodBlastAls << blastAl; - } + for (const auto& it1 : blastAls) + for (const BlastAlignment* blastAl : it1. second) + { + ASSERT (blastAl); + ASSERT (blastAl->good ()); + bool found = false; + for (const auto& it2 : goodBlastAls) // efficiency ?? + { + for (const BlastAlignment* goodBlastAl : it2. second) + if (goodBlastAl->better (*blastAl)) + { + found = true; + break; + } + if (found) + break; + } + if (found) + continue; + for (auto& it2 : goodBlastAls) // efficiency ?? + for (Iter> goodIter (it2. second); goodIter. next ();) + if (blastAl->better (**goodIter)) + goodIter. erase (); + ASSERT (! blastAl->targetName. empty ()); + goodBlastAls [blastAl->targetName] << blastAl; + } if (verbose ()) - cout << "# Best Blasts: " << goodBlastAls. size () << endl; + cout << "# Best Blasts: " << getSize (goodBlastAls) << endl; } + + + void setStopCodon (BlastAlignment &blastAlP) + { ASSERT (blastAlP. targetProt); + for (const Locus& cds : blastAlP. cdss) + if (const VectorOwn* blastAlsX = findPtr (blastAls, cds. contig)) + for (const BlastAlignment* blastAlX : *blastAlsX) + if ( ! blastAlX->targetProt + && blastAlX->stopCodon + && blastAlP. better (*blastAlX) + ) + blastAlP. stopCodon = true; + } + public: @@ -1179,40 +1319,108 @@ struct Batch // Input: blastAls, domains, hmmAls // Output: goodBlastAls { - // Use BlastAlignment.cdss - for (Iter> iter (blastAls); iter. next ();) - if (! (*iter)->good ()) - delete iter. erase (); - - // Group by targetName and process each targetName separately for speed ?? + // BlastAlignment::frameShift + #if 0 // PD-3547 + for (auto& it : blastAls) + { + for (const BlastAlignment* &blastAl1 : it. second) + if (! blastAl1->targetProt) + { + bool found = false; + for (const BlastAlignment* blastAl2 : it. second) + { + if ( blastAl2 + && ! blastAl2->targetProt + && blastAl2->getFrameShift (*blastAl1, 60) // PAR + ) + { + ASSERT (blastAl1 != blastAl2); + var_cast (blastAl2) -> frameShift = true; + if (verbose (-1)) + blastAl2->saveText (cout); + found = true; + } + } + if (found) + { + const BlastAlignment* blastAl_ = blastAl1; + delete blastAl_; + blastAl1 = nullptr; + } + } + it. second. filterValue ([] (const BlastAlignment* blastAl) { return ! blastAl; }); + } + #endif + // BlastAlignment::good() + for (auto& it : blastAls) + for (Iter> iter (it. second); iter. next ();) + if (! (*iter)->good ()) + delete iter. erase (); + if (verbose ()) + { + cout << "# Good Blasts: " << getSize (blastAls) << endl; + if (verbose (-1)) + for (const auto& it : blastAls) + for (const BlastAlignment* blastAl : it. second) + blastAl->saveText (cout); + } + + // PD-2322 + for (const auto& it : blastAls) + for (const BlastAlignment* blastAlP : it. second) + if (blastAlP->targetProt) + setStopCodon (* var_cast (blastAlP)); + for (const HmmAlignment* hmmAl : hmmAls) + { + const BlastAlignment* blastAl = hmmAl->blastAl. get (); + ASSERT (blastAl); + setStopCodon (* var_cast (blastAl)); + } + if (verbose (-1)) + { + cout << "After setStopCodon():" << endl; + for (const auto& it : blastAls) + for (const BlastAlignment* blastAl : it. second) + blastAl->saveText (cout); + } + if (retainBlasts) - goodBlastAls = blastAls; + { + ASSERT (goodBlastAls. empty ()); + for (const auto& it : blastAls) + goodBlastAls [it. first] = it. second; + } else blastParetoBetter (); // Cf. dna_mutation.cpp - for (const BlastAlignment* blastAl1 : goodBlastAls) - for (const SeqChange& seqChange1 : blastAl1->seqChanges) - { - ASSERT (seqChange1. al == blastAl1); - //ASSERT (seqChange1. mutation); - for (const BlastAlignment* blastAl2 : goodBlastAls) - if ( blastAl2->targetName == blastAl1->targetName - && blastAl2->targetStrand == blastAl1->targetStrand - && blastAl2 != blastAl1 - ) - for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) - { - SeqChange& seqChange2 = *iter; - //ASSERT (seqChange2. mutation); - ASSERT (seqChange2. al == blastAl2); - if ( seqChange1. start_target == seqChange2. start_target - && seqChange1. better (seqChange2) - ) - iter. erase (); - } - } + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl1 : it. second) + for (const SeqChange& seqChange1 : blastAl1->seqChanges) + { + ASSERT (seqChange1. al == blastAl1); + //ASSERT (seqChange1. mutation); + for (const BlastAlignment* blastAl2 : it. second) + { + ASSERT (blastAl2->targetName == blastAl1->targetName); + if ( blastAl2->targetStrand == blastAl1->targetStrand + && blastAl2 != blastAl1 + ) + //for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + for (SeqChange& seqChange2 : var_cast (blastAl2) -> seqChanges) + { + //SeqChange& seqChange2 = *iter; + //ASSERT (seqChange2. mutation); + ASSERT (seqChange2. al == blastAl2); + if ( seqChange1. start_target == seqChange2. start_target + && seqChange1. better (seqChange2) + ) + //iter. erase (); + seqChange2. replacement = & seqChange1; + } + } + } // HMM: Pareto-better() VectorPtr goodHmmAls; goodHmmAls. reserve (hmmAls. size ()); @@ -1257,63 +1465,74 @@ struct Batch // PD-741 if (hmmExist && ! skip_hmm_check) - for (Iter> iter (goodBlastAls); iter. next ();) - if ( /*! iter->refExactlyMatched () */ - ! (*iter) -> isMutation () - && (*iter) -> targetProt - && (*iter) -> pIdentity () < 0.98 - frac_delta // PAR // PD-1673 - && ! (*iter) -> partial () - ) - if (const Fam* fam = (*iter) -> getFam () -> getHmmFam ()) - { - bool found = false; - for (const HmmAlignment* hmmAl : goodHmmAls) - if ( (*iter) -> targetName == hmmAl->sseqid - && fam == hmmAl->fam - ) - { - found = true; - break; - } - if (! found) // BLAST is wrong - { - if (verbose ()) - { - cout << "HMM-wrong: "; - (*iter) -> saveText (cout); - } - iter. erase (); - } - } + for (auto& it : goodBlastAls) + for (Iter> iter (it. second); iter. next ();) + if ( /*! iter->refExactlyMatched () */ + ! (*iter) -> isMutation () + && (*iter) -> targetProt + && (*iter) -> pIdentity () < 0.98 - frac_delta // PAR // PD-1673 + && ! (*iter) -> partial () + ) + if (const Fam* fam = (*iter) -> getFam () -> getHmmFam ()) + { + bool found = false; + for (const HmmAlignment* hmmAl : goodHmmAls) + if ( (*iter) -> targetName == hmmAl->sseqid + && fam == hmmAl->fam + ) + { + found = true; + break; + } + if (! found) // BLAST is wrong + { + if (verbose ()) + { + cout << "HMM-wrong: "; + (*iter) -> saveText (cout); + } + iter. erase (); + } + } if (verbose ()) - cout << "# Best Blasts left: " << goodBlastAls. size () << endl; + cout << "# Best Blasts left: " << getSize (goodBlastAls) << endl; for (Iter> hmmIt (goodHmmAls); hmmIt. next ();) - for (const BlastAlignment* blastAl : goodBlastAls) - if (blastAl->better (**hmmIt)) - { - ASSERT (! blastAl->hmmAl); - var_cast (blastAl) -> hmmAl = *hmmIt; - hmmIt. erase (); - break; - } + for (const auto& it : goodBlastAls) + { + bool found = false; + for (const BlastAlignment* blastAl : it. second) + if (blastAl->better (**hmmIt)) + { + ASSERT (! blastAl->hmmAl); + var_cast (blastAl) -> hmmAl = *hmmIt; + hmmIt. erase (); + found = true; + break; + } + if (found) + break; + } // PD-2783 - for (Iter> blastIt (goodBlastAls); blastIt. next ();) - for (const HmmAlignment* hmmAl : goodHmmAls) - if (hmmAl->better (**blastIt)) - { - blastIt. erase (); - break; - } + for (auto& it : goodBlastAls) + for (Iter> blastIt (it. second); blastIt. next ();) + for (const HmmAlignment* hmmAl : goodHmmAls) + if (hmmAl->better (**blastIt)) + { + blastIt. erase (); + break; + } // Output // goodHmmAls --> goodBlastAls for (const HmmAlignment* hmmAl : goodHmmAls) { - ASSERT (hmmAl->blastAl. get ()); - goodBlastAls << hmmAl->blastAl. get (); + const BlastAlignment* blastAl = hmmAl->blastAl. get (); + ASSERT (blastAl); + ASSERT (! blastAl->targetName. empty ()); + goodBlastAls [blastAl->targetName] << blastAl; } #if 0 @@ -1336,16 +1555,42 @@ struct Batch } #endif - goodBlastAls. sort (BlastAlignment::less); + // PD-2394 + // BlastAlignment::fusion + for (auto& it : goodBlastAls) + { + auto& goodBlastAls_ = it. second; + goodBlastAls_. sort (BlastAlignment::less); + { + const BlastAlignment* prev = nullptr; + for (const BlastAlignment* blastAl : goodBlastAls_) + { + if ( prev + && prev->sameMatch (blastAl) + && prev ->part == 1 + && blastAl->part == 2 + ) + { + QC_ASSERT (blastAl->parts == 2); + var_cast (prev) -> fusion = blastAl; + var_cast (blastAl) -> fusion = prev; + } + prev = blastAl; + } + } + } if (verbose ()) { cout << endl << "After process():" << endl; - for (const BlastAlignment* blastAl : goodBlastAls) - { - blastAl->saveText (cout); - cout << "# Mutations: " << blastAl->seqChanges. size () << endl; - } + if (mutation_all. get ()) + *mutation_all << endl << "After process():" << endl; + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + { + blastAl->saveText (cout); + cout << "# Mutations: " << blastAl->seqChanges. size () << endl; + } } } @@ -1357,6 +1602,8 @@ struct Batch { // Cf. BlastAlignment::saveText() TabDel td; + if (! input_name. empty ()) + td << "Name"; td << "Protein identifier"; // 1 // targetName // PD-2534 if (cdsExist) // Contig @@ -1364,7 +1611,7 @@ struct Batch << "Start" // targetStart // 3 << "Stop" // targetEnd // 4 << "Strand"; // targetStrand // 5 - td << (print_fam ? "FAM.id" : "Gene symbol") // 6 or 2 + td << "Gene symbol" // 6 or 2 << "Sequence name" // 7 or 3 << "Scope" // PD-2825 // 8 or 4 // PD-1856 @@ -1394,25 +1641,26 @@ struct Batch *mutation_all << td. str () << endl; } - for (const BlastAlignment* blastAl : goodBlastAls) - { - ASSERT (blastAl); - if (blastAl->isMutation ()) - if (blastAl->seqChanges. empty ()) - ; - else - { + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + { + ASSERT (blastAl); + if (blastAl->isMutation ()) + if (blastAl->seqChanges. empty ()) + ; + else + { + blastAl->saveText (os); + blastAl->qc (); + } + else if ( blastAl->getFam () -> reportable >= reportable_min + && ! suppress_prots. containsFast (blastAl->refAccession) + ) + { blastAl->saveText (os); blastAl->qc (); } - else if ( blastAl->getFam () -> reportable >= reportable_min - && ! suppress_prots. containsFast (blastAl->refAccession) - ) - { - blastAl->saveText (os); - blastAl->qc (); - } - } + } } @@ -1420,12 +1668,13 @@ struct Batch void printTargetIds (ostream &os) const { ASSERT (os. good ()); - for (const BlastAlignment* blastAl : goodBlastAls) - if ( blastAl->targetProt - && ! blastAl->isMutation () - && blastAl->getFam () -> reportable >= reportable_min - ) - os << blastAl->targetName << endl; + for (const auto& it : goodBlastAls) + for (const BlastAlignment* blastAl : it. second) + if ( blastAl->targetProt + && ! blastAl->isMutation () + && blastAl->getFam () -> reportable >= reportable_min + ) + os << blastAl->targetName << endl; } }; @@ -1530,10 +1779,11 @@ struct ThisApplication : Application // Output addKey ("out", "Identifiers of the reported input proteins"); addFlag ("print_fam", "Print the FAM.id instead of gene symbol"); - addFlag ("pseudo", "Indicate pseudo-genes in the protein name as " + strQuote (stopCodonS) + " or " + strQuote (frameShiftS)); + addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP"); // or FRAME_SHIFT addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist"); addFlag ("non_reportable", "Report non-reportable families"); addFlag ("core", "Report only core reportale families"); + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report"); // Testing addFlag ("nosame", "Exclude the same reference protein accessions from the BLAST output (for testing)"); addFlag ("noblast", "Exclude the BLAST output (for testing)"); @@ -1572,6 +1822,7 @@ struct ThisApplication : Application const bool force_cds_report = getFlag ("force_cds_report"); const bool non_reportable = getFlag ("non_reportable"); const bool report_core_only = getFlag ("core"); + input_name = getArg ("name"); const bool nosame = getFlag ("nosame"); const bool noblast = getFlag ("noblast"); const bool nohmm = getFlag ("nohmm"); @@ -1636,13 +1887,8 @@ struct ThisApplication : Application al->qc (); if (nosame && al->refAccession == al->targetName) continue; - if (al->good ()) - batch. blastAls << al; - else - { - ASSERT (! al->refExactlyMatched ()); - delete al; - } + ASSERT (! al->targetName. empty ()); + batch. blastAls [al->targetName] << al; } } @@ -1660,18 +1906,13 @@ struct ThisApplication : Application al->qc (); if (nosame && al->refAccession == al->targetName) continue; - if (al->good ()) - batch. blastAls << al; - else - { - ASSERT (! al->refExactlyMatched ()); - delete al; - } + ASSERT (! al->targetName. empty ()); + batch. blastAls [al->targetName] << al; } } } if (verbose ()) - cout << "# Good Blasts: " << batch. blastAls. size () << endl; + cout << "# Blasts: " << getSize (batch. blastAls) << endl; if (! nohmm) @@ -1714,22 +1955,33 @@ struct ThisApplication : Application delete hmmAl; continue; } - auto al = new BlastAlignment (*hmmAl); + const BlastAlignment* bestBlastAl = nullptr; // PD-3475 + if (const VectorOwn* blastAls_ = findPtr (batch. blastAls, hmmAl->sseqid)) + { + size_t nident_max = 0; + for (const BlastAlignment* blastAl : *blastAls_) + if (maximize (nident_max, blastAl->nident)) + bestBlastAl = blastAl; + } + auto al = new BlastAlignment (*hmmAl, bestBlastAl); hmmAl->blastAl. reset (al); if (verbose ()) cout << al->targetName << " " << al->gene << endl; const HmmAlignment::Domain domain = batch. domains [HmmAlignment::Pair (al->targetName, al->gene)]; if (! domain. hmmLen) continue; // domain does not exist - /*al->refLen = domain. hmmLen; - al->refStart = domain. hmmStart; - al->refEnd = domain. hmmStop; */ - al->targetLen = domain. seqLen; - al->targetStart = domain. seqStart; - al->targetEnd = domain. seqStop; - al->setTargetAlign (); - ASSERT (! al->refExactlyMatched ()); - ASSERT (! al->partial ()); + if (! bestBlastAl) + { + /*al->refLen = domain. hmmLen; + al->refStart = domain. hmmStart; + al->refEnd = domain. hmmStop; */ + al->targetLen = domain. seqLen; + al->targetStart = domain. seqStart; + al->targetEnd = domain. seqStop; + al->setTargetAlign (); + //ASSERT (! al->refExactlyMatched ()); + //ASSERT (! al->partial ()); + } al->qc (); batch. hmmAls << hmmAl; } @@ -1793,9 +2045,10 @@ struct ThisApplication : Application var_cast (locus). contigLen = contig2len [locus.contig]; } } - for (const BlastAlignment* al : batch. blastAls) - if (al->targetProt) - var_cast (al) -> setCdss (seqId2locusTag, * annot. get ()); + for (const auto& it : batch. blastAls) + for (const BlastAlignment* al : it. second) + if (al->targetProt) + var_cast (al) -> setCdss (seqId2locusTag, * annot. get ()); for (const HmmAlignment* hmmAl : batch. hmmAls) var_cast (hmmAl->blastAl. get ()) -> setCdss (seqId2locusTag, * annot. get ());; } diff --git a/amrfinder.cpp b/amrfinder.cpp index bf7e181..8bf1af4 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -30,9 +30,37 @@ * AMRFinder * * Dependencies: NCBI BLAST, HMMer -* cat, cp, cut, grep, head, mkdir, mv, nproc, sort, tail, which +* awk, cat, cp, cut, head, ln, mkdir, mv, sort, tail * * Release changes: +* 09/30/2020 PD-2407 option --type is removed +* 3.8.28 09/29/2020 PD-3292 dependence on "uniq" is removed +* 3.8.27 09/28/2020 PD-2381 non-standard start codons are not changed in fusion proteins +* 3.8.26 09/25/2020 PD-2381 proteins with non-standard start codons that are extended in the N-terminal direction are EXACTP +* 3.8.25 09/25/2020 PD-3547 identification of frameshifts is disabled +* POINTX method with more SNPs is preferred over POINTP method +* 3.8.24 09/21/2020 PD-3536 --pointmut_all reports all SNPs in a reference gene repetition +* 3.8.23 09/16/2020 PD-3536 simplifying point mutations preference +* 3.8.22 09/15/2020 PD-3470 frameshift detection bug; preference of point mutation reference proteins +* 3.8.21 09/14/2020 PD-3536 point mutations merging bug +* PD-3469 --force_update implies --update; -U +* 3.8.20 09/14/2020 PD-3531 "--parm -print_fam" bug +* 3.8.19 09/04/2020 PD-3292 removed the dependence on "grep" +* 3.8.18 09/03/2020 PD-3292 removed the dependence on "which" +* 3.8.17 09/02/2020 PD-3528 ordering of rows in the report is broken with parameter --name +* 3.8.16 09/01/2020 PD-2322 a complete nucleotide hit is not preferred to a partial protein hit; stopCodon field is borrowed from BLASTX to BPASTP +* 3.8.15 08/28/2020 PD-3475 Return BLAST alignment parameters for HMM-only hits where available +* 3.8.14 08/27/2020 PD-3470 method FRAME_SHIFT, amr_report is faster +* 3.8.13 08/25/2020 PD-2322 a complete nucleotide hit is preferred to a partial protein hit +* 3.8.12 08/24/2020 PD-2394 fusion genes are reported to include both gene symbols on each line +* 3.8.11 08/21/2020 PD-2407 --type +* 3.8.10 08/20/2020 PD-3469 --force_update +* 3.8.9 08/13/2020 BLAST -show_gis parameter is removed, more mutations are reported for --mutation_all +* 3.8.8 08/04/2020 bug in fasta_extract.cpp, more output in --nucleotide_output +* 3.8.7 08/03/2020 PD-3504 --protein_output, --nucleotide_output options by fasta_extract.cpp +* 3.8.6 07/29/2020 PD-3468 --name option +* 07/13/2020 PD-3484 -l for old database versions +* 3.8.5 07/10/2020 PD-3482 --ident_min instruction * 3.8.4 05/13/2020 PD-3447 Custom point mutation does not match the reference sequence * Text "*** ERROR ***" is not repeated twice * 3.8.3 05/01/2020 WILDTYPE mutations were reported as 0-based @@ -171,6 +199,13 @@ struct Warning : Singleton { stderr << Color::code () << "\n"; } }; + + +//const StringVector all_types {"AMR", "STRESS", "VIRULENCE"}; + // select id from FAM where [type] = 1 + + + // ThisApplication @@ -181,13 +216,14 @@ struct ThisApplication : ShellApplication : ShellApplication (HELP, true, true, true) { addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 - addKey ("protein", "Protein FASTA file to search", "", 'p', "PROT_FASTA"); - addKey ("nucleotide", "Nucleotide FASTA file to search", "", 'n', "NUC_FASTA"); + addFlag ("force_update", "Force updating the AMRFinder database", 'U'); // PD-3469 + addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA"); + addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA"); addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); - addKey ("ident_min", "Minimum identity for nucleotide hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); - // "nucleotide hit" --> "reference protein" ?? + addKey ("ident_min", "Minimum proportion of identical amino acids in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); + // "PD-3482 addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); addKey ("organism", "Taxonomy group. To see all possible taxonomy groups use the --list_organisms flag", "", 'O', "ORGANISM"); addFlag ("list_organisms", "Print the list of all possible taxonomy groups for mutations identification and exit", 'l'); @@ -195,12 +231,16 @@ struct ThisApplication : ShellApplication addFlag ("plus", "Add the plus genes to the report"); // PD-2789 addFlag ("report_common", "Report proteins common to a taxonomy group"); // PD-2756 addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); + //addKey ("type", "Limit search to specific element types: " + all_types. toString (",") + ". A comma delimited list, case-insensitive", "", '\0', "TYPE"); + // "Element type" is a column name in the report addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); //addKey ("hmmer_bin" ?? + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report, for example it can be an assembly name", "", '\0', "NAME"); addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); + addKey ("protein_output", "Output protein FASTA file of reported proteins", "", '\0', "PROT_FASTA_OUT"); + addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT"); addFlag ("quiet", "Suppress messages to STDERR", 'q'); addFlag ("gpipe_org", "NCBI internal GPipe organism names"); - //addKey ("sample", "Sample name to be adde as the first column of the report", ""); addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); version = SVN_REV; // threads_max: do not include blast/hmmsearch's threads ?? @@ -219,40 +259,65 @@ struct ThisApplication : ShellApplication bool blastThreadable (const string &blast) const { exec (fullProg (blast) + " -help > " + tmp + ".blast_help"); - return ! system (("grep '^ *\\-num_threads' " + tmp + ".blast_help > /dev/null 2> /dev/null"). c_str ()); - } - - - - size_t get_threads_max_max () const - { - #if __APPLE__ - // stderr << "Compiled for MacOS" << "\n"; - int count; - size_t count_len = sizeof(count); - sysctlbyname("hw.logicalcpu", &count, &count_len, NULL, 0); - // fprintf(stderr,"you have %i cpu cores", count); - return count; - #else - // stderr << "Compiled for Linux" << "\n"; - //exec ("nproc --all > " + tmp + ".nproc"); - //exec ("grep processor /proc/cpuinfo | tail -1 | awk '{print $3}' > " + tmp + ".nproc"); - exec ("grep -c processor /proc/cpuinfo > " + tmp + ".nproc"); - const StringVector vec (tmp + ".nproc", (size_t) 1); - QC_ASSERT (vec. size () == 1); - return str2 (vec [0]); - #endif + LineInput f (tmp + ".blast_help"); + while (f. nextLine ()) + { + trim (f. line); + if (contains (f. line, "-num_threads")) + return true; + } + return false; } StringVector db2organisms () const { - exec ("tail -n +2 " + tmp + ".db/AMRProt-mutation.tab" + " | cut -f 1 > " + tmp + ".prot_org"); + checkFile (tmp + ".db/taxgroup.tab"); + checkFile (tmp + ".db/AMRProt-mutation.tab"); exec ("tail -n +2 " + tmp + ".db/taxgroup.tab" + " | cut -f 1 > " + tmp + ".tax_org"); + exec ("tail -n +2 " + tmp + ".db/AMRProt-mutation.tab" + " | cut -f 1 > " + tmp + ".prot_org"); exec ("cat " + tmp + ".prot_org " + tmp + ".tax_org | sort -u > " + tmp + ".org"); return StringVector (tmp + ".org", (size_t) 100); // PAR } + + + + string col2num (const string &colName) const + // Return: number + // Input: tmp + ".amr": must have the header line + { + LineInput f (tmp + ".amr"); + EXEC_ASSERT (f. nextLine ()); + const List columns (str2list (f. line, '\t')); + size_t n = 1; + for (const string& column : columns) + if (column == colName) + return to_string (n); + else + n++; + throw runtime_error ("Column " + strQuote (colName) + " not found in " + tmp + ".amr"); + } + + + + struct SortField : Named + { + bool numeric {false}; + + explicit SortField (const string &name_arg, + bool numeric_arg = false) + : Named (name_arg) + , numeric (numeric_arg) + { + ASSERT (str2 (name) > 0); + } + void saveText (ostream &os) const + { os << "-k" << name << ',' << name; + if (numeric) + os << 'n'; + } + }; @@ -261,7 +326,8 @@ struct ThisApplication : ShellApplication const string prot = shellQuote (getArg ("protein")); const string dna = shellQuote (getArg ("nucleotide")); string db = getArg ("database"); - const bool update = getFlag ("update"); + const bool force_update = getFlag ("force_update"); + const bool update = getFlag ("update") || force_update; const string gff = shellQuote (getArg ("gff")); const bool pgap = getFlag ("pgap"); const double ident = arg2double ("ident_min"); @@ -272,9 +338,13 @@ struct ThisApplication : ShellApplication const bool add_plus = getFlag ("plus"); const bool report_common = getFlag ("report_common"); const string mutation_all = shellQuote (getArg ("mutation_all")); + //const string type = getArg ("type"); string blast_bin = getArg ("blast_bin"); + const string input_name = shellQuote (getArg ("name")); const string parm = getArg ("parm"); const string output = shellQuote (getArg ("output")); + const string prot_out = shellQuote (getArg ("protein_output")); + const string dna_out = shellQuote (getArg ("nucleotide_output")); const bool quiet = getFlag ("quiet"); const bool gpipe_org = getFlag ("gpipe_org"); @@ -298,6 +368,11 @@ struct ThisApplication : ShellApplication if (report_common && emptyArg (organism)) throw runtime_error ("--report_common requires --organism"); + if (report_common && ! add_plus) + throw runtime_error ("--report_common requires --plus"); + + //if (force_update && ! update) + //throw runtime_error ("--force_update requires --update"); // PD-3437 if (! emptyArg (mutation_all) && emptyArg (organism)) @@ -306,6 +381,24 @@ struct ThisApplication : ShellApplication stderr << "--mutation_all option used without -O/--organism option. No point mutations will be screened"; } + #if 0 + StringVector typeVec; + if (! type. empty ()) + { + const List typeList (str2list (type, ',')); + for (string s: typeList) + { + trim (s); + if (s. empty ()) + continue; + strUpper (s); + if (! all_types. contains (s)) + throw runtime_error ("Unknown element type " + strQuote (s)); + typeVec << s; + } + } + #endif + if (! emptyArg (output)) try { OFStream f (unQuote (output)); } catch (...) { throw runtime_error ("Cannot open output file " + output); } @@ -368,7 +461,8 @@ struct ThisApplication : ShellApplication if (! dbDir. items. empty () && dbDir. items. back () == "latest") { prog2dir ["amrfinder_update"] = execDir; - exec (fullProg ("amrfinder_update") + " -d " + shellQuote (dbDir. getParent ()) + ifS (quiet, " -q") + ifS (qc_on, " --debug") + " > " + logFName, logFName); + exec (fullProg ("amrfinder_update") + " -d " + shellQuote (dbDir. getParent ()) + ifS (force_update, " --force_update") + + ifS (quiet, " -q") + ifS (qc_on, " --debug") + " > " + logFName, logFName); } else { @@ -389,14 +483,6 @@ struct ThisApplication : ShellApplication exec ("ln -s " + shellQuote (path2canonical (db)) + " " + tmp + ".db"); - if (list_organisms) - { - const StringVector organisms (db2organisms ()); - cout << "Possible organisms: " + organisms. toString (", ") << endl; - return; - } - - // PD-3051 try { @@ -419,15 +505,24 @@ struct ThisApplication : ShellApplication } + if (list_organisms) + { + const StringVector organisms (db2organisms ()); + cout << endl << "Available --organism options: " + organisms. toString (", ") << endl; + return; + } + + { string searchMode; StringVector includes; if (emptyArg (prot)) + { if (emptyArg (dna)) { if (update) return; - throw runtime_error ("Parameter --prot or --nucleotide must be present"); + throw runtime_error ("Parameter --protein or --nucleotide must be present"); } else { @@ -435,6 +530,7 @@ struct ThisApplication : ShellApplication throw runtime_error ("Parameter --gff is redundant"); searchMode = "translated nucleotide"; } + } else { searchMode = "protein"; @@ -446,10 +542,14 @@ struct ThisApplication : ShellApplication else { if (emptyArg (gff)) - throw runtime_error ("If parameters --prot and --nucleotide are present then parameter --gff must be present"); + throw runtime_error ("If parameters --protein and --nucleotide are present then parameter --gff must be present"); searchMode = "combined translated and protein"; } } + if (emptyArg (prot) && ! emptyArg (prot_out)) + throw runtime_error ("Parameter --protein must be present for --protein_out"); + if (emptyArg (dna) && ! emptyArg (dna_out)) + throw runtime_error ("Parameter --nucleotide must be present for --nucleotide_out"); ASSERT (! searchMode. empty ()); if (emptyArg (organism)) includes << key2shortHelp ("organism") + " option to add mutation searches and suppress common proteins"; @@ -523,7 +623,7 @@ struct ThisApplication : ShellApplication { const StringVector organisms (db2organisms ()); if (! organisms. contains (organism1)) - throw runtime_error ("Possible organisms: " + organisms. toString (", ")); + throw runtime_error ("Possible organisms: " + organisms. toString (", ")); } } if (! organism1. empty ()) @@ -536,9 +636,10 @@ struct ThisApplication : ShellApplication const string force_cds_report (! emptyArg (dna) && ! organism1. empty () ? "-force_cds_report" : ""); // Needed for dna_mutation - prog2dir ["fasta_check"] = execDir; - prog2dir ["fasta2parts"] = execDir; - prog2dir ["amr_report"] = execDir; + prog2dir ["fasta_check"] = execDir; + prog2dir ["fasta2parts"] = execDir; + prog2dir ["amr_report"] = execDir; + prog2dir ["fasta_extract"] = execDir; bool lcl = false; @@ -575,7 +676,7 @@ struct ThisApplication : ShellApplication // PD-2967 - const string blastp_par ("-show_gis -comp_based_stats 0 -evalue 1e-10 "); + const string blastp_par ("-comp_based_stats 0 -evalue 1e-10"); // was: -culling_limit 20 // PD-2967 if (! emptyArg (prot)) { @@ -584,18 +685,31 @@ struct ThisApplication : ShellApplication { findProg ("blastp"); findProg ("hmmsearch"); - exec (fullProg ("fasta_check") + prot + " -aa -hyphen " + qcS + " -log " + logFName, logFName); + exec (fullProg ("fasta_check") + prot + " -aa -hyphen" + qcS + " -log " + logFName, logFName); if (! emptyArg (gff) && ! contains (parm, "-bed")) { string locus_tag; - const int status = system (("grep '^>.*\\[locus_tag=' " + prot + " > /dev/null"). c_str ()); - const bool locus_tagP = (status == 0); - if (locus_tagP /*|| gpipe*/) { - locus_tag = " -locus_tag " + tmp + ".match"; - gff_match = " -gff_match " + tmp + ".match"; - } + bool locus_tagP = false; + { + LineInput f (unQuote (prot)); + while (f. nextLine ()) + if ( ! f. line. empty () + && f. line [0] == '>' + && contains (f. line, "[locus_tag=") + ) + { + locus_tagP = true; + break; + } + } + if (locus_tagP /*|| gpipe*/) + { + locus_tag = " -locus_tag " + tmp + ".match"; + gff_match = " -gff_match " + tmp + ".match"; + } + } prog2dir ["gff_check"] = execDir; string dnaPar; if (! emptyArg (dna)) @@ -626,7 +740,7 @@ struct ThisApplication : ShellApplication // " -task blastp-fast -word_size 6 -threshold 21 " // PD-2303 string num_threads; if (blastThreadable ("blastp") && prot_threads > 1) - num_threads = " -num_threads " + to_string (prot_threads); + num_threads = " -num_threads " + to_string (prot_threads); th. exec (fullProg ("blastp") + " -query " + prot + " -db " + tmp + ".db/AMRProt" +" " + blastp_par + num_threads + " " BLAST_FMT " -out " + tmp + ".blastp > /dev/null 2> /dev/null", prot_threads); @@ -656,13 +770,13 @@ struct ThisApplication : ShellApplication { stderr << "Running blastx...\n"; findProg ("blastx"); - exec (fullProg ("fasta_check") + dna + " -hyphen -len "+ tmp + ".len " + qcS + " -log " + logFName, logFName); + exec (fullProg ("fasta_check") + dna + " -hyphen -len "+ tmp + ".len" + qcS + " -log " + logFName, logFName); const size_t threadsAvailable = th. getAvailable (); //ASSERT (threadsAvailable); if (threadsAvailable >= 2) { exec ("mkdir " + tmp + ".chunk"); - exec (fullProg ("fasta2parts") + dna + " " + to_string (threadsAvailable) + " " + tmp + ".chunk " + qcS + " -log " + logFName, logFName); // PAR + exec (fullProg ("fasta2parts") + dna + " " + to_string (threadsAvailable) + " " + tmp + ".chunk" + qcS + " -log " + logFName, logFName); // PAR exec ("mkdir " + tmp + ".blastx_dir"); FileItemGenerator fig (false, true, tmp + ".chunk"); string item; @@ -676,10 +790,13 @@ struct ThisApplication : ShellApplication th. exec (fullProg ("blastx") + " -query " + dna + " -db " + tmp + ".db/AMRProt" + " " + blastx_par + to_string (gencode) + " " BLAST_FMT " -out " + tmp + ".blastx > /dev/null 2> /dev/null", threadsAvailable); - amr_report_blastx = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len"; } else + { exec ("cp /dev/null " + tmp + ".blastx"); + exec ("cp /dev/null " + tmp + ".len"); + } + amr_report_blastx = "-blastx " + tmp + ".blastx -dna_len " + tmp + ".len"; } @@ -694,7 +811,7 @@ struct ThisApplication : ShellApplication prog2dir ["dna_mutation"] = execDir; stderr << "Running blastn...\n"; exec (fullProg ("blastn") + " -query " + dna + " -db " + tmp + ".db/AMR_DNA-" + organism1 + " -evalue 1e-20 -dust no " - BLAST_FMT " -out " + tmp + ".blastn > " + logFName + " 2> " + logFName, logFName); + BLAST_FMT " -out " + tmp + ".blastn > " + logFName + " 2> " + logFName, logFName); } else exec ("cp /dev/null " + tmp + ".blastn"); @@ -724,17 +841,18 @@ struct ThisApplication : ShellApplication } - // ".amr" + // tmp + ".amr", tmp + ".mutation_all" + const string nameS (emptyArg (input_name) ? "" : " -name " + input_name); { const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all")); const string coreS (add_plus ? "" : " -core"); exec (fullProg ("amr_report") + " -fam " + shellQuote (db + "/fam.tab") + " " + amr_report_blastp + " " + amr_report_blastx - + " -organism " + strQuote (organism1) + " -mutation " + shellQuote (db + "/AMRProt-mutation.tab") + " " + mutation_allS + " " - + force_cds_report + " -pseudo" + coreS - + (ident == -1 ? string () : " -ident_min " + toString (ident)) - + " -coverage_min " + toString (cov) - + ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot") + pgapS - + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); + + " -organism " + strQuote (organism1) + " -mutation " + shellQuote (db + "/AMRProt-mutation.tab") + " " + mutation_allS + " " + + force_cds_report + " -pseudo" + coreS + + (ident == -1 ? string () : " -ident_min " + toString (ident)) + + " -coverage_min " + toString (cov) + + ifS (suppress_common, " -suppress_prot " + tmp + ".suppress_prot") + pgapS + + nameS + qcS + " " + parm + " -log " + logFName + " > " + tmp + ".amr", logFName); } if ( ! emptyArg (dna) && ! organism1. empty () @@ -743,35 +861,94 @@ struct ThisApplication : ShellApplication { const string mutation_allS (emptyArg (mutation_all) ? "" : ("-mutation_all " + tmp + ".mutation_all.dna")); exec (fullProg ("dna_mutation") + tmp + ".blastn " + shellQuote (db + "/AMR_DNA-" + organism1 + ".tab") + " " + strQuote (organism1) + " " + mutation_allS - + " " + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName); + + nameS + qcS + " -log " + logFName + " > " + tmp + ".amr-snp", logFName); exec ("tail -n +2 " + tmp + ".amr-snp >> " + tmp + ".amr"); if (! emptyArg (mutation_all)) exec ("tail -n +2 " + tmp + ".mutation_all.dna >> " + tmp + ".mutation_all"); } - // PD-2244, PD-3230 - const string sortS (emptyArg (dna) && emptyArg (gff) ? "-k1,1 -k2,2" : "-k2,2 -k3,3n -k4,4n -k5,5 -k1,1 -k6,6"); + // Column names are from amr_report.cpp + + #if 0 + string typeFilter; + if (! typeVec. empty ()) + { + const string typeCol (col2num ("Element type")); + for (const string& t : typeVec) + typeFilter += " || $" + typeCol + " == \"" + t + "\""; + } + #endif + // Sorting AMR report - exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); - exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " | uniq >> " + tmp + ".amr-out"); - exec ("mv " + tmp + ".amr-out " + tmp + ".amr"); + // PD-2244, PD-3230 + string sortS; + { + Vector sortFields; + if (! (emptyArg (dna) && emptyArg (gff))) + sortFields << SortField (col2num ("Contig id")) + << SortField (col2num ("Start"), true) + << SortField (col2num ("Stop"), true) + << SortField (col2num ("Strand")); + sortFields << SortField (col2num ("Protein identifier")) + << SortField (col2num ("Gene symbol")); + for (const SortField& sf : sortFields) + sortS += " " + sf. str (); + } + exec ("head -1 " + tmp + ".amr > " + tmp + ".amr-out"); + exec ("LANG=C && tail -n +2 " + tmp + ".amr | sort " + sortS + " >> " + tmp + ".amr-out"); + + #if 0 + if (! typeFilter. empty ()) + exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".amr-out > " + tmp + ".amr"); + else + #endif + exec ("mv " + tmp + ".amr-out " + tmp + ".amr"); + // Sorting mutation_all if (! emptyArg (mutation_all)) { - exec ("head -1 " + tmp + ".mutation_all > " + tmp + ".mutation_all-out"); - exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort " + sortS + " | uniq >> " + tmp + ".mutation_all-out"); - exec ("mv " + tmp + ".mutation_all-out " + mutation_all); + exec ("head -1 " + tmp + ".mutation_all > " + tmp + ".mutation_all-out"); + exec ("LANG=C && tail -n +2 " + tmp + ".mutation_all | sort -u | sort " + sortS + " >> " + tmp + ".mutation_all-out"); + // "sort -u | sort " replaces "sort | uniq" + #if 0 + if (! typeFilter. empty ()) + exec ("awk -F '\t' 'NR == 1 " + typeFilter + "' " + tmp + ".mutation_all-out > " + mutation_all); + else + #endif + exec ("mv " + tmp + ".mutation_all-out " + mutation_all); } - - // timing the run - const time_t end = time (NULL); - stderr << "AMRFinder took " << end - start << " seconds to complete\n"; if (emptyArg (output)) exec ("cat " + tmp + ".amr"); else exec ("cp " + tmp + ".amr " + output); + + + if (! emptyArg (prot_out)) + { + const string protCol (col2num ("Protein identifier")); + const string geneSymbolCol (col2num ("Gene symbol")); + const string seqNameCol (col2num ("Sequence name")); + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$" + protCol + " != \"NA\" {print $" + protCol + ", $" + geneSymbolCol + ", $" + seqNameCol + "};' | sort -u > " + tmp + ".prot_out"); + exec (fullProg ("fasta_extract") + prot + " " + tmp + ".prot_out -aa" + qcS + " -log " + logFName + " > " + prot_out, logFName); + } + if (! emptyArg (dna_out)) + { + const string contigCol (col2num ("Contig id")); + const string startCol (col2num ("Start")); + const string stopCol (col2num ("Stop")); + const string strandCol (col2num ("Strand")); + const string geneSymbolCol (col2num ("Gene symbol")); + const string seqNameCol (col2num ("Sequence name")); + exec ("tail -n +2 " + tmp + ".amr | awk -F '\t' '$" + contigCol + " != \"NA\" {print $" + contigCol + ", $" + startCol + ", $" + stopCol + ", $" + strandCol + ", $" + geneSymbolCol + ", $" + seqNameCol + "};' | sort -u > " + tmp + ".dna_out"); + exec (fullProg ("fasta_extract") + dna + " " + tmp + ".dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName); + } + + + // timing the run + const time_t end = time (NULL); + stderr << "AMRFinder took " << end - start << " seconds to complete\n"; } }; diff --git a/amrfinder_update.cpp b/amrfinder_update.cpp index 332eb36..c156ff3 100644 --- a/amrfinder_update.cpp +++ b/amrfinder_update.cpp @@ -30,7 +30,7 @@ * Updating of AMRFinder data * * Dependencies: NCBI BLAST, HMMer -* mkdir, ln +* ln, mkdir * curl.{h,c} * * Release changes: see amrfinder.cpp @@ -266,6 +266,7 @@ Requirements:\n\ { addKey ("database", "Directory for all versions of AMRFinder databases", "$BASE/data", 'd', "DATABASE_DIR"); // Symbolic link ?? + addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469 addFlag ("quiet", "Suppress messages to STDERR", 'q'); version = SVN_REV; @@ -281,8 +282,9 @@ Requirements:\n\ void shellBody () const final { - const string mainDirOrig = getArg ("database"); - const bool quiet = getFlag ("quiet"); + const string mainDirOrig = getArg ("database"); + const bool force_update = getFlag ("force_update"); + const bool quiet = getFlag ("quiet"); Stderr stderr (quiet); @@ -333,15 +335,33 @@ Requirements:\n\ if (! directoryExists (mainDirS)) exec ("mkdir -p " + shellQuote (mainDirS)); + const string versionFName ("version.txt"); + const string urlDir (URL + curMinor + "/" + latest_version + "/"); + const string latestDir (mainDirS + latest_version + "/"); if (directoryExists (latestDir)) + { + if (! force_update) + { + curl. download (urlDir + versionFName, tmp); + const StringVector version_old (latestDir + versionFName, (size_t) 100); + const StringVector version_new (tmp, (size_t) 100); + if ( ! version_old. empty () + && ! version_new. empty () + && version_old. front () == version_new. front () + ) + { + stderr << shellQuote (latestDir) << " contains the latest version: " << version_old. front () << '\n'; + stderr << "Skipping update, use amrfinder --force_update to overwrite the existing database\n"; + return; + } + } stderr << shellQuote (latestDir) << " already exists, overwriting what was there\n"; + } else exec ("mkdir -p " + shellQuote (latestDir)); - stderr << "Downloading AMRFinder database version " << latest_version << " into " << shellQuote (latestDir) << "\n"; - const string urlDir (URL + curMinor + "/" + latest_version + "/"); fetchAMRFile (curl, urlDir, latestDir, "AMR.LIB"); fetchAMRFile (curl, urlDir, latestDir, "AMRProt"); fetchAMRFile (curl, urlDir, latestDir, "AMRProt-mutation.tab"); @@ -350,7 +370,7 @@ Requirements:\n\ fetchAMRFile (curl, urlDir, latestDir, "database_format_version.txt"); // PD-3051 fetchAMRFile (curl, urlDir, latestDir, "fam.tab"); fetchAMRFile (curl, urlDir, latestDir, "taxgroup.tab"); - fetchAMRFile (curl, urlDir, latestDir, "version.txt"); + fetchAMRFile (curl, urlDir, latestDir, versionFName); StringVector dnaPointMuts; { diff --git a/common.cpp b/common.cpp index 48bc1c0..e798134 100644 --- a/common.cpp +++ b/common.cpp @@ -40,19 +40,15 @@ #include #include #include +#include + #ifndef _MSC_VER #include -//#include + #ifdef __APPLE__ + #include + #include + #endif #endif -#include - - - - -[[noreturn]] void errorThrow (const string &msg) -{ - throw std::logic_error (msg); -} @@ -62,6 +58,11 @@ namespace Common_sp +[[noreturn]] void errorThrow (const string &msg) + { throw std::logic_error (msg); } + + + vector programArgs; string programName; @@ -119,6 +120,31 @@ bool isMainThread () { return threads_max == 1 || get_thread_id () == main_thread_id; } + +#ifndef _MSC_VER +size_t get_threads_max_max () +{ +#if __APPLE__ + // stderr << "Compiled for MacOS" << "\n"; + int count; + size_t count_len = sizeof(count); + sysctlbyname ("hw.logicalcpu", &count, &count_len, NULL, 0); + // fprintf(stderr,"you have %i cpu cores", count); + return count; +#else + LineInput f ("/proc/cpuinfo"); + size_t n = 0; + while (f. nextLine ()) + if (isLeft (f. line, "processor")) + n++; + return n; +#endif +} +#endif + + + + bool Chronometer::enabled = false; @@ -188,7 +214,8 @@ namespace // time ?? #ifndef _MSC_VER - const char* hostname = getenv ("HOSTNAME");//const char* shell = getenv ("SHELL"); + const char* hostname = getenv ("HOSTNAME"); +//const char* shell = getenv ("SHELL"); const char* shell = getenv ("SHELL"); const char* pwd = getenv ("PWD"); #endif @@ -854,7 +881,7 @@ size_t strMonth2num (const string& month) return (size_t) (m - 1); } - size_t i = NO_INDEX; + size_t i = no_index; if (month == "Jan") i = 0; else if (month == "Feb") i = 1; else if (month == "Mar") i = 2; @@ -1106,6 +1133,8 @@ void exec (const string &cmd, //Chronometer_OnePass cop (cmd); if (verbose ()) cout << cmd << endl; + if (logPtr) + *logPtr << cmd << endl; const int status = system (cmd. c_str ()); // pipefail's are not caught if (status) @@ -1121,6 +1150,24 @@ void exec (const string &cmd, +#ifndef _MSC_VER +string which (const string &progName) +{ + ASSERT (! progName. empty ()); + + const List paths (str2list (getenv ("PATH"), ':')); + for (const string& path : paths) + if ( ! path. empty () + && fileExists (path + "/" + progName) + ) + return path + "/"; + + return string (); +} +#endif + + + // Threads @@ -1215,7 +1262,7 @@ StringVector::StringVector (const string &fName, } catch (const exception &e) { - throw runtime_error ("Loading file " + shellQuote (fName) + "\n" + e. what ()); + throw runtime_error ("Reading file " + shellQuote (fName) + "\n" + e. what ()); } } @@ -1475,7 +1522,6 @@ string CharInput::getLine () - // Token void Token::readInput (CharInput &in) @@ -1563,10 +1609,7 @@ void Token::readInput (CharInput &in) qc (); if (verbose ()) - { - cout << type2str (type) << ' '; - cout << *this << ' ' << charNum << endl; - } + cout << type2str (type) << ' ' << *this << ' ' << charNum + 1 << endl; } @@ -1833,7 +1876,7 @@ string Csv::getWord () QC_ASSERT (goodPos ()); size_t start = pos; - size_t stop = NO_INDEX; + size_t stop = no_index; if (s [pos] == '\"') { pos++; @@ -1844,7 +1887,7 @@ string Csv::getWord () } findChar (','); - if (stop == NO_INDEX) + if (stop == no_index) stop = pos; pos++; @@ -2904,11 +2947,13 @@ int Application::run (int argc, jRoot = nullptr; } + #if 0 if (! logFName. empty ()) { delete logPtr; logPtr = nullptr; } + #endif } catch (const std::exception &e) { @@ -2950,7 +2995,7 @@ void ShellApplication::initEnvironment () // execDir, programName execDir = getProgramDirName (); if (execDir. empty ()) - execDir = which (programArgs. front ()); + execDir = this->which (programArgs. front ()); if (! isRight (execDir, "/")) throw logic_error ("Cannot identify the directory of the executable"); { @@ -2977,22 +3022,6 @@ void ShellApplication::body () const -string ShellApplication::which (const string &progName) const -{ - if (tmp. empty ()) - throw runtime_error ("Temporary file is needed"); - - try { exec ("which " + shellQuote (progName) + " 1> " + tmp + ".src 2> /dev/null"); } - catch (const runtime_error &) - { return string (); } - - const StringVector vec (tmp + ".src", (size_t) 1); - QC_ASSERT (vec. size () == 1); - return getDirName (vec [0]); -} - - - void ShellApplication::findProg (const string &progName) const { ASSERT (! progName. empty ()); @@ -3004,7 +3033,7 @@ void ShellApplication::findProg (const string &progName) const { dir = fileExists (execDir + progName) ? execDir - : which (progName); + : this->which (progName); if (dir. empty ()) throw runtime_error ("Binary " + shellQuote (progName) + " is not found.\nPlease make sure that " + shellQuote (progName) + " is in the same directory as " + shellQuote (Common_sp::programName) + " or is in your $PATH.");; @@ -3029,6 +3058,21 @@ string ShellApplication::fullProg (const string &progName) const +string ShellApplication::exec2str (const string &cmd, + const string &tmpName, + const string &logFName) const +{ + ASSERT (! contains (tmpName, ' ')); + const string out (tmp + "." + tmpName); + exec (cmd + " > " + out, logFName); + const StringVector vec (out, (size_t) 1); + if (vec. size () != 1) + throw runtime_error (cmd + "\nOne line is expected"); + return vec [0]; +} + + + } diff --git a/common.hpp b/common.hpp index aa29c6e..074f092 100644 --- a/common.hpp +++ b/common.hpp @@ -58,11 +58,6 @@ #pragma warning (default : 4005) #endif -#ifdef __APPLE__ - #include - #include -#endif - #include #include #include @@ -129,6 +124,10 @@ extern size_t threads_max; // >= 1 extern thread::id main_thread_id; bool isMainThread (); +#ifndef _MSC_VER + size_t get_threads_max_max (); +#endif + [[noreturn]] void errorExit (const char* msg, @@ -306,12 +305,12 @@ inline bool boolPow (bool x, bool power) // ebool - extended bool -enum ebool {EFALSE = false, - ETRUE = true, - UBOOL = true + 1}; +enum ebool {efalse = false, + etrue = true, + enull = true + 1}; inline ebool toEbool (bool b) - { return b ? ETRUE : EFALSE; } + { return b ? etrue : efalse; } inline bool operator<= (ebool a, ebool b) @@ -320,10 +319,10 @@ inline bool operator<= (ebool a, ebool b) } inline void toggle (ebool &b) - { if (b == ETRUE) - b = EFALSE; - else if (b == EFALSE) - b = ETRUE; + { if (b == etrue) + b = efalse; + else if (b == efalse) + b = etrue; } @@ -393,7 +392,7 @@ uint powInt (uint a, -const size_t NO_INDEX = SIZE_MAX; +constexpr size_t no_index = numeric_limits::max (); @@ -772,7 +771,7 @@ struct List : list return i; else i++; - return NO_INDEX; + return no_index; } bool isPrefix (const List &prefix) const { typename List::const_iterator wholeIt = P::begin (); @@ -788,10 +787,14 @@ struct List : list prefixIt++; } } - List& operator<< (T t) + List& operator<< (const T &t) { P::push_back (t); return *this; } + List& operator<< (T &&t) + { P::push_back (move (t)); + return *this; + } template */> List& operator<< (const list &other) { P::insert (P::end (), other. begin (), other. end ()); @@ -803,12 +806,16 @@ struct List : list return *this; } T popFront () - { const T t = P::front (); + { if (P::empty ()) + throw range_error ("popFront() empty list"); + const T t = P::front (); P::pop_front (); return t; } T popBack () - { const T t = P::back (); + { if (P::empty ()) + throw range_error ("popBack() empty list"); + const T t = P::back (); P::pop_back (); return t; } @@ -1062,6 +1069,11 @@ inline string shellQuote (string s) bool fileExists (const string &fName); +inline void checkFile (const string &fName) + { if (! fileExists (fName)) + throw runtime_error ("File " + strQuote (fName) + " does not exist"); + } + streampos getFileSize (const string &fName); inline void removeFile (const string &fName) @@ -1228,6 +1240,11 @@ int getVerbosity (); void exec (const string &cmd, const string &logFName = string()); +#ifndef _MSC_VER + string which (const string &progName); + // Return: isRight(,"/") or empty() if there is no path +#endif + // Threads @@ -1282,7 +1299,7 @@ template ASSERT (threads_max >= 1); results. clear (); results. reserve (threads_max); - if (threads_max == 1) + if (threads_max == 1 || i_max <= 1) { results. push_back (Res ()); func (0, i_max, results. front (), forward(args)...); @@ -1380,7 +1397,7 @@ struct Root virtual bool empty () const { return true; } virtual void clear () - {} + { throw logic_error ("Root::clear() is not implemented"); } // Postcondition: empty() virtual void read (istream &/*is*/) { throw logic_error ("Root::read() is not implemented"); } @@ -1498,20 +1515,22 @@ struct Vector : vector {} - typename P::reference operator[] (size_t index) +private: + void checkIndex (const string &operation, + size_t index) const { #ifndef NDEBUG if (index >= P::size ()) - throw range_error ("Vector assignment to index = " + to_string (index) + ", but size = " + to_string (P::size ())); + throw range_error ("Vector " + operation + " operation: index = " + to_string (index) + ", but size = " + to_string (P::size ())); #endif + } +public: + typename P::reference operator[] (size_t index) + { checkIndex ("assignment", index); return P::operator[] (index); } typename P::const_reference operator[] (size_t index) const - { - #ifndef NDEBUG - if (index >= P::size ()) - throw range_error ("Vector reading of index = " + to_string (index) + ", but size = " + to_string (P::size ())); - #endif + { checkIndex ("get", index); return P::operator[] (index); } void reserveInc (size_t inc) @@ -1551,7 +1570,7 @@ struct Vector : vector return n; else n++; - return NO_INDEX; + return no_index; } size_t countValue (const T &value) const { size_t n = 0; @@ -1593,9 +1612,16 @@ struct Vector : vector { eraseMany (index, index + 1); } void eraseMany (size_t from, size_t to) - { P::erase ( P::begin () + (ptrdiff_t) from - , P::begin () + (ptrdiff_t) to - ); + { if (to < from) + throw logic_error ("Vector::eraseMany(): to < from"); + if (to == from) + return; + checkIndex ("eraseMany", to - 1); + auto it1 = P::begin (); + std::advance (it1, from); + auto it2 = it1; + std::advance (it2, to - from); + P::erase (it1, it2); } void wipe () { P::clear (); @@ -1618,6 +1644,14 @@ struct Vector : vector std::swap (t, (*this) [(size_t) rand. get ((ulong) P::size ())]); searchSorted = false; } + void pop_back () + { + #ifndef NDEBUG + if (P::empty ()) + throw range_error ("Empty vector pop_back"); + #endif + P::pop_back (); + } T pop (size_t n = 1) { T t = T (); while (n) @@ -1698,17 +1732,17 @@ struct Vector : vector size_t binSearch (const T &value, bool exact = true) const - // Return: if exact then NO_INDEX or vec[Return] = value else min {i : vec[i] >= value} + // Return: if exact then no_index or vec[Return] = value else min {i : vec[i] >= value} { if (P::empty ()) - return NO_INDEX; + return no_index; checkSorted (); size_t lo = 0; // vec.at(lo) <= value size_t hi = P::size () - 1; // lo <= hi if (value < (*this) [lo]) - return exact ? NO_INDEX : lo; + return exact ? no_index : lo; if ((*this) [hi] < value) - return NO_INDEX; + return no_index; // at(lo) <= value <= at(hi) for (;;) { @@ -1727,11 +1761,11 @@ struct Vector : vector return lo; if (! exact || (*this) [hi] == value) return hi; - return NO_INDEX; + return no_index; } template bool containsFast (const U &value) const - { return binSearch (value) != NO_INDEX; } + { return binSearch (value) != no_index; } template bool containsFastAll (const Vector &other) const { if (other. size () > P::size ()) @@ -1808,14 +1842,14 @@ struct Vector : vector size_t findDuplicate () const { if (P::size () <= 1) - return NO_INDEX; + return no_index; FOR_START (size_t, i, 1, P::size ()) if ((*this) [i] == (*this) [i - 1]) return i; - return NO_INDEX; + return no_index; } bool isUniq () const - { return findDuplicate () == NO_INDEX; } + { return findDuplicate () == no_index; } template void uniq (const Equal &equal) { if (P::size () <= 1) @@ -2065,6 +2099,8 @@ struct Heap : Root, Nocopy bool empty () const final { return arr. empty (); } + size_t size () const + { return arr. size (); } Heap& operator<< (T item) { arr << item; increaseKey (arr. size () - 1); @@ -2399,6 +2435,7 @@ template template struct RandomSet +// Set stored in a vector for a random access { private: Vector vec; @@ -2444,7 +2481,6 @@ struct RandomSet } const Vector& getVec () const { return vec; } - // For a random access }; @@ -2467,11 +2503,11 @@ struct Enumerate size_t find (const T &t) const { if (const size_t* num = Common_sp::findPtr (elem2num, t)) return *num; - return NO_INDEX; + return no_index; } size_t add (const T &t) { size_t num = find (t); - if (num == NO_INDEX) + if (num == no_index) { num2elem << t; num = num2elem. size () - 1; elem2num [t] = num; @@ -2667,12 +2703,14 @@ struct CharInput : Input : runtime_error (what_arg) {} }; + string errorText (const string &what, + bool expected = true) const + { return "Error at line " + to_string (lineNum + 1) + ", pos. " + to_string (charNum + 1) + + (what. empty () ? string () : (": " + what + ifS (expected, " is expected"))); + } [[noreturn]] void error (const string &what, bool expected = true) const - { throw Error ("Error at line " + to_string (lineNum + 1) + ", pos. " + to_string (charNum + 1) - + (what. empty () ? string () : (": " + what + ifS (expected, " is expected"))) - ); - } + { throw Error (errorText (what, expected)); } }; @@ -2742,6 +2780,8 @@ struct Token : Root void saveText (ostream &os) const override; bool empty () const override { return type == eDelimiter && name. empty (); } + void clear () override + { *this = Token (); } static string type2str (Type type) @@ -3393,7 +3433,7 @@ struct Application : Singleton, Root const string description; const bool needsArg; const bool gnu; - string version {"1.0.0"}; + string version {"0.0.0"}; static constexpr const char* helpS {"help"}; static constexpr const char* versionS {"version"}; @@ -3511,8 +3551,12 @@ struct Application : Singleton, Root void setPositional (List::iterator &posIt, const string &value); public: - virtual ~Application () - {} + ~Application () + { if (logPtr) + { delete logPtr; + logPtr = nullptr; + } + } protected: @@ -3558,6 +3602,8 @@ struct ShellApplication : Application // Environment const bool useTmp; string tmp; + // Temporary file prefix + // If log is used then tmp is printed in the log file and the temporary files are not deleted string execDir; // Ends with '/' // Physically real directory of the software @@ -3582,13 +3628,18 @@ struct ShellApplication : Application protected: static bool emptyArg (const string &s) { return s. empty () || s == "\'\'"; } - string which (const string &progName) const; - // Return: isRight(,"/") or empty() + string which (const string &progName) const + { return Common_sp::which (progName); } void findProg (const string &progName) const; // Output: prog2dir string fullProg (const string &progName) const; // Return: shellQuote (directory + progName) + ' ' // Requires: After findProg(progName) + string exec2str (const string &cmd, + const string &tmpName, + const string &logFName = string()) const; + // Return: `cmd > .tmpName && cat .tmpName` + // Requires: cmd produces one line }; #endif diff --git a/common.inc b/common.inc index 3ab49b2..aea3900 100644 --- a/common.inc +++ b/common.inc @@ -80,21 +80,25 @@ #endif -[[noreturn]] void errorThrow (const std::string &msg); - -#define ERROR_MSG(msg) \ - { if (! std::uncaught_exception ()) \ - { std::ostringstream oss_; \ - oss_ << WHERE << ": " << (msg); \ - errorThrow (oss_. str ()); \ - } else {} \ +namespace Common_sp +{ + [[noreturn]] void errorThrow (const std::string &msg); +} + +#define ERROR_MSG(msg) \ + { if (! std::uncaught_exception ()) \ + { std::ostringstream oss_; \ + oss_ << WHERE << ": " << (msg); \ + Common_sp::errorThrow (oss_. str ()); \ + } else {} \ + exit (1); \ } #define ERROR ERROR_MSG ("") #define NOT_IMPLEMENTED ERROR_MSG ("Not implemented: ") #define NEVER_CALL ERROR_MSG ("Never call: ") -#define QC_ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} +#define QC_ASSERT(cond) if (! (cond)) ERROR_MSG (#cond) else {} // Must be the last statement in a function body // Logic errors diff --git a/dna_mutation.cpp b/dna_mutation.cpp index d5353ac..d8ff87e 100644 --- a/dna_mutation.cpp +++ b/dna_mutation.cpp @@ -50,6 +50,7 @@ namespace map > accession2mutations; unique_ptr mutation_all; +string input_name; @@ -109,6 +110,8 @@ struct BlastnAlignment : Alignment { ASSERT (! (seqChange. empty () && ! seqChange. mutation)); TabDel td (2, false); + if (! input_name. empty ()) + td << input_name;; td << na // PD-2534 << nvl (targetName, na) << (empty () ? 0 : targetStart + 1) @@ -153,7 +156,7 @@ struct BlastnAlignment : Alignment // HMM td << na << na; - if (! seqChange. empty () && seqChange. mutation) // resistant mutation + if (! seqChange. empty () && seqChange. mutation && ! seqChange. replacement) // resistant mutation os << td. str () << endl; if (mutation_all. get ()) *mutation_all << td. str () << endl; @@ -163,6 +166,7 @@ struct BlastnAlignment : Alignment bool good () const { return targetSeq. size () >= min (refLen, 2 * flankingLen + 1); } +#if 0 bool operator< (const BlastnAlignment &other) const { LESS_PART (*this, other, targetName); LESS_PART (other, *this, pIdentity ()); @@ -170,6 +174,7 @@ struct BlastnAlignment : Alignment LESS_PART (*this, other, refName); return false; } +#endif }; @@ -211,6 +216,8 @@ struct Batch { // Cf. BlastnAlignment::saveText() TabDel td; + if (! input_name. empty ()) + td << "Name"; td << "Protein identifier" // targetName // PD-2534 // Contig << "Contig id" @@ -267,6 +274,7 @@ struct ThisApplication : Application addPositional ("mutation", "Mutations table"); addPositional ("organism", "Organism name"); addKey ("mutation_all", "File to report all mutations"); + addKey ("name", "Text to be added as the first column \"name\" to all rows of the report"); version = SVN_REV; } @@ -278,6 +286,7 @@ struct ThisApplication : Application const string mutation_tab = getArg ("mutation"); const string organism = getArg ("organism"); const string mutation_all_FName = getArg ("mutation_all"); + input_name = getArg ("name"); if (! mutation_all_FName. empty ()) @@ -334,16 +343,17 @@ struct ThisApplication : Application && blastAl2->targetStrand == blastAl1->targetStrand && blastAl2 != blastAl1 ) - //for (SeqChange& seqChange2 : blastAl2. seqChanges) - for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + //for (Iter> iter (var_cast (blastAl2) -> seqChanges); iter. next (); ) + for (SeqChange& seqChange2 : var_cast (blastAl2) -> seqChanges) { - SeqChange& seqChange2 = *iter; + //SeqChange& seqChange2 = *iter; ASSERT (seqChange2. al == blastAl2); //ASSERT (seqChange2. mutation); - if ( seqChange1. start_target == seqChange2. start_target + if ( seqChange1. start_target == seqChange2. start_target && seqChange1. better (seqChange2) ) - iter. erase (); + //iter. erase (); + seqChange2. replacement = & seqChange1; } } diff --git a/fasta_check.cpp b/fasta_check.cpp index 8cdcc5d..ff9e726 100644 --- a/fasta_check.cpp +++ b/fasta_check.cpp @@ -148,7 +148,7 @@ struct ThisApplication : Application ids. sort (); const size_t index = ids. findDuplicate (); - if (index != NO_INDEX) + if (index != no_index) throw runtime_error ("Duplicate identifier: " + ids [index]); } }; diff --git a/fasta_extract.cpp b/fasta_extract.cpp new file mode 100644 index 0000000..7e1c1a7 --- /dev/null +++ b/fasta_extract.cpp @@ -0,0 +1,244 @@ +// fasta_check.cpp + +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +* Author: Vyacheslav Brover +* +* File Description: +* Extract sequences out of a FASTA file +* +*/ + + +#undef NDEBUG +#include "common.inc" + +#include "common.hpp" +using namespace Common_sp; + + + +namespace +{ + + + +struct Segment +// not circular +{ + size_t start {0}; + size_t stop {0}; + bool strand {true}; + // false <=> negative + string genesymbol; + string name; + + + bool isDna () const + { return stop; } + size_t size () const + { return stop - start; } + void saveText (ostream &os) const + { TabDel td; + td << start << stop << strand << genesymbol << name; + os << td. str (); + } +}; + + + +char complementaryNucleotide (char wildNucleotide) +{ + char r = ' '; + switch (toLower (wildNucleotide)) + { + case 'a': r = 't'; break; + case 'c': r = 'g'; break; + case 'g': r = 'c'; break; + case 't': r = 'a'; break; + case 'm': r = 'k'; break; + case 'r': r = 'y'; break; + case 'w': r = 'w'; break; + case 's': r = 's'; break; + case 'y': r = 'r'; break; + case 'k': r = 'm'; break; + case 'v': r = 'b'; break; + case 'h': r = 'd'; break; + case 'd': r = 'h'; break; + case 'b': r = 'v'; break; + case 'n': r = 'n'; break; + case '-': r = '-'; break; + default: + throw runtime_error ("Bad nucleotide " + to_string (wildNucleotide)); + } + if (isupper (wildNucleotide)) + r = toUpper (r); + + return r; +} + + + +void process (const string &id, + string &seq, + const map> &id2segments) +{ + if (id. empty ()) + return; + const Vector* segments = findPtr (id2segments, id); + if (! segments) + return; + + replaceStr (seq, "-", ""); + QC_ASSERT (! seq. empty ()); + + for (const Segment& seg : *segments) + { + cout << '>' << id; + if (seg. isDna ()) + cout << ':' << seg. start + 1 << '-' << seg. stop << ' ' << "strand:" << (seg. strand ? '+' : '-'); + cout << ' ' << seg. genesymbol << ' ' << seg. name << endl; + string seq1 (seq); + if (seg. isDna ()) + { + QC_ASSERT (seg. stop <= seq1. size ()); + seq1 = seq1. substr (seg. start, seg. size ()); + if (! seg. strand) + { + reverse (seq1); + for (char &c : seq1) + c = complementaryNucleotide (c); + } + } + constexpr size_t line_len = 60; // PAR + for (size_t i = 0; i < seq1. size (); i += line_len) + cout << seq1. substr (i, line_len) << endl; + } +} + + + +struct ThisApplication : Application +{ + ThisApplication () + : Application ("Extract sequences out of a FASTA file") + { + addPositional ("fasta", "FASTA file"); + addPositional ("target", "Target identifiers in the FASTA file to extract.\n\ +Line format for amino acid sequences : \n\ +Line format for nucleotide sequences : =1)> = start)> \ +"); + addFlag ("aa", "Amino acid sequenes, otherwise nucleotide"); + version = SVN_REV; + } + + + + void body () const final + { + const string fName = getArg ("fasta"); + const string targetFName = getArg ("target"); + const bool aa = getFlag ("aa"); + + + map> id2segments; + { + LineInput f (targetFName); + string id; + Istringstream iss; + while (f. nextLine ()) + { + iss. reset (f. line); + Segment seg; + iss >> id; + char strand; + if (! aa) + { + iss >> seg. start >> seg. stop >> strand; + QC_ASSERT (seg. start); + QC_ASSERT (seg. start <= seg. stop); + seg. start--; + seg. strand = (strand == '+'); + } + iss >> seg. genesymbol; + seg. name = f. line. substr ((size_t) iss. tellg ()); + trim (seg. name); + QC_ASSERT (aa == ! seg. isDna ()); + id2segments [id] << move (seg); + } + } + if (verbose ()) + for (const auto& it : id2segments) + { + cout << it. first << ": " << endl; + for (const Segment& seg : it. second) + { + cout << " "; + seg. saveText (cout); + cout << endl; + } + } + if (id2segments. empty ()) + return; + + + LineInput f (fName); + string id; + string seq; + while (f. nextLine ()) + { + if (f. line. empty ()) + continue; + if (f. line [0] == '>') + { + process (id, seq, id2segments); + size_t pos = 1; + while (pos < f. line. size () && ! isspace (f. line [pos])) + pos++; + id = f. line. substr (1, pos - 1); + seq. clear (); + } + else + seq += f. line; + } + process (id, seq, id2segments); + } +}; + + + +} // namespace + + + +int main (int argc, + const char* argv[]) +{ + ThisApplication app; + return app. run (argc, argv); +} + + + diff --git a/test.expected b/test.expected deleted file mode 100644 index 3854391..0000000 --- a/test.expected +++ /dev/null @@ -1,10 +0,0 @@ -Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description -blaTEM-156 contig1 101 961 + blaTEM-156 class A beta-lactamase TEM-156 AMR AMR BETA-LACTAM BETA-LACTAM ALLELEP 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -NA contig10 486 1307 + blaOXA class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NF000270.1 class D beta-lactamase -blaPDC-114_blast contig2 1 1191 + blaPDC PDC family class C beta-lactamase AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase -blaOXA-436_partial contig3 101 802 + blaOXA OXA-48 family class D beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF000387.2 OXA-48 family class D beta-lactamase -vanG contig4 101 1147 + vanG D-alanine--D-serine ligase VanG AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG -NA contig8 101 700 + blaTEM TEM family class A beta-lactamase AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NF000531.2 TEM family class A beta-lactamase -NA contig9 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDX 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032895.1 aminoglycoside O-phosphotransferase APH(3'')-Ib -NA contig9 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDX 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NF000295.1 sulfonamide-resistant dihydropteroate synthase Sul2 -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_both.expected b/test_both.expected index 48131fb..c1961a5 100644 --- a/test_both.expected +++ b/test_both.expected @@ -5,10 +5,10 @@ blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamas vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG NA contig04 1261 2391 + blaEC BlaEC family class C beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM BLASTX 377 377 100.00 98.14 377 WP_063610930.1 class C extended-spectrum beta-lactamase EC-15 NA NA NA contig08 101 700 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIAL_CONTIG_ENDX 200 286 69.93 100.00 200 WP_061158039.1 class A beta-lactamase TEM-156 NA NA -aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIAL_CONTIG_ENDP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA NA contig10 486 1307 + blaOXA class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 274 274 100.00 99.64 274 WP_000722315.1 oxacillin-hydrolyzing class D beta-lactamase OXA-9 NA NA -blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA +blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM INTERNAL_STOP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA NA contig14 1 1089 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTX 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA @@ -17,4 +17,4 @@ NA contig15 1 2905 + 23S_A2058T Escherichia azithromycin/erythromycin/telithromy NA contig16 1 720 + nfsA_K141STOP Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA NA contig16 1 720 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTX 240 240 100.00 99.17 240 WP_089631889.1 nitroreductase NfsA NA NA NA contig17 1 247 + ampC_T-14TGT Escherichia cephalosporin resistant ampC core AMR POINT BETA-LACTAM CEPHALOSPORIN POINTN 247 245 100.00 99.19 247 NZ_CP041538.1:1149245-1149489 ampC/blaEC promoter region NA NA -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family nitroimidazole resistance protein NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/test_prot.expected b/test_prot.expected index d78cb36..ea47025 100644 --- a/test_prot.expected +++ b/test_prot.expected @@ -3,11 +3,11 @@ blaTEM-156 contig01 101 961 + blaTEM-156 class A beta-lactamase TEM-156 core AMR blaPDC-114_blast contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTP 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NF000422.6 PDC family class C beta-lactamase blaOXA-436_partial contig03 101 802 + blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase vanG contig04 101 1147 + vanG D-alanine--D-serine ligase VanG core AMR AMR GLYCOPEPTIDE VANCOMYCIN EXACTP 349 349 100.00 100.00 349 WP_063856695.1 D-alanine--D-serine ligase VanG NF000091.3 D-alanine--D-serine ligase VanG -aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 99.56 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase +aph3pp-Ib_partial_5p_neg contig09 1 675 - aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIAL_CONTIG_ENDP 225 275 81.82 100.00 225 WP_109545041.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase sul2_partial_3p_neg contig09 715 1377 - sul2 sulfonamide-resistant dihydropteroate synthase Sul2 core AMR AMR SULFONAMIDE SULFONAMIDE PARTIALP 221 271 81.55 100.00 221 WP_001043265.1 sulfonamide-resistant dihydropteroate synthase Sul2 NA NA blaTEM-internal_stop contig11 113 547 + blaTEM TEM family class A beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 144 286 50.35 97.22 144 WP_000027057.1 class A broad-spectrum beta-lactamase TEM-1 NA NA qacR-curated_blast contig12 71 637 + qacR multidrug-binding transcriptional regulator QacR plus STRESS BIOCIDE QUATERNARY AMMONIUM QUATERNARY AMMONIUM BLASTP 188 188 100.00 99.47 188 ADK23698.1 multidrug-binding transcriptional regulator QacR NA NA emrD3-suppressed-in-vibrio contig13 1 1137 + emrD3 multidrug efflux MFS transporter EmrD-3 plus AMR AMR EFFLUX EFFLUX EXACTP 379 379 100.00 100.00 379 ABQ18953.1 multidrug efflux MFS transporter EmrD-3 NA NA pmrB_C84R contig14 1093 2181 + pmrB_C84R Escherichia colistin resistant PmrB core AMR POINT COLISTIN COLISTIN POINTP 363 363 100.00 99.72 363 WP_001300761.1 two-component system sensor histidine kinase PmrB NA NA -nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN PARTIALP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA -nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 NA NA NA NA NA NA NF000262.1 NimIJ family nitroimidazole resistance protein +nfsA_R15C_K141STOP contig16 1 423 + nfsA_R15C Escherichia nitrofurantoin resistant NfsA core AMR POINT NITROFURAN NITROFURANTOIN POINTP 140 240 58.33 99.29 140 WP_089631889.1 nitroreductase NfsA NA NA +nimIJ_hmm contigX 1 501 + nimIJ NimIJ family nitroimidazole resistance protein core AMR AMR NITROIMIDAZOLE NITROIMIDAZOLE HMM 166 165 98.18 76.54 162 WP_005812825.1 NimIJ family nitroimidazole resistance protein NF000262.1 NimIJ family nitroimidazole resistance protein diff --git a/version.txt b/version.txt index ff313b8..2529892 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.8.4 +3.8.28