Skip to content

Commit

Permalink
Merge pull request #116 from ncbi/dev
Browse files Browse the repository at this point in the history
Release 3.11.8
- Performance improvements by optimizing blast parameters
    - Faster 70% single-threaded on nucleotide-only run
    - Faster by 64% on single-threaded protein-only run
    - Faster by 58% on single-threaded combined run
- Fixed handling for FASTA identifiers with leading underscore "_" (
- Empty NUC_FASTA_OUT #115)
- Improved handling of special characters in GFF files
- Added --annotation_format standard
  • Loading branch information
evolarjun authored Apr 10, 2023
2 parents f2840d9 + fdb4168 commit 2bc24b6
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 235 deletions.
12 changes: 8 additions & 4 deletions amr_report.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ struct BlastAlignment : Alignment
nident++;

if (! targetProt)
cdss << move (Locus (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0, string (), string ()));
cdss. emplace_back (0, targetName, targetStart, targetEnd, targetStrand, partialDna, 0, string (), string ());

if (const Vector<AmrMutation>* refMutations = findPtr (accession2mutations, refAccession))
{
Expand Down Expand Up @@ -561,7 +561,6 @@ struct BlastAlignment : Alignment
QC_IMPLY (targetProt, targetAlign == targetAlign_aa);
QC_IMPLY (! targetProt, targetAlign == 3 * targetAlign_aa);
QC_ASSERT (nident <= targetAlign_aa);
//QC_IMPLY (! targetProt, cdss. empty ());
QC_IMPLY (! refAccession. empty (), targetAlign_aa <= targetSeq. size ());
QC_IMPLY (! refMutation. empty (), isMutationProt ());
#if 0
Expand All @@ -571,6 +570,11 @@ struct BlastAlignment : Alignment
|| cds. size () == 3 * targetLen
);
#endif
if (! targetProt)
{
for (const Locus& cds : cdss)
QC_ASSERT (cds. contig == targetName);
}
QC_IMPLY (! seqChanges. empty (), isMutationProt ());
}
void report (ostream& os,
Expand Down Expand Up @@ -1434,7 +1438,7 @@ struct Batch
QC_ASSERT (pos > 0);
replace (organism_, '_', ' ');
if (organism_ == organism)
accession2mutations [accession] << move (AmrMutation ((size_t) pos, geneMutation, classS, subclass, name));
accession2mutations [accession]. emplace_back ((size_t) pos, geneMutation, classS, subclass, name);
else
alien_prots << accession;
}
Expand Down Expand Up @@ -1478,7 +1482,7 @@ struct Batch
{
if (contains (accession2susceptible, accession))
throw runtime_error ("Duplicate protein accession " + accession + " in " + susceptible_tab);
accession2susceptible [accession] = move (Susceptible (genesymbol, cutoff, classS, subclass, name));
accession2susceptible [accession] = move (Susceptible (genesymbol, cutoff, classS, subclass, name));
}
else
alien_prots << accession;
Expand Down
40 changes: 27 additions & 13 deletions amrfinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@
* Dependencies: NCBI BLAST, HMMer
*
* Release changes:
* 04/05/2023 PD-4522 blastp -task blastp-fast
* 04/05/2023 PD-4548 "-a standard" is added
* 3.11.8 04/01/2023 fasta_extract.cpp checks whether all requested identifiers are found in FASTA
* 3.11.7 03/30/2023 PD-4548 GFF parsing processes '%<hex>' characters
* 3.11.6 03/21/2023 PD-4522 tblastn: -word_size 3 --> -task tblastn-fast -threshold 100 -window_size 15
* 03/21/2023 PD-4533 '_' are incorrectly trimmed from contig names
* 03/13/2023 -mt_mode is restored for __APPLE__
* 3.11.5 03/10/2023 directories --blast_bin, directory DATABASE in amrfinder_index.cpp do not need to end with '/'
* 03/01/2023 PD-3597 amrfinder_index
* 02/27/2023 section()
* 3.11.4 01/24/2023 GPipe organism string in taxgroup.tab is a comma-separated list of GPipe organisms
Expand Down Expand Up @@ -282,7 +291,7 @@ struct ThisApplication : ShellApplication
addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA");
addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA");
addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=<id>' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE");
addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); // For compatibility ??
addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); // = --annotation_format pgap
addKey ("annotation_format", "Type of GFF file: " + Gff::names. toString (", "), "genbank", 'a', "ANNOTATION_FORMAT");
addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR");
addKey ("ident_min", "Minimum proportion of identical amino acids in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT");
Expand Down Expand Up @@ -386,10 +395,10 @@ struct ThisApplication : ShellApplication
return string ();

string s (" -num_threads " + to_string (t));
#ifndef __APPLE__
//#ifndef __APPLE__
if (mt_modeP)
s += " -mt_mode 1";
#endif
//#endif

return s;
}
Expand Down Expand Up @@ -567,8 +576,7 @@ struct ThisApplication : ShellApplication
blast_bin = string (s);
if (! blast_bin. empty ())
{
if (! isRight (blast_bin, "/"))
blast_bin += "/";
addDirSlash (blast_bin);
prog2dir ["blastp"] = blast_bin;
prog2dir ["blastx"] = blast_bin;
prog2dir ["tblastn"] = blast_bin;
Expand Down Expand Up @@ -825,7 +833,7 @@ struct ThisApplication : ShellApplication


// PD-2967
const string blastp_par ("-comp_based_stats 0 -evalue 1e-10 -seg no -max_target_seqs 10000");
const string blastp_par ("-comp_based_stats 0 -evalue 1e-10 -seg no -max_target_seqs 10000"); // PAR
// was: -culling_limit 20 // PD-2967
if (! emptyArg (prot))
{
Expand Down Expand Up @@ -917,7 +925,8 @@ struct ThisApplication : ShellApplication
const Chronometer_OnePass cop ("blastp", cerr, false, qc_on && ! quiet);
// " -task blastp-fast -word_size 6 -threshold 21 " // PD-2303
exec (fullProg ("blastp") + " -query " + prot1 + " -db " + tmp + "/db/AMRProt" + " "
+ blastp_par + get_num_threads_param ("blastp", min (nProt, protLen_total / 10000)) + " " BLAST_FMT " -out " + tmp + "/blastp > /dev/null 2> " + tmp + "/blastp-err", tmp + "/blastp-err");
+ blastp_par + " -task blastp-fast" // "-threshold 100 -window_size 15" are faster, but may miss hits, see SB-3643
+ get_num_threads_param ("blastp", min (nProt, protLen_total / 10000)) + " " BLAST_FMT " -out " + tmp + "/blastp > /dev/null 2> " + tmp + "/blastp-err", tmp + "/blastp-err");
}

stderr. section ("Running hmmsearch");
Expand Down Expand Up @@ -966,14 +975,15 @@ struct ThisApplication : ShellApplication
size_t dnaLen_max = 0;
size_t dnaLen_total = 0;
EXEC_ASSERT (fastaCheck (dna, false, qcS, logFName, nDna, dnaLen_max, dnaLen_total));
const string blastx (dnaLen_max > 100000 ? "tblastn" : "blastx"); // PAR
const string blastx (/*"tblastn"*/ dnaLen_max > 100000 ? "tblastn" : "blastx"); // PAR // SB-3643

stderr. section ("Running " + blastx);
findProg (blastx);
{
const Chronometer_OnePass cop (blastx, cerr, false, qc_on && ! quiet);
const string tblastn_par (blastp_par + " -word_size 3"); // -max_target_seqs 10000
const string blastx_par (tblastn_par + " -query_gencode " + to_string (gencode));
const string tblastn_par (blastp_par + " -task tblastn-fast -threshold 100 -window_size 15"); // SB-3643, PD-4522
//const string tblastn_par (blastp_par + " -word_size 3");
const string blastx_par (blastp_par + " -word_size 3 -query_gencode " + to_string (gencode));
ASSERT (threads_max >= 1);
if (blastx == "blastx")
exec (fullProg ("blastx") + " -query " + dna + " -db " + tmp + "/db/AMRProt" + " "
Expand Down Expand Up @@ -1156,9 +1166,13 @@ struct ThisApplication : ShellApplication
}


// timing the run
const time_t end = time (NULL);
stderr << "AMRFinder took " << end - start << " seconds to complete\n";
//if (! quiet)
{
// timing the run
const time_t end = time (NULL);
//const OColor oc (cerr, Color::magenta, false, true);
stderr/*cerr*/ << "AMRFinder took " << end - start << " seconds to complete\n";
}
}
};

Expand Down
25 changes: 15 additions & 10 deletions amrfinder_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct ThisApplication : ShellApplication
: ShellApplication ("Index the database for AMRFinder", true, true, true)
{
addPositional ("DATABASE", "Directory with AMRFinder database");
addKey ("blast_bin", "Directory for BLAST ending with '/'", "", '\0', "BLAST_DIR");
addKey ("blast_bin", "Directory for BLAST", "", '\0', "BLAST_DIR");
addFlag ("quiet", "Suppress messages to STDERR", 'q');
version = SVN_REV;
}
Expand All @@ -72,10 +72,15 @@ struct ThisApplication : ShellApplication

void shellBody () const final
{
const string dbDir = getArg ("DATABASE");
const string blast_bin = getArg ("blast_bin");

string dbDir = getArg ("DATABASE");
string blast_bin = getArg ("blast_bin");
const bool quiet = getFlag ("quiet");


addDirSlash (dbDir);
addDirSlash (blast_bin);


Stderr stderr (quiet);
stderr << "Running: "<< getCommandLine () << '\n';
Expand All @@ -84,10 +89,7 @@ struct ThisApplication : ShellApplication

if (! directoryExists (dbDir))
throw runtime_error ("Database directory " + dbDir + " does not exist");

const Dir dir (dbDir);
const string dbDirS (dir. get () + "/");


if (! blast_bin. empty ())
prog2dir ["makeblastdb"] = blast_bin;
findProg ("makeblastdb");
Expand All @@ -97,7 +99,8 @@ struct ThisApplication : ShellApplication
// Cf. amrfinder_update.cpp
StringVector dnaPointMuts;
{
LineInput f (dbDirS + "taxgroup.tab");
LineInput f (dbDir + "taxgroup.tab");

while (f. nextLine ())
{
if (isLeft (f. line, "#"))
Expand All @@ -114,8 +117,10 @@ struct ThisApplication : ShellApplication


stderr << "Indexing" << "\n";
exec (fullProg ("hmmpress") + " -f " + shellQuote (dbDirS + "AMR.LIB") + " > /dev/null 2> " + tmp + "/hmmpress.err", tmp + "/hmmpress.err");
setSymlink (dbDirS, tmp + "/db", true);

exec (fullProg ("hmmpress") + " -f " + shellQuote (dbDir + "AMR.LIB") + " > /dev/null 2> " + tmp + "/hmmpress.err", tmp + "/hmmpress.err");
setSymlink (dbDir, tmp + "/db", true);

exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMRProt" + " -dbtype prot -logfile " + tmp + "/makeblastdb.AMRProt", tmp + "/makeblastdb.AMRProt");
exec (fullProg ("makeblastdb") + " -in " + tmp + "/db/AMR_CDS" + " -dbtype nucl -logfile " + tmp + "/makeblastdb.AMR_CDS", tmp + "/makeblastdb.AMR_CDS");
for (const string& dnaPointMut : dnaPointMuts)
Expand Down
15 changes: 9 additions & 6 deletions amrfinder_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
* File Description:
* Updating of AMRFinder data
*
* Dependencies: NCBI BLAST, HMMer --> moved to amrfinder_index.cpp
* curl.{h,c}
* Dependencies: curl.{h,c}
*
* Release changes: see amrfinder.cpp
*
Expand Down Expand Up @@ -315,7 +314,7 @@ Requirement: the database directory contains subdirectories named by database ve
", false, true, true)
{
addKey ("database", "Directory for all versions of AMRFinder databases", "$BASE/data", 'd', "DATABASE_DIR");
addKey ("blast_bin", "Directory for BLAST ending with '/'", "", '\0', "BLAST_DIR");
addKey ("blast_bin", "Directory for BLAST", "", '\0', "BLAST_DIR");
addFlag ("force_update", "Force updating the AMRFinder database"); // PD-3469
addFlag ("quiet", "Suppress messages to STDERR", 'q');
version = SVN_REV;
Expand Down Expand Up @@ -345,10 +344,13 @@ Requirement: the database directory contains subdirectories named by database ve
void shellBody () const final
{
const string mainDirOrig = getArg ("database");
const string blast_bin = getArg ("blast_bin");
string blast_bin = getArg ("blast_bin");
const bool force_update = getFlag ("force_update");
const bool quiet = getFlag ("quiet");


addDirSlash (blast_bin);


Stderr stderr (quiet);
stderr << "Running: "<< getCommandLine () << '\n';
Expand Down Expand Up @@ -412,8 +414,9 @@ Requirement: the database directory contains subdirectories named by database ve
const Dir mainDir (mainDirOrig);
mainDirS = mainDir. get ();
}
if (! isRight (mainDirS, "/"))
mainDirS += "/";

addDirSlash (mainDirS);


const string versionFName ("version.txt");
const string urlDir (URL + load_minor + "/" + load_data_version + "/");
Expand Down
56 changes: 50 additions & 6 deletions common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,8 @@ void throwf (const string &s)
const Xml::Tag xml (*cxml, "ERROR");
*cxml << s;
}
throw runtime_error (s + "\nStack:\n" + getStack ());
throw logic_error (s + "\nStack:\n" + getStack ());

}


Expand Down Expand Up @@ -871,6 +872,49 @@ void reverse (string &s)



string unpercent (const string &s)
{
for (const char c : s)
if (! printable (c))
throwf (FUNC "Non-printable character: " + to_string (int (c)));

string r;
constexpr size_t hex_pos_max = 2;
size_t hex_pos = no_index;
uchar hex = 0;
for (const char c : s)
if (hex_pos == no_index)
{
if (c == '%')
hex_pos = 0;
else
r += c;
}
else
{
ASSERT (hex_pos < hex_pos_max);
if (! isHex (c))
throwf (FUNC "Bad hexadecimal character: " + to_string (c));
hex = uchar (hex + hex2uchar (c));
if (! hex_pos)
{
ASSERT (hex < 16);
hex = uchar (hex * 16);
}
hex_pos++;
if (hex_pos == hex_pos_max)
{
r += char (hex);
hex_pos = no_index;
hex = 0;
}
}

return r;
}



List<string> str2list (const string &s,
char c)
{
Expand Down Expand Up @@ -3550,7 +3594,7 @@ int Application::run (int argc,

seed_global = str2<ulong> (getArg ("seed"));
if (! seed_global)
throw runtime_error ("Seed cannot be 0");
throw runtime_error ("seed cannot be 0");

jsonFName = getArg ("json");
ASSERT (! jRoot);
Expand Down Expand Up @@ -3630,7 +3674,7 @@ void ShellApplication::initEnvironment ()
execDir = getProgramDirName ();
if (execDir. empty ())
execDir = which (programArgs. front ());
if (! isRight (execDir, "/"))
if (! isDirName (execDir))
throw logic_error ("Cannot identify the directory of the executable");
{
string s (programArgs. front ());
Expand Down Expand Up @@ -3700,7 +3744,7 @@ void ShellApplication::findProg (const string &progName) const
{
ASSERT (! progName. empty ());
ASSERT (! contains (progName, '/'));
ASSERT (isRight (execDir, "/"));
ASSERT (isDirName (execDir));

string dir;
if (! find (prog2dir, progName, dir))
Expand All @@ -3714,7 +3758,7 @@ void ShellApplication::findProg (const string &progName) const
prog2dir [progName] = dir;
}

ASSERT (isRight (dir, "/"));
ASSERT (isDirName (dir));
}


Expand All @@ -3724,7 +3768,7 @@ string ShellApplication::fullProg (const string &progName) const
string dir;
if (! find (prog2dir, progName, dir))
throw runtime_error ("Program " + shellQuote (progName) + " is not found");
ASSERT (isRight (dir, "/"));
ASSERT (isDirName (dir));
return shellQuote (dir + progName) + " ";
}
#endif
Expand Down
Loading

0 comments on commit 2bc24b6

Please sign in to comment.