Skip to content

Commit

Permalink
Add command to extract Neff scores for MSA (#647)
Browse files Browse the repository at this point in the history
Add command to extract Neff scores for an MSA
  • Loading branch information
neftlon authored Jan 30, 2023
1 parent a5d4965 commit 4148e09
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ extern int prefilter(int argc, const char **argv, const Command& command);
extern int prefixid(int argc, const char **argv, const Command& command);
extern int profile2cs(int argc, const char **argv, const Command& command);
extern int profile2pssm(int argc, const char **argv, const Command& command);
extern int profile2neff(int argc, const char **argv, const Command& command);
extern int profile2consensus(int argc, const char **argv, const Command& command);
extern int profile2repseq(int argc, const char **argv, const Command& command);
extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
Expand Down
7 changes: 7 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,13 @@ std::vector<Command> baseCommands = {
"<i:profileDB> <o:pssmFile>",
CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
{"pssmFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
{"profile2neff", profile2neff, &par.profile2neff, COMMAND_PROFILE,
"Convert a profile DB to a tab-separated list of Neff scores",
NULL,
"Johannes Spies <johannes.spies@tum.de>",
"<I:profileDB> <o:neffFile>",
CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
{"neffFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
{"profile2consensus", profile2consensus, &par.profile2seq, COMMAND_PROFILE,
"Extract consensus sequence DB from a profile DB",
NULL,
Expand Down
1 change: 1 addition & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -1111,6 +1111,7 @@ class Parameters {
std::vector<MMseqsParameter*> renamedbkeys;
std::vector<MMseqsParameter*> createtaxdb;
std::vector<MMseqsParameter*> profile2pssm;
std::vector<MMseqsParameter*> profile2neff;
std::vector<MMseqsParameter*> profile2seq;
std::vector<MMseqsParameter*> besthitbyset;
std::vector<MMseqsParameter*> combinepvalbyset;
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ set(util_source_files
util/pairaln.cpp
util/prefixid.cpp
util/profile2pssm.cpp
util/profile2neff.cpp
util/profile2seq.cpp
util/result2dnamsa.cpp
util/result2flat.cpp
Expand Down
79 changes: 79 additions & 0 deletions src/util/profile2neff.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include "Parameters.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Util.h"
#include "Debug.h"
#include "Sequence.h"
#include "SubstitutionMatrix.h"
#include "itoa.h"

#ifdef OPENMP
#include <omp.h>
#endif

int profile2neff(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_PROFILE);

DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

const bool isDbOutput = par.dbOut;
const bool shouldCompress = isDbOutput == true && par.compressed == true;
const int dbType = isDbOutput == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE;
DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, shouldCompress, dbType);
writer.open();

SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0f, 0.0);

size_t entries = reader.getSize();
Debug::Progress progress(entries);
#pragma omp parallel
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());
#endif

Sequence seq(par.maxSeqLen, Parameters::DBTYPE_HMM_PROFILE, &subMat, 0, false, par.compBiasCorrection, false);

char buffer[256];
std::string result;
result.reserve(10 * 1024 * 1024);

#pragma omp for schedule(dynamic, 1000)
for (size_t i = 0; i < entries; ++i) {
progress.updateProgress();

unsigned int key = reader.getDbKey(i);
seq.mapSequence(i, key, reader.getData(i, thread_idx), reader.getSeqLen(i));

if (isDbOutput == false) {
result.append("Neff_Ms of sequence ");
Itoa::u32toa_sse2(key, buffer);
result.append(buffer);
result.push_back('\n');
}

// add a line containing Neff_M scores of `seq`
for(int j = 0; j < seq.L; ++j) {
// TODO(johannes): Here, the precision for the floating point conversion is not specified and set to 2
// digits. Consider setting the precision using a commandline parameter.
snprintf(buffer, sizeof(buffer), "%0.4f", seq.neffM[j]);
result.append(buffer);
if(j < (seq.L - 1)) result.push_back('\t');
}
result.push_back('\n');

writer.writeData(result.c_str(), result.length(), key, thread_idx, isDbOutput);
result.clear();
}
}
writer.close(isDbOutput == false);
if (isDbOutput == false) {
remove(par.db2Index.c_str());
}
reader.close();

return EXIT_SUCCESS;
}

0 comments on commit 4148e09

Please sign in to comment.