Skip to content

Commit

Permalink
Merge pull request #234 from s4hts/trim-bug
Browse files Browse the repository at this point in the history
fixes ntrimmer bug basepair output growing in size
  • Loading branch information
joe-angell authored Oct 12, 2020
2 parents caee236 + 9c10e42 commit 438994d
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cmake/version.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required (VERSION 3.2)

execute_process(
COMMAND git describe --tags
COMMAND git describe --tags --dirty
OUTPUT_VARIABLE HTSTREAM_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE result
Expand Down
24 changes: 13 additions & 11 deletions common/src/read.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,18 @@ class Read {
std::vector<std::string> get_comment() const { return comments;}
static char complement(char bp);

const std::string get_sub_seq() const { return cut_R < cut_L ? "N" : seq.substr(cut_L, cut_R - cut_L); }
const std::string get_sub_qual() const { return cut_R < cut_L ? "#" : qual.substr(cut_L, cut_R - cut_L); }
const std::string get_sub_seq() const { return cut_R <= cut_L ? "N" : seq.substr(cut_L, cut_R - cut_L); }
const std::string get_sub_qual() const { return cut_R <= cut_L ? "#" : qual.substr(cut_L, cut_R - cut_L); }


const std::string get_seq_rc() const { if (cut_R < cut_L) { return "N"; }
const std::string get_seq_rc() const { if (cut_R <= cut_L) { return "N"; }
std::string s = seq.substr(cut_L, cut_R - cut_L) ;
std::transform(begin(s), end(s), begin(s), complement);
std::reverse(begin(s), end(s));
return s; }


const std::string get_qual_rc() const { if (cut_R < cut_L) { return "#"; }
const std::string get_qual_rc() const { if (cut_R <= cut_L) { return "#"; }
std::string q = qual.substr(cut_L, cut_R - cut_L);
std::reverse(begin(q), end(q));
return q; }
Expand All @@ -90,7 +90,7 @@ class Read {
void add_comment( const std::string& tag ) { if (tag != "") comments.push_back(tag);}
void join_comment( const std::vector<std::string>& new_comments, size_t offset = 0) { comments.insert(comments.end(), new_comments.begin() + offset, new_comments.end()); }
void set_read_rc() {
if (cut_R < cut_L) {
if (cut_R <= cut_L) {
return;
}
std::string s = seq.substr(cut_L, cut_R - cut_L) ;
Expand All @@ -105,13 +105,15 @@ class Read {
void changeSeq( size_t loc, char bp ) { seq[loc] = bp; }
void changeQual( size_t loc, char score ) {qual[loc] = score; }

void setRCut( size_t cut_R_ ) { cut_R = cut_R_;}
void setLCut( size_t cut_L_ ) { cut_L = cut_L_; }
void setRCut( size_t cut_R_ ) { assert(cut_R_ <= length); cut_R = cut_R_; }
void setLCut( size_t cut_L_ ) { assert(cut_L_ <= length); cut_L = cut_L_; }
bool getDiscard() const { return discard; }
void setDiscard() { discard = true; }
size_t getLength() const { return length; }
size_t getLengthTrue() const { return cut_R < cut_L ? 0 : cut_R - cut_L; }
size_t getLengthTrue() const { return cut_R <= cut_L ? 1 : cut_R - cut_L; }
// number of bp that are trimmed off left side
size_t getLTrim() const { return cut_L; }
// number of bp that are trimmed off right side
size_t getRTrim() const { return length - cut_R; }
};

Expand All @@ -128,7 +130,7 @@ class ReadBase {

Reads& get_reads_non_const() { return reads; }
const Reads& get_reads() const { return reads; }

virtual boost::optional<boost::dynamic_bitset<>> get_key(size_t start, size_t length) = 0;
static boost::optional<BitSet> str_to_bit(const std::string& StrKey) {
// converts a string to a 2bit representation: A:00, T:11, C:01, G:10
Expand Down Expand Up @@ -216,12 +218,12 @@ class SingleEndRead: public ReadBase {
SingleEndRead(const ReadPtr& one_) : SingleEndRead() {
one = one_;
reads[0] = one;
}
}
SingleEndRead(const std::vector<ReadPtr> reads_) {
reads = reads_;
one = reads[0];
}

virtual boost::optional<BitSet> get_key(size_t start, size_t length);
Read& non_const_read_one() { return *one; }
const Read& get_read() const { return *one; }
Expand Down
16 changes: 16 additions & 0 deletions hts_CutTrim/test/hts_TestCutTrim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,19 @@ TEST_F(CutTrimTest, PETrim) {
}
ASSERT_EQ("Read2\tCTTGGGTCCTATGGTATCGGTACTGGTACT\t##############################\tRead1\tTGACATTAAGCAAGTACCAGTACCGATACC\t##############################\n", out1->str());
};


TEST_F(CutTrimTest, RightTrim) {
std::istringstream in1(readData_1);
std::istringstream in2(readData_2);

InputReader<PairedEndRead, PairedEndReadFastqImpl> ifp(in1, in2);

while(ifp.has_next()) {
auto i = ifp.next();
PairedEndRead *per = dynamic_cast<PairedEndRead*>(i.get());
ct.cut_trim(per->non_const_read_one(), 0, 2);
ASSERT_EQ("TGACTTGACATTAAGCAAGTACCAGTACCGATACCATAGGACCCAAGG", (per->non_const_read_one()).get_sub_seq());
ASSERT_EQ(per->get_read_one().getRTrim(), 2u);
}
};
7 changes: 6 additions & 1 deletion hts_NTrimmer/src/hts_NTrimmer.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class NTrimmer: public MainTemplate<NTrimCounters, NTrimmer> {
size_t i = 0;

for (std::string::iterator it = seq.begin(); it != seq.end(); ++it) {
i = static_cast<size_t>(it - seq.begin() );
i = it - seq.begin();

if (*it == 'N') {
if (exclude) {
Expand All @@ -82,6 +82,11 @@ class NTrimmer: public MainTemplate<NTrimCounters, NTrimmer> {
bestLeft = currentLeft;
}
}
// don't change rcut if sequence is empty
if (seq.length() == 0) {
return true;
}

rb.setLCut(bestLeft);
rb.setRCut(bestRight+1);
return true;
Expand Down
42 changes: 42 additions & 0 deletions hts_NTrimmer/test/hts_TestNTrimmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class TrimN : public ::testing::Test {
const std::string readData_8 = "@Read1\nNNNNNNNNNNNNNNNNNNNNNNN\n+\n#######################\n";
const std::string readData_9a = "@Read1\nCTGACTGACTGANNNACTGACTGACTGNCTGACTG\n+\n###################################\n";
const std::string readData_9b = "@Read1\nCTGACTGACTGANNNACTGACTGACTGANTGACTG\n+\n###################################\n";
const std::string readData_10a = "@Read1\nN\n+\n#\n";
const std::string readData_10b = "@Read1\nACTNNNTGCT\n+\n##########\n";
const std::string readData_empty_tab = "D00689:146:C9B2EANXX:8:2303:19367:3733#GATCAC ";
NTrimmer nt;
};

Expand Down Expand Up @@ -164,3 +167,42 @@ TEST_F(TrimN, longN) {
ASSERT_EQ("ACTGACTGACTGA", (per->non_const_read_two()).get_sub_seq());
}
};

TEST_F(TrimN, shortN) {
std::istringstream in1(readData_10a);
std::istringstream in2(readData_10b);

InputReader<PairedEndRead, PairedEndReadFastqImpl> ifp(in1, in2);

while(ifp.has_next()) {
auto i = ifp.next();
PairedEndRead *per = dynamic_cast<PairedEndRead*>(i.get());
nt.trim_n(per->non_const_read_one(), false);
nt.trim_n(per->non_const_read_two(), false);
ASSERT_EQ("N", (per->non_const_read_one()).get_sub_seq());
ASSERT_EQ(0u, (per->get_read_one().getRTrim()));
ASSERT_EQ(1u, (per->get_read_one().getLengthTrue()));
ASSERT_EQ(per->get_read_one().getLength(),per->get_read_one().getLengthTrue());

ASSERT_EQ("TGCT", (per->non_const_read_two()).get_sub_seq());
ASSERT_EQ(0u, (per->get_read_two().getRTrim()));
ASSERT_EQ(6u, (per->get_read_two().getLTrim()));
ASSERT_EQ(10u, per->get_read_two().getLength());
ASSERT_EQ(4u, per->get_read_two().getLengthTrue());
}
};

TEST_F(TrimN, emptyTab) {
std::istringstream in1(readData_empty_tab);

InputReader<ReadBase, TabReadImpl> ifp(in1);

while(ifp.has_next()) {
auto i = ifp.next();
SingleEndRead *per = dynamic_cast<SingleEndRead*>(i.get());
nt.trim_n(per->non_const_read_one(), false);
ASSERT_EQ(0u, per->get_read().getLength());
ASSERT_EQ(1u, per->get_read().getLengthTrue());
ASSERT_EQ(0u, per->get_read().getRTrim());
}
};

0 comments on commit 438994d

Please sign in to comment.