Skip to content

Commit 87082db

Browse files
src/c_curve.cpp: adding the ability to output the counts histogram
1 parent 8cc2b43 commit 87082db

File tree

1 file changed

+45
-48
lines changed

1 file changed

+45
-48
lines changed

src/c_curve.cpp

Lines changed: 45 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
#include "load_data_for_complexity.hpp"
2626
#include "moment_sequence.hpp"
2727

28-
#include <GenomicRegion.hpp>
2928
#include <OptionParser.hpp>
3029
#include <smithlab_os.hpp>
3130
#include <smithlab_utils.hpp>
@@ -38,17 +37,21 @@
3837
#include <cmath>
3938
#include <cstddef>
4039
#include <cstdint>
41-
#include <cstring>
40+
#include <filesystem>
4241
#include <fstream>
4342
#include <iomanip>
4443
#include <iostream>
4544
#include <numeric>
4645
#include <random>
4746
#include <string>
48-
#include <unordered_map>
4947
#include <vector>
5048

49+
namespace fs = std::filesystem;
50+
51+
using std::accumulate;
5152
using std::array;
53+
using std::cbegin;
54+
using std::cend;
5255
using std::cerr;
5356
using std::endl;
5457
using std::isfinite;
@@ -59,37 +62,33 @@ using std::runtime_error;
5962
using std::setprecision;
6063
using std::size;
6164
using std::string;
62-
using std::to_string;
6365
using std::uint64_t;
64-
using std::unordered_map;
6566
using std::vector;
6667

6768
template <typename T>
6869
T
69-
median_from_sorted_vector(const vector<T> sorted_data, const size_t stride,
70+
median_from_sorted_vector(const vector<T> &sorted_data, const size_t stride,
7071
const size_t n) {
7172
if (n == 0 || sorted_data.empty())
7273
return 0.0;
73-
7474
const size_t lhs = (n - 1) / 2;
7575
const size_t rhs = n / 2;
76-
7776
if (lhs == rhs)
7877
return sorted_data[lhs * stride];
79-
8078
return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0;
8179
}
8280

8381
int
8482
c_curve_main(const int argc, const char *argv[]) {
8583
try {
86-
bool VERBOSE = false;
84+
bool verbose = false;
8785
bool PAIRED_END = false;
8886
bool HIST_INPUT = false;
8987
bool VALS_INPUT = false;
9088
uint64_t seed = 408;
9189

9290
string outfile;
91+
string histogram_outfile;
9392

9493
size_t upper_limit = 0;
9594
double step_size = 1e6;
@@ -99,25 +98,32 @@ c_curve_main(const int argc, const char *argv[]) {
9998
uint32_t n_threads{1};
10099
#endif
101100

102-
const string description = R"(
103-
Generate the complexity curve for data. This does not extrapolate, \
104-
but instead resamples from the given data.)";
101+
const string description =
102+
R"(
103+
Generate the complexity curve for data. This does not extrapolate, but
104+
instead resamples from the given data.
105+
)";
106+
string program_name = fs::path(argv[0]).filename();
107+
program_name += " " + string(argv[1]);
105108

106109
/********** GET COMMAND LINE ARGUMENTS FOR C_CURVE ***********/
107-
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
110+
OptionParser opt_parse(program_name, description, "<input-file>");
108111
opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
109112
false, outfile);
110113
opt_parse.add_opt("step", 's', "step size in extrapolations", false,
111114
step_size);
112-
opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE);
113-
opt_parse.add_opt("pe", 'P', "input is paired end read file", false,
115+
opt_parse.add_opt("verbose", 'v', "print more information", false, verbose);
116+
opt_parse.add_opt("pe", 'P', "input paired end read file", false,
114117
PAIRED_END);
115118
opt_parse.add_opt("hist", 'H',
116-
"input is a text file containing the observed histogram",
117-
false, HIST_INPUT);
118-
opt_parse.add_opt(
119-
"vals", 'V', "input is a text file containing only the observed counts",
120-
false, VALS_INPUT);
119+
"input is text file containing observed histogram", false,
120+
HIST_INPUT);
121+
opt_parse.add_opt("hist-out", '\0',
122+
"output histogram to this file (for non-hist input)",
123+
false, histogram_outfile);
124+
opt_parse.add_opt("vals", 'V',
125+
"input is text file containing only observed counts",
126+
false, VALS_INPUT);
121127
#ifdef HAVE_HTSLIB
122128
opt_parse.add_opt("bam", 'B', "input is in BAM format", false,
123129
BAM_FORMAT_INPUT);
@@ -136,6 +142,7 @@ but instead resamples from the given data.)";
136142
opt_parse.parse(argc - 1, argv + 1, leftover_args);
137143
if (argc == 2 || opt_parse.help_requested()) {
138144
cerr << opt_parse.help_message() << endl;
145+
cerr << opt_parse.about_message() << endl;
139146
return EXIT_SUCCESS;
140147
}
141148
if (opt_parse.about_requested()) {
@@ -154,101 +161,91 @@ but instead resamples from the given data.)";
154161
/******************************************************************/
155162

156163
// Setup the random number generator
157-
srand(time(0) + getpid()); // give the random fxn a new seed
164+
srand(time(0) + getpid()); // random seed
158165
mt19937 rng(seed);
159166

160167
vector<double> counts_hist;
161168
size_t n_reads = 0;
162169

163170
// LOAD VALUES
164171
if (HIST_INPUT) {
165-
if (VERBOSE)
172+
if (verbose)
166173
cerr << "INPUT_HIST" << endl;
167174
n_reads = load_histogram(input_file_name, counts_hist);
168175
}
169176
else if (VALS_INPUT) {
170-
if (VERBOSE)
177+
if (verbose)
171178
cerr << "VALS_INPUT" << endl;
172179
n_reads = load_counts(input_file_name, counts_hist);
173180
}
174181
#ifdef HAVE_HTSLIB
175182
else if (BAM_FORMAT_INPUT && PAIRED_END) {
176-
if (VERBOSE)
183+
if (verbose)
177184
cerr << "PAIRED_END_BAM_INPUT" << endl;
178185
n_reads = load_counts_BAM_pe(n_threads, input_file_name, counts_hist);
179186
}
180187
else if (BAM_FORMAT_INPUT) {
181-
if (VERBOSE)
188+
if (verbose)
182189
cerr << "BAM_INPUT" << endl;
183190
n_reads = load_counts_BAM_se(n_threads, input_file_name, counts_hist);
184191
}
185192
#endif
186193
else if (PAIRED_END) {
187-
if (VERBOSE)
194+
if (verbose)
188195
cerr << "PAIRED_END_BED_INPUT" << endl;
189196
n_reads = load_counts_BED_pe(input_file_name, counts_hist);
190197
}
191198
else { // default is single end bed file
192-
if (VERBOSE)
199+
if (verbose)
193200
cerr << "BED_INPUT" << endl;
194201
n_reads = load_counts_BED_se(input_file_name, counts_hist);
195202
}
196203

197204
const size_t max_observed_count = counts_hist.size() - 1;
198205
const double distinct_reads =
199-
accumulate(begin(counts_hist), end(counts_hist), 0.0);
206+
accumulate(cbegin(counts_hist), cend(counts_hist), 0.0);
200207

201208
const size_t total_reads = get_counts_from_hist(counts_hist);
202209

203210
const size_t distinct_counts =
204-
std::count_if(begin(counts_hist), end(counts_hist),
211+
std::count_if(cbegin(counts_hist), cend(counts_hist),
205212
[](const double x) { return x > 0.0; });
206213

207-
if (VERBOSE)
214+
if (verbose)
208215
cerr << "TOTAL READS = " << n_reads << endl
209216
<< "COUNTS_SUM = " << total_reads << endl
210217
<< "DISTINCT READS = " << distinct_reads << endl
211218
<< "DISTINCT COUNTS = " << distinct_counts << endl
212219
<< "MAX COUNT = " << max_observed_count << endl
213220
<< "COUNTS OF 1 = " << counts_hist[1] << endl;
214221

215-
if (VERBOSE) {
216-
// output the original histogram
217-
cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
218-
for (size_t i = 0; i < counts_hist.size(); i++)
219-
if (counts_hist[i] > 0)
220-
cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
221-
cerr << endl;
222-
}
222+
if (verbose)
223+
report_histogram(histogram_outfile, counts_hist);
223224

224225
if (upper_limit == 0)
225-
upper_limit = n_reads; // set upper limit to equal the number of
226+
upper_limit = n_reads; // set upper limit equal to number of
226227
// molecules
227228

228-
// handles output of c_curve
229+
// setup for output of the complexity curve
229230
std::ofstream of;
230231
if (!outfile.empty())
231-
of.open(outfile.c_str());
232+
of.open(outfile);
232233
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
233234

234235
// prints the complexity curve
235236
out << "total_reads" << "\t" << "distinct_reads" << endl;
236237
out << 0 << '\t' << 0 << endl;
237238
for (size_t i = step_size; i <= upper_limit; i += step_size) {
238-
if (VERBOSE)
239+
if (verbose)
239240
cerr << "sample size: " << i << endl;
240241
out << i << "\t"
241242
<< interpolate_distinct(counts_hist, total_reads, distinct_reads, i)
242243
<< endl;
243244
}
244245
}
245-
catch (runtime_error &e) {
246+
catch (const std::exception &e) {
246247
cerr << "ERROR:\t" << e.what() << endl;
247248
return EXIT_FAILURE;
248249
}
249-
catch (std::bad_alloc &ba) {
250-
cerr << "ERROR: could not allocate memory" << endl;
251-
return EXIT_FAILURE;
252-
}
253250
return EXIT_SUCCESS;
254251
}

0 commit comments

Comments
 (0)