Skip to content

Commit ce52e2b

Browse files
src/gc_extrap.cpp: adding the ability to output the counts histogram
1 parent 87082db commit ce52e2b

File tree

1 file changed

+34
-25
lines changed

1 file changed

+34
-25
lines changed

src/gc_extrap.cpp

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,13 @@ gc_extrap_main(const int argc, const char *argv[]) {
8282
try {
8383
const size_t MIN_REQUIRED_COUNTS = 4;
8484

85+
string outfile;
86+
string histogram_outfile;
87+
8588
int diagonal = 0;
8689
size_t orig_max_terms = 100;
8790
size_t bin_size = 10;
88-
bool VERBOSE = false;
89-
string outfile;
91+
bool verbose = false;
9092
double base_step_size = 1.0e8;
9193
size_t max_width = 10000;
9294
bool SINGLE_ESTIMATE = false;
@@ -103,16 +105,18 @@ gc_extrap_main(const int argc, const char *argv[]) {
103105
#endif
104106

105107
const string description = R"(
106-
"Extrapolate the size of the covered genome by mapped reads. This \
107-
approach is described in Daley & Smith (2014). The method is the \
108-
same as for lc_extrap: using rational function approximation to \
109-
a power-series expansion for the number of \"unobserved\" bases \
110-
in the initial sample. The gc_extrap method is adapted to deal \
111-
with individual nucleotides rather than distinct reads.)";
108+
Extrapolate the size of the covered genome by mapped reads. This
109+
approach is described in Daley & Smith (2014). The method is the same
110+
as for lc_extrap: using rational function approximation to a
111+
power-series expansion for the number of "unobserved" bases in the
112+
initial sample. The gc_extrap method is adapted to deal with
113+
individual nucleotides rather than distinct reads.
114+
)";
115+
string program_name = fs::path(argv[0]).filename();
116+
program_name += " " + string(argv[1]);
112117

113118
// ********* GET COMMAND LINE ARGUMENTS FOR GC EXTRAP **********
114-
OptionParser opt_parse(fs::path(argv[1]).filename(), description,
115-
"<input-file>");
119+
OptionParser opt_parse(program_name, description, "<input-file>");
116120
opt_parse.add_opt("output", 'o',
117121
"coverage yield output file (default: stdout)", false,
118122
outfile);
@@ -131,7 +135,9 @@ gc_extrap_main(const int argc, const char *argv[]) {
131135
c_level);
132136
opt_parse.add_opt("terms", 'x', "maximum number of terms", false,
133137
orig_max_terms);
134-
opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE);
138+
opt_parse.add_opt("verbose", 'v', "print more information", false, verbose);
139+
opt_parse.add_opt("hist-out", '\0', "output histogram to this file", false,
140+
histogram_outfile);
135141
opt_parse.add_opt("bed", 'B',
136142
"input is in bed format without sequence information",
137143
false, NO_SEQUENCE);
@@ -175,37 +181,40 @@ gc_extrap_main(const int argc, const char *argv[]) {
175181

176182
vector<double> coverage_hist;
177183
size_t n_reads = 0;
178-
if (VERBOSE)
184+
if (verbose)
179185
cerr << "LOADING READS" << endl;
180186

181187
if (NO_SEQUENCE) {
182-
if (VERBOSE)
188+
if (verbose)
183189
cerr << "BED FORMAT" << endl;
184190
n_reads = load_coverage_counts_GR(infile, seed, bin_size, max_width,
185191
coverage_hist);
186192
}
187193
#ifdef HAVE_HTSLIB
188194
else if (BAM_FORMAT_INPUT) {
189-
if (VERBOSE)
195+
if (verbose)
190196
cerr << "BAM_INPUT" << endl;
191197
n_reads = load_coverage_counts_BAM(n_threads, infile, seed, bin_size,
192198
max_width, coverage_hist);
193199
}
194200
#endif
195201
else {
196-
if (VERBOSE)
202+
if (verbose)
197203
cerr << "MAPPED READ FORMAT" << endl;
198204
n_reads = load_coverage_counts_MR(infile, seed, bin_size, max_width,
199205
coverage_hist);
200206
}
201207

202208
const double total_bins = get_counts_from_hist(coverage_hist);
203209

210+
if (verbose)
211+
report_histogram(histogram_outfile, coverage_hist);
212+
204213
const double distinct_bins =
205214
accumulate(cbegin(coverage_hist), cend(coverage_hist), 0.0);
206215

207216
const double avg_bins_per_read = total_bins / n_reads;
208-
double bin_step_size = base_step_size / bin_size;
217+
const double bin_step_size = base_step_size / bin_size;
209218

210219
const size_t max_observed_count = coverage_hist.size() - 1;
211220

@@ -216,7 +225,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
216225

217226
orig_max_terms = min(orig_max_terms, first_zero - 1);
218227

219-
if (VERBOSE)
228+
if (verbose)
220229
cerr << "TOTAL READS = " << n_reads << endl
221230
<< "BASE STEP SIZE = " << base_step_size << endl
222231
<< "BIN STEP SIZE = " << bin_step_size << endl
@@ -228,7 +237,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
228237
<< "MAX COVERAGE COUNT = " << max_observed_count << endl
229238
<< "COUNTS OF 1 = " << coverage_hist[1] << endl;
230239

231-
if (VERBOSE) {
240+
if (verbose) {
232241
// OUTPUT THE ORIGINAL HISTOGRAM
233242
cerr << "OBSERVED BIN COUNTS (" << coverage_hist.size() << ")" << endl;
234243
for (size_t i = 0; i < coverage_hist.size(); i++)
@@ -249,14 +258,14 @@ gc_extrap_main(const int argc, const char *argv[]) {
249258
throw runtime_error("Library expected to saturate in doubling of "
250259
"experiment size, unable to extrapolate");
251260

252-
if (VERBOSE)
261+
if (verbose)
253262
cerr << "[ESTIMATING COVERAGE CURVE]" << endl;
254263

255264
vector<double> coverage_estimates;
256265

257266
if (SINGLE_ESTIMATE) {
258267
bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate(
259-
VERBOSE, allow_defects, coverage_hist, orig_max_terms, diagonal,
268+
verbose, allow_defects, coverage_hist, orig_max_terms, diagonal,
260269
bin_step_size, max_extrap / bin_size, coverage_estimates);
261270
// IF FAILURE, EXIT
262271
if (!SINGLE_ESTIMATE_SUCCESS)
@@ -265,7 +274,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
265274

266275
std::ofstream of;
267276
if (!outfile.empty())
268-
of.open(outfile.c_str());
277+
of.open(outfile);
269278
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
270279

271280
out << "TOTAL_BASES\tEXPECTED_DISTINCT" << endl;
@@ -279,24 +288,24 @@ gc_extrap_main(const int argc, const char *argv[]) {
279288
<< coverage_estimates[i] * bin_size << endl;
280289
}
281290
else {
282-
if (VERBOSE)
291+
if (verbose)
283292
cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
284293

285294
const size_t max_iter = 10 * n_bootstraps;
286295

287296
vector<vector<double>> bootstrap_estimates;
288-
extrap_bootstrap(VERBOSE, allow_defects, seed, coverage_hist,
297+
extrap_bootstrap(verbose, allow_defects, seed, coverage_hist,
289298
n_bootstraps, orig_max_terms, diagonal, bin_step_size,
290299
max_extrap / bin_size, max_iter, bootstrap_estimates);
291300

292-
if (VERBOSE)
301+
if (verbose)
293302
cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
294303
vector<double> coverage_upper_ci_lognorm, coverage_lower_ci_lognorm;
295304
vector_median_and_ci(bootstrap_estimates, c_level, coverage_estimates,
296305
coverage_lower_ci_lognorm,
297306
coverage_upper_ci_lognorm);
298307

299-
if (VERBOSE)
308+
if (verbose)
300309
cerr << "[WRITING OUTPUT]" << endl;
301310

302311
write_predicted_coverage_curve(

0 commit comments

Comments
 (0)