@@ -82,11 +82,13 @@ gc_extrap_main(const int argc, const char *argv[]) {
82
82
try {
83
83
const size_t MIN_REQUIRED_COUNTS = 4 ;
84
84
85
+ string outfile;
86
+ string histogram_outfile;
87
+
85
88
int diagonal = 0 ;
86
89
size_t orig_max_terms = 100 ;
87
90
size_t bin_size = 10 ;
88
- bool VERBOSE = false ;
89
- string outfile;
91
+ bool verbose = false ;
90
92
double base_step_size = 1.0e8 ;
91
93
size_t max_width = 10000 ;
92
94
bool SINGLE_ESTIMATE = false ;
@@ -103,16 +105,18 @@ gc_extrap_main(const int argc, const char *argv[]) {
103
105
#endif
104
106
105
107
const string description = R"(
106
- "Extrapolate the size of the covered genome by mapped reads. This \
107
- approach is described in Daley & Smith (2014). The method is the \
108
- same as for lc_extrap: using rational function approximation to \
109
- a power-series expansion for the number of \"unobserved\" bases \
110
- in the initial sample. The gc_extrap method is adapted to deal \
111
- with individual nucleotides rather than distinct reads.)" ;
108
+ Extrapolate the size of the covered genome by mapped reads. This
109
+ approach is described in Daley & Smith (2014). The method is the same
110
+ as for lc_extrap: using rational function approximation to a
111
+ power-series expansion for the number of "unobserved" bases in the
112
+ initial sample. The gc_extrap method is adapted to deal with
113
+ individual nucleotides rather than distinct reads.
114
+ )" ;
115
+ string program_name = fs::path (argv[0 ]).filename ();
116
+ program_name += " " + string (argv[1 ]);
112
117
113
118
// ********* GET COMMAND LINE ARGUMENTS FOR GC EXTRAP **********
114
- OptionParser opt_parse (fs::path (argv[1 ]).filename (), description,
115
- " <input-file>" );
119
+ OptionParser opt_parse (program_name, description, " <input-file>" );
116
120
opt_parse.add_opt (" output" , ' o' ,
117
121
" coverage yield output file (default: stdout)" , false ,
118
122
outfile);
@@ -131,7 +135,9 @@ gc_extrap_main(const int argc, const char *argv[]) {
131
135
c_level);
132
136
opt_parse.add_opt (" terms" , ' x' , " maximum number of terms" , false ,
133
137
orig_max_terms);
134
- opt_parse.add_opt (" verbose" , ' v' , " print more information" , false , VERBOSE);
138
+ opt_parse.add_opt (" verbose" , ' v' , " print more information" , false , verbose);
139
+ opt_parse.add_opt (" hist-out" , ' \0 ' , " output histogram to this file" , false ,
140
+ histogram_outfile);
135
141
opt_parse.add_opt (" bed" , ' B' ,
136
142
" input is in bed format without sequence information" ,
137
143
false , NO_SEQUENCE);
@@ -175,37 +181,40 @@ gc_extrap_main(const int argc, const char *argv[]) {
175
181
176
182
vector<double > coverage_hist;
177
183
size_t n_reads = 0 ;
178
- if (VERBOSE )
184
+ if (verbose )
179
185
cerr << " LOADING READS" << endl;
180
186
181
187
if (NO_SEQUENCE) {
182
- if (VERBOSE )
188
+ if (verbose )
183
189
cerr << " BED FORMAT" << endl;
184
190
n_reads = load_coverage_counts_GR (infile, seed, bin_size, max_width,
185
191
coverage_hist);
186
192
}
187
193
#ifdef HAVE_HTSLIB
188
194
else if (BAM_FORMAT_INPUT) {
189
- if (VERBOSE )
195
+ if (verbose )
190
196
cerr << " BAM_INPUT" << endl;
191
197
n_reads = load_coverage_counts_BAM (n_threads, infile, seed, bin_size,
192
198
max_width, coverage_hist);
193
199
}
194
200
#endif
195
201
else {
196
- if (VERBOSE )
202
+ if (verbose )
197
203
cerr << " MAPPED READ FORMAT" << endl;
198
204
n_reads = load_coverage_counts_MR (infile, seed, bin_size, max_width,
199
205
coverage_hist);
200
206
}
201
207
202
208
const double total_bins = get_counts_from_hist (coverage_hist);
203
209
210
+ if (verbose)
211
+ report_histogram (histogram_outfile, coverage_hist);
212
+
204
213
const double distinct_bins =
205
214
accumulate (cbegin (coverage_hist), cend (coverage_hist), 0.0 );
206
215
207
216
const double avg_bins_per_read = total_bins / n_reads;
208
- double bin_step_size = base_step_size / bin_size;
217
+ const double bin_step_size = base_step_size / bin_size;
209
218
210
219
const size_t max_observed_count = coverage_hist.size () - 1 ;
211
220
@@ -216,7 +225,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
216
225
217
226
orig_max_terms = min (orig_max_terms, first_zero - 1 );
218
227
219
- if (VERBOSE )
228
+ if (verbose )
220
229
cerr << " TOTAL READS = " << n_reads << endl
221
230
<< " BASE STEP SIZE = " << base_step_size << endl
222
231
<< " BIN STEP SIZE = " << bin_step_size << endl
@@ -228,7 +237,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
228
237
<< " MAX COVERAGE COUNT = " << max_observed_count << endl
229
238
<< " COUNTS OF 1 = " << coverage_hist[1 ] << endl;
230
239
231
- if (VERBOSE ) {
240
+ if (verbose ) {
232
241
// OUTPUT THE ORIGINAL HISTOGRAM
233
242
cerr << " OBSERVED BIN COUNTS (" << coverage_hist.size () << " )" << endl;
234
243
for (size_t i = 0 ; i < coverage_hist.size (); i++)
@@ -249,14 +258,14 @@ gc_extrap_main(const int argc, const char *argv[]) {
249
258
throw runtime_error (" Library expected to saturate in doubling of "
250
259
" experiment size, unable to extrapolate" );
251
260
252
- if (VERBOSE )
261
+ if (verbose )
253
262
cerr << " [ESTIMATING COVERAGE CURVE]" << endl;
254
263
255
264
vector<double > coverage_estimates;
256
265
257
266
if (SINGLE_ESTIMATE) {
258
267
bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate (
259
- VERBOSE , allow_defects, coverage_hist, orig_max_terms, diagonal,
268
+ verbose , allow_defects, coverage_hist, orig_max_terms, diagonal,
260
269
bin_step_size, max_extrap / bin_size, coverage_estimates);
261
270
// IF FAILURE, EXIT
262
271
if (!SINGLE_ESTIMATE_SUCCESS)
@@ -265,7 +274,7 @@ gc_extrap_main(const int argc, const char *argv[]) {
265
274
266
275
std::ofstream of;
267
276
if (!outfile.empty ())
268
- of.open (outfile. c_str () );
277
+ of.open (outfile);
269
278
std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
270
279
271
280
out << " TOTAL_BASES\t EXPECTED_DISTINCT" << endl;
@@ -279,24 +288,24 @@ gc_extrap_main(const int argc, const char *argv[]) {
279
288
<< coverage_estimates[i] * bin_size << endl;
280
289
}
281
290
else {
282
- if (VERBOSE )
291
+ if (verbose )
283
292
cerr << " [BOOTSTRAPPING HISTOGRAM]" << endl;
284
293
285
294
const size_t max_iter = 10 * n_bootstraps;
286
295
287
296
vector<vector<double >> bootstrap_estimates;
288
- extrap_bootstrap (VERBOSE , allow_defects, seed, coverage_hist,
297
+ extrap_bootstrap (verbose , allow_defects, seed, coverage_hist,
289
298
n_bootstraps, orig_max_terms, diagonal, bin_step_size,
290
299
max_extrap / bin_size, max_iter, bootstrap_estimates);
291
300
292
- if (VERBOSE )
301
+ if (verbose )
293
302
cerr << " [COMPUTING CONFIDENCE INTERVALS]" << endl;
294
303
vector<double > coverage_upper_ci_lognorm, coverage_lower_ci_lognorm;
295
304
vector_median_and_ci (bootstrap_estimates, c_level, coverage_estimates,
296
305
coverage_lower_ci_lognorm,
297
306
coverage_upper_ci_lognorm);
298
307
299
- if (VERBOSE )
308
+ if (verbose )
300
309
cerr << " [WRITING OUTPUT]" << endl;
301
310
302
311
write_predicted_coverage_curve (
0 commit comments