25
25
#include " load_data_for_complexity.hpp"
26
26
#include " moment_sequence.hpp"
27
27
28
- #include < GenomicRegion.hpp>
29
28
#include < OptionParser.hpp>
30
29
#include < smithlab_os.hpp>
31
30
#include < smithlab_utils.hpp>
38
37
#include < cmath>
39
38
#include < cstddef>
40
39
#include < cstdint>
41
- #include < cstring >
40
+ #include < filesystem >
42
41
#include < fstream>
43
42
#include < iomanip>
44
43
#include < iostream>
45
44
#include < numeric>
46
45
#include < random>
47
46
#include < string>
48
- #include < unordered_map>
49
47
#include < vector>
50
48
49
+ namespace fs = std::filesystem;
50
+
51
+ using std::accumulate;
51
52
using std::array;
53
+ using std::cbegin;
54
+ using std::cend;
52
55
using std::cerr;
53
56
using std::endl;
54
57
using std::isfinite;
@@ -59,37 +62,33 @@ using std::runtime_error;
59
62
using std::setprecision;
60
63
using std::size;
61
64
using std::string;
62
- using std::to_string;
63
65
using std::uint64_t ;
64
- using std::unordered_map;
65
66
using std::vector;
66
67
67
68
// Median of `n` logical elements stored in `sorted_data`, where logical
// element i lives at index i * stride.
//
// Parameters:
//   sorted_data  values to take the median of; expected to already be in
//                sorted order (this is not checked here).
//   stride       distance, in elements, between consecutive logical entries.
//   n            number of logical entries to consider.
//
// Returns a value-initialized T (zero for arithmetic types) when `n` is 0
// or `sorted_data` is empty. For odd `n` the middle entry is returned; for
// even `n` the mean of the two middle entries is returned, converted to T.
//
// Throws std::out_of_range (via vector::at) if a required index lies past
// the end of `sorted_data`, instead of reading out of bounds.
template <typename T>
T
median_from_sorted_vector(const std::vector<T> &sorted_data,
                          const std::size_t stride, const std::size_t n) {
  if (n == 0 || sorted_data.empty())
    return T{};

  // For odd n both indices coincide; for even n they straddle the middle.
  const std::size_t lhs = (n - 1) / 2;
  const std::size_t rhs = n / 2;

  // Bounds-checked reads: a stride/n combination that does not fit the
  // vector is reported rather than silently invoking undefined behavior.
  const T &lower = sorted_data.at(lhs * stride);
  if (lhs == rhs)
    return lower;

  const T &upper = sorted_data.at(rhs * stride);
  return static_cast<T>((lower + upper) / 2.0);
}
82
80
83
81
int
84
82
c_curve_main (const int argc, const char *argv[]) {
85
83
try {
86
- bool VERBOSE = false ;
84
+ bool verbose = false ;
87
85
bool PAIRED_END = false ;
88
86
bool HIST_INPUT = false ;
89
87
bool VALS_INPUT = false ;
90
88
uint64_t seed = 408 ;
91
89
92
90
string outfile;
91
+ string histogram_outfile;
93
92
94
93
size_t upper_limit = 0 ;
95
94
double step_size = 1e6 ;
@@ -99,25 +98,32 @@ c_curve_main(const int argc, const char *argv[]) {
99
98
uint32_t n_threads{1 };
100
99
#endif
101
100
102
- const string description = R"(
103
- Generate the complexity curve for data. This does not extrapolate, \
104
- but instead resamples from the given data.)" ;
101
+ const string description =
102
+ R"(
103
+ Generate the complexity curve for data. This does not extrapolate, but
104
+ instead resamples from the given data.
105
+ )" ;
106
+ string program_name = fs::path (argv[0 ]).filename ();
107
+ program_name += " " + string (argv[1 ]);
105
108
106
109
/* ********* GET COMMAND LINE ARGUMENTS FOR C_CURVE ***********/
107
- OptionParser opt_parse (strip_path (argv[ 1 ]) , description, " <input-file>" );
110
+ OptionParser opt_parse (program_name , description, " <input-file>" );
108
111
opt_parse.add_opt (" output" , ' o' , " yield output file (default: stdout)" ,
109
112
false , outfile);
110
113
opt_parse.add_opt (" step" , ' s' , " step size in extrapolations" , false ,
111
114
step_size);
112
- opt_parse.add_opt (" verbose" , ' v' , " print more information" , false , VERBOSE );
113
- opt_parse.add_opt (" pe" , ' P' , " input is paired end read file" , false ,
115
+ opt_parse.add_opt (" verbose" , ' v' , " print more information" , false , verbose );
116
+ opt_parse.add_opt (" pe" , ' P' , " input paired end read file" , false ,
114
117
PAIRED_END);
115
118
opt_parse.add_opt (" hist" , ' H' ,
116
- " input is a text file containing the observed histogram" ,
117
- false , HIST_INPUT);
118
- opt_parse.add_opt (
119
- " vals" , ' V' , " input is a text file containing only the observed counts" ,
120
- false , VALS_INPUT);
119
+ " input is text file containing observed histogram" , false ,
120
+ HIST_INPUT);
121
+ opt_parse.add_opt (" hist-out" , ' \0 ' ,
122
+ " output histogram to this file (for non-hist input)" ,
123
+ false , histogram_outfile);
124
+ opt_parse.add_opt (" vals" , ' V' ,
125
+ " input is text file containing only observed counts" ,
126
+ false , VALS_INPUT);
121
127
#ifdef HAVE_HTSLIB
122
128
opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" , false ,
123
129
BAM_FORMAT_INPUT);
@@ -136,6 +142,7 @@ but instead resamples from the given data.)";
136
142
opt_parse.parse (argc - 1 , argv + 1 , leftover_args);
137
143
if (argc == 2 || opt_parse.help_requested ()) {
138
144
cerr << opt_parse.help_message () << endl;
145
+ cerr << opt_parse.about_message () << endl;
139
146
return EXIT_SUCCESS;
140
147
}
141
148
if (opt_parse.about_requested ()) {
@@ -154,101 +161,91 @@ but instead resamples from the given data.)";
154
161
/* *****************************************************************/
155
162
156
163
// Setup the random number generator
157
- srand (time (0 ) + getpid ()); // give the random fxn a new seed
164
+ srand (time (0 ) + getpid ()); // random seed
158
165
mt19937 rng (seed);
159
166
160
167
vector<double > counts_hist;
161
168
size_t n_reads = 0 ;
162
169
163
170
// LOAD VALUES
164
171
if (HIST_INPUT) {
165
- if (VERBOSE )
172
+ if (verbose )
166
173
cerr << " INPUT_HIST" << endl;
167
174
n_reads = load_histogram (input_file_name, counts_hist);
168
175
}
169
176
else if (VALS_INPUT) {
170
- if (VERBOSE )
177
+ if (verbose )
171
178
cerr << " VALS_INPUT" << endl;
172
179
n_reads = load_counts (input_file_name, counts_hist);
173
180
}
174
181
#ifdef HAVE_HTSLIB
175
182
else if (BAM_FORMAT_INPUT && PAIRED_END) {
176
- if (VERBOSE )
183
+ if (verbose )
177
184
cerr << " PAIRED_END_BAM_INPUT" << endl;
178
185
n_reads = load_counts_BAM_pe (n_threads, input_file_name, counts_hist);
179
186
}
180
187
else if (BAM_FORMAT_INPUT) {
181
- if (VERBOSE )
188
+ if (verbose )
182
189
cerr << " BAM_INPUT" << endl;
183
190
n_reads = load_counts_BAM_se (n_threads, input_file_name, counts_hist);
184
191
}
185
192
#endif
186
193
else if (PAIRED_END) {
187
- if (VERBOSE )
194
+ if (verbose )
188
195
cerr << " PAIRED_END_BED_INPUT" << endl;
189
196
n_reads = load_counts_BED_pe (input_file_name, counts_hist);
190
197
}
191
198
else { // default is single end bed file
192
- if (VERBOSE )
199
+ if (verbose )
193
200
cerr << " BED_INPUT" << endl;
194
201
n_reads = load_counts_BED_se (input_file_name, counts_hist);
195
202
}
196
203
197
204
const size_t max_observed_count = counts_hist.size () - 1 ;
198
205
const double distinct_reads =
199
- accumulate (begin (counts_hist), end (counts_hist), 0.0 );
206
+ accumulate (cbegin (counts_hist), cend (counts_hist), 0.0 );
200
207
201
208
const size_t total_reads = get_counts_from_hist (counts_hist);
202
209
203
210
const size_t distinct_counts =
204
- std::count_if (begin (counts_hist), end (counts_hist),
211
+ std::count_if (cbegin (counts_hist), cend (counts_hist),
205
212
[](const double x) { return x > 0.0 ; });
206
213
207
- if (VERBOSE )
214
+ if (verbose )
208
215
cerr << " TOTAL READS = " << n_reads << endl
209
216
<< " COUNTS_SUM = " << total_reads << endl
210
217
<< " DISTINCT READS = " << distinct_reads << endl
211
218
<< " DISTINCT COUNTS = " << distinct_counts << endl
212
219
<< " MAX COUNT = " << max_observed_count << endl
213
220
<< " COUNTS OF 1 = " << counts_hist[1 ] << endl;
214
221
215
- if (VERBOSE) {
216
- // output the original histogram
217
- cerr << " OBSERVED COUNTS (" << counts_hist.size () << " )" << endl;
218
- for (size_t i = 0 ; i < counts_hist.size (); i++)
219
- if (counts_hist[i] > 0 )
220
- cerr << i << ' \t ' << static_cast <size_t >(counts_hist[i]) << endl;
221
- cerr << endl;
222
- }
222
+ if (verbose)
223
+ report_histogram (histogram_outfile, counts_hist);
223
224
224
225
if (upper_limit == 0 )
225
- upper_limit = n_reads; // set upper limit to equal the number of
226
+ upper_limit = n_reads; // set upper limit equal to number of
226
227
// molecules
227
228
228
- // handles output of c_curve
229
+ // setup for output of the complexity curve
229
230
std::ofstream of;
230
231
if (!outfile.empty ())
231
- of.open (outfile. c_str () );
232
+ of.open (outfile);
232
233
std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
233
234
234
235
// prints the complexity curve
235
236
out << " total_reads" << " \t " << " distinct_reads" << endl;
236
237
out << 0 << ' \t ' << 0 << endl;
237
238
for (size_t i = step_size; i <= upper_limit; i += step_size) {
238
- if (VERBOSE )
239
+ if (verbose )
239
240
cerr << " sample size: " << i << endl;
240
241
out << i << " \t "
241
242
<< interpolate_distinct (counts_hist, total_reads, distinct_reads, i)
242
243
<< endl;
243
244
}
244
245
}
245
- catch (runtime_error &e) {
246
+ catch (const std:: exception &e) {
246
247
cerr << " ERROR:\t " << e.what () << endl;
247
248
return EXIT_FAILURE;
248
249
}
249
- catch (std::bad_alloc &ba) {
250
- cerr << " ERROR: could not allocate memory" << endl;
251
- return EXIT_FAILURE;
252
- }
253
250
return EXIT_SUCCESS;
254
251
}
0 commit comments