|
23 | 23 | #include <cstring> |
24 | 24 | #include <memory> |
25 | 25 | #include <unordered_map> |
| 26 | +#include <vector> |
26 | 27 |
|
27 | 28 | #include "parquet/column/levels.h" |
28 | 29 | #include "parquet/column/page.h" |
@@ -124,8 +125,12 @@ class PARQUET_EXPORT TypedColumnReader : public ColumnReader { |
124 | 125 | // This API is the same for both V1 and V2 of the DataPage |
125 | 126 | // |
126 | 127 | // @returns: actual number of levels read (see values_read for number of values read) |
127 | | - int64_t ReadBatch(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels, |
128 | | - T* values, int64_t* values_read); |
| 128 | + int64_t ReadBatch(int batch_size, int16_t* def_levels, int16_t* rep_levels, T* values, |
| 129 | + int64_t* values_read); |
| 130 | + |
| 131 | + // Skip reading levels |
| 132 | + // Returns the number of levels skipped |
| 133 | + int64_t Skip(int64_t num_rows_to_skip); |
129 | 134 |
|
130 | 135 | private: |
131 | 136 | typedef Decoder<DType> DecoderType; |
@@ -166,7 +171,7 @@ inline int64_t TypedColumnReader<DType>::ReadBatch(int batch_size, int16_t* def_ |
166 | 171 |
|
167 | 172 | // TODO(wesm): keep reading data pages until batch_size is reached, or the |
168 | 173 | // row group is finished |
169 | | - batch_size = std::min(batch_size, num_buffered_values_); |
| 174 | + batch_size = std::min(batch_size, num_buffered_values_ - num_decoded_values_); |
170 | 175 |
|
171 | 176 | int64_t num_def_levels = 0; |
172 | 177 | int64_t num_rep_levels = 0; |
@@ -201,6 +206,39 @@ inline int64_t TypedColumnReader<DType>::ReadBatch(int batch_size, int16_t* def_ |
201 | 206 | return total_values; |
202 | 207 | } |
203 | 208 |
|
| 209 | +template <typename DType> |
| 210 | +inline int64_t TypedColumnReader<DType>::Skip(int64_t num_rows_to_skip) { |
| 211 | + int64_t rows_to_skip = num_rows_to_skip; |
| 212 | + while (HasNext() && rows_to_skip > 0) { |
| 213 | + // If the number of rows to skip is more than the number of undecoded values, skip the |
| 214 | + // Page. |
| 215 | + if (rows_to_skip > (num_buffered_values_ - num_decoded_values_)) { |
| 216 | + rows_to_skip -= num_buffered_values_ - num_decoded_values_; |
| 217 | + num_decoded_values_ = num_buffered_values_; |
| 218 | + } else { |
| 219 | + // We need to read this Page |
| 220 | + // Jump to the right offset in the Page |
| 221 | + int64_t batch_size = 1024; // ReadBatch with a smaller memory footprint |
| 222 | + int64_t values_read = 0; |
| 223 | + auto vals = std::make_shared<OwnedMutableBuffer>( |
| 224 | + batch_size * type_traits<DType::type_num>::value_byte_size, this->allocator_); |
| 225 | + auto def_levels = std::make_shared<OwnedMutableBuffer>( |
| 226 | + batch_size * sizeof(int16_t), this->allocator_); |
| 227 | + auto rep_levels = std::make_shared<OwnedMutableBuffer>( |
| 228 | + batch_size * sizeof(int16_t), this->allocator_); |
| 229 | + do { |
| 230 | + batch_size = std::min(batch_size, rows_to_skip); |
| 231 | + values_read = |
| 232 | + ReadBatch(batch_size, reinterpret_cast<int16_t*>(def_levels->mutable_data()), |
| 233 | + reinterpret_cast<int16_t*>(rep_levels->mutable_data()), |
| 234 | + reinterpret_cast<T*>(vals->mutable_data()), &values_read); |
| 235 | + rows_to_skip -= values_read; |
| 236 | + } while (values_read > 0 && rows_to_skip > 0); |
| 237 | + } |
| 238 | + } |
| 239 | + return num_rows_to_skip - rows_to_skip; |
| 240 | +} |
| 241 | + |
204 | 242 | typedef TypedColumnReader<BooleanType> BoolReader; |
205 | 243 | typedef TypedColumnReader<Int32Type> Int32Reader; |
206 | 244 | typedef TypedColumnReader<Int64Type> Int64Reader; |
|
0 commit comments