Skip to content

Commit 76ec50b

Browse files
authored
Fix page reader and support case-sensitivity in struct reader (#290)
1 parent 8d334db commit 76ec50b

14 files changed, with 111 additions and 23 deletions.

velox/dwio/parquet/reader/PageReader.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -276,6 +276,10 @@ void PageReader::prepareDataPageV1(const PageHeader& pageHeader, int64_t row) {
276276
pageData_,
277277
pageData_ + defineLength,
278278
arrow::bit_util::NumRequiredBits(maxDefine_));
279+
wideDefineDecoder_ = std::make_unique<arrow::util::RleDecoder>(
280+
reinterpret_cast<const uint8_t*>(pageData_),
281+
defineLength,
282+
arrow::bit_util::NumRequiredBits(maxDefine_));
279283
} else {
280284
wideDefineDecoder_ = std::make_unique<arrow::util::RleDecoder>(
281285
reinterpret_cast<const uint8_t*>(pageData_),
@@ -615,7 +619,7 @@ void PageReader::preloadRepDefs() {
615619
}
616620

617621
void PageReader::decodeRepDefs(int32_t numTopLevelRows) {
618-
if (definitionLevels_.empty()) {
622+
if (definitionLevels_.empty() && maxDefine_ > 0) {
619623
preloadRepDefs();
620624
}
621625
repDefBegin_ = repDefEnd_;

velox/dwio/parquet/reader/ParquetColumnReader.cpp

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@ namespace facebook::velox::parquet {
3737
std::unique_ptr<dwio::common::SelectiveColumnReader> ParquetColumnReader::build(
3838
const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
3939
ParquetParams& params,
40-
common::ScanSpec& scanSpec) {
40+
common::ScanSpec& scanSpec,
41+
bool caseSensitive) {
4142
auto colName = scanSpec.fieldName();
4243

4344
switch (dataType->type->kind()) {
@@ -59,17 +60,20 @@ std::unique_ptr<dwio::common::SelectiveColumnReader> ParquetColumnReader::build(
5960
dataType, dataType->type, params, scanSpec);
6061

6162
case TypeKind::ROW:
62-
return std::make_unique<StructColumnReader>(dataType, params, scanSpec);
63+
return std::make_unique<StructColumnReader>(
64+
dataType, params, scanSpec, caseSensitive);
6365

6466
case TypeKind::VARBINARY:
6567
case TypeKind::VARCHAR:
6668
return std::make_unique<StringColumnReader>(dataType, params, scanSpec);
6769

6870
case TypeKind::ARRAY:
69-
return std::make_unique<ListColumnReader>(dataType, params, scanSpec);
71+
return std::make_unique<ListColumnReader>(
72+
dataType, params, scanSpec, caseSensitive);
7073

7174
case TypeKind::MAP:
72-
return std::make_unique<MapColumnReader>(dataType, params, scanSpec);
75+
return std::make_unique<MapColumnReader>(
76+
dataType, params, scanSpec, caseSensitive);
7377

7478
case TypeKind::BOOLEAN:
7579
return std::make_unique<BooleanColumnReader>(dataType, params, scanSpec);

velox/dwio/parquet/reader/ParquetColumnReader.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ class ParquetColumnReader {
4646
static std::unique_ptr<dwio::common::SelectiveColumnReader> build(
4747
const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
4848
ParquetParams& params,
49-
common::ScanSpec& scanSpec);
49+
common::ScanSpec& scanSpec,
50+
bool caseSensitive);
5051
};
5152
} // namespace facebook::velox::parquet

velox/dwio/parquet/reader/ParquetReader.cpp

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -491,7 +491,8 @@ int64_t ReaderBase::rowGroupUncompressedSize(
491491

492492
ParquetRowReader::ParquetRowReader(
493493
const std::shared_ptr<ReaderBase>& readerBase,
494-
const dwio::common::RowReaderOptions& options)
494+
const dwio::common::RowReaderOptions& options,
495+
bool caseSensitive)
495496
: pool_(readerBase->getMemoryPool()),
496497
readerBase_(readerBase),
497498
options_(options),
@@ -521,7 +522,8 @@ ParquetRowReader::ParquetRowReader(
521522
columnReader_ = ParquetColumnReader::build(
522523
readerBase_->schemaWithId(), // Id is schema id
523524
params,
524-
*options_.getScanSpec());
525+
*options_.getScanSpec(),
526+
caseSensitive);
525527

526528
filterRowGroups();
527529
if (!rowGroupIds_.empty()) {
@@ -634,6 +636,7 @@ ParquetReader::ParquetReader(
634636

635637
std::unique_ptr<dwio::common::RowReader> ParquetReader::createRowReader(
636638
const dwio::common::RowReaderOptions& options) const {
637-
return std::make_unique<ParquetRowReader>(readerBase_, options);
639+
return std::make_unique<ParquetRowReader>(
640+
readerBase_, options, readerBase_->isCaseSensitive());
638641
}
639642
} // namespace facebook::velox::parquet

velox/dwio/parquet/reader/ParquetReader.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ class ParquetRowReader : public dwio::common::RowReader {
126126
public:
127127
ParquetRowReader(
128128
const std::shared_ptr<ReaderBase>& readerBase,
129-
const dwio::common::RowReaderOptions& options);
129+
const dwio::common::RowReaderOptions& options,
130+
bool caseSensitive);
130131
~ParquetRowReader() override = default;
131132

132133
uint64_t next(uint64_t size, velox::VectorPtr& result) override;

velox/dwio/parquet/reader/RepeatedColumnReader.cpp

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -111,18 +111,19 @@ void ensureRepDefs(
111111
MapColumnReader::MapColumnReader(
112112
std::shared_ptr<const dwio::common::TypeWithId> requestedType,
113113
ParquetParams& params,
114-
common::ScanSpec& scanSpec)
114+
common::ScanSpec& scanSpec,
115+
bool caseSensitive)
115116
: dwio::common::SelectiveMapColumnReader(
116117
requestedType,
117118
requestedType,
118119
params,
119120
scanSpec) {
120121
auto& keyChildType = requestedType->childAt(0);
121122
auto& elementChildType = requestedType->childAt(1);
122-
keyReader_ =
123-
ParquetColumnReader::build(keyChildType, params, *scanSpec.children()[0]);
123+
keyReader_ = ParquetColumnReader::build(
124+
keyChildType, params, *scanSpec.children()[0], caseSensitive);
124125
elementReader_ = ParquetColumnReader::build(
125-
elementChildType, params, *scanSpec.children()[1]);
126+
elementChildType, params, *scanSpec.children()[1], caseSensitive);
126127
reinterpret_cast<const ParquetTypeWithId*>(requestedType.get())
127128
->makeLevelInfo(levelInfo_);
128129
children_ = {keyReader_.get(), elementReader_.get()};
@@ -219,15 +220,16 @@ void MapColumnReader::filterRowGroups(
219220
ListColumnReader::ListColumnReader(
220221
std::shared_ptr<const dwio::common::TypeWithId> requestedType,
221222
ParquetParams& params,
222-
common::ScanSpec& scanSpec)
223+
common::ScanSpec& scanSpec,
224+
bool caseSensitive)
223225
: dwio::common::SelectiveListColumnReader(
224226
requestedType,
225227
requestedType,
226228
params,
227229
scanSpec) {
228230
auto& childType = requestedType->childAt(0);
229-
child_ =
230-
ParquetColumnReader::build(childType, params, *scanSpec.children()[0]);
231+
child_ = ParquetColumnReader::build(
232+
childType, params, *scanSpec.children()[0], caseSensitive);
231233
reinterpret_cast<const ParquetTypeWithId*>(requestedType.get())
232234
->makeLevelInfo(levelInfo_);
233235
children_ = {child_.get()};

velox/dwio/parquet/reader/RepeatedColumnReader.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,8 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader {
5858
MapColumnReader(
5959
std::shared_ptr<const dwio::common::TypeWithId> requestedType,
6060
ParquetParams& params,
61-
common::ScanSpec& scanSpec);
61+
common::ScanSpec& scanSpec,
62+
bool caseSensitive);
6263

6364
void prepareRead(
6465
vector_size_t offset,
@@ -113,7 +114,8 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader {
113114
ListColumnReader(
114115
std::shared_ptr<const dwio::common::TypeWithId> requestedType,
115116
ParquetParams& params,
116-
common::ScanSpec& scanSpec);
117+
common::ScanSpec& scanSpec,
118+
bool caseSensitive);
117119

118120
void prepareRead(
119121
vector_size_t offset,

velox/dwio/parquet/reader/StructColumnReader.cpp

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,16 +22,22 @@ namespace facebook::velox::parquet {
2222
StructColumnReader::StructColumnReader(
2323
const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
2424
ParquetParams& params,
25-
common::ScanSpec& scanSpec)
25+
common::ScanSpec& scanSpec,
26+
bool caseSensitive)
2627
: SelectiveStructColumnReader(dataType, dataType, params, scanSpec) {
2728
auto& childSpecs = scanSpec_->children();
2829
for (auto i = 0; i < childSpecs.size(); ++i) {
2930
if (childSpecs[i]->isConstant()) {
3031
continue;
3132
}
32-
auto childDataType = nodeType_->childByName(childSpecs[i]->fieldName());
33+
std::string fieldName = childSpecs[i]->fieldName();
34+
if (!caseSensitive) {
35+
folly::toLowerAscii(fieldName);
36+
}
37+
auto childDataType = nodeType_->childByName(fieldName);
3338

34-
addChild(ParquetColumnReader::build(childDataType, params, *childSpecs[i]));
39+
addChild(ParquetColumnReader::build(
40+
childDataType, params, *childSpecs[i], caseSensitive));
3541
childSpecs[i]->setSubscript(children_.size() - 1);
3642
}
3743
auto type = reinterpret_cast<const ParquetTypeWithId*>(nodeType_.get());

velox/dwio/parquet/reader/StructColumnReader.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,8 @@ class StructColumnReader : public dwio::common::SelectiveStructColumnReader {
2626
StructColumnReader(
2727
const std::shared_ptr<const dwio::common::TypeWithId>& dataType,
2828
ParquetParams& params,
29-
common::ScanSpec& scanSpec);
29+
common::ScanSpec& scanSpec,
30+
bool caseSensitive);
3031

3132
void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls)
3233
override;
1.14 KB
Binary file not shown.

0 commit comments

Comments
 (0)