Skip to content

Commit 3cfd907

Browse files
committed
add test case
1 parent cd59333 commit 3cfd907

File tree

8 files changed

+184
-319
lines changed

8 files changed

+184
-319
lines changed

velox/dwio/parquet/reader/Metadata.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,14 +220,17 @@ ColumnChunkMetaDataPtr::getColumnStatistics(
220220
};
221221

222222
bool ColumnChunkMetaDataPtr::hasEncodingStats() const {
223-
return hasMetadata() && thriftColumnChunkPtr(ptr_)->meta_data.__isset.encoding_stats;
223+
return hasMetadata() &&
224+
thriftColumnChunkPtr(ptr_)->meta_data.__isset.encoding_stats;
224225
}
225226

226-
const std::vector<thrift::PageEncodingStats>& ColumnChunkMetaDataPtr::getEncodingStats() const {
227+
const std::vector<thrift::PageEncodingStats>&
228+
ColumnChunkMetaDataPtr::getEncodingStats() const {
227229
return thriftColumnChunkPtr(ptr_)->meta_data.encoding_stats;
228230
}
229231

230-
const std::vector<thrift::Encoding::type>& ColumnChunkMetaDataPtr::getEncoding() const {
232+
const std::vector<thrift::Encoding::type>& ColumnChunkMetaDataPtr::getEncoding()
233+
const {
231234
return thriftColumnChunkPtr(ptr_)->meta_data.encodings;
232235
}
233236

velox/dwio/parquet/reader/Metadata.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ class ColumnChunkMetaDataPtr {
4444

4545
const std::vector<thrift::Encoding::type>& getEncoding() const;
4646

47-
4847
/// Return the ColumnChunk statistics.
4948
std::unique_ptr<dwio::common::ColumnStatistics> getColumnStatistics(
5049
const TypePtr type,

velox/dwio/parquet/reader/PageReader.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,6 @@ class PageReader {
325325
}
326326
} else {
327327
if (isDictionary()) {
328-
// LOG(DEBUG) << "PageReader: Using StringDictionaryColumnVisitor for row filtering (without nulls) - RowGroup:" << rowGroupIndex_ << " PageOrdinal:" << pageOrdinal_ << " PageIndex:" << pageIndex_;
329328
auto dictVisitor = visitor.toStringDictionaryColumnVisitor();
330329
dictionaryIdDecoder_->readWithVisitor<false>(nullptr, dictVisitor);
331330
} else if (encoding_ == thrift::Encoding::DELTA_BYTE_ARRAY) {

velox/dwio/parquet/reader/ParquetData.cpp

Lines changed: 60 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ namespace facebook::velox::parquet {
2828
std::unique_ptr<dwio::common::FormatData> ParquetParams::toFormatData(
2929
const std::shared_ptr<const dwio::common::TypeWithId>& type,
3030
const common::ScanSpec& /*scanSpec*/) {
31-
auto parquetData = std::make_unique<ParquetData>(
32-
type, metaData_, pool(), sessionTimezone_);
31+
auto parquetData =
32+
std::make_unique<ParquetData>(type, metaData_, pool(), sessionTimezone_);
3333
// Set the BufferedInput if available
3434
if (bufferedInput_) {
3535
parquetData->setBufferedInput(bufferedInput_);
@@ -95,27 +95,27 @@ bool ParquetData::rowGroupMatches(
9595
if (columnChunk.hasStatistics()) {
9696
auto columnStats =
9797
columnChunk.getColumnStatistics(type, rowGroup.numRows());
98-
bool statisticsResult = testFilter(filter, columnStats.get(), rowGroup.numRows(), type);
98+
bool statisticsResult =
99+
testFilter(filter, columnStats.get(), rowGroup.numRows(), type);
99100
if (!statisticsResult) {
100101
return false;
101102
}
102103
}
103-
104-
105-
bool canUseDictionaryFiltering = isOnlyDictionaryEncodingPagesImpl(columnChunk);
106-
if (canUseDictionaryFiltering) {
107-
bool dictionaryResult = testFilterAgainstDictionary(rowGroupId, filter, columnChunk);
108-
if (!dictionaryResult) {
109-
return false;
110-
}
104+
bool canUseDictionaryFiltering =
105+
isOnlyDictionaryEncodingPagesImpl(columnChunk);
106+
if (canUseDictionaryFiltering) {
107+
bool dictionaryResult =
108+
testFilterAgainstDictionary(rowGroupId, filter, columnChunk);
109+
if (!dictionaryResult) {
110+
return false;
111111
}
112-
return true;
112+
}
113+
return true;
113114
}
114115

115116
void ParquetData::enqueueRowGroup(
116117
uint32_t index,
117118
dwio::common::BufferedInput& input) {
118-
// Store the BufferedInput reference for creating streams on demand
119119
bufferedInput_ = &input;
120120

121121
auto chunk = fileMetaDataPtr_.rowGroup(index).columnChunk(type_->column());
@@ -175,9 +175,11 @@ std::pair<int64_t, int64_t> ParquetData::getRowGroupRegion(
175175
}
176176

177177
// Presto's exact isOnlyDictionaryEncodingPages function from PR #4779
178-
bool ParquetData::isOnlyDictionaryEncodingPagesImpl(const ColumnChunkMetaDataPtr& columnChunk) {
179-
// Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0) will have EncodingStats available
180-
// Otherwise, fallback to v1 logic
178+
bool ParquetData::isOnlyDictionaryEncodingPagesImpl(
179+
const ColumnChunkMetaDataPtr& columnChunk) {
180+
// Files written with newer versions of Parquet libraries (e.g.
181+
// parquet-mr 1.9.0) will have EncodingStats available Otherwise, fallback to
182+
// v1 logic
181183

182184
// Check for EncodingStats when available (newer Parquet files)
183185
if (columnChunk.hasEncodingStats()) {
@@ -187,19 +189,22 @@ bool ParquetData::isOnlyDictionaryEncodingPagesImpl(const ColumnChunkMetaDataPtr
187189

188190
// Fallback to v1 logic
189191
auto encodings = columnChunk.getEncoding();
190-
std::set<thrift::Encoding::type> encodingSet(encodings.begin(), encodings.end());
192+
std::set<thrift::Encoding::type> encodingSet(
193+
encodings.begin(), encodings.end());
191194

192195
if (encodingSet.count(thrift::Encoding::PLAIN_DICTIONARY)) {
193196
// PLAIN_DICTIONARY was present, which means at least one page was
194197
// dictionary-encoded and 1.0 encodings are used
195-
// The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels
198+
// The only other allowed encodings are RLE and BIT_PACKED which are used
199+
// for repetition or definition levels
196200
std::set<thrift::Encoding::type> allowedEncodings = {
197-
thrift::Encoding::PLAIN_DICTIONARY,
198-
thrift::Encoding::RLE, // For repetition/definition levels
199-
thrift::Encoding::BIT_PACKED // For repetition/definition levels
201+
thrift::Encoding::PLAIN_DICTIONARY,
202+
thrift::Encoding::RLE, // For repetition/definition levels
203+
thrift::Encoding::BIT_PACKED // For repetition/definition levels
200204
};
201205

202-
// Check if there are any disallowed encodings (equivalent to Sets.difference in Java)
206+
// Check if there are any disallowed encodings (equivalent to
207+
// Sets.difference in Java)
203208
for (const auto& encoding : encodings) {
204209
if (allowedEncodings.find(encoding) == allowedEncodings.end()) {
205210
return false;
@@ -212,7 +217,8 @@ bool ParquetData::isOnlyDictionaryEncodingPagesImpl(const ColumnChunkMetaDataPtr
212217
}
213218

214219
// Helper methods for EncodingStats analysis (like Java Presto)
215-
bool ParquetData::hasDictionaryPages(const std::vector<thrift::PageEncodingStats>& stats) {
220+
bool ParquetData::hasDictionaryPages(
221+
const std::vector<thrift::PageEncodingStats>& stats) {
216222
for (const auto& pageStats : stats) {
217223
if (pageStats.page_type == thrift::PageType::DICTIONARY_PAGE) {
218224
return true;
@@ -221,7 +227,8 @@ bool ParquetData::hasDictionaryPages(const std::vector<thrift::PageEncodingStats
221227
return false;
222228
}
223229

224-
bool ParquetData::hasNonDictionaryEncodedPages(const std::vector<thrift::PageEncodingStats>& stats) {
230+
bool ParquetData::hasNonDictionaryEncodedPages(
231+
const std::vector<thrift::PageEncodingStats>& stats) {
225232
for (const auto& pageStats : stats) {
226233
if (pageStats.page_type == thrift::PageType::DATA_PAGE ||
227234
pageStats.page_type == thrift::PageType::DATA_PAGE_V2) {
@@ -239,17 +246,6 @@ bool ParquetData::testFilterAgainstDictionary(
239246
uint32_t rowGroupId,
240247
const common::Filter* filter,
241248
const ColumnChunkMetaDataPtr& columnChunk) {
242-
if (!filter) {
243-
VLOG(3) << " [DICT-TEST] No filter provided, dictionary test passes";
244-
}
245-
246-
// Log file path for debugging
247-
std::string filePath = "unknown";
248-
if (bufferedInput_ && bufferedInput_->getReadFile()) {
249-
filePath = bufferedInput_->getReadFile()->getName();
250-
}
251-
252-
// Special handling for IsNull filters - conservative include before dictionary parsing
253249
if (filter->kind() == common::FilterKind::kIsNull) {
254250
return true; // Conservative include for IsNull filters
255251
}
@@ -259,48 +255,54 @@ bool ParquetData::testFilterAgainstDictionary(
259255
return true;
260256
}
261257

262-
// Use the dictionary directly - no temp storage needed
263258
auto numValues = dictionaryPtr->numValues;
264259
const void* dictPtr = dictionaryPtr->values->as<void>();
265260

266261
// Test if any dictionary value passes the filter
267262
auto testDict = [&]<typename T>() {
268263
const T* dict = reinterpret_cast<const T*>(dictPtr);
269264

270-
// For larger dictionaries, we could use SIMD testValues() for better performance
271-
// For now, use simple scalar approach which is sufficient for typical dictionary sizes
265+
// For larger dictionaries, we could use SIMD testValues() for better
266+
// performance For now, use simple scalar approach which is sufficient for
267+
// typical dictionary sizes
272268
for (int32_t i = 0; i < numValues; ++i) {
273-
if (common::applyFilter(*filter, dict[i])) return true;
269+
if (common::applyFilter(*filter, dict[i]))
270+
return true;
274271
}
275272
return false;
276273
};
277274

278275
bool anyValuePasses = [&] {
279276
switch (type_->type()->kind()) {
280-
case TypeKind::BIGINT: return testDict.operator()<int64_t>();
281-
case TypeKind::INTEGER: return testDict.operator()<int32_t>();
277+
case TypeKind::BIGINT:
278+
return testDict.operator()<int64_t>();
279+
case TypeKind::INTEGER:
280+
return testDict.operator()<int32_t>();
282281
case TypeKind::VARCHAR:
283-
case TypeKind::VARBINARY: return testDict.operator()<StringView>();
284-
case TypeKind::REAL: return testDict.operator()<float>();
285-
case TypeKind::DOUBLE: return testDict.operator()<double>();
286-
case TypeKind::BOOLEAN: return testDict.operator()<bool>();
287-
default: return true; // Conservative fallback
282+
case TypeKind::VARBINARY:
283+
return testDict.operator()<StringView>();
284+
case TypeKind::REAL:
285+
return testDict.operator()<float>();
286+
case TypeKind::DOUBLE:
287+
return testDict.operator()<double>();
288+
case TypeKind::BOOLEAN:
289+
return testDict.operator()<bool>();
290+
default:
291+
return true; // Conservative fallback
288292
}
289293
}();
290-
291-
// If no dictionary values pass the filter, but the filter accepts NULLs,
292-
// we must conservatively include because the row group might contain NULLs that would match
293294
if (!anyValuePasses && filter->testNull()) {
294295
anyValuePasses = true;
295296
}
296297
return anyValuePasses;
297298
}
298299

299-
// Read dictionary page directly for row group filtering (like Presto's dictionaryPredicatesMatch)
300-
std::unique_ptr<dwio::common::DictionaryValues> ParquetData::readDictionaryPageForFiltering(
300+
// Read dictionary page directly for row group filtering (like Presto's
301+
// dictionaryPredicatesMatch)
302+
std::unique_ptr<dwio::common::DictionaryValues>
303+
ParquetData::readDictionaryPageForFiltering(
301304
uint32_t rowGroupId,
302305
const ColumnChunkMetaDataPtr& columnChunk) {
303-
304306
// Create input stream for the column chunk
305307
auto inputStream = getInputStream(rowGroupId, columnChunk);
306308
if (!inputStream) {
@@ -333,7 +335,8 @@ std::unique_ptr<dwio::common::DictionaryValues> ParquetData::readDictionaryPageF
333335
}
334336
// Helper method to get input stream for column chunk
335337
std::unique_ptr<dwio::common::SeekableInputStream> ParquetData::getInputStream(
336-
uint32_t rowGroupId, const ColumnChunkMetaDataPtr& columnChunk) {
338+
uint32_t rowGroupId,
339+
const ColumnChunkMetaDataPtr& columnChunk) {
337340
// Use existing stream if available
338341
if (rowGroupId < streams_.size() && streams_[rowGroupId]) {
339342
return std::move(streams_[rowGroupId]);
@@ -346,12 +349,13 @@ std::unique_ptr<dwio::common::SeekableInputStream> ParquetData::getInputStream(
346349

347350
// Calculate read parameters (same as enqueueRowGroup)
348351
uint64_t chunkReadOffset = columnChunk.dataPageOffset();
349-
if (columnChunk.hasDictionaryPageOffset() && columnChunk.dictionaryPageOffset() >= 4) {
352+
if (columnChunk.hasDictionaryPageOffset() &&
353+
columnChunk.dictionaryPageOffset() >= 4) {
350354
chunkReadOffset = columnChunk.dictionaryPageOffset();
351355
}
352356

353-
uint64_t readSize =
354-
(columnChunk.compression() == common::CompressionKind::CompressionKind_NONE)
357+
uint64_t readSize = (columnChunk.compression() ==
358+
common::CompressionKind::CompressionKind_NONE)
355359
? columnChunk.totalUncompressedSize()
356360
: columnChunk.totalCompressedSize();
357361

velox/dwio/parquet/reader/ParquetData.h

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ class ParquetData : public dwio::common::FormatData {
7676
rowsInRowGroup_(-1),
7777
sessionTimezone_(sessionTimezone) {}
7878

79-
/// Sets the BufferedInput for creating streams on demand (called before filtering)
79+
/// Sets the BufferedInput for creating streams on demand (called before
80+
/// filtering)
8081
void setBufferedInput(dwio::common::BufferedInput* input) {
8182
bufferedInput_ = input;
8283
}
@@ -217,27 +218,33 @@ class ParquetData : public dwio::common::FormatData {
217218
/// stats in 'rowGroup'.
218219
bool rowGroupMatches(uint32_t rowGroupId, const common::Filter* filter);
219220

220-
/// Dictionary-based row group filtering (like Java Presto's dictionaryPredicatesMatch)
221+
/// Dictionary-based row group filtering (like Java Presto's
222+
/// dictionaryPredicatesMatch)
221223
bool testFilterAgainstDictionary(
222224
uint32_t rowGroupId,
223225
const common::Filter* filter,
224226
const ColumnChunkMetaDataPtr& columnChunk);
225227

226228
/// Presto's exact isOnlyDictionaryEncodingPages function from PR #4779
227-
bool isOnlyDictionaryEncodingPagesImpl(const ColumnChunkMetaDataPtr& columnChunk);
229+
bool isOnlyDictionaryEncodingPagesImpl(
230+
const ColumnChunkMetaDataPtr& columnChunk);
228231

229232
/// Helper methods for EncodingStats analysis (like Java Presto)
230233
bool hasDictionaryPages(const std::vector<thrift::PageEncodingStats>& stats);
231-
bool hasNonDictionaryEncodedPages(const std::vector<thrift::PageEncodingStats>& stats);
234+
bool hasNonDictionaryEncodedPages(
235+
const std::vector<thrift::PageEncodingStats>& stats);
232236

233-
/// Read dictionary page directly for row group filtering (like Presto's approach)
234-
std::unique_ptr<dwio::common::DictionaryValues> readDictionaryPageForFiltering(
237+
/// Read dictionary page directly for row group filtering (like Presto's
238+
/// approach)
239+
std::unique_ptr<dwio::common::DictionaryValues>
240+
readDictionaryPageForFiltering(
235241
uint32_t rowGroupId,
236242
const ColumnChunkMetaDataPtr& columnChunk);
237243

238244
/// Helper methods for dictionary page reading
239245
std::unique_ptr<dwio::common::SeekableInputStream> getInputStream(
240-
uint32_t rowGroupId, const ColumnChunkMetaDataPtr& columnChunk);
246+
uint32_t rowGroupId,
247+
const ColumnChunkMetaDataPtr& columnChunk);
241248

242249
protected:
243250
memory::MemoryPool& pool_;

velox/dwio/parquet/reader/ParquetReader.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818

1919
#include <boost/algorithm/string.hpp>
2020
#include <glog/logging.h>
21-
#include <unordered_map>
2221
#include <thrift/TApplicationException.h>
2322
#include <thrift/protocol/TCompactProtocol.h> //@manual
23+
#include <unordered_map>
2424

2525
#include "velox/dwio/parquet/reader/ParquetColumnReader.h"
2626
#include "velox/dwio/parquet/reader/StructColumnReader.h"
@@ -1237,7 +1237,8 @@ class ParquetRowReader::Impl {
12371237
firstRowOfRowGroup_.reserve(rowGroups_.size());
12381238

12391239
// Use regular row group filtering for now
1240-
// Dictionary-based filtering will be handled when dictionaries are naturally loaded during page reading
1240+
// Dictionary-based filtering will be handled when dictionaries are
1241+
// naturally loaded during page reading
12411242
ParquetData::FilterRowGroupsResult res;
12421243
columnReader_->filterRowGroups(0, parquetStatsContext_, res);
12431244

19.4 KB
Binary file not shown.

0 commit comments

Comments
 (0)