Skip to content

Commit 54681c4

Browse files
committed
continue cleanup
1 parent 3566dfd commit 54681c4

File tree

3 files changed

+32
-10
lines changed

3 files changed

+32
-10
lines changed

cpp/src/parquet/variant.cc

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -567,12 +567,21 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const {
567567

568568
std::optional<VariantValue> VariantValue::getObjectValueByKey(
569569
std::string_view key) const {
570-
if (getBasicType() != VariantBasicType::Object) {
571-
throw ParquetException("Not an object type");
572-
}
573-
574570
ObjectInfo info = getObjectInfo();
575571

572+
return getObjectValueByKey(key, info);
573+
}
574+
575+
std::optional<VariantValue> VariantValue::getObjectValueByKey(
576+
std::string_view key, const VariantValue::ObjectInfo& info) const {
577+
// TODO(mwish): Currently we just linear search here. The best way here is:
578+
// 1. check the num_elements
579+
// 2.1. If the element number is less than 8(or other magic number), we can keep
580+
// current method.
581+
// 2.2. If the element number is larger than 8, and metadata.sorted_strings is true,
582+
// we can first apply binary search on the metadata, and then binary search the
583+
// field id.
584+
576585
for (uint32_t i = 0; i < info.num_elements; ++i) {
577586
std::string_view field_key;
578587
std::optional<VariantValue> field_value = getObjectFieldByFieldId(i, &field_key);
@@ -692,18 +701,20 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const {
692701
next_offset = arrow::bit_util::FromLittleEndian(next_offset);
693702

694703
if (offset > next_offset) {
695-
throw ParquetException("Invalid array value: offsets not monotonically increasing");
704+
throw ParquetException(
705+
"Invalid array value: offsets not monotonically increasing: " +
706+
std::to_string(offset) + " > " + std::to_string(next_offset));
696707
}
697708
}
698709

699710
return info;
700711
}
701712

702-
VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
703-
ArrayInfo info = getArrayInfo();
704-
713+
VariantValue VariantValue::getArrayValueByIndex(uint32_t index,
714+
const ArrayInfo& info) const {
705715
if (index >= info.num_elements) {
706-
throw ParquetException("Array index out of range");
716+
throw ParquetException("Array index out of range: " + std::to_string(index) +
717+
" >= " + std::to_string(info.num_elements));
707718
}
708719

709720
// Read the offset and next offset
@@ -725,4 +736,9 @@ VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
725736
return element_value;
726737
}
727738

739+
VariantValue VariantValue::getArrayValueByIndex(uint32_t index) const {
740+
ArrayInfo info = getArrayInfo();
741+
return getArrayValueByIndex(index, info);
742+
}
743+
728744
} // namespace parquet::variant

cpp/src/parquet/variant.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#pragma once
1919

2020
#include <cstdint>
21+
#include <optional>
2122
#include <string_view>
2223
#include <vector>
2324

@@ -189,6 +190,8 @@ struct VariantValue {
189190
};
190191
ObjectInfo getObjectInfo() const;
191192
std::optional<VariantValue> getObjectValueByKey(std::string_view key) const;
193+
std::optional<VariantValue> getObjectValueByKey(std::string_view key,
194+
const ObjectInfo& info) const;
192195
VariantValue getObjectFieldByFieldId(uint32_t variantId, std::string_view* key) const;
193196

194197
struct ArrayInfo {
@@ -200,6 +203,7 @@ struct VariantValue {
200203
ArrayInfo getArrayInfo() const;
201204
// Would throw ParquetException if index is out of range.
202205
VariantValue getArrayValueByIndex(uint32_t index) const;
206+
VariantValue getArrayValueByIndex(uint32_t index, const ArrayInfo& info) const;
203207

204208
private:
205209
static constexpr uint8_t BASIC_TYPE_MASK = 0b00000011;
@@ -212,9 +216,11 @@ struct VariantValue {
212216
template <typename PrimitiveType>
213217
PrimitiveType getPrimitiveType(VariantPrimitiveType type) const;
214218

219+
// An extra function because decimal uses 1 byte for scale.
215220
template <typename DecimalType>
216221
DecimalValue<DecimalType> getPrimitiveDecimalType(VariantPrimitiveType type) const;
217222

223+
// An extra function because binary/string uses 4 bytes for length.
218224
std::string_view getPrimitiveBinaryType(VariantPrimitiveType type) const;
219225
void checkBasicType(VariantBasicType type) const;
220226
void checkPrimitiveType(VariantPrimitiveType type, size_t size_required) const;

cpp/src/parquet/variant_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,4 +424,4 @@ TEST(ParquetVariant, ArrayValuesNested) {
424424
}
425425
}
426426

427-
} // namespace parquet::variant
427+
} // namespace parquet::variant

0 commit comments

Comments
 (0)