@@ -326,7 +326,8 @@ std::string VariantValue::typeDebugString() const {
326
326
327
327
bool VariantValue::getBool () const {
328
328
if (getBasicType () != VariantBasicType::Primitive) {
329
- throw ParquetException (" Not a primitive type" );
329
+ throw ParquetException (" Expected primitive type, but got: " +
330
+ variantBasicTypeToString (getBasicType ()));
330
331
}
331
332
332
333
int8_t primitive_type = static_cast <int8_t >(value[0 ]) >> 2 ;
@@ -341,11 +342,16 @@ bool VariantValue::getBool() const {
341
342
std::to_string (primitive_type));
342
343
}
343
344
345
+ void VariantValue::checkBasicType (VariantBasicType type) const {
346
+ if (getBasicType () != type) {
347
+ throw ParquetException (" Expected basic type: " + variantBasicTypeToString (type) +
348
+ " , but got: " + variantBasicTypeToString (getBasicType ()));
349
+ }
350
+ }
351
+
344
352
void VariantValue::checkPrimitiveType (VariantPrimitiveType type,
345
353
size_t size_required) const {
346
- if (getBasicType () != VariantBasicType::Primitive) {
347
- throw ParquetException (" Not a primitive type" );
348
- }
354
+ checkBasicType (VariantBasicType::Primitive);
349
355
350
356
auto primitive_type = static_cast <VariantPrimitiveType>(value[0 ] >> 2 );
351
357
if (primitive_type != type) {
@@ -354,17 +360,17 @@ void VariantValue::checkPrimitiveType(VariantPrimitiveType type,
354
360
" , but got: " + variantPrimitiveTypeToString (primitive_type));
355
361
}
356
362
357
- if (value.size () < 1 + size_required) {
363
+ if (value.size () < size_required) {
358
364
throw ParquetException (" Invalid value: too short, expected at least " +
359
- std::to_string (1 + size_required) + " bytes for type " +
365
+ std::to_string (size_required) + " bytes for type " +
360
366
variantPrimitiveTypeToString (type) +
361
367
" , but got: " + std::to_string (value.size ()) + " bytes" );
362
368
}
363
369
}
364
370
365
371
template <typename PrimitiveType>
366
372
PrimitiveType VariantValue::getPrimitiveType (VariantPrimitiveType type) const {
367
- checkPrimitiveType (type, sizeof (PrimitiveType));
373
+ checkPrimitiveType (type, sizeof (PrimitiveType) + 1 );
368
374
369
375
PrimitiveType primitive_value{};
370
376
memcpy (&primitive_value, value.data () + 1 , sizeof (PrimitiveType));
@@ -378,38 +384,27 @@ int8_t VariantValue::getInt8() const {
378
384
}
379
385
380
386
int16_t VariantValue::getInt16 () const {
381
- return getPrimitiveType<int8_t >(VariantPrimitiveType::Int16);
387
+ return getPrimitiveType<int16_t >(VariantPrimitiveType::Int16);
382
388
}
383
389
384
390
int32_t VariantValue::getInt32 () const {
385
- return getPrimitiveType<int8_t >(VariantPrimitiveType::Int32);
391
+ return getPrimitiveType<int32_t >(VariantPrimitiveType::Int32);
386
392
}
387
393
388
394
int64_t VariantValue::getInt64 () const {
389
- return getPrimitiveType<int8_t >(VariantPrimitiveType::Int64);
395
+ return getPrimitiveType<int64_t >(VariantPrimitiveType::Int64);
390
396
}
391
397
392
398
float VariantValue::getFloat () const {
393
399
return getPrimitiveType<float >(VariantPrimitiveType::Float);
394
400
}
395
401
396
402
double VariantValue::getDouble () const {
397
- return getPrimitiveType<float >(VariantPrimitiveType::Double);
403
+ return getPrimitiveType<double >(VariantPrimitiveType::Double);
398
404
}
399
405
400
406
std::string_view VariantValue::getPrimitiveBinaryType (VariantPrimitiveType type) const {
401
- VariantBasicType basic_type = getBasicType ();
402
- if (basic_type != VariantBasicType::Primitive) {
403
- throw ParquetException (" Not a primitive type" );
404
- }
405
- auto primitive_type = static_cast <VariantPrimitiveType>(value[0 ] >> 2 );
406
- if (primitive_type != VariantPrimitiveType::String) {
407
- throw ParquetException (" Not a string type" );
408
- }
409
-
410
- if (value.size () < 5 ) {
411
- throw ParquetException (" Invalid string value: too short" );
412
- }
407
+ checkPrimitiveType (type, /* size_required=*/ 5 );
413
408
414
409
uint32_t length;
415
410
memcpy (&length, value.data () + 1 , sizeof (uint32_t ));
@@ -468,7 +463,7 @@ DecimalValue<::arrow::Decimal64> VariantValue::getDecimal8() const {
468
463
469
464
DecimalValue<::arrow::Decimal128> VariantValue::getDecimal16 () const {
470
465
checkPrimitiveType (VariantPrimitiveType::Decimal16,
471
- /* size_required=*/ sizeof (int64_t ) * 2 );
466
+ /* size_required=*/ sizeof (int64_t ) * 2 + 2 );
472
467
473
468
uint8_t scale = value[1 ];
474
469
@@ -524,9 +519,7 @@ std::string VariantValue::ObjectInfo::toDebugString() const {
524
519
525
520
526
521
VariantValue::ObjectInfo VariantValue::getObjectInfo () const {
527
- if (getBasicType () != VariantBasicType::Object) {
528
- throw ParquetException (" Not an object type" );
529
- }
522
+ checkBasicType (VariantBasicType::Object);
530
523
uint8_t value_header = value[0 ] >> 2 ;
531
524
uint8_t field_offset_size = (value_header & 0b11 ) + 1 ;
532
525
uint8_t field_id_size = ((value_header >> 2 ) & 0b11 ) + 1 ;
@@ -561,6 +554,7 @@ VariantValue::ObjectInfo VariantValue::getObjectInfo() const {
561
554
memcpy (&final_offset,
562
555
value.data () + info.offset_start_offset + num_elements * field_offset_size,
563
556
field_offset_size);
557
+ // It could be less than value size since it could be a sub-object.
564
558
if (final_offset + info.data_start_offset > value.size ()) {
565
559
throw ParquetException (" Invalid object value: final_offset=" +
566
560
std::to_string (final_offset) +
@@ -591,12 +585,13 @@ std::optional<VariantValue> VariantValue::getObjectValueByKey(
591
585
return std::nullopt;
592
586
}
593
587
594
- std::optional< VariantValue> VariantValue::getObjectFieldByFieldId (
595
- uint32_t variantId, std::string_view* key) const {
588
+ VariantValue VariantValue::getObjectFieldByFieldId (uint32_t variantId,
589
+ std::string_view* key) const {
596
590
ObjectInfo info = getObjectInfo ();
597
591
598
592
if (variantId >= info.num_elements ) {
599
- throw ParquetException (" Field ID out of range" );
593
+ throw ParquetException (" Field ID out of range: " + std::to_string (variantId) +
594
+ " >= " + std::to_string (info.num_elements ));
600
595
}
601
596
602
597
// Read the field ID
@@ -606,7 +601,7 @@ std::optional<VariantValue> VariantValue::getObjectFieldByFieldId(
606
601
field_id = arrow::bit_util::FromLittleEndian (field_id);
607
602
608
603
// Get the key from metadata
609
- *key = metadata.getMetadataKey (field_id);
604
+ *key = metadata.getMetadataKey (static_cast < int32_t >( field_id) );
610
605
611
606
// Read the offset and next offset
612
607
uint32_t offset = 0 , next_offset = 0 ;
@@ -633,10 +628,7 @@ std::optional<VariantValue> VariantValue::getObjectFieldByFieldId(
633
628
}
634
629
635
630
VariantValue::ArrayInfo VariantValue::getArrayInfo () const {
636
- if (getBasicType () != VariantBasicType::Array) {
637
- throw ParquetException (" Expected array type, but got: " +
638
- variantBasicTypeToString (getBasicType ()));
639
- }
631
+ checkBasicType (VariantBasicType::Array);
640
632
uint8_t value_header = value[0 ] >> 2 ;
641
633
uint8_t field_offset_size = (value_header & 0b11 ) + 1 ;
642
634
bool is_large = ((value_header >> 2 ) & 0b1 );
@@ -649,7 +641,7 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const {
649
641
" for at least " + std::to_string (1 + num_elements_size));
650
642
}
651
643
652
- // 解析 num_elements
644
+ // parse num_elements
653
645
uint32_t num_elements = 0 ;
654
646
{
655
647
memcpy (&num_elements, value.data () + 1 , num_elements_size);
@@ -663,14 +655,15 @@ VariantValue::ArrayInfo VariantValue::getArrayInfo() const {
663
655
info.data_start_offset =
664
656
info.offset_start_offset + (num_elements + 1 ) * field_offset_size;
665
657
666
- // 检查边界
658
+ // Boundary check
667
659
if (info.data_start_offset > value.size ()) {
668
660
throw ParquetException (" Invalid array value: data_start_offset=" +
669
661
std::to_string (info.data_start_offset ) +
670
662
" , value_size=" + std::to_string (value.size ()));
671
663
}
672
664
673
- // 检查最终偏移量
665
+ // Validate final offset is equal to the size of the value,
666
+ // it would work since even empty array would have an offset of 0.
674
667
{
675
668
uint32_t final_offset = 0 ;
676
669
memcpy (&final_offset,
0 commit comments