@@ -180,6 +180,37 @@ std::shared_ptr<arrow::Array> ArrowTypeAsYqlTimestamp(const std::shared_ptr<arro
180
180
return builder.Build (true ).make_array ();
181
181
}
182
182
183
+ template <bool isOptional, typename TArrowType>
184
+ std::shared_ptr<arrow::Array> ArrowTypeAsYqlString (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, ui64 multiplier, const TString& format = {}) {
185
+ ::NYql::NUdf::TStringArrayBuilder<arrow::BinaryType, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
186
+ ::NYql::NUdf::TFixedSizeBlockReader<TArrowType, isOptional> reader;
187
+ for (i64 i = 0 ; i < value->length (); ++i) {
188
+ const NUdf::TBlockItem item = reader.GetItem (*value->data (), i);
189
+ if constexpr (isOptional) {
190
+ if (!item) {
191
+ builder.Add (item);
192
+ continue ;
193
+ }
194
+ } else if (!item) {
195
+ throw parquet::ParquetException (TStringBuilder () << " null value for timestamp could not be represented in non-optional type" );
196
+ }
197
+
198
+ const TArrowType baseValue = item.As <TArrowType>();
199
+ if (baseValue < 0 && baseValue > static_cast <i64 >(::NYql::NUdf::MAX_TIMESTAMP)) {
200
+ throw parquet::ParquetException (TStringBuilder () << " timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << " ]: " << baseValue);
201
+ }
202
+
203
+ if (static_cast <ui64>(baseValue) > ::NYql::NUdf::MAX_TIMESTAMP / multiplier) {
204
+ throw parquet::ParquetException (TStringBuilder () << " timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << " ] after transformation: " << baseValue);
205
+ }
206
+
207
+ const ui64 v = baseValue * multiplier;
208
+ TString result = format ? TInstant::FromValue (v).FormatGmTime (format.c_str ()) : TInstant::FromValue (v).ToString ();
209
+ builder.Add (NUdf::TBlockItem (NUdf::TStringRef (result.c_str (), result.Size ())));
210
+ }
211
+ return builder.Build (true ).make_array ();
212
+ }
213
+
183
214
template <bool isOptional>
184
215
std::shared_ptr<arrow::Array> ArrowStringAsYqlTimestamp (const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) {
185
216
::NYql::NUdf::TFixedSizeArrayBuilder<ui64, isOptional> builder (NKikimr::NMiniKQL::TTypeInfoHelper (), targetType, *arrow::system_memory_pool (), value->length ());
@@ -373,6 +404,30 @@ TColumnConverter ArrowTimestampAsYqlTimestamp(const std::shared_ptr<arrow::DataT
373
404
};
374
405
}
375
406
407
+ TColumnConverter ArrowTimestampAsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
408
+ return [targetType, isOptional, multiplier=GetMultiplierForTimestamp (timeUnit)](const std::shared_ptr<arrow::Array>& value) {
409
+ return isOptional
410
+ ? ArrowTypeAsYqlString<true , i64 >(targetType, value, multiplier)
411
+ : ArrowTypeAsYqlString<false , i64 >(targetType, value, multiplier);
412
+ };
413
+ }
414
+
415
+ TColumnConverter ArrowDate64AsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
416
+ return [targetType, isOptional, multiplier=GetMultiplierForDatetime (dateUnit)](const std::shared_ptr<arrow::Array>& value) {
417
+ return isOptional
418
+ ? ArrowTypeAsYqlString<true , i64 >(targetType, value, multiplier, " %Y-%m-%d" )
419
+ : ArrowTypeAsYqlString<false , i64 >(targetType, value, multiplier, " %Y-%m-%d" );
420
+ };
421
+ }
422
+
423
+ TColumnConverter ArrowDate32AsYqlString (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
424
+ return [targetType, isOptional, multiplier=GetMultiplierForTimestamp (dateUnit)](const std::shared_ptr<arrow::Array>& value) {
425
+ return isOptional
426
+ ? ArrowTypeAsYqlString<true , i32 >(targetType, value, multiplier, " %Y-%m-%d" )
427
+ : ArrowTypeAsYqlString<false , i32 >(targetType, value, multiplier, " %Y-%m-%d" );
428
+ };
429
+ }
430
+
376
431
TColumnConverter ArrowDate32AsYqlDate (const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit unit) {
377
432
if (unit == arrow::DateUnit::MILLI) {
378
433
throw parquet::ParquetException (TStringBuilder () << " millisecond accuracy does not fit into the date" );
@@ -457,6 +512,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
457
512
return ArrowDate32AsYqlDatetime (targetType, isOptional, dateType.unit ());
458
513
case NUdf::EDataSlot::Timestamp:
459
514
return ArrowDate32AsYqlTimestamp (targetType, isOptional, dateType.unit ());
515
+ case NUdf::EDataSlot::String:
516
+ case NUdf::EDataSlot::Utf8:
517
+ return ArrowDate32AsYqlString (targetType, isOptional, dateType.unit ());
460
518
default :
461
519
return {};
462
520
}
@@ -469,6 +527,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
469
527
return ArrowDate64AsYqlDatetime (targetType, isOptional, dateType.unit ());
470
528
case NUdf::EDataSlot::Timestamp:
471
529
return ArrowDate64AsYqlTimestamp (targetType, isOptional, dateType.unit ());
530
+ case NUdf::EDataSlot::String:
531
+ case NUdf::EDataSlot::Utf8:
532
+ return ArrowDate64AsYqlString (targetType, isOptional, dateType.unit ());
472
533
default :
473
534
return {};
474
535
}
@@ -480,10 +541,14 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
480
541
return ArrowTimestampAsYqlDatetime (targetType, isOptional, timestampType.unit ());
481
542
case NUdf::EDataSlot::Timestamp:
482
543
return ArrowTimestampAsYqlTimestamp (targetType, isOptional, timestampType.unit ());
544
+ case NUdf::EDataSlot::String:
545
+ case NUdf::EDataSlot::Utf8:
546
+ return ArrowTimestampAsYqlString (targetType, isOptional, timestampType.unit ());
483
547
default :
484
548
return {};
485
549
}
486
550
}
551
+ case arrow::Type::STRING:
487
552
case arrow::Type::BINARY: {
488
553
switch (slotItem) {
489
554
case NUdf::EDataSlot::Datetime:
0 commit comments