Skip to content

Commit 09aaeb0

Browse files
authored
Merge 29ff074 into 56e035d
2 parents 56e035d + 29ff074 commit 09aaeb0

File tree

2 files changed

+519
-172
lines changed

2 files changed

+519
-172
lines changed

ydb/library/yql/providers/s3/actors/yql_arrow_column_converters.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,37 @@ std::shared_ptr<arrow::Array> ArrowTypeAsYqlTimestamp(const std::shared_ptr<arro
180180
return builder.Build(true).make_array();
181181
}
182182

183+
template <bool isOptional, typename TArrowType>
184+
std::shared_ptr<arrow::Array> ArrowTypeAsYqlString(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, ui64 multiplier, const TString& format = {}) {
185+
::NYql::NUdf::TStringArrayBuilder<arrow::BinaryType, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length());
186+
::NYql::NUdf::TFixedSizeBlockReader<TArrowType, isOptional> reader;
187+
for (i64 i = 0; i < value->length(); ++i) {
188+
const NUdf::TBlockItem item = reader.GetItem(*value->data(), i);
189+
if constexpr (isOptional) {
190+
if (!item) {
191+
builder.Add(item);
192+
continue;
193+
}
194+
} else if (!item) {
195+
throw parquet::ParquetException(TStringBuilder() << "null value for timestamp could not be represented in non-optional type");
196+
}
197+
198+
const TArrowType baseValue = item.As<TArrowType>();
199+
if (baseValue < 0 && baseValue > static_cast<i64>(::NYql::NUdf::MAX_TIMESTAMP)) {
200+
throw parquet::ParquetException(TStringBuilder() << "timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << "]: " << baseValue);
201+
}
202+
203+
if (static_cast<ui64>(baseValue) > ::NYql::NUdf::MAX_TIMESTAMP / multiplier) {
204+
throw parquet::ParquetException(TStringBuilder() << "timestamp in parquet is out of range [0, " << ::NYql::NUdf::MAX_TIMESTAMP << "] after transformation: " << baseValue);
205+
}
206+
207+
const ui64 v = baseValue * multiplier;
208+
TString result = format ? TInstant::FromValue(v).FormatGmTime(format.c_str()) : TInstant::FromValue(v).ToString();
209+
builder.Add(NUdf::TBlockItem(NUdf::TStringRef(result.c_str(), result.Size())));
210+
}
211+
return builder.Build(true).make_array();
212+
}
213+
183214
template <bool isOptional>
184215
std::shared_ptr<arrow::Array> ArrowStringAsYqlTimestamp(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) {
185216
::NYql::NUdf::TFixedSizeArrayBuilder<ui64, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length());
@@ -373,6 +404,30 @@ TColumnConverter ArrowTimestampAsYqlTimestamp(const std::shared_ptr<arrow::DataT
373404
};
374405
}
375406

407+
TColumnConverter ArrowTimestampAsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::TimeUnit::type timeUnit) {
408+
return [targetType, isOptional, multiplier=GetMultiplierForTimestamp(timeUnit)](const std::shared_ptr<arrow::Array>& value) {
409+
return isOptional
410+
? ArrowTypeAsYqlString<true, i64>(targetType, value, multiplier)
411+
: ArrowTypeAsYqlString<false, i64>(targetType, value, multiplier);
412+
};
413+
}
414+
415+
TColumnConverter ArrowDate64AsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
416+
return [targetType, isOptional, multiplier=GetMultiplierForDatetime(dateUnit)](const std::shared_ptr<arrow::Array>& value) {
417+
return isOptional
418+
? ArrowTypeAsYqlString<true, i64>(targetType, value, multiplier, "%Y-%m-%d")
419+
: ArrowTypeAsYqlString<false, i64>(targetType, value, multiplier, "%Y-%m-%d");
420+
};
421+
}
422+
423+
TColumnConverter ArrowDate32AsYqlString(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit dateUnit) {
424+
return [targetType, isOptional, multiplier=GetMultiplierForTimestamp(dateUnit)](const std::shared_ptr<arrow::Array>& value) {
425+
return isOptional
426+
? ArrowTypeAsYqlString<true, i32>(targetType, value, multiplier, "%Y-%m-%d")
427+
: ArrowTypeAsYqlString<false, i32>(targetType, value, multiplier, "%Y-%m-%d");
428+
};
429+
}
430+
376431
TColumnConverter ArrowDate32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit unit) {
377432
if (unit == arrow::DateUnit::MILLI) {
378433
throw parquet::ParquetException(TStringBuilder() << "millisecond accuracy does not fit into the date");
@@ -457,6 +512,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
457512
return ArrowDate32AsYqlDatetime(targetType, isOptional, dateType.unit());
458513
case NUdf::EDataSlot::Timestamp:
459514
return ArrowDate32AsYqlTimestamp(targetType, isOptional, dateType.unit());
515+
case NUdf::EDataSlot::String:
516+
case NUdf::EDataSlot::Utf8:
517+
return ArrowDate32AsYqlString(targetType, isOptional, dateType.unit());
460518
default:
461519
return {};
462520
}
@@ -469,6 +527,9 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
469527
return ArrowDate64AsYqlDatetime(targetType, isOptional, dateType.unit());
470528
case NUdf::EDataSlot::Timestamp:
471529
return ArrowDate64AsYqlTimestamp(targetType, isOptional, dateType.unit());
530+
case NUdf::EDataSlot::String:
531+
case NUdf::EDataSlot::Utf8:
532+
return ArrowDate64AsYqlString(targetType, isOptional, dateType.unit());
472533
default:
473534
return {};
474535
}
@@ -480,10 +541,14 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or
480541
return ArrowTimestampAsYqlDatetime(targetType, isOptional, timestampType.unit());
481542
case NUdf::EDataSlot::Timestamp:
482543
return ArrowTimestampAsYqlTimestamp(targetType, isOptional, timestampType.unit());
544+
case NUdf::EDataSlot::String:
545+
case NUdf::EDataSlot::Utf8:
546+
return ArrowTimestampAsYqlString(targetType, isOptional, timestampType.unit());
483547
default:
484548
return {};
485549
}
486550
}
551+
case arrow::Type::STRING:
487552
case arrow::Type::BINARY: {
488553
switch (slotItem) {
489554
case NUdf::EDataSlot::Datetime:

0 commit comments

Comments
 (0)