-
Notifications
You must be signed in to change notification settings - Fork 735
format setting supported for date type #11608
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2b8ea06
6514377
97ffe30
9610885
2bcc132
bdeccfb
ff77d52
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -238,10 +238,10 @@ std::shared_ptr<arrow::Array> ArrowStringAsYqlTimestamp(const std::shared_ptr<ar | |
| return builder.Build(true).make_array(); | ||
| } | ||
|
|
||
| template <bool isOptional> | ||
| std::shared_ptr<arrow::Array> ArrowDate32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value) { | ||
| template <bool isOptional, typename TArrowType> | ||
| std::shared_ptr<arrow::Array> ArrowTypeAsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value) { | ||
| ::NYql::NUdf::TFixedSizeArrayBuilder<ui16, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length()); | ||
| ::NYql::NUdf::TFixedSizeBlockReader<i32, isOptional> reader; | ||
| ::NYql::NUdf::TFixedSizeBlockReader<TArrowType, isOptional> reader; | ||
| for (i64 i = 0; i < value->length(); ++i) { | ||
| const NUdf::TBlockItem item = reader.GetItem(*value->data(), i); | ||
| if constexpr (isOptional) { | ||
|
|
@@ -253,7 +253,7 @@ std::shared_ptr<arrow::Array> ArrowDate32AsYqlDate(const std::shared_ptr<arrow:: | |
| throw parquet::ParquetException(TStringBuilder() << "null value for date could not be represented in non-optional type"); | ||
| } | ||
|
|
||
| const i32 v = item.As<i32>(); | ||
| const TArrowType v = item.As<TArrowType>(); | ||
| if (v < 0 || v > ::NYql::NUdf::MAX_DATE) { | ||
| throw parquet::ParquetException(TStringBuilder() << "date in parquet is out of range [0, " << ::NYql::NUdf::MAX_DATE << "]: " << v); | ||
| } | ||
|
|
@@ -262,6 +262,31 @@ std::shared_ptr<arrow::Array> ArrowDate32AsYqlDate(const std::shared_ptr<arrow:: | |
| return builder.Build(true).make_array(); | ||
| } | ||
|
|
||
| template <bool isOptional> | ||
| std::shared_ptr<arrow::Array> ArrowStringAsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, const std::shared_ptr<arrow::Array>& value, const NDB::FormatSettings& formatSettings) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Давай еще остальные конвертеры поддержим по аналогии с Datetime/Timestamp? |
||
| ::NYql::NUdf::TFixedSizeArrayBuilder<ui32, isOptional> builder(NKikimr::NMiniKQL::TTypeInfoHelper(), targetType, *arrow::system_memory_pool(), value->length()); | ||
| ::NYql::NUdf::TStringBlockReader<arrow::BinaryType, isOptional, NKikimr::NUdf::EDataSlot::String> reader; | ||
| for (i64 i = 0; i < value->length(); ++i) { | ||
| NUdf::TBlockItem item = reader.GetItem(*value->data(), i); | ||
|
|
||
| if constexpr (isOptional) { | ||
| if (!item) { | ||
| builder.Add(item); | ||
| continue; | ||
| } | ||
| } else if (!item) { | ||
| throw parquet::ParquetException(TStringBuilder() << "null value for date could not be represented in non-optional type"); | ||
| } | ||
|
|
||
| auto ref = item.AsStringRef(); | ||
| NDB::ReadBufferFromMemory rb{ref.Data(), ref.Size()}; | ||
| uint16_t result = 0; | ||
| parseImpl<NDB::DataTypeDate>(result, rb, nullptr, formatSettings); | ||
| builder.Add(NUdf::TBlockItem(static_cast<ui16>(result))); | ||
| } | ||
| return builder.Build(true).make_array(); | ||
| } | ||
|
|
||
| TColumnConverter ArrowUInt32AsYqlDatetime(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
|
|
@@ -430,14 +455,62 @@ TColumnConverter ArrowDate32AsYqlString(const std::shared_ptr<arrow::DataType>& | |
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowUInt16AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowTypeAsYqlDate<true, ui16>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, ui16>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowInt32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowTypeAsYqlDate<true, i32>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, i32>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowUInt32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowTypeAsYqlDate<true, ui32>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, ui32>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowInt64AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowTypeAsYqlDate<true, i64>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, i64>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowUInt64AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional) { | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowTypeAsYqlDate<true, ui64>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, ui64>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowDate32AsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, arrow::DateUnit unit) { | ||
| if (unit == arrow::DateUnit::MILLI) { | ||
| throw parquet::ParquetException(TStringBuilder() << "millisecond accuracy does not fit into the date"); | ||
| } | ||
| return [targetType, isOptional](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowDate32AsYqlDate<true>(targetType, value) | ||
| : ArrowDate32AsYqlDate<false>(targetType, value); | ||
| ? ArrowTypeAsYqlDate<true, i32>(targetType, value) | ||
| : ArrowTypeAsYqlDate<false, i32>(targetType, value); | ||
| }; | ||
| } | ||
|
|
||
| TColumnConverter ArrowStringAsYqlDate(const std::shared_ptr<arrow::DataType>& targetType, bool isOptional, const NDB::FormatSettings& formatSettings) { | ||
| return [targetType, isOptional, formatSettings](const std::shared_ptr<arrow::Array>& value) { | ||
| return isOptional | ||
| ? ArrowStringAsYqlDate<true>(targetType, value, formatSettings) | ||
| : ArrowStringAsYqlDate<false>(targetType, value, formatSettings); | ||
| }; | ||
| } | ||
|
|
||
|
|
@@ -457,6 +530,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| switch (originalType->id()) { | ||
| case arrow::Type::UINT16: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowUInt16AsYqlDate(targetType, isOptional); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowUInt16AsYqlDatetime(targetType, isOptional); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
@@ -467,6 +542,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| } | ||
| case arrow::Type::INT32: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowInt32AsYqlDate(targetType, isOptional); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowInt32AsYqlDatetime(targetType, isOptional); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
@@ -477,6 +554,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| } | ||
| case arrow::Type::UINT32: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowUInt32AsYqlDate(targetType, isOptional); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowUInt32AsYqlDatetime(targetType, isOptional); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
@@ -487,6 +566,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| } | ||
| case arrow::Type::INT64: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowInt64AsYqlDate(targetType, isOptional); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowInt64AsYqlDatetime(targetType, isOptional); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
@@ -497,6 +578,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| } | ||
| case arrow::Type::UINT64: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowUInt64AsYqlDate(targetType, isOptional); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowUInt64AsYqlDatetime(targetType, isOptional); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
@@ -553,6 +636,8 @@ TColumnConverter BuildCustomConverter(const std::shared_ptr<arrow::DataType>& or | |
| case arrow::Type::STRING: | ||
| case arrow::Type::BINARY: { | ||
| switch (slotItem) { | ||
| case NUdf::EDataSlot::Date: | ||
| return ArrowStringAsYqlDate(targetType, isOptional, formatSettings); | ||
| case NUdf::EDataSlot::Datetime: | ||
| return ArrowStringAsYqlDateTime(targetType, isOptional, formatSettings); | ||
| case NUdf::EDataSlot::Timestamp: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -271,6 +271,15 @@ class TS3DataSinkTypeAnnotationTransformer : public TVisitorTransformerBase { | |
| return true; | ||
| } | ||
|
|
||
| if (name == "data.date.format") { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Для parquet не дописана часть кода: https://github.com/ydb-platform/ydb/blob/main/ydb/library/yql/providers/s3/actors/yql_arrow_column_converters.cpp#L557
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. дописал |
||
| const auto& value = setting.Tail(); | ||
| if (!EnsureAtom(value, ctx)) { | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| if (name == "csvdelimiter") { | ||
| const auto& value = setting.Tail(); | ||
| if (!EnsureAtom(value, ctx)) { | ||
|
|
@@ -292,7 +301,7 @@ class TS3DataSinkTypeAnnotationTransformer : public TVisitorTransformerBase { | |
| return true; | ||
| }; | ||
|
|
||
| if (!EnsureValidSettings(*input->Child(TS3Target::idx_Settings), {"compression", "partitionedby", "mode", "userschema", "data.datetime.formatname", "data.datetime.format", "data.timestamp.formatname", "data.timestamp.format", "csvdelimiter", "filepattern"}, validator, ctx)) { | ||
| if (!EnsureValidSettings(*input->Child(TS3Target::idx_Settings), {"compression", "partitionedby", "mode", "userschema", "data.datetime.formatname", "data.datetime.format", "data.timestamp.formatname", "data.timestamp.format", "data.date.format", "csvdelimiter", "filepattern"}, validator, ctx)) { | ||
| return TStatus::Error; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
А здесь для date нету общееизвестных форматов POSIX/ISO как для Timestamp?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
POSIX/ISO для даты выглядят одинакого - %Y-%m-%d
по совместительству это совпадает с дефолтным парсингом, поэтому показалось бессмысленно добавлять несколько одинаковых форматов