@@ -99,22 +99,22 @@ TConclusion<std::shared_ptr<TDataContainer>> AdaptColumnsImpl(
9999}
100100
101101template <class TDataContainer , class TStringContainer >
102- std::shared_ptr<TDataContainer> ExtractImpl (const TColumnOperator::EExtractProblemsPolicy & policy,
102+ std::shared_ptr<TDataContainer> ExtractImpl (const TColumnOperator::EAbsentFieldPolicy & policy,
103103 const std::shared_ptr<TDataContainer>& incoming, const std::vector<TStringContainer>& columnNames) {
104104 AFL_VERIFY (incoming);
105105 AFL_VERIFY (columnNames.size ());
106106 auto result = ExtractColumnsValidateImpl (incoming, columnNames);
107107 switch (policy) {
108- case TColumnOperator::EExtractProblemsPolicy ::Verify:
108+ case TColumnOperator::EAbsentFieldPolicy ::Verify:
109109 AFL_VERIFY ((ui32)result->num_columns () == columnNames.size ())(" schema" , incoming->schema ()->ToString ())(
110110 " required" , TColumnNameAccessor<TStringContainer>::DebugString (columnNames));
111111 break ;
112- case TColumnOperator::EExtractProblemsPolicy::Null :
112+ case TColumnOperator::EAbsentFieldPolicy::Error :
113113 if ((ui32)result->num_columns () != columnNames.size ()) {
114114 return nullptr ;
115115 }
116116 break ;
117- case TColumnOperator::EExtractProblemsPolicy ::Skip:
117+ case TColumnOperator::EAbsentFieldPolicy ::Skip:
118118 break ;
119119 }
120120 return result;
@@ -211,8 +211,8 @@ NKikimr::TConclusion<std::shared_ptr<arrow::Table>> TColumnOperator::Reorder(
211211}
212212namespace {
213213template <class TDataContainer , class TSchemaImpl >
214- TConclusion<TSchemaSubset> BuildSequentialSubsetImpl (
215- const std::shared_ptr<TDataContainer >& srcBatch , const std::shared_ptr<TSchemaImpl>& dstSchema ) {
214+ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl (const std::shared_ptr<TDataContainer>& srcBatch,
215+ const std::shared_ptr<TSchemaImpl >& dstSchema , const TColumnOperator::ECheckFieldTypesPolicy checkFieldTypesPolicy ) {
216216 AFL_VERIFY (srcBatch);
217217 AFL_VERIFY (dstSchema);
218218 if (dstSchema->num_fields () < srcBatch->schema ()->num_fields ()) {
@@ -228,10 +228,20 @@ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl(
228228 ++itDst;
229229 } else {
230230 fieldIdx.emplace (itDst - dstSchema->fields ().begin ());
231- if (!(*itDst)->Equals (*itSrc)) {
232- AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
233- " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
234- return TConclusionStatus::Fail (" incompatible column types" );
231+ if (checkFieldTypesPolicy != TColumnOperator::ECheckFieldTypesPolicy::Ignore && (*itDst)->Equals (*itSrc)) {
232+ switch (checkFieldTypesPolicy) {
233+ case TColumnOperator::ECheckFieldTypesPolicy::Error: {
234+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
235+ " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
236+ return TConclusionStatus::Fail (" incompatible column types" );
237+ }
238+ case TColumnOperator::ECheckFieldTypesPolicy::Verify: {
239+ AFL_VERIFY (false )(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
240+ " column_type" , (*itDst)->ToString (true ))(" incoming_type" , (*itSrc)->ToString (true ));
241+ }
242+ case TColumnOperator::ECheckFieldTypesPolicy::Ignore:
243+ AFL_VERIFY (false );
244+ }
235245 }
236246
237247 ++itDst;
@@ -249,7 +259,82 @@ TConclusion<TSchemaSubset> BuildSequentialSubsetImpl(
249259
250260TConclusion<TSchemaSubset> TColumnOperator::BuildSequentialSubset (
251261 const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<NArrow::TSchemaLite>& dstSchema) {
252- return BuildSequentialSubsetImpl (incoming, dstSchema);
262+ return BuildSequentialSubsetImpl (incoming, dstSchema, DifferentColumnTypesPolicy);
263+ }
264+ namespace {
265+ template <class TDataContainer >
266+ TConclusion<std::shared_ptr<TDataContainer>> AdaptIncomingToDestinationExtImpl (const std::shared_ptr<TDataContainer>& incoming,
267+ const std::shared_ptr<TSchemaLite>& dstSchema, const std::function<TConclusionStatus(const ui32, const i32 )>& checker,
268+ const std::function<i32(const std::string&)>& nameResolver,
269+ const TColumnOperator::ECheckFieldTypesPolicy differentColumnTypesPolicy,
270+ const TColumnOperator::EAbsentFieldPolicy absentColumnPolicy) {
271+ struct TFieldData {
272+ ui32 Index;
273+ std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn> Column;
274+ bool operator <(const TFieldData& item) const {
275+ return Index < item.Index ;
276+ }
277+ };
278+ AFL_VERIFY (incoming);
279+ AFL_VERIFY (dstSchema);
280+ std::vector<TFieldData> resultColumns;
281+ resultColumns.reserve (incoming->num_columns ());
282+ ui32 idx = 0 ;
283+ for (auto & srcField : incoming->schema ()->fields ()) {
284+ const int dstIndex = nameResolver (srcField->name ());
285+ if (dstIndex > -1 ) {
286+ const auto & dstField = dstSchema->GetFieldByIndexVerified (dstIndex);
287+ switch (differentColumnTypesPolicy) {
288+ case TColumnOperator::ECheckFieldTypesPolicy::Verify:
289+ AFL_VERIFY (dstField->Equals (srcField))(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
290+ " dst_column" , dstField->ToString (true ))(" src_column" , srcField->ToString (true ));
291+ break ;
292+ case TColumnOperator::ECheckFieldTypesPolicy::Error:
293+ if (!dstField->Equals (srcField)) {
294+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " invalid_column_type" )(
295+ " dst_column" , dstField->ToString (true ))(" src_column" , srcField->ToString (true ));
296+ return TConclusionStatus::Fail (" incompatible column types for '" + dstField->name () + " '" );
297+ }
298+ break ;
299+ case TColumnOperator::ECheckFieldTypesPolicy::Ignore:
300+ break ;
301+ }
302+ auto resultCheck = checker (idx, dstIndex);
303+ if (resultCheck.IsFail ()) {
304+ return resultCheck;
305+ }
306+ resultColumns.emplace_back (TFieldData{ .Index = (ui32)dstIndex, .Column = incoming->column (idx) });
307+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Skip) {
308+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Verify) {
309+ AFL_VERIFY (false )(" event" , " cannot_use_incoming_batch" )(" reason" , " absent_field" )(" dst_column" , srcField->ToString (true ));
310+ } else if (absentColumnPolicy == TColumnOperator::EAbsentFieldPolicy::Error) {
311+ AFL_ERROR (NKikimrServices::ARROW_HELPER)(" event" , " cannot_use_incoming_batch" )(" reason" , " absent_field" )(
312+ " dst_column" , srcField->ToString (true ));
313+ return TConclusionStatus::Fail (" not found column '" + srcField->name () + " '" );
314+ } else {
315+ AFL_VERIFY (false );
316+ }
317+ ++idx;
318+ }
319+ if (resultColumns.empty ()) {
320+ return TConclusionStatus::Fail (" not found any column" );
321+ }
322+ std::sort (resultColumns.begin (), resultColumns.end ());
323+ std::vector<std::shared_ptr<arrow::Field>> fields;
324+ std::vector<std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn>> columns;
325+ columns.reserve (resultColumns.size ());
326+ fields.reserve (resultColumns.size ());
327+ for (auto && i : resultColumns) {
328+ fields.emplace_back (dstSchema->field (i.Index ));
329+ columns.emplace_back (i.Column );
330+ }
331+ return NAdapter::TDataBuilderPolicy<TDataContainer>::Build (std::make_shared<arrow::Schema>(fields), std::move (columns), incoming->num_rows ());
332+ }
333+ } // namespace
334+ TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::AdaptIncomingToDestinationExt (
335+ const std::shared_ptr<arrow::RecordBatch>& incoming, const std::shared_ptr<TSchemaLite>& dstSchema,
336+ const std::function<TConclusionStatus(const ui32, const i32 )>& checker, const std::function<i32(const std::string&)>& nameResolver) const {
337+ return AdaptIncomingToDestinationExtImpl (incoming, dstSchema, checker, nameResolver, DifferentColumnTypesPolicy, AbsentColumnPolicy);
253338}
254339
255340} // namespace NKikimr::NArrow
0 commit comments