1
1
#include " csv_arrow.h"
2
2
3
- #include < ydb/core/formats/arrow/arrow_helpers.h>
4
- #include < ydb/core/formats/arrow/serializer/stream.h>
5
-
3
+ #include < contrib/libs/apache/arrow/cpp/src/arrow/array.h>
4
+ #include < contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h>
6
5
#include < contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h>
7
6
#include < contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h>
8
7
#include < util/string/join.h>
@@ -43,29 +42,6 @@ class TimestampIntParser: public arrow::TimestampParser {
43
42
44
43
}
45
44
46
- arrow::Result<TArrowCSV> TArrowCSV::Create (const TVector<std::pair<TString, NScheme::TTypeInfo>>& columns, bool header, const std::set<std::string>& notNullColumns) {
47
- TVector<TString> errors;
48
- TColummns convertedColumns;
49
- convertedColumns.reserve (columns.size ());
50
- for (auto & [name, type] : columns) {
51
- const auto arrowType = NArrow::GetArrowType (type);
52
- if (!arrowType.ok ()) {
53
- errors.emplace_back (" column " + name + " : " + arrowType.status ().ToString ());
54
- continue ;
55
- }
56
- const auto csvArrowType = NArrow::GetCSVArrowType (type);
57
- if (!csvArrowType.ok ()) {
58
- errors.emplace_back (" column " + name + " : " + csvArrowType.status ().ToString ());
59
- continue ;
60
- }
61
- convertedColumns.emplace_back (TColumnInfo{name, *arrowType, *csvArrowType});
62
- }
63
- if (!errors.empty ()) {
64
- return arrow::Status::TypeError (ErrorPrefix () + " columns errors: " + JoinSeq (" ; " , errors));
65
- }
66
- return TArrowCSV (convertedColumns, header, notNullColumns);
67
- }
68
-
69
45
TArrowCSV::TArrowCSV (const TColummns& columns, bool header, const std::set<std::string>& notNullColumns)
70
46
: ReadOptions(arrow::csv::ReadOptions::Defaults())
71
47
, ParseOptions(arrow::csv::ParseOptions::Defaults())
@@ -107,6 +83,27 @@ TArrowCSV::TArrowCSV(const TColummns& columns, bool header, const std::set<std::
107
83
SetNullValue (); // set default null value
108
84
}
109
85
86
+ namespace {
87
+
88
+ template <class TBuilder , class TOriginalArray >
89
+ std::shared_ptr<arrow::Array> ConvertArray (std::shared_ptr<arrow::ArrayData> data, ui64 dev) {
90
+ auto originalArr = std::make_shared<TOriginalArray>(data);
91
+ TBuilder aBuilder;
92
+ Y_ABORT_UNLESS (aBuilder.Reserve (originalArr->length ()).ok ());
93
+ for (long i = 0 ; i < originalArr->length (); ++i) {
94
+ if (originalArr->IsNull (i)) {
95
+ Y_ABORT_UNLESS (aBuilder.AppendNull ().ok ());
96
+ } else {
97
+ aBuilder.UnsafeAppend (originalArr->Value (i) / dev);
98
+ }
99
+ }
100
+ auto res = aBuilder.Finish ();
101
+ Y_ABORT_UNLESS (res.ok ());
102
+ return *res;
103
+ }
104
+
105
+ }
106
+
110
107
std::shared_ptr<arrow::RecordBatch> TArrowCSV::ConvertColumnTypes (std::shared_ptr<arrow::RecordBatch> parsedBatch) const {
111
108
if (!parsedBatch) {
112
109
return nullptr ;
@@ -134,59 +131,20 @@ std::shared_ptr<arrow::RecordBatch> TArrowCSV::ConvertColumnTypes(std::shared_pt
134
131
if (fArr ->type ()->Equals (originalType)) {
135
132
resultColumns.emplace_back (fArr );
136
133
} else if (fArr ->type ()->id () == arrow::TimestampType::type_id) {
137
- arrow::Result<std::shared_ptr<arrow::Array>> arrResult;
138
- {
139
- std::shared_ptr<arrow::TimestampArray> i64Arr = std::make_shared<arrow::TimestampArray>(fArr ->data ());
140
- if (originalType->id () == arrow::UInt16Type::type_id) {
141
- arrow::UInt16Builder aBuilder;
142
- Y_ABORT_UNLESS (aBuilder.Reserve (parsedBatch->num_rows ()).ok ());
143
- for (long i = 0 ; i < parsedBatch->num_rows (); ++i) {
144
- if (i64Arr->IsNull (i)) {
145
- Y_ABORT_UNLESS (aBuilder.AppendNull ().ok ());
146
- } else {
147
- aBuilder.UnsafeAppend (i64Arr->Value (i) / 86400ull );
148
- }
149
- }
150
- arrResult = aBuilder.Finish ();
151
- } else if (originalType->id () == arrow::UInt32Type::type_id) {
152
- arrow::UInt32Builder aBuilder;
153
- Y_ABORT_UNLESS (aBuilder.Reserve (parsedBatch->num_rows ()).ok ());
154
- for (long i = 0 ; i < parsedBatch->num_rows (); ++i) {
155
- if (i64Arr->IsNull (i)) {
156
- Y_ABORT_UNLESS (aBuilder.AppendNull ().ok ());
157
- } else {
158
- aBuilder.UnsafeAppend (i64Arr->Value (i));
159
- }
160
- }
161
- arrResult = aBuilder.Finish ();
162
- } else if (originalType->id () == arrow::Int32Type::type_id) {
163
- arrow::Int32Builder aBuilder;
164
- Y_ABORT_UNLESS (aBuilder.Reserve (parsedBatch->num_rows ()).ok ());
165
- for (long i = 0 ; i < parsedBatch->num_rows (); ++i) {
166
- if (i64Arr->IsNull (i)) {
167
- Y_ABORT_UNLESS (aBuilder.AppendNull ().ok ());
168
- } else {
169
- aBuilder.UnsafeAppend (i64Arr->Value (i) / 86400 );
170
- }
171
- }
172
- arrResult = aBuilder.Finish ();
173
- } else if (originalType->id () == arrow::Int64Type::type_id) {
174
- arrow::Int64Builder aBuilder;
175
- Y_ABORT_UNLESS (aBuilder.Reserve (parsedBatch->num_rows ()).ok ());
176
- for (long i = 0 ; i < parsedBatch->num_rows (); ++i) {
177
- if (i64Arr->IsNull (i)) {
178
- Y_ABORT_UNLESS (aBuilder.AppendNull ().ok ());
179
- } else {
180
- aBuilder.UnsafeAppend (i64Arr->Value (i));
181
- }
182
- }
183
- arrResult = aBuilder.Finish ();
184
- } else {
134
+ resultColumns.emplace_back ([originalType, fArr ]() {
135
+ switch (originalType->id ()) {
136
+ case arrow::UInt16Type::type_id: // Date
137
+ return ConvertArray<arrow::UInt16Builder, arrow::TimestampArray>(fArr ->data (), 86400 );
138
+ case arrow::UInt32Type::type_id: // Datetime
139
+ return ConvertArray<arrow::UInt32Builder, arrow::TimestampArray>(fArr ->data (), 1 );
140
+ case arrow::Int32Type::type_id: // Date32
141
+ return ConvertArray<arrow::Int32Builder, arrow::TimestampArray>(fArr ->data (), 86400 );
142
+ case arrow::Int64Type::type_id:// Datetime64, Timestamp64
143
+ return ConvertArray<arrow::Int64Builder, arrow::TimestampArray>(fArr ->data (), 1 );
144
+ default :
185
145
Y_ABORT_UNLESS (false );
186
146
}
187
- }
188
- Y_ABORT_UNLESS (arrResult.ok ());
189
- resultColumns.emplace_back (*arrResult);
147
+ }());
190
148
} else {
191
149
Y_ABORT_UNLESS (false );
192
150
}
@@ -204,7 +162,7 @@ std::shared_ptr<arrow::RecordBatch> TArrowCSV::ReadNext(const TString& csv, TStr
204
162
return {};
205
163
}
206
164
207
- auto buffer = std::make_shared<NArrow::NSerialization::TBufferOverString> (csv);
165
+ auto buffer = std::make_shared<arrow::Buffer>( arrow::util::string_view (csv. c_str (), csv. length ()) );
208
166
auto input = std::make_shared<arrow::io::BufferReader>(buffer);
209
167
auto res = arrow::csv::StreamingReader::Make (arrow::io::default_io_context (), input,
210
168
ReadOptions, ParseOptions, ConvertOptions);
@@ -249,11 +207,9 @@ std::shared_ptr<arrow::RecordBatch> TArrowCSV::ReadNext(const TString& csv, TStr
249
207
return {};
250
208
}
251
209
252
- if (batch && ResultColumns.size ()) {
253
- batch = NArrow::TColumnOperator ().ErrorIfAbsent ().Extract (batch, ResultColumns);
254
- if (!batch) {
255
- errString = ErrorPrefix () + " not all result columns present" ;
256
- }
210
+ if (batch && ResultColumns.size () && batch->schema ()->fields ().size () != ResultColumns.size ()) {
211
+ errString = ErrorPrefix () + " not all result columns present" ;
212
+ batch.reset ();
257
213
}
258
214
return batch;
259
215
}
@@ -279,5 +235,34 @@ std::shared_ptr<arrow::RecordBatch> TArrowCSV::ReadSingleBatch(const TString& cs
279
235
}
280
236
return batch;
281
237
}
238
+ std::shared_ptr<arrow::RecordBatch> TArrowCSV::ReadSingleBatch (const TString& csv, const Ydb::Formats::CsvSettings& csvSettings, TString& errString) {
239
+ const auto & quoting = csvSettings.quoting ();
240
+ if (quoting.quote_char ().length () > 1 ) {
241
+ errString = ErrorPrefix () + " Wrong quote char '" + quoting.quote_char () + " '" ;
242
+ return {};
243
+ }
244
+
245
+ const char qchar = quoting.quote_char ().empty () ? ' "' : quoting.quote_char ().front ();
246
+ SetQuoting (!quoting.disabled (), qchar, !quoting.double_quote_disabled ());
247
+ if (csvSettings.delimiter ()) {
248
+ if (csvSettings.delimiter ().size () != 1 ) {
249
+ errString = ErrorPrefix () + " Invalid delimitr in csv: " + csvSettings.delimiter ();
250
+ return {};
251
+ }
252
+ SetDelimiter (csvSettings.delimiter ().front ());
253
+ }
254
+ SetSkipRows (csvSettings.skip_rows ());
255
+
256
+ if (csvSettings.null_value ()) {
257
+ SetNullValue (csvSettings.null_value ());
258
+ }
259
+
260
+ if (csv.size () > NKikimr::NFormats::TArrowCSV::DEFAULT_BLOCK_SIZE) {
261
+ ui32 blockSize = NKikimr::NFormats::TArrowCSV::DEFAULT_BLOCK_SIZE;
262
+ blockSize *= csv.size () / blockSize + 1 ;
263
+ SetBlockSize (blockSize);
264
+ }
265
+ return ReadSingleBatch (csv, errString);
266
+ }
282
267
283
268
}
0 commit comments