33#include < arrow/table.h>
44#include < arrow/csv/options.h>
55#include < arrow/csv/reader.h>
6+ #include < arrow/json/options.h>
7+ #include < arrow/json/reader.h>
68#include < parquet/arrow/reader.h>
79
810#include < ydb/core/external_sources/object_storage/events.h>
@@ -182,6 +184,10 @@ struct CsvConfig : public FormatConfig {
182184 arrow::csv::ConvertOptions ConvOpts = arrow::csv::ConvertOptions::Defaults();
183185};
184186
187+ struct JsonConfig : public FormatConfig {
188+ arrow::json::ParseOptions ParseOpts = arrow::json::ParseOptions::Defaults();
189+ };
190+
185191using TsvConfig = CsvConfig;
186192
187193namespace {
@@ -190,23 +196,30 @@ using ArrowField = std::shared_ptr<arrow::Field>;
190196using ArrowFields = std::vector<ArrowField>;
191197
192198std::variant<ArrowFields, TString> InferCsvTypes (std::shared_ptr<arrow::io::RandomAccessFile> file, const CsvConfig& config) {
199+ int64_t fileSize;
200+ if (auto sizeStatus = file->GetSize ().Value (&fileSize); !sizeStatus.ok ()) {
201+ return TStringBuilder{} << " coudn't get file size: " << sizeStatus.ToString ();
202+ }
203+
193204 std::shared_ptr<arrow::csv::TableReader> reader;
194- auto fileSize = static_cast <int32_t >(file->GetSize ().ValueOr (1 << 20 ));
195- fileSize = std::min (fileSize, 1 << 20 );
196205 auto readerStatus = arrow::csv::TableReader::Make (
197- arrow::io::default_io_context (), std::move (file), arrow::csv::ReadOptions{.use_threads = false , .block_size = fileSize}, config.ParseOpts , config.ConvOpts
206+ arrow::io::default_io_context (),
207+ std::move (file),
208+ arrow::csv::ReadOptions{.use_threads = false , .block_size = static_cast <int32_t >(fileSize)},
209+ config.ParseOpts ,
210+ config.ConvOpts
198211 )
199212 .Value (&reader);
200213
201214 if (!readerStatus.ok ()) {
202- return TString{TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << readerStatus.ToString ()};
215+ return TString{TStringBuilder{} << " couldn't open csv/tsv file, check format and compression params: " << readerStatus.ToString ()};
203216 }
204217
205218 std::shared_ptr<arrow::Table> table;
206219 auto tableRes = reader->Read ().Value (&table);
207220
208221 if (!tableRes.ok ()) {
209- return TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << readerStatus .ToString ();
222+ return TStringBuilder{} << " couldn't parse csv/tsv file, check format and compression params: " << tableRes .ToString ();
210223 }
211224
212225 return table->fields ();
@@ -217,24 +230,52 @@ std::variant<ArrowFields, TString> InferParquetTypes(std::shared_ptr<arrow::io::
217230 builder.properties (parquet::ArrowReaderProperties (false ));
218231 auto openStatus = builder.Open (std::move (file));
219232 if (!openStatus.ok ()) {
220- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus.ToString ();
233+ return TStringBuilder{} << " couldn't open parquet file, check format params: " << openStatus.ToString ();
221234 }
222235
223236 std::unique_ptr<parquet::arrow::FileReader> reader;
224237 auto readerStatus = builder.Build (&reader);
225238 if (!readerStatus.ok ()) {
226- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus .ToString ();
239+ return TStringBuilder{} << " couldn't read parquet file, check format params: " << readerStatus .ToString ();
227240 }
228241
229242 std::shared_ptr<arrow::Schema> schema;
230243 auto schemaRes = reader->GetSchema (&schema);
231244 if (!schemaRes.ok ()) {
232- return TStringBuilder{} << " couldn't parse parquet file, check format params: " << openStatus .ToString ();
245+ return TStringBuilder{} << " couldn't parse parquet file, check format params: " << schemaRes .ToString ();
233246 }
234247
235248 return schema->fields ();
236249}
237250
251+ std::variant<ArrowFields, TString> InferJsonTypes (std::shared_ptr<arrow::io::RandomAccessFile> file, const JsonConfig& config) {
252+ int64_t fileSize;
253+ if (auto sizeStatus = file->GetSize ().Value (&fileSize); !sizeStatus.ok ()) {
254+ return TStringBuilder{} << " coudn't get file size: " << sizeStatus.ToString ();
255+ }
256+
257+ std::shared_ptr<arrow::json::TableReader> reader;
258+ auto readerStatus = arrow::json::TableReader::Make (
259+ arrow::default_memory_pool (),
260+ std::move (file),
261+ arrow::json::ReadOptions{.use_threads = false , .block_size = static_cast <int32_t >(fileSize)},
262+ config.ParseOpts
263+ ).Value (&reader);
264+
265+ if (!readerStatus.ok ()) {
266+ return TString{TStringBuilder{} << " couldn't open json file, check format and compression params: " << readerStatus.ToString ()};
267+ }
268+
269+ std::shared_ptr<arrow::Table> table;
270+ auto tableRes = reader->Read ().Value (&table);
271+
272+ if (!tableRes.ok ()) {
273+ return TString{TStringBuilder{} << " couldn't parse json file, check format and compression params: " << tableRes.ToString ()};
274+ }
275+
276+ return table->fields ();
277+ }
278+
238279std::variant<ArrowFields, TString> InferType (EFileFormat format, std::shared_ptr<arrow::io::RandomAccessFile> file, const FormatConfig& config) {
239280 switch (format) {
240281 case EFileFormat::CsvWithNames:
@@ -243,6 +284,9 @@ std::variant<ArrowFields, TString> InferType(EFileFormat format, std::shared_ptr
243284 return InferCsvTypes (std::move (file), static_cast <const TsvConfig&>(config));
244285 case EFileFormat::Parquet:
245286 return InferParquetTypes (std::move (file));
287+ case EFileFormat::JsonEachRow:
288+ case EFileFormat::JsonList:
289+ return InferJsonTypes (std::move (file), static_cast <const JsonConfig&>(config));
246290 case EFileFormat::Undefined:
247291 default :
248292 return std::variant<ArrowFields, TString>{std::in_place_type_t <TString>{}, TStringBuilder{} << " unexpected format: " << ConvertFileFormat (format)};
@@ -259,12 +303,19 @@ std::unique_ptr<TsvConfig> MakeTsvConfig(const THashMap<TString, TString>& param
259303 return config;
260304}
261305
306+ std::unique_ptr<JsonConfig> MakeJsonConfig (const THashMap<TString, TString>&) {
307+ return std::make_unique<JsonConfig>();
308+ }
309+
262310std::unique_ptr<FormatConfig> MakeFormatConfig (EFileFormat format, const THashMap<TString, TString>& params) {
263311 switch (format) {
264312 case EFileFormat::CsvWithNames:
265313 return MakeCsvConfig (params);
266314 case EFileFormat::TsvWithNames:
267315 return MakeTsvConfig (params);
316+ case EFileFormat::JsonEachRow:
317+ case EFileFormat::JsonList:
318+ return MakeJsonConfig (params);
268319 case EFileFormat::Undefined:
269320 default :
270321 return nullptr ;
0 commit comments