GH-32723: [C++][Parquet] Add option to use LARGE* variants of binary types #35825
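For orientation, here is a minimal sketch of how the new reader option is expected to be enabled, pieced together from the APIs this diff touches (set_use_large_binary_variants, FileReaderBuilder). The `buffer` variable is an assumed stand-in for an open Parquet source; this mirrors the ReaderFromSink test helper below rather than prescribing the one true usage:

  // Sketch only: `buffer` is assumed to be a std::shared_ptr<arrow::Buffer>
  // holding Parquet data, as in the tests below.
  parquet::ArrowReaderProperties props = parquet::default_arrow_reader_properties();
  props.set_use_large_binary_variants(true);  // BYTE_ARRAY columns map to large_utf8()/large_binary()

  parquet::arrow::FileReaderBuilder builder;
  PARQUET_THROW_NOT_OK(builder.Open(std::make_shared<::arrow::io::BufferReader>(buffer)));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(builder.properties(props)
                           ->memory_pool(::arrow::default_memory_pool())
                           ->Build(&reader));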

Open. Wants to merge 69 commits into base: main.

Commits (69)
e5e96ec
able to read the file
arthurpassos May 26, 2023
b9b48f8
remove diff out
arthurpassos May 26, 2023
ae62954
intermediate stage, not working properly anymore..
arthurpassos May 29, 2023
34917d5
still not working
arthurpassos May 29, 2023
835b07d
able to read the file again
arthurpassos May 29, 2023
50427c6
move use_binary_large_variants to arrowreaderproperties
arthurpassos May 30, 2023
df65ce7
cleanup a bit
arthurpassos May 30, 2023
e826b8e
back fromByteArray string & binary with setting
arthurpassos May 30, 2023
764ef98
some more adjustments
arthurpassos May 30, 2023
c6244ea
revert some stuff
arthurpassos May 30, 2023
5a4bbb0
revert some stuff
arthurpassos May 30, 2023
2d84e57
improvement
arthurpassos May 30, 2023
90f14df
remove dictionary64
arthurpassos May 30, 2023
b88b024
use 64bit on largebytearray class and initialize binary_large_variant…
arthurpassos May 31, 2023
0b53b05
add chunked string map test
arthurpassos May 31, 2023
f574e2e
add boolean comment
arthurpassos May 31, 2023
295e062
Make ChunkedRecordReader generic by using templates
arthurpassos May 31, 2023
25d7815
Make ByteArrayDictionaryReader generic with the use of templates
arthurpassos May 31, 2023
fe8d67b
make arrowbinaryhelper generic
arthurpassos May 31, 2023
35e5835
Make PlainByteArrayDecoder generic
arthurpassos May 31, 2023
9aff2f3
remove use_binary_large_variant from parquet reader properties
arthurpassos Jun 1, 2023
eb850c4
removed parquet::type::large_Byte_array
arthurpassos Jun 5, 2023
c2aab63
small adjustment
arthurpassos Jun 5, 2023
837ed6c
remove largebytearray class
arthurpassos Jun 6, 2023
35cdb99
simplify largebytearraytype a bit
arthurpassos Jun 6, 2023
a5000e1
simplify dictbytearraydecoderimpl a bit
arthurpassos Jun 6, 2023
eb71c17
remove one default argument
arthurpassos Jun 6, 2023
686a3f7
remove junk code
arthurpassos Jun 6, 2023
a61fc32
move use_binary_large_variant check inside frombytearray
arthurpassos Jun 6, 2023
e2600d0
simplify chunkedrecordreader a bit
arthurpassos Jun 6, 2023
3b86e23
simplify DictionaryRecordReaderImpl and fix DebugPrintState
arthurpassos Jun 6, 2023
cc027b7
simplify PlainByteArrayDecoderBase
arthurpassos Jun 6, 2023
177db7a
remove some todos
arthurpassos Jun 7, 2023
66223ee
Add comment explaining why struct LargeByteArrayType instead of alias
arthurpassos Jun 7, 2023
5cd39d8
address some pr comments
arthurpassos Jun 8, 2023
1089010
address a few more comments
arthurpassos Jun 8, 2023
a6c42ee
remove arrow-type include & move binarylimit trait
arthurpassos Jun 8, 2023
15be2a2
consolidate setdict
arthurpassos Jun 8, 2023
8d5ba3d
apply clangformat
arthurpassos Jun 8, 2023
fd8f979
removed todos
arthurpassos Jun 9, 2023
a5736d5
a bit more renaming
arthurpassos Jun 9, 2023
b4ecd0d
address one mor comment
arthurpassos Jun 9, 2023
9e9dff9
add overflow check in dict
arthurpassos Jun 9, 2023
ae1db20
address a few comments
arthurpassos Jun 12, 2023
09a9eaf
use int32_t explicitly
arthurpassos Jun 14, 2023
1664983
use template directly
arthurpassos Jun 14, 2023
322319e
use offset_type
arthurpassos Jun 15, 2023
1775a7a
address comments
arthurpassos Jun 15, 2023
7f6e2bf
address a few minor comments
arthurpassos Jun 16, 2023
75fb615
fix DictDecoderImpl
arthurpassos Jun 16, 2023
0801267
add non overflow test
arthurpassos Jun 16, 2023
7f09a16
string test
arthurpassos Jun 19, 2023
a8d20a4
address minor comments
arthurpassos Jun 20, 2023
5fcf4e1
use raw filereaderbuilder instead of adding a new openfile function
arthurpassos Jun 21, 2023
8901cbc
rename test
arthurpassos Jun 21, 2023
dff017a
update test file name
arthurpassos Jun 21, 2023
232e01f
update submodule?
arthurpassos Jun 21, 2023
d7d76c6
aply clang-format
arthurpassos Jun 21, 2023
90ceb07
address minor comments
arthurpassos Jun 22, 2023
0394963
delta & delta length for large*
arthurpassos Jun 22, 2023
a8df2e7
fix wrong if statements
arthurpassos Jun 22, 2023
2bb3b14
Template member variable as well
arthurpassos Jun 23, 2023
c114d44
add docstring
arthurpassos Jun 23, 2023
d1d5798
add LargeStringDictionary32Builder
arthurpassos Jun 23, 2023
0eaa60f
address a few comments
arthurpassos Jun 26, 2023
1e642fa
clang format
arthurpassos Jun 26, 2023
b299497
add binarypacked test for largebinaryvariant
arthurpassos Jun 26, 2023
2c23dd7
Revert "add binarypacked test for largebinaryvariant"
arthurpassos Jun 27, 2023
eca9d6f
only run largebinary tests if system is 64bit
arthurpassos Jul 6, 2023
Changes from all commits
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/builder_dict.h
@@ -724,6 +724,8 @@ using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;
using LargeBinaryDictionary32Builder = Dictionary32Builder<LargeBinaryType>;
using LargeStringDictionary32Builder = Dictionary32Builder<LargeStringType>;

/// @}

2 changes: 2 additions & 0 deletions cpp/src/arrow/type.h
@@ -678,6 +678,8 @@ class ARROW_EXPORT BaseBinaryType : public DataType {

constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;

constexpr int64_t kLargeBinaryMemoryLimit = std::numeric_limits<int64_t>::max() - 1;

/// \addtogroup binary-datatypes
///
/// @{
111 changes: 97 additions & 14 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -438,11 +438,11 @@ void CheckConfiguredRoundtrip(
void DoSimpleRoundtrip(const std::shared_ptr<Table>& table, bool use_threads,
int64_t row_group_size, const std::vector<int>& column_subset,
std::shared_ptr<Table>* out,
const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
default_arrow_writer_properties()) {
const std::shared_ptr<ArrowWriterProperties>&
arrow_writer_properties = default_arrow_writer_properties()) {
std::shared_ptr<Buffer> buffer;
ASSERT_NO_FATAL_FAILURE(
WriteTableToBuffer(table, row_group_size, arrow_properties, &buffer));
WriteTableToBuffer(table, row_group_size, arrow_writer_properties, &buffer));

std::unique_ptr<FileReader> reader;
ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
@@ -610,9 +610,18 @@ class ParquetIOTestBase : public ::testing::Test {
}

void ReaderFromSink(std::unique_ptr<FileReader>* out) {
return ReaderFromSink(out, default_arrow_reader_properties());
}

void ReaderFromSink(std::unique_ptr<FileReader>* out,
const ArrowReaderProperties& arrow_reader_properties) {
ASSERT_OK_AND_ASSIGN(auto buffer, sink_->Finish());
ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer),
::arrow::default_memory_pool(), out));

FileReaderBuilder builder;
ASSERT_OK_NO_THROW(builder.Open(std::make_shared<BufferReader>(buffer)));
ASSERT_OK_NO_THROW(builder.properties(arrow_reader_properties)
->memory_pool(::arrow::default_memory_pool())
->Build(out));
}

void ReadSingleColumnFile(std::unique_ptr<FileReader> file_reader,
@@ -660,18 +669,20 @@ class ParquetIOTestBase : public ::testing::Test {

void RoundTripSingleColumn(
const std::shared_ptr<Array>& values, const std::shared_ptr<Array>& expected,
const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_properties,
const std::shared_ptr<::parquet::ArrowWriterProperties>& arrow_writer_properties,
const ArrowReaderProperties& arrow_reader_properties =
default_arrow_reader_properties(),
bool nullable = true) {
std::shared_ptr<Table> table = MakeSimpleTable(values, nullable);
this->ResetSink();
ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
values->length(), default_writer_properties(),
arrow_properties));
arrow_writer_properties));

std::shared_ptr<Table> out;
std::unique_ptr<FileReader> reader;
ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader));
const bool expect_metadata = arrow_properties->store_schema();
ASSERT_NO_FATAL_FAILURE(this->ReaderFromSink(&reader, arrow_reader_properties));
const bool expect_metadata = arrow_writer_properties->store_schema();
ASSERT_NO_FATAL_FAILURE(
this->ReadTableFromFile(std::move(reader), expect_metadata, &out));
ASSERT_EQ(1, out->num_columns());
@@ -1342,6 +1353,23 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {

using TestStringParquetIO = TestParquetIO<::arrow::StringType>;

#if defined(_WIN64) || defined(__LP64__)
Review comment (Member): I don't understand this condition. Which platforms is it excluding and why?

Large binary data is supposed to work on every platform, so there should be no reason to skip some platforms here.

TEST_F(TestStringParquetIO, SmallStringWithLargeBinaryVariantSetting) {
auto values = ArrayFromJSON(::arrow::utf8(), R"(["foo", "", null, "bar"])");

this->RoundTripSingleColumn(values, values, default_arrow_writer_properties());

ArrowReaderProperties arrow_reader_properties;
arrow_reader_properties.set_use_large_binary_variants(true);

ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> casted,
::arrow::compute::Cast(*values, ::arrow::large_utf8()));

this->RoundTripSingleColumn(values, casted, default_arrow_writer_properties(),
arrow_reader_properties);
}
#endif

TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
std::shared_ptr<Array> values;
::arrow::StringBuilder builder;
Expand Down Expand Up @@ -1369,6 +1397,7 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {

using TestLargeBinaryParquetIO = TestParquetIO<::arrow::LargeBinaryType>;

#if defined(_WIN64) || defined(__LP64__)
Review comment (Member): Again, it does not seem right that you are restricting tests that used to work on every platform (and that have no obvious reason to fail on some platforms).

TEST_F(TestLargeBinaryParquetIO, Basics) {
const char* json = "[\"foo\", \"\", null, \"\xff\"]";

@@ -1388,6 +1417,13 @@ TEST_F(TestLargeBinaryParquetIO, Basics) {
const auto arrow_properties =
::parquet::ArrowWriterProperties::Builder().store_schema()->build();
this->RoundTripSingleColumn(large_array, large_array, arrow_properties);

ArrowReaderProperties arrow_reader_properties;
arrow_reader_properties.set_use_large_binary_variants(true);
// Input is narrow array, but expected output is large array, opposite of the above
// tests. This validates narrow arrays can be read as large arrays.
this->RoundTripSingleColumn(narrow_array, large_array,
default_arrow_writer_properties(), arrow_reader_properties);
}

using TestLargeStringParquetIO = TestParquetIO<::arrow::LargeStringType>;
Expand All @@ -1412,6 +1448,7 @@ TEST_F(TestLargeStringParquetIO, Basics) {
::parquet::ArrowWriterProperties::Builder().store_schema()->build();
this->RoundTripSingleColumn(large_array, large_array, arrow_properties);
}
#endif

using TestNullParquetIO = TestParquetIO<::arrow::NullType>;

@@ -3834,13 +3871,14 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
ASSERT_EQ(expected, calculated);
}

void TryReadDataFile(const std::string& path,
::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
void TryReadDataFileWithProperties(
const std::string& path, const ArrowReaderProperties& properties,
::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
auto pool = ::arrow::default_memory_pool();

std::unique_ptr<FileReader> arrow_reader;
Status s =
FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &arrow_reader);
Status s = FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), properties,
&arrow_reader);
if (s.ok()) {
std::shared_ptr<::arrow::Table> table;
s = arrow_reader->ReadTable(&table);
Expand All @@ -3851,6 +3889,11 @@ void TryReadDataFile(const std::string& path,
<< ", but got " << s.ToString();
}

void TryReadDataFile(const std::string& path,
::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
TryReadDataFileWithProperties(path, default_arrow_reader_properties(), expected_code);
}

TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
// PARQUET-995
TryReadDataFile(test::get_data_file("alltypes_plain.parquet"));
Expand All @@ -3862,6 +3905,19 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
TryReadDataFile(path, ::arrow::StatusCode::IOError);
}

#if defined(ARROW_WITH_BROTLI) && defined(__LP64__)
Review comment (Member): I still don't understand what __LP64__ is doing here. If you really want to single out 64-bit platforms, you could instead do something like:

  if (sizeof(void*) < 8) {
    GTEST_SKIP() << "Test only runs on 64-bit platforms as it allocates more than 2GB RAM";
  }

Review comment (Member): I also see that this test takes 18 seconds in debug mode. This seems a bit excessive :-/

TEST(TestArrowParquet, LargeByteArray) {
auto path = test::get_data_file("large_string_map.brotli.parquet");
TryReadDataFile(path, ::arrow::StatusCode::NotImplemented);
ArrowReaderProperties reader_properties;
reader_properties.set_use_large_binary_variants(true);
reader_properties.set_read_dictionary(0, false);
TryReadDataFileWithProperties(path, reader_properties);
reader_properties.set_read_dictionary(0, true);
TryReadDataFileWithProperties(path, reader_properties);
}
#endif

TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) {
// ARROW-3762
::arrow::StringBuilder builder;
@@ -4548,16 +4604,22 @@ TEST(TestArrowWriteDictionaries, NestedSubfield) {
class TestArrowReadDeltaEncoding : public ::testing::Test {
public:
void ReadTableFromParquetFile(const std::string& file_name,
const ArrowReaderProperties& properties,
std::shared_ptr<Table>* out) {
auto file = test::get_data_file(file_name);
auto pool = ::arrow::default_memory_pool();
std::unique_ptr<FileReader> parquet_reader;
ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false),
ASSERT_OK(FileReader::Make(pool, ParquetFileReader::OpenFile(file, false), properties,
&parquet_reader));
ASSERT_OK(parquet_reader->ReadTable(out));
ASSERT_OK((*out)->ValidateFull());
}

void ReadTableFromParquetFile(const std::string& file_name,
std::shared_ptr<Table>* out) {
return ReadTableFromParquetFile(file_name, default_arrow_reader_properties(), out);
}

void ReadTableFromCSVFile(const std::string& file_name,
const ::arrow::csv::ConvertOptions& convert_options,
std::shared_ptr<Table>* out) {
Expand Down Expand Up @@ -4605,6 +4667,27 @@ TEST_F(TestArrowReadDeltaEncoding, DeltaByteArray) {
::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
}

TEST_F(TestArrowReadDeltaEncoding, DeltaByteArrayWithLargeBinaryVariant) {
std::shared_ptr<::arrow::Table> actual_table, expect_table;
ArrowReaderProperties properties;
properties.set_use_large_binary_variants(true);

ReadTableFromParquetFile("delta_byte_array.parquet", properties, &actual_table);

auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
std::vector<std::string> column_names = {
"c_customer_id", "c_salutation", "c_first_name",
"c_last_name", "c_preferred_cust_flag", "c_birth_country",
"c_login", "c_email_address", "c_last_review_date"};
for (auto name : column_names) {
convert_options.column_types[name] = ::arrow::large_utf8();
}
convert_options.strings_can_be_null = true;
ReadTableFromCSVFile("delta_byte_array_expect.csv", convert_options, &expect_table);
Review comment (Member), on lines +4677 to +4686: Looks like you could factor this out in the test fixture.


::arrow::AssertTablesEqual(*actual_table, *expect_table, false);
}
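
As a rough illustration of the reviewer's suggestion above, the expected-table setup could be factored into the test fixture. This is a hypothetical sketch, not part of the diff: the helper name ReadExpectedCSVTable and its use_large_variants flag are invented for illustration.

  // Hypothetical fixture helper: builds the expected table from the CSV file,
  // optionally widening the string columns to large_utf8().
  void ReadExpectedCSVTable(bool use_large_variants, std::shared_ptr<Table>* out) {
    auto convert_options = ::arrow::csv::ConvertOptions::Defaults();
    const std::vector<std::string> column_names = {
        "c_customer_id", "c_salutation",          "c_first_name",
        "c_last_name",   "c_preferred_cust_flag", "c_birth_country",
        "c_login",       "c_email_address",       "c_last_review_date"};
    for (const auto& name : column_names) {
      convert_options.column_types[name] =
          use_large_variants ? ::arrow::large_utf8() : ::arrow::utf8();
    }
    convert_options.strings_can_be_null = true;
    ReadTableFromCSVFile("delta_byte_array_expect.csv", convert_options, out);
  }

Both DeltaByteArray tests could then call this helper with the appropriate flag.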

TEST_F(TestArrowReadDeltaEncoding, IncrementalDecodeDeltaByteArray) {
auto file = test::get_data_file("delta_byte_array.parquet");
auto pool = ::arrow::default_memory_pool();
5 changes: 4 additions & 1 deletion cpp/src/parquet/arrow/reader.cc
@@ -219,6 +219,7 @@ class FileReaderImpl : public FileReader {
ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
ctx->filter_leaves = true;
ctx->included_leaves = included_leaves;
ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants();
return GetReader(manifest_.schema_fields[i], ctx, out);
}

@@ -462,7 +463,8 @@ class LeafReader : public ColumnReaderImpl {
input_(std::move(input)),
descr_(input_->descr()) {
record_reader_ = RecordReader::Make(
descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY,
/*read_dense_for_nullable*/ false, ctx_->use_large_binary_variants);
NextRowGroup();
}

@@ -1218,6 +1220,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto
ctx->pool = pool_;
ctx->iterator_factory = iterator_factory;
ctx->filter_leaves = false;
ctx->use_large_binary_variants = reader_properties_.use_large_binary_variants();
std::unique_ptr<ColumnReaderImpl> result;
RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
*out = std::move(result);
5 changes: 3 additions & 2 deletions cpp/src/parquet/arrow/reader_internal.cc
@@ -487,8 +487,9 @@ Status TransferBinary(RecordReader* reader, MemoryPool* pool,
auto chunks = binary_reader->GetBuilderChunks();
for (auto& chunk : chunks) {
if (!chunk->type()->Equals(*logical_type_field->type())) {
// XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
// will be lost because they are first created as int32 and then cast to int64.
// If a LargeBinary chunk is larger than 2GB and use_large_binary_variants
Review comment (Member): I would keep the XXX because it is a gotcha.

Review comment (Member): Good to know something new :)

// is not set, the MSBs of offsets will be lost because they are first created
// as int32 and then cast to int64.
ARROW_ASSIGN_OR_RAISE(
chunk,
::arrow::compute::Cast(*chunk, logical_type_field->type(), cast_options, &ctx));
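
Since the gotcha in the comment above trips people up, here is a standalone illustration (not code from this diff) of why widening after the fact cannot recover the lost high bits:

  // Sketch: 3 GiB does not fit in int32, so a narrow offset has already
  // wrapped before the widening cast happens; the cast cannot undo that.
  int64_t real_offset = int64_t{3} * 1024 * 1024 * 1024;  // 3 GiB
  int32_t narrow = static_cast<int32_t>(real_offset);     // wraps to -1073741824
  int64_t widened = static_cast<int64_t>(narrow);         // still -1073741824; MSBs gone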
1 change: 1 addition & 0 deletions cpp/src/parquet/arrow/reader_internal.h
@@ -109,6 +109,7 @@ struct ReaderContext {
FileColumnIteratorFactory iterator_factory;
bool filter_leaves;
std::shared_ptr<std::unordered_set<int>> included_leaves;
bool use_large_binary_variants = false;

bool IncludesLeaf(int leaf_index) const {
if (this->filter_leaves) {
7 changes: 5 additions & 2 deletions cpp/src/parquet/arrow/schema.cc
@@ -462,7 +462,9 @@ struct SchemaTreeContext {

bool IsDictionaryReadSupported(const ArrowType& type) {
// Only supported currently for BYTE_ARRAY types
return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING ||
type.id() == ::arrow::Type::LARGE_BINARY ||
type.id() == ::arrow::Type::LARGE_STRING;
}

// ----------------------------------------------------------------------
@@ -473,7 +475,8 @@ ::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
SchemaTreeContext* ctx) {
ASSIGN_OR_RAISE(
std::shared_ptr<ArrowType> storage_type,
GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit(),
ctx->properties.use_large_binary_variants()));
if (ctx->properties.read_dictionary(column_index) &&
IsDictionaryReadSupported(*storage_type)) {
return ::arrow::dictionary(::arrow::int32(), storage_type);
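
Taken together with the IsDictionaryReadSupported change above, a dictionary-read BYTE_ARRAY column keeps its dictionary encoding under the new option. A small illustrative check, inferred from this hunk rather than copied from the diff:

  // With use_large_binary_variants on and read_dictionary(i, true), the
  // storage type is large_utf8(), so the resulting column type should be:
  auto expected_type = ::arrow::dictionary(::arrow::int32(), ::arrow::large_utf8());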
16 changes: 9 additions & 7 deletions cpp/src/parquet/arrow/schema_internal.cc
@@ -110,17 +110,18 @@ Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical
}
}

Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type,
bool use_large_binary_variants) {
switch (logical_type.type()) {
case LogicalType::Type::STRING:
return ::arrow::utf8();
return use_large_binary_variants ? ::arrow::large_utf8() : ::arrow::utf8();
case LogicalType::Type::DECIMAL:
return MakeArrowDecimal(logical_type);
case LogicalType::Type::NONE:
case LogicalType::Type::ENUM:
case LogicalType::Type::JSON:
case LogicalType::Type::BSON:
return ::arrow::binary();
return use_large_binary_variants ? ::arrow::large_binary() : ::arrow::binary();
default:
return Status::NotImplemented("Unhandled logical logical_type ",
logical_type.ToString(), " for binary array");
@@ -181,7 +182,7 @@ Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {

Result<std::shared_ptr<ArrowType>> GetArrowType(
Type::type physical_type, const LogicalType& logical_type, int type_length,
const ::arrow::TimeUnit::type int96_arrow_time_unit) {
const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) {
if (logical_type.is_invalid() || logical_type.is_null()) {
return ::arrow::null();
}
Expand All @@ -200,7 +201,7 @@ Result<std::shared_ptr<ArrowType>> GetArrowType(
case ParquetType::DOUBLE:
return ::arrow::float64();
case ParquetType::BYTE_ARRAY:
return FromByteArray(logical_type);
return FromByteArray(logical_type, use_large_binary_variants);
case ParquetType::FIXED_LEN_BYTE_ARRAY:
return FromFLBA(logical_type, type_length);
default: {
@@ -213,9 +214,10 @@ Result<std::shared_ptr<ArrowType>> GetArrowType(

Result<std::shared_ptr<ArrowType>> GetArrowType(
const schema::PrimitiveNode& primitive,
const ::arrow::TimeUnit::type int96_arrow_time_unit) {
const ::arrow::TimeUnit::type int96_arrow_time_unit, bool use_large_binary_variants) {
return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
primitive.type_length(), int96_arrow_time_unit);
primitive.type_length(), int96_arrow_time_unit,
use_large_binary_variants);
}

} // namespace arrow
13 changes: 9 additions & 4 deletions cpp/src/parquet/arrow/schema_internal.h
@@ -29,23 +29,28 @@ namespace arrow {

using ::arrow::Result;

Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type,
bool use_large_binary_variants);

Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
int32_t physical_length);
Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
const LogicalType& logical_type,
int type_length);
int type_length,
bool use_large_binary_variants);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
Type::type physical_type, const LogicalType& logical_type, int type_length,
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO,
bool use_large_binary_variants = false);

Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
const schema::PrimitiveNode& primitive,
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO,
bool use_large_binary_variants = false);

} // namespace arrow
} // namespace parquet