From 73348e85ebec5099a75a781abdf7ce605865a61f Mon Sep 17 00:00:00 2001 From: Elliot Morrison-Reed Date: Mon, 5 Feb 2024 10:45:46 -0500 Subject: [PATCH] GH-39925: [Go][Parquet] Fix re-slicing in maybeReplaceValidity function (#39926) ### Rationale for this change See #39925. ### What changes are included in this PR? Fixes re-slicing logic for multiple data-types and negative length bug. ### Are these changes tested? There is a new test in the PR. ### Are there any user-facing changes? No, it just fixes a bug. * Closes: #39925 Authored-by: Morrison-Reed Elliot (BEG/EVS1-NA) Signed-off-by: Matt Topol --- go/parquet/file/column_writer.go | 5 +++- go/parquet/file/column_writer_test.go | 38 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index 36663b10b89dd..4d603c547ca6a 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -660,7 +660,10 @@ func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int if values.Data().Offset() > 0 { data := values.Data() - buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes]) + elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes() + start := data.Offset() * elemSize + end := start + data.Len()*elemSize + buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end]) } data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0) diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 321e7b730d165..dd597e280b850 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -24,6 +24,8 @@ import ( "sync" "testing" + "github.com/apache/arrow/go/v16/arrow" + "github.com/apache/arrow/go/v16/arrow/array" "github.com/apache/arrow/go/v16/arrow/bitutil" "github.com/apache/arrow/go/v16/arrow/memory" arrutils "github.com/apache/arrow/go/v16/internal/utils" @@ -36,6 +38,7 @@ import ( "github.com/apache/arrow/go/v16/parquet/internal/testutils" "github.com/apache/arrow/go/v16/parquet/internal/utils" "github.com/apache/arrow/go/v16/parquet/metadata" + "github.com/apache/arrow/go/v16/parquet/pqarrow" "github.com/apache/arrow/go/v16/parquet/schema" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -736,3 +739,38 @@ func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() { b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i]) } } + +func TestDictionaryReslice(t *testing.T) { + pts := []arrow.DataType{ + arrow.PrimitiveTypes.Int8, + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + arrow.PrimitiveTypes.Uint8, + arrow.PrimitiveTypes.Uint16, + arrow.PrimitiveTypes.Uint32, + arrow.PrimitiveTypes.Uint64, + } + for _, pt := range pts { + t.Run(pt.String(), func(t *testing.T) { + mem := memory.NewGoAllocator() + dt := &arrow.DictionaryType{ + IndexType: pt, + ValueType: &arrow.StringType{}, + } + field := arrow.Field{Name: "test_field", Type: dt, Nullable: true} + schema := arrow.NewSchema([]arrow.Field{field}, nil) + b := array.NewRecordBuilder(mem, schema) + for i := 0; i < 2000; i++ { + b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value") + } + rec := b.NewRecord() + out := &bytes.Buffer{} + pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties()) + assert.NoError(t, err) + err = pqw.WriteBuffered(rec) + assert.NoError(t, err) + + }) + } +}