Skip to content

Commit

Permalink
apacheGH-39925: [Go][Parquet] Fix re-slicing in maybeReplaceValidity …
Browse files Browse the repository at this point in the history
…function (apache#39926)

### Rationale for this change

See apache#39925.

### What changes are included in this PR?

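Fixes the buffer re-slicing logic in `maybeReplaceValidity` so that it uses the actual element width of the data type instead of assuming 4-byte (Int32) values, and fixes a bug where the computed slice end did not account for the offset, which could produce a negative slice length.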

### Are these changes tested?

There is a new test in the PR.

### Are there any user-facing changes?

No, it just fixes a bug.
* Closes: apache#39925

Authored-by: Morrison-Reed Elliot (BEG/EVS1-NA) <Elliot.Morrison-Reed@de.bosch.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
  • Loading branch information
elliotmr authored and zanmato1984 committed Feb 28, 2024
1 parent f2fd444 commit d979eca
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 1 deletion.
5 changes: 4 additions & 1 deletion go/parquet/file/column_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,10 @@ func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int

if values.Data().Offset() > 0 {
data := values.Data()
buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes])
elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes()
start := data.Offset() * elemSize
end := start + data.Len()*elemSize
buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end])
}

data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0)
Expand Down
38 changes: 38 additions & 0 deletions go/parquet/file/column_writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"sync"
"testing"

"github.com/apache/arrow/go/v16/arrow"
"github.com/apache/arrow/go/v16/arrow/array"
"github.com/apache/arrow/go/v16/arrow/bitutil"
"github.com/apache/arrow/go/v16/arrow/memory"
arrutils "github.com/apache/arrow/go/v16/internal/utils"
Expand All @@ -36,6 +38,7 @@ import (
"github.com/apache/arrow/go/v16/parquet/internal/testutils"
"github.com/apache/arrow/go/v16/parquet/internal/utils"
"github.com/apache/arrow/go/v16/parquet/metadata"
"github.com/apache/arrow/go/v16/parquet/pqarrow"
"github.com/apache/arrow/go/v16/parquet/schema"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
Expand Down Expand Up @@ -736,3 +739,38 @@ func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() {
b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i])
}
}

// TestDictionaryReslice is the regression test for GH-39925. For every
// integer index type it builds a 2000-row dictionary-encoded string column,
// writes it through a pqarrow file writer, and expects building, writing and
// closing to all succeed.
func TestDictionaryReslice(t *testing.T) {
	indexTypes := []arrow.DataType{
		arrow.PrimitiveTypes.Int8,
		arrow.PrimitiveTypes.Int16,
		arrow.PrimitiveTypes.Int32,
		arrow.PrimitiveTypes.Int64,
		arrow.PrimitiveTypes.Uint8,
		arrow.PrimitiveTypes.Uint16,
		arrow.PrimitiveTypes.Uint32,
		arrow.PrimitiveTypes.Uint64,
	}
	for _, pt := range indexTypes {
		t.Run(pt.String(), func(t *testing.T) {
			mem := memory.NewGoAllocator()
			dt := &arrow.DictionaryType{
				IndexType: pt,
				ValueType: &arrow.StringType{},
			}
			field := arrow.Field{Name: "test_field", Type: dt, Nullable: true}
			// Named sc rather than schema so the imported parquet/schema
			// package is not shadowed inside this closure.
			sc := arrow.NewSchema([]arrow.Field{field}, nil)

			b := array.NewRecordBuilder(mem, sc)
			defer b.Release()

			dictBldr := b.Field(0).(*array.BinaryDictionaryBuilder)
			for i := 0; i < 2000; i++ {
				// AppendString reports an error; stop instead of silently
				// writing a shorter column than intended.
				if !assert.NoError(t, dictBldr.AppendString("test_value")) {
					return
				}
			}

			rec := b.NewRecord()
			defer rec.Release()

			out := &bytes.Buffer{}
			pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties())
			// Bail out on failure: pqw must not be used when construction
			// failed, otherwise the test would panic instead of failing.
			if !assert.NoError(t, err) {
				return
			}

			assert.NoError(t, pqw.WriteBuffered(rec))
			// Closing the writer surfaces any error raised while finishing
			// the buffered data, which WriteBuffered alone does not report.
			assert.NoError(t, pqw.Close())
		})
	}
}

0 comments on commit d979eca

Please sign in to comment.