Skip to content

Commit 0253f70

Browse files
author
Marco
committed
feat(Arrow): Implement ListArray and ListArrayBuilder with associated functionality
1 parent e7e2ded commit 0253f70

11 files changed

+363
-31
lines changed

Arrow/Sources/Arrow/ArrowArray.swift

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ public class ArrowArrayHolderImpl: ArrowArrayHolder {
115115
return try ArrowArrayHolderImpl(BinaryArray(with))
116116
case .strct:
117117
return try ArrowArrayHolderImpl(StructArray(with))
118+
case .list:
119+
return try ArrowArrayHolderImpl(ListArray(with))
118120
default:
119121
throw ArrowError.invalid("Array not found for type: \(arrowType)")
120122
}
@@ -405,3 +407,69 @@ public class StructArray: ArrowArray<[Any?]> {
405407
return output
406408
}
407409
}
410+
411+
/// An Arrow array whose elements are lists of values drawn from a single child array.
///
/// As read by this class, `arrowData.buffers[1]` holds `Int32` offsets: list `i`
/// occupies child positions `offsets[i] ..< offsets[i + 1]`. The single entry in
/// `arrowData.children` holds the flattened child values.
public class ListArray: ArrowArray<[Any?]> {
    /// Holder for the flattened child values; assigned by `init(_:)`.
    public private(set) var values: ArrowArrayHolder?

    /// Creates a list array from `arrowData`.
    ///
    /// - Throws: `ArrowError.invalid` when `arrowData` does not have exactly one
    ///   child or its type is not an `ArrowTypeList`; also rethrows any error
    ///   raised by the superclass initializer or while loading the child array.
    public required init(_ arrowData: ArrowData) throws {
        try super.init(arrowData)
        guard arrowData.children.count == 1 else {
            throw ArrowError.invalid("List array must have exactly one child")
        }

        guard let listType = arrowData.type as? ArrowTypeList else {
            throw ArrowError.invalid("Expected ArrowTypeList")
        }

        self.values = try ArrowArrayHolderImpl.loadArray(
            listType.elementType,
            with: arrowData.children[0]
        )
    }

    /// Returns the elements of the list stored at `index`.
    ///
    /// - Returns: `nil` when the child array is missing, the slot is null, or the
    ///   stored offsets are malformed (negative start, or end before start);
    ///   otherwise the child values in offset order.
    public override subscript(_ index: UInt) -> [Any?]? {
        guard let values = self.values else { return nil }

        if self.arrowData.isNull(index) {
            return nil
        }

        let offsets = self.arrowData.buffers[1]
        let offsetWidth = MemoryLayout<Int32>.stride
        let offsetIndex = Int(index) * offsetWidth

        let startOffset = offsets.rawPointer.advanced(by: offsetIndex).load(as: Int32.self)
        let endOffset = offsets.rawPointer.advanced(by: offsetIndex + offsetWidth).load(as: Int32.self)

        // Offsets can come from external data. A negative start would trap in the
        // `UInt` conversion and an inverted pair would trap when forming the range,
        // so reject both instead of crashing.
        guard startOffset >= 0, startOffset <= endOffset else { return nil }

        var items = [Any?]()
        items.reserveCapacity(Int(endOffset - startOffset))
        for childIndex in startOffset..<endOffset {
            items.append(values.array.asAny(UInt(childIndex)))
        }

        return items
    }

    /// Renders the list at `index` as `[a,b,...]`.
    ///
    /// Returns `"null"` when the subscript yields `nil`; a nil element inside the
    /// list is also written as `null`.
    public override func asString(_ index: UInt) -> String {
        guard let list = self[index] else {
            return "null"
        }

        var output = "["

        for (position, element) in list.enumerated() {
            if position > 0 {
                output.append(",")
            }

            // Bind the element once instead of force-unwrapping it later.
            guard let item = element else {
                output.append("null")
                continue
            }

            if let asStringItem = item as? AsString {
                output.append(asStringItem.asString(0))
            } else {
                output.append("\(item)")
            }
        }

        output.append("]")
        return output
    }
}

Arrow/Sources/Arrow/ArrowArrayBuilder.swift

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
131131
public init(_ fields: [ArrowField], builders: [any ArrowArrayHolderBuilder]) throws {
132132
self.fields = fields
133133
self.builders = builders
134-
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
134+
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
135135
self.bufferBuilder.initializeTypeInfo(fields)
136136
}
137137

@@ -143,7 +143,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
143143
}
144144

145145
self.builders = builders
146-
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
146+
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
147147
}
148148

149149
public override func append(_ values: [Any?]?) {
@@ -174,6 +174,31 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
174174
}
175175
}
176176

177+
/// Builder that produces a `ListArray`.
///
/// List boundaries and validity go to the `ListBufferBuilder`; the elements of
/// every appended list are forwarded, in order, to a child builder created for
/// the element type.
public class ListArrayBuilder: ArrowArrayBuilder<ListBufferBuilder, ListArray> {
    /// Child builder that receives the elements of every appended list.
    let valueBuilder: any ArrowArrayHolderBuilder

    /// Creates a builder for lists whose elements have type `elementType`.
    ///
    /// - Throws: Any error raised while loading the child builder or by the
    ///   superclass initializer.
    public override init(_ elementType: ArrowType) throws {
        self.valueBuilder = try ArrowArrayBuilders.loadBuilder(arrowType: elementType)
        try super.init(ArrowTypeList(elementType))
    }

    /// Appends one list; `nil` is passed to the buffer builder and adds no child elements.
    public override func append(_ values: [Any?]?) {
        self.bufferBuilder.append(values)
        guard let elements = values else { return }
        for element in elements {
            self.valueBuilder.appendAny(element)
        }
    }

    /// Finishes the list buffers and the child builder and wraps them in a `ListArray`.
    public override func finish() throws -> ListArray {
        let listBuffers = self.bufferBuilder.finish()
        let elementHolder = try self.valueBuilder.toHolder()
        return try ListArray(
            ArrowData(
                self.type,
                buffers: listBuffers,
                children: [elementHolder.array.arrowData],
                nullCount: self.nullCount,
                length: self.length
            )
        )
    }
}
201+
177202
public class ArrowArrayBuilders {
178203
public static func loadBuilder( // swiftlint:disable:this cyclomatic_complexity
179204
_ builderType: Any.Type) throws -> ArrowArrayHolderBuilder {
@@ -290,6 +315,16 @@ public class ArrowArrayBuilders {
290315
throw ArrowError.invalid("Expected arrow type for \(arrowType.id) not found")
291316
}
292317
return try TimestampArrayBuilder(timestampType.unit)
318+
case .list:
319+
guard let listType = arrowType as? ArrowTypeList else {
320+
throw ArrowError.invalid("Expected ArrowTypeList for \(arrowType.id)")
321+
}
322+
return try ListArrayBuilder(listType.elementType)
323+
case .strct:
324+
guard let structType = arrowType as? ArrowTypeStruct else {
325+
throw ArrowError.invalid("Expected ArrowStructType for \(arrowType.id)")
326+
}
327+
return try StructArrayBuilder(structType.fields)
293328
default:
294329
throw ArrowError.unknownType("Builder not found for arrow type: \(arrowType.id)")
295330
}
@@ -352,5 +387,12 @@ public class ArrowArrayBuilders {
352387

353388
public static func loadTimestampArrayBuilder(_ unit: ArrowTimestampUnit, timezone: String? = nil) throws -> TimestampArrayBuilder {
354389
return try TimestampArrayBuilder(unit, timezone: timezone)
390+
391+
public static func loadStructArrayBuilder(_ fields: [ArrowField]) throws -> StructArrayBuilder {
392+
return try StructArrayBuilder(fields)
393+
}
394+
395+
public static func loadListArrayBuilder(_ elementType: ArrowType) throws -> ListArrayBuilder {
396+
return try ListArrayBuilder(elementType)
355397
}
356398
}

Arrow/Sources/Arrow/ArrowBufferBuilder.swift

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,14 +338,14 @@ public class Date64BufferBuilder: AbstractWrapperBufferBuilder<Date, Int64> {
338338

339339
public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
340340
public typealias ItemType = [Any?]
341-
var info: ArrowNestedType?
341+
var info: ArrowTypeStruct?
342342
public init() throws {
343343
let nulls = ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride))
344344
super.init(nulls)
345345
}
346346

347347
public func initializeTypeInfo(_ fields: [ArrowField]) {
348-
info = ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
348+
info = ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
349349
}
350350

351351
public func append(_ newValue: [Any?]?) {
@@ -379,3 +379,62 @@ public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
379379
return [nulls]
380380
}
381381
}
382+
383+
/// Accumulates the validity bitmap and `Int32` offsets for a list array.
///
/// Entry `i + 1` of the offsets buffer is written when list `i` is appended and
/// equals entry `i` plus that list's element count (unchanged for a null list).
public class ListBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
    public typealias ItemType = [Any?]

    /// Offsets buffer; entry 0 is set to zero by `init()`.
    var offsets: ArrowBuffer

    /// Creates an empty builder whose first offset is zero.
    public required init() throws {
        self.offsets = ArrowBuffer.createBuffer(1, size: UInt(MemoryLayout<Int32>.stride))
        let validity = ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride))
        super.init(validity)
        self.offsets.rawPointer.storeBytes(of: Int32(0), as: Int32.self)
    }

    /// Records one list slot: sets or clears its validity bit and writes the next offset.
    public func append(_ newValue: [Any?]?) {
        let slot = UInt(self.length)
        self.length += 1

        // Grow before touching the buffers; `resize` replaces both of them.
        if self.length >= self.offsets.length {
            self.resize(self.length + 1)
        }

        let width = MemoryLayout<Int32>.stride
        let startPosition = Int(slot) * width
        let previousEnd = self.offsets.rawPointer.advanced(by: startPosition).load(as: Int32.self)

        let nextEnd: Int32
        if let elements = newValue {
            BitUtility.setBit(slot + self.offset, buffer: self.nulls)
            nextEnd = previousEnd + Int32(elements.count)
        } else {
            self.nullCount += 1
            BitUtility.clearBit(slot + self.offset, buffer: self.nulls)
            // A null list spans no child elements, so its end equals its start.
            nextEnd = previousEnd
        }

        self.offsets.rawPointer
            .advanced(by: startPosition + width)
            .storeBytes(of: nextEnd, as: Int32.self)
    }

    /// Returns `true` when the validity bit for `index` is not set.
    public override func isNull(_ index: UInt) -> Bool {
        return BitUtility.isSet(index + self.offset, buffer: self.nulls) == false
    }

    /// Replaces the offsets and validity buffers with larger copies when `length`
    /// exceeds the current offsets length; the new size comes from `resizeLength`.
    public func resize(_ length: UInt) {
        guard length > self.offsets.length else { return }

        let grownLength = resizeLength(self.offsets)
        var grownOffsets = ArrowBuffer.createBuffer(grownLength, size: UInt(MemoryLayout<Int32>.size))
        var grownNulls = ArrowBuffer.createBuffer(grownLength/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
        ArrowBuffer.copyCurrent(self.offsets, to: &grownOffsets, len: self.offsets.capacity)
        ArrowBuffer.copyCurrent(self.nulls, to: &grownNulls, len: self.nulls.capacity)
        self.offsets = grownOffsets
        self.nulls = grownNulls
    }

    /// Copies the accumulated data into freshly created buffers.
    ///
    /// - Returns: `[nulls, offsets]`, in that order.
    public func finish() -> [ArrowBuffer] {
        let count = self.length
        var finalNulls = ArrowBuffer.createBuffer(count/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
        var finalOffsets = ArrowBuffer.createBuffer(count + 1, size: UInt(MemoryLayout<Int32>.size))
        ArrowBuffer.copyCurrent(self.nulls, to: &finalNulls, len: finalNulls.capacity)
        ArrowBuffer.copyCurrent(self.offsets, to: &finalOffsets, len: finalOffsets.capacity)
        return [finalNulls, finalOffsets]
    }
}

Arrow/Sources/Arrow/ArrowReader.swift

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,35 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
116116
rbLength: UInt(loadInfo.batchData.recordBatch.length))
117117
}
118118

119+
/// Loads a list column: one field node, a validity buffer, an offsets buffer,
/// and then the single child field via `loadField`.
///
/// The resulting array's length is the field node's own length rather than the
/// record batch's row count. The two are equal for top-level columns, but a list
/// nested inside another list has as many slots as its node reports, which can
/// differ from the number of rows in the batch.
///
/// - Returns: `.failure` when the node or either buffer is missing, when the
///   field does not have exactly one child, or when loading the child or
///   building the holder fails.
private func loadListData(_ loadInfo: DataLoadInfo, field: org_apache_arrow_flatbuf_Field) -> Result<ArrowArrayHolder, ArrowError> {
    guard let node = loadInfo.batchData.nextNode() else {
        return .failure(.invalid("Node not found"))
    }

    guard let nullBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Null buffer not found"))
    }

    guard let offsetBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Offset buffer not found"))
    }

    // One validity bit per list slot, rounded up to whole bytes.
    let nullLength = UInt(ceil(Double(node.length) / 8))
    let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, length: nullLength, messageOffset: loadInfo.messageOffset)
    // The offsets buffer holds one more entry than there are list slots.
    let arrowOffsetBuffer = makeBuffer(offsetBuffer, fileData: loadInfo.fileData, length: UInt(node.length + 1), messageOffset: loadInfo.messageOffset)

    guard field.childrenCount == 1, let childField = field.children(at: 0) else {
        return .failure(.invalid("List must have exactly one child"))
    }

    switch loadField(loadInfo, field: childField) {
    case .success(let childHolder):
        return makeArrayHolder(
            field,
            buffers: [arrowNullBuffer, arrowOffsetBuffer],
            nullCount: UInt(node.nullCount),
            children: [childHolder.array.arrowData],
            rbLength: UInt(node.length))
    case .failure(let error):
        return .failure(error)
    }
}
147+
119148
private func loadPrimitiveData(
120149
_ loadInfo: DataLoadInfo,
121150
field: org_apache_arrow_flatbuf_Field)
@@ -178,12 +207,17 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
178207
_ loadInfo: DataLoadInfo,
179208
field: org_apache_arrow_flatbuf_Field)
180209
-> Result<ArrowArrayHolder, ArrowError> {
181-
if isNestedType(field.typeType) {
210+
switch field.typeType {
211+
case .struct_:
182212
return loadStructData(loadInfo, field: field)
183-
} else if isFixedPrimitive(field.typeType) {
184-
return loadPrimitiveData(loadInfo, field: field)
185-
} else {
186-
return loadVariableData(loadInfo, field: field)
213+
case .list:
214+
return loadListData(loadInfo, field: field)
215+
default:
216+
if isFixedPrimitive(field.typeType) {
217+
return loadPrimitiveData(loadInfo, field: field)
218+
} else {
219+
return loadVariableData(loadInfo, field: field)
220+
}
187221
}
188222
}
189223

Arrow/Sources/Arrow/ArrowReaderHelper.swift

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,23 @@ func makeStructHolder(
154154
}
155155
}
156156

157+
/// Builds a holder wrapping a `ListArray` created from the given buffers and children.
///
/// - Parameters:
///   - field: Field whose `type` becomes the array's type.
///   - buffers: Buffers passed through to `ArrowData`.
///   - nullCount: Null count passed through to `ArrowData`.
///   - children: Child data passed through to `ArrowData`.
///   - rbLength: Value used as the array's length.
/// - Returns: `.success` with the holder; `.failure` carrying the thrown
///   `ArrowError`, or `.unknownError` describing any other thrown error.
func makeListHolder(
    _ field: ArrowField,
    buffers: [ArrowBuffer],
    nullCount: UInt,
    children: [ArrowData],
    rbLength: UInt
) -> Result<ArrowArrayHolder, ArrowError> {
    let outcome = Result<ArrowArrayHolder, Error>(catching: {
        let listData = try ArrowData(
            field.type,
            buffers: buffers,
            children: children,
            nullCount: nullCount,
            length: rbLength)
        let listArray = try ListArray(listData)
        return ArrowArrayHolderImpl(listArray)
    })

    // Pass ArrowError through untouched; wrap anything else as an unknown error.
    return outcome.mapError { failure in
        (failure as? ArrowError) ?? .unknownError("\(failure)")
    }
}
173+
157174
func makeArrayHolder(
158175
_ field: org_apache_arrow_flatbuf_Field,
159176
buffers: [ArrowBuffer],
@@ -208,6 +225,8 @@ func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity
208225
return makeTimestampHolder(field, buffers: buffers, nullCount: nullCount)
209226
case .strct:
210227
return makeStructHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
228+
case .list:
229+
return makeListHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
211230
default:
212231
return .failure(.unknownType("Type \(typeId) currently not supported"))
213232
}
@@ -230,15 +249,6 @@ func isFixedPrimitive(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
230249
}
231250
}
232251

233-
func isNestedType(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
234-
switch type {
235-
case .struct_:
236-
return true
237-
default:
238-
return false
239-
}
240-
}
241-
242252
func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_body_length
243253
_ field: org_apache_arrow_flatbuf_Field) -> ArrowType {
244254
let type = field.typeType
@@ -307,7 +317,13 @@ func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_bod
307317
ArrowField(childField.name ?? "", type: childType, isNullable: childField.nullable))
308318
}
309319

310-
return ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
320+
return ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
321+
case .list:
322+
guard field.childrenCount == 1, let childField = field.children(at: 0) else {
323+
return ArrowType(ArrowType.ArrowUnknown)
324+
}
325+
let childType = findArrowType(childField)
326+
return ArrowTypeList(childType)
311327
default:
312328
return ArrowType(ArrowType.ArrowUnknown)
313329
}

0 commit comments

Comments
 (0)