diff --git a/src/format/from-arrow.js b/src/format/from-arrow.js
index 4c300a1f..ebb6294e 100644
--- a/src/format/from-arrow.js
+++ b/src/format/from-arrow.js
@@ -1,5 +1,12 @@
 import ColumnTable from '../table/column-table';
 import error from '../util/error';
+import toString from '../util/to-string';
+import unroll from '../util/unroll';
+
+// Hardwire Arrow type ids to avoid explicit dependency
+// https://github.com/apache/arrow/blob/master/js/src/enum.ts
+export const LIST = 12;
+export const STRUCT = 13;
 
 /**
  * Options for Apache Arrow import.
@@ -26,15 +33,71 @@ export default function(arrowTable, options = {}) {
   names.forEach(name => {
     const column = arrowTable.getColumn(name);
     if (column == null) {
-      error(`Arrow column does not exist: ${JSON.stringify(name)}`);
+      error(`Arrow column does not exist: ${toString(name)}`);
     }
-    columns[name] = unpack ? arrayFromArrow(column) : column;
+    // columns with children are always extracted, regardless of unpack
+    columns[name] = column.numChildren ? arrayFromNested(column)
+      : unpack ? arrayFromVector(column)
+      : column;
   });
 
   return new ColumnTable(columns);
 }
 
-function arrayFromArrow(column) {
+/**
+ * Extract the values of a nested (list or struct) Arrow vector.
+ * @param {object} vector An Arrow vector with typeId LIST or STRUCT.
+ * @return {Array} An array with one extracted value per vector entry.
+ */
+function arrayFromNested(vector) {
+  const create = vector.typeId === LIST ? listExtractor(vector)
+    : vector.typeId === STRUCT ? structExtractor(vector)
+    : error(`Unsupported Arrow type: ${toString(vector.VectorName)}`);
+
+  // generate and return objects for each nested value
+  return Array.from({ length: vector.length }, create);
+}
+
+/**
+ * Create an extractor function for the entries of a list vector.
+ * @param {object} vector An Arrow list vector.
+ * @return {function(*, number): (Array|null)} A function that maps
+ *  an index to the extracted list, or to null for a null entry.
+ */
+function listExtractor(vector) {
+  // extract a list value. recurse if nested, otherwise convert to array
+  return (_, i) => {
+    const v = vector.get(i);
+    // a null list entry has no child vector: pass the null through
+    if (v == null) return null;
+    return v.numChildren ? arrayFromNested(v) : arrayFromVector(v);
+  };
+}
+
+/**
+ * Create an extractor function for the entries of a struct vector.
+ * @param {object} vector An Arrow struct vector.
+ * @return {function(*, number): object} A function that maps an index
+ *  to an object with one property per struct field.
+ */
+function structExtractor(vector) {
+  // extract struct field names
+  const names = vector.type.children.map(field => field.name);
+
+  // extract struct field values into parallel arrays
+  const data = names.map((_, i) => {
+    const v = vector.getChildAt(i);
+    return v.numChildren ? arrayFromNested(v) : arrayFromVector(v);
+  });
+
+  // function to generate objects with field name properties
+  return unroll(
+    data, '_,i',
+    '({' + names.map((_, d) => `${toString(_)}:_${d}[i]`) + '})'
+  );
+}
+
+function arrayFromVector(column) {
   // if dictionary column, perform more efficient extraction
   // if has null values, extract to standard array
   // otherwise, let Arrow try to use copy-less subarray call
diff --git a/test/format/arrow-test.js b/test/format/arrow-test.js
index 4c9e3da8..f5b5ba49 100644
--- a/test/format/arrow-test.js
+++ b/test/format/arrow-test.js
@@ -1,5 +1,5 @@
 import tape from 'tape';
-import fromArrow from '../../src/format/from-arrow';
+import fromArrow, { LIST, STRUCT } from '../../src/format/from-arrow';
 
 // test stubs for Arrow Column API
 function arrowColumn(data, nullCount = 0) {
@@ -50,6 +50,23 @@ function arrowDictionary(data) {
   return column;
 }
 
+function arrowListColumn(data) {
+  const c = arrowColumn(data.map(d => arrowColumn(d)));
+  c.typeId = LIST;
+  c.numChildren = 1;
+  return c;
+}
+
+function arrowStructColumn(names, children) {
+  return {
+    type: { children: names.map(name => ({ name })) },
+    typeId: STRUCT,
+    length: children[0].length,
+    numChildren: names.length,
+    getChildAt: i => children[i]
+  };
+}
+
 // test stub for Arrow Table API
 function arrowTable(columns) {
   return {
@@ -84,4 +101,55 @@ tape('fromArrow can unpack Apache Arrow tables', t => {
   t.deepEqual(dt.column('x').data, x._data, 'unpack dictionary column without nulls');
   t.deepEqual(dt.column('y').data, y._data, 'unpack dictionary column with nulls');
   t.end();
+});
+
+tape('fromArrow can read Apache Arrow lists', t => {
+  const d = [[1, 2, 3], [4, 5]];
+  const l = arrowListColumn(d);
+  const at = arrowTable({ l });
+  const dt = fromArrow(at);
+
+  t.deepEqual(dt.column('l').data, d, 'extract Arrow list');
+  t.end();
+});
+
+tape('fromArrow can read Apache Arrow structs', t => {
+  const d = [{ foo: 1, bar: [2, 3] }, { foo: 2, bar: [4] }];
+  const s = arrowStructColumn(Object.keys(d[0]), [
+    arrowColumn(d.map(v => v.foo)),
+    arrowListColumn(d.map(v => v.bar))
+  ]);
+  const at = arrowTable({ s });
+  const dt = fromArrow(at);
+
+  t.deepEqual(dt.column('s').data, d, 'extract Arrow struct');
+  t.end();
+});
+
+tape('fromArrow can read nested Apache Arrow structs', t => {
+  const d = [{ foo: 1, bar: { bop: 2 } }, { foo: 2, bar: { bop: 3 } }];
+  const s = arrowStructColumn(Object.keys(d[0]), [
+    arrowColumn(d.map(v => v.foo)),
+    arrowStructColumn(['bop'], [ arrowColumn([2, 3]) ])
+  ]);
+  const at = arrowTable({ s });
+  const dt = fromArrow(at);
+
+  t.deepEqual(dt.column('s').data, d, 'extract nested Arrow struct');
+  t.end();
+});
+
+tape('fromArrow can read Apache Arrow lists with null entries', t => {
+  const items = [arrowColumn([1, 2]), null];
+  const l = {
+    typeId: LIST,
+    numChildren: 1,
+    length: items.length,
+    get: i => items[i]
+  };
+  const at = arrowTable({ l });
+  const dt = fromArrow(at);
+
+  t.deepEqual(dt.column('l').data, [[1, 2], null], 'extract Arrow list with null entry');
+  t.end();
 });
\ No newline at end of file