Skip to content

Commit

Permalink
feat: Support nested Arrow types List and Struct.
Browse files Browse the repository at this point in the history
  • Loading branch information
jheer committed Dec 9, 2020
1 parent 63575c9 commit 95215a4
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 4 deletions.
49 changes: 46 additions & 3 deletions src/format/from-arrow.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import ColumnTable from '../table/column-table';
import error from '../util/error';
import toString from '../util/to-string';
import unroll from '../util/unroll';

// Hardwire Arrow type ids to avoid explicit dependency
// https://github.com/apache/arrow/blob/master/js/src/enum.ts
// NOTE(review): these literals presumably mirror the `Type` enum in the
// linked file; confirm they still match whenever the Arrow version changes.
// Both are exported so other modules (e.g. test stubs) can tag nested vectors.

// type id used to recognize Arrow List vectors
export const LIST = 12;
// type id used to recognize Arrow Struct vectors
export const STRUCT = 13;

/**
* Options for Apache Arrow import.
Expand All @@ -26,15 +33,51 @@ export default function(arrowTable, options = {}) {
names.forEach(name => {
const column = arrowTable.getColumn(name);
if (column == null) {
error(`Arrow column does not exist: ${JSON.stringify(name)}`);
error(`Arrow column does not exist: ${toString(name)}`);
}
columns[name] = unpack ? arrayFromArrow(column) : column;
columns[name] = column.numChildren ? arrayFromNested(column)
: unpack ? arrayFromVector(column)
: column;
});

return new ColumnTable(columns);
}

function arrayFromArrow(column) {
/**
 * Convert a nested Arrow vector into a standard array with one extracted
 * value per entry. Dispatches on the vector's type id: LIST and STRUCT
 * vectors get a matching extractor; any other type id is reported
 * through `error` with the vector's `VectorName`.
 * @param {object} vector The nested Arrow vector to extract.
 * @return {Array} An array of length `vector.length` of extracted values.
 */
function arrayFromNested(vector) {
  let extract;

  switch (vector.typeId) {
    case LIST:
      extract = listExtractor(vector);
      break;
    case STRUCT:
      extract = structExtractor(vector);
      break;
    default:
      extract = error(`Unsupported Arrow type: ${toString(vector.VectorName)}`);
  }

  // call the extractor once per index to materialize every nested value
  return Array.from({ length: vector.length }, extract);
}

/**
 * Create an extractor function for the entries of an Arrow List vector.
 * The returned function has an `Array.from`-style map signature
 * `(value, index)` and ignores its first argument.
 * @param {object} vector The Arrow List vector to read entries from.
 * @return {function(*, number): (Array|null)} Function that returns the
 *  extracted list at the given index, or null if that entry is missing.
 */
function listExtractor(vector) {
  return (_, i) => {
    const v = vector.get(i);

    // a null (missing) list entry has no child vector to unpack;
    // pass the null through instead of failing on `v.numChildren`
    if (v == null) return null;

    // recurse if the entry is itself nested, otherwise convert to array
    return v.numChildren ? arrayFromNested(v) : arrayFromVector(v);
  };
}

/**
 * Create an extractor function for the entries of an Arrow Struct vector.
 * Each child vector is unpacked into its own array up front; the returned
 * function then assembles one object per index from those parallel arrays.
 * @param {object} vector The Arrow Struct vector to read fields from.
 * @return {Function} Function built by `unroll` that returns an object
 *  keyed by field name for a given index.
 */
function structExtractor(vector) {
  // field names, in child order
  const fields = vector.type.children.map(child => child.name);

  // one unpacked array per field, parallel to the field names
  const columns = fields.map((_, index) => {
    const child = vector.getChildAt(index);
    return child.numChildren
      ? arrayFromNested(child)
      : arrayFromVector(child);
  });

  // source text for each object property: "<name>":_<k>[i]
  // NOTE(review): presumably `unroll` binds the k-th array as `_k`; confirm
  const props = fields.map((name, k) => `${toString(name)}:_${k}[i]`);

  // function to generate objects with field name properties
  return unroll(columns, '_,i', `({${props.join(',')}})`);
}

function arrayFromVector(column) {
// if dictionary column, perform more efficient extraction
// if has null values, extract to standard array
// otherwise, let Arrow try to use copy-less subarray call
Expand Down
55 changes: 54 additions & 1 deletion test/format/arrow-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import tape from 'tape';
import fromArrow from '../../src/format/from-arrow';
import fromArrow, { LIST, STRUCT } from '../../src/format/from-arrow';

// test stubs for Arrow Column API
function arrowColumn(data, nullCount = 0) {
Expand Down Expand Up @@ -50,6 +50,23 @@ function arrowDictionary(data) {
return column;
}

// test stub for an Arrow List column: every inner array becomes its own
// column stub, and the outer column stub is tagged as a List type
function arrowListColumn(data) {
  const entries = data.map(values => arrowColumn(values));
  const column = arrowColumn(entries);

  // mark as a nested List vector so fromArrow takes the nested path
  return Object.assign(column, { typeId: LIST, numChildren: 1 });
}

// test stub for an Arrow Struct column: exposes field metadata under
// `type.children` and serves the given child columns by position
function arrowStructColumn(names, children) {
  // field descriptors carrying only a name, one per struct field
  const fields = names.map(name => ({ name }));

  return {
    type: { children: fields },
    typeId: STRUCT,
    // the struct length is taken from its first child column
    length: children[0].length,
    numChildren: names.length,
    getChildAt: index => children[index]
  };
}

// test stub for Arrow Table API
function arrowTable(columns) {
return {
Expand Down Expand Up @@ -84,4 +101,40 @@ tape('fromArrow can unpack Apache Arrow tables', t => {
t.deepEqual(dt.column('x').data, x._data, 'unpack dictionary column without nulls');
t.deepEqual(dt.column('y').data, y._data, 'unpack dictionary column with nulls');
t.end();
});

// a List column should come back as one plain array per entry
tape('fromArrow can read Apache Arrow lists', t => {
  const expected = [[1, 2, 3], [4, 5]];
  const listColumn = arrowListColumn(expected);
  const table = fromArrow(arrowTable({ l: listColumn }));

  t.deepEqual(table.column('l').data, expected, 'extract Arrow list');
  t.end();
});

// a Struct column with a scalar field and a List field should come back
// as one object per entry
tape('fromArrow can read Apache Arrow structs', t => {
  const expected = [{ foo: 1, bar: [2, 3] }, { foo: 2, bar: [4] }];
  const fieldNames = Object.keys(expected[0]);
  const fooColumn = arrowColumn(expected.map(row => row.foo));
  const barColumn = arrowListColumn(expected.map(row => row.bar));
  const structColumn = arrowStructColumn(fieldNames, [fooColumn, barColumn]);
  const table = fromArrow(arrowTable({ s: structColumn }));

  t.deepEqual(table.column('s').data, expected, 'extract Arrow struct');
  t.end();
});

// a Struct column whose field is itself a Struct should be extracted
// recursively into nested objects
tape('fromArrow can read nested Apache Arrow structs', t => {
  const expected = [{ foo: 1, bar: { bop: 2 } }, { foo: 2, bar: { bop: 3 } }];
  const fieldNames = Object.keys(expected[0]);
  const fooColumn = arrowColumn(expected.map(row => row.foo));
  const barColumn = arrowStructColumn(['bop'], [ arrowColumn([2, 3]) ]);
  const structColumn = arrowStructColumn(fieldNames, [fooColumn, barColumn]);
  const table = fromArrow(arrowTable({ s: structColumn }));

  t.deepEqual(table.column('s').data, expected, 'extract nested Arrow struct');
  t.end();
});

0 comments on commit 95215a4

Please sign in to comment.