Skip to content

Commit

Permalink
Merge pull request #87 from uwdata/jh/dict
Browse files Browse the repository at this point in the history
Add Arrow dictionary and other optimizations.
  • Loading branch information
jheer authored Feb 1, 2021
2 parents e5dea74 + 6e98c05 commit aedf7fa
Show file tree
Hide file tree
Showing 40 changed files with 1,222 additions and 725 deletions.
2 changes: 1 addition & 1 deletion docs/api/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ The *unpack* option determines if Arrow data should be "unpacked" from binary fo
* *arrowTable*: An [Apache Arrow](https://arrow.apache.org/docs/js/) data table.
* *options*: An Arrow import options object:
* *columns*: An ordered set of columns to import. The input may consist of: column name strings, column integer indices, objects with current column names as keys and new column names as values (for renaming), or a selection helper function such as [all](#all), [not](#not), or [range](#range).
* *unpack*: A boolean flag (default `false`) to unpack binary-encoded Arrow data to standard JavaScript values. Unpacking can incur an upfront time and memory cost to extract data to new arrays, but can speed up later query processing by enabling faster data access.
* *unpack*: _As of v2.3.0, this option is deprecated and ignored if specified. Instead, Arquero now efficiently handles Arrow columns internally._ A boolean flag (default `false`) to unpack binary-encoded Arrow data to standard JavaScript values. Unpacking can incur an upfront time and memory cost to extract data to new arrays, but can speed up later query processing by enabling faster data access.
*Examples*
Expand Down
7 changes: 6 additions & 1 deletion docs/api/table.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ title: Table \| Arquero API Reference
* [Table Metadata](#metadata)
* [numCols](#numCols), [numRows](#numRows), [totalRows](#totalRows)
* [isFiltered](#isFiltered), [isGrouped](#isGrouped), [isOrdered](#isOrdered)
* [comparator](#foo), [groups](#groups)
* [comparator](#comparator), [groups](#groups), [mask](#mask)
* [params](#params)
* [Table Columns](#columns)
* [column](#column), [columnAt](#columnAt), [columnIndex](#columnIndex)
Expand Down Expand Up @@ -144,6 +144,11 @@ Returns the groupby specification, if defined. A groupby specification is an obj
* *size*: The total number of groups.
* *keys*: Per-row group indices for every row in the table.

<hr/><a id="mask" href="#mask">#</a>
<em>table</em>.<b>mask</b>() · [Source](https://github.com/uwdata/arquero/blob/master/src/table/table.js)

Returns the bitset mask for filtered rows, or null if there is no filter.

<hr/><a id="params" href="#params">#</a>
<em>table</em>.<b>params</b>() · [Source](https://github.com/uwdata/arquero/blob/master/src/table/transformable.js)

Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"devDependencies": {
"@rollup/plugin-json": "^4.1.0",
"@rollup/plugin-node-resolve": "^11.1.1",
"apache-arrow": "3.0.0",
"eslint": "^7.19.0",
"esm": "^3.2.25",
"rimraf": "^3.0.2",
Expand Down
81 changes: 81 additions & 0 deletions perf/arrow-perf.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
const tape = require('tape');
const time = require('./time');
const { ints, sample, strings } = require('./data-gen');
const { fromArrow } = require('..');
const { Dictionary, Int32, Table, Utf8, Vector, predicate } = require('apache-arrow');

/**
 * Register a tape benchmark that times equivalent operations on an Apache
 * Arrow table and on the Arquero table created from it via fromArrow.
 *
 * The table has two columns: `k`, a dictionary-encoded Utf8 column, and
 * `v`, an Int32 column. Timings are printed with console.table.
 *
 * @param {number} N Number of rows to generate.
 * @param {number} nulls Forwarded to the data generators; presumably the
 *  fraction of null values (call sites pass 0 and 0.05) -- confirm against
 *  ./data-gen.
 * @param {string} msg Label appended to the tape test name.
 */
function run(N, nulls, msg) {
  const vectors = [
    Vector.from({
      type: new Dictionary(new Utf8(), new Int32()),
      values: sample(N, strings(100), nulls),
      // NOTE(review): presumably large enough to avoid chunking -- confirm
      highWaterMark: 1e12
    }),
    Vector.from({
      type: new Int32(),
      values: ints(N, -10000, 10000, nulls),
      highWaterMark: 1e12
    })
  ];
  const at = Table.new(vectors, ['k', 'v']);
  const dt = fromArrow(at);

  // Arrow: count rows whose dictionary column equals val.
  const arrowFilterDict = val => time(() => {
    const p = new predicate.Equals(
      new predicate.Col('k'),
      new predicate.Literal(val)
    );
    at.filter(p).count();
  });

  // Arquero: same filter as a table expression. The literal is serialised
  // with JSON.stringify so that a null key becomes `null` (not the string
  // 'null') and keys containing quotes or backslashes stay well-formed.
  const arqueroFilterDict = val => time(() => {
    dt.filter(`d.k === ${JSON.stringify(val)}`).numRows();
  });

  // Arrow: count rows whose integer column is >= val.
  const arrowFilterValue = val => time(() => {
    const p = new predicate.GTeq(
      new predicate.Col('v'),
      new predicate.Literal(val)
    );
    at.filter(p).count();
  });

  // Arquero: same numeric filter as a table expression.
  const arqueroFilterValue = val => time(() => {
    dt.filter(`d.v >= ${val}`).numRows();
  });

  tape(`arrow: ${msg}`, t => {
    // use the key stored at row 50 as the dictionary filter target
    const k = at.getColumn('k').get(50);
    console.table([ // eslint-disable-line
      {
        op: 'table init',
        arrow: time(() => Table.new(vectors, ['k', 'v'])),
        arquero: time(() => fromArrow(at))
      },
      {
        op: 'count dict',
        arrow: time(() => at.countBy('k')),
        arquero: time(() => dt.groupby('k').count())
      },
      {
        op: 'filter dict',
        arrow: arrowFilterDict(k),
        arquero: arqueroFilterDict(k)
      },
      {
        op: 'filter value 0',
        arrow: arrowFilterValue(0),
        arquero: arqueroFilterValue(0)
      },
      {
        op: 'filter value 1',
        arrow: arrowFilterValue(1),
        arquero: arqueroFilterValue(1)
      }
    ]);
    t.end();
  });
}

run(2e6, 0, '2M values');
run(2e6, 0.05, '2M values, 5% nulls');
5 changes: 2 additions & 3 deletions perf/csv-perf.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const tape = require('tape');
const time = require('./time');
const { bools, dates, floats, ints, sample, strings } = require('./data-gen');
const { fromCSV, table } = require('..');

Expand All @@ -8,9 +9,7 @@ function toCSV(...values) {
}

function parse(csv, opt) {
const t0 = Date.now();
const tt = (fromCSV(csv, opt), Date.now() - t0);
return tt;
return time(() => fromCSV(csv, opt));
}

function run(N, nulls, msg) {
Expand Down
31 changes: 13 additions & 18 deletions perf/derive-perf.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
const tape = require('tape');
const time = require('./time');
const { floats, sample, strings } = require('./data-gen');
const { table } = require('..');

function time(fn, ...args) {
const t0 = Date.now();
fn(...args);
return Date.now() - t0;
}

function run(N, nulls, msg) {
const dt = table({
k: sample(N, strings(10), nulls),
Expand All @@ -17,26 +12,26 @@ function run(N, nulls, msg) {
});

const gt = dt.groupby('k');
const sum = { s: 'd.a + d.b' };
const pdf = { p: 'distinct(d.c) / count()'};
const zsc = { z: '(d.a - mean(d.a)) / stdev(d.a) || 0' };
const sum2 = { s: 'd.a + d.b' };
const fill = { p: 'fill_down(d.c)' };
const zscr = { z: '(d.a - mean(d.a)) / stdev(d.a) || 0' };

tape(`derive: ${msg}`, t => {
console.table([ // eslint-disable-line
{
op: 'sum',
flat: time(() => dt.derive(sum)),
group: time(() => gt.derive(sum))
op: 'sum2',
flat: time(() => dt.derive(sum2)),
group: time(() => gt.derive(sum2))
},
{
op: 'zscore',
flat: time(() => dt.derive(zsc)),
group: time(() => gt.derive(zsc))
op: 'fill',
flat: time(() => dt.derive(fill)),
group: time(() => gt.derive(fill))
},
{
op: 'prob',
flat: time(() => dt.derive(pdf)),
group: time(() => gt.derive(pdf))
op: 'zscore',
flat: time(() => dt.derive(zscr)),
group: time(() => gt.derive(zscr))
}
]);
t.end();
Expand Down
7 changes: 1 addition & 6 deletions perf/filter-perf.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
const tape = require('tape');
const time = require('./time');
const { floats, ints, sample, strings } = require('./data-gen');
const { table } = require('..');

function time(fn, ...args) {
const t0 = Date.now();
fn(...args);
return Date.now() - t0;
}

function run(N, nulls, msg) {
const dt = table({
a: ints(N, -10000, 10000, nulls),
Expand Down
67 changes: 22 additions & 45 deletions perf/rollup-perf.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
const tape = require('tape');
const time = require('./time');
const { floats, sample, strings } = require('./data-gen');
const { table, op } = require('..');

function time(fn, ...args) {
const t0 = Date.now();
fn(...args);
return Date.now() - t0;
}

function run(N, nulls, msg) {
const dt = table({
k: sample(N, strings(10), nulls),
Expand All @@ -16,6 +11,7 @@ function run(N, nulls, msg) {
c: floats(N, -10000, 10000, nulls)
});

const g = time(() => dt.groupby('k'));
const gt = dt.groupby('k');
const sum1 = { a: op.sum('a') };
const sum2 = { a: op.sum('a'), b: op.sum('b') };
Expand All @@ -26,41 +22,27 @@ function run(N, nulls, msg) {
const avg3 = { a: op.mean('a'), b: op.mean('b'), c: op.mean('c') };

tape(`rollup: ${msg}`, t => {
// console.table([ // eslint-disable-line
// {
// type: 'flat cnt',
// '1d': time(() => dt.count())
// },
// {
// type: 'flat sum',
// '1d': time(() => dt.rollup(sum1)),
// '2d': time(() => dt.rollup(sum2)),
// '3d': time(() => dt.rollup(sum3))
// },
// {
// type: 'flat avg',
// '1d': time(() => dt.rollup(avg1)),
// '2d': time(() => dt.rollup(avg2)),
// '3d': time(() => dt.rollup(avg3))
// },
// {
// type: 'group cnt',
// '1d': time(() => gt.count())
// },
// {
// type: 'group sum',
// '1d': time(() => dt.rollup(sum1)),
// '2d': time(() => dt.rollup(sum2)),
// '3d': time(() => dt.rollup(sum3))
// },
// {
// type: 'group avg',
// '1d': time(() => dt.rollup(avg1)),
// '2d': time(() => dt.rollup(avg2)),
// '3d': time(() => dt.rollup(avg3))
// }
// ]);
const fc = time(() => dt.count());
const gc = time(() => gt.count());
console.table([ // eslint-disable-line
{
op: 'group',
'flat-1': 0,
'flat-2': 0,
'flat-3': 0,
'group-1': g,
'group-2': g,
'group-3': g
},
{
op: 'count',
'flat-1': fc,
'flat-2': fc,
'flat-3': fc,
'group-1': gc,
'group-2': gc,
'group-3': gc
},
{
op: 'sum',
'flat-1': time(() => dt.rollup(sum1)),
Expand All @@ -78,11 +60,6 @@ function run(N, nulls, msg) {
'group-1': time(() => dt.rollup(avg1)),
'group-2': time(() => dt.rollup(avg2)),
'group-3': time(() => dt.rollup(avg3))
},
{
op: 'count',
'flat-1': time(() => dt.count()),
'group-1': time(() => gt.count())
}
]);
t.end();
Expand Down
7 changes: 7 additions & 0 deletions perf/time.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
const { performance } = require('perf_hooks');

module.exports = function time(fn, ...args) {
const t0 = performance.now();
fn(...args);
return Math.round(performance.now() - t0);
};
48 changes: 31 additions & 17 deletions src/engine/derive.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { window } from './window/window';
import { aggregate } from './reduce/util';
import { isWindow } from '../op';
import columnSet from '../table/column-set';
import repeat from '../util/repeat';

function isWindowed(op) {
return isWindow(op.name) ||
Expand All @@ -19,11 +20,16 @@ export default function(table, { names, exprs, ops }, options = {}) {

// analyze operations, compute non-windowed aggregates
const [ aggOps, winOps ] = segmentOps(ops);
const result = aggregate(table, aggOps);

const size = table.isGrouped() ? table.groups().size : 1;
const result = aggregate(
table, aggOps,
repeat(ops.length, () => Array(size))
);

// perform table scans to generate output values
winOps.length
? window(table, data, exprs, table.isGrouped() ? result : [result], winOps)
? window(table, data, exprs, result, winOps)
: output(table, data, exprs, result);

return table.create(cols);
Expand All @@ -43,22 +49,30 @@ function segmentOps(ops) {
return [aggOps, winOps];
}

function output(table, data, exprs, result = {}) {
const groups = table.groups();
const n = data.length;
function output(table, cols, exprs, result) {
const bits = table.mask();
const data = table.data();
const { keys } = table.groups() || {};
const op = keys
? (id, row) => result[id][keys[row]]
: id => result[id][0];

if (groups) {
const { keys } = groups;
for (let i = 0; i < n; ++i) {
const get = exprs[i];
const col = data[i];
table.scan((row, d) => col[row] = get(row, d, result[keys[row]]));
}
} else {
for (let i = 0; i < n; ++i) {
const get = exprs[i];
const col = data[i];
table.scan((row, d) => col[row] = get(row, d, result));
const m = cols.length;
for (let j = 0; j < m; ++j) {
const get = exprs[j];
const col = cols[j];

// inline the following for performance:
// table.scan((i, data) => col[i] = get(i, data, op));
if (bits) {
for (let i = bits.next(0); i >= 0; i = bits.next(i + 1)) {
col[i] = get(i, data, op);
}
} else {
const n = table.totalRows();
for (let i = 0; i < n; ++i) {
col[i] = get(i, data, op);
}
}
}
}
Loading

0 comments on commit aedf7fa

Please sign in to comment.