Skip to content

Commit 2715f2d

Browse files
author
Daniel Kerrigan
committed
delta vis
1 parent 44c6f98 commit 2715f2d

20 files changed

+3576
-29
lines changed

demo_data/normal.ipynb

Lines changed: 486 additions & 0 deletions
Large diffs are not rendered by default.

demo_data/synthetic.ipynb

Lines changed: 482 additions & 0 deletions
Large diffs are not rendered by default.

public/datasets/datasets-test.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ Server Performance,https://gist.githubusercontent.com/DanielKerrigan/60327aa04a8
1010
Bank Marketing,https://gist.githubusercontent.com/DanielKerrigan/13ccc22bb97431ba26ad03c3c74864d1/raw/27894766de9389811c665cb40d66d902f4fd5fe5/bank-marketing.csv
1111
Australia Rain,https://gist.githubusercontent.com/DanielKerrigan/ddb491dcf5b0abc9f2ad2773cc3839ee/raw/3a059f5b079b3a61ad9e2a9004d0b85c575d7177/weather-aus.csv
1212
UI Test,/datasets/test/ui-test.csv
13+
Normal,/datasets/normal.csv
14+
Synthetic,/datasets/synthetic.csv
1315
data-10-10000.csv,/datasets/test/data-10-10000.csv
1416
data-100-10000.csv,/datasets/test/data-100-10000.csv
1517
data-20-10000.csv,/datasets/test/data-20-10000.csv

public/datasets/normal.csv

Lines changed: 1001 additions & 0 deletions
Large diffs are not rendered by default.

public/datasets/synthetic.csv

Lines changed: 1001 additions & 0 deletions
Large diffs are not rendered by default.

src/DataTransformer.js

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,12 +222,31 @@ function getMetadata(dataset) {
222222
return acc;
223223
}, {});
224224

225+
const classes = {
226+
groundTruth: d3.rollup(dataset, v => v.length / dataset.length, d => d.label)
227+
};
228+
229+
if (hasPredictions) {
230+
// classes.predictionCounts = d3.rollup(dataset, v => v.length / dataset.length, d => d.prediction);
231+
classes.predictions = d3.rollup(
232+
dataset,
233+
v =>
234+
d3.rollup(
235+
v,
236+
g => g.length / dataset.length,
237+
p => p.prediction === p.label ? "correct" : "incorrect"
238+
),
239+
d => d.prediction
240+
);
241+
}
242+
225243
return {
226244
features: features,
227245
featureNames: featureNames,
228246
labelValues: labelValues,
229247
hasPredictions: hasPredictions,
230248
size: dataset.length,
249+
classes: classes,
231250
}
232251
}
233252

@@ -246,6 +265,17 @@ function getData(metadata, selectedFeatures, dataset) {
246265
size: g.length
247266
};
248267

268+
const diff = {
269+
groundTruth: new Map(
270+
metadata.labelValues.map(label => [
271+
label,
272+
(groundTruth.get(label) / g.length) - metadata.classes.groundTruth.get(label)
273+
])
274+
)
275+
};
276+
277+
node.diff = diff;
278+
249279
if (metadata.hasPredictions) {
250280
// if the dataset has model predictions,
251281
// also count the number of each prediction
@@ -268,6 +298,24 @@ function getData(metadata, selectedFeatures, dataset) {
268298

269299
node['predictionCounts'] = predictionCounts;
270300
node['predictionResults'] = predictionResults;
301+
302+
node.diff.predictions = d3.cross(metadata.labelValues, ["correct", "incorrect"])
303+
.map(([label, correct]) => {
304+
const predictedCount = predictionResults.get(label)?.get(correct) ?? 0;
305+
const predictedPercent = predictedCount / g.length;
306+
const rootPercent = metadata.classes.predictions.get(label)?.get(correct) ?? 0;
307+
const diffPercent = Math.max(0, predictedPercent - rootPercent);
308+
const diffCount = diffPercent * g.length;
309+
const sameCount = predictedCount - diffCount;
310+
311+
return {
312+
label,
313+
correct,
314+
diffCount,
315+
sameCount,
316+
count: predictedCount,
317+
};
318+
})
271319
}
272320

273321
return node;

src/FeatureRatings.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ import {
22
entropy,
33
errorDeviation,
44
errorCount,
5-
errorPercent
5+
errorPercent,
6+
antiEntropy,
67
} from './RatingMetrics.js';
78

89
import * as d3 from "d3";
@@ -26,6 +27,8 @@ function getFeatureRatings({criterion, selected, metadata, dataset}) {
2627
ratings = errorPercent({selected, metadata, dataset, available});
2728
} else if (criterion === 'errorDeviation') {
2829
ratings = errorDeviation({selected, metadata, dataset, available})
30+
} else if (criterion === 'antiEntropy') {
31+
ratings = antiEntropy({selected, metadata, dataset, available})
2932
}
3033

3134
return normalize(ratings);

src/RatingMetrics.js

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export {
88
errorCount,
99
errorPercent,
1010
getErrorCountForSquare,
11+
antiEntropy,
1112
};
1213

1314
/*
@@ -20,7 +21,29 @@ function entropy({selected, metadata, dataset, available}) {
2021
const data = getData(metadata, sel, dataset);
2122

2223
// give higher rating to lower entropy: score is 1 minus the weighted average entropy
23-
const value = -d3.sum(data, square => {
24+
const value = 1-d3.sum(data, square => {
25+
const weight = square.size / metadata.size;
26+
return weight * H(square);
27+
});
28+
29+
return {feature, value};
30+
});
31+
32+
function H(square) {
33+
return -d3.sum(square.groundTruth.values(), v => {
34+
const p = v / square.size;
35+
return p * Math.log2(p);
36+
});
37+
}
38+
}
39+
40+
function antiEntropy({selected, metadata, dataset, available}) {
41+
return available.map(feature => {
42+
const sel = [...selected, feature];
43+
const data = getData(metadata, sel, dataset);
44+
45+
// give higher rating to HIGHER entropy: sum the weighted entropy without negating
46+
const value = d3.sum(data, square => {
2447
const weight = square.size / metadata.size;
2548
return weight * H(square);
2649
});

src/SubsetRecommender.js

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,18 @@ import { getData } from './DataTransformer.js';
99

1010
import * as d3 from "d3";
1111

12-
export { getRecommendedSubsets };
12+
export { getRecommendedSubsets, timeSubsets, topSubsets, randomSubsets };
13+
14+
// Lookup table from a criterion name (as passed in by the UI / worker
// message) to the scoring function that rates a candidate feature subset.
const criteria = {
  entropy,
  antiEntropy,
};
1318

1419
function entropy({set, metadata, dataset}) {
1520
const data = getData(metadata, set, dataset);
1621

1722
// give higher rating to lower entropy: score is 1 minus the weighted average entropy
18-
const value = -d3.sum(data, square => {
23+
const value = 1 - d3.sum(data, square => {
1924
const weight = square.size / metadata.size;
2025
return weight * H(square);
2126
});
@@ -30,18 +35,64 @@ function entropy({set, metadata, dataset}) {
3035
}
3136
}
3237

33-
function getRecommendedSubsets({criterion, selected, metadata, dataset}) {
38+
// Complement of the entropy criterion: subsets whose squares have HIGH
// ground-truth entropy score highest.
function antiEntropy({set, metadata, dataset}) {
  const entropyScore = entropy({set, metadata, dataset});
  return 1 - entropyScore;
}
41+
42+
/**
 * Benchmark helper: run getRecommendedSubsets at pruning thresholds from
 * 0.99 down to 0.5 and record how long each run takes and how many subsets
 * it yields.
 *
 * @param {Object} args
 * @param {string} args.criterion - key into the `criteria` scoring table
 * @param {string[]} args.selected - currently selected features
 * @param {Object} args.metadata - dataset metadata from getMetadata
 * @param {Object[]} args.dataset - the rows being visualized
 * @returns {Object[]} one measurement per threshold:
 *   {threshold, percentBetter, numSubsets, ms}
 */
function timeSubsets({criterion, selected, metadata, dataset}) {
  const data = [];

  // highest (most aggressive pruning) threshold first
  d3.range(0.5, 1, 0.01).reverse().forEach(threshold => {
    const t0 = performance.now();
    const subsets = getRecommendedSubsets({criterion, selected, metadata, dataset}, threshold);
    const t1 = performance.now();

    data.push({
      threshold,
      percentBetter: 1 - threshold,
      numSubsets: subsets.length,
      ms: t1 - t0
    });
  });

  console.log(JSON.stringify(data));

  // Fix: previously the measurements were only logged and the function
  // returned undefined; return them so callers can use the results.
  return data;
}
60+
61+
/**
 * Exhaustive-search helper: score EVERY subset of 2, 3, and 4 features with
 * the requested criterion and log them sorted from best to worst, for
 * comparison against the pruned Apriori-style search.
 *
 * @param {Object} args
 * @param {string} args.criterion - key into the `criteria` scoring table
 * @param {string[]} args.selected - currently selected features (unused here)
 * @param {Object} args.metadata - dataset metadata from getMetadata
 * @param {Object[]} args.dataset - the rows being visualized
 * @returns {Object} map from subset size (2, 3, 4) to sorted
 *   [{set, score}, ...] arrays
 */
function topSubsets({criterion, selected, metadata, dataset}) {
  const data = {};

  [2, 3, 4].forEach(n => {
    console.log(n);
    const sets = getPermutations(metadata.featureNames, n)
      .map(cand => ({
        set: cand,
        // Fix: dispatch through the criteria table instead of hard-coding
        // entropy, so the `criterion` argument is honored (consistent with
        // getRecommendedSubsets and getLarge1ItemSets).
        score: criteria[criterion]({set: cand, metadata, dataset}),
      }))
      .sort((a, b) => d3.descending(a.score, b.score));

    data[n] = sets;
  });

  console.log(JSON.stringify(data));

  // Fix: return the collected rankings rather than only logging them.
  return data;
}
78+
79+
// Baseline helper: return 10 feature pairs chosen uniformly at random.
// criterion, selected, and dataset are accepted for interface parity with
// the other subset generators but are not used.
function randomSubsets({criterion, selected, metadata, dataset}) {
  const allPairs = getPermutations(metadata.featureNames, 2);
  const shuffled = d3.shuffle(allPairs);
  return shuffled.slice(0, 10);
}
82+
83+
function getRecommendedSubsets({criterion, selected, metadata, dataset}, percent=1.0) {
3484
const L = [getLarge1ItemSets({criterion, metadata, dataset})];
3585

3686
for (let k = 1; k < 4; k++) {
3787
const min = d3.min(L[k - 1], d => d.score);
38-
const threshold = min * .75;
88+
89+
const threshold = min * percent;
3990

4091
const candidates = getCandidates(L[k - 1]);
4192

4293
const candScores = candidates.map(cand => ({
4394
set: cand,
44-
score: entropy({set: cand, metadata, dataset})
95+
score: criteria[criterion]({set: cand, metadata, dataset})
4596
}));
4697

4798
const valid = candScores.filter(({score}) => {
@@ -51,17 +102,20 @@ function getRecommendedSubsets({criterion, selected, metadata, dataset}) {
51102
L.push(valid);
52103
}
53104

54-
return L.flat().sort((a, b) => d3.descending(a.score, b.score)).map(d => d.set);
105+
const sorted = L.flat().sort((a, b) => d3.descending(a.score, b.score));
106+
107+
return sorted.map(d => d.set);
55108
}
56109

57110
// Seed the Apriori-style search: score each single-feature set with the
// chosen criterion and keep the top min(floor(#features / 2), 10) of them,
// sorted from best to worst.
function getLarge1ItemSets({criterion, metadata, dataset}) {
  const numStart = Math.min(Math.floor(metadata.featureNames.length / 2), 10);
  const rate = criteria[criterion];

  const ranked = metadata.featureNames
    .map(feature => ({
      score: rate({set: [feature], metadata, dataset}),
      set: [feature]
    }))
    .sort((a, b) => d3.descending(a.score, b.score));

  return ranked.slice(0, numStart);
}

src/SubsetSuggesterWorker.js

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
import { getRecommendedSubsets } from './SubsetRecommender';
1+
import { getRecommendedSubsets, timeSubsets, topSubsets, randomSubsets } from './SubsetRecommender';
22

33
// Web Worker entry point: compute recommended feature subsets off the main
// thread and post them back to the caller.
// NOTE(review): the alternative generators (randomSubsets, timeSubsets,
// topSubsets) can be swapped in here for debugging and benchmarking.
self.onmessage = e => {
  const results = getRecommendedSubsets(e.data);
  postMessage(results);
};

0 commit comments

Comments
 (0)