Skip to content

Commit

Permalink
refactor: make AGNES algorithm closer to R (#11)
Browse files Browse the repository at this point in the history
BREAKING CHANGE:

- use the Lance-Williams algorithm to update cluster distances.
- add other methods that exist in R.
- remove `ClusterLeaf` class and use an `isLeaf` property instead.
- remove `index` array from clusters. Instead, an `indices()` method has been added to compute it.
- add a `size` property to clusters that indicates the number of leaves below it.
- the default `method` is now `'complete'`.
- DIANA has been removed from the package pending rewriting it.
  • Loading branch information
targos authored Jul 16, 2019
1 parent 5266b90 commit 1517124
Show file tree
Hide file tree
Showing 12 changed files with 518 additions and 465 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,26 @@ Hierarchical clustering algorithms in JavaScript.

## Installation

`npm install ml-hclust`
`npm i ml-hclust`

## [API Documentation](https://mljs.github.io/hclust/)

## Methods
## Usage

Generate a clustering hierarchy.
### AGNES

```js
const { agnes } = require('ml-hclust');

const tree = agnes(data, {
method: 'ward',
});
```

## Implemented algorithms

- [x] [AGNES](http://dx.doi.org/10.1002/9780470316801.ch5) (AGglomerative NESting): Continuously merge nodes that have the least dissimilarity.
- [x] [DIANA](http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470276800.html) (Divisive ANAlysis): The process starts at the root with all the points as one cluster and recursively splits the higher level clusters to build the dendrogram.
- [ ] [DIANA](http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470276800.html) (Divisive ANAlysis): The process starts at the root with all the points as one cluster and recursively splits the higher level clusters to build the dendrogram.
- [ ] [BIRCH](http://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf) (Balanced Iterative Reducing and Clustering using Hierarchies): Incrementally construct a CF (Clustering Feature) tree, a hierarchical data structure for multiphase clustering
- [ ] [CURE](http://www.cs.bu.edu/fac/gkollios/ada05/LectNotes/guha98cure.pdf) (Clustering Using REpresentatives):
- [ ] [CHAMELEON](http://www.google.ch/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0CCQQFjAAahUKEwj6t4n_sZbGAhXDaxQKHXCLCmQ&url=http%3A%2F%2Fglaros.dtc.umn.edu%2Fgkhome%2Ffetch%2Fpapers%2FchameleonCOMPUTER99.pdf&ei=kDqBVfqvKsPXUfCWqqAG&usg=AFQjCNEYcGqCxN5N_GlP4Z__UF09aHegQg&sig2=9JkxZ5VS7iDbiJT-imX5Pg&bvm=bv.96041959,d.d24&cad=rja)
Expand All @@ -32,6 +42,7 @@ npm test
## Authors

- [Miguel Asencio](https://github.com/maasencioh)
- [Michael Zasso](https://github.com/targos)

## License

Expand Down
29 changes: 29 additions & 0 deletions experiment.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { agnes } from './src';

// Symmetric 5x5 matrix of pairwise distances, passed to AGNES as-is
// (see `isDistanceMatrix` below).
const distances = [
  [0, 17, 21, 31, 23],
  [17, 0, 30, 34, 21],
  [21, 30, 0, 28, 39],
  [31, 34, 28, 0, 43],
  [23, 21, 39, 43, 0],
];

const tree = agnes(distances, { method: 'ward', isDistanceMatrix: true });

// Walk every node: print each leaf as a 1-based label and remember every
// strictly positive height encountered along the way.
const mergeHeights = [];
tree.traverse((node) => {
  if (node.isLeaf) {
    console.log(node.index + 1);
  }
  if (node.height > 0) {
    mergeHeights.push(node.height);
  }
});

// Numeric ascending order (the default sort would compare as strings).
mergeHeights.sort((a, b) => a - b);

console.log(mergeHeights);
// console.log(require('util').inspect(tree, { depth: Infinity, colors: true }));
37 changes: 20 additions & 17 deletions hclust.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,43 @@ export type AgglomerationMethod =
| 'single'
| 'complete'
| 'average'
| 'upgma'
| 'wpgma'
| 'median'
| 'wpgmc'
| 'centroid'
| 'ward';
| 'upgmc'
| 'ward'
| 'ward2';

export interface AgnesOptions<T> {
distanceFunction?: (a: T, b: T) => number;
method?: AgglomerationMethod;
isDistanceMatrix?: boolean;
}

export interface DianaOptions<T> {
distanceFunction?: (a: T, b: T) => number;
}
// export interface DianaOptions<T> {
// distanceFunction?: (a: T, b: T) => number;
// }

export interface Cluster {
children: Cluster[];
distance: number;
index: ClusterLeaf[];
height: number;
size: number;
index: number;
isLeaf: boolean;
cut: (threshold: number) => Cluster[];
group: (minGroups: number) => Cluster;
group: (groups: number) => Cluster;
traverse: (cb: (cluster: Cluster) => void) => void;
}

export interface ClusterLeaf extends Cluster {
children: [];
distance: 0;
index: number;
indices: () => number[];
}

export function agnes<T = number[]>(
data: T[],
options?: AgnesOptions<T>,
): Cluster;

export function diana<T = number[]>(
data: T[],
options?: DianaOptions<T>,
): Cluster;
// export function diana<T = number[]>(
// data: T[],
// options?: DianaOptions<T>,
// ): Cluster;
19 changes: 10 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ml-hclust",
"version": "2.0.3",
"version": "3.0.0-1",
"description": "Hierarchical clustering algorithms",
"main": "hclust.js",
"module": "src/index.js",
Expand Down Expand Up @@ -41,18 +41,19 @@
},
"homepage": "https://github.com/mljs/hclust",
"devDependencies": {
"@babel/plugin-transform-modules-commonjs": "^7.4.4",
"eslint": "^5.16.0",
"@babel/plugin-transform-modules-commonjs": "^7.5.0",
"eslint": "^6.0.1",
"eslint-config-cheminfo": "^1.20.1",
"eslint-plugin-import": "^2.17.2",
"eslint-plugin-jest": "^22.5.1",
"jest": "^24.7.1",
"rollup": "^1.10.1"
"eslint-plugin-import": "^2.18.0",
"eslint-plugin-jest": "^22.7.2",
"esm": "^3.2.25",
"jest": "^24.8.0",
"rollup": "^1.16.7"
},
"dependencies": {
"heap": "^0.2.6",
"ml-array-median": "^1.1.1",
"ml-distance-euclidean": "^2.0.0",
"ml-distance-matrix": "^1.0.0"
"ml-distance-matrix": "^2.0.0",
"ml-matrix": "^6.1.2"
}
}
69 changes: 43 additions & 26 deletions src/Cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,29 @@ import Heap from 'heap';
export default class Cluster {
constructor() {
this.children = [];
this.distance = -1;
this.index = [];
this.height = 0;
this.size = 1;
this.index = -1;
this.isLeaf = false;
}

/**
* Creates an array of values where maximum distance smaller than the threshold
* Creates an array of clusters where the maximum height is smaller than or equal to the threshold
* @param {number} threshold
* @return {Array <Cluster>}
* @return {Array<Cluster>}
*/
cut(threshold) {
if (threshold < 0) throw new RangeError('Threshold too small');
var root = new Cluster();
root.children = this.children;
root.distance = this.distance;
root.index = this.index;
var list = [root];
var ans = [];
if (typeof threshold !== 'number') {
throw new TypeError('threshold must be a number');
}
if (threshold < 0) {
throw new RangeError('threshold must be a positive number');
}
let list = [this];
const ans = [];
while (list.length > 0) {
var aux = list.shift();
if (threshold >= aux.distance) {
const aux = list.shift();
if (threshold >= aux.height) {
ans.push(aux);
} else {
list = list.concat(aux.children);
Expand All @@ -32,22 +35,22 @@ export default class Cluster {
}

/**
* Merge the leaves in the minimum way to have 'minGroups' number of clusters
* @param {number} minGroups - Them minimum number of children the first level of the tree should have
* Merge the leaves in the minimum way to have `groups` number of clusters.
* @param {number} groups - The number of children the first level of the tree should have.
* @return {Cluster}
*/
group(minGroups) {
if (!Number.isInteger(minGroups) || minGroups < 1) {
throw new RangeError('Number of groups must be a positive integer');
group(groups) {
if (!Number.isInteger(groups) || groups < 1) {
throw new RangeError('groups must be a positive integer');
}

const heap = new Heap(function (a, b) {
return b.distance - a.distance;
const heap = new Heap((a, b) => {
return b.height - a.height;
});

heap.push(this);

while (heap.size() < minGroups) {
while (heap.size() < groups) {
var first = heap.pop();
if (first.children.length === 0) {
break;
Expand All @@ -57,25 +60,39 @@ export default class Cluster {

var root = new Cluster();
root.children = heap.toArray();
root.distance = this.distance;
root.height = this.height;

return root;
}

/**
* Traverses the tree depth-first and provide callback to be called on each individual node
* Traverses the tree depth-first and calls the provided callback with each individual node
* @param {function} cb - The callback to be called on each node encounter
* @type {Cluster}
*/
traverse(cb) {
function visit(root, callback) {
callback(root);
if (root.children) {
for (var i = root.children.length - 1; i >= 0; i--) {
visit(root.children[i], callback);
for (const child of root.children) {
visit(child, callback);
}
}
}
visit(this, cb);
}

/**
* Returns a list of indices for all the leaves of this cluster.
* The list is ordered in such a way that a dendrogram could be drawn without crossing branches.
* @returns {Array<number>}
*/
indices() {
const result = [];
this.traverse((cluster) => {
if (cluster.isLeaf) {
result.push(cluster.index);
}
});
return result;
}
}
10 changes: 0 additions & 10 deletions src/ClusterLeaf.js

This file was deleted.

31 changes: 31 additions & 0 deletions src/__tests__/agnes.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import * as data from '../../testData';

import { agnes } from '..';

// Root height when clustering raw feature vectors with the default options.
test('AGNES with feature matrix', () => {
  const clust = agnes(data.features1);
  expect(clust.height).toBeCloseTo(7.2111, 4);
});

// Same expected root height when a precomputed distance matrix is supplied.
// NOTE(review): presumably `distanceMatrix1` holds the pairwise distances of
// `features1` - confirm against testData.
test('AGNES with distance matrix', () => {
  const clust = agnes(data.distanceMatrix1, { isDistanceMatrix: true });
  expect(clust.height).toBeCloseTo(7.2111, 4);
});

// The root height for the second distance matrix must not exceed 1.
test('AGNES with distance matrix 2', () => {
  const clust = agnes(data.distanceMatrix2, { isDistanceMatrix: true });
  expect(clust.height).not.toBeGreaterThan(1);
});

// With the centroid method, every node must report a real, non-negative height.
test('AGNES centroid', () => {
  const clust = agnes(data.distanceMatrix2, {
    isDistanceMatrix: true,
    method: 'centroid',
  });

  clust.traverse((node) => {
    expect(typeof node.height).toBe('number');
    // Dedicated matcher: clearer intent and failure message than `.not.toBe(NaN)`.
    expect(node.height).not.toBeNaN();
    expect(node.height).not.toBeLessThan(0);
  });
});
46 changes: 46 additions & 0 deletions src/__tests__/cluster.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import * as data from '../../testData';

import { agnes } from '..';

// The root reports 10 for `size`, and each of its first two children reports 5.
test('size', () => {
  const root = agnes(data.features1);
  expect(root.size).toBe(10);
  const left = root.children[0];
  const right = root.children[1];
  expect(left.size).toBe(5);
  expect(right.size).toBe(5);
});

// Cutting the tree at a threshold of 1.5 yields five clusters.
test('cut', () => {
  const pieces = agnes(data.features1).cut(1.5);
  expect(pieces).toHaveLength(5);
});

// Asking for eight groups yields a root with exactly eight children.
test('group', () => {
  const regrouped = agnes(data.features1).group(8);
  expect(regrouped.children).toHaveLength(8);
});

// `indices()` lists every input row exactly once, in this fixed order.
test('indices', () => {
  const leafOrder = agnes(data.features1).indices();
  expect(leafOrder).toHaveLength(data.features1.length);
  expect(leafOrder).toStrictEqual([6, 5, 9, 8, 7, 3, 1, 0, 4, 2]);
});

// Leaves carry a non-negative `index`; every other node keeps the -1 default.
test('traverse, isLeaf and index', () => {
  const root = agnes(data.features1);
  let leafCount = 0;
  let innerCount = 0;
  root.traverse((node) => {
    if (!node.isLeaf) {
      innerCount++;
      expect(node.index).toBe(-1);
      return;
    }
    leafCount++;
    expect(node.index).toBeGreaterThan(-1);
  });
  expect(innerCount).toBe(9);
  expect(leafCount).toBe(10);
});
Loading

0 comments on commit 1517124

Please sign in to comment.