From 48808daa254499641d0c01d4815b8cfdadc2d0af Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Wed, 22 Jan 2014 22:08:53 +0000 Subject: [PATCH 01/22] [maven-release-plugin] prepare for next development iteration git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1560538 13f79535-47bb-0310-9956-ffa450edef68 --- buildtools/pom.xml | 8 +------- core/pom.xml | 2 +- distribution/pom.xml | 2 +- examples/pom.xml | 2 +- integration/pom.xml | 2 +- math-scala/pom.xml | 2 +- math/pom.xml | 2 +- pom.xml | 8 ++++---- 8 files changed, 11 insertions(+), 17 deletions(-) diff --git a/buildtools/pom.xml b/buildtools/pom.xml index bedb92ca3a..1636879460 100644 --- a/buildtools/pom.xml +++ b/buildtools/pom.xml @@ -29,7 +29,7 @@ org.apache.mahout mahout-buildtools - 0.9 + 1.0-SNAPSHOT Mahout Build Tools jar @@ -121,10 +121,4 @@ - - - scm:svn:http://svn.apache.org/repos/asf/maven/pom/tags/mahout-0.9/mahout-buildtools - scm:svn:https://svn.apache.org/repos/asf/maven/pom/tags/mahout-0.9/mahout-buildtools - http://svn.apache.org/viewvc/maven/pom/tags/mahout-0.9/mahout-buildtools - diff --git a/core/pom.xml b/core/pom.xml index 98eefa9a53..13d4aa8d93 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml diff --git a/distribution/pom.xml b/distribution/pom.xml index 2608f031e3..4caa1093bb 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -20,7 +20,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml mahout-distribution diff --git a/examples/pom.xml b/examples/pom.xml index f9386227ed..48e1c491a2 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml diff --git a/integration/pom.xml b/integration/pom.xml index 61e9804b18..1d4588ec94 100644 --- a/integration/pom.xml +++ b/integration/pom.xml @@ -24,7 +24,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml diff --git a/math-scala/pom.xml b/math-scala/pom.xml index 
3f0f368aa6..ef36dac090 100644 --- a/math-scala/pom.xml +++ b/math-scala/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml diff --git a/math/pom.xml b/math/pom.xml index b7cad5ddd7..274194cccc 100644 --- a/math/pom.xml +++ b/math/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index c777ff20c1..c4d2c80fba 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ 4.0.0 org.apache.mahout mahout - 0.9 + 1.0-SNAPSHOT org.apache @@ -988,9 +988,9 @@ - scm:svn:https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9 - scm:svn:https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9 - https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9 + scm:svn:https://svn.apache.org/repos/asf/mahout/trunk + scm:svn:https://svn.apache.org/repos/asf/mahout/trunk + https://svn.apache.org/repos/asf/mahout From 149007d7e4a81579a08f88367ea64fc61f080d64 Mon Sep 17 00:00:00 2001 From: Ted Dunning Date: Fri, 24 Jan 2014 02:27:54 +0000 Subject: [PATCH 02/22] MAHOUT-1390 - Fixed extraneous commit. 
git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1560889 13f79535-47bb-0310-9956-ffa450edef68 --- .../math/TestSingularValueDecomposition.java | 34 ++++--------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java b/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java index 045850a239..eaeb444eed 100644 --- a/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java +++ b/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java @@ -244,35 +244,13 @@ public void testConditionNumber() { assertEquals(3.0, svd.cond(), 1.5e-15); } - @Test + @Test(timeout=1000) public void testSvdHang() throws IOException, InterruptedException, ExecutionException, TimeoutException { - for (String s : new String[]{"hanging-svd", }) { - System.out.printf("starting %s\n", s); - final Matrix m = readTsv(s + ".tsv"); - try { - SingularValueDecomposition svd = timeout(2000, new Callable() { - @Override - public SingularValueDecomposition call() throws Exception { - return new SingularValueDecomposition(m); - } - }); - assertEquals(0, m.minus(svd.getU().times(svd.getS()).times(svd.getV().transpose())).aggregate(Functions.PLUS, Functions.ABS), 1e-10); - System.out.printf("%s worked\n", s); - } catch (ExecutionException e) { - System.out.printf("Failed during %s\n", s); - throw e; - } catch (TimeoutException e) { - System.out.printf("%s timed out\n", s); - throw e; - } - } - } - - T timeout(int timeLimit, Callable toDo) throws InterruptedException, ExecutionException, TimeoutException { - ExecutorService pool = Executors.newFixedThreadPool(1); - Future f = pool.submit(toDo); - pool.shutdown(); - return f.get(timeLimit, TimeUnit.MILLISECONDS); + System.out.printf("starting hanging-svd\n"); + final Matrix m = readTsv("hanging-svd.tsv"); + SingularValueDecomposition svd = new SingularValueDecomposition(m); + assertEquals(0, 
m.minus(svd.getU().times(svd.getS()).times(svd.getV().transpose())).aggregate(Functions.PLUS, Functions.ABS), 1e-10); + System.out.printf("No hang\n"); } Matrix readTsv(String name) throws IOException { From c4bc2174302894d8e3ad2e2900fbe3abcde42a67 Mon Sep 17 00:00:00 2001 From: Ted Dunning Date: Fri, 24 Jan 2014 02:27:57 +0000 Subject: [PATCH 03/22] MAHOUT-1409 - bad index checking in viewColumn or viewRow. git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1560890 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/mahout/math/MatrixVectorView.java | 4 ++-- .../mahout/math/MatrixVectorViewTest.java | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java index da088d248e..2aa1c09cbf 100644 --- a/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java +++ b/math/src/main/java/org/apache/mahout/math/MatrixVectorView.java @@ -38,10 +38,10 @@ public MatrixVectorView(Matrix matrix, int row, int column, int rowStride, int c public MatrixVectorView(Matrix matrix, int row, int column, int rowStride, int columnStride) { super(viewSize(matrix, row, column, rowStride, columnStride)); - if (row < 0 || row > matrix.rowSize()) { + if (row < 0 || row >= matrix.rowSize()) { throw new IndexException(row, matrix.rowSize()); } - if (column < 0 || column > matrix.columnSize()) { + if (column < 0 || column >= matrix.columnSize()) { throw new IndexException(column, matrix.columnSize()); } diff --git a/math/src/test/java/org/apache/mahout/math/MatrixVectorViewTest.java b/math/src/test/java/org/apache/mahout/math/MatrixVectorViewTest.java index 6e9a592c1c..400df8d975 100644 --- a/math/src/test/java/org/apache/mahout/math/MatrixVectorViewTest.java +++ b/math/src/test/java/org/apache/mahout/math/MatrixVectorViewTest.java @@ -17,6 +17,7 @@ package org.apache.mahout.math; +import 
org.apache.mahout.math.function.Functions; import org.junit.Test; public class MatrixVectorViewTest extends MahoutTestCase { @@ -34,4 +35,24 @@ public void testColumnView() { assertEquals(matrix.numRows(), outerProduct.numRows()); assertEquals(matrix.numRows(), outerProduct.numCols()); } + + /** + * Test for out of range column or row access. + */ + @Test + public void testIndexRange() { + Matrix m = new DenseMatrix(20, 30).assign(Functions.random()); + try { + m.viewColumn(30); + fail("Should have thrown exception"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().startsWith("Index 30 is outside allowable")); + } + try { + m.viewRow(20); + fail("Should have thrown exception"); + } catch (IllegalArgumentException e) { + assertTrue(e.getMessage().startsWith("Index 20 is outside allowable")); + } + } } From 084ff3f1890dd38c45d9b731f38d04f566f3742c Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Sun, 26 Jan 2014 03:22:57 +0000 Subject: [PATCH 04/22] Reverting back to 0.9-SNAPSHOT git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561437 13f79535-47bb-0310-9956-ffa450edef68 --- buildtools/pom.xml | 2 +- core/pom.xml | 2 +- distribution/pom.xml | 2 +- examples/pom.xml | 2 +- integration/pom.xml | 2 +- math-scala/pom.xml | 2 +- math/pom.xml | 2 +- pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/buildtools/pom.xml b/buildtools/pom.xml index 1636879460..66390e16ea 100644 --- a/buildtools/pom.xml +++ b/buildtools/pom.xml @@ -29,7 +29,7 @@ org.apache.mahout mahout-buildtools - 1.0-SNAPSHOT + 0.9-SNAPSHOT Mahout Build Tools jar diff --git a/core/pom.xml b/core/pom.xml index 13d4aa8d93..9ed602a421 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml diff --git a/distribution/pom.xml b/distribution/pom.xml index 4caa1093bb..4e9363ad76 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -20,7 +20,7 @@ org.apache.mahout mahout - 
1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml mahout-distribution diff --git a/examples/pom.xml b/examples/pom.xml index 48e1c491a2..fa32a361d4 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml diff --git a/integration/pom.xml b/integration/pom.xml index 1d4588ec94..051f1e0183 100644 --- a/integration/pom.xml +++ b/integration/pom.xml @@ -24,7 +24,7 @@ org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml diff --git a/math-scala/pom.xml b/math-scala/pom.xml index ef36dac090..1060296c3a 100644 --- a/math-scala/pom.xml +++ b/math-scala/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml diff --git a/math/pom.xml b/math/pom.xml index 274194cccc..41f854d32e 100644 --- a/math/pom.xml +++ b/math/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index c4d2c80fba..c47e83ef24 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ 4.0.0 org.apache.mahout mahout - 1.0-SNAPSHOT + 0.9-SNAPSHOT org.apache From 74e460035c91aaeebcbf342ee6540fd9d8673670 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Sun, 26 Jan 2014 03:29:42 +0000 Subject: [PATCH 05/22] MAHOUT-1409: Adding CHANGELOG entry git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561439 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 7db3310794..52c7a07209 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.9 - unreleased + MAHOUT-1409: MatrixVectorView has index check error (tdunning) + MAHOUT-1402: Zero clusters using streaming k-means option in cluster-reuters.sh (smarthi) MAHOUT-1401: Resurrect Frequent Pattern mining (smarthi) From db08f41821603b6ff2d51fd7a36ec0340aceb9f9 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Sun, 26 Jan 2014 03:50:55 +0000 Subject: [PATCH 06/22] MAHOUT-1410: 
clusteredPoints do not contain a vector id git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561440 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGELOG | 2 + .../classify/ClusterClassificationDriver.java | 38 ++++++++++++++----- .../classify/ClusterClassificationMapper.java | 19 +++++++--- .../ClusterClassificationDriverTest.java | 13 +++++-- 4 files changed, 55 insertions(+), 17 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 52c7a07209..6b334ac446 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.9 - unreleased + MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman) + MAHOUT-1409: MatrixVectorView has index check error (tdunning) MAHOUT-1402: Zero clusters using streaming k-means option in cluster-reuters.sh (smarthi) diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java index 563abd52eb..0a634eeaf7 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java @@ -42,11 +42,13 @@ import org.apache.mahout.clustering.iterator.ClusterWritable; import org.apache.mahout.clustering.iterator.ClusteringPolicy; import org.apache.mahout.common.AbstractJob; +import org.apache.mahout.common.Pair; import org.apache.mahout.common.commandline.DefaultOptionCreator; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; -import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; +import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.Vector; import 
org.apache.mahout.math.Vector.Element; import org.apache.mahout.math.VectorWritable; @@ -186,7 +188,11 @@ private static Path finalClustersPath(Configuration conf, Path clusterOutputPath * @param output * the path to store classified data * @param clusterClassificationThreshold + * the threshold value of probability distribution function from 0.0 + * to 1.0. Any vector with pdf less that this threshold will not be + * classified for the cluster * @param emitMostLikely + * emit the vectors with the max pdf values per cluster * @throws IOException */ private static void selectCluster(Path input, List clusterModels, ClusterClassifier clusterClassifier, @@ -194,11 +200,20 @@ private static void selectCluster(Path input, List clusterModels, Clust Configuration conf = new Configuration(); SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output, "part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class); - for (VectorWritable vw : new SequenceFileDirValueIterable(input, PathType.LIST, + for (Pair vw : new SequenceFileDirIterable(input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { - Vector pdfPerCluster = clusterClassifier.classify(vw.get()); + Writable key = vw.getFirst(); + Vector vector = vw.getSecond().get(); + if (!(vector instanceof NamedVector)) { + if (key instanceof Text) { + vector = new NamedVector(vector, key.toString()); + } else if (key instanceof IntWritable) { + vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); + } + } + Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) { - classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, vw, pdfPerCluster); + classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster); } } writer.close(); @@ -209,8 +224,9 @@ private static void 
classifyAndWrite(List clusterModels, Double cluster Map props = Maps.newHashMap(); if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); - WeightedPropertyVectorWritable wpvw = new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props); - write(clusterModels, writer, wpvw, maxValueIndex); + WeightedPropertyVectorWritable weightedPropertyVectorWritable = + new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props); + write(clusterModels, writer, weightedPropertyVectorWritable, maxValueIndex); } else { writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster); } @@ -218,19 +234,23 @@ private static void classifyAndWrite(List clusterModels, Double cluster private static void writeAllAboveThreshold(List clusterModels, Double clusterClassificationThreshold, SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException { + Map props = Maps.newHashMap(); for (Element pdf : pdfPerCluster.nonZeroes()) { if (pdf.get() >= clusterClassificationThreshold) { - WeightedVectorWritable wvw = new WeightedVectorWritable(pdf.get(), vw.get()); + WeightedPropertyVectorWritable wvw = new WeightedPropertyVectorWritable(pdf.get(), vw.get(), props); int clusterIndex = pdf.index(); write(clusterModels, writer, wvw, clusterIndex); } } } - private static void write(List clusterModels, SequenceFile.Writer writer, WeightedVectorWritable wvw, + private static void write(List clusterModels, SequenceFile.Writer writer, + WeightedPropertyVectorWritable weightedPropertyVectorWritable, int maxValueIndex) throws IOException { Cluster cluster = clusterModels.get(maxValueIndex); - writer.append(new IntWritable(cluster.getId()), wvw); + double d = Math.sqrt(cluster.getCenter().getDistanceSquared(weightedPropertyVectorWritable.getVector())); + weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(d))); + writer.append(new 
IntWritable(cluster.getId()), weightedPropertyVectorWritable); } /** diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java index fea0f16cc3..efa7cad834 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java @@ -39,6 +39,7 @@ import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; +import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.Vector.Element; import org.apache.mahout.math.VectorWritable; @@ -83,13 +84,21 @@ protected void setup(Context context) throws IOException, InterruptedException { protected void map(WritableComparable key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { - Vector pdfPerCluster = clusterClassifier.classify(vw.get()); + Vector vector = vw.get(); + if (!(vector instanceof NamedVector)) { + if (key instanceof Text) { + vector = new NamedVector(vector, key.toString()); + } else if (key instanceof IntWritable) { + vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); + } + } + Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster)) { if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); - write(vw, context, maxValueIndex, 1.0); + write(new VectorWritable(vector), context, maxValueIndex, 1.0); } else { - writeAllAboveThreshold(vw, context, pdfPerCluster); + writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster); } } } @@ -109,9 +118,9 @@ private void write(VectorWritable vw, Context 
context, int clusterIndex, double throws IOException, InterruptedException { Cluster cluster = clusterModels.get(clusterIndex); clusterId.set(cluster.getId()); - double d = cluster.getCenter().getDistanceSquared(vw.get()); + double d = Math.sqrt(cluster.getCenter().getDistanceSquared(vw.get())); Map props = Maps.newHashMap(); - props.put(new Text("distance-squared"), new Text(Double.toString(d))); + props.put(new Text("distance"), new Text(Double.toString(d))); context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props)); } diff --git a/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java b/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java index 6192fc36f2..93df7d0b89 100644 --- a/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java +++ b/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java @@ -37,6 +37,7 @@ import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.common.distance.ManhattanDistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathFilters; +import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.VectorWritable; @@ -235,9 +236,15 @@ private void checkClustersWithOutlierRemoval() { } else { singletonCnt++; assertEquals("expecting only singleton clusters; got size=" + vList.size(), 1, vList.size()); - Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(), - reference.contains(vList.get(0).asFormatString())); - reference.remove(vList.get(0).asFormatString()); + if (vList.get(0) instanceof NamedVector) { + Assert.assertTrue("not expecting cluster:" + ((NamedVector) vList.get(0)).getDelegate().asFormatString(), + reference.contains(((NamedVector) vList.get(0)).getDelegate().asFormatString())); + 
reference.remove(((NamedVector)vList.get(0)).getDelegate().asFormatString()); + } else if (vList.get(0) instanceof RandomAccessSparseVector) { + Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(), + reference.contains(vList.get(0).asFormatString())); + reference.remove(vList.get(0).asFormatString()); + } } } Assert.assertEquals("Different number of empty clusters than expected!", 1, emptyCnt); From 0c873632622639ecf30b8d16fb01385480255754 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Sun, 26 Jan 2014 08:06:40 +0000 Subject: [PATCH 07/22] MAHOUT-1390: Removed redundant timeout for tstSvdHang() and removed unused imports git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561463 13f79535-47bb-0310-9956-ffa450edef68 --- .../mahout/math/TestSingularValueDecomposition.java | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java b/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java index eaeb444eed..c9e4026da9 100644 --- a/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java +++ b/math/src/test/java/org/apache/mahout/math/TestSingularValueDecomposition.java @@ -21,8 +21,6 @@ import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.io.Resources; -import org.apache.commons.math3.linear.Array2DRowRealMatrix; -import org.apache.commons.math3.linear.RealMatrix; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.function.Functions; import org.junit.Test; @@ -30,16 +28,9 @@ import java.io.IOException; import java.util.List; import java.util.Random; -import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; import 
java.util.concurrent.TimeoutException; -import static org.junit.Assert.assertEquals; - //To launch this test only : mvn test -Dtest=org.apache.mahout.math.TestSingularValueDecomposition public final class TestSingularValueDecomposition extends MahoutTestCase { @@ -244,7 +235,7 @@ public void testConditionNumber() { assertEquals(3.0, svd.cond(), 1.5e-15); } - @Test(timeout=1000) + @Test public void testSvdHang() throws IOException, InterruptedException, ExecutionException, TimeoutException { System.out.printf("starting hanging-svd\n"); final Matrix m = readTsv("hanging-svd.tsv"); From 239e108e512e7c946ec33c95dcd7dadc19ec81eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stevo=20Slavi=C4=87?= Date: Sun, 26 Jan 2014 23:43:43 +0000 Subject: [PATCH 08/22] Removed unnecessary override of mahout-math dependency version, it's managed in parent pom git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561573 13f79535-47bb-0310-9956-ffa450edef68 --- math-scala/pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/math-scala/pom.xml b/math-scala/pom.xml index 1060296c3a..bb0342b231 100644 --- a/math-scala/pom.xml +++ b/math-scala/pom.xml @@ -164,7 +164,6 @@ org.apache.mahout mahout-math - ${pom.version} From 9d43135c0bbee0759d3a1d45245d39cdbf2e1bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stevo=20Slavi=C4=87?= Date: Mon, 27 Jan 2014 00:45:18 +0000 Subject: [PATCH 09/22] Fixed typos git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561574 13f79535-47bb-0310-9956-ffa450edef68 --- bin/mahout | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/mahout b/bin/mahout index 37c7eea264..fddd27ef97 100755 --- a/bin/mahout +++ b/bin/mahout @@ -24,13 +24,13 @@ # MAHOUT_CORE set to anything other than an empty string to force # mahout to run in developer 'core' mode, just as if the # -core option was presented on the command-line -# Commane-line Options +# Command-line Options # # -core -core is used to switch into 'developer mode' when # running mahout 
locally. If specified, the classes # from the 'target/classes' directories in each project -# are used. Otherwise classes will be retrived from -# jars in the binary releas collection or *-job.jar files +# are used. Otherwise classes will be retrieved from +# jars in the binary release collection or *-job.jar files # found in build directories. When running on hadoop # the job files will always be used. From 278a0398308f2e6409802dcceaa85925fc367184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stevo=20Slavi=C4=87?= Date: Mon, 27 Jan 2014 01:10:15 +0000 Subject: [PATCH 10/22] MAHOUT-1399: Fixed multiple slf4j bindings when running Mahout examples issue git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561578 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGELOG | 2 ++ core/src/main/assembly/job.xml | 1 + examples/pom.xml | 20 +++++++++++++++++--- pom.xml | 28 ++++++++++++++++++++++++++-- 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 6b334ac446..9d6725a17a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.9 - unreleased + MAHOUT-1399: Fixed multiple slf4j bindings when running Mahout examples issue (sslavic) + MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman) MAHOUT-1409: MatrixVectorView has index check error (tdunning) diff --git a/core/src/main/assembly/job.xml b/core/src/main/assembly/job.xml index c7b6256006..2bdb3ce3d1 100644 --- a/core/src/main/assembly/job.xml +++ b/core/src/main/assembly/job.xml @@ -36,6 +36,7 @@ runtime / + true org.apache.hadoop:hadoop-core diff --git a/examples/pom.xml b/examples/pom.xml index fa32a361d4..03f31d412d 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -158,12 +158,26 @@ org.slf4j slf4j-api + provided + + + org.slf4j + slf4j-log4j12 + provided - org.slf4j - slf4j-jcl - runtime + jcl-over-slf4j + + + commons-logging + commons-logging + provided + + + log4j + log4j + provided diff --git a/pom.xml b/pom.xml 
index c47e83ef24..6266432b7e 100644 --- a/pom.xml +++ b/pom.xml @@ -106,6 +106,7 @@ 2.9.1 1.2.1 4.6.0 + 1.7.5 Jira @@ -395,14 +396,37 @@ org.slf4j slf4j-api - 1.7.5 + ${slf4j.version} org.slf4j slf4j-jcl - 1.7.5 + ${slf4j.version} test + + org.slf4j + jcl-over-slf4j + ${slf4j.version} + test + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + + commons-logging + commons-logging + 1.1.3 + + + + log4j + log4j + 1.2.17 + org.apache.commons From fdaafcb2eac30fff8fcd06a4d14dfca50a48cb9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stevo=20Slavi=C4=87?= Date: Mon, 27 Jan 2014 01:43:14 +0000 Subject: [PATCH 11/22] MAHOUT-1399: Placed changelog entry on appropriate spot, sorted by Jira issue number git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561579 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9d6725a17a..d1f4055ed6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,8 +2,6 @@ Mahout Change Log Release 0.9 - unreleased - MAHOUT-1399: Fixed multiple slf4j bindings when running Mahout examples issue (sslavic) - MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman) MAHOUT-1409: MatrixVectorView has index check error (tdunning) @@ -14,6 +12,8 @@ Release 0.9 - unreleased MAHOUT-1400: Remove references to deprecated and removed algorithms from examples scripts (ssc) + MAHOUT-1399: Fixed multiple slf4j bindings when running Mahout examples issue (sslavic) + MAHOUT-1398: FileDataModel should provide a constructor with a delimiterPattern (Roy Guo via ssc) MAHOUT-1396: Accidental use of commons-math won't work with next Hadoop 2 release (srowen) From a4f264af4396cbc4d8ea8f45a47fcda853702312 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Mon, 27 Jan 2014 05:25:19 +0000 Subject: [PATCH 12/22] MAHOUT-1410: Added Code comments. 
git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561586 13f79535-47bb-0310-9956-ffa450edef68 --- .../mahout/clustering/classify/ClusterClassificationDriver.java | 2 ++ .../mahout/clustering/classify/ClusterClassificationMapper.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java index 0a634eeaf7..7728a8e075 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java @@ -202,6 +202,8 @@ private static void selectCluster(Path input, List clusterModels, Clust "part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class); for (Pair vw : new SequenceFileDirIterable(input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) { + // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point + // belongs to which cluster - fix for MAHOUT-1410 Writable key = vw.getFirst(); Vector vector = vw.getSecond().get(); if (!(vector instanceof NamedVector)) { diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java index efa7cad834..df03ac10f8 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java @@ -84,6 +84,8 @@ protected void setup(Context context) throws IOException, InterruptedException { protected void map(WritableComparable key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { + // Converting to NamedVectors to preserve the vectorId else its not obvious as to which 
point + // belongs to which cluster - fix for MAHOUT-1410 Vector vector = vw.get(); if (!(vector instanceof NamedVector)) { if (key instanceof Text) { From 40d378897cb38fdadacf7803096f2a0688930c19 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Mon, 27 Jan 2014 08:29:58 +0000 Subject: [PATCH 13/22] MAHOUT-1382: Upgrading Guava to 16.0 git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561604 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6266432b7e..421fb9253a 100644 --- a/pom.xml +++ b/pom.xml @@ -449,7 +449,7 @@ com.google.guava guava - 15.0 + 16.0 From 92dbb0f733740747e55f04241ca6041bc7855b3f Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 05:55:25 +0000 Subject: [PATCH 14/22] MAHOUT-1410: Committing updated patch. git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561948 13f79535-47bb-0310-9956-ffa450edef68 --- .../classify/ClusterClassificationDriver.java | 12 ++++++------ .../classify/ClusterClassificationMapper.java | 7 ++++--- .../classify/ClusterClassificationDriverTest.java | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java index 7728a8e075..5515e5a6ee 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java @@ -204,13 +204,13 @@ private static void selectCluster(Path input, List clusterModels, Clust PathFilters.logsCRCFilter(), conf)) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 - Writable key = vw.getFirst(); + Class keyClass = vw.getFirst().getClass(); Vector vector = 
vw.getSecond().get(); - if (!(vector instanceof NamedVector)) { - if (key instanceof Text) { - vector = new NamedVector(vector, key.toString()); - } else if (key instanceof IntWritable) { - vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); + if (!keyClass.equals(NamedVector.class)) { + if (keyClass.equals(Text.class)) { + vector = new NamedVector(vector, vw.getFirst().toString()); + } else if (keyClass.equals(IntWritable.class)) { + vector = new NamedVector(vector, Integer.toString(((IntWritable) vw.getFirst()).get())); } } Vector pdfPerCluster = clusterClassifier.classify(vector); diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java index df03ac10f8..0b1bcff6d4 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java @@ -86,11 +86,12 @@ protected void map(WritableComparable key, VectorWritable vw, Context context if (!clusterModels.isEmpty()) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 + Class vectorClass = vw.get().getClass(); Vector vector = vw.get(); - if (!(vector instanceof NamedVector)) { - if (key instanceof Text) { + if (!vectorClass.equals(NamedVector.class)) { + if (key.getClass().equals(Text.class)) { vector = new NamedVector(vector, key.toString()); - } else if (key instanceof IntWritable) { + } else if (key.getClass().equals(IntWritable.class)) { vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); } } diff --git a/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java b/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java index 
93df7d0b89..cbf0e55c3c 100644 --- a/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java +++ b/core/src/test/java/org/apache/mahout/clustering/classify/ClusterClassificationDriverTest.java @@ -236,11 +236,11 @@ private void checkClustersWithOutlierRemoval() { } else { singletonCnt++; assertEquals("expecting only singleton clusters; got size=" + vList.size(), 1, vList.size()); - if (vList.get(0) instanceof NamedVector) { + if (vList.get(0).getClass().equals(NamedVector.class)) { Assert.assertTrue("not expecting cluster:" + ((NamedVector) vList.get(0)).getDelegate().asFormatString(), reference.contains(((NamedVector) vList.get(0)).getDelegate().asFormatString())); reference.remove(((NamedVector)vList.get(0)).getDelegate().asFormatString()); - } else if (vList.get(0) instanceof RandomAccessSparseVector) { + } else if (vList.get(0).getClass().equals(RandomAccessSparseVector.class)) { Assert.assertTrue("not expecting cluster:" + vList.get(0).asFormatString(), reference.contains(vList.get(0).asFormatString())); reference.remove(vList.get(0).asFormatString()); From 1622bc190bbf911fdb1a8187aa57053a279e5798 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 06:55:14 +0000 Subject: [PATCH 15/22] MAHOUT-1310: Fixed to use the CLI provided DistanceMeasure for distance calculation of Point to Cluster Centroid. 
git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561955 13f79535-47bb-0310-9956-ffa450edef68 --- .../classify/ClusterClassificationDriver.java | 10 ++++++++-- .../classify/ClusterClassificationMapper.java | 10 ++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java index 5515e5a6ee..6e2c3cff8a 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java @@ -41,9 +41,11 @@ import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.iterator.ClusterWritable; import org.apache.mahout.clustering.iterator.ClusteringPolicy; +import org.apache.mahout.clustering.iterator.DistanceMeasureCluster; import org.apache.mahout.common.AbstractJob; import org.apache.mahout.common.Pair; import org.apache.mahout.common.commandline.DefaultOptionCreator; +import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; @@ -250,8 +252,12 @@ private static void write(List clusterModels, SequenceFile.Writer write WeightedPropertyVectorWritable weightedPropertyVectorWritable, int maxValueIndex) throws IOException { Cluster cluster = clusterModels.get(maxValueIndex); - double d = Math.sqrt(cluster.getCenter().getDistanceSquared(weightedPropertyVectorWritable.getVector())); - weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(d))); + + DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster; + DistanceMeasure distanceMeasure = 
distanceMeasureCluster.getMeasure(); + double distance = distanceMeasure.distance(cluster.getCenter(), weightedPropertyVectorWritable.getVector()); + + weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(distance))); writer.append(new IntWritable(cluster.getId()), weightedPropertyVectorWritable); } diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java index 0b1bcff6d4..9edbd8e15e 100644 --- a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java +++ b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java @@ -36,6 +36,8 @@ import org.apache.mahout.clustering.Cluster; import org.apache.mahout.clustering.iterator.ClusterWritable; import org.apache.mahout.clustering.iterator.ClusteringPolicy; +import org.apache.mahout.clustering.iterator.DistanceMeasureCluster; +import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; @@ -121,9 +123,13 @@ private void write(VectorWritable vw, Context context, int clusterIndex, double throws IOException, InterruptedException { Cluster cluster = clusterModels.get(clusterIndex); clusterId.set(cluster.getId()); - double d = Math.sqrt(cluster.getCenter().getDistanceSquared(vw.get())); + + DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster; + DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure(); + double distance = distanceMeasure.distance(cluster.getCenter(), vw.get()); + Map props = Maps.newHashMap(); - props.put(new Text("distance"), new Text(Double.toString(d))); + props.put(new Text("distance"), new 
Text(Double.toString(distance))); context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props)); } From 027a6085210b62a6043380ea530c43398242043c Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 08:07:34 +0000 Subject: [PATCH 16/22] MAHOUT-1310: Changed method signatures to remove unused DistanceMeasure parameter. git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561975 13f79535-47bb-0310-9956-ffa450edef68 --- .../fuzzykmeans/FuzzyKMeansDriver.java | 69 ++++++++----------- .../clustering/kmeans/KMeansDriver.java | 54 +++++++-------- .../spectral/kmeans/SpectralKMeansDriver.java | 3 +- .../kmeans/TestKmeansClustering.java | 4 +- .../postprocessor/ClusterCountReaderTest.java | 2 +- .../display/DisplayFuzzyKMeans.java | 2 +- .../clustering/display/DisplayKMeans.java | 2 +- .../syntheticcontrol/fuzzykmeans/Job.java | 8 +-- .../syntheticcontrol/kmeans/Job.java | 7 +- .../mahout/clustering/TestClusterDumper.java | 6 +- .../clustering/TestClusterEvaluator.java | 4 +- .../clustering/cdbw/TestCDbwEvaluator.java | 5 +- 12 files changed, 72 insertions(+), 94 deletions(-) diff --git a/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java index 4a75c8827a..6121189d1a 100644 --- a/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java @@ -110,7 +110,6 @@ public int run(String[] args) throws Exception { input, clusters, output, - measure, convergenceDelta, maxIterations, fuzziness, @@ -124,32 +123,31 @@ public int run(String[] args) throws Exception { /** * Iterate over the input vectors to produce clusters and, if requested, use the * results of the final iteration to cluster the input vectors. 
- * + * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output - * the directory pathname for output points + * the directory pathname for output points * @param convergenceDelta - * the convergence delta value +* the convergence delta value * @param maxIterations - * the maximum number of iterations +* the maximum number of iterations * @param m - * the fuzzification factor, see - * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering - * @param runClustering - * true if points are to be clustered after iterations complete +* the fuzzification factor, see +* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering + * @param runClustering +* true if points are to be clustered after iterations complete * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) +* a boolean if true emit only most likely cluster for each point + * @param threshold +* a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential if true run in sequential execution mode */ public static void run(Path input, Path clustersIn, Path output, - DistanceMeasure measure, double convergenceDelta, int maxIterations, float m, @@ -162,7 +160,6 @@ public static void run(Path input, input, clustersIn, output, - measure, convergenceDelta, maxIterations, m, @@ -172,7 +169,6 @@ public static void run(Path input, clusterData(conf, input, clustersOut, output, - measure, convergenceDelta, m, emitMostLikely, @@ -189,27 +185,26 @@ public static void run(Path input, * @param clustersIn * the directory pathname for initial & computed clusters * @param output - * the directory pathname for output points + * the directory pathname for output points * @param convergenceDelta - * the convergence 
delta value +* the convergence delta value * @param maxIterations - * the maximum number of iterations +* the maximum number of iterations * @param m - * the fuzzification factor, see - * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering - * @param runClustering - * true if points are to be clustered after iterations complete +* the fuzzification factor, see +* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering + * @param runClustering +* true if points are to be clustered after iterations complete * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point - * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) +* a boolean if true emit only most likely cluster for each point + * @param threshold +* a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential if true run in sequential execution mode */ public static void run(Configuration conf, Path input, Path clustersIn, Path output, - DistanceMeasure measure, double convergenceDelta, int maxIterations, float m, @@ -219,14 +214,13 @@ public static void run(Configuration conf, boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException { Path clustersOut = - buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential); + buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential); if (runClustering) { log.info("Clustering"); clusterData(conf, input, clustersOut, output, - measure, convergenceDelta, m, emitMostLikely, @@ -237,14 +231,13 @@ public static void run(Configuration conf, /** * Iterate over the input vectors to produce cluster directories for each iteration + * * @param input * the directory pathname for input points * @param clustersIn * the file pathname for initial cluster centers * @param output * the directory 
pathname for output points - * @param measure - * the classname of the DistanceMeasure * @param convergenceDelta * the convergence delta value * @param maxIterations @@ -253,14 +246,13 @@ public static void run(Configuration conf, * the fuzzification factor, see * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering * @param runSequential if true run in sequential execution mode - * + * * @return the Path of the final clusters directory */ public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output, - DistanceMeasure measure, double convergenceDelta, int maxIterations, float m, @@ -293,28 +285,25 @@ public static Path buildClusters(Configuration conf, /** * Run the job using supplied arguments - * + * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for input clusters * @param output - * the directory pathname for output points - * @param measure - * the classname of the DistanceMeasure + * the directory pathname for output points * @param convergenceDelta - * the convergence delta value +* the convergence delta value * @param emitMostLikely - * a boolean if true emit only most likely cluster for each point +* a boolean if true emit only most likely cluster for each point * @param threshold - * a double threshold value emits all clusters having greater pdf (emitMostLikely = false) +* a double threshold value emits all clusters having greater pdf (emitMostLikely = false) * @param runSequential if true run in sequential execution mode */ public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, - DistanceMeasure measure, double convergenceDelta, float m, boolean emitMostLikely, diff --git a/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java index c3decb3883..c036f8e2b3 100644 --- 
a/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java +++ b/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java @@ -100,7 +100,7 @@ public int run(String[] args) throws Exception { if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); } - run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, + run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering, clusterClassificationThreshold, runSequential); return 0; } @@ -108,15 +108,13 @@ public int run(String[] args) throws Exception { /** * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to * cluster the input vectors. - * + * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points - * @param measure - * the DistanceMeasure to use * @param convergenceDelta * the convergence delta value * @param maxIterations @@ -129,36 +127,33 @@ public int run(String[] args) throws Exception { * @param runSequential * if true execute sequential algorithm */ - public static void run(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, - double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold, - boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { + public static void run(Configuration conf, Path input, Path clustersIn, Path output, + double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold, + boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { // iterate until the clusters converge String delta = 
Double.toString(convergenceDelta); if (log.isInfoEnabled()) { - log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output, - measure.getClass().getName()); + log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output); log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations); } - Path clustersOut = buildClusters(conf, input, clustersIn, output, measure, maxIterations, delta, runSequential); + Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential); if (runClustering) { log.info("Clustering data"); - clusterData(conf, input, clustersOut, output, measure, clusterClassificationThreshold, runSequential); + clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential); } } /** * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to * cluster the input vectors. - * + * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points - * @param measure - * the DistanceMeasure to use * @param convergenceDelta * the convergence delta value * @param maxIterations @@ -166,21 +161,22 @@ public static void run(Configuration conf, Path input, Path clustersIn, Path out * @param runClustering * true if points are to be clustered after iterations are completed * @param clusterClassificationThreshold - * Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors + * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors * having pdf below this value will not be clustered. 
* @param runSequential * if true execute sequential algorithm */ - public static void run(Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta, - int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) + public static void run(Path input, Path clustersIn, Path output, double convergenceDelta, + int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException { - run(new Configuration(), input, clustersIn, output, measure, convergenceDelta, maxIterations, runClustering, + run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering, clusterClassificationThreshold, runSequential); } /** * Iterate over the input vectors to produce cluster directories for each iteration * + * * @param conf * the Configuration to use * @param input @@ -189,20 +185,18 @@ public static void run(Path input, Path clustersIn, Path output, DistanceMeasure * the directory pathname for initial & computed clusters * @param output * the directory pathname for output points - * @param measure - * the classname of the DistanceMeasure * @param maxIterations * the maximum number of iterations * @param delta * the convergence delta value * @param runSequential * if true execute sequential algorithm - * + * * @return the Path of the final clusters directory */ public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output, - DistanceMeasure measure, int maxIterations, String delta, boolean runSequential) throws IOException, - InterruptedException, ClassNotFoundException { + int maxIterations, String delta, boolean runSequential) throws IOException, + InterruptedException, ClassNotFoundException { double convergenceDelta = Double.parseDouble(delta); List clusters = Lists.newArrayList(); @@ -227,28 +221,26 @@ public static Path 
buildClusters(Configuration conf, Path input, Path clustersIn /** * Run the job using supplied arguments - * + * * @param input * the directory pathname for input points * @param clustersIn * the directory pathname for input clusters * @param output * the directory pathname for output points - * @param measure - * the classname of the DistanceMeasure * @param clusterClassificationThreshold - * Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors + * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors * having pdf below this value will not be clustered. * @param runSequential * if true execute sequential algorithm */ - public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure, - double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, - ClassNotFoundException { + public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, + double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, + ClassNotFoundException { if (log.isInfoEnabled()) { log.info("Running Clustering"); - log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output, measure); + log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output); } ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn); ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY), diff --git a/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java index 48e6de5af3..977d171454 100644 --- a/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java +++ 
b/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java @@ -152,6 +152,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims, * @param ssvd * Flag to indicate the eigensolver to use * @param numReducers + * Number of reducers * @param blockHeight * @param oversampling * @param poweriters @@ -244,7 +245,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims, // Run the KMeansDriver Path answer = new Path(output, "kmeans_out"); - KMeansDriver.run(conf, data, initialclusters, answer, measure, convergenceDelta, maxIterations, true, 0.0, false); + KMeansDriver.run(conf, data, initialclusters, answer, convergenceDelta, maxIterations, true, 0.0, false); // Restore name to id mapping and read through the cluster assignments Path mappingPath = new Path(new Path(conf.get("hadoop.tmp.dir")), "generic_input_mapping"); diff --git a/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java b/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java index 9ee0b2b5f4..194f052e94 100644 --- a/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java +++ b/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java @@ -349,8 +349,8 @@ public void testKMeansWithCanopyClusterInput() throws Exception { // now run the KMeans job Path kmeansOutput = new Path(outputPath, "kmeans"); - KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(), - 0.001, 10, true, 0.0, false); + KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, + 0.001, 10, true, 0.0, false); // now compare the expected clusters with actual Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints"); diff --git 
a/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java b/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java index 6f25d0f70f..0934ff71aa 100644 --- a/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java +++ b/core/src/test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java @@ -101,7 +101,7 @@ private void topLevelClustering(Path pointsPath, Configuration conf) throws IOEx CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true); Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX)); - KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true); + KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, 1, 1, true, 0.0, true); } private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) { diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java index 7738c8ca2d..f8ce7c7e38 100644 --- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java +++ b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java @@ -102,7 +102,7 @@ private static void runSequentialFuzzyKClusterer(Configuration conf, Path sample ClassNotFoundException, InterruptedException { Path clustersIn = new Path(output, "random-seeds"); RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure); - FuzzyKMeansDriver.run(samples, clustersIn, output, measure, threshold, maxIterations, m, true, true, threshold, + FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold, true); loadClustersWritable(output); diff --git 
a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java index ff86688ab4..336d69e295 100644 --- a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java +++ b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java @@ -93,7 +93,7 @@ private static void runSequentialKMeansClusterer(Configuration conf, Path sample throws IOException, InterruptedException, ClassNotFoundException { Path clustersIn = new Path(output, "random-seeds"); RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure); - KMeansDriver.run(samples, clustersIn, output, measure, convergenceDelta, maxIterations, true, 0.0, true); + KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true); loadClustersWritable(output); } diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java index 7f51b20935..43beb78895 100644 --- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java +++ b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java @@ -133,14 +133,12 @@ public static void run(Configuration conf, Path input, Path output, DistanceMeas InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); - CanopyDriver - .run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); + CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running FuzzyKMeans"); FuzzyKMeansDriver.run(directoryContainingConvertedInput, new 
Path(canopyOutput, "clusters-0-final"), output, - measure, convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false); + convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false); // run ClusterDumper - ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, - "clusteredPoints")); + ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); } } diff --git a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java index 51daf42685..70c41feadb 100644 --- a/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java +++ b/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java @@ -132,12 +132,11 @@ public static void run(Configuration conf, Path input, Path output, DistanceMeas Path clusters = new Path(output, "random-seeds"); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans with k = {}", k); - KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, + KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper Path outGlob = new Path(output, "clusters-*-final"); - Path clusteredPoints = new Path(output, - "clusteredPoints"); + Path clusteredPoints = new Path(output,"clusteredPoints"); log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints); ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints); clusterDumper.printClusters(null); @@ -179,7 +178,7 @@ public static void run(Configuration conf, Path input, Path output, DistanceMeas false); log.info("Running 
KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR - + "-final"), output, measure, convergenceDelta, maxIterations, true, 0.0, false); + + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java index df07ed0ffc..82482586ff 100644 --- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java +++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java @@ -201,7 +201,7 @@ public void testKmeans() throws Exception { // now run the KMeans job Path kMeansOutput = new Path(output, "kmeans"); KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output, - "clusters-0-final"), kMeansOutput, measure, 0.001, 10, true, 0.0, false); + "clusters-0-final"), kMeansOutput, 0.001, 10, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(kMeansOutput, "clusteredPoints")); @@ -219,7 +219,7 @@ public void testJsonClusterDumper() throws Exception { // now run the KMeans job Path kmeansOutput = new Path(output, "kmeans"); KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output, - "clusters-0-final"), kmeansOutput, measure, 0.001, 10, true, 0.0, false); + "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(kmeansOutput, "clusteredPoints")); @@ -238,7 +238,7 @@ public void testFuzzyKmeans() throws Exception { // now run the Fuzzy KMeans job Path kMeansOutput = new Path(output, "kmeans"); FuzzyKMeansDriver.run(conf, 
getTestTempDirPath("testdata"), new Path( - output, "clusters-0-final"), kMeansOutput, measure, 0.001, 10, 1.1f, true, + output, "clusters-0-final"), kMeansOutput, 0.001, 10, 1.1f, true, true, 0, true); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, diff --git a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java index 343f4f535e..d0a54cfc76 100644 --- a/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java +++ b/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java @@ -284,7 +284,7 @@ public void testKmeans() throws Exception { CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true); // now run the KMeans job Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure, 0.001, 10, true, 0.0, true); + KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true); int numIterations = 10; Path clustersIn = new Path(kmeansOutput, "clusters-2"); RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure, @@ -305,7 +305,7 @@ public void testFuzzyKmeans() throws Exception { CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, 0.0, true); Path fuzzyKMeansOutput = new Path(output, "fuzzyk"); // now run the KMeans job - FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, measure, 0.001, 10, 2, + FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2, true, true, 0, true); int numIterations = 10; Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4"); diff --git a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java 
b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java index 0ad0c4a5d3..78367cc08f 100644 --- a/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java +++ b/integration/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java @@ -124,7 +124,6 @@ private void initData(double dC, double dP, DistanceMeasure measure) { * double y-value of the sample mean * @param sd * double standard deviation of the samples - * @throws Exception */ private void generateSamples(int num, double mx, double my, double sd) { log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd); @@ -288,7 +287,7 @@ public void testKmeans() throws Exception { CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true); // now run the KMeans job Path kmeansOutput = new Path(output, "kmeans"); - KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, measure, 0.001, 10, true, 0.0, true); + KMeansDriver.run(testdata, new Path(output, "clusters-0-final"), kmeansOutput, 0.001, 10, true, 0.0, true); int numIterations = 10; Path clustersIn = new Path(kmeansOutput, "clusters-10-final"); RepresentativePointsDriver.run(conf, clustersIn, new Path(kmeansOutput, "clusteredPoints"), kmeansOutput, measure, @@ -310,7 +309,7 @@ public void testFuzzyKmeans() throws Exception { CanopyDriver.run(getConfiguration(), testdata, output, measure, 3.1, 2.1, false, 0.0, true); Path fuzzyKMeansOutput = new Path(output, "fuzzyk"); // now run the KMeans job - FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, measure, 0.001, 10, 2, + FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0-final"), fuzzyKMeansOutput, 0.001, 10, 2, true, true, 0, true); int numIterations = 10; Path clustersIn = new Path(fuzzyKMeansOutput, "clusters-4"); From d52036726a28f9d32dd8ecbd702ab8bb3a483514 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 18:40:37 
+0000 Subject: [PATCH 17/22] MAHOUT-1411: Random test failures from TDigestTest git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562146 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGELOG | 2 ++ .../test/java/org/apache/mahout/math/stats/TDigestTest.java | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d1f4055ed6..96680621cb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.9 - unreleased + MAHOUT-1411: Random test failures from TDigestTest (smarthi) + MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman) MAHOUT-1409: MatrixVectorView has index check error (tdunning) diff --git a/math/src/test/java/org/apache/mahout/math/stats/TDigestTest.java b/math/src/test/java/org/apache/mahout/math/stats/TDigestTest.java index b4573d9444..526c07d99a 100644 --- a/math/src/test/java/org/apache/mahout/math/stats/TDigestTest.java +++ b/math/src/test/java/org/apache/mahout/math/stats/TDigestTest.java @@ -366,11 +366,11 @@ private void runTest(AbstractContinousDistribution gen, double sizeGuide, double double q = qValues[i]; double estimate = dist.cdf(x); errorDump.printf("%s\t%s\t%.8g\t%.8f\t%.8f\n", tag, "cdf", x, q, estimate - q); - assertEquals(q, estimate, 0.005); + assertEquals(q, estimate, 0.006); estimate = cdf(dist.quantile(q), data); errorDump.printf("%s\t%s\t%.8g\t%.8f\t%.8f\n", tag, "quantile", x, q, estimate - q); - assertEquals(q, estimate, 0.005); + assertEquals(q, estimate, 0.006); } if (recordAllData) { From d099c48f73fd0a7180e1d238c35eaab0a11c5f13 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 22:46:31 +0000 Subject: [PATCH 18/22] MAHOUT-1364: Changing the Base class to 'BaseDirectory' per fix suggested in LUCENE-5204 git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562293 13f79535-47bb-0310-9956-ffa450edef68 --- .../text/ReadOnlyFileSystemDirectory.java | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 
deletions(-) diff --git a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java index 89b5d55598..e97e35bf03 100644 --- a/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java +++ b/integration/src/main/java/org/apache/mahout/text/ReadOnlyFileSystemDirectory.java @@ -24,7 +24,14 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.*; +import org.apache.lucene.store.BaseDirectory; +import org.apache.lucene.store.BufferedIndexInput; +import org.apache.lucene.store.BufferedIndexOutput; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.Lock; +import org.apache.lucene.store.LockFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,7 +46,7 @@ *

* // TODO: Rename to FileSystemReadOnlyDirectory */ -public class ReadOnlyFileSystemDirectory extends Directory { +public class ReadOnlyFileSystemDirectory extends BaseDirectory { private final FileSystem fs; private final Path directory; @@ -50,10 +57,10 @@ public class ReadOnlyFileSystemDirectory extends Directory { /** * Constructor * - * @param fs - * @param directory - * @param create - * @param conf + * @param fs - filesystem + * @param directory - directory path + * @param create - if true create the directory + * @param conf - MR Job Configuration * @throws IOException */ @@ -280,7 +287,8 @@ public long length() { } @Override - protected void finalize() throws IOException { + protected void finalize() throws Throwable { + super.finalize(); if (!isClone && isOpen) { close(); // close the file } @@ -335,7 +343,8 @@ public long length() throws IOException { } @Override - protected void finalize() throws IOException { + protected void finalize() throws Throwable { + super.finalize(); if (isOpen) { close(); // close the file } From bd875829906dead9932fb9d3c90743d4fb27100e Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 23:01:32 +0000 Subject: [PATCH 19/22] MAHOUT-1364: upgraded to Lucene 4.6.1 git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562302 13f79535-47bb-0310-9956-ffa450edef68 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 421fb9253a..729833b4e8 100644 --- a/pom.xml +++ b/pom.xml @@ -105,7 +105,7 @@ 2.5.2 2.9.1 1.2.1 - 4.6.0 + 4.6.1 1.7.5 From 5a861c7b14a2edc02d8c8be6b42da6df8c2ca1d8 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 23:46:18 +0000 Subject: [PATCH 20/22] MAHOUT-1364: Removed commented code (from Lucene 2.x) git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562315 13f79535-47bb-0310-9956-ffa450edef68 --- .../collocations/llr/CollocMapperTest.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git 
a/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java b/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java index 188ce23ac8..2b52788173 100644 --- a/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java +++ b/core/src/test/java/org/apache/mahout/vectorizer/collocations/llr/CollocMapperTest.java @@ -177,18 +177,4 @@ public void testCollectNgramsWithUnigrams() throws Exception { EasyMock.verify(context); } - - /** A lucene 2.9 standard analyzer with no stopwords. */ - /* public static class TestAnalyzer extends DefaultAnalyzer { - private final Analyzer a; - - public TestAnalyzer() { - a = new StandardAnalyzer(Version.LUCENE_40, Collections.emptySet()); - } - - @Override - public TokenStream tokenStream(String arg0, Reader arg1) { - return a.tokenStream(arg0, arg1); - } - }*/ } From b7b852fe4177b41b3eb6545e3a7fd21d2ba39c4b Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Tue, 28 Jan 2014 23:54:46 +0000 Subject: [PATCH 21/22] NoJira: Fixed the Javadoc warning on line 102. git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562317 13f79535-47bb-0310-9956-ffa450edef68 --- .../mahout/classifier/sgd/AdaptiveLogisticRegression.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java b/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java index d83321a354..8a5011acae 100644 --- a/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java +++ b/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java @@ -99,7 +99,7 @@ public AdaptiveLogisticRegression() {} * @param numFeatures The number of features used in creating the vectors (i.e.
the cardinality of the vector) * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use * - * @see {@link #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)} + * @see #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int) */ public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) { this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE); From fb8dfbf79bf99a13fe07366a74d227c0f59de4f1 Mon Sep 17 00:00:00 2001 From: Suneel Marthi Date: Wed, 29 Jan 2014 00:32:06 +0000 Subject: [PATCH 22/22] [maven-release-plugin] prepare release mahout-0.9 git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1562328 13f79535-47bb-0310-9956-ffa450edef68 --- buildtools/pom.xml | 8 +++++++- core/pom.xml | 2 +- distribution/pom.xml | 2 +- examples/pom.xml | 2 +- integration/pom.xml | 2 +- math-scala/pom.xml | 2 +- math/pom.xml | 2 +- pom.xml | 8 ++++---- 8 files changed, 17 insertions(+), 11 deletions(-) diff --git a/buildtools/pom.xml b/buildtools/pom.xml index 66390e16ea..bedb92ca3a 100644 --- a/buildtools/pom.xml +++ b/buildtools/pom.xml @@ -29,7 +29,7 @@ org.apache.mahout mahout-buildtools - 0.9-SNAPSHOT + 0.9 Mahout Build Tools jar @@ -121,4 +121,10 @@ + + + scm:svn:http://svn.apache.org/repos/asf/maven/pom/tags/mahout-0.9/mahout-buildtools + scm:svn:https://svn.apache.org/repos/asf/maven/pom/tags/mahout-0.9/mahout-buildtools + http://svn.apache.org/viewvc/maven/pom/tags/mahout-0.9/mahout-buildtools + diff --git a/core/pom.xml b/core/pom.xml index 9ed602a421..98eefa9a53 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml diff --git a/distribution/pom.xml b/distribution/pom.xml index 4e9363ad76..2608f031e3 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -20,7 +20,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml 
mahout-distribution diff --git a/examples/pom.xml b/examples/pom.xml index 03f31d412d..19ac1e9544 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml diff --git a/integration/pom.xml b/integration/pom.xml index 051f1e0183..61e9804b18 100644 --- a/integration/pom.xml +++ b/integration/pom.xml @@ -24,7 +24,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml diff --git a/math-scala/pom.xml b/math-scala/pom.xml index bb0342b231..f6c89e0c5c 100644 --- a/math-scala/pom.xml +++ b/math-scala/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml diff --git a/math/pom.xml b/math/pom.xml index 41f854d32e..b7cad5ddd7 100644 --- a/math/pom.xml +++ b/math/pom.xml @@ -23,7 +23,7 @@ org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 ../pom.xml diff --git a/pom.xml b/pom.xml index 729833b4e8..28bcb4228a 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ 4.0.0 org.apache.mahout mahout - 0.9-SNAPSHOT + 0.9 org.apache @@ -1012,9 +1012,9 @@ - scm:svn:https://svn.apache.org/repos/asf/mahout/trunk - scm:svn:https://svn.apache.org/repos/asf/mahout/trunk - https://svn.apache.org/repos/asf/mahout + scm:svn:https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9 + scm:svn:https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9 + https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9