[maven-release-plugin] copy for tag mahout-0.9

git-svn-id: https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9@1562329 13f79535-47bb-0310-9956-ffa450edef68
HEGALLIS · Jan 29, 2014 · bd150f5 · bd150f5
2 parents 4ee3ac9 + fb8dfbf
commit bd150f5
Show file tree

Hide file tree

Showing 28 changed files with 247 additions and 185 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -2,12 +2,20 @@ Mahout Change Log
 
 Release 0.9 - unreleased
 
+  MAHOUT-1411: Random test failures from TDigestTest (smarthi)
+
+  MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman)
+
+  MAHOUT-1409: MatrixVectorView has index check error (tdunning)
+
   MAHOUT-1402: Zero clusters using streaming k-means option in cluster-reuters.sh (smarthi)
 
   MAHOUT-1401: Resurrect Frequent Pattern mining (smarthi)
 
   MAHOUT-1400: Remove references to deprecated and removed algorithms from examples scripts (ssc)
 
+  MAHOUT-1399: Fixed multiple slf4j bindings when running Mahout examples issue (sslavic)
+
   MAHOUT-1398: FileDataModel should provide a constructor with a delimiterPattern (Roy Guo via ssc)
 
   MAHOUT-1396: Accidental use of commons-math won't work with next Hadoop 2 release (srowen)

diff --git a/bin/mahout b/bin/mahout
@@ -24,13 +24,13 @@
 #   MAHOUT_CORE        set to anything other than an empty string to force
 #                      mahout to run in developer 'core' mode, just as if the
 #                      -core option was presented on the command-line
-# Commane-line Options
+# Command-line Options
 #
 #   -core              -core is used to switch into 'developer mode' when
 #                      running mahout locally. If specified, the classes
 #                      from the 'target/classes' directories in each project
-#                      are used. Otherwise classes will be retrived from
-#                      jars in the binary releas collection or *-job.jar files
+#                      are used. Otherwise classes will be retrieved from
+#                      jars in the binary release collection or *-job.jar files
 #                      found in build directories. When running on hadoop
 #                      the job files will always be used.
 

diff --git a/core/src/main/assembly/job.xml b/core/src/main/assembly/job.xml
@@ -36,6 +36,7 @@
       </unpackOptions>
       <scope>runtime</scope>
       <outputDirectory>/</outputDirectory>
+      <useTransitiveFiltering>true</useTransitiveFiltering>
       <excludes>
         <exclude>org.apache.hadoop:hadoop-core</exclude>
       </excludes>

diff --git a/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java b/core/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
@@ -99,7 +99,7 @@ public AdaptiveLogisticRegression() {}
    * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
    * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
    *
-   * @see {@link #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)}
+   * @see #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)
    */
   public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
     this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE);

diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
@@ -41,12 +41,16 @@
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.iterator.ClusterWritable;
 import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 import org.apache.mahout.math.VectorWritable;
@@ -186,19 +190,34 @@ private static Path finalClustersPath(Configuration conf, Path clusterOutputPath
    * @param output
    *          the path to store classified data
    * @param clusterClassificationThreshold
+   *          the threshold value of probability distribution function from 0.0
+   *          to 1.0. Any vector with pdf less than this threshold will not be
+   *          classified for the cluster
    * @param emitMostLikely
+   *          emit the vectors with the max pdf values per cluster
    * @throws IOException
    */
   private static void selectCluster(Path input, List<Cluster> clusterModels, ClusterClassifier clusterClassifier,
       Path output, Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
     Configuration conf = new Configuration();
     SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output,
         "part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class);
-    for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
+    for (Pair<Writable, VectorWritable> vw : new SequenceFileDirIterable<Writable, VectorWritable>(input, PathType.LIST,
         PathFilters.logsCRCFilter(), conf)) {
-      Vector pdfPerCluster = clusterClassifier.classify(vw.get());
+      // Converting to NamedVectors to preserve the vectorId else it's not obvious as to which point
+      // belongs to which cluster - fix for MAHOUT-1410
+      Class<? extends Writable> keyClass = vw.getFirst().getClass();
+      Vector vector = vw.getSecond().get();
+      if (!keyClass.equals(NamedVector.class)) {
+        if (keyClass.equals(Text.class)) {
+          vector = new NamedVector(vector, vw.getFirst().toString());
+        } else if (keyClass.equals(IntWritable.class)) {
+          vector = new NamedVector(vector, Integer.toString(((IntWritable) vw.getFirst()).get()));
+        }
+      }
+      Vector pdfPerCluster = clusterClassifier.classify(vector);
       if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) {
-        classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, vw, pdfPerCluster);
+        classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
       }
     }
     writer.close();
@@ -209,28 +228,37 @@ private static void classifyAndWrite(List<Cluster> clusterModels, Double cluster
     Map<Text, Text> props = Maps.newHashMap();
     if (emitMostLikely) {
       int maxValueIndex = pdfPerCluster.maxValueIndex();
-      WeightedPropertyVectorWritable wpvw = new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
-      write(clusterModels, writer, wpvw, maxValueIndex);
+      WeightedPropertyVectorWritable weightedPropertyVectorWritable =
+          new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
+      write(clusterModels, writer, weightedPropertyVectorWritable, maxValueIndex);
     } else {
       writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster);
     }
   }
 
   private static void writeAllAboveThreshold(List<Cluster> clusterModels, Double clusterClassificationThreshold,
       SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
+    Map<Text, Text> props = Maps.newHashMap();
     for (Element pdf : pdfPerCluster.nonZeroes()) {
       if (pdf.get() >= clusterClassificationThreshold) {
-        WeightedVectorWritable wvw = new WeightedVectorWritable(pdf.get(), vw.get());
+        WeightedPropertyVectorWritable wvw = new WeightedPropertyVectorWritable(pdf.get(), vw.get(), props);
         int clusterIndex = pdf.index();
         write(clusterModels, writer, wvw, clusterIndex);
       }
     }
   }
 
-  private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer, WeightedVectorWritable wvw,
+  private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer,
+      WeightedPropertyVectorWritable weightedPropertyVectorWritable,
       int maxValueIndex) throws IOException {
     Cluster cluster = clusterModels.get(maxValueIndex);
-    writer.append(new IntWritable(cluster.getId()), wvw);
+
+    DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+    DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+    double distance = distanceMeasure.distance(cluster.getCenter(), weightedPropertyVectorWritable.getVector());
+
+    weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(distance)));
+    writer.append(new IntWritable(cluster.getId()), weightedPropertyVectorWritable);
   }
 
   /**

diff --git a/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/core/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
@@ -36,9 +36,12 @@
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.iterator.ClusterWritable;
 import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 import org.apache.mahout.math.VectorWritable;
@@ -83,13 +86,24 @@ protected void setup(Context context) throws IOException, InterruptedException {
   protected void map(WritableComparable<?> key, VectorWritable vw, Context context)
     throws IOException, InterruptedException {
     if (!clusterModels.isEmpty()) {
-      Vector pdfPerCluster = clusterClassifier.classify(vw.get());
+      // Converting to NamedVectors to preserve the vectorId else it's not obvious as to which point
+      // belongs to which cluster - fix for MAHOUT-1410
+      Class<? extends Vector> vectorClass = vw.get().getClass();
+      Vector vector = vw.get();
+      if (!vectorClass.equals(NamedVector.class)) {
+        if (key.getClass().equals(Text.class)) {
+          vector = new NamedVector(vector, key.toString());
+        } else if (key.getClass().equals(IntWritable.class)) {
+          vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get()));
+        }
+      }
+      Vector pdfPerCluster = clusterClassifier.classify(vector);
       if (shouldClassify(pdfPerCluster)) {
         if (emitMostLikely) {
           int maxValueIndex = pdfPerCluster.maxValueIndex();
-          write(vw, context, maxValueIndex, 1.0);
+          write(new VectorWritable(vector), context, maxValueIndex, 1.0);
         } else {
-          writeAllAboveThreshold(vw, context, pdfPerCluster);
+          writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster);
         }
       }
     }
@@ -109,9 +123,13 @@ private void write(VectorWritable vw, Context context, int clusterIndex, double
     throws IOException, InterruptedException {
     Cluster cluster = clusterModels.get(clusterIndex);
     clusterId.set(cluster.getId());
-    double d = cluster.getCenter().getDistanceSquared(vw.get());
+
+    DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+    DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+    double distance = distanceMeasure.distance(cluster.getCenter(), vw.get());
+
     Map<Text, Text> props = Maps.newHashMap();
-    props.put(new Text("distance-squared"), new Text(Double.toString(d)));
+    props.put(new Text("distance"), new Text(Double.toString(distance)));
     context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props));
   }
 

diff --git a/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
@@ -110,7 +110,6 @@ public int run(String[] args) throws Exception {
         input,
         clusters,
         output,
-        measure,
         convergenceDelta,
         maxIterations,
         fuzziness,
@@ -124,32 +123,31 @@ public int run(String[] args) throws Exception {
   /**
    * Iterate over the input vectors to produce clusters and, if requested, use the
    * results of the final iteration to cluster the input vectors.
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
-   *          the directory pathname for output points
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param maxIterations
-   *          the maximum number of iterations
+*          the maximum number of iterations
    * @param m
-   *          the fuzzification factor, see
-   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
-   * @param runClustering 
-   *          true if points are to be clustered after iterations complete
+*          the fuzzification factor, see
+*          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+*          true if points are to be clustered after iterations complete
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
-   * @param threshold 
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void run(Path input,
                          Path clustersIn,
                          Path output,
-                         DistanceMeasure measure,
                          double convergenceDelta,
                          int maxIterations,
                          float m,
@@ -162,7 +160,6 @@ public static void run(Path input,
                                      input,
                                      clustersIn,
                                      output,
-                                     measure,
                                      convergenceDelta,
                                      maxIterations,
                                      m,
@@ -172,7 +169,6 @@ public static void run(Path input,
       clusterData(conf, input,
                   clustersOut,
                   output,
-                  measure,
                   convergenceDelta,
                   m,
                   emitMostLikely,
@@ -189,27 +185,26 @@ public static void run(Path input,
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
-   *          the directory pathname for output points
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param maxIterations
-   *          the maximum number of iterations
+*          the maximum number of iterations
    * @param m
-   *          the fuzzification factor, see
-   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
-   * @param runClustering 
-   *          true if points are to be clustered after iterations complete
+*          the fuzzification factor, see
+*          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+*          true if points are to be clustered after iterations complete
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
-   * @param threshold 
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void run(Configuration conf,
                          Path input,
                          Path clustersIn,
                          Path output,
-                         DistanceMeasure measure,
                          double convergenceDelta,
                          int maxIterations,
                          float m,
@@ -219,14 +214,13 @@ public static void run(Configuration conf,
                          boolean runSequential)
     throws IOException, ClassNotFoundException, InterruptedException {
     Path clustersOut =
-        buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential);
+        buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
     if (runClustering) {
       log.info("Clustering");
       clusterData(conf, 
                   input,
                   clustersOut,
                   output,
-                  measure,
                   convergenceDelta,
                   m,
                   emitMostLikely,
@@ -237,14 +231,13 @@ public static void run(Configuration conf,
 
   /**
    * Iterate over the input vectors to produce cluster directories for each iteration
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the file pathname for initial cluster centers
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
@@ -253,14 +246,13 @@ public static void run(Configuration conf,
    *          the fuzzification factor, see
    *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
    * @param runSequential if true run in sequential execution mode
-   * 
+   *
    * @return the Path of the final clusters directory
    */
   public static Path buildClusters(Configuration conf,
                                    Path input,
                                    Path clustersIn,
                                    Path output,
-                                   DistanceMeasure measure,
                                    double convergenceDelta,
                                    int maxIterations,
                                    float m,
@@ -293,28 +285,25 @@ public static Path buildClusters(Configuration conf,
 
   /**
    * Run the job using supplied arguments
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for input clusters
    * @param output
-   *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
+*          a boolean if true emit only most likely cluster for each point
    * @param threshold
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void clusterData(Configuration conf,
                                  Path input,
                                  Path clustersIn,
                                  Path output,
-                                 DistanceMeasure measure,
                                  double convergenceDelta,
                                  float m,
                                  boolean emitMostLikely,