MAHOUT-1310: Changed method signatures to remove unused DistanceMeasure parameter.

git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561975 13f79535-47bb-0310-9956-ffa450edef68
HEGALLIS · Jan 28, 2014 · 027a608
1 parent 1622bc1
commit 027a608
Show file tree

Hide file tree

Showing 12 changed files with 72 additions and 94 deletions.
diff --git a/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
@@ -110,7 +110,6 @@ public int run(String[] args) throws Exception {
         input,
         clusters,
         output,
-        measure,
         convergenceDelta,
         maxIterations,
         fuzziness,
@@ -124,32 +123,31 @@ public int run(String[] args) throws Exception {
   /**
    * Iterate over the input vectors to produce clusters and, if requested, use the
    * results of the final iteration to cluster the input vectors.
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
-   *          the directory pathname for output points
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param maxIterations
-   *          the maximum number of iterations
+*          the maximum number of iterations
    * @param m
-   *          the fuzzification factor, see
-   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
-   * @param runClustering 
-   *          true if points are to be clustered after iterations complete
+*          the fuzzification factor, see
+*          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+*          true if points are to be clustered after iterations complete
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
-   * @param threshold 
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void run(Path input,
                          Path clustersIn,
                          Path output,
-                         DistanceMeasure measure,
                          double convergenceDelta,
                          int maxIterations,
                          float m,
@@ -162,7 +160,6 @@ public static void run(Path input,
                                      input,
                                      clustersIn,
                                      output,
-                                     measure,
                                      convergenceDelta,
                                      maxIterations,
                                      m,
@@ -172,7 +169,6 @@ public static void run(Path input,
       clusterData(conf, input,
                   clustersOut,
                   output,
-                  measure,
                   convergenceDelta,
                   m,
                   emitMostLikely,
@@ -189,27 +185,26 @@ public static void run(Path input,
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
-   *          the directory pathname for output points
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param maxIterations
-   *          the maximum number of iterations
+*          the maximum number of iterations
    * @param m
-   *          the fuzzification factor, see
-   *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
-   * @param runClustering 
-   *          true if points are to be clustered after iterations complete
+*          the fuzzification factor, see
+*          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+   * @param runClustering
+*          true if points are to be clustered after iterations complete
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
-   * @param threshold 
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a boolean if true emit only most likely cluster for each point
+   * @param threshold
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void run(Configuration conf,
                          Path input,
                          Path clustersIn,
                          Path output,
-                         DistanceMeasure measure,
                          double convergenceDelta,
                          int maxIterations,
                          float m,
@@ -219,14 +214,13 @@ public static void run(Configuration conf,
                          boolean runSequential)
     throws IOException, ClassNotFoundException, InterruptedException {
     Path clustersOut =
-        buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential);
+        buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
     if (runClustering) {
       log.info("Clustering");
       clusterData(conf, 
                   input,
                   clustersOut,
                   output,
-                  measure,
                   convergenceDelta,
                   m,
                   emitMostLikely,
@@ -237,14 +231,13 @@ public static void run(Configuration conf,
 
   /**
    * Iterate over the input vectors to produce cluster directories for each iteration
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the file pathname for initial cluster centers
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
@@ -253,14 +246,13 @@ public static void run(Configuration conf,
    *          the fuzzification factor, see
    *          http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
    * @param runSequential if true run in sequential execution mode
-   * 
+   *
    * @return the Path of the final clusters directory
    */
   public static Path buildClusters(Configuration conf,
                                    Path input,
                                    Path clustersIn,
                                    Path output,
-                                   DistanceMeasure measure,
                                    double convergenceDelta,
                                    int maxIterations,
                                    float m,
@@ -293,28 +285,25 @@ public static Path buildClusters(Configuration conf,
 
   /**
    * Run the job using supplied arguments
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for input clusters
    * @param output
-   *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
+ *          the directory pathname for output points
    * @param convergenceDelta
-   *          the convergence delta value
+*          the convergence delta value
    * @param emitMostLikely
-   *          a boolean if true emit only most likely cluster for each point
+*          a boolean if true emit only most likely cluster for each point
    * @param threshold
-   *          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+*          a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
    * @param runSequential if true run in sequential execution mode
    */
   public static void clusterData(Configuration conf,
                                  Path input,
                                  Path clustersIn,
                                  Path output,
-                                 DistanceMeasure measure,
                                  double convergenceDelta,
                                  float m,
                                  boolean emitMostLikely,

diff --git a/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
@@ -100,23 +100,21 @@ public int run(String[] args) throws Exception {
     if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
       clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
     }
-    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
+    run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering,
         clusterClassificationThreshold, runSequential);
     return 0;
   }
 
   /**
    * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
    * cluster the input vectors.
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the DistanceMeasure to use
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
@@ -129,58 +127,56 @@ public int run(String[] args) throws Exception {
    * @param runSequential
    *          if true execute sequential algorithm
    */
-  public static void run(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure,
-      double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
-      boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+  public static void run(Configuration conf, Path input, Path clustersIn, Path output,
+    double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
+    boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
 
     // iterate until the clusters converge
     String delta = Double.toString(convergenceDelta);
     if (log.isInfoEnabled()) {
-      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output,
-               measure.getClass().getName());
+      log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
       log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations);
     }
-    Path clustersOut = buildClusters(conf, input, clustersIn, output, measure, maxIterations, delta, runSequential);
+    Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential);
     if (runClustering) {
       log.info("Clustering data");
-      clusterData(conf, input, clustersOut, output, measure, clusterClassificationThreshold, runSequential);
+      clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
     }
   }
 
   /**
    * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
    * cluster the input vectors.
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for initial & computed clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the DistanceMeasure to use
    * @param convergenceDelta
    *          the convergence delta value
    * @param maxIterations
    *          the maximum number of iterations
    * @param runClustering
    *          true if points are to be clustered after iterations are completed
    * @param clusterClassificationThreshold
-   *          Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors
+   *          Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
    *          having pdf below this value will not be clustered.
    * @param runSequential
    *          if true execute sequential algorithm
    */
-  public static void run(Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta,
-      int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+  public static void run(Path input, Path clustersIn, Path output, double convergenceDelta,
+    int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
     throws IOException, InterruptedException, ClassNotFoundException {
-    run(new Configuration(), input, clustersIn, output, measure, convergenceDelta, maxIterations, runClustering,
+    run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering,
         clusterClassificationThreshold, runSequential);
   }
 
   /**
    * Iterate over the input vectors to produce cluster directories for each iteration
    * 
+   *
    * @param conf
    *          the Configuration to use
    * @param input
@@ -189,20 +185,18 @@ public static void run(Path input, Path clustersIn, Path output, DistanceMeasure
    *          the directory pathname for initial & computed clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
    * @param maxIterations
    *          the maximum number of iterations
    * @param delta
    *          the convergence delta value
    * @param runSequential
    *          if true execute sequential algorithm
-   * 
+   *
    * @return the Path of the final clusters directory
    */
   public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output,
-      DistanceMeasure measure, int maxIterations, String delta, boolean runSequential) throws IOException,
-      InterruptedException, ClassNotFoundException {
+    int maxIterations, String delta, boolean runSequential) throws IOException,
+    InterruptedException, ClassNotFoundException {
 
     double convergenceDelta = Double.parseDouble(delta);
     List<Cluster> clusters = Lists.newArrayList();
@@ -227,28 +221,26 @@ public static Path buildClusters(Configuration conf, Path input, Path clustersIn
 
   /**
    * Run the job using supplied arguments
-   * 
+   *
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
    *          the directory pathname for input clusters
    * @param output
    *          the directory pathname for output points
-   * @param measure
-   *          the classname of the DistanceMeasure
    * @param clusterClassificationThreshold
-   *          Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors
+   *          Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
    *          having pdf below this value will not be clustered.
    * @param runSequential
    *          if true execute sequential algorithm
    */
-  public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure,
-      double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
-      ClassNotFoundException {
+  public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output,
+    double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
+    ClassNotFoundException {
 
     if (log.isInfoEnabled()) {
       log.info("Running Clustering");
-      log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output, measure);
+      log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
     }
     ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
     ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),

diff --git a/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java b/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
@@ -152,6 +152,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims,
    * @param ssvd
    *          Flag to indicate the eigensolver to use
    * @param numReducers
+   *          Number of reducers
    * @param blockHeight
    * @param oversampling
    * @param poweriters
@@ -244,7 +245,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims,
 
     // Run the KMeansDriver
     Path answer = new Path(output, "kmeans_out");
-    KMeansDriver.run(conf, data, initialclusters, answer, measure, convergenceDelta, maxIterations, true, 0.0, false);
+    KMeansDriver.run(conf, data, initialclusters, answer, convergenceDelta, maxIterations, true, 0.0, false);
 
     // Restore name to id mapping and read through the cluster assignments
     Path mappingPath = new Path(new Path(conf.get("hadoop.tmp.dir")), "generic_input_mapping");

diff --git a/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java b/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
@@ -349,8 +349,8 @@ public void testKMeansWithCanopyClusterInput() throws Exception {
 
     // now run the KMeans job
     Path kmeansOutput = new Path(outputPath, "kmeans");
-	KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(),
-        0.001, 10, true, 0.0, false);
+	  KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput,
+      0.001, 10, true, 0.0, false);
 
     // now compare the expected clusters with actual
     Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");

diff --git a/.../test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java b/.../test/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReaderTest.java
@@ -101,7 +101,7 @@ private void topLevelClustering(Path pointsPath, Configuration conf) throws IOEx
     CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true);
     Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
                                                                    + Cluster.FINAL_ITERATION_SUFFIX));
-    KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true);
+    KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, 1, 1, true, 0.0, true);
   }
 
   private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {

diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
@@ -102,7 +102,7 @@ private static void runSequentialFuzzyKClusterer(Configuration conf, Path sample
       ClassNotFoundException, InterruptedException {
     Path clustersIn = new Path(output, "random-seeds");
     RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
-    FuzzyKMeansDriver.run(samples, clustersIn, output, measure, threshold, maxIterations, m, true, true, threshold,
+    FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
         true);
 
     loadClustersWritable(output);

diff --git a/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
@@ -93,7 +93,7 @@ private static void runSequentialKMeansClusterer(Configuration conf, Path sample
     throws IOException, InterruptedException, ClassNotFoundException {
     Path clustersIn = new Path(output, "random-seeds");
     RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
-    KMeansDriver.run(samples, clustersIn, output, measure, convergenceDelta, maxIterations, true, 0.0, true);
+    KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
     loadClustersWritable(output);
   }