Skip to content

Commit

Permalink
MAHOUT-1310: Changed method signatures to remove unused DistanceMeasure parameter.
Browse files Browse the repository at this point in the history

git-svn-id: https://svn.apache.org/repos/asf/mahout/trunk@1561975 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
smarthi committed Jan 28, 2014
1 parent 1622bc1 commit 027a608
Show file tree
Hide file tree
Showing 12 changed files with 72 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ public int run(String[] args) throws Exception {
input,
clusters,
output,
measure,
convergenceDelta,
maxIterations,
fuzziness,
Expand All @@ -124,32 +123,31 @@ public int run(String[] args) throws Exception {
/**
* Iterate over the input vectors to produce clusters and, if requested, use the
* results of the final iteration to cluster the input vectors.
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
* the maximum number of iterations
* @param m
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void run(Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand All @@ -162,7 +160,6 @@ public static void run(Path input,
input,
clustersIn,
output,
measure,
convergenceDelta,
maxIterations,
m,
Expand All @@ -172,7 +169,6 @@ public static void run(Path input,
clusterData(conf, input,
clustersOut,
output,
measure,
convergenceDelta,
m,
emitMostLikely,
Expand All @@ -189,27 +185,26 @@ public static void run(Path input,
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
* the maximum number of iterations
* @param m
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void run(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand All @@ -219,14 +214,13 @@ public static void run(Configuration conf,
boolean runSequential)
throws IOException, ClassNotFoundException, InterruptedException {
Path clustersOut =
buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential);
buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
if (runClustering) {
log.info("Clustering");
clusterData(conf,
input,
clustersOut,
output,
measure,
convergenceDelta,
m,
emitMostLikely,
Expand All @@ -237,14 +231,13 @@ public static void run(Configuration conf,

/**
* Iterate over the input vectors to produce cluster directories for each iteration
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the file pathname for initial cluster centers
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
Expand All @@ -253,14 +246,13 @@ public static void run(Configuration conf,
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runSequential if true run in sequential execution mode
*
*
* @return the Path of the final clusters directory
*/
public static Path buildClusters(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand Down Expand Up @@ -293,28 +285,25 @@ public static Path buildClusters(Configuration conf,

/**
* Run the job using supplied arguments
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for input clusters
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void clusterData(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
float m,
boolean emitMostLikely,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,21 @@ public int run(String[] args) throws Exception {
if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
}
run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering,
clusterClassificationThreshold, runSequential);
return 0;
}

/**
* Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
* cluster the input vectors.
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
* @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
Expand All @@ -129,58 +127,56 @@ public int run(String[] args) throws Exception {
* @param runSequential
* if true execute sequential algorithm
*/
public static void run(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure,
double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
public static void run(Configuration conf, Path input, Path clustersIn, Path output,
double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {

// iterate until the clusters converge
String delta = Double.toString(convergenceDelta);
if (log.isInfoEnabled()) {
log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output,
measure.getClass().getName());
log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations);
}
Path clustersOut = buildClusters(conf, input, clustersIn, output, measure, maxIterations, delta, runSequential);
Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential);
if (runClustering) {
log.info("Clustering data");
clusterData(conf, input, clustersOut, output, measure, clusterClassificationThreshold, runSequential);
clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
}
}

/**
* Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
* cluster the input vectors.
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
* @param measure
* the DistanceMeasure to use
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
* @param runClustering
* true if points are to be clustered after iterations are completed
* @param clusterClassificationThreshold
* Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors
* Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
* having pdf below this value will not be clustered.
* @param runSequential
* if true execute sequential algorithm
*/
public static void run(Path input, Path clustersIn, Path output, DistanceMeasure measure, double convergenceDelta,
int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
public static void run(Path input, Path clustersIn, Path output, double convergenceDelta,
int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
throws IOException, InterruptedException, ClassNotFoundException {
run(new Configuration(), input, clustersIn, output, measure, convergenceDelta, maxIterations, runClustering,
run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering,
clusterClassificationThreshold, runSequential);
}

/**
* Iterate over the input vectors to produce cluster directories for each iteration
*
*
* @param conf
* the Configuration to use
* @param input
Expand All @@ -189,20 +185,18 @@ public static void run(Path input, Path clustersIn, Path output, DistanceMeasure
* the directory pathname for initial & computed clusters
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* @param maxIterations
* the maximum number of iterations
* @param delta
* the convergence delta value
* @param runSequential
* if true execute sequential algorithm
*
*
* @return the Path of the final clusters directory
*/
public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output,
DistanceMeasure measure, int maxIterations, String delta, boolean runSequential) throws IOException,
InterruptedException, ClassNotFoundException {
int maxIterations, String delta, boolean runSequential) throws IOException,
InterruptedException, ClassNotFoundException {

double convergenceDelta = Double.parseDouble(delta);
List<Cluster> clusters = Lists.newArrayList();
Expand All @@ -227,28 +221,26 @@ public static Path buildClusters(Configuration conf, Path input, Path clustersIn

/**
* Run the job using supplied arguments
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for input clusters
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* @param clusterClassificationThreshold
* Is a clustering strictness / outlier removal parrameter. Its value should be between 0 and 1. Vectors
* Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
* having pdf below this value will not be clustered.
* @param runSequential
* if true execute sequential algorithm
*/
public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output, DistanceMeasure measure,
double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
ClassNotFoundException {
public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output,
double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
ClassNotFoundException {

if (log.isInfoEnabled()) {
log.info("Running Clustering");
log.info("Input: {} Clusters In: {} Out: {} Distance: {}", input, clustersIn, output, measure);
log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
}
ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims,
* @param ssvd
* Flag to indicate the eigensolver to use
* @param numReducers
* Number of reducers
* @param blockHeight
* @param oversampling
* @param poweriters
Expand Down Expand Up @@ -244,7 +245,7 @@ public static void run(Configuration conf, Path input, Path output, int numDims,

// Run the KMeansDriver
Path answer = new Path(output, "kmeans_out");
KMeansDriver.run(conf, data, initialclusters, answer, measure, convergenceDelta, maxIterations, true, 0.0, false);
KMeansDriver.run(conf, data, initialclusters, answer, convergenceDelta, maxIterations, true, 0.0, false);

// Restore name to id mapping and read through the cluster assignments
Path mappingPath = new Path(new Path(conf.get("hadoop.tmp.dir")), "generic_input_mapping");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,8 +349,8 @@ public void testKMeansWithCanopyClusterInput() throws Exception {

// now run the KMeans job
Path kmeansOutput = new Path(outputPath, "kmeans");
KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput, new EuclideanDistanceMeasure(),
0.001, 10, true, 0.0, false);
KMeansDriver.run(getConfiguration(), pointsPath, new Path(outputPath, "clusters-0-final"), kmeansOutput,
0.001, 10, true, 0.0, false);

// now compare the expected clusters with actual
Path clusteredPointsPath = new Path(kmeansOutput, "clusteredPoints");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ private void topLevelClustering(Path pointsPath, Configuration conf) throws IOEx
CanopyDriver.run(conf, pointsPath, outputPathForCanopy, measure, 4.0, 3.0, true, 0.0, true);
Path clustersIn = new Path(outputPathForCanopy, new Path(Cluster.CLUSTERS_DIR + '0'
+ Cluster.FINAL_ITERATION_SUFFIX));
KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, measure, 1, 1, true, 0.0, true);
KMeansDriver.run(conf, pointsPath, clustersIn, outputPathForKMeans, 1, 1, true, 0.0, true);
}

private static void verifyThatNumberOfClustersIsCorrect(Configuration conf, Path clusteredPointsPath) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ private static void runSequentialFuzzyKClusterer(Configuration conf, Path sample
ClassNotFoundException, InterruptedException {
Path clustersIn = new Path(output, "random-seeds");
RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
FuzzyKMeansDriver.run(samples, clustersIn, output, measure, threshold, maxIterations, m, true, true, threshold,
FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
true);

loadClustersWritable(output);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ private static void runSequentialKMeansClusterer(Configuration conf, Path sample
throws IOException, InterruptedException, ClassNotFoundException {
Path clustersIn = new Path(output, "random-seeds");
RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
KMeansDriver.run(samples, clustersIn, output, measure, convergenceDelta, maxIterations, true, 0.0, true);
KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
loadClustersWritable(output);
}

Expand Down
Loading

0 comments on commit 027a608

Please sign in to comment.