Skip to content

Commit

Permalink
[maven-release-plugin] copy for tag mahout-0.9
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/mahout/tags/mahout-0.9@1562329 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
smarthi committed Jan 29, 2014
2 parents 4ee3ac9 + fb8dfbf commit bd150f5
Show file tree
Hide file tree
Showing 28 changed files with 247 additions and 185 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@ Mahout Change Log

Release 0.9 - unreleased

MAHOUT-1411: Random test failures from TDigestTest (smarthi)

MAHOUT-1410: clusteredPoints do not contain a vector id (smarthi, Andrew Musselman)

MAHOUT-1409: MatrixVectorView has index check error (tdunning)

MAHOUT-1402: Zero clusters using streaming k-means option in cluster-reuters.sh (smarthi)

MAHOUT-1401: Resurrect Frequent Pattern mining (smarthi)

MAHOUT-1400: Remove references to deprecated and removed algorithms from examples scripts (ssc)

MAHOUT-1399: Fixed the multiple slf4j bindings issue when running Mahout examples (sslavic)

MAHOUT-1398: FileDataModel should provide a constructor with a delimiterPattern (Roy Guo via ssc)

MAHOUT-1396: Accidental use of commons-math won't work with next Hadoop 2 release (srowen)
Expand Down
6 changes: 3 additions & 3 deletions bin/mahout
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@
# MAHOUT_CORE set to anything other than an empty string to force
# mahout to run in developer 'core' mode, just as if the
# -core option was presented on the command-line
# Commane-line Options
# Command-line Options
#
# -core -core is used to switch into 'developer mode' when
# running mahout locally. If specified, the classes
# from the 'target/classes' directories in each project
# are used. Otherwise classes will be retrived from
# jars in the binary releas collection or *-job.jar files
# are used. Otherwise classes will be retrieved from
# jars in the binary release collection or *-job.jar files
# found in build directories. When running on hadoop
# the job files will always be used.

Expand Down
1 change: 1 addition & 0 deletions core/src/main/assembly/job.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
</unpackOptions>
<scope>runtime</scope>
<outputDirectory>/</outputDirectory>
<useTransitiveFiltering>true</useTransitiveFiltering>
<excludes>
<exclude>org.apache.hadoop:hadoop-core</exclude>
</excludes>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public AdaptiveLogisticRegression() {}
* @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
* @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
*
* @see {@link #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)}
* @see #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)
*/
public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,16 @@
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.clustering.iterator.ClusteringPolicy;
import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.VectorWritable;
Expand Down Expand Up @@ -186,19 +190,34 @@ private static Path finalClustersPath(Configuration conf, Path clusterOutputPath
* @param output
* the path to store classified data
* @param clusterClassificationThreshold
* the threshold value of probability distribution function from 0.0
to 1.0. Any vector with pdf less than this threshold will not be
* classified for the cluster
* @param emitMostLikely
* emit the vectors with the max pdf values per cluster
* @throws IOException
*/
private static void selectCluster(Path input, List<Cluster> clusterModels, ClusterClassifier clusterClassifier,
Path output, Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
Configuration conf = new Configuration();
SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output,
"part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class);
for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
for (Pair<Writable, VectorWritable> vw : new SequenceFileDirIterable<Writable, VectorWritable>(input, PathType.LIST,
PathFilters.logsCRCFilter(), conf)) {
Vector pdfPerCluster = clusterClassifier.classify(vw.get());
// Converting to NamedVectors to preserve the vectorId; otherwise it's not obvious which point
// belongs to which cluster - fix for MAHOUT-1410
Class<? extends Writable> keyClass = vw.getFirst().getClass();
Vector vector = vw.getSecond().get();
if (!keyClass.equals(NamedVector.class)) {
if (keyClass.equals(Text.class)) {
vector = new NamedVector(vector, vw.getFirst().toString());
} else if (keyClass.equals(IntWritable.class)) {
vector = new NamedVector(vector, Integer.toString(((IntWritable) vw.getFirst()).get()));
}
}
Vector pdfPerCluster = clusterClassifier.classify(vector);
if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) {
classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, vw, pdfPerCluster);
classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
}
}
writer.close();
Expand All @@ -209,28 +228,37 @@ private static void classifyAndWrite(List<Cluster> clusterModels, Double cluster
Map<Text, Text> props = Maps.newHashMap();
if (emitMostLikely) {
int maxValueIndex = pdfPerCluster.maxValueIndex();
WeightedPropertyVectorWritable wpvw = new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
write(clusterModels, writer, wpvw, maxValueIndex);
WeightedPropertyVectorWritable weightedPropertyVectorWritable =
new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
write(clusterModels, writer, weightedPropertyVectorWritable, maxValueIndex);
} else {
writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster);
}
}

private static void writeAllAboveThreshold(List<Cluster> clusterModels, Double clusterClassificationThreshold,
SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
Map<Text, Text> props = Maps.newHashMap();
for (Element pdf : pdfPerCluster.nonZeroes()) {
if (pdf.get() >= clusterClassificationThreshold) {
WeightedVectorWritable wvw = new WeightedVectorWritable(pdf.get(), vw.get());
WeightedPropertyVectorWritable wvw = new WeightedPropertyVectorWritable(pdf.get(), vw.get(), props);
int clusterIndex = pdf.index();
write(clusterModels, writer, wvw, clusterIndex);
}
}
}

private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer, WeightedVectorWritable wvw,
private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer,
WeightedPropertyVectorWritable weightedPropertyVectorWritable,
int maxValueIndex) throws IOException {
Cluster cluster = clusterModels.get(maxValueIndex);
writer.append(new IntWritable(cluster.getId()), wvw);

DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
double distance = distanceMeasure.distance(cluster.getCenter(), weightedPropertyVectorWritable.getVector());

weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(distance)));
writer.append(new IntWritable(cluster.getId()), weightedPropertyVectorWritable);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.clustering.iterator.ClusteringPolicy;
import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.math.VectorWritable;
Expand Down Expand Up @@ -83,13 +86,24 @@ protected void setup(Context context) throws IOException, InterruptedException {
protected void map(WritableComparable<?> key, VectorWritable vw, Context context)
throws IOException, InterruptedException {
if (!clusterModels.isEmpty()) {
Vector pdfPerCluster = clusterClassifier.classify(vw.get());
// Converting to NamedVectors to preserve the vectorId; otherwise it's not obvious which point
// belongs to which cluster - fix for MAHOUT-1410
Class<? extends Vector> vectorClass = vw.get().getClass();
Vector vector = vw.get();
if (!vectorClass.equals(NamedVector.class)) {
if (key.getClass().equals(Text.class)) {
vector = new NamedVector(vector, key.toString());
} else if (key.getClass().equals(IntWritable.class)) {
vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get()));
}
}
Vector pdfPerCluster = clusterClassifier.classify(vector);
if (shouldClassify(pdfPerCluster)) {
if (emitMostLikely) {
int maxValueIndex = pdfPerCluster.maxValueIndex();
write(vw, context, maxValueIndex, 1.0);
write(new VectorWritable(vector), context, maxValueIndex, 1.0);
} else {
writeAllAboveThreshold(vw, context, pdfPerCluster);
writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster);
}
}
}
Expand All @@ -109,9 +123,13 @@ private void write(VectorWritable vw, Context context, int clusterIndex, double
throws IOException, InterruptedException {
Cluster cluster = clusterModels.get(clusterIndex);
clusterId.set(cluster.getId());
double d = cluster.getCenter().getDistanceSquared(vw.get());

DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
double distance = distanceMeasure.distance(cluster.getCenter(), vw.get());

Map<Text, Text> props = Maps.newHashMap();
props.put(new Text("distance-squared"), new Text(Double.toString(d)));
props.put(new Text("distance"), new Text(Double.toString(distance)));
context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ public int run(String[] args) throws Exception {
input,
clusters,
output,
measure,
convergenceDelta,
maxIterations,
fuzziness,
Expand All @@ -124,32 +123,31 @@ public int run(String[] args) throws Exception {
/**
* Iterate over the input vectors to produce clusters and, if requested, use the
* results of the final iteration to cluster the input vectors.
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for initial and computed clusters
* @param output
* the directory pathname for output points
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
* the maximum number of iterations
* @param m
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void run(Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand All @@ -162,7 +160,6 @@ public static void run(Path input,
input,
clustersIn,
output,
measure,
convergenceDelta,
maxIterations,
m,
Expand All @@ -172,7 +169,6 @@ public static void run(Path input,
clusterData(conf, input,
clustersOut,
output,
measure,
convergenceDelta,
m,
emitMostLikely,
Expand All @@ -189,27 +185,26 @@ public static void run(Path input,
* @param clustersIn
* the directory pathname for initial and computed clusters
* @param output
* the directory pathname for output points
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param maxIterations
* the maximum number of iterations
* the maximum number of iterations
* @param m
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runClustering
* true if points are to be clustered after iterations complete
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void run(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand All @@ -219,14 +214,13 @@ public static void run(Configuration conf,
boolean runSequential)
throws IOException, ClassNotFoundException, InterruptedException {
Path clustersOut =
buildClusters(conf, input, clustersIn, output, measure, convergenceDelta, maxIterations, m, runSequential);
buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
if (runClustering) {
log.info("Clustering");
clusterData(conf,
input,
clustersOut,
output,
measure,
convergenceDelta,
m,
emitMostLikely,
Expand All @@ -237,14 +231,13 @@ public static void run(Configuration conf,

/**
* Iterate over the input vectors to produce cluster directories for each iteration
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the file pathname for initial cluster centers
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* @param convergenceDelta
* the convergence delta value
* @param maxIterations
Expand All @@ -253,14 +246,13 @@ public static void run(Configuration conf,
* the fuzzification factor, see
* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
* @param runSequential if true run in sequential execution mode
*
*
* @return the Path of the final clusters directory
*/
public static Path buildClusters(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
int maxIterations,
float m,
Expand Down Expand Up @@ -293,28 +285,25 @@ public static Path buildClusters(Configuration conf,

/**
* Run the job using supplied arguments
*
*
* @param input
* the directory pathname for input points
* @param clustersIn
* the directory pathname for input clusters
* @param output
* the directory pathname for output points
* @param measure
* the classname of the DistanceMeasure
* the directory pathname for output points
* @param convergenceDelta
* the convergence delta value
* the convergence delta value
* @param emitMostLikely
* a boolean if true emit only most likely cluster for each point
* a boolean if true emit only most likely cluster for each point
* @param threshold
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
* @param runSequential if true run in sequential execution mode
*/
public static void clusterData(Configuration conf,
Path input,
Path clustersIn,
Path output,
DistanceMeasure measure,
double convergenceDelta,
float m,
boolean emitMostLikely,
Expand Down
Loading

0 comments on commit bd150f5

Please sign in to comment.