diff --git a/DESCRIPTION b/DESCRIPTION index 91f8501..922bd65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,24 +1,24 @@ Package: Ckmeans.1d.dp Type: Package -Version: 3.4.6-2 -Date: 2016-09-25 +Version: 3.4.6-3 +Date: 2016-10-04 Title: Optimal and Fast Univariate k-Means Clustering Authors@R: c(person("Joe", "Song", role = c("aut", "cre"), email = "joemsong@cs.nmsu.edu"), person("Haizhou", "Wang", role = "aut")) Author: Joe Song [aut, cre], Haizhou Wang [aut] Maintainer: Joe Song -Description: A dynamic programming algorithm for optimal one-dimensional - k-means clustering. The algorithm minimizes the sum of squares of - within-cluster distances. As an alternative to heuristic k-means algorithms, - this method guarantees optimality and reproducibility. Its advantage in - efficiency and accuracy over k-means is increasingly pronounced as the - number of clusters k increases. +Description: A dynamic programming algorithm for optimal univariate k-means + clustering. Minimizing the sum of squares of within-cluster distances, the + algorithm guarantees optimality and reproducibility. Its advantage over + heuristic k-means algorithms in efficiency and accuracy is increasingly + pronounced as the number of clusters k increases. It provides an alternative + to heuristic k-means algorithms for univariate clustering. 
License: LGPL (>= 3) NeedsCompilation: yes Suggests: testthat Depends: R (>= 2.10.0) LazyData: true -Packaged: 2016-09-25 19:50:13 UTC; joemsong +Packaged: 2016-10-05 03:19:39 UTC; joemsong Repository: CRAN -Date/Publication: 2016-09-26 00:51:35 +Date/Publication: 2016-10-05 09:29:38 diff --git a/MD5 b/MD5 index 87f74aa..dd1787c 100644 --- a/MD5 +++ b/MD5 @@ -1,11 +1,13 @@ -0e0fda746fd3b2d3f8f15bf58e29322e *DESCRIPTION -787647a684ca7485170df7528cab08f2 *NAMESPACE -0d03ec89982b5283c907357ad793bb85 *NEWS -657ef2e93cfa5b9317ccdb1ee38e1a63 *R/Ckmeans.1d.dp.R +86daa6e9d5094b51d8330c94169fc094 *DESCRIPTION +ec13f712d0b4de25fce333839f245322 *NAMESPACE +abf3af4b69e0b0276ef5bbbf3f5c464a *NEWS +c6537261e0b3d27d460bdbf8c9b6bd1c *R/Ckmeans.1d.dp.R +f0f54fcabc88536341f3d6cf90a1b095 *R/visualize.R 235c2a4e13b77aec40c03c3748848ff2 *inst/CITATION -a8eb03a0c0df529e0956c99a68a8b013 *man/Ckmeans.1d.dp-package.Rd -ac5228dfac8bc73bf4ee557b5f5c3e57 *man/Ckmeans.1d.dp.Rd -089624d96433df2c64044cf4b1ce9657 *man/ahist.Rd +0b11da238df6f0230400514f88c45fd1 *man/Ckmeans.1d.dp-package.Rd +fe195b8f3c1cb4570135f35a3f13fcf8 *man/Ckmeans.1d.dp.Rd +ae8237321bd311770afba9a52a7e105c *man/ahist.Rd +f34cdffd6b3af79589d4ac2c36b2681e *man/plot.Ckmeans.1d.dp.Rd c71309b404b58d441411946a44a78557 *man/print.Ckmeans.1d.dp.Rd bce4a1d748d943de93cc14d57aa9a862 *src/Ckmeans.1d.dp.cpp 39ea771032788c1d8107f62e6660c2eb *src/Ckmeans.1d.dp.h diff --git a/NAMESPACE b/NAMESPACE index 106c914..10745dd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,4 +3,6 @@ useDynLib(Ckmeans.1d.dp) export(Ckmeans.1d.dp) export(print.Ckmeans.1d.dp) export(ahist) +export(plot.Ckmeans.1d.dp) S3method(print, Ckmeans.1d.dp) +S3method(plot, Ckmeans.1d.dp) diff --git a/NEWS b/NEWS index b276914..fbfb834 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,29 @@ NEWS +Version 3.4.6-3 + Changes + 2016-10-03 + 1. Updated examples in Ckmeans.1d.dp() manual + + Changes + 2016-10-01 + 1. Added a new function plot() to visualize the clusters. + + 2016-09-30 + 1. 
Updated examples of ahist(). + 2. Added sticks to ahist() to show the original input data. + + 2016-09-27 + 1. Fixed ahist() when there is only a single bin detected. + 2. Made ahist() run much faster than the previous version. + 3. Updated previous examples and added more examples to illustrate + the use of ahist() better. + Version 3.4.6-2 Changes 2016-09-25 1. Introduced a new function ahist() to generate adaptive histograms - corresponding to the optimal univariate k-means clustering + corresponding to the optimal univariate k-means clustering. Version 3.4.6-1 Changes diff --git a/R/Ckmeans.1d.dp.R b/R/Ckmeans.1d.dp.R index b206386..91c1096 100644 --- a/R/Ckmeans.1d.dp.R +++ b/R/Ckmeans.1d.dp.R @@ -11,25 +11,6 @@ # May 17, 2016. MS # September 25, 2016. MS. Introduced function ahist() -ahist <- function( - x, k=c(1,9), plot = TRUE, xlab = deparse(substitute(x)), - main = paste("Adaptive histogram of", deparse(substitute(x))), - ...) - # adaptive histogram -{ - xd <- Ckmeans.1d.dp(x, k=k)$cluster - breaks <- sapply(1:(max(xd)-1), function(l) (max(x[xd==l]) + min(x[xd==l+1]))/2) - h <- graphics::hist(x, breaks=c(min(x), breaks, max(x)), plot=FALSE, - warn.unused=FALSE, ...) - h$xname <- deparse(substitute(x)) - if(plot) { - graphics::plot(h, main=main, xlab=xlab, ...) - invisible(h) - } else { - return(h) - } -} - ## print method for Ckmeans.1d.dp object print.Ckmeans.1d.dp <- function(x, ...) 
{ @@ -67,6 +48,8 @@ Ckmeans.1d.dp <- function( x, k=c(1,9), y=1 )# y=rep(1, length(x))) { if(is.null(k)) { k <- 1: min( 9, length(x) ) + } else { + k <- as.integer(ceiling(k)) } if(length(k) > 1) { @@ -135,7 +118,8 @@ Ckmeans.1d.dp <- function( x, k=c(1,9), y=1 )# y=rep(1, length(x))) betweenss <- totss - tot.withinss r <- structure(list(cluster = result$cluster, centers = result$centers[1:k.opt], withinss = result$withinss[1:k.opt], size = result$size[1:k.opt], - totss = totss, tot.withinss = tot.withinss, betweenss = betweenss), + totss = totss, tot.withinss = tot.withinss, betweenss = betweenss, + xname=deparse(substitute(x)), yname=deparse(substitute(y))), class = "Ckmeans.1d.dp") return( r ) diff --git a/R/visualize.R b/R/visualize.R new file mode 100644 index 0000000..bb8a7be --- /dev/null +++ b/R/visualize.R @@ -0,0 +1,89 @@ +# visualize.R -- visualization functions for Ckmeans.1d.dp +# +# Joe Song +# Created: Oct 1, 2016 + +ahist <- function( + x, k = c(1,9), plot = TRUE, xlab = deparse(substitute(x)), + main = paste("Adaptive histogram of", deparse(substitute(x))), + col = NULL, lwd = graphics::par("lwd"), col.stick = "gray", lwd.stick = 1, + ...) + # adaptive histogram +{ + xs <- sort(x) + r <- Ckmeans.1d.dp(xs, k=k) + kopt <- length(r$size) + breaks <- vector("double", length=kopt+1) + i <- r$size[1] + + if(kopt > 1) { + for(q in 2:kopt) { + breaks[q] <- (xs[i] + xs[i+1])/2 + i <- i + r$size[q] + } + } + + breaks[1] <- xs[1] + breaks[kopt+1] <- xs[length(xs)] + + h <- graphics::hist(x, breaks=breaks, plot=FALSE, + warn.unused=FALSE, ...) + h$xname <- deparse(substitute(x)) + + if(plot) { + opar <- graphics::par(lwd=lwd) + graphics::plot(h, main=main, xlab=xlab, col=col, ...) 
+ if(h$equidist) { + graphics::segments(x, -max(h$count)/10, x, 0, + col=col.stick, lwd=lwd.stick) + } else { + graphics::segments(x, -max(h$density)/10, x, 0, + col=col.stick, lwd=lwd.stick) + } + graphics::par(opar) + invisible(h) + } else { + return(h) + } +} + +plot.Ckmeans.1d.dp <- + function(x, xlab=NULL, ylab=NULL, main=NULL, + sub=NULL, col.clusters=NULL, ...) + { + ck <- x + if(is.null(xlab)) xlab <- ck$xname + if(is.null(ylab)) ylab <- ifelse(ck$yname=="1", "Weight", ck$yname) + if(is.null(main)) main <- paste("Optimal k-means clustering of", ck$xname) + if(is.null(sub)) sub=paste("n =", length(ck$cluster)) + if(is.null(col.clusters)) col.clusters <- seq_along(x$size) + + if(exists(ck$xname, mode="numeric")) { + x <- get(ck$xname, mode="numeric") + } else { + x <- eval(parse(text=ck$xname)) + } + + if(exists(ck$yname, mode="numeric")) { + y <- get(ck$yname, mode="numeric") + } else { + y <- eval(parse(text=ck$yname)) + } + + if(length(y) == 1) { + y <- rep(y, length(x)) + } + + graphics::plot(x, y, type="p", + xlab=xlab, ylab=ylab, main=main, sub=sub, + col=col.clusters[ck$cluster], + ...) + + ks <- seq_along(ck$size) + sapply(ks, function(q) { + graphics::segments(x[ck$cluster == q], 0, + x[ck$cluster == q], y[ck$cluster == q], + col=col.clusters[q], ...) } ) + + invisible(ck) + } diff --git a/man/Ckmeans.1d.dp-package.Rd b/man/Ckmeans.1d.dp-package.Rd index 2ceee99..baa86e5 100644 --- a/man/Ckmeans.1d.dp-package.Rd +++ b/man/Ckmeans.1d.dp-package.Rd @@ -19,7 +19,7 @@ Richard Bellman (1973) first described a general dynamic programming strategy fo \tabular{ll}{ Package: \tab Ckmeans.1d.dp\cr Type: \tab Package\cr -Version: \tab 3.4.6-2\cr +Version: \tab 3.4.6-3\cr Initial version: \tab 1.0\cr Initial date: \tab 2010-10-26\cr License: \tab LGPL (>= 3) \cr diff --git a/man/Ckmeans.1d.dp.Rd b/man/Ckmeans.1d.dp.Rd index d6edc09..c2e236b 100644 --- a/man/Ckmeans.1d.dp.Rd +++ b/man/Ckmeans.1d.dp.Rd @@ -54,11 +54,12 @@ Wang, H. and Song, M. 
(2011) Ckmeans.1d.dp: optimal \var{k}-means clustering in \examples{ # Ex. 1 The number of clusters is provided. -# Generate data from a Gaussian mixture model of two components -x <- c(rnorm(50, sd=0.3), rnorm(50, mean=1, sd=0.3)) -# Divide x into 2 clusters -k <- 2 +# Generate data from a Gaussian mixture model of three components +x <- c(rnorm(50, sd=0.3), rnorm(50, mean=1, sd=0.3), rnorm(50, mean=2, sd=0.3)) +# Divide x into 3 clusters +k <- 3 result <- Ckmeans.1d.dp(x, k) +plot(result) plot(x, col=result$cluster, pch=result$cluster, cex=1.5, main="Optimal k-means clustering given k", sub=paste("Number of clusters given:", k)) @@ -66,10 +67,11 @@ abline(h=result$centers, col=1:k, lty="dashed", lwd=2) legend("bottom", paste("Cluster", 1:k), col=1:k, pch=1:k, cex=1.5, bty="n") # Ex. 2 The number of clusters is determined by Bayesian information criterion -# Generate data from a Gaussian mixture model of two components -x <- c(rnorm(50, mean=-1, sd=0.3), rnorm(50, mean=1, sd=1)) +# Generate data from a Gaussian mixture model of three components +x <- c(rnorm(50, mean=-1, sd=0.3), rnorm(50, mean=1, sd=1), rnorm(50, mean=2, sd=0.4)) # Divide x into k clusters, k automatically selected (default: 1~9) result <- Ckmeans.1d.dp(x) +plot(result) k <- max(result$cluster) plot(x, col=result$cluster, pch=result$cluster, cex=1.5, main="Optimal k-means clustering with k estimated", @@ -87,6 +89,7 @@ y <- c(y1, y2) w <- y^8 res <- Ckmeans.1d.dp(t, k=c(1:10), w) +plot(res) plot(t, w, main = "Time course segmentation", col=res$cluster, pch=res$cluster, xlab="Time t", ylab="Transformed intensity w", type="h") abline(v=res$centers, col="chocolate", lty="dashed") diff --git a/man/ahist.Rd b/man/ahist.Rd index 10c1c9e..d43deaf 100644 --- a/man/ahist.Rd +++ b/man/ahist.Rd @@ -2,24 +2,34 @@ \alias{ahist} \title{Adaptive Histograms by Optimal Univariate k-Means Clustering} \description{ -Generate histograms that are adaptive to patterns in data. 
The number and widths of histogram bins are automatically adjusted to the data and thus the bins are unlikely of equal width. The breaks are computed from optimal univariate k-means clusters of the input data. +Generate histograms that are adaptive to patterns in data. The number and widths of histogram bins are automatically adjusted to the data and thus the bins are unlikely of equal width. The breaks are computed from optimal univariate \var{k}-means clusters of the input data. } \usage{ ahist(x, k=c(1,9), plot = TRUE, xlab = deparse(substitute(x)), main = paste("Adaptive histogram of", deparse(substitute(x))), + col = NULL, lwd = graphics::par("lwd"), col.stick = "gray", lwd.stick = 1, \dots) } \arguments{ \item{x}{a numeric vector of data. All \code{NA} elements must be removed from \code{x} before calling this function. The function will run faster on sorted \code{x} (in non-decreasing order) than an unsorted input.} - \item{k}{either an exact integer number of clusters, or a vector of length two specifying the minimum and maximum numbers of clusters to be examined. The default is \code{c(1,9)}. When \code{k} is a range, the actual number of clusters is determined by Bayesian information criterion.} + \item{k}{either an exact integer number of bins/clusters, or a vector of length two specifying the minimum and maximum numbers of bins/clusters to be examined. The default is \code{c(1,9)}. When \code{k} is a range, the actual number of clusters is determined by Bayesian information criterion.} \item{plot}{a logical. If \code{TRUE}, the histogram will be plotted.} - \item{xlab}{a character string of the x-axis lab for the plot} + \item{xlab}{a character string. The x-axis label for the plot.} - \item{main}{a character string of the title for the plot} - \item{...}{additional arguments passed to \code{\link{hist}} and \code{\link{plot.histogram}}.} + \item{main}{a character string. The title for the plot.} + + \item{col}{a character string. 
The fill color of the histogram bars.} + + \item{lwd}{a numeric value. The line width of the border of the histogram bars} + + \item{col.stick}{a character string. The color of the sticks above the x-axis. See Details.} + + \item{lwd.stick}{a numeric value. The line width of the sticks above the x-axis. See Details.} + + \item{...}{additional arguments to be passed to \code{\link{hist}} or \code{\link{plot.histogram}}.} } \author{ @@ -27,7 +37,7 @@ ahist(x, k=c(1,9), plot = TRUE, xlab = deparse(substitute(x)), } \details{ -The number of histogram bins is the optimal number of univariate k-means clusters estimated using Bayesian information criterion evaluated on Gaussian mixture models fitted to the input data in \code{x}. Breaks in the histogram are the midpoints between clusters identified by optimal univariate k-means (\code{\link{Ckmeans.1d.dp}}). The histogram is by default plotted using the \code{\link{plot.histogram}} method. The plot can be optionally disabled with the \code{plot=FALSE} argument. +The number of histogram bins is the optimal number of univariate k-means clusters estimated using Bayesian information criterion evaluated on Gaussian mixture models fitted to the input data in \code{x}. Breaks in the histogram are the midpoints between clusters identified by optimal univariate k-means (\code{\link{Ckmeans.1d.dp}}). The histogram is by default plotted using the \code{\link{plot.histogram}} method. The plot can be optionally disabled with the \code{plot=FALSE} argument. The original input data are shown as sticks just above the x-axis. } \value{ @@ -43,10 +53,29 @@ Wang, H. and Song, M. 
(2011) Ckmeans.1d.dp: optimal \var{k}-means clustering in } \examples{ -# Example: plot an optimal histogram from a Gaussian mixture -# model with three components -x <- c(rnorm(40, mean=-1, sd=0.3), - rnorm(25, mean=1, sd=0.3), - rnorm(60, mean=3, sd=0.2)) -ahist(x, col="seagreen") +# Example 1: plot an adaptive histogram from data generated by +# a Gaussian mixture model with three components using a given +# number of bins +x <- c(rnorm(40, mean=-2, sd=0.3), + rnorm(45, mean=1, sd=0.1), + rnorm(70, mean=3, sd=0.2)) +ahist(x, k=9, col="lavender", col.stick="salmon", + sub=paste("n =", length(x)), lwd=2, + main="Example 1. Gaussian mixture model with 3 components\n(on average 3 bins per component)") + +# Example 2: plot an adaptive histogram from data generated by +# a Gaussian mixture model with three components +ahist(x, col="lightblue", sub=paste("n =", length(x)), + col.stick="salmon", lwd=2, + main="Example 2. Gaussian mixture model with 3 components\n(one bin per component)") + +# Example 3: The DNase data frame has 176 rows and 3 columns of +# data obtained during development of an ELISA assay for the +# recombinant protein DNase in rat serum. + +data(DNase) +ahist(DNase$density, col="gray", col.stick="salmon", + sub=paste("n =", length(DNase$density)), lwd=2, + xlab="Optical density of protein DNase", + main="Example 3. Elisa assay of DNase in rat serum") } diff --git a/man/plot.Ckmeans.1d.dp.Rd b/man/plot.Ckmeans.1d.dp.Rd new file mode 100644 index 0000000..4aa7613 --- /dev/null +++ b/man/plot.Ckmeans.1d.dp.Rd @@ -0,0 +1,59 @@ +\name{plot.Ckmeans.1d.dp} +\alias{plot.Ckmeans.1d.dp} +\title{Plot Optimal Univariate \var{k}-Means Clustering Results} +\description{ +Plot optimal univariate clustering results returned from \code{Ckmeans.1d.dp}. 
+} +\usage{ +\method{plot}{Ckmeans.1d.dp}(x, xlab=NULL, ylab=NULL, main=NULL, + sub=NULL, col.clusters=NULL, \dots) +%(x, xlab=ck$xname, +% ylab=ifelse(x$yname=="1", "Weight", x$yname), +% main="Optimal k-means clustering of", x$xname), +% sub=paste("n =", length(x$cluster)), +% col.clusters=seq_along(x$size), +% \dots) +} +\arguments{ + \item{x}{an object of class \code{Ckmeans.1d.dp} returned by \code{\link{Ckmeans.1d.dp}}.} + + \item{xlab}{a character string. The x-axis label for the plot.} + + \item{ylab}{a character string. The y-axis label for the plot.} + + \item{main}{a character string. The title for the plot.} + + \item{sub}{a character string. The subtitle for the plot.} + + \item{col.clusters}{a vector of colors, defined either by integers or by color names. If the length is shorter than the number of clusters, the colors will be reused.} + + \item{...}{arguments passed to \code{\link{plot}} function in package \pkg{graphics}.} +} + +\author{ + Joe Song +} + +\details{ +The function visualizes the input data as sticks whose heights are the weights. It uses different colors to indicate optimal \var{k}-means clusters. +} + +\value{ + An object of class "\code{Ckmeans.1d.dp}" defined in \code{\link{Ckmeans.1d.dp}}. +} + +\references{ + Wang, H. and Song, M. (2011) Ckmeans.1d.dp: optimal \var{k}-means clustering in one dimension by dynamic programming. \emph{The R Journal} \bold{3}(2), 29--33. Retrieved from \url{http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf} +} + +\examples{ +# Example: clustering data generated from a Gaussian mixture model of two components +x <- rnorm(50, mean=-1, sd=0.3) +x <- append(x, rnorm(50, mean=1, sd=0.3) ) +res <- Ckmeans.1d.dp(x) +plot(res) + +y <- (rnorm(length(x)))^2 +res <- Ckmeans.1d.dp(x, y=y) +plot(res) +}