Commit
version 3.4.6-3
joemsong authored and cran-robot committed Oct 5, 2016
1 parent 53beebe commit 5eae0fb
Showing 10 changed files with 244 additions and 57 deletions.
20 changes: 10 additions & 10 deletions DESCRIPTION
@@ -1,24 +1,24 @@
Package: Ckmeans.1d.dp
Type: Package
Version: 3.4.6-2
Date: 2016-09-25
Version: 3.4.6-3
Date: 2016-10-04
Title: Optimal and Fast Univariate k-Means Clustering
Authors@R: c(person("Joe", "Song", role = c("aut", "cre"),
email = "joemsong@cs.nmsu.edu"),
person("Haizhou", "Wang", role = "aut"))
Author: Joe Song [aut, cre], Haizhou Wang [aut]
Maintainer: Joe Song <joemsong@cs.nmsu.edu>
Description: A dynamic programming algorithm for optimal one-dimensional
k-means clustering. The algorithm minimizes the sum of squares of
within-cluster distances. As an alternative to heuristic k-means algorithms,
this method guarantees optimality and reproducibility. Its advantage in
efficiency and accuracy over k-means is increasingly pronounced as the
number of clusters k increases.
Description: A dynamic programming algorithm for optimal univariate k-means
clustering. Minimizing the sum of squares of within-cluster distances, the
algorithm guarantees optimality and reproducibility. Its advantage over
heuristic k-means algorithms in efficiency and accuracy is increasingly
pronounced as the number of clusters k increases. It provides an alternative
to heuristic k-means algorithms for univariate clustering.
License: LGPL (>= 3)
NeedsCompilation: yes
Suggests: testthat
Depends: R (>= 2.10.0)
LazyData: true
Packaged: 2016-09-25 19:50:13 UTC; joemsong
Packaged: 2016-10-05 03:19:39 UTC; joemsong
Repository: CRAN
Date/Publication: 2016-09-26 00:51:35
Date/Publication: 2016-10-05 09:29:38
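The revised Description claims optimality and reproducibility over heuristic k-means. A minimal sketch of what that difference looks like in practice (illustrative only, assuming the package is installed; not part of this commit):

library(Ckmeans.1d.dp)

set.seed(1)
x <- c(rnorm(50, sd=0.3), rnorm(50, mean=1, sd=0.3))

res.dp <- Ckmeans.1d.dp(x, 2)   # optimal partition, identical on every run
res.km <- kmeans(x, centers=2)  # heuristic, depends on the random start

# The optimal total within-cluster sum of squares is never worse:
res.dp$tot.withinss <= res.km$tot.withinss  # TRUE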
16 changes: 9 additions & 7 deletions MD5
@@ -1,11 +1,13 @@
0e0fda746fd3b2d3f8f15bf58e29322e *DESCRIPTION
787647a684ca7485170df7528cab08f2 *NAMESPACE
0d03ec89982b5283c907357ad793bb85 *NEWS
657ef2e93cfa5b9317ccdb1ee38e1a63 *R/Ckmeans.1d.dp.R
86daa6e9d5094b51d8330c94169fc094 *DESCRIPTION
ec13f712d0b4de25fce333839f245322 *NAMESPACE
abf3af4b69e0b0276ef5bbbf3f5c464a *NEWS
c6537261e0b3d27d460bdbf8c9b6bd1c *R/Ckmeans.1d.dp.R
f0f54fcabc88536341f3d6cf90a1b095 *R/visualize.R
235c2a4e13b77aec40c03c3748848ff2 *inst/CITATION
a8eb03a0c0df529e0956c99a68a8b013 *man/Ckmeans.1d.dp-package.Rd
ac5228dfac8bc73bf4ee557b5f5c3e57 *man/Ckmeans.1d.dp.Rd
089624d96433df2c64044cf4b1ce9657 *man/ahist.Rd
0b11da238df6f0230400514f88c45fd1 *man/Ckmeans.1d.dp-package.Rd
fe195b8f3c1cb4570135f35a3f13fcf8 *man/Ckmeans.1d.dp.Rd
ae8237321bd311770afba9a52a7e105c *man/ahist.Rd
f34cdffd6b3af79589d4ac2c36b2681e *man/plot.Ckmeans.1d.dp.Rd
c71309b404b58d441411946a44a78557 *man/print.Ckmeans.1d.dp.Rd
bce4a1d748d943de93cc14d57aa9a862 *src/Ckmeans.1d.dp.cpp
39ea771032788c1d8107f62e6660c2eb *src/Ckmeans.1d.dp.h
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -3,4 +3,6 @@ useDynLib(Ckmeans.1d.dp)
export(Ckmeans.1d.dp)
export(print.Ckmeans.1d.dp)
export(ahist)
export(plot.Ckmeans.1d.dp)
S3method(print, Ckmeans.1d.dp)
S3method(plot, Ckmeans.1d.dp)
21 changes: 20 additions & 1 deletion NEWS
@@ -1,10 +1,29 @@
NEWS

Version 3.4.6-3
Changes
2016-10-03
1. Updated examples in the Ckmeans.1d.dp() manual.

Changes
2016-10-01
1. Added a new function plot() to visualize the clusters.

2016-09-30
1. Updated examples of ahist().
2. Added sticks to ahist() to show the original input data.

2016-09-27
1. Fixed ahist() for the case where only a single bin is detected.
2. Made ahist() run much faster than the previous version.
3. Updated previous examples and added more examples to better
   illustrate the use of ahist().

Version 3.4.6-2
Changes
2016-09-25
1. Introduced a new function ahist() to generate adaptive histograms
corresponding to the optimal univariate k-means clustering
corresponding to the optimal univariate k-means clustering.

Version 3.4.6-1
Changes
24 changes: 4 additions & 20 deletions R/Ckmeans.1d.dp.R
@@ -11,25 +11,6 @@
# May 17, 2016. MS
# September 25, 2016. MS. Introduced function ahist()

ahist <- function(
x, k=c(1,9), plot = TRUE, xlab = deparse(substitute(x)),
main = paste("Adaptive histogram of", deparse(substitute(x))),
...)
# adaptive histogram
{
xd <- Ckmeans.1d.dp(x, k=k)$cluster
breaks <- sapply(1:(max(xd)-1), function(l) (max(x[xd==l]) + min(x[xd==l+1]))/2)
h <- graphics::hist(x, breaks=c(min(x), breaks, max(x)), plot=FALSE,
warn.unused=FALSE, ...)
h$xname <- deparse(substitute(x))
if(plot) {
graphics::plot(h, main=main, xlab=xlab, ...)
invisible(h)
} else {
return(h)
}
}

## print method for Ckmeans.1d.dp object
print.Ckmeans.1d.dp <- function(x, ...)
{
@@ -67,6 +48,8 @@ Ckmeans.1d.dp <- function( x, k=c(1,9), y=1 )# y=rep(1, length(x)))
{
if(is.null(k)) {
k <- 1: min( 9, length(x) )
} else {
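    # round any fractional k up and coerce to an integer number of clusters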
k <- as.integer(ceiling(k))
}

if(length(k) > 1) {
Expand Down Expand Up @@ -135,7 +118,8 @@ Ckmeans.1d.dp <- function( x, k=c(1,9), y=1 )# y=rep(1, length(x)))
betweenss <- totss - tot.withinss
r <- structure(list(cluster = result$cluster, centers = result$centers[1:k.opt],
withinss = result$withinss[1:k.opt], size = result$size[1:k.opt],
totss = totss, tot.withinss = tot.withinss, betweenss = betweenss),
totss = totss, tot.withinss = tot.withinss, betweenss = betweenss,
xname=deparse(substitute(x)), yname=deparse(substitute(y))),
class = "Ckmeans.1d.dp")

return( r )
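The new xname and yname fields record the expressions passed as x and y, which the plot method introduced in this commit uses to recover the original data by name. A small sketch (hypothetical data; note that x must be a named variable so the method can find it again):

x <- c(rnorm(40, sd=0.2), rnorm(40, mean=2, sd=0.2))
res <- Ckmeans.1d.dp(x, k=2)
res$xname   # "x", captured via deparse(substitute(x)) at call time
plot(res)   # plot.Ckmeans.1d.dp looks up "x" to draw the clustered points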
89 changes: 89 additions & 0 deletions R/visualize.R
@@ -0,0 +1,89 @@
# visualize.R -- visualization functions for Ckmeans.1d.dp
#
# Joe Song
# Created: Oct 1, 2016

ahist <- function(
x, k = c(1,9), plot = TRUE, xlab = deparse(substitute(x)),
main = paste("Adaptive histogram of", deparse(substitute(x))),
col = NULL, lwd = graphics::par("lwd"), col.stick = "gray", lwd.stick = 1,
...)
# adaptive histogram
{
xs <- sort(x)
r <- Ckmeans.1d.dp(xs, k=k)
kopt <- length(r$size)
breaks <- vector("double", length=kopt+1)
i <- r$size[1]
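  # Each interior break is the midpoint between the last point of one cluster
  # and the first point of the next; clusters occupy contiguous runs of the
  # sorted data, so running cluster sizes give the boundary indices.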

if(kopt > 1) {
for(q in 2:kopt) {
breaks[q] <- (xs[i] + xs[i+1])/2
i <- i + r$size[q]
}
}

breaks[1] <- xs[1]
breaks[kopt+1] <- xs[length(xs)]

h <- graphics::hist(x, breaks=breaks, plot=FALSE,
warn.unused=FALSE, ...)
h$xname <- deparse(substitute(x))

if(plot) {
opar <- graphics::par(lwd=lwd)
graphics::plot(h, main=main, xlab=xlab, col=col, ...)
if(h$equidist) {
graphics::segments(x, -max(h$counts)/10, x, 0,
col=col.stick, lwd=lwd.stick)
} else {
graphics::segments(x, -max(h$density)/10, x, 0,
col=col.stick, lwd=lwd.stick)
}
graphics::par(opar)
invisible(h)
} else {
return(h)
}
}
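A quick usage sketch of the rewritten ahist() (hypothetical data, not from the commit):

set.seed(2)
v <- c(rnorm(60, sd=0.2), rnorm(60, mean=4, sd=0.3))
h <- ahist(v, col="lightblue")  # bins adapt to the two clusters; sticks mark the raw points
h$breaks                        # the interior break falls midway between the clusters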

plot.Ckmeans.1d.dp <-
function(x, xlab=NULL, ylab=NULL, main=NULL,
sub=NULL, col.clusters=NULL, ...)
{
ck <- x
if(is.null(xlab)) xlab <- ck$xname
if(is.null(ylab)) ylab <- ifelse(ck$yname=="1", "Weight", ck$yname)
if(is.null(main)) main <- paste("Optimal k-means clustering of", ck$xname)
if(is.null(sub)) sub <- paste("n =", length(ck$cluster))
if(is.null(col.clusters)) col.clusters <- seq_along(x$size)
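  # Recover the original data and weights by the names recorded at fit time;
  # fall back to evaluating the recorded expression when no numeric variable
  # of that name is visible.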

if(exists(ck$xname, mode="numeric")) {
x <- get(ck$xname, mode="numeric")
} else {
x <- eval(parse(text=ck$xname))
}

if(exists(ck$yname, mode="numeric")) {
y <- get(ck$yname, mode="numeric")
} else {
y <- eval(parse(text=ck$yname))
}

if(length(y) == 1) {
y <- rep(y, length(x))
}

graphics::plot(x, y, type="p",
xlab=xlab, ylab=ylab, main=main, sub=sub,
col=col.clusters[ck$cluster],
...)

ks <- seq_along(ck$size)
sapply(ks, function(q) {
graphics::segments(x[ck$cluster == q], 0,
x[ck$cluster == q], y[ck$cluster == q],
col=col.clusters[q], ...) } )

invisible(ck)
}
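A short usage sketch of the new S3 plot method (illustrative colors, not from the commit):

x <- c(rnorm(30), rnorm(30, mean=3))
res <- Ckmeans.1d.dp(x, k=2)
plot(res, col.clusters=c("steelblue", "tomato"))  # dispatched via S3method(plot, Ckmeans.1d.dp)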
2 changes: 1 addition & 1 deletion man/Ckmeans.1d.dp-package.Rd
@@ -19,7 +19,7 @@
\tabular{ll}{
Package: \tab Ckmeans.1d.dp\cr
Type: \tab Package\cr
Version: \tab 3.4.6-2\cr
Version: \tab 3.4.6-3\cr
Initial version: \tab 1.0\cr
Initial date: \tab 2010-10-26\cr
License: \tab LGPL (>= 3) \cr
15 changes: 9 additions & 6 deletions man/Ckmeans.1d.dp.Rd
@@ -54,22 +54,24 @@

\examples{
# Ex. 1 The number of clusters is provided.
# Generate data from a Gaussian mixture model of two components
x <- c(rnorm(50, sd=0.3), rnorm(50, mean=1, sd=0.3))
# Divide x into 2 clusters
k <- 2
# Generate data from a Gaussian mixture model of three components
x <- c(rnorm(50, sd=0.3), rnorm(50, mean=1, sd=0.3), rnorm(50, mean=2, sd=0.3))
# Divide x into 3 clusters
k <- 3
result <- Ckmeans.1d.dp(x, k)
plot(result)
plot(x, col=result$cluster, pch=result$cluster, cex=1.5,
main="Optimal k-means clustering given k",
sub=paste("Number of clusters given:", k))
abline(h=result$centers, col=1:k, lty="dashed", lwd=2)
legend("bottom", paste("Cluster", 1:k), col=1:k, pch=1:k, cex=1.5, bty="n")

# Ex. 2 The number of clusters is determined by Bayesian information criterion
# Generate data from a Gaussian mixture model of two components
x <- c(rnorm(50, mean=-1, sd=0.3), rnorm(50, mean=1, sd=1))
# Generate data from a Gaussian mixture model of three components
x <- c(rnorm(50, mean=-1, sd=0.3), rnorm(50, mean=1, sd=1), rnorm(50, mean=2, sd=0.4))
# Divide x into k clusters, k automatically selected (default: 1~9)
result <- Ckmeans.1d.dp(x)
plot(result)
k <- max(result$cluster)
plot(x, col=result$cluster, pch=result$cluster, cex=1.5,
main="Optimal k-means clustering with k estimated",
@@ -87,6 +89,7 @@ y <- c(y1, y2)

w <- y^8
res <- Ckmeans.1d.dp(t, k=c(1:10), w)
plot(res)
plot(t, w, main = "Time course segmentation", col=res$cluster,
pch=res$cluster, xlab="Time t", ylab="Transformed intensity w", type="h")
abline(v=res$centers, col="chocolate", lty="dashed")
53 changes: 41 additions & 12 deletions man/ahist.Rd
@@ -2,32 +2,42 @@
\alias{ahist}
\title{Adaptive Histograms by Optimal Univariate k-Means Clustering}
\description{
Generate histograms that are adaptive to patterns in data. The number and widths of histogram bins are automatically adjusted to the data and thus the bins are unlikely of equal width. The breaks are computed from optimal univariate k-means clusters of the input data.
Generate histograms that are adaptive to patterns in data. The number and widths of histogram bins are automatically adjusted to the data and thus the bins are unlikely to be of equal width. The breaks are computed from optimal univariate \var{k}-means clusters of the input data.
}
\usage{
ahist(x, k=c(1,9), plot = TRUE, xlab = deparse(substitute(x)),
main = paste("Adaptive histogram of", deparse(substitute(x))),
col = NULL, lwd = graphics::par("lwd"), col.stick = "gray", lwd.stick = 1,
\dots)
}
\arguments{
\item{x}{a numeric vector of data. All \code{NA} elements must be removed from \code{x} before calling this function. The function will run faster on sorted \code{x} (in non-decreasing order) than on an unsorted input.}

\item{k}{either an exact integer number of clusters, or a vector of length two specifying the minimum and maximum numbers of clusters to be examined. The default is \code{c(1,9)}. When \code{k} is a range, the actual number of clusters is determined by Bayesian information criterion.}
\item{k}{either an exact integer number of bins/clusters, or a vector of length two specifying the minimum and maximum numbers of bins/clusters to be examined. The default is \code{c(1,9)}. When \code{k} is a range, the actual number of clusters is determined by Bayesian information criterion.}

\item{plot}{a logical. If \code{TRUE}, the histogram will be plotted.}

\item{xlab}{a character string of the x-axis lab for the plot}
\item{xlab}{a character string. The x-axis label for the plot.}

\item{main}{a character string of the title for the plot}
\item{...}{additional arguments passed to \code{\link{hist}} and \code{\link{plot.histogram}}.}
\item{main}{a character string. The title for the plot.}

\item{col}{a character string. The fill color of the histogram bars.}

\item{lwd}{a numeric value. The line width of the border of the histogram bars.}

\item{col.stick}{a character string. The color of the sticks above the x-axis. See Details.}

\item{lwd.stick}{a numeric value. The line width of the sticks above the x-axis. See Details.}

\item{...}{additional arguments to be passed to \code{\link{hist}} or \code{\link{plot.histogram}}.}
}

\author{
Joe Song
}

\details{
The number of histogram bins is the optimal number of univariate k-means clusters estimated using Bayesian information criterion evaluated on Gaussian mixture models fitted to the input data in \code{x}. Breaks in the histogram are the midpoints between clusters identified by optimal univariate k-means (\code{\link{Ckmeans.1d.dp}}). The histogram is by default plotted using the \code{\link{plot.histogram}} method. The plot can be optionally disabled with the \code{plot=FALSE} argument.
The number of histogram bins is the optimal number of univariate k-means clusters estimated using Bayesian information criterion evaluated on Gaussian mixture models fitted to the input data in \code{x}. Breaks in the histogram are the midpoints between clusters identified by optimal univariate k-means (\code{\link{Ckmeans.1d.dp}}). The histogram is by default plotted using the \code{\link{plot.histogram}} method. The plot can be optionally disabled with the \code{plot=FALSE} argument. The original input data are shown as sticks just above the x-axis.
}

\value{
@@ -43,10 +53,29 @@
}

\examples{
# Example: plot an optimal histogram from a Gaussian mixture
# model with three components
x <- c(rnorm(40, mean=-1, sd=0.3),
rnorm(25, mean=1, sd=0.3),
rnorm(60, mean=3, sd=0.2))
ahist(x, col="seagreen")
# Example 1: plot an adaptive histogram from data generated by
# a Gaussian mixture model with three components using a given
# number of bins
x <- c(rnorm(40, mean=-2, sd=0.3),
rnorm(45, mean=1, sd=0.1),
rnorm(70, mean=3, sd=0.2))
ahist(x, k=9, col="lavender", col.stick="salmon",
sub=paste("n =", length(x)), lwd=2,
main="Example 1. Gaussian mixture model with 3 components\n(on average 3 bins per component)")

# Example 2: plot an adaptive histogram from data generated by
# a Gaussian mixture model with three components
ahist(x, col="lightblue", sub=paste("n =", length(x)),
col.stick="salmon", lwd=2,
main="Example 2. Gaussian mixture model with 3 components\n(one bin per component)")

# Example 3: The DNase data frame has 176 rows and 3 columns of
# data obtained during development of an ELISA assay for the
# recombinant protein DNase in rat serum.

data(DNase)
ahist(DNase$density, col="gray", col.stick="salmon",
sub=paste("n =", length(x)), lwd=2,
xlab="Optical density of protein DNase",
main="Example 3. Elisa assay of DNase in rat serum")
}