R package (microsoft#168)

* finish R's c_api * clean code * fix sizeof pointer in 32bit system. * add predictor class * add Dataset class * format code * add booster * add type check for expose function * add a simple callback * add all callbacks * finish the basic training logic * update docs * add an simple training interface * add basic test * adapt the changes in c_api * add test for Dataset * add test for custom obj/eval functions * fix python test * fix bug in metadata init * fix R CMD check
Alnusjaponica · Jan 8, 2017 · 551d59c · 551d59c
1 parent acbd4f3
commit 551d59c
Show file tree

Hide file tree

Showing 46 changed files with 4,348 additions and 4 deletions.
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
@@ -0,0 +1,37 @@
+Package: lightgbm
+Type: Package
+Title: Light Gradient Boosting Machine
+Version: 0.1
+Date: 2016-12-29
+Author: Guolin Ke <guolin.ke@microsoft.com>
+Maintainer: Guolin Ke <guolin.ke@microsoft.com>
+Description: LightGBM is a gradient boosting framework that uses tree-based learning algorithms.
+    It is designed to be distributed and efficient, with the following advantages:
+        1. Faster training speed and higher efficiency.
+        2. Lower memory usage.
+        3. Better accuracy.
+        4. Parallel learning supported.
+        5. Capable of handling large-scale data.
+License: The MIT License (MIT) | file LICENSE
+URL: https://github.com/Microsoft/LightGBM
+BugReports: https://github.com/Microsoft/LightGBM/issues
+VignetteBuilder: knitr
+Suggests:
+    knitr,
+    rmarkdown,
+    ggplot2 (>= 1.0.1),
+    DiagrammeR (>= 0.8.1),
+    Ckmeans.1d.dp (>= 3.3.1),
+    vcd (>= 1.3),
+    testthat,
+    igraph (>= 1.0.1),
+    methods,
+    data.table (>= 1.9.6),
+    magrittr (>= 1.5),
+    stringi (>= 0.5.2)
+Depends:
+    R (>= 3.0),
+    R6
+Imports:
+    Matrix (>= 1.1-0)
+RoxygenNote: 5.0.1
diff --git a/R-package/LICENSE b/R-package/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) Microsoft Corporation 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
@@ -0,0 +1,26 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method("dimnames<-",lgb.Dataset)
+S3method(dim,lgb.Dataset)
+S3method(dimnames,lgb.Dataset)
+S3method(getinfo,lgb.Dataset)
+S3method(predict,lgb.Booster)
+S3method(setinfo,lgb.Dataset)
+S3method(slice,lgb.Dataset)
+export(getinfo)
+export(lgb.Dataset)
+export(lgb.Dataset.construct)
+export(lgb.Dataset.create.valid)
+export(lgb.Dataset.save)
+export(lgb.Dataset.set.categorical)
+export(lgb.Dataset.set.reference)
+export(lgb.dump)
+export(lgb.get.eval.result)
+export(lgb.load)
+export(lgb.save)
+export(lgb.train)
+export(lightgbm)
+export(setinfo)
+export(slice)
+importFrom(R6,R6Class)
+useDynLib(lightgbm)
diff --git a/R-package/R/callback.R b/R-package/R/callback.R
@@ -0,0 +1,249 @@
+# Shared state container exposing the fields that the callbacks in this file
+# read and write through their `env` argument.
+# NOTE(review): presumably one instance is created per training run and passed
+# to every callback -- confirm against the training loop.
+CB_ENV <- R6Class(
+  classname = "lgb.cb_env",
+  cloneable = FALSE,
+  public = list(
+    # booster object; callbacks call reset_parameter() on it and store
+    # record_evals / best_iter in it
+    model = NULL,
+    # current iteration and the first / last iteration of the run
+    iteration = NULL,
+    begin_iteration = NULL,
+    end_iteration = NULL,
+    # evaluation results for the current iteration; entries are read via
+    # $data_name, $name, $value and $higher_better
+    eval_list = list(),
+    # optional error terms, indexed in parallel with eval_list
+    eval_err_list = list(),
+    # written by cb.early.stop when the stopping condition is met
+    best_iter = -1,
+    met_early_stop = FALSE
+  )
+)
+
+# Create a pre-iteration callback that resets booster parameters each round.
+#
+# new_params: a named list. Every element must be either
+#   * a numeric or character vector holding one value per boosting round, or
+#   * a function of two arguments, invoked as f(i, nrounds) where i is the
+#     zero-based offset of the current iteration from env$begin_iteration.
+# Dots in the names are treated as underscores when checking for parameters
+# that must not change during boosting.
+#
+# Returns the callback closure, tagged with the attributes 'call',
+# 'is_pre_iteration' (TRUE) and 'name'. The callback forwards the values for
+# the current round to env$model$reset_parameter().
+cb.reset.parameters <- function(new_params) {
+  if (typeof(new_params) != "list")
+    stop("'new_params' must be a list")
+  pnames <- gsub("\\.", "_", names(new_params))
+  nrounds <- NULL
+
+  # Validate 'new_params' against the environment on the first invocation.
+  init <- function(env) {
+    n_rounds <- env$end_iteration - env$begin_iteration + 1
+
+    if (is.null(env$model))
+      stop("Env should has 'model'")
+
+    # Some parameters are not allowed to be changed,
+    # since changing them would wreak havoc on the booster
+    not_allowed <- pnames %in%
+      c('num_class', 'metric', 'boosting_type')
+    if (any(not_allowed))
+      stop('Parameters ', paste(pnames[not_allowed], collapse = ', '),
+           " cannot be changed during boosting.")
+
+    # Look values up by position: 'pnames' holds the normalised names, which
+    # do not match names(new_params) when the caller used dots.
+    for (k in seq_along(new_params)) {
+      n <- pnames[k]
+      p <- new_params[[k]]
+      if (is.function(p)) {
+        if (length(formals(p)) != 2)
+          stop("Parameter '", n, "' is a function but not of two arguments")
+      } else if (is.numeric(p) || is.character(p)) {
+        if (length(p) != n_rounds)
+          stop("Length of '", n, "' has to be equal to 'nrounds'")
+      } else {
+        stop("Parameter '", n, "' is not a function or a vector")
+      }
+    }
+
+    # Publish only after every check passed, so a failed validation is
+    # repeated (and reported again) on the next call instead of being skipped.
+    nrounds <<- n_rounds
+  }
+
+  callback <- function(env) {
+    if (is.null(nrounds))
+      init(env)
+
+    # Zero-based offset of the current round within the run.
+    i <- env$iteration - env$begin_iteration
+    pars <- lapply(new_params, function(p) {
+      if (is.function(p))
+        return(p(i, nrounds))
+      # R vectors are one-based: round offset 0 uses the first value.
+      p[i + 1]
+    })
+    # to-do check pars
+    if (!is.null(env$model)) {
+      env$model$reset_parameter(pars)
+    }
+  }
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'is_pre_iteration') <- TRUE
+  attr(callback, 'name') <- 'cb.reset.parameters'
+  return(callback)
+}
+
+# Render one evaluation result as "<data_name>'s <name>:<value>"; when an
+# error term is supplied it is appended as "+<eval_err>".
+# Stops with an error if 'eval_res' is NULL or empty.
+format.eval.string <- function(eval_res, eval_err=NULL) {
+  # length(NULL) is 0, so a single test covers both the NULL and empty case
+  if (length(eval_res) == 0)
+    stop('no evaluation results')
+  if (is.null(eval_err))
+    return(sprintf('%s\'s %s:%g', eval_res$data_name, eval_res$name, eval_res$value))
+  sprintf('%s\'s %s:%g+%g', eval_res$data_name, eval_res$name, eval_res$value, eval_err)
+}
+
+# Build the one-line report for the current iteration: "[<iteration>]:"
+# followed by every formatted evaluation result, separated by tabs.
+# Returns "" when env$eval_list is empty.
+merge.eval.string <- function(env){
+  n_eval <- length(env$eval_list)
+  if (n_eval <= 0) {
+    return("")
+  }
+  header <- sprintf('[%d]:', env$iteration)
+  # error terms are only attached when env$eval_err_list is non-empty
+  has_err <- length(env$eval_err_list) > 0
+  pieces <- lapply(seq_len(n_eval), function(j) {
+    err <- if (has_err) env$eval_err_list[[j]] else NULL
+    format.eval.string(env$eval_list[[j]], err)
+  })
+  paste0(c(header, unlist(pieces)), collapse='\t')
+}
+
+# Create a callback that prints the merged evaluation string.
+#
+# period: print when (iteration - 1) is a multiple of 'period', and always on
+#   the first and last iteration. A non-positive period disables printing.
+#
+# Returns the callback closure, tagged with 'call' and 'name' attributes.
+cb.print.evaluation <- function(period=1){
+  callback <- function(env){
+    if (period > 0) {
+      i <- env$iteration
+      # scalar condition: use short-circuit '||' rather than elementwise '|'
+      should_print <- (i - 1) %% period == 0 ||
+        i == env$begin_iteration ||
+        i == env$end_iteration
+      if (should_print) {
+        cat(merge.eval.string(env), "\n")
+      }
+    }
+  }
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'name') <- 'cb.print.evaluation'
+  return(callback)
+}
+
+# Create a callback that appends every evaluation value (and error term, when
+# present) of the current iteration to env$model$record_evals, keyed as
+# record_evals[[data_name]][[name]]$eval / $eval_err.
+# The first recorded iteration also stores env$begin_iteration as 'start_iter'.
+cb.record.evaluation <- function() {
+  callback <- function(env){
+    n_eval <- length(env$eval_list)
+    if (n_eval <= 0) return()
+    has_err <- length(env$eval_err_list) > 0
+
+    # Nothing recorded yet: create the empty slots for every metric
+    if (length(env$model$record_evals) == 0) {
+      env$model$record_evals$start_iter <- env$begin_iteration
+      for (res in env$eval_list) {
+        dn <- res$data_name
+        mn <- res$name
+        if (is.null(env$model$record_evals[[dn]])) {
+          env$model$record_evals[[dn]] <- list()
+        }
+        env$model$record_evals[[dn]][[mn]] <- list()
+        env$model$record_evals[[dn]][[mn]]$eval <- list()
+        env$model$record_evals[[dn]][[mn]]$eval_err <- list()
+      }
+    }
+
+    # Append this iteration's values
+    for (k in seq_len(n_eval)) {
+      res <- env$eval_list[[k]]
+      err <- if (has_err) env$eval_err_list[[k]] else NULL
+      dn <- res$data_name
+      mn <- res$name
+      slot <- env$model$record_evals[[dn]][[mn]]
+      env$model$record_evals[[dn]][[mn]]$eval <- c(slot$eval, res$value)
+      env$model$record_evals[[dn]][[mn]]$eval_err <- c(slot$eval_err, err)
+    }
+
+  }
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'name') <- 'cb.record.evaluation'
+  return(callback)
+}
+
+# Create a callback that signals early stopping.
+#
+# stopping_rounds: stop once some metric in env$eval_list has gone this many
+#   iterations without improving on its best value.
+# verbose: when TRUE, print a notice at start-up and the best iteration's
+#   evaluation line when stopping is triggered.
+#
+# When triggered, the callback sets env$best_iter and env$met_early_stop, and
+# env$model$best_iter if a model is present.
+# Returns the callback closure, tagged with 'call' and 'name' attributes.
+cb.early.stop <- function(stopping_rounds, verbose=TRUE) {
+  # state variables
+  factor_to_bigger_better <- NULL
+  best_iter <- NULL
+  best_score <- NULL
+  best_msg <- NULL
+  eval_len <- NULL
+  init <- function(env) {
+    n_eval <- length(env$eval_list)
+    if (n_eval == 0)
+      stop("For early stopping, valids must have at least one element")
+
+    if (verbose)
+      cat("Will train until hasn't improved in ",
+          stopping_rounds, " rounds.\n\n", sep = '')
+
+    # Scores are multiplied by -1 for lower-is-better metrics so that
+    # "bigger is better" holds for every tracked score.
+    factor_to_bigger_better <<- rep(1.0, n_eval)
+    best_iter <<- rep(-1, n_eval)
+    best_score <<- rep(-Inf, n_eval)
+    best_msg <<- as.list(rep("", n_eval))
+    for (i in seq_len(n_eval)) {
+      if (!env$eval_list[[i]]$higher_better) {
+        factor_to_bigger_better[i] <<- -1.0
+      }
+    }
+    # Mark initialisation as done only after it succeeded, so a failed init
+    # raises the same clear error again on the next call.
+    eval_len <<- n_eval
+  }
+
+  # 'finalize' is accepted but not used by this implementation.
+  callback <- function(env, finalize = FALSE) {
+    if (is.null(eval_len))
+      init(env)
+    cur_iter <- env$iteration
+    for (i in seq_len(eval_len)) {
+      score <- env$eval_list[[i]]$value * factor_to_bigger_better[i]
+      if (score > best_score[i]) {
+        best_score[i] <<- score
+        best_iter[i] <<- cur_iter
+        if (verbose) {
+          best_msg[[i]] <<- as.character(merge.eval.string(env))
+        }
+      } else if (cur_iter - best_iter[i] >= stopping_rounds) {
+        if (!is.null(env$model)) {
+          env$model$best_iter <- best_iter[i]
+        }
+        if (verbose) {
+          cat('Early stopping, best iteration is:', "\n")
+          cat(best_msg[[i]], "\n")
+        }
+        env$best_iter <- best_iter[i]
+        env$met_early_stop <- TRUE
+        # One stalled metric is enough: stop scanning so the notice is
+        # printed once rather than once per stalled metric.
+        break
+      }
+    }
+  }
+  attr(callback, 'call') <- match.call()
+  attr(callback, 'name') <- 'cb.early.stop'
+  return(callback)
+}
+
+# Collect the 'name' attribute of each callback in cb_list into one vector
+callback.names <- function(cb_list) {
+  cb_names <- lapply(cb_list, attr, 'name')
+  unlist(cb_names)
+}
+
+# Append callback 'cb' to 'cb_list', name every entry after its 'name'
+# attribute, and move 'cb.early.stop' and then 'cb.cv.predict' (when present)
+# to the end of the list.
+add.cb <- function(cb_list, cb) {
+  cb_list <- c(cb_list, cb)
+  names(cb_list) <- callback.names(cb_list)
+  for (late in c('cb.early.stop', 'cb.cv.predict')) {
+    # only the first entry carrying this name is relocated
+    pos <- match(late, names(cb_list))
+    if (!is.na(pos)) {
+      cb_list <- c(cb_list[-pos], cb_list[pos])
+    }
+  }
+  cb_list
+}
+
+# Split cb_list into callbacks flagged with a true 'is_pre_iteration'
+# attribute (pre_iter) and all remaining callbacks (post_iter).
+categorize.callbacks <- function(cb_list) {
+  runs_before <- function(cb) {
+    flag <- attr(cb, 'is_pre_iteration')
+    !is.null(flag) && flag
+  }
+  list(
+    pre_iter = Filter(runs_before, cb_list),
+    post_iter = Filter(Negate(runs_before), cb_list)
+  )
+}