Skip to content

Commit

Permalink
Merge branch 'master' of ssh://github.com/tqchen/xgboost
Browse files Browse the repository at this point in the history
Conflicts:
	.gitignore
  • Loading branch information
tqchen committed Jan 26, 2015
2 parents c34367b + 97e058d commit 1f6b8eb
Show file tree
Hide file tree
Showing 175 changed files with 15,217 additions and 410 deletions.
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
*.slo
*.lo
*.o

*.page
# Compiled Dynamic libraries
*.so
*.dylib
Expand Down Expand Up @@ -45,5 +45,13 @@ Debug
*save
*csv
.Rproj.user
*.cpage.col
*.cpage
xgboost
xgboost.mpi
xgboost.mock
train*
rabit
.Rbuildignore
R-package.Rproj

6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,9 @@ xgboost-0.3
* Linear booster is now parallelized, using parallel coordinated descent.
* Add [Code Guide](src/README.md) for customizing objective function and evaluation
* Add R module

in progress version
=====
* Distributed version
* Feature importance visualization in R module, thanks to Michael Benesty
* Predict leaf index
64 changes: 49 additions & 15 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,65 +1,99 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -pthread -lm

export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -pedantic
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC

ifeq ($(no_omp),1)
CFLAGS += -DDISABLE_OPENMP
else
CFLAGS += -fopenmp
endif

# by default use c++11
ifeq ($(cxx11),1)
CFLAGS += -std=c++11
else
endif

# specify tensor path
BIN = xgboost
OBJ = updater.o gbm.o io.o
MOCKBIN = xgboost.mock
OBJ = updater.o gbm.o io.o main.o
MPIBIN = xgboost.mpi
SLIB = wrapper/libxgboostwrapper.so

.PHONY: clean all python Rpack
.PHONY: clean all mpi python Rpack

all: $(BIN) $(OBJ) $(SLIB)
all: $(BIN) $(OBJ) $(SLIB) $(MOCKBIN)
mpi: $(MPIBIN)

python: wrapper/libxgboostwrapper.so
# now the wrapper takes in two files. io and wrapper part
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*.h
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
xgboost.mpi: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mpi.a
xgboost.mock: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit_mock.a
xgboost: updater.o gbm.o io.o main.o subtree/rabit/lib/librabit.a
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o subtree/rabit/lib/librabit.a

# dependency on rabit
subtree/rabit/lib/librabit.a: subtree/rabit/src/engine.cc
cd subtree/rabit;make lib/librabit.a; cd ../..
subtree/rabit/lib/librabit_empty.a: subtree/rabit/src/engine_empty.cc
cd subtree/rabit;make lib/librabit_empty.a; cd ../..
subtree/rabit/lib/librabit_mock.a: subtree/rabit/src/engine_mock.cc
cd subtree/rabit;make lib/librabit_mock.a; cd ../..
subtree/rabit/lib/librabit_mpi.a: subtree/rabit/src/engine_mpi.cc
cd subtree/rabit;make lib/librabit_mpi.a; cd ../..

$(BIN) :
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

$(MOCKBIN) :
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)

$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) )

$(MPIOBJ) :
$(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )

$(MPIBIN) :
$(MPICXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS)

install:
cp -f -r $(BIN) $(INSTALL_PATH)

Rpack:
make clean
cd subtree/rabit;make clean;cd ..
rm -rf xgboost xgboost*.tar.gz
cp -r R-package xgboost
rm -rf xgboost/inst/examples/*.buffer
rm -rf xgboost/inst/examples/*.model
rm -rf xgboost/inst/examples/dump*
rm -rf xgboost/src/*.o xgboost/src/*.so xgboost/src/*.dll
rm -rf subtree/rabit/src/*.o
rm -rf xgboost/demo/*.model xgboost/demo/*.buffer xgboost/demo/*.txt
rm -rf xgboost/demo/runall.R
cp -r src xgboost/src/src
cp -r subtree xgboost/src/subtree
mkdir xgboost/src/wrapper
cp wrapper/xgboost_wrapper.h xgboost/src/wrapper
cp wrapper/xgboost_wrapper.cpp xgboost/src/wrapper
cp ./LICENSE xgboost
cat R-package/src/Makevars|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars
cat R-package/src/Makevars.win|sed '2s/.*/PKGROOT=./' > xgboost/src/Makevars.win
cp xgboost/src/Makevars xgboost/src/Makevars.win
R CMD build xgboost
rm -rf xgboost
R CMD check --as-cran xgboost*.tar.gz

clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
cd subtree/rabit; make clean; cd ..
4 changes: 2 additions & 2 deletions R-package/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Depends:
Imports:
Matrix (>= 1.1-0),
methods,
data.table (>= 1.9),
data.table (>= 1.9.4),
magrittr (>= 1.5),
stringr,
DiagrammeR
DiagrammeR
2 changes: 1 addition & 1 deletion R-package/NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by roxygen2 (4.1.0): do not edit by hand
# Generated by roxygen2 (4.0.1): do not edit by hand

export(getinfo)
export(setinfo)
Expand Down
9 changes: 7 additions & 2 deletions R-package/R/getinfo.xgb.DMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,15 @@ setMethod("getinfo", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name != "label" && name != "weight" && name != "base_margin") {
if (name != "label" && name != "weight" &&
name != "base_margin" && name != "nrow") {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
if (name != "nrow"){
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
} else {
ret <- xgb.numrow(object)
}
return(ret)
})

25 changes: 23 additions & 2 deletions R-package/R/predict.xgb.Booster.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,16 @@ setClass("xgb.Booster")
#' @param object Object of class "xgb.Boost"
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
Expand All @@ -25,7 +28,8 @@ setClass("xgb.Booster")
#' @export
#'
setMethod("predict", signature = "xgb.Booster",
definition = function(object, newdata, missing = NULL, outputmargin = FALSE, ntreelimit = NULL) {
definition = function(object, newdata, missing = NULL,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(newdata) != "xgb.DMatrix") {
if (is.null(missing)) {
newdata <- xgb.DMatrix(newdata)
Expand All @@ -40,7 +44,24 @@ setMethod("predict", signature = "xgb.Booster",
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(outputmargin), as.integer(ntreelimit), PACKAGE = "xgboost")
option = 0
if (outputmargin) {
option <- option + 1
}
if (predleaf) {
option <- option + 2
}
ret <- .Call("XGBoosterPredict_R", object, newdata, as.integer(option),
as.integer(ntreelimit), PACKAGE = "xgboost")
if (predleaf){
len <- getinfo(newdata, "nrow")
if (length(ret) == len){
ret <- matrix(ret,ncol = 1)
} else {
ret <- matrix(ret, ncol = len)
ret <- t(ret)
}
}
return(ret)
})

14 changes: 13 additions & 1 deletion R-package/R/slice.xgb.DMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ setMethod("slice", signature = "xgb.DMatrix",
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset,
PACKAGE = "xgboost")

attr_list <- attributes(object)
nr <- xgb.numrow(object)
len <- sapply(attr_list,length)
ind <- which(len==nr)
if (length(ind)>0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret,nms[i]) <- attr(object,nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
})
21 changes: 15 additions & 6 deletions R-package/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) {
}

# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
Expand Down Expand Up @@ -169,18 +169,27 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL) {
} else {
msg <- ""
}
if (prediction){
preds <- predict(booster,watchlist[[2]])
return(list(msg,preds))
}
return(msg)
}
}
#------------------------------------------
# helper functions for cross validation
#
xgb.cv.mknfold <- function(dall, nfold, param) {
if (nfold <= 1) {
stop("nfold must be bigger than 1")
}
randidx <- sample(1 : xgb.numrow(dall))
kstep <- length(randidx) / nfold
kstep <- length(randidx) %/% nfold
idset <- list()
for (i in 1:nfold) {
idset[[i]] <- randidx[ ((i-1) * kstep + 1) : min(i * kstep, length(randidx)) ]
for (i in 1:(nfold-1)) {
idset[[i]] = randidx[1:kstep]
randidx = setdiff(randidx,idset[[i]])
}
idset[[nfold]] = randidx
ret <- list()
for (k in 1:nfold) {
dtest <- slice(dall, idset[[k]])
Expand All @@ -193,7 +202,7 @@ xgb.cv.mknfold <- function(dall, nfold, param) {
dtrain <- slice(dall, didx)
bst <- xgb.Booster(param, list(dtrain, dtest))
watchlist = list(train=dtrain, test=dtest)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist)
ret[[k]] <- list(dtrain=dtrain, booster=bst, watchlist=watchlist, index=idset[[k]])
}
return (ret)
}
Expand Down
2 changes: 1 addition & 1 deletion R-package/R/xgb.DMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' indicating the data file.
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#
#' @param ... other information to pass to \code{info}.
#'
Expand Down
32 changes: 25 additions & 7 deletions R-package/R/xgb.cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
#' @param prediction A logical value indicating whether to return the prediction vector.
#' @param showsd \code{boolean}, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in corss validation,
#' when it is not specified, the evaluation metric is chosen according to objective function.
Expand All @@ -47,8 +50,6 @@
#' @param feval custimized evaluation function. Returns
#' \code{list(metric='metric-name', value='metric-value')} with given
#' prediction and dtrain,
#' @param missing Missing is only used when input is dense matrix, pick a float
# value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param verbose \code{boolean}, print the statistics during the process.
#' @param ... other parameters to pass to \code{params}.
#'
Expand All @@ -71,7 +72,8 @@
#' @export
#'
xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL,
showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, verbose = T,...) {
prediction = FALSE, showsd = TRUE, metrics=list(),
obj = NULL, feval = NULL, verbose = T,...) {
if (typeof(params) != "list") {
stop("xgb.cv: first argument params must be list")
}
Expand All @@ -90,13 +92,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
}

folds <- xgb.cv.mknfold(dtrain, nfold, params)
predictValues <- rep(0,xgb.numrow(dtrain))
history <- c()
for (i in 1:nrounds) {
msg <- list()
for (k in 1:nfold) {
fd <- folds[[k]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
succ <- xgb.iter.update(fd$booster, fd$dtrain, i - 1, obj)
if (!prediction){
msg[[k]] <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval) %>% str_split("\t") %>% .[[1]]
} else {
res <- xgb.iter.eval(fd$booster, fd$watchlist, i - 1, feval, prediction)
predictValues[fd$index] <- res[[2]]
msg[[k]] <- res[[1]] %>% str_split("\t") %>% .[[1]]
}
}
ret <- xgb.cv.aggcv(msg, showsd)
history <- c(history, ret)
Expand All @@ -115,5 +124,14 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
split <- str_split(string = history, pattern = "\t")

for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.list %>% {vec <- .; rbindlist(list(dt, vec), use.names = F, fill = F)}
dt
}

if (prediction) {
return(list(dt = dt,pred = predictValues))
}
return(dt)
}

# Avoid error messages during CRAN check.
# The reason is that these variables are never declared
# They are mainly column names inferred by Data.table...
globalVariables(".")
Loading

0 comments on commit 1f6b8eb

Please sign in to comment.