[R-package] Add more examples and bank dataset #887

Merged
merged 3 commits into from Sep 4, 2017

Changes from 1 commit

Add bank dataset and more examples
Laurae2 committed Sep 3, 2017
commit c688045e7dfce88183439e49707cdd9ad623451d
21 changes: 21 additions & 0 deletions R-package/R/lightgbm.R
@@ -96,6 +96,27 @@ NULL
#' rows and 126 variables
NULL

#' Bank Marketing Data Set
#'
#' This data set is originally from the Bank Marketing data set,
#' UCI Machine Learning Repository.
#'
#' It contains only bank.csv, with 10% of the examples (4521 rows) and 17 inputs,
#' randomly selected from bank-full.csv (an older version of this data set with
#' fewer inputs).
#'
#' @references
#' http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
#'
#' S. Moro, P. Cortez and P. Rita (2014).
#' A Data-Driven Approach to Predict the Success of Bank Telemarketing.
#' Decision Support Systems.
#'
#' @docType data
#' @keywords datasets
#' @name bank
#' @usage data(bank)
#' @format A data.table with 4521 rows and 17 variables
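#'
#' @examples
#' # A minimal sketch of loading the bundled data set
#' # (see the package demos for full modelling examples)
#' \dontrun{
#' data(bank, package = "lightgbm")
#' str(bank)
#' }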
NULL

# Various imports
#' @import methods
#' @importFrom R6 R6Class
Binary file added R-package/data/bank.rda
Binary file not shown.
5 changes: 4 additions & 1 deletion R-package/demo/00Index
@@ -1,7 +1,10 @@
 basic_walkthrough	Basic feature walkthrough
 boost_from_prediction	Boosting from existing prediction
-early_stopping	Early Stop in training
+categorical_features_prepare	Categorical Feature Preparation
+categorical_features_rules	Categorical Feature Preparation with Rules
 cross_validation	Cross Validation
+early_stopping	Early Stop in training
+efficient_many_training	Efficiency for Many Model Trainings
 multiclass	Multiclass training/prediction
 leaf_stability	Leaf (in)Stability example
 weight_param	Weight-Parameter adjustment relationship
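
Each 00Index entry maps a demo name to its title. Assuming the package is
installed, a demo from this list can then be run from R with, e.g.:

demo("categorical_features_prepare", package = "lightgbm")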
81 changes: 81 additions & 0 deletions R-package/demo/categorical_features_prepare.R
@@ -0,0 +1,81 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# We must now transform the data for LightGBM
# For this task, we use lgb.prepare
# The function converts character and factor columns to numeric codes
# (factor levels taken in alphabetical order), the format LightGBM can ingest
#
# After the call, str(bank) reports the former character columns
# (job, marital, education, default, housing, loan, contact, month, poutcome, y)
# as num, while the integer columns are unchanged
bank <- lgb.prepare(data = bank)
str(bank)

# Subtract 1 from the label: the binary objective expects labels in {0, 1}
bank$y <- bank$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data <- as.matrix(bank[, 1:16, with = FALSE])
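
# Sketch of a quick shape check: 16 feature columns, label excluded
dim(my_data)  # expected: 4521 16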

# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(data = my_data,
                        label = bank$y,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
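
# Quick sanity-check sketch: the categorical indices above should name
# exactly the former character columns
colnames(my_data)[c(2, 3, 4, 5, 7, 8, 9, 11, 16)]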

# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2),
                   lgb_data,
                   100,
                   valids = list(train = lgb_data))

# Try to find "split_feature": 2 in the dumped model
# If it is there, a categorical feature was used in the first tree
lgb.dump(model, num_iteration = 1)
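
# A follow-up sketch (assuming the model trained above): score the training
# matrix and look at the first few predicted probabilities
preds <- predict(model, my_data)
head(preds)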
91 changes: 91 additions & 0 deletions R-package/demo/categorical_features_rules.R
@@ -0,0 +1,91 @@
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# We split the data set in two: a training set and a validation set
bank_train <- bank[1:4000, ]
bank_test <- bank[4001:4521, ]
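
# Sketch of a quick shape check on the split
nrow(bank_train)  # expected: 4000
nrow(bank_test)   # expected: 521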

# We must now transform the data for LightGBM
# For this task, we use lgb.prepare_rules
# The function converts character and factor columns to numeric codes and also
# returns the conversion rules, so the exact same mapping can be reapplied to
# new data (here, the validation set)
#
# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
# $ age : int 53 36 58 26 34 55 55 34 41 38 ...
# $ job : num 1 10 10 9 10 2 2 3 3 4 ...
# $ marital : num 1 2 1 3 3 2 2 2 1 1 ...
# $ education: num 2 2 2 2 2 1 2 3 2 2 ...
# $ default : num 1 1 1 1 1 1 1 1 1 1 ...
# $ balance : int 26 191 -123 -147 179 1086 471 105 1588 70 ...
# $ housing : num 2 1 1 1 1 2 2 2 2 1 ...
# $ loan : num 1 1 1 1 1 1 1 1 2 1 ...
# $ contact : num 1 1 1 3 1 1 3 3 3 1 ...
# $ day : int 7 31 5 4 19 6 30 28 20 27 ...
# $ month : num 9 2 2 7 2 9 9 9 7 11 ...
# $ duration : int 56 69 131 95 294 146 58 249 10 255 ...
# $ campaign : int 1 1 2 2 3 1 2 2 8 3 ...
# $ pdays : int 359 -1 -1 -1 -1 272 -1 -1 -1 148 ...
# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
# $ y : num 1 1 1 1 1 1 1 1 1 2 ...
bank_rules <- lgb.prepare_rules(data = bank_train)
bank_train <- bank_rules$data
bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
str(bank_test)
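
# A small sketch: the rules returned by lgb.prepare_rules map each original
# character value to its numeric code, so the same encoding can be reapplied
# to any future chunk of data
str(bank_rules$rules)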

# Subtract 1 from the labels: the binary objective expects labels in {0, 1}
bank_train$y <- bank_train$y - 1
bank_test$y <- bank_test$y - 1

# Data input to LightGBM must be a matrix, without the label
my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])

# Creating the LightGBM datasets
# The categorical features are passed later, through the lgb.train parameter
# list, so they do not have to be repeated for each dataset
dtrain <- lgb.Dataset(data = my_data_train,
                      label = bank_train$y)
dtest <- lgb.Dataset(data = my_data_test,
                     label = bank_test$y)

# We can now train a model
model <- lgb.train(list(objective = "binary",
                        metric = "l2",
                        min_data = 1,
                        learning_rate = 0.1,
                        min_hessian = 1,
                        max_depth = 2,
                        categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)),
                   dtrain,
                   100,
                   valids = list(train = dtrain, valid = dtest))

# Try to find "split_feature": 11 in the dumped model
# If it is there, a categorical feature was used in the first tree
lgb.dump(model, num_iteration = 1)
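
# A follow-up sketch (assuming the objects above): score the held-out rows and
# compare the predicted probabilities against the 0/1 labels
preds <- predict(model, my_data_test)
head(cbind(prob = preds, label = bank_test$y))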