Skip to content

Commit df685c8

Browse files
committed
Added more algorithms and two possible optimization methods. Non-breaking changes described in NEWS.md
1 parent a78f405 commit df685c8

22 files changed

+1426
-190
lines changed

DESCRIPTION

Lines changed: 19 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,34 @@
11
Package: SmartML
2-
Type: Package
2+
Version: 0.3.0
33
Title: Machine Learning Automation
4-
Version: 0.1
5-
Authors@R: c(
6-
person("Mohamed", "Maher", email = "s-mohamed.zenhom@zewailcity.edu.eg", role = c("aut", "cre")),
7-
person("Sherif", "Sakr", email = "sherif.sakr@ut.ee", role = "aut"))
8-
Maintainer: Mohamed Maher <s-mohamed.zenhom@zewailcity.edu.eg>
4+
Authors@R:
5+
c(person(given = "Mohamed",
6+
family = "Maher",
7+
email = "s-mohamed.zenhom@zewailcity.edu.eg",
8+
role = c("aut", "cre")),
9+
person(given = "Sherif",
10+
family = "Sakr",
11+
email = "sherif.sakr@ut.ee",
12+
role = "aut"),
13+
person(given = "Bruno Rucy",
14+
family = "Carneiro Alves de Lima",
15+
email = "brurucy@protonmail.ch",
16+
role = "ctb"))
917
Description: This package is a meta-learning based framework for automated selection and hyper-parameter tuning for machine learning algorithms. Being meta-learning based, the framework is able to simulate the role of the machine learning expert. In particular, the framework is equipped with a continuously updated knowledge base that stores information about statistical meta features of all processed datasets along with the associated performance of the different classifiers and their tuned parameters. Thus, for any new dataset, SmartML automatically extracts its meta features and searches its knowledge base for the best performing algorithm to start its optimization process. In addition, SmartML makes use of the new runs to continuously enrich its knowledge base to improve its performance and robustness for future runs.
1018
License: GPL-3
1119
Encoding: UTF-8
1220
LazyData: false
1321
Imports:
14-
devtools,
15-
R.utils,
16-
stats,
17-
httr,
18-
UBL,
19-
imputeMissings,
20-
mice,
21-
RCurl,
22-
tictoc,
23-
e1071,
24-
mlbench,
25-
fastICA,
26-
RMySQL,
27-
BBmisc,
28-
rjson,
29-
ggplot2,
30-
RWeka,
31-
farff,
32-
pls,
33-
randomForest,
34-
FNN,
35-
klaR,
36-
rpart,
37-
ipred,
38-
C50,
39-
mda,
40-
MASS,
41-
nnet,
42-
deepboost,
43-
iml,
44-
datasets,
45-
xts
22+
devtools, R.utils, stats, httr, UBL, imputeMissings, mice, RCurl, tictoc, e1071, mlbench,
23+
fastICA, RMySQL, BBmisc, rjson, ggplot2, RWeka, farff, pls, purrr, truncnorm, tidyr, dplyr,
24+
xgboost, ranger, fastNaiveBayes, KernSmooth, LiblineaR, data.table, randomForest, FNN, klaR,
25+
rpart, ipred, C50, mda, MASS, nnet, deepboost, iml, datasets, xts
4626
Suggests:
4727
knitr,
4828
covr,
4929
testthat,
5030
rmarkdown
5131
Depends:
5232
caret
53-
RoxygenNote: 6.1.1
33+
RoxygenNote: 7.1.0
5434
VignetteBuilder: knitr

NAMESPACE

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(autoRLearn)
4+
export(autoRLearn_)
45
export(runClassifier)
56
import(RWeka)
67
import(caret)
78
import(devtools)
89
import(farff)
910
import(ggplot2)
1011
import(mice)
12+
import(purrr)
1113
import(rjson)
1214
importFrom(BBmisc,normalize)
1315
importFrom(C50,C5.0)
1416
importFrom(C50,C5.0Control)
1517
importFrom(FNN,knn)
18+
importFrom(KernSmooth,bkde)
19+
importFrom(KernSmooth,dpik)
20+
importFrom(LiblineaR,LiblineaR)
1621
importFrom(MASS,lda)
1722
importFrom(R.utils,withTimeout)
1823
importFrom(RCurl,getURL)
@@ -25,13 +30,27 @@ importFrom(RWeka,J48)
2530
importFrom(RWeka,LMT)
2631
importFrom(UBL,SmoteClassif)
2732
importFrom(caret,confusionMatrix)
33+
importFrom(caret,createDataPartition)
2834
importFrom(caret,plsda)
35+
importFrom(data.table,fcase)
2936
importFrom(deepboost,deepboost)
3037
importFrom(deepboost,deepboost.predict)
38+
importFrom(dplyr,arrange)
39+
importFrom(dplyr,case_when)
40+
importFrom(dplyr,distinct)
41+
importFrom(dplyr,filter)
42+
importFrom(dplyr,group_by)
43+
importFrom(dplyr,mutate)
44+
importFrom(dplyr,mutate_all)
45+
importFrom(dplyr,mutate_if)
46+
importFrom(dplyr,n)
47+
importFrom(dplyr,select)
48+
importFrom(dplyr,top_frac)
3149
importFrom(e1071,kurtosis)
3250
importFrom(e1071,naiveBayes)
3351
importFrom(e1071,skewness)
3452
importFrom(e1071,svm)
53+
importFrom(fastNaiveBayes,fnb.train)
3554
importFrom(graphics,plot)
3655
importFrom(httr,POST)
3756
importFrom(httr,content)
@@ -49,6 +68,7 @@ importFrom(mda,mars)
4968
importFrom(mda,polyreg)
5069
importFrom(nnet,nnet)
5170
importFrom(randomForest,randomForest)
71+
importFrom(ranger,ranger)
5272
importFrom(rjson,fromJSON)
5373
importFrom(rpart,rpart)
5474
importFrom(rpart,rpart.control)
@@ -62,5 +82,14 @@ importFrom(stats,setNames)
6282
importFrom(stats,var)
6383
importFrom(tictoc,tic)
6484
importFrom(tictoc,toc)
85+
importFrom(tidyr,drop_na)
86+
importFrom(tidyr,gather)
87+
importFrom(tidyr,separate)
88+
importFrom(tidyr,spread)
89+
importFrom(tidyr,unite)
90+
importFrom(truncnorm,dtruncnorm)
91+
importFrom(truncnorm,rtruncnorm)
6592
importFrom(utils,capture.output)
6693
importFrom(utils,read.csv)
94+
importFrom(xgboost,xgb.DMatrix)
95+
importFrom(xgboost,xgboost)

R/autoRLearn_.R

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
#' @title Advanced version of autoRLearn.
#'
#' @description Tunes the hyperparameters of the desired algorithm/s using either hyperband or BOHB.
#'
#' @param df_train Dataframe of the training dataset. Assumes it is in perfect shape with all numeric variables and factor response variable named "class".
#' @param df_test Dataframe of the test dataset. Assumes it is in perfect shape with all numeric variables and factor response variable named "class".
#' @param maxTime Float representing the maximum time the algorithm should be run (minutes; converted internally to seconds).
#' @param models List of strings denoting which algorithms to use for the process:
#' \itemize{
#' \item "randomForest" - Random forests using the randomForest package
#' \item "ranger" - Random forests using the ranger package (unstable)
#' \item "naiveBayes" - Naive Bayes using the fastNaiveBayes package
#' \item "boosting" - Gradient boosting using xgboost
#' \item "l2-linear-classifier" - Linear primal Support vector machine from LibLinear
#' \item "svm" - RBF kernel svm from e1071
#' }
#' @param optimizationAlgorithm String of which hyperparameter tuning algorithm to use:
#' \itemize{
#' \item "hyperband" - Hyperband with uniformly initiated parameters
#' \item "bohb" - Hyperband with bayesian optimization as described on F. Hutter et al 2018 paper BOHB. Has extra parameters bw and kde_type
#' }
#' @param bw (only applies to BOHB) Double representing how much the KDE bandwidth should be widened. Higher values allow the algorithm to explore more hyperparameter combinations
#' @param max_iter (affects both hyperband and BOHB) Integer representing the maximum number of iterations that one successive halving run can have
#' @param kde_type (only applies to BOHB) String representing whether a model's hyperparameters should be tuned individually of each other or have their probability densities multiplied:
#' \itemize{
#' \item "single" - each hyperparameter has its own expected improvement calculated
#' \item "mixed" - all hyperparameters' probability densities are multiplied and only one mixed expected improvement is calculated
#' }
#' @return List of Results
#' \itemize{
#' \item \code{perf} - accuracy of the best performing model on the test data
#' \item \code{pred} - prediction on the test data using the best model
#' \item \code{model} - best model object
#' \item \code{best_models} - table with the best hyperparameters found for the selected models.
#' }
#' @importFrom R.utils withTimeout
#' @importFrom tictoc tic toc
#' @export autoRLearn_
autoRLearn_ <- function(df_train, df_test, maxTime = 10,
                        models = c("randomForest", "naiveBayes", "boosting",
                                   "l2-linear-classifier", "svm"),
                        optimizationAlgorithm = "hyperband", bw = 3,
                        max_iter = 81, kde_type = "single") {

  # Fail fast on an unsupported tuner. (The original code built an
  # errorCondition() without signalling it and then hit `break` outside of
  # any loop, which is itself an error; stop() is the correct way to raise.)
  if (!optimizationAlgorithm %in% c("hyperband", "bohb")) {
    stop("Only hyperband and bohb are valid optimization algorithms at this moment.",
         call. = FALSE)
  }

  total_time <- maxTime * 60  # budget arrives in minutes; work in seconds

  # Split the time budget across models proportionally to how many
  # hyperparameters each exposes (looked up in the package-level `jsons`).
  parameters_per_model <- map_int(models, .f = ~ length(jsons[[.x]]$params))
  times <- (parameters_per_model * total_time) / sum(parameters_per_model)

  print("Time distribution:")
  print(times)
  print("Models selected:")
  print(models)

  # Run repeated hyperband/BOHB rounds for one model until its time slice is
  # spent. Returns a list of one-row data frames (best run per round), or
  # NULL when no round produced an answer.
  run_optimization <- function(model, time) {
    results <- NULL
    priors <- data.frame()  # BOHB carries previous runs forward as KDE priors

    tic(model, "optimization time:")

    start <- Sys.time() %>% as.integer()
    end <- start + time

    repeat {
      gc(verbose = FALSE)
      tic(paste("current", optimizationAlgorithm, "runtime"))
      print(paste("started", model))

      # Never hand the tuner a non-positive budget.
      time_left <- max(end - (Sys.time() %>% as.integer()), 1)
      print(paste("There are:", time_left, "seconds left for this",
                  optimizationAlgorithm, "run"))

      if (optimizationAlgorithm == "hyperband") {
        res <- hyperband(df = df_train, model = model,
                         max_iter = max_iter, maxtime = time_left)
      } else {
        res <- bohb(df = df_train, model = model, bw = bw,
                    max_iter = max_iter, maxtime = time_left,
                    priors = priors, kde_type = kde_type)
      }

      if (!is_empty(flatten(res))) {
        if (optimizationAlgorithm == "bohb") {
          # Feed every successive-halving run back in as priors for the KDEs.
          priors <- res %>% map_dfr(.f = ~ .x[["sh_runs"]])
        }

        # Keep only the best configuration found this round.
        res <- res %>%
          map_dfr(.f = ~ .x[["answer"]]) %>%
          arrange(desc(acc)) %>%
          head(1)

        results <- c(list(res), results)
        # (The original printed "hyperband" here even on the BOHB path.)
        print(paste("Best accuracy from", optimizationAlgorithm,
                    "this round:", res$acc))
      }

      if (((Sys.time() %>% as.integer()) - start) >= time) {
        break
      }
    }

    toc()
    results
  }

  ans <- vector(mode = "list", length = length(models))

  for (i in seq_along(models)) {
    tryCatch(
      expr = {
        ans[[i]] <- run_optimization(models[[i]], times[[i]])
      },
      error = function(e) {
        # A failing model forfeits its slot; continue with the rest.
        print("Error spotted, going to the next model")
      }
    )
  }

  # (Moved after the loop — the original announced completion before any
  # optimization had actually run.)
  print("Finished all optimizations.")

  # Keep the single best round per model, then rank models by accuracy.
  ans <- ans %>%
    map(.f = ~ map_dfr(.x = .x, .f = ~ .x %>% select(model, params, acc))) %>%
    map_dfr(.f = ~ .x %>% arrange(desc(acc)) %>% head(1)) %>%
    arrange(desc(acc))

  best_model <- ans %>% head(1)

  # Refit the winner on the full training data and score it on the test set.
  final_evaluation <- eval_loss(model = best_model[["model"]],
                                train_df = df_train, test_df = df_test,
                                params = best_model[["params"]])
  final_evaluation$best_models <- ans

  print(paste("Winner:", best_model$model,
              "test accuracy:", final_evaluation$perf))

  final_evaluation
}

0 commit comments

Comments
 (0)