Skip to content

Commit 45da194

Browse files
authored
[ML] take training_percent into account when estimating memory (#1111)
This change adjusts the memory estimation by taking the parameter `training_percent` into account. It is an optional parameter that defaults to `100.0`. The Java side will write out the parameter as it is input by the user.
1 parent 67a7ced commit 45da194

File tree

4 files changed

+82
-1
lines changed

4 files changed

+82
-1
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141

4242
* Reduce CPU scheduling priority of native analysis processes to favor the ES JVM
4343
when CPU is constrained. (See {ml-pull}1109[#1109].)
44+
* Take `training_percent` into account when estimating memory usage for classification and regression.
45+
(See {ml-pull}1111[#1111].)
4446

4547
== {es} version 7.7.0
4648

include/api/CDataFrameTrainBoostedTreeRunner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
5050
static const std::string MAX_OPTIMIZATION_ROUNDS_PER_HYPERPARAMETER;
5151
static const std::string BAYESIAN_OPTIMISATION_RESTARTS;
5252
static const std::string NUM_TOP_FEATURE_IMPORTANCE_VALUES;
53+
static const std::string TRAINING_PERCENT_FIELD_NAME;
5354

5455
//Output
5556
static const std::string IS_TRAINING_FIELD_NAME;
@@ -115,6 +116,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
115116

116117
std::string m_DependentVariableFieldName;
117118
std::string m_PredictionFieldName;
119+
double m_TrainingPercent;
118120
TBoostedTreeFactoryUPtr m_BoostedTreeFactory;
119121
TBoostedTreeUPtr m_BoostedTree;
120122
CDataFrameTrainBoostedTreeInstrumentation m_Instrumentation;

lib/api/CDataFrameTrainBoostedTreeRunner.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ const CDataFrameAnalysisConfigReader& CDataFrameTrainBoostedTreeRunner::paramete
6161
CDataFrameAnalysisConfigReader::E_OptionalParameter);
6262
theReader.addParameter(NUM_TOP_FEATURE_IMPORTANCE_VALUES,
6363
CDataFrameAnalysisConfigReader::E_OptionalParameter);
64+
theReader.addParameter(TRAINING_PERCENT_FIELD_NAME,
65+
CDataFrameAnalysisConfigReader::E_OptionalParameter);
6466
return theReader;
6567
}()};
6668
return PARAMETER_READER;
@@ -77,6 +79,7 @@ CDataFrameTrainBoostedTreeRunner::CDataFrameTrainBoostedTreeRunner(
7779
m_PredictionFieldName = parameters[PREDICTION_FIELD_NAME].fallback(
7880
m_DependentVariableFieldName + "_prediction");
7981

82+
m_TrainingPercent = parameters[TRAINING_PERCENT_FIELD_NAME].fallback(100.0) / 100.0;
8083
std::size_t downsampleRowsPerFeature{
8184
parameters[DOWNSAMPLE_ROWS_PER_FEATURE].fallback(std::size_t{0})};
8285
double downsampleFactor{parameters[DOWNSAMPLE_FACTOR].fallback(-1.0)};
@@ -290,7 +293,9 @@ std::size_t CDataFrameTrainBoostedTreeRunner::estimateBookkeepingMemoryUsage(
290293
std::size_t totalNumberRows,
291294
std::size_t /*partitionNumberRows*/,
292295
std::size_t numberColumns) const {
293-
return m_BoostedTreeFactory->estimateMemoryUsage(totalNumberRows, numberColumns);
296+
return m_BoostedTreeFactory->estimateMemoryUsage(
297+
static_cast<std::size_t>(static_cast<double>(totalNumberRows) * m_TrainingPercent + 0.5),
298+
numberColumns);
294299
}
295300

296301
const CDataFrameAnalysisInstrumentation&
@@ -305,6 +310,7 @@ CDataFrameAnalysisInstrumentation& CDataFrameTrainBoostedTreeRunner::instrumenta
305310
// clang-format off
306311
const std::string CDataFrameTrainBoostedTreeRunner::DEPENDENT_VARIABLE_NAME{"dependent_variable"};
307312
const std::string CDataFrameTrainBoostedTreeRunner::PREDICTION_FIELD_NAME{"prediction_field_name"};
313+
const std::string CDataFrameTrainBoostedTreeRunner::TRAINING_PERCENT_FIELD_NAME{"training_percent"};
308314
const std::string CDataFrameTrainBoostedTreeRunner::DOWNSAMPLE_ROWS_PER_FEATURE{"downsample_rows_per_feature"};
309315
const std::string CDataFrameTrainBoostedTreeRunner::DOWNSAMPLE_FACTOR{"downsample_factor"};
310316
const std::string CDataFrameTrainBoostedTreeRunner::ALPHA{"alpha"};

lib/maths/unittest/CBoostedTreeTest.cc

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <algorithm>
2929
#include <fstream>
3030
#include <functional>
31+
#include <limits>
3132
#include <memory>
3233
#include <streambuf>
3334
#include <utility>
@@ -1221,6 +1222,76 @@ BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrain) {
12211222
}
12221223
}
12231224

1225+
BOOST_AUTO_TEST_CASE(testEstimateMemoryUsedByTrainWithTestRows) {
1226+
1227+
// Test estimation of the memory used training a model when some rows have
// a missing target value. For an increasing number of such rows this asserts
// that the estimate computed from the remaining row count stays above the
// measured high water mark and is smaller than the previous estimate.
1228+
1229+
test::CRandomNumbers rng;
1230+
1231+
std::size_t rows{1000};
1232+
std::size_t cols{6};
1233+
std::size_t capacity{600};
1234+
// Start from the largest value so the first "estimate decreased" check passes.
std::int64_t previousEstimatedMemory{std::numeric_limits<std::int64_t>::max()};
1235+
1236+
for (std::size_t test = 0; test < 3; ++test) {
1237+
TDoubleVecVec x(cols - 1);
1238+
// 100, 200 and 300 rows with a missing target for the three iterations.
std::size_t numTestRows{((test + 1) * 100)};
1239+
for (std::size_t i = 0; i < cols - 1; ++i) {
1240+
rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
1241+
}
1242+
1243+
// The target for row i is the sum of the samples generated for that row.
auto target = [&](std::size_t i) {
1244+
double result{0.0};
1245+
for (std::size_t j = 0; j < cols - 1; ++j) {
1246+
result += x[j][i];
1247+
}
1248+
return result;
1249+
};
1250+
1251+
auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
1252+
frame->categoricalColumns(TBoolVec{true, false, false, false, false, false});
1253+
for (std::size_t i = 0; i < rows; ++i) {
1254+
frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
1255+
// The first column is flagged categorical above, so write a floored value.
*(column++) = std::floor(x[0][i]);
1256+
for (std::size_t j = 1; j < cols - 1; ++j, ++column) {
1257+
*column = x[j][i];
1258+
}
1259+
// The last column is the target: the first numTestRows rows are
// written with a missing value, all other rows get target(i).
if (i < numTestRows) {
1260+
*column = core::CDataFrame::valueOfMissing();
1261+
} else {
1262+
*column = target(i);
1263+
}
1264+
});
1265+
}
1266+
frame->finishWritingRows();
1267+
1268+
// NOTE: despite the name this is a fraction in [0, 1], not a percentage.
double percentTrainingRows = 1.0 - static_cast<double>(numTestRows) /
1269+
static_cast<double>(rows);
1270+
1271+
// Estimate using only the number of rows which have a target value.
std::int64_t estimatedMemory(
1272+
maths::CBoostedTreeFactory::constructFromParameters(
1273+
1, std::make_unique<maths::boosted_tree::CMse>())
1274+
.estimateMemoryUsage(static_cast<std::size_t>(static_cast<double>(rows) * percentTrainingRows),
1275+
cols));
1276+
1277+
CTestInstrumentation instrumentation;
1278+
auto regression = maths::CBoostedTreeFactory::constructFromParameters(
1279+
1, std::make_unique<maths::boosted_tree::CMse>())
1280+
.analysisInstrumentation(instrumentation)
1281+
.buildFor(*frame, cols - 1);
1282+
1283+
regression->train();
1284+
1285+
LOG_DEBUG(<< "percent training rows = " << percentTrainingRows);
1286+
LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
1287+
LOG_DEBUG(<< "high water mark = " << instrumentation.maxMemoryUsage());
1288+
1289+
// The estimate must exceed the memory reported by the instrumentation and
// must be smaller than for the previous (fewer missing targets) iteration.
BOOST_TEST_REQUIRE(instrumentation.maxMemoryUsage() < estimatedMemory);
1290+
BOOST_TEST_REQUIRE(previousEstimatedMemory > estimatedMemory);
1291+
previousEstimatedMemory = estimatedMemory;
1292+
}
1293+
}
1294+
12241295
BOOST_AUTO_TEST_CASE(testProgressMonitoring) {
12251296

12261297
// Test progress monitoring invariants.

0 commit comments

Comments
 (0)