Provide method for auto-optimization of FIL parameters #5368

Merged: 23 commits, May 31, 2023
Changes from all commits
Commits (23)
3bd3ab0
Add method for automatic optimization of FIL models
wphicks Apr 10, 2023
21a3cfe
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks Apr 10, 2023
e7bcd82
Correct random data generation
wphicks Apr 14, 2023
11ef8a4
Add test for model optimization
wphicks Apr 14, 2023
3a1c2cd
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks Apr 14, 2023
36e751c
Do not reduce output size if output not placed in shmem
wphicks Apr 17, 2023
c57b17c
Correct range for optimization data generation
wphicks Apr 17, 2023
8858a4f
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks Apr 18, 2023
c669490
Apply suggestions from code review
wphicks Apr 24, 2023
cc1b14a
Begin refactoring into ForestInferenceOptimizer
wphicks Apr 25, 2023
53946ba
Revert "Begin refactoring into ForestInferenceOptimizer"
wphicks May 19, 2023
46d81fe
Add timeout-based auto-optimization to FIL
wphicks May 19, 2023
a3fbb74
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks May 19, 2023
992e93e
Correct timeout-based auto-optimization for FIL
wphicks May 22, 2023
d7fca00
Add default timeout to optimize docs
wphicks May 22, 2023
8e512f0
Remove _get_chunk_size method
wphicks May 22, 2023
6ee1e37
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks May 22, 2023
eb06e41
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks May 22, 2023
eacc790
Merge remote-tracking branch 'origin/dev-auto_optimize_fil' into dev-…
wphicks May 22, 2023
6c9dddc
Initialize sequence per instance instead of as class attribute
wphicks May 22, 2023
1af0fac
Merge branch 'branch-23.06' into dev-auto_optimize_fil
dantegd May 26, 2023
cbf3fa2
Merge branch 'branch-23.06' into dev-auto_optimize_fil
wphicks May 30, 2023
4dc3bc7
Merge branch 'branch-23.06' into dev-auto_optimize_fil
dantegd May 30, 2023
182 changes: 180 additions & 2 deletions in python/cuml/experimental/fil/fil.pyx
@@ -15,6 +15,7 @@
#
import cupy as cp
import functools
import itertools
import numpy as np
import pathlib
import treelite.sklearn
@@ -24,6 +25,10 @@ from libc.stdint cimport uint32_t, uintptr_t

from cuml.common.device_selection import using_device_type
from cuml.internals.input_utils import input_to_cuml_array
from cuml.internals.safe_imports import (
gpu_only_import_from,
null_decorator
)
from cuml.internals.array import CumlArray
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.experimental.fil.postprocessing cimport element_op, row_op
@@ -39,6 +44,9 @@ from cuml.internals.device_type import DeviceType, DeviceTypeError
from cuml.internals.global_settings import GlobalSettings
from cuml.internals.mem_type import MemoryType
from pylibraft.common.handle cimport handle_t as raft_handle_t
from time import perf_counter

nvtx_annotate = gpu_only_import_from('nvtx', 'annotate', alt=null_decorator)

from cuml.internals.safe_imports import (
cpu_only_import,
@@ -79,6 +87,7 @@ cdef extern from "cuml/experimental/fil/forest_model.hpp" namespace "ML::experim
) except +

bool is_double_precision() except +
size_t num_features() except +
size_t num_outputs() except +
size_t num_trees() except +
bool has_vector_leaves() except +
@@ -165,6 +174,9 @@ cdef class ForestInference_impl():
def get_dtype(self):
return [np.float32, np.float64][self.model.is_double_precision()]

def num_features(self):
return self.model.num_features()

def num_outputs(self):
return self.model.num_outputs()

@@ -298,6 +310,24 @@ cdef class ForestInference_impl():
output_dtype=output_dtype
)


class _AutoIterations:
"""Used to generate sequence of iterations (1, 2, 5, 10, 20, 50...) during
FIL optimization"""

def __init__(self):
self.invocations = 0
self.sequence = (1, 2, 5)

def next(self):
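# Successive calls return 1, 2, 5, 10, 20, 50, 100, ...: each full pass
# through (1, 2, 5) scales the result by another factor of 10.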
result = (
(10 ** (
self.invocations // len(self.sequence)
)) * self.sequence[self.invocations % len(self.sequence)]
)
self.invocations += 1
return result

def _handle_legacy_fil_args(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
@@ -611,7 +641,6 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
if old_value != value:
self._reload_model()


def __init__(
self,
*,
@@ -622,13 +651,15 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
is_classifier=False,
output_class=None,
layout='depth_first',
default_chunk_size=None,
align_bytes=None,
precision='single',
device_id=0):
super().__init__(
handle=handle, verbose=verbose, output_type=output_type
)

self.default_chunk_size = default_chunk_size
self.align_bytes = align_bytes
self.layout = layout
self.precision = precision
@@ -717,6 +748,7 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
model_type=None,
output_type=None,
verbose=False,
default_chunk_size=None,
align_bytes=None,
layout='depth_first',
device_id=0,
@@ -776,6 +808,9 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
verbose : int or boolean, default=False
Sets logging level. It must be one of `cuml.common.logger.level_*`.
See :ref:`verbosity-levels` for more info.
default_chunk_size : int or None, default=None
If set, predict calls without a specified chunk size will use
this default value.
align_bytes : int or None, default=None
If set, each tree will be padded with empty nodes until its
in-memory size is a multiple of the given value. It is recommended
@@ -810,12 +845,15 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
tl_model = treelite.frontend.Model.load(
path, model_type
)
if default_chunk_size is None:
default_chunk_size = threads_per_tree
return cls(
treelite_model=tl_model,
handle=handle,
output_type=output_type,
verbose=verbose,
output_class=output_class,
default_chunk_size=default_chunk_size,
align_bytes=align_bytes,
layout=layout,
precision=precision,
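
For illustration, a minimal sketch of the new default_chunk_size parameter at load time (the model path, model type, and chunk size are hypothetical; the import assumes ForestInference is exported from cuml.experimental.fil):

from cuml.experimental.fil import ForestInference

# Any predict call that does not pass chunk_size explicitly will now fall
# back to default_chunk_size=4 (hypothetical value).
fil_model = ForestInference.load(
    "./xgb_model.json",
    model_type="xgboost_json",
    default_chunk_size=4,
)
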
@@ -840,6 +878,7 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
model_type=None,
output_type=None,
verbose=False,
default_chunk_size=None,
align_bytes=None,
layout='breadth_first',
device_id=0,
@@ -897,6 +936,9 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
verbose : int or boolean, default=False
Sets logging level. It must be one of `cuml.common.logger.level_*`.
See :ref:`verbosity-levels` for more info.
default_chunk_size : int or None, default=None
If set, predict calls without a specified chunk size will use
this default value.
align_bytes : int or None, default=None
If set, each tree will be padded with empty nodes until its
in-memory size is a multiple of the given value. It is recommended
@@ -926,12 +968,15 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
pool to use during loading and inference.
"""
tl_model = treelite.sklearn.import_model(skl_model)
if default_chunk_size is None:
default_chunk_size = threads_per_tree
result = cls(
treelite_model=tl_model,
handle=handle,
output_type=output_type,
verbose=verbose,
output_class=output_class,
default_chunk_size=default_chunk_size,
align_bytes=align_bytes,
layout=layout,
precision=precision,
@@ -956,6 +1001,7 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
model_type=None,
output_type=None,
verbose=False,
default_chunk_size=None,
align_bytes=None,
layout='breadth_first',
device_id=0,
@@ -1013,6 +1059,9 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
verbose : int or boolean, default=False
Sets logging level. It must be one of `cuml.common.logger.level_*`.
See :ref:`verbosity-levels` for more info.
default_chunk_size : int or None, default=None
If set, predict calls without a specified chunk size will use
this default value.
align_bytes : int or None, default=None
If set, each tree will be padded with empty nodes until its
in-memory size is a multiple of the given value. It is recommended
@@ -1041,12 +1090,15 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
For GPU execution, the RAFT handle containing the stream or stream
pool to use during loading and inference.
"""
if default_chunk_size is None:
default_chunk_size = threads_per_tree
return cls(
treelite_model=tl_model,
handle=handle,
output_type=output_type,
verbose=verbose,
output_class=output_class,
default_chunk_size=default_chunk_size,
align_bytes=align_bytes,
layout=layout,
precision=precision,
@@ -1099,7 +1151,9 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
"predict_proba is not available for regression models. Load"
" with is_classifer=True if this is a classifier."
)
return self.forest.predict(X, preds=preds, chunk_size=chunk_size)
return self.forest.predict(
X, preds=preds, chunk_size=(chunk_size or self.default_chunk_size)
)

@nvtx_annotate(
message='ForestInference.predict',
@@ -1158,6 +1212,7 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
classifiers, the highest probability class is chosen regardless
of threshold.
"""
chunk_size = (chunk_size or self.default_chunk_size)
if self.forest.row_postprocessing() == 'max_index':
raw_out = self.forest.predict(X, chunk_size=chunk_size)
result = raw_out[:, 0]
@@ -1237,6 +1292,7 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
any power of 2, but little benefit is expected above a chunk size
of 512.
"""
chunk_size = (chunk_size or self.default_chunk_size)
return self.forest.predict(
X, predict_type="per_tree", preds=preds, chunk_size=chunk_size
)
@@ -1291,3 +1347,125 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
return self.forest.predict(
X, predict_type="leaf_id", preds=preds, chunk_size=chunk_size
)

def optimize(
self,
*,
data=None,
batch_size=1024,
unique_batches=10,
timeout=0.2,
predict_method='predict',
max_chunk_size=None,
seed=0
):
"""
Find the optimal layout and chunk size for this model

The optimal values for layout and chunk size depend on the model,
batch size, and available hardware. To obtain the most realistic
performance measurements, example data can be provided; if it is not,
random data will be generated based on the indicated batch size. After
the optimal layout is found, the model will be reloaded if necessary.
The optimal chunk size becomes the default chunk size used whenever
none is passed to a predict call.

Parameters
----------
data
Example data of shape (unique_batches, batch_size, features) or
(batch_size, features), or None. If None, random data will be
generated instead.
batch_size : int
If example data is not provided, random data with this many rows
per batch will be used.
unique_batches : int
The number of unique batches to generate if random data are used.
Increasing this number decreases the chance that the optimal
configuration will be skewed by a single batch with unusual
performance characteristics.
timeout : float
Time in seconds to target for optimization. The optimization loop
will be repeatedly run a number of times increasing in the sequence
1, 2, 5, 10, 20, 50, ... until the time taken is at least the given
value. Note that for very large batch sizes and large models, the
total elapsed time may exceed this timeout; it is a soft target for
elapsed time. Setting the timeout to zero will run through the
indicated number of unique batches exactly once. Defaults to 0.2s.
predict_method : str
If desired, optimization can occur over one of the prediction
method variants (e.g. "predict_per_tree") rather than the
default `predict` method. To do so, pass the name of the method
here.
max_chunk_size : int or None
The maximum chunk size to explore during optimization. If not
set, a value will be picked based on the current device type.
Setting this to a lower value will reduce the optimization search
time but may not result in optimal performance.
seed : int
The random seed used for generating example data if none is
provided.
"""
if data is None:
xpy = GlobalSettings().xpy
dtype = self.forest.get_dtype()
data = xpy.random.uniform(
xpy.finfo(dtype).min / 2,
xpy.finfo(dtype).max / 2,
(unique_batches, batch_size, self.forest.num_features())
)
else:
data = CumlArray.from_input(
data,
order='K',
).to_output('array')
try:
unique_batches, batch_size, features = data.shape
except ValueError:
unique_batches = 1
batch_size, features = data.shape
data = [data]

if max_chunk_size is None:
max_chunk_size = 512
if GlobalSettings().device_type is DeviceType.device:
max_chunk_size = min(max_chunk_size, 32)

infer = getattr(self, predict_method)

optimal_layout = 'depth_first'
optimal_chunk_size = 1

valid_layouts = ('depth_first', 'breadth_first')
chunk_size = 1
valid_chunk_sizes = []
while chunk_size <= max_chunk_size:
valid_chunk_sizes.append(chunk_size)
chunk_size *= 2

all_params = list(itertools.product(valid_layouts, valid_chunk_sizes))
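# Search-space note: chunk sizes double from 1 up to max_chunk_size, so the
# grid is 2 layouts x 6 chunk sizes (1..32) when capped for GPU execution,
# or 2 x 10 chunk sizes (1..512) at the default cap.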
auto_iterator = _AutoIterations()
loop_start = perf_counter()
while True:
optimal_time = float('inf')
iterations = auto_iterator.next()
for layout, chunk_size in all_params:
self.layout = layout
infer(data[0], chunk_size=chunk_size)
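# The un-timed call above runs inference once under the new layout (forcing
# any model reload) so that setup cost is excluded from the timed loop below,
# which keeps the fastest of `iterations` passes over all unique batches.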
elapsed = float('inf')
for _ in range(iterations):
start = perf_counter()
for iter_index in range(unique_batches):
infer(
data[iter_index], chunk_size=chunk_size
)
elapsed = min(elapsed, perf_counter() - start)
if elapsed < optimal_time:
optimal_time = elapsed
optimal_layout = layout
optimal_chunk_size = chunk_size
if (perf_counter() - loop_start > timeout):
break

self.layout = optimal_layout
self.default_chunk_size = optimal_chunk_size
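
As an end-to-end illustration of the workflow this PR enables, a hedged sketch follows (the model file, model type, and feature count are hypothetical, and the import assumes ForestInference is exported from cuml.experimental.fil):

import cupy as cp
from cuml.experimental.fil import ForestInference

# Hypothetical XGBoost model artifact.
fil_model = ForestInference.load(
    "./xgb_model.json", model_type="xgboost_json"
)

# Search layout / chunk-size combinations on randomly generated batches of
# 10_000 rows and keep the fastest configuration as the new defaults.
fil_model.optimize(batch_size=10_000)

# Later predict calls use the optimized layout and default chunk size.
X = cp.random.uniform(size=(10_000, 50)).astype("float32")  # 50 features is hypothetical
preds = fil_model.predict(X)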