Save models and RandomForestRegressor #349

Merged 51 commits on Aug 9, 2021.

Changes shown below are from 2 commits.

Commits (51)
7430eef - Added saving and loading utils (gcasadesus, Jul 16, 2021)
f68c847 - Format and doc changes (gcasadesus, Jul 19, 2021)
2b4e097 - cbor2 not always required (gcasadesus, Jul 19, 2021)
d2b036d - Install dislib requirements (gcasadesus, Jul 21, 2021)
30da467 - Added directory to save models during testing. (gcasadesus, Jul 21, 2021)
86e9492 - Install requirements using pip3. (gcasadesus, Jul 21, 2021)
4e8ad6e - Changed environment language. (gcasadesus, Jul 21, 2021)
7760772 - Changed Jenkins timeout from 2h to 3h (gcasadesus, Jul 21, 2021)
3cb436d - Merge branch 'master' into save-models (gcasadesus, Jul 23, 2021)
55385a7 - Changed names of constant variables (gcasadesus, Jul 23, 2021)
bad4ca1 - Merge branch 'save-models' of github.com:gcasadesus/dislib into save-… (gcasadesus, Jul 23, 2021)
ee43f76 - Merge branch 'master' into save-models (gcasadesus, Jul 26, 2021)
8b22122 - Add RF Classifier and started modifying _data.py (gcasadesus, Jul 21, 2021)
5487ab0 - Added DecisionTreeRegressor with MSE criterion (gcasadesus, Jul 22, 2021)
606de7c - Added RandomForestRegressor (gcasadesus, Jul 23, 2021)
c9250a3 - Added RF to a new 'commons' module (gcasadesus, Jul 26, 2021)
8c738dc - Removed RF from 'classification' and 'regression' (gcasadesus, Jul 26, 2021)
79981db - Edited tests. (gcasadesus, Jul 26, 2021)
6eaac6c - Changed tests and file names in commons/rf (gcasadesus, Jul 27, 2021)
676530b - Modified test_saving.py to raise ModuleNotFound (gcasadesus, Jul 27, 2021)
3476dbe - Reduced saving tests and added tests for RFRegr (gcasadesus, Jul 28, 2021)
4b3126f - Merge branch 'master' into save-models (gcasadesus, Jul 29, 2021)
efcfa1f - Added tests for RF dataset (gcasadesus, Jul 30, 2021)
95538b6 - Added setup and teardown for saving tests. (gcasadesus, Jul 30, 2021)
1c8f7ef - Updated user guide with RF Regressor (gcasadesus, Aug 2, 2021)
89b6db3 - Added saving and loading utils (gcasadesus, Jul 16, 2021)
2b9f8f3 - Format and doc changes (gcasadesus, Jul 19, 2021)
6dbd625 - cbor2 not always required (gcasadesus, Jul 19, 2021)
004e713 - Install dislib requirements (gcasadesus, Jul 21, 2021)
456e783 - Added directory to save models during testing. (gcasadesus, Jul 21, 2021)
e67cfa1 - Install requirements using pip3. (gcasadesus, Jul 21, 2021)
addf6d5 - Changed environment language. (gcasadesus, Jul 21, 2021)
8e40e41 - Changed Jenkins timeout from 2h to 3h (gcasadesus, Jul 21, 2021)
eb852e6 - Changed names of constant variables (gcasadesus, Jul 23, 2021)
db0db92 - Add RF Classifier and started modifying _data.py (gcasadesus, Jul 21, 2021)
ff62b9b - Added DecisionTreeRegressor with MSE criterion (gcasadesus, Jul 22, 2021)
9969964 - Added RandomForestRegressor (gcasadesus, Jul 23, 2021)
85d481b - Added RF to a new 'commons' module (gcasadesus, Jul 26, 2021)
b3c909e - Removed RF from 'classification' and 'regression' (gcasadesus, Jul 26, 2021)
f99f61b - Edited tests. (gcasadesus, Jul 26, 2021)
d5fae27 - Changed tests and file names in commons/rf (gcasadesus, Jul 27, 2021)
25f68b3 - Modified test_saving.py to raise ModuleNotFound (gcasadesus, Jul 27, 2021)
bab5e7c - Reduced saving tests and added tests for RFRegr (gcasadesus, Jul 28, 2021)
7ae3b9d - Added tests for RF dataset (gcasadesus, Jul 30, 2021)
fe86b92 - Added setup and teardown for saving tests. (gcasadesus, Jul 30, 2021)
da90427 - Updated user guide with RF Regressor (gcasadesus, Aug 2, 2021)
b79756f - Resolved conflicts with RF score (gcasadesus, Aug 2, 2021)
42893b7 - Merge branch 'save-models' of github.com:gcasadesus/dislib into save-… (gcasadesus, Aug 2, 2021)
0445030 - Added creation of features file (gcasadesus, Aug 6, 2021)
579f993 - Added tests for decision tree (gcasadesus, Aug 6, 2021)
4a44cd3 - Style changes (gcasadesus, Aug 9, 2021)
3 changes: 3 additions & 0 deletions .gitignore
@@ -112,6 +112,9 @@ target/
*compss*.out
*compss*.err

# Saving
**/saving/*

# ========== C & C++ ignores =================
# Prerequisites
*.d
3 changes: 2 additions & 1 deletion dislib/utils/__init__.py
@@ -1,3 +1,4 @@
from dislib.utils.base import shuffle
from dislib.utils.saving import save_model, load_model

__all__ = ['shuffle']
__all__ = ["shuffle", "save_model", "load_model"]
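
With these exports, saving and restoring a fitted model becomes a two-call workflow. A minimal sketch, mirroring the docstring examples in saving.py below and assuming a PyCOMPSs runtime is available to execute the dislib tasks:

import numpy as np
import dislib as ds
from dislib.cluster import KMeans
from dislib.utils import save_model, load_model

# Fit a small KMeans model on a distributed array.
x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
model = KMeans(n_clusters=2, random_state=0)
model.fit(ds.array(x, (2, 2)))

# Persist the synchronized model and reinstantiate it in the same state.
save_model(model, "/tmp/model")
loaded_model = load_model("/tmp/model")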
366 changes: 366 additions & 0 deletions dislib/utils/saving.py
@@ -0,0 +1,366 @@
import json
import os
import numpy as np
import cbor2

from pycompss.runtime.management.classes import Future
from pycompss.api.api import compss_wait_on

from sklearn.svm import SVC as SklearnSVC
from sklearn.tree import DecisionTreeClassifier as SklearnDTClassifier
from sklearn.tree._tree import Tree as SklearnTree
from scipy.sparse import csr_matrix

import dislib as ds
import dislib.classification
import dislib.cluster
import dislib.recommendation
import dislib.regression
from dislib.data.array import Array
from dislib.classification.rf.decision_tree import (
    DecisionTreeClassifier,
    _Node,
    _InnerNodeInfo,
    _LeafInfo,
    _SkTreeWrapper,
)

# Dislib models with saving tested (model: str -> module: str)
_implemented_models = {
    "KMeans": "cluster",
    "GaussianMixture": "cluster",
    "CascadeSVM": "classification",
    "RandomForestClassifier": "classification",
    "ALS": "recommendation",
    "LinearRegression": "regression",
    "Lasso": "regression",
}

# Classes used by models
_dislib_classes = {
    "KMeans": dislib.cluster.KMeans,
    "DecisionTreeClassifier": DecisionTreeClassifier,
    "_Node": _Node,
    "_InnerNodeInfo": _InnerNodeInfo,
    "_LeafInfo": _LeafInfo,
    "_SkTreeWrapper": _SkTreeWrapper,
}

_sklearn_classes = {
    "SVC": SklearnSVC,
    "DecisionTreeClassifier": SklearnDTClassifier,
}


def save_model(model, filepath, overwrite=True, save_format="json"):
    """ Saves a model to a file.

    The model is synchronized before saving and can be reinstantiated in the
    exact same state, without any of the code used for model definition or
    fitting.

    Parameters
    ----------
    model : dislib model.
        Dislib model to serialize and save.
    filepath : str
        Path where to save the model.
    overwrite : bool, optional (default=True)
        Whether any existing model at the target
        location should be overwritten.
    save_format : str, optional (default='json')
        Format used to save the models.

    Examples
    --------
    >>> from dislib.cluster import KMeans
    >>> from dislib.utils import save_model, load_model
    >>> import numpy as np
    >>> import dislib as ds
    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    >>> x_train = ds.array(x, (2, 2))
    >>> model = KMeans(n_clusters=2, random_state=0)
    >>> model.fit(x_train)
    >>> save_model(model, '/tmp/model')
    >>> loaded_model = load_model('/tmp/model')
    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
    >>> model_pred = model.predict(x_test)
    >>> loaded_model_pred = loaded_model.predict(x_test)
    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
    """

    # Check overwrite
    if not overwrite and os.path.isfile(filepath):
        return

    # Check for dislib model
    model_name = model.__class__.__name__
    if model_name not in _implemented_models.keys():
        raise NotImplementedError(
            "Saving has only been implemented for the following models:\n%s"
            % _implemented_models.keys()
        )

    # Synchronize model
    if model_name == "RandomForestClassifier":
        _sync_rf(model)

    _sync_obj(model.__dict__)
    model_metadata = model.__dict__.copy()
    model_metadata["model_name"] = model_name

    # Save model
    if save_format == "json":
        with open(filepath, "w") as f:
            json.dump(model_metadata, f, default=_encode_helper)
    elif save_format == "cbor":
        with open(filepath, "wb") as f:
            cbor2.dump(model_metadata, f, default=_encode_helper_cbor)
    else:
        raise ValueError("Wrong save format.")


def load_model(filepath, load_format="json"):
    """ Loads a model from a file.

    The model is reinstantiated in the exact same state in which it was saved,
    without any of the code used for model definition or fitting.

    Parameters
    ----------
    filepath : str
        Path of the saved model.
    load_format : str, optional (default='json')
        Format used to load the model.

    Examples
    --------
    >>> from dislib.cluster import KMeans
    >>> from dislib.utils import save_model, load_model
    >>> import numpy as np
    >>> import dislib as ds
    >>> x = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
    >>> x_train = ds.array(x, (2, 2))
    >>> model = KMeans(n_clusters=2, random_state=0)
    >>> model.fit(x_train)
    >>> save_model(model, '/tmp/model')
    >>> loaded_model = load_model('/tmp/model')
    >>> x_test = ds.array(np.array([[0, 0], [4, 4]]), (2, 2))
    >>> model_pred = model.predict(x_test)
    >>> loaded_model_pred = loaded_model.predict(x_test)
    >>> assert np.allclose(model_pred.collect(), loaded_model_pred.collect())
    """
    # Load model
    if load_format == "json":
        with open(filepath, "r") as f:
            model_metadata = json.load(f, object_hook=_decode_helper)
    elif load_format == "cbor":
        with open(filepath, "rb") as f:
            model_metadata = cbor2.load(f, object_hook=_decode_helper_cbor)
    else:
        raise ValueError("Wrong load format.")

    # Check for dislib model
    model_name = model_metadata["model_name"]
    if model_name not in _implemented_models.keys():
        raise NotImplementedError(
            "Saving has only been implemented for the following models:\n%s"
            % _implemented_models.keys()
        )
    del model_metadata["model_name"]

    # Create model
    model_module = getattr(ds, _implemented_models[model_name])
    model_class = getattr(model_module, model_name)
    model = model_class()
    model.__dict__.update(model_metadata)

    # Set class methods
    if model_name == "CascadeSVM" and "kernel" in model_metadata:
        try:
            model._kernel_f = getattr(
                model, model._name_to_kernel[model_metadata["kernel"]]
            )
        except AttributeError:
            model._kernel_f = getattr(model, "_rbf_kernel")

    return model


def _encode_helper_cbor(encoder, obj):
    """ Special encoder wrapper for dislib using cbor2"""
    encoder.encode(_encode_helper(obj))


def _decode_helper_cbor(decoder, obj):
    """ Special decoder wrapper for dislib using cbor2"""
    return _decode_helper(obj)


def _encode_helper(obj):
    """ Special encoder for dislib that serializes the different objects
    and stores their state for future loading.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    elif isinstance(obj, range):
        return {
            "class_name": "range",
            "start": obj.start,
            "stop": obj.stop,
            "step": obj.step,
        }
    elif isinstance(obj, csr_matrix):
        return {
            "class_name": "csr_matrix",
            **obj.__dict__,
        }
    elif isinstance(obj, np.ndarray):
        return {
            "class_name": "ndarray",
            "dtype_list": len(obj.dtype.descr) > 1,
            "dtype": str(obj.dtype),
            "items": obj.tolist(),
        }
    elif isinstance(obj, Array):
        return {"class_name": "dsarray", **obj.__dict__}
    elif isinstance(obj, np.random.RandomState):
        return {"class_name": "RandomState", "items": obj.get_state()}
    elif callable(obj):
        return {
            "class_name": "callable",
            "module": obj.__module__,
            "name": obj.__name__,
        }
    elif isinstance(obj, SklearnTree):
        return {
            "class_name": obj.__class__.__name__,
            "n_features": obj.n_features,
            "n_classes": obj.n_classes,
            "n_outputs": obj.n_outputs,
            "items": obj.__getstate__(),
        }
    elif isinstance(
        obj, tuple(_dislib_classes.values()) + tuple(_sklearn_classes.values())
    ):
        return {
            "class_name": obj.__class__.__name__,
            "module_name": obj.__module__,
            "items": obj.__dict__,
        }
    raise TypeError("Not JSON Serializable:", obj)
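# Illustrative example (not part of the module): a NumPy scalar such as
# np.int64(3) is encoded as the plain Python int 3, while an array such as
# np.array([0, 1], dtype=np.int64) is encoded as
# {"class_name": "ndarray", "dtype_list": False, "dtype": "int64",
#  "items": [0, 1]}, which _decode_helper below rebuilds into an ndarray.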


def _decode_helper(obj):
    """ Special decoder for dislib that instantiates the different objects
    and updates their attributes to recover the saved state.
    """
    if isinstance(obj, dict) and "class_name" in obj:

        class_name = obj["class_name"]
        if class_name == "range":
            return range(obj["start"], obj["stop"], obj["step"])
        elif class_name == "tuple":
            return tuple(obj["items"])
        elif class_name == "ndarray":
            if obj["dtype_list"]:
                items = list(map(tuple, obj["items"]))
                return np.rec.fromrecords(items, dtype=eval(obj["dtype"]))
            else:
                return np.array(obj["items"], dtype=obj["dtype"])
        elif class_name == "csr_matrix":
            return csr_matrix(
                (obj["data"], obj["indices"], obj["indptr"]),
                shape=obj["_shape"],
            )
        elif class_name == "dsarray":
            return Array(
                blocks=obj["_blocks"],
                top_left_shape=obj["_top_left_shape"],
                reg_shape=obj["_reg_shape"],
                shape=obj["_shape"],
                sparse=obj["_sparse"],
                delete=obj["_delete"],
            )
        elif class_name == "RandomState":
            random_state = np.random.RandomState()
            random_state.set_state(_decode_helper(obj["items"]))
            return random_state
        elif class_name == "Tree":
            dict_ = _decode_helper(obj["items"])
            model = SklearnTree(
                obj["n_features"], obj["n_classes"], obj["n_outputs"]
            )
            model.__setstate__(dict_)
            return model
        elif (
            class_name in _dislib_classes.keys()
            and "dislib" in obj["module_name"]
        ):
            dict_ = _decode_helper(obj["items"])
            if class_name == "DecisionTreeClassifier":
                model = _dislib_classes[obj["class_name"]](
                    try_features=dict_.pop("try_features"),
                    max_depth=dict_.pop("max_depth"),
                    distr_depth=dict_.pop("distr_depth"),
                    sklearn_max=dict_.pop("sklearn_max"),
                    bootstrap=dict_.pop("bootstrap"),
                    random_state=dict_.pop("random_state"),
                )
            elif class_name == "_SkTreeWrapper":
                sk_tree = _decode_helper(dict_.pop("sk_tree"))
                model = _dislib_classes[obj["class_name"]](sk_tree)
            else:
                model = _dislib_classes[obj["class_name"]]()
            model.__dict__.update(dict_)
            return model
        elif (
            class_name in _sklearn_classes.keys()
            and "sklearn" in obj["module_name"]
        ):
            dict_ = _decode_helper(obj["items"])
            model = _sklearn_classes[obj["class_name"]]()
            model.__dict__.update(dict_)
            return model
        elif class_name == "callable":
            if obj["module"] == "numpy":
                return getattr(np, obj["name"])
            return None

    return obj


def _sync_obj(obj):
    """ Recursively synchronizes the Future objects of a list or dictionary
    by using `compss_wait_on(obj)`.
    """
    if isinstance(obj, dict):
        iterator = iter(obj.items())
    elif isinstance(obj, list):
        iterator = iter(enumerate(obj))
    else:
        raise ValueError("Expected dict or list and received %s." % type(obj))

    for key, val in iterator:
        if isinstance(val, (dict, list)):
            _sync_obj(obj[key])
        else:
            obj[key] = compss_wait_on(val)
            if isinstance(obj[key], Future):
                raise TypeError(
                    "Could not synchronize Future (%s, %s)." % (key, val)
                )
            if hasattr(obj[key], "__dict__"):
                _sync_obj(obj[key].__dict__)


def _sync_rf(rf):
    """ Syncs the `try_features` and `n_classes` attributes of the different
    trees, since they cannot be synced recursively.
    """
    if isinstance(rf.trees[0].try_features, Future):
        try_features = compss_wait_on(rf.trees[0].try_features)
        n_classes = compss_wait_on(rf.trees[0].n_classes)
        for tree in rf.trees:
            tree.try_features = try_features
            tree.n_classes = n_classes
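
The "cbor" branches of save_model and load_model rely on the cbor2 dependency added to requirements.txt below. A minimal sketch of the binary round trip, assuming a model fitted as in the docstring examples above:

# Save and reload the same model using the CBOR binary format instead of JSON.
save_model(model, "/tmp/model.cbor", save_format="cbor")
loaded_model = load_model("/tmp/model.cbor", load_format="cbor")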
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,3 +3,4 @@ scipy>=1.3.0
numpy>=1.18.1, <=1.19.5
numpydoc>=0.8.0
cvxpy>=1.1.5
cbor2>=5.4.0