Add RF Classifier and started modifying _data.py
gcasadesus committed Aug 2, 2021
1 parent eb852e6 commit db0db92
Showing 5 changed files with 1,155 additions and 0 deletions.
279 changes: 279 additions & 0 deletions dislib/regression/rf/_data.py
@@ -0,0 +1,279 @@
import tempfile

import numpy as np
from numpy.lib import format
from pycompss.api.parameter import (
FILE_IN,
FILE_INOUT,
COLLECTION_IN,
Depth,
Type,
)
from pycompss.api.task import task

from dislib.data.array import Array


class RfDataset(object):
"""Dataset format used by the fit() of the RandomForestRegressor.
The RfDataset contains a file path for the samples and another one for the
targets. Optionally, a path can be provided for a transposed version of the
samples matrix, i.e., the features.
Note: For a representation of a dataset distributed in multiple files, use
dislib.data.Dataset instead.
Parameters
----------
samples_path : str
Path of the .npy file containing the 2-d array of samples. It can be a
pycompss.runtime.Future object. If so, self.n_samples and
self.n_features must be set manually (they can also be
pycompss.runtime.Future objects).
targets_path : str
Path of the .dat file containing the 1-d array of targets. It can be a
pycompss.runtime.Future object.
features_path : str, optional (default=None)
Path of the .npy file containing the transposed 2-d array of samples.
The array must be C-ordered. Providing this array may improve
performance, as it allows sequential access to the features.
Attributes
----------
n_samples : int
The number of samples of the dataset. It can be a
pycompss.runtime.Future object.
n_features : int
The number of features of the dataset. It can be a
pycompss.runtime.Future object.
y_targets : ndarray
The array of targets for this RfDataset. It can be a
pycompss.runtime.Future object.
"""

def __init__(self, samples_path, targets_path, features_path=None):
self.samples_path = samples_path
self.targets_path = targets_path
self.features_path = features_path
self.n_samples = None
self.n_features = None

self.y_targets = None

def get_n_samples(self):
"""Gets the number of samples obtained from the samples file.
Returns
-------
n_samples : int
Raises
------
AssertionError
If self.n_samples is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.
"""
if self.n_samples is None:
assert isinstance(self.samples_path, str), (
"self.n_samples must be set manually if self.samples_path "
"is a pycompss.runtime.Future object"
)
shape = _NpyFile(self.samples_path).get_shape()
if len(shape) != 2:
raise ValueError("Cannot read 2D array from the samples file.")
self.n_samples, self.n_features = shape
return self.n_samples

def get_n_features(self):
"""Gets the number of features obtained from the samples file.
Returns
-------
n_features : int
Raises
------
AssertionError
If self.n_features is None and self.samples_path is not a string.
ValueError
If invalid content is encountered in the samples file.
"""
if self.n_features is None:
assert isinstance(self.samples_path, str), (
"self.n_features must be set manually if self.samples_path "
"is a pycompss.runtime.Future object"
)
shape = _NpyFile(self.samples_path).get_shape()
if len(shape) != 2:
raise ValueError("Cannot read 2D array from the samples file.")
self.n_samples, self.n_features = shape
return self.n_features

def get_y_targets(self):
"""Obtains the array of targets.
Returns
-------
y_targets : ndarray
"""
if self.y_targets is None:
targets = _get_targets(self.targets_path)
self.y_targets = targets
return self.y_targets

def validate_features_file(self):
"""Validates the features file header information.
Raises
------
ValueError
If the shape of the array in the features file does not match this
class' n_samples and n_features, or if the array is in Fortran order.
"""
features_npy_file = _NpyFile(self.features_path)
shape = features_npy_file.get_shape()
fortran_order = features_npy_file.get_fortran_order()
if len(shape) != 2:
raise ValueError("Cannot read 2D array from features_file.")
if (self.get_n_features(), self.get_n_samples()) != shape:
raise ValueError("Invalid dimensions for the features_file.")
if fortran_order:
raise ValueError("Fortran order not supported for features array.")


def transform_to_rf_dataset(x: Array, y: Array) -> RfDataset:
"""Creates a RfDataset object from samples x and targets y.
This function creates a dislib.regression.rf.data.RfDataset by saving
x and y in files.
Parameters
----------
x : ds-array, shape = (n_samples, n_features)
The training input samples.
y : ds-array, shape = (n_samples,) or (n_samples, n_outputs)
The target values.
Returns
-------
rf_dataset : dislib.regression.rf._data.RfDataset
"""
n_samples = x.shape[0]
n_features = x.shape[1]

samples_file = tempfile.NamedTemporaryFile(
mode="wb", prefix="tmp_rf_samples_", delete=False
)
samples_path = samples_file.name
samples_file.close()
_allocate_samples_file(samples_path, n_samples, n_features)

start_idx = 0
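    # Write x row block by row block; the first block may have a different
    # height, hence the separate _top_left_shape / _reg_shape offsets.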
row_blocks_iterator = x._iterator(axis=0)
top_row = next(row_blocks_iterator)
_fill_samples_file(samples_path, top_row._blocks, start_idx)
start_idx += x._top_left_shape[0]
for x_row in row_blocks_iterator:
_fill_samples_file(samples_path, x_row._blocks, start_idx)
start_idx += x._reg_shape[0]

targets_file = tempfile.NamedTemporaryFile(
mode="w", prefix="tmp_rf_targets_", delete=False
)
targets_path = targets_file.name
targets_file.close()
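    # Append the targets block by block as plain text rows.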
for y_row in y._iterator(axis=0):
_fill_targets_file(targets_path, y_row._blocks)

rf_dataset = RfDataset(samples_path, targets_path)
rf_dataset.n_samples = n_samples
rf_dataset.n_features = n_features
return rf_dataset


class _NpyFile(object):
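    # Lazily reads and caches the .npy header fields (shape, Fortran order,
    # dtype) without loading the array data.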
def __init__(self, path):
self.path = path

self.shape = None
self.fortran_order = None
self.dtype = None

def get_shape(self):
if self.shape is None:
self._read_header()
return self.shape

def get_fortran_order(self):
if self.fortran_order is None:
self._read_header()
return self.fortran_order

def get_dtype(self):
if self.dtype is None:
self._read_header()
return self.dtype

def _read_header(self):
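        # Parse the .npy header using numpy's (private) format helpers.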
with open(self.path, "rb") as fp:
version = format.read_magic(fp)
try:
format._check_version(version)
except ValueError:
raise ValueError("Invalid file format.")
header_data = format._read_array_header(fp, version)
self.shape, self.fortran_order, self.dtype = header_data


@task(targets_path=FILE_IN, returns=1)
def _get_targets(targets_path):
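    # Read the 1-d targets array from the text file written by _fill_targets_file.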
y = np.genfromtxt(targets_path, dtype=None, encoding="utf-8")
return y


@task(returns=1)
def _get_samples_shape(subset):
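    # Return the shape of a subset's samples array.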
return subset.samples.shape


@task(returns=3)
def _merge_shapes(*samples_shapes):
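    # Sum the number of samples across subsets and check that all of them
    # share the same number of features.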
n_samples = 0
n_features = samples_shapes[0][1]
for shape in samples_shapes:
n_samples += shape[0]
assert shape[1] == n_features, "Subsamples with different n_features."
return samples_shapes, n_samples, n_features


@task(samples_path=FILE_INOUT)
def _allocate_samples_file(samples_path, n_samples, n_features):
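    # Pre-allocate an empty float32 .npy file of shape (n_samples, n_features) on disk.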
np.lib.format.open_memmap(
samples_path,
mode="w+",
dtype="float32",
shape=(int(n_samples), int(n_features)),
)


@task(samples_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
def _fill_samples_file(samples_path, row_blocks, start_idx):
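    # Merge the row blocks and copy them into the memory-mapped samples file,
    # starting at row start_idx.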
rows_samples = Array._merge_blocks(row_blocks)
rows_samples = rows_samples.astype(dtype="float32", casting="same_kind")
samples = np.lib.format.open_memmap(samples_path, mode="r+")
samples[start_idx : start_idx + rows_samples.shape[0]] = rows_samples


@task(targets_path=FILE_INOUT, row_blocks={Type: COLLECTION_IN, Depth: 2})
def _fill_targets_file(targets_path, row_blocks):
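    # Merge the row blocks and append the targets to the text file.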
rows_targets = Array._merge_blocks(row_blocks)
with open(targets_path, "at") as f:
np.savetxt(f, rows_targets, fmt="%s", encoding="utf-8")
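
For context, a minimal usage sketch of this module, assuming the public dislib.array constructor and a PyCOMPSs runtime (e.g., launched with runcompss); the data, block sizes, and variable names below are illustrative and not part of this commit:

import numpy as np

import dislib as ds
from dislib.regression.rf._data import transform_to_rf_dataset

# Illustrative ds-arrays: 100 samples with 4 features and one target each.
x = ds.array(np.random.rand(100, 4), block_size=(25, 4))
y = ds.array(np.random.rand(100, 1), block_size=(25, 1))

# Serialize the ds-arrays into temporary .npy / text files for the forest fit.
dataset = transform_to_rf_dataset(x, y)
print(dataset.get_n_samples(), dataset.get_n_features())  # 100 4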
