Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pickling for HBDSCAN #5102

Merged
merged 18 commits into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 64 additions & 34 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,8 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
handle_, n_leaves_, labels_, children_, sizes_, deltas_, mst_src_, mst_dst_, mst_weights_),
probabilities(probabilities_),
stabilities(0, handle_.get_stream()),
condensed_tree(handle_, n_leaves_)
condensed_tree(handle_, n_leaves_),
inverse_label_map(0, handle_.get_stream())
{
}

Expand All @@ -276,6 +277,9 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {
// it much easier to use / debug.
value_t* get_probabilities() { return probabilities; }
value_t* get_stabilities() { return stabilities.data(); }
value_idx* get_inverse_label_map() { return inverse_label_map.data(); }
// internal function
rmm::device_uvector<value_idx>& _get_inverse_label_map() { return inverse_label_map; }

/**
* Once n_clusters is known, the stabilities array
Expand All @@ -293,6 +297,9 @@ class hdbscan_output : public robust_single_linkage_output<value_idx, value_t> {

private:
value_t* probabilities; // size n_leaves
// inversely maps normalized labels to pre-normalized labels
// used for out-of-sample prediction
rmm::device_uvector<value_idx> inverse_label_map; // size n_clusters

// Size not known ahead of time. Initialize
// with `initialize_stabilities()` method.
Expand All @@ -316,14 +323,14 @@ template class CondensedHierarchy<int, float>;
template <typename value_idx, typename value_t>
class PredictionData {
public:
PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n)
PredictionData(const raft::handle_t& handle_, value_idx m, value_idx n, value_t* core_dists_)
: handle(handle_),
exemplar_idx(0, handle.get_stream()),
exemplar_label_offsets(0, handle.get_stream()),
n_selected_clusters(0),
selected_clusters(0, handle.get_stream()),
deaths(0, handle.get_stream()),
core_dists(m, handle.get_stream()),
core_dists(core_dists_),
index_into_children(0, handle.get_stream()),
n_exemplars(0),
n_rows(m),
Expand All @@ -342,7 +349,7 @@ class PredictionData {
value_idx* get_exemplar_label_offsets() { return exemplar_label_offsets.data(); }
value_idx* get_selected_clusters() { return selected_clusters.data(); }
value_t* get_deaths() { return deaths.data(); }
value_t* get_core_dists() { return core_dists.data(); }
value_t* get_core_dists() { return core_dists; }
value_idx* get_index_into_children() { return index_into_children.data(); }

/**
Expand Down Expand Up @@ -376,18 +383,18 @@ class PredictionData {
value_idx n_selected_clusters;
rmm::device_uvector<value_idx> selected_clusters;
rmm::device_uvector<value_t> deaths;
rmm::device_uvector<value_t> core_dists;
value_t* core_dists;
rmm::device_uvector<value_idx> index_into_children;
};

template class PredictionData<int, float>;

void build_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);
void generate_prediction_data(const raft::handle_t& handle,
CondensedHierarchy<int, float>& condensed_tree,
int* labels,
int* inverse_label_map,
int n_selected_clusters,
PredictionData<int, float>& prediction_data);

}; // namespace Common
}; // namespace HDBSCAN
Expand All @@ -413,36 +420,14 @@ void build_prediction_data(const raft::handle_t& handle,
* @param params struct of configuration hyper-parameters
* @param out struct of output data and arrays on device
*/
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out);

/**
* Executes HDBSCAN clustering on an mxn-dimensional input array, X, then builds the PredictionData
* object which computes and stores information needed later for prediction algorithms.
*
* @param[in] handle raft handle for resource reuse
* @param[in] X array (size m, n) on device in row-major format
* @param m number of rows in X
* @param n number of columns in X
* @param metric distance metric to use
* @param params struct of configuration hyper-parameters
* @param out struct of output data and arrays on device
* @param prediction_data_ struct for storing computing and storing information to be used during
* prediction
*/
void hdbscan(const raft::handle_t& handle,
const float* X,
size_t m,
size_t n,
raft::distance::DistanceType metric,
HDBSCAN::Common::HDBSCANParams& params,
HDBSCAN::Common::hdbscan_output<int, float>& out,
HDBSCAN::Common::PredictionData<int, float>& prediction_data_);
float* core_dists);

void build_condensed_hierarchy(const raft::handle_t& handle,
const int* children,
Expand Down Expand Up @@ -485,4 +470,49 @@ void out_of_sample_predict(const raft::handle_t& handle,
int min_samples,
int* out_labels,
float* out_probabilities);

namespace HDBSCAN::HELPER {

/**
* @brief Compute the core distances for each point in the training matrix
*
* @param[in] handle raft handle for resource reuse
* @param[in] X array (size m, n) on device in row-major format
* @param[out] core_dists array (size m, 1) of core distances
* @param m number of rows in X
* @param n number of columns in X
* @param metric distance metric to use
* @param min_samples minimum number of samples to use for computing core distances
*/
void compute_core_dists(const raft::handle_t& handle,
const float* X,
float* core_dists,
size_t m,
size_t n,
raft::distance::DistanceType metric,
int min_samples);

/**
* @brief Compute the map from final, normalize labels to the labels in the CondensedHierarchy
*
* @param[in] handle raft handle for resource reuse
* @param[in] condensed_tree the Condensed Hiearchy object
* @param[in] n_leaves number of leaves in the input data
* @param[in] cluster_selection_method cluster selection method
* @param[out] inverse_label_map rmm::device_uvector of size 0. It will be resized during the
* computation
* @param[in] allow_single_cluster allow single cluster
* @param[in] max_cluster_size max cluster size
* @param[in] cluster_selection_epsilon cluster selection epsilon
*/
void compute_inverse_label_map(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
size_t n_leaves,
HDBSCAN::Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<int>& inverse_label_map,
bool allow_single_cluster,
int max_cluster_size,
float cluster_selection_epsilon);

} // namespace HDBSCAN::HELPER
} // END namespace ML
69 changes: 68 additions & 1 deletion cpp/src/hdbscan/detail/extract.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,60 @@ void do_labelling_on_host(const raft::handle_t& handle,
raft::update_device(labels, result.data(), n_leaves, stream);
}

/*
@brief Internal function compute inverse_label_map for CPU/GPU interop
*/
template <typename value_idx, typename value_t>
void _compute_inverse_label_map(const raft::handle_t& handle,
Common::CondensedHierarchy<value_idx, value_t>& condensed_tree,
size_t n_leaves,
Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<value_idx>& inverse_label_map,
bool allow_single_cluster,
value_idx max_cluster_size,
value_t cluster_selection_epsilon)
{
auto stream = handle.get_stream();
rmm::device_uvector<value_t> tree_stabilities(condensed_tree.get_n_clusters(),
handle.get_stream());
Stability::compute_stabilities(handle, condensed_tree, tree_stabilities.data());
rmm::device_uvector<int> is_cluster(condensed_tree.get_n_clusters(), handle.get_stream());

if (max_cluster_size <= 0) max_cluster_size = n_leaves; // negates the max cluster size

Select::select_clusters(handle,
condensed_tree,
tree_stabilities.data(),
is_cluster.data(),
cluster_selection_method,
allow_single_cluster,
max_cluster_size,
cluster_selection_epsilon);

std::vector<int> is_cluster_h(is_cluster.size());
raft::update_host(is_cluster_h.data(), is_cluster.data(), is_cluster_h.size(), stream);
handle.sync_stream(stream);

std::set<value_idx> clusters;
for (std::size_t i = 0; i < is_cluster_h.size(); i++) {
if (is_cluster_h[i] != 0) { clusters.insert(i + n_leaves); }
}

std::vector<value_idx> inverse_label_map_h(clusters.size(), -1);
value_idx i = 0;
// creating inverse index between
// original and final labels
for (const value_idx cluster : clusters) {
inverse_label_map_h[i] = cluster - n_leaves;
i++;
}

// resizing is to n_clusters
inverse_label_map.resize(clusters.size(), stream);
raft::copy(
inverse_label_map.data(), inverse_label_map_h.data(), inverse_label_map_h.size(), stream);
}

/**
* Compute cluster stabilities, perform cluster selection, and
* label the resulting clusters. In addition, probabilities
Expand All @@ -191,8 +245,11 @@ void do_labelling_on_host(const raft::handle_t& handle,
* @param[out] labels array of labels on device (size n_leaves)
* @param[out] stabilities array of stabilities on device (size n_clusters)
* @param[out] probabilities array of probabilities on device (size n_leaves)
* @param[out] label_map array mapping condensed label ids to selected label ids (size n_leaves)
* @param[out] label_map array mapping condensed label ids to selected label ids (size
* n_condensed_trees)
* @param[in] cluster_selection_method method to use for cluster selection
* @param[out] inverse_label_map array mapping final label ids to condensed label ids, used for
* prediction APIs (size n_clusters)
* @param[in] allow_single_cluster allows a single cluster to be returned (rather than just noise)
* @param[in] max_cluster_size maximium number of points that can be considered in a cluster before
* it is split into multiple sub-clusters.
Expand All @@ -208,6 +265,7 @@ value_idx extract_clusters(const raft::handle_t& handle,
value_t* probabilities,
value_idx* label_map,
Common::CLUSTER_SELECTION_METHOD cluster_selection_method,
rmm::device_uvector<value_idx>& inverse_label_map,
bool allow_single_cluster = false,
value_idx max_cluster_size = 0,
value_t cluster_selection_epsilon = 0.0)
Expand Down Expand Up @@ -239,12 +297,21 @@ value_idx extract_clusters(const raft::handle_t& handle,
}

std::vector<value_idx> label_map_h(condensed_tree.get_n_clusters(), -1);
std::vector<value_idx> inverse_label_map_h(clusters.size(), -1);
value_idx i = 0;
// creating forward and inverse index between
// original and final labels
for (const value_idx cluster : clusters) {
label_map_h[cluster - n_leaves] = i;
inverse_label_map_h[i] = cluster - n_leaves;
i++;
}

// resizing is to n_clusters
inverse_label_map.resize(clusters.size(), stream);
raft::copy(
inverse_label_map.data(), inverse_label_map_h.data(), inverse_label_map_h.size(), stream);

raft::copy(label_map, label_map_h.data(), label_map_h.size(), stream);
do_labelling_on_host<value_idx, value_t>(handle,
condensed_tree,
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/hdbscan/detail/reachability.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,34 @@ void compute_knn(const raft::handle_t& handle,
[] __device__(int64_t in) -> value_idx { return in; });
}

/*
@brief Internal function for CPU->GPU interop
to compute core_dists
*/
template <typename value_idx, typename value_t>
void _compute_core_dists(const raft::handle_t& handle,
const value_t* X,
value_t* core_dists,
size_t m,
size_t n,
raft::distance::DistanceType metric,
int min_samples)
{
RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
"Currently only L2 expanded distance is supported");

auto stream = handle.get_stream();

rmm::device_uvector<value_idx> inds(min_samples * m, stream);
rmm::device_uvector<value_t> dists(min_samples * m, stream);

// perform knn
compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);

// Slice core distances (distances to kth nearest neighbor)
core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
}

/**
* Constructs a mutual reachability graph, which is a k-nearest neighbors
* graph projected into mutual reachability space using the following
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/hdbscan/detail/stabilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ void get_stability_scores(const raft::handle_t& handle,
/**
* 1. Populate cluster sizes
*/
rmm::device_uvector<value_idx> cluster_sizes(n_leaves, handle.get_stream());
rmm::device_uvector<value_idx> cluster_sizes(n_condensed_clusters, handle.get_stream());
thrust::fill(exec_policy, cluster_sizes.data(), cluster_sizes.data() + cluster_sizes.size(), 0);

value_idx* sizes = cluster_sizes.data();
Expand Down
Loading