
[ML] Multinomial logistic regression #1037

Merged: 24 commits, Mar 12, 2020.
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -41,6 +41,7 @@

* Add instrumentation to report statistics related to data frame analytics jobs, i.e.
progress, memory usage, etc. (See {ml-pull}906[#906].)
* Multiclass classification. (See {ml-pull}1037[#1037].)

=== Enhancements

142 changes: 131 additions & 11 deletions include/maths/CBoostedTreeLoss.h
@@ -8,8 +8,10 @@
#define INCLUDED_ml_maths_CBoostedTreeLoss_h

#include <maths/CBasicStatistics.h>
#include <maths/CKMeansOnline.h>
#include <maths/CLinearAlgebra.h>
#include <maths/CLinearAlgebraEigen.h>
#include <maths/CPRNG.h>
#include <maths/ImportExport.h>
#include <maths/MathsTypes.h>

@@ -66,9 +68,26 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {

//! \brief Finds the value to add to a set of predicted log-odds which minimises
//! regularised cross entropy loss w.r.t. the actual categories.
class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
//!
//! DESCRIPTION:\n
//! We want to find the weight which minimizes the log-loss, i.e. which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_i{ a_i \log(S(p_i + w)) + (1 - a_i) \log(1 - S(p_i + w)) } }\f$
//! </pre>
//!
//! Rather than working with this function directly, we bucket the predictions
//! \f$p_i\f$ in a first pass over the data and compute the weight which
//! minimizes the approximate function
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_{B}{ c_{1,B} \log(S(\bar{p}_B + w)) + c_{0,B} \log(1 - S(\bar{p}_B + w)) } }\f$
//! </pre>
//!
//! Here, \f$B\f$ ranges over the buckets, \f$\bar{p}_B\f$ denotes the B'th bucket
//! centre and \f$c_{0,B}\f$ and \f$c_{1,B}\f$ denote the counts of actual classes
//! 0 and 1, respectively, in the bucket \f$B\f$.
class MATHS_EXPORT CArgMinBinomialLogisticLossImpl final : public CArgMinLossImpl {
public:
CArgMinLogisticImpl(double lambda);
CArgMinBinomialLogisticLossImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
@@ -80,11 +99,13 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
using TDoubleVector2x1 = CVectorNx1<double, 2>;
using TDoubleVector2x1Vec = std::vector<TDoubleVector2x1>;

private:
static constexpr std::size_t NUMBER_BUCKETS = 128;

private:
std::size_t bucket(double prediction) const {
double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
return std::min(static_cast<std::size_t>(bucket),
m_BucketCategoryCounts.size() - 1);
return std::min(static_cast<std::size_t>(bucket), m_BucketsClassCounts.size() - 1);
}

double bucketCentre(std::size_t bucket) const {
@@ -95,15 +116,74 @@
double bucketWidth() const {
return m_PredictionMinMax.initialized()
? m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size())
static_cast<double>(m_BucketsClassCounts.size())
: 0.0;
}

private:
std::size_t m_CurrentPass = 0;
TMinMaxAccumulator m_PredictionMinMax;
TDoubleVector2x1 m_CategoryCounts;
TDoubleVector2x1Vec m_BucketCategoryCounts;
TDoubleVector2x1 m_ClassCounts;
TDoubleVector2x1Vec m_BucketsClassCounts;
};
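
To make the bucketed approximation concrete, here is a minimal standalone sketch of evaluating and minimizing this objective. It is illustrative only, not the implementation behind this class, and the names bucketedObjective and minimizeObjective are invented; it assumes the bucket centres and per-bucket class counts have already been accumulated in a first pass over the data.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <vector>

// lambda * w^2 - sum_B [ c_{1,B} log(S(pBar_B + w)) + c_{0,B} log(1 - S(pBar_B + w)) ]
double bucketedObjective(double w, double lambda,
                         const std::vector<double>& bucketCentres,
                         const std::vector<std::array<double, 2>>& bucketClassCounts) {
    double loss{lambda * w * w};
    for (std::size_t b = 0; b < bucketCentres.size(); ++b) {
        // Clamp the sigmoid away from {0, 1} so the logs stay finite.
        double p{1.0 / (1.0 + std::exp(-(bucketCentres[b] + w)))};
        p = std::min(std::max(p, 1e-12), 1.0 - 1e-12);
        loss -= bucketClassCounts[b][1] * std::log(p) +
                bucketClassCounts[b][0] * std::log(1.0 - p);
    }
    return loss;
}

// The objective is convex in w, so ternary search converges to the minimizer.
double minimizeObjective(double lambda,
                         const std::vector<double>& bucketCentres,
                         const std::vector<std::array<double, 2>>& bucketClassCounts,
                         double wMin = -10.0, double wMax = 10.0) {
    for (int i = 0; i < 128; ++i) {
        double w1{wMin + (wMax - wMin) / 3.0};
        double w2{wMax - (wMax - wMin) / 3.0};
        if (bucketedObjective(w1, lambda, bucketCentres, bucketClassCounts) <
            bucketedObjective(w2, lambda, bucketCentres, bucketClassCounts)) {
            wMax = w2;
        } else {
            wMin = w1;
        }
    }
    return 0.5 * (wMin + wMax);
}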

//! \brief Finds the value to add to a set of predicted multinomial logits which
//! minimises regularised cross entropy loss w.r.t. the actual classes.
//!
//! DESCRIPTION:\n
//! We want to find the weight which minimizes the log-loss, i.e. which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_i{ \log([softmax(p_i + w)]_{a_i}) } }\f$
//! </pre>
//!
//! Here, \f$a_i\f$ is the index of the i'th example's true class. Rather than
//! working with this function directly, we approximate it by the means and class
//! counts of the predictions in a partition of the original data, i.e. we compute
//! the weight which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_{P}{ \sum_{j}{ c_{j,P} \log([softmax(\bar{p}_P + w)]_j) } } }\f$
//! </pre>
//!
//! Here, \f$P\f$ ranges over the subsets of the partition, \f$\bar{p}_P\f$ denotes
//! the mean of the predictions in the P'th subset and \f$c_{j,P}\f$ denotes the
//! count of class \f$j\f$ in the subset \f$P\f$. We compute this partition by
//! k-means.
class MATHS_EXPORT CArgMinMultinomialLogisticLossImpl final : public CArgMinLossImpl {
public:
using TObjective = std::function<double(const TDoubleVector&)>;
using TObjectiveGradient = std::function<TDoubleVector(const TDoubleVector&)>;

public:
CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses,
double lambda,
const CPRNG::CXorOShiro128Plus& rng);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
void merge(const CArgMinLossImpl& other) override;
TDoubleVector value() const override;

// Exposed for unit testing.
TObjective objective() const;
TObjectiveGradient objectiveGradient() const;

private:
using TDoubleVectorVec = std::vector<TDoubleVector>;
using TKMeans = CKMeansOnline<TDoubleVector>;

private:
static constexpr std::size_t NUMBER_CENTRES = 128;
static constexpr std::size_t NUMBER_RESTARTS = 5;

private:
std::size_t m_NumberClasses = 0;
std::size_t m_CurrentPass = 0;
mutable CPRNG::CXorOShiro128Plus m_Rng;
TDoubleVector m_ClassCounts;
TDoubleVector m_DoublePrediction;
TKMeans m_PredictionSketch;
TDoubleVectorVec m_Centres;
TDoubleVectorVec m_CentresClassCounts;
};
}
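
The partitioned multinomial objective can be written down just as directly. The sketch below is illustrative rather than this class's actual objective(); it assumes the k-means pass has already produced the centre means and per-centre class counts, and it uses the standard numerically stable log-softmax.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;

// log(softmax(z)_j) computed stably as z_j - max(z) - log(sum_k exp(z_k - max(z))).
double logSoftmax(const TDoubleVec& z, std::size_t j) {
    double zMax{*std::max_element(z.begin(), z.end())};
    double sum{0.0};
    for (double zk : z) {
        sum += std::exp(zk - zMax);
    }
    return z[j] - zMax - std::log(sum);
}

// lambda * ||w||^2 - sum_P sum_j c_{j,P} log(softmax(pBar_P + w)_j)
double partitionedObjective(const TDoubleVec& w, double lambda,
                            const TDoubleVecVec& centres,
                            const TDoubleVecVec& centresClassCounts) {
    double loss{lambda * std::inner_product(w.begin(), w.end(), w.begin(), 0.0)};
    TDoubleVec shifted(w.size());
    for (std::size_t p = 0; p < centres.size(); ++p) {
        for (std::size_t j = 0; j < w.size(); ++j) {
            shifted[j] = centres[p][j] + w[j];
        }
        for (std::size_t j = 0; j < w.size(); ++j) {
            loss -= centresClassCounts[p][j] * logSoftmax(shifted, j);
        }
    }
    return loss;
}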

@@ -185,7 +265,8 @@ class MATHS_EXPORT CLoss {
//! Transforms a prediction from the forest to the target space.
virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0;
//! Get an object which computes the leaf value that minimises loss.
virtual CArgMinLoss minimizer(double lambda) const = 0;
virtual CArgMinLoss minimizer(double lambda,
const CPRNG::CXorOShiro128Plus& rng) const = 0;
//! Get the name of the loss function.
virtual const std::string& name() const = 0;

@@ -214,7 +295,7 @@ class MATHS_EXPORT CMse final : public CLoss {
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;
};

@@ -227,11 +308,47 @@ class MATHS_EXPORT CMse final : public CLoss {
//! </pre>
//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
class MATHS_EXPORT CBinomialLogistic final : public CLoss {
class MATHS_EXPORT CBinomialLogisticLoss final : public CLoss {
public:
static const std::string NAME;

public:
std::unique_ptr<CLoss> clone() const override;
std::size_t numberParameters() const override;
double value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight = 1.0) const override;
void gradient(const TMemoryMappedFloatVector& prediction,
double actual,
TWriter writer,
double weight = 1.0) const override;
void curvature(const TMemoryMappedFloatVector& prediction,
double actual,
TWriter writer,
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;
};

//! \brief Implements loss for multinomial logistic regression.
//!
//! DESCRIPTION:\n
//! This targets the cross-entropy loss using the forest to predict the class
//! probabilities via the softmax function:
//! <pre class="fragment">
//! \f$\displaystyle l_i(p) = -\sum_j{ a_{ij} \log([\sigma(p)]_j) }\f$
//! </pre>
//! where \f$a_{ij}\f$ is one if \f$j\f$ is the actual class of the i'th example
//! and zero otherwise, \f$p\f$ denotes the vector valued prediction and
//! \f$\sigma(p)\f$ is the softmax function, i.e.
//! \f$[\sigma(p)]_j = \frac{e^{p_j}}{\sum_k e^{p_k}}\f$.
class MATHS_EXPORT CMultinomialLogisticLoss final : public CLoss {
public:
static const std::string NAME;

public:
CMultinomialLogisticLoss(std::size_t numberClasses);
std::unique_ptr<CLoss> clone() const override;
std::size_t numberParameters() const override;
double value(const TMemoryMappedFloatVector& prediction,
@@ -247,8 +364,11 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss {
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;

private:
std::size_t m_NumberClasses;
};
}
}
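
For a single example the value and gradient of this loss have the familiar closed forms: the loss is \f$-\log([\sigma(p)]_a)\f$ for true class \f$a\f$ and its derivative w.r.t. the j'th logit is \f$[\sigma(p)]_j - 1\{j = a\}\f$. The following is a self-contained illustrative sketch, not the class's value()/gradient() interface, which instead writes per-logit derivatives through a TWriter.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Returns -log(softmax(p)_actual) and fills 'gradient' with
// softmax(p)_j - 1{j == actual}, the derivative w.r.t. each logit p_j.
double multinomialLogLoss(const std::vector<double>& p, std::size_t actual,
                          std::vector<double>& gradient) {
    double pMax{*std::max_element(p.begin(), p.end())};
    double z{0.0};
    for (double pj : p) {
        z += std::exp(pj - pMax);
    }
    gradient.resize(p.size());
    for (std::size_t j = 0; j < p.size(); ++j) {
        gradient[j] = std::exp(p[j] - pMax) / z - (j == actual ? 1.0 : 0.0);
    }
    return -(p[actual] - pMax - std::log(z));
}
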
22 changes: 12 additions & 10 deletions include/maths/CKMeans.h
@@ -21,6 +21,7 @@
#include <boost/iterator/counting_iterator.hpp>

#include <cstddef>
#include <cstdint>
#include <sstream>
#include <utility>
#include <vector>
@@ -125,15 +126,15 @@ class CKMeans {
const TPointVec& points() const { return m_Points; }

//! Get the cluster checksum.
uint64_t checksum() const { return m_Checksum; }
std::uint64_t checksum() const { return m_Checksum; }

private:
//! The centroid of the points in this cluster.
POINT m_Centre;
//! The points in the cluster.
TPointVec m_Points;
//! A checksum for the points in the cluster.
uint64_t m_Checksum;
std::uint64_t m_Checksum;
};

using TClusterVec = std::vector<CCluster>;
@@ -183,8 +184,9 @@ class CKMeans {
if (m_Centres.empty()) {
return true;
}
TMeanAccumulatorVec newCentres;
for (std::size_t i = 0u; i < maxIterations; ++i) {
if (!this->updateCentres()) {
if (!this->updateCentres(newCentres)) {
return true;
}
}
@@ -481,19 +483,19 @@ class CKMeans {

private:
//! Single iteration of Lloyd's algorithm to update \p centres.
bool updateCentres() {
const TCoordinate precision = TCoordinate(5) *
std::numeric_limits<TCoordinate>::epsilon();
TMeanAccumulatorVec newCentres(m_Centres.size(),
TMeanAccumulator(las::zero(m_Centres[0])));
bool updateCentres(TMeanAccumulatorVec& newCentres) {
const TCoordinate precision{TCoordinate(5) *
std::numeric_limits<TCoordinate>::epsilon()};
newCentres.assign(m_Centres.size(), TMeanAccumulator(las::zero(m_Centres[0])));
CCentroidComputer computer(m_Centres, newCentres);
m_Points.preorderDepthFirst(computer);
bool changed = false;
POINT newCentre;
for (std::size_t i = 0u; i < newCentres.size(); ++i) {
POINT newCentre(CBasicStatistics::mean(newCentres[i]));
newCentre = CBasicStatistics::mean(newCentres[i]);
if (las::distance(m_Centres[i], newCentre) >
precision * las::norm(m_Centres[i])) {
m_Centres[i] = newCentre;
las::swap(m_Centres[i], newCentre);
changed = true;
}
}
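
A small but deliberate detail of this change: updateCentres now receives the accumulator vector by reference, so run() allocates newCentres once and reuses it across Lloyd iterations rather than reallocating on every pass. A generic sketch of the same scratch-buffer pattern, with illustrative types in place of this library's POINT and TMeanAccumulator, is:

#include <array>
#include <cstddef>
#include <vector>

using TPoint = std::array<double, 2>;
using TPointVec = std::vector<TPoint>;

double squareDistance(const TPoint& x, const TPoint& y) {
    double dx{x[0] - y[0]};
    double dy{x[1] - y[1]};
    return dx * dx + dy * dy;
}

// One Lloyd iteration: assign each point to its nearest centre and recompute
// the centre means. 'sums' and 'counts' are caller-owned scratch buffers so
// repeated calls reuse their capacity. Returns true if any centre moved.
bool updateCentres(const TPointVec& points, TPointVec& centres,
                   TPointVec& sums, std::vector<double>& counts) {
    sums.assign(centres.size(), TPoint{{0.0, 0.0}});
    counts.assign(centres.size(), 0.0);
    for (const auto& x : points) {
        std::size_t best{0};
        for (std::size_t i = 1; i < centres.size(); ++i) {
            if (squareDistance(x, centres[i]) < squareDistance(x, centres[best])) {
                best = i;
            }
        }
        sums[best][0] += x[0];
        sums[best][1] += x[1];
        counts[best] += 1.0;
    }
    bool changed{false};
    for (std::size_t i = 0; i < centres.size(); ++i) {
        if (counts[i] > 0.0) {
            TPoint newCentre{{sums[i][0] / counts[i], sums[i][1] / counts[i]}};
            if (squareDistance(centres[i], newCentre) > 1e-24) {
                centres[i] = newCentre;
                changed = true;
            }
        }
    }
    return changed;
}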