
[ML] Multinomial logistic regression #1037

Merged: 24 commits, Mar 12, 2020.
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -41,6 +41,7 @@

* Add instrumentation to report statistics related to data frame analytics jobs, i.e.
progress, memory usage, etc. (See {ml-pull}906[#906].)
* Multiclass classification. (See {ml-pull}1037[#1037].)

=== Enhancements

142 changes: 131 additions & 11 deletions include/maths/CBoostedTreeLoss.h
@@ -8,8 +8,10 @@
#define INCLUDED_ml_maths_CBoostedTreeLoss_h

#include <maths/CBasicStatistics.h>
#include <maths/CKMeansOnline.h>
#include <maths/CLinearAlgebra.h>
#include <maths/CLinearAlgebraEigen.h>
#include <maths/CPRNG.h>
#include <maths/ImportExport.h>
#include <maths/MathsTypes.h>

@@ -66,9 +68,26 @@ class MATHS_EXPORT CArgMinMseImpl final : public CArgMinLossImpl {

//! \brief Finds the value to add to a set of predicted log-odds which minimises
//! regularised cross entropy loss w.r.t. the actual categories.
class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
//!
//! DESCRIPTION:\n
//! We want to find the weight which minimizes the log-loss, i.e. which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_i{ a_i \log(S(p_i + w)) + (1 - a_i) \log(1 - S(p_i + w)) } }\f$
//! </pre>
//!
//! Rather than working with this function directly, we bucket the predictions
//! \f$p_i\f$ in a first pass over the data and compute the weight which
//! minimizes the approximate function
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda w^2 -\sum_{B}{ c_{1,B} \log(S(\bar{p}_B + w)) + c_{0,B} \log(1 - S(\bar{p}_B + w)) } }\f$
//! </pre>
//!
//! Here, \f$B\f$ ranges over the buckets, \f$\bar{p}_B\f$ denotes the B'th bucket
//! centre and \f$c_{0,B}\f$ and \f$c_{1,B}\f$ denote the counts of actual classes
//! 0 and 1, respectively, in the bucket \f$B\f$.
class MATHS_EXPORT CArgMinBinomialLogisticLossImpl final : public CArgMinLossImpl {
public:
CArgMinLogisticImpl(double lambda);
CArgMinBinomialLogisticLossImpl(double lambda);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
@@ -80,11 +99,13 @@ class MATHS_EXPORT CArgMinLogisticImpl final : public CArgMinLossImpl {
using TDoubleVector2x1 = CVectorNx1<double, 2>;
using TDoubleVector2x1Vec = std::vector<TDoubleVector2x1>;

private:
static constexpr std::size_t NUMBER_BUCKETS = 128;

private:
std::size_t bucket(double prediction) const {
double bucket{(prediction - m_PredictionMinMax.min()) / this->bucketWidth()};
return std::min(static_cast<std::size_t>(bucket),
m_BucketCategoryCounts.size() - 1);
return std::min(static_cast<std::size_t>(bucket), m_BucketsClassCounts.size() - 1);
}

double bucketCentre(std::size_t bucket) const {
@@ -95,15 +116,74 @@
double bucketWidth() const {
return m_PredictionMinMax.initialized()
? m_PredictionMinMax.range() /
static_cast<double>(m_BucketCategoryCounts.size())
static_cast<double>(m_BucketsClassCounts.size())
: 0.0;
}

private:
std::size_t m_CurrentPass = 0;
TMinMaxAccumulator m_PredictionMinMax;
TDoubleVector2x1 m_CategoryCounts;
TDoubleVector2x1Vec m_BucketCategoryCounts;
TDoubleVector2x1 m_ClassCounts;
TDoubleVector2x1Vec m_BucketsClassCounts;
};
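
To make the bucketed approximation concrete, here is a minimal standalone sketch of evaluating and minimizing this objective. It is illustrative only, not the implementation behind this class, and the names bucketedObjective and minimizeObjective are invented; it assumes the bucket centres and per-bucket class counts have already been accumulated in a first pass over the data.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <vector>

// lambda * w^2 - sum_B [ c_{1,B} log(S(pBar_B + w)) + c_{0,B} log(1 - S(pBar_B + w)) ]
double bucketedObjective(double w, double lambda,
                         const std::vector<double>& bucketCentres,
                         const std::vector<std::array<double, 2>>& bucketClassCounts) {
    double loss{lambda * w * w};
    for (std::size_t b = 0; b < bucketCentres.size(); ++b) {
        // Clamp the sigmoid away from {0, 1} so the logs stay finite.
        double p{1.0 / (1.0 + std::exp(-(bucketCentres[b] + w)))};
        p = std::min(std::max(p, 1e-12), 1.0 - 1e-12);
        loss -= bucketClassCounts[b][1] * std::log(p) +
                bucketClassCounts[b][0] * std::log(1.0 - p);
    }
    return loss;
}

// The objective is convex in w, so ternary search converges to the minimizer.
double minimizeObjective(double lambda,
                         const std::vector<double>& bucketCentres,
                         const std::vector<std::array<double, 2>>& bucketClassCounts,
                         double wMin = -10.0, double wMax = 10.0) {
    for (int i = 0; i < 128; ++i) {
        double w1{wMin + (wMax - wMin) / 3.0};
        double w2{wMax - (wMax - wMin) / 3.0};
        if (bucketedObjective(w1, lambda, bucketCentres, bucketClassCounts) <
            bucketedObjective(w2, lambda, bucketCentres, bucketClassCounts)) {
            wMax = w2;
        } else {
            wMin = w1;
        }
    }
    return 0.5 * (wMin + wMax);
}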

//! \brief Finds the value to add to a set of predicted multinomial logits which
//! minimises regularised cross entropy loss w.r.t. the actual classes.
//!
//! DESCRIPTION:\n
//! We want to find the weight which minimizes the log-loss, i.e. which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_i{ \log([softmax(p_i + w)]_{a_i}) } }\f$
//! </pre>
//!
//! Here, \f$a_i\f$ is the index of the i'th example's true class. Rather than
//! working with this function directly, we approximate it by the means and class
//! counts of the predictions in a partition of the original data, i.e. we compute
//! the weight which satisfies
//! <pre class="fragment">
//! \f$\displaystyle arg\min_w{ \lambda \|w\|^2 -\sum_{P}{ \sum_{j}{ c_{j,P} \log([softmax(\bar{p}_P + w)]_j) } } }\f$
//! </pre>
//!
//! Here, \f$P\f$ ranges over the subsets of the partition, \f$\bar{p}_P\f$ denotes
//! the mean of the predictions in the P'th subset and \f$c_{j,P}\f$ denotes the
//! count of class \f$j\f$ in the subset \f$P\f$. We compute this partition by
//! k-means.
class MATHS_EXPORT CArgMinMultinomialLogisticLossImpl final : public CArgMinLossImpl {
public:
using TObjective = std::function<double(const TDoubleVector&)>;
using TObjectiveGradient = std::function<TDoubleVector(const TDoubleVector&)>;

public:
CArgMinMultinomialLogisticLossImpl(std::size_t numberClasses,
double lambda,
const CPRNG::CXorOShiro128Plus& rng);
std::unique_ptr<CArgMinLossImpl> clone() const override;
bool nextPass() override;
void add(const TMemoryMappedFloatVector& prediction, double actual, double weight = 1.0) override;
void merge(const CArgMinLossImpl& other) override;
TDoubleVector value() const override;

// Exposed for unit testing.
TObjective objective() const;
TObjectiveGradient objectiveGradient() const;

private:
using TDoubleVectorVec = std::vector<TDoubleVector>;
using TKMeans = CKMeansOnline<TDoubleVector>;

private:
static constexpr std::size_t NUMBER_CENTRES = 128;
static constexpr std::size_t NUMBER_RESTARTS = 5;

private:
std::size_t m_NumberClasses = 0;
std::size_t m_CurrentPass = 0;
mutable CPRNG::CXorOShiro128Plus m_Rng;
TDoubleVector m_ClassCounts;
TDoubleVector m_DoublePrediction;
TKMeans m_PredictionSketch;
TDoubleVectorVec m_Centres;
TDoubleVectorVec m_CentresClassCounts;
};
}
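
The partitioned multinomial objective can be written down just as directly. The sketch below is illustrative rather than this class's actual objective(); it assumes the k-means pass has already produced the centre means and per-centre class counts, and it uses the standard numerically stable log-softmax.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

using TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;

// log(softmax(z)_j) computed stably as z_j - max(z) - log(sum_k exp(z_k - max(z))).
double logSoftmax(const TDoubleVec& z, std::size_t j) {
    double zMax{*std::max_element(z.begin(), z.end())};
    double sum{0.0};
    for (double zk : z) {
        sum += std::exp(zk - zMax);
    }
    return z[j] - zMax - std::log(sum);
}

// lambda * ||w||^2 - sum_P sum_j c_{j,P} log(softmax(pBar_P + w)_j)
double partitionedObjective(const TDoubleVec& w, double lambda,
                            const TDoubleVecVec& centres,
                            const TDoubleVecVec& centresClassCounts) {
    double loss{lambda * std::inner_product(w.begin(), w.end(), w.begin(), 0.0)};
    TDoubleVec shifted(w.size());
    for (std::size_t p = 0; p < centres.size(); ++p) {
        for (std::size_t j = 0; j < w.size(); ++j) {
            shifted[j] = centres[p][j] + w[j];
        }
        for (std::size_t j = 0; j < w.size(); ++j) {
            loss -= centresClassCounts[p][j] * logSoftmax(shifted, j);
        }
    }
    return loss;
}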

@@ -185,7 +265,8 @@ class MATHS_EXPORT CLoss {
//! Transforms a prediction from the forest to the target space.
virtual TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const = 0;
//! Get an object which computes the leaf value that minimises loss.
virtual CArgMinLoss minimizer(double lambda) const = 0;
virtual CArgMinLoss minimizer(double lambda,
const CPRNG::CXorOShiro128Plus& rng) const = 0;
//! Get the name of the loss function.
virtual const std::string& name() const = 0;

@@ -214,7 +295,7 @@ class MATHS_EXPORT CMse final : public CLoss {
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;
};

@@ -227,11 +308,47 @@ class MATHS_EXPORT CMse final : public CLoss {
//! </pre>
//! where \f$a_i\f$ denotes the actual class of the i'th example, \f$p\f$ is the
//! prediction and \f$S(\cdot)\f$ denotes the logistic function.
class MATHS_EXPORT CBinomialLogistic final : public CLoss {
class MATHS_EXPORT CBinomialLogisticLoss final : public CLoss {
public:
static const std::string NAME;

public:
std::unique_ptr<CLoss> clone() const override;
std::size_t numberParameters() const override;
double value(const TMemoryMappedFloatVector& prediction,
double actual,
double weight = 1.0) const override;
void gradient(const TMemoryMappedFloatVector& prediction,
double actual,
TWriter writer,
double weight = 1.0) const override;
void curvature(const TMemoryMappedFloatVector& prediction,
double actual,
TWriter writer,
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;
};

//! \brief Implements loss for multinomial logistic regression.
//!
//! DESCRIPTION:\n
//! This targets the cross-entropy loss using the forest to predict the class
//! probabilities via the softmax function:
//! <pre class="fragment">
//! \f$\displaystyle l_i(p) = -\sum_j{ a_{ij} \log([\sigma(p)]_j) }\f$
//! </pre>
//! where \f$a_{ij}\f$ is one if \f$j\f$ is the actual class of the i'th example
//! and zero otherwise, \f$p\f$ denotes the vector valued prediction and
//! \f$\sigma(p)\f$ is the softmax function, i.e.
//! \f$[\sigma(p)]_j = \frac{e^{p_j}}{\sum_k e^{p_k}}\f$.
class MATHS_EXPORT CMultinomialLogisticLoss final : public CLoss {
public:
static const std::string NAME;

public:
CMultinomialLogisticLoss(std::size_t numberClasses);
std::unique_ptr<CLoss> clone() const override;
std::size_t numberParameters() const override;
double value(const TMemoryMappedFloatVector& prediction,
@@ -247,8 +364,11 @@ class MATHS_EXPORT CBinomialLogistic final : public CLoss {
double weight = 1.0) const override;
bool isCurvatureConstant() const override;
TDoubleVector transform(const TMemoryMappedFloatVector& prediction) const override;
CArgMinLoss minimizer(double lambda) const override;
CArgMinLoss minimizer(double lambda, const CPRNG::CXorOShiro128Plus& rng) const override;
const std::string& name() const override;

private:
std::size_t m_NumberClasses;
};
}
}
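
For a single example the value and gradient of this loss have the familiar closed forms: the loss is \f$-\log([\sigma(p)]_a)\f$ for true class \f$a\f$ and its derivative w.r.t. the j'th logit is \f$[\sigma(p)]_j - 1\{j = a\}\f$. The following is a self-contained illustrative sketch, not the class's value()/gradient() interface, which instead writes per-logit derivatives through a TWriter.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Returns -log(softmax(p)_actual) and fills 'gradient' with
// softmax(p)_j - 1{j == actual}, the derivative w.r.t. each logit p_j.
double multinomialLogLoss(const std::vector<double>& p, std::size_t actual,
                          std::vector<double>& gradient) {
    double pMax{*std::max_element(p.begin(), p.end())};
    double z{0.0};
    for (double pj : p) {
        z += std::exp(pj - pMax);
    }
    gradient.resize(p.size());
    for (std::size_t j = 0; j < p.size(); ++j) {
        gradient[j] = std::exp(p[j] - pMax) / z - (j == actual ? 1.0 : 0.0);
    }
    return -(p[actual] - pMax - std::log(z));
}
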
22 changes: 12 additions & 10 deletions include/maths/CKMeans.h
@@ -21,6 +21,7 @@
#include <boost/iterator/counting_iterator.hpp>

#include <cstddef>
#include <cstdint>
#include <sstream>
#include <utility>
#include <vector>
@@ -125,15 +126,15 @@ class CKMeans {
const TPointVec& points() const { return m_Points; }

//! Get the cluster checksum.
uint64_t checksum() const { return m_Checksum; }
std::uint64_t checksum() const { return m_Checksum; }

private:
//! The centroid of the points in this cluster.
POINT m_Centre;
//! The points in the cluster.
TPointVec m_Points;
//! A checksum for the points in the cluster.
uint64_t m_Checksum;
std::uint64_t m_Checksum;
};

using TClusterVec = std::vector<CCluster>;
@@ -183,8 +184,9 @@ class CKMeans {
if (m_Centres.empty()) {
return true;
}
TMeanAccumulatorVec newCentres;
for (std::size_t i = 0u; i < maxIterations; ++i) {
if (!this->updateCentres()) {
if (!this->updateCentres(newCentres)) {
return true;
}
}
@@ -481,19 +483,19 @@ class CKMeans {

private:
//! Single iteration of Lloyd's algorithm to update \p centres.
bool updateCentres() {
const TCoordinate precision = TCoordinate(5) *
std::numeric_limits<TCoordinate>::epsilon();
TMeanAccumulatorVec newCentres(m_Centres.size(),
TMeanAccumulator(las::zero(m_Centres[0])));
bool updateCentres(TMeanAccumulatorVec& newCentres) {
const TCoordinate precision{TCoordinate(5) *
std::numeric_limits<TCoordinate>::epsilon()};
newCentres.assign(m_Centres.size(), TMeanAccumulator(las::zero(m_Centres[0])));
CCentroidComputer computer(m_Centres, newCentres);
m_Points.preorderDepthFirst(computer);
bool changed = false;
POINT newCentre;
for (std::size_t i = 0u; i < newCentres.size(); ++i) {
POINT newCentre(CBasicStatistics::mean(newCentres[i]));
newCentre = CBasicStatistics::mean(newCentres[i]);
if (las::distance(m_Centres[i], newCentre) >
precision * las::norm(m_Centres[i])) {
m_Centres[i] = newCentre;
las::swap(m_Centres[i], newCentre);
changed = true;
}
}
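
A small but deliberate detail of this change: updateCentres now receives the accumulator vector by reference, so run() allocates newCentres once and reuses it across Lloyd iterations rather than reallocating on every pass. A generic sketch of the same scratch-buffer pattern, with illustrative types in place of this library's POINT and TMeanAccumulator, is:

#include <array>
#include <cstddef>
#include <vector>

using TPoint = std::array<double, 2>;
using TPointVec = std::vector<TPoint>;

double squareDistance(const TPoint& x, const TPoint& y) {
    double dx{x[0] - y[0]};
    double dy{x[1] - y[1]};
    return dx * dx + dy * dy;
}

// One Lloyd iteration: assign each point to its nearest centre and recompute
// the centre means. 'sums' and 'counts' are caller-owned scratch buffers so
// repeated calls reuse their capacity. Returns true if any centre moved.
bool updateCentres(const TPointVec& points, TPointVec& centres,
                   TPointVec& sums, std::vector<double>& counts) {
    sums.assign(centres.size(), TPoint{{0.0, 0.0}});
    counts.assign(centres.size(), 0.0);
    for (const auto& x : points) {
        std::size_t best{0};
        for (std::size_t i = 1; i < centres.size(); ++i) {
            if (squareDistance(x, centres[i]) < squareDistance(x, centres[best])) {
                best = i;
            }
        }
        sums[best][0] += x[0];
        sums[best][1] += x[1];
        counts[best] += 1.0;
    }
    bool changed{false};
    for (std::size_t i = 0; i < centres.size(); ++i) {
        if (counts[i] > 0.0) {
            TPoint newCentre{{sums[i][0] / counts[i], sums[i][1] / counts[i]}};
            if (squareDistance(centres[i], newCentre) > 1e-24) {
                centres[i] = newCentre;
                changed = true;
            }
        }
    }
    return changed;
}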