flucoma · tremblap · May 17, 2022 · May 16, 2022 · May 16, 2022
diff --git a/include/algorithms/public/Normalization.hpp b/include/algorithms/public/Normalization.hpp
@@ -10,7 +10,7 @@ under the European Union’s Horizon 2020 research and innovation programme
 
 #pragma once
 
-#include "../util/AlgorithmUtils.hpp"
+#include "../util/ScalerUtils.hpp"
 #include "../util/FluidEigenMappings.hpp"
 #include "../../data/TensorTypes.hpp"
 #include <Eigen/Core>
@@ -32,10 +32,13 @@ class Normalization
     using namespace _impl;
     mMin = min;
     mMax = max;
+    mRange = mMax - mMin;
+    handleZerosInScale(mRange);
     ArrayXXd input = asEigen<Array>(in);
     mDataMin = input.colwise().minCoeff();
     mDataMax = input.colwise().maxCoeff();
     mDataRange = mDataMax - mDataMin;
+    handleZerosInScale(mDataRange);
     mInitialized = true;
   }
 
@@ -49,7 +52,7 @@ class Normalization
     mDataMin = asEigen<Array>(dataMin);
     mDataMax = asEigen<Array>(dataMax);
     mDataRange = mDataMax - mDataMin;
-    mDataRange = mDataRange.max(epsilon);
+    handleZerosInScale(mDataRange);
     mInitialized = true;
   }
 
@@ -62,12 +65,12 @@ class Normalization
     ArrayXd result;
     if (!inverse)
     {
-      result = (input - mDataMin) / mDataRange.max(epsilon);
-      result = mMin + (result * (mMax - mMin));
+      result = (input - mDataMin) / mDataRange;
+      result = mMin + (result * mRange);
     }
     else
     {
-      result = (input - mMin) / std::max((mMax - mMin), epsilon);
+      result = (input - mMin) / mRange;
       result = mDataMin + (result * mDataRange);
     }
     out <<= asFluid(result);
@@ -83,21 +86,29 @@ class Normalization
     if (!inverse)
     {
       result = (input.rowwise() - mDataMin.transpose());
-      result = result.rowwise() / mDataRange.transpose().max(epsilon);
-      result = mMin + (result * (mMax - mMin));
+      result = result.rowwise() / mDataRange.transpose();
+      result = mMin + (result * mRange);
     }
     else
     {
       result = input - mMin;
-      result = result / std::max((mMax - mMin), epsilon);
+      result = result / mRange;
       result = (result.rowwise() * mDataRange.transpose());
       result = (result.rowwise() + mDataMin.transpose());
     }
     out <<= asFluid(result);
   }
 
-  void setMin(double min) { mMin = min; }
-  void setMax(double max) { mMax = max; }
+  void setMin(double min) { // Sets the lower bound of the output range and refreshes the cached span mRange.
+    mMin = min; 
+    mRange = mMax - mMin; // mRange is the factor the scaling code multiplies / divides by
+    handleZerosInScale(mRange); // a span below 10 * epsilon is replaced by 1; NOTE(review): this includes a negative span (max < min) - confirm inverted ranges are not meant to be supported
+  }
+  void setMax(double max) { // Sets the upper bound of the output range and refreshes the cached span mRange.
+    mMax = max;
+    mRange = mMax - mMin; // mRange is the factor the scaling code multiplies / divides by
+    handleZerosInScale(mRange); // a span below 10 * epsilon is replaced by 1; NOTE(review): this includes a negative span (max < min) - confirm inverted ranges are not meant to be supported
+  }
   bool initialized() const { return mInitialized; }
 
   double getMin() const { return mMin; }
@@ -130,6 +141,7 @@ class Normalization
 
   double  mMin{0.0};
   double  mMax{1.0};
+  double  mRange{1.0};
   ArrayXd mDataMin;
   ArrayXd mDataMax;
   ArrayXd mDataRange;

diff --git a/include/algorithms/public/RobustScaling.hpp b/include/algorithms/public/RobustScaling.hpp
@@ -11,6 +11,7 @@ under the European Union’s Horizon 2020 research and innovation programme
 // modified version of Normalization.hpp code
 #pragma once
 
+#include "../util/ScalerUtils.hpp"
 #include "../util/FluidEigenMappings.hpp"
 #include "../../data/TensorTypes.hpp"
 #include <Eigen/Core>
@@ -30,7 +31,6 @@ class RobustScaling
   {
     using namespace Eigen;
     using namespace _impl;
-    const double epsilon = std::numeric_limits<double>::epsilon();
     mLow = low;
     mHigh = high;
     ArrayXXd input = asEigen<Array>(in);
@@ -48,7 +48,7 @@ class RobustScaling
       mDataHigh(i) = sorted(lrint((mHigh / 100.0) * (length - 1)));
     }
     mRange = mDataHigh - mDataLow;
-    mRange = mRange.max(epsilon);
+    handleZerosInScale(mRange);
     mInitialized = true;
   }
 
@@ -58,15 +58,13 @@ class RobustScaling
   {
     using namespace Eigen;
     using namespace _impl;
-    const double epsilon = std::numeric_limits<double>::epsilon();
     mLow = low;
     mHigh = high;
     mDataLow = asEigen<Array>(dataLow);
     mDataHigh = asEigen<Array>(dataHigh);
     mMedian = asEigen<Array>(median);
     mRange = asEigen<Array>(range);
-    mRange =
-        mRange.max(epsilon); // in case it is imported from the outside world
+    handleZerosInScale(mRange); // in case it is imported from the outside world
     mInitialized = true;
   }
 

diff --git a/include/algorithms/public/Standardization.hpp b/include/algorithms/public/Standardization.hpp
@@ -10,7 +10,7 @@ under the European Union’s Horizon 2020 research and innovation programme
 
 #pragma once
 
-#include "../util/AlgorithmUtils.hpp"
+#include "../util/ScalerUtils.hpp"
 #include "../util/FluidEigenMappings.hpp"
 #include "../../data/TensorTypes.hpp"
 #include <Eigen/Core>
@@ -34,6 +34,7 @@ class Standardization
     mMean = input.colwise().mean();
     mStd = ((input.rowwise() - mMean.transpose()).square().colwise().mean())
                .sqrt();
+    handleZerosInScale(mStd);
     mInitialized = true;
   }
 
@@ -43,6 +44,7 @@ class Standardization
     using namespace _impl;
     mMean = asEigen<Array>(mean);
     mStd = asEigen<Array>(std);
+    handleZerosInScale(mStd);
     mInitialized = true;
   }
 
@@ -53,7 +55,7 @@ class Standardization
     using namespace _impl;
     ArrayXd input = asEigen<Array>(in);
     ArrayXd result;
-    if (!inverse) { result = (input - mMean) / mStd.max(epsilon); }
+    if (!inverse) { result = (input - mMean) / mStd; }
     else
     {
       result = (input * mStd) + mMean;
@@ -72,7 +74,7 @@ class Standardization
     if (!inverse)
     {
       result = (input.rowwise() - mMean.transpose());
-      result = result.rowwise() / mStd.transpose().max(epsilon);
+      result = result.rowwise() / mStd.transpose();
     }
     else
     {

diff --git a/include/algorithms/util/ScalerUtils.hpp b/include/algorithms/util/ScalerUtils.hpp
@@ -0,0 +1,28 @@
+/*
+Part of the Fluid Corpus Manipulation Project (http://www.flucoma.org/)
+Copyright 2017-2019 University of Huddersfield.
+Licensed under the BSD-3 License.
+See license.md file in the project root for full license information.
+This project has received funding from the European Research Council (ERC)
+under the European Union’s Horizon 2020 research and innovation programme
+(grant agreement No 725899).
+*/
+
+#pragma once
+
+#include <Eigen/Core>
+#include "../util/AlgorithmUtils.hpp"
+
+// In scalers, a range that is zero (or nearly zero) is unmanageable as a denominator.
+// To sanitize it, any range smaller than an arbitrary threshold of 10 * epsilon is replaced by 1.
+// This is in line with scikit-learn's behaviour (https://github.com/scikit-learn/scikit-learn/blob/16625450b58f555dc3955d223f0c3b64a5686984/sklearn/preprocessing/_data.py#L88-L118)
+
+inline void handleZerosInScale(Eigen::ArrayXd& rangeArray) // sanitizes rangeArray in place; `inline` because this header-defined function would otherwise be multiply defined across translation units
+{
+  rangeArray = (rangeArray < 10 * fluid::algorithm::epsilon).select(1,rangeArray); // entries below 10 * epsilon (negative ones included) become 1, all others are kept
+}
+
+inline void handleZerosInScale(double& range) // scalar overload, sanitizes range in place; `inline` because this header-defined function would otherwise be multiply defined across translation units
+{
+  range = (range < (10 * fluid::algorithm::epsilon)) ? 1 : range; // a value below 10 * epsilon (negative ones included) becomes 1, otherwise it is left unchanged
+}