Skip to content

Commit

Permalink
SIMD style exp for doubles
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Sep 26, 2024
1 parent db714e3 commit 05e3b26
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 6 deletions.
53 changes: 49 additions & 4 deletions shared/libebm/compute/cpu_ebm/cpu_64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ struct Cpu_64_Int final {

inline Cpu_64_Int operator+(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data + other.m_data); }

inline Cpu_64_Int operator-(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data - other.m_data); }

inline Cpu_64_Int operator*(const T& other) const noexcept { return Cpu_64_Int(m_data * other); }

inline Cpu_64_Int operator>>(int shift) const noexcept { return Cpu_64_Int(m_data >> shift); }
Expand All @@ -82,13 +84,28 @@ struct Cpu_64_Int final {

inline Cpu_64_Int operator&(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data & other.m_data); }

inline Cpu_64_Int operator|(const Cpu_64_Int& other) const noexcept { return Cpu_64_Int(m_data | other.m_data); }

friend inline Cpu_64_Int IfThenElse(const bool cmp, const Cpu_64_Int& trueVal, const Cpu_64_Int& falseVal) noexcept {
return cmp ? trueVal : falseVal;
}

private:
TPack m_data;
};
static_assert(std::is_standard_layout<Cpu_64_Int>::value && std::is_trivially_copyable<Cpu_64_Int>::value,
"This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed");

template<bool bNegateInput = false,
bool bNaNPossible = true,
bool bUnderflowPossible = true,
bool bOverflowPossible = true>
inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept;

struct Cpu_64_Float final {
template<bool bNegateInput, bool bNaNPossible, bool bUnderflowPossible, bool bOverflowPossible>
friend Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept;

using T = double;
using TPack = double;
using TInt = Cpu_64_Int;
Expand All @@ -106,6 +123,8 @@ struct Cpu_64_Float final {
inline Cpu_64_Float(const double val) noexcept : m_data(static_cast<T>(val)) {}
inline Cpu_64_Float(const float val) noexcept : m_data(static_cast<T>(val)) {}
inline Cpu_64_Float(const int val) noexcept : m_data(static_cast<T>(val)) {}
inline Cpu_64_Float(const int64_t val) noexcept : m_data(static_cast<T>(val)) {}
explicit Cpu_64_Float(const Cpu_64_Int& val) : m_data(static_cast<T>(val.m_data)) {}

inline Cpu_64_Float operator+() const noexcept { return *this; }

Expand Down Expand Up @@ -179,6 +198,10 @@ struct Cpu_64_Float final {
return Cpu_64_Float(val) / other;
}

friend inline bool operator<=(const Cpu_64_Float& left, const Cpu_64_Float& right) noexcept {
return left.m_data <= right.m_data;
}

inline static Cpu_64_Float Load(const T* const a) noexcept { return Cpu_64_Float(*a); }

inline void Store(T* const a) const noexcept { *a = m_data; }
Expand Down Expand Up @@ -207,6 +230,11 @@ struct Cpu_64_Float final {
return cmp1.m_data < cmp2.m_data ? trueVal : falseVal;
}

friend inline Cpu_64_Float IfThenElse(
const bool cmp, const Cpu_64_Float& trueVal, const Cpu_64_Float& falseVal) noexcept {
return cmp ? trueVal : falseVal;
}

friend inline Cpu_64_Float IfEqual(const Cpu_64_Float& cmp1,
const Cpu_64_Float& cmp2,
const Cpu_64_Float& trueVal,
Expand All @@ -226,10 +254,24 @@ struct Cpu_64_Float final {
return cmp1.m_data == cmp2.m_data ? trueVal : falseVal;
}

friend inline Cpu_64_Float Abs(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::abs(val.m_data)); }
static inline bool ReinterpretInt(const bool val) noexcept { return val; }

static inline Cpu_64_Int ReinterpretInt(const Cpu_64_Float& val) noexcept {
typename Cpu_64_Int::T mem;
memcpy(&mem, &val.m_data, sizeof(T));
return Cpu_64_Int(mem);
}

static inline Cpu_64_Float ReinterpretFloat(const Cpu_64_Int& val) noexcept {
T mem;
memcpy(&mem, &val.m_data, sizeof(T));
return Cpu_64_Float(mem);
}

friend inline Cpu_64_Float Round(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::round(val.m_data)); }

friend inline Cpu_64_Float Abs(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::abs(val.m_data)); }

friend inline Cpu_64_Float FastApproxReciprocal(const Cpu_64_Float& val) noexcept {
return Cpu_64_Float(T{1.0} / val.m_data);
}
Expand All @@ -250,8 +292,6 @@ struct Cpu_64_Float final {

friend inline Cpu_64_Float Sqrt(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::sqrt(val.m_data)); }

friend inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::exp(val.m_data)); }

friend inline Cpu_64_Float Log(const Cpu_64_Float& val) noexcept { return Cpu_64_Float(std::log(val.m_data)); }

template<bool bDisableApprox,
Expand All @@ -264,7 +304,7 @@ struct Cpu_64_Float final {
static inline Cpu_64_Float ApproxExp(const Cpu_64_Float& val,
const int32_t addExpSchraudolphTerm = k_expTermZeroMeanErrorForSoftmaxWithZeroedLogit) noexcept {
UNUSED(addExpSchraudolphTerm);
return Exp(bNegateInput ? -val : val);
return Exp<bNegateInput, bNaNPossible, bUnderflowPossible, bOverflowPossible>(val);
}

template<bool bDisableApprox,
Expand Down Expand Up @@ -354,6 +394,11 @@ struct Cpu_64_Float final {
static_assert(std::is_standard_layout<Cpu_64_Float>::value && std::is_trivially_copyable<Cpu_64_Float>::value,
"This allows offsetof, memcpy, memset, inter-language, GPU and cross-machine use where needed");

template<bool bNegateInput, bool bNaNPossible, bool bUnderflowPossible, bool bOverflowPossible>
inline Cpu_64_Float Exp(const Cpu_64_Float& val) noexcept {
return Exp64<Cpu_64_Float, bNegateInput, bNaNPossible, bUnderflowPossible, bOverflowPossible>(val);
}

INTERNAL_IMPORT_EXPORT_BODY ErrorEbm ApplyUpdate_Cpu_64(
const ObjectiveWrapper* const pObjectiveWrapper, ApplyUpdateBridge* const pData) {
const Objective* const pObjective = static_cast<const Objective*>(pObjectiveWrapper->m_pObjective);
Expand Down
108 changes: 106 additions & 2 deletions shared/libebm/compute/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,15 @@ template<typename TFloat> static INLINE_ALWAYS typename TFloat::TInt Exponent32(
return ((TFloat::ReinterpretInt(val) << 1) >> 24) - typename TFloat::TInt(0x7F);
}

template<typename TFloat> static INLINE_ALWAYS TFloat Power2(const TFloat val) {
template<typename TFloat> static INLINE_ALWAYS TFloat Power32(const TFloat val) {
return TFloat::ReinterpretFloat(TFloat::ReinterpretInt(val + TFloat{8388608 + 127}) << 23);
}

template<typename TFloat> static INLINE_ALWAYS TFloat Power64(const TFloat val) {
return TFloat::ReinterpretFloat(
TFloat::ReinterpretInt(val + TFloat{int64_t{4503599627370496} + int64_t{1023}}) << 52);
}

template<typename TFloat>
static INLINE_ALWAYS TFloat Polynomial(const TFloat x,
const TFloat c0,
Expand Down Expand Up @@ -60,6 +65,32 @@ static INLINE_ALWAYS TFloat Polynomial(const TFloat x,
FusedMultiplyAdd(FusedMultiplyAdd(c3, x, c2), x2, FusedMultiplyAdd(c1, x, c0) + c8 * x8));
}

template<typename TFloat>
static INLINE_ALWAYS TFloat Polynomial(const TFloat x,
const TFloat c2,
const TFloat c3,
const TFloat c4,
const TFloat c5,
const TFloat c6,
const TFloat c7,
const TFloat c8,
const TFloat c9,
const TFloat c10,
const TFloat c11,
const TFloat c12,
const TFloat c13) {
TFloat x2 = x * x;
TFloat x4 = x2 * x2;
TFloat x8 = x4 * x4;
return FusedMultiplyAdd(FusedMultiplyAdd(FusedMultiplyAdd(c13, x, c12),
x4,
FusedMultiplyAdd(FusedMultiplyAdd(c11, x, c10), x2, FusedMultiplyAdd(c9, x, c8))),
x8,
FusedMultiplyAdd(FusedMultiplyAdd(FusedMultiplyAdd(c7, x, c6), x2, FusedMultiplyAdd(c5, x, c4)),
x4,
FusedMultiplyAdd(FusedMultiplyAdd(c3, x, c2), x2, x)));
}

template<typename TFloat,
bool bNegateInput = false,
bool bNaNPossible = true,
Expand Down Expand Up @@ -89,7 +120,7 @@ static INLINE_ALWAYS TFloat Exp32(const TFloat val) {
TFloat{1} / TFloat{5040});
ret = FusedMultiplyAdd(ret, x2, x);

const TFloat rounded2 = Power2(rounded);
const TFloat rounded2 = Power32(rounded);

ret = (ret + TFloat{1}) * rounded2;

Expand Down Expand Up @@ -210,6 +241,79 @@ static INLINE_ALWAYS TFloat Log32(const TFloat& val) noexcept {
return ret;
}

template<typename TFloat,
bool bNegateInput = false,
bool bNaNPossible = true,
bool bUnderflowPossible = true,
bool bOverflowPossible = true>
static INLINE_ALWAYS TFloat Exp64(const TFloat val) {
// algorithm comes from:
// https://github.com/vectorclass/version2/blob/f4617df57e17efcd754f5bbe0ec87883e0ed9ce6/vectormath_exp.h#L327

// k_expUnderflow is set to a value that prevents us from returning a denormal number.
static constexpr float k_expUnderflow = -708.25f; // this is exactly representable in IEEE 754
static constexpr float k_expOverflow = 708.25f; // this is exactly representable in IEEE 754

// TODO: make this negation more efficient
TFloat x = bNegateInput ? -val : val;
const TFloat rounded = Round(x * TFloat{1.44269504088896340736});
x = FusedNegateMultiplyAdd(rounded, TFloat{0.693145751953125}, x);
x = FusedNegateMultiplyAdd(rounded, TFloat{1.42860682030941723212E-6}, x);

TFloat ret = Polynomial(x,
TFloat{1} / TFloat{2},
TFloat{1} / TFloat{6},
TFloat{1} / TFloat{24},
TFloat{1} / TFloat{120},
TFloat{1} / TFloat{720},
TFloat{1} / TFloat{5040},
TFloat{1} / TFloat{40320},
TFloat{1} / TFloat{362880},
TFloat{1} / TFloat{3628800},
TFloat{1} / TFloat{39916800},
TFloat{1} / TFloat{479001600},
TFloat{1} / TFloat{int64_t{6227020800}});

const TFloat rounded2 = Power64(rounded);

ret = (ret + TFloat{1}) * rounded2;

if(bOverflowPossible) {
if(bNegateInput) {
ret = IfLess(val,
static_cast<typename TFloat::T>(-k_expOverflow),
std::numeric_limits<typename TFloat::T>::infinity(),
ret);
} else {
ret = IfLess(static_cast<typename TFloat::T>(k_expOverflow),
val,
std::numeric_limits<typename TFloat::T>::infinity(),
ret);
}
}
if(bUnderflowPossible) {
if(bNegateInput) {
ret = IfLess(static_cast<typename TFloat::T>(-k_expUnderflow), val, 0.0f, ret);
} else {
ret = IfLess(val, static_cast<typename TFloat::T>(k_expUnderflow), 0.0f, ret);
}
}
if(bNaNPossible) {
ret = IfNaN(val, val, ret);
}

#ifndef NDEBUG
TFloat::Execute(
[](int, typename TFloat::T orig, typename TFloat::T ret) {
EBM_ASSERT(IsApproxEqual(std::exp(orig), ret, typename TFloat::T{1e-12}));
},
bNegateInput ? -val : val,
ret);
#endif // NDEBUG

return ret;
}

} // namespace DEFINED_ZONE_NAME

#endif // REGISTRATION_HPP

0 comments on commit 05e3b26

Please sign in to comment.