Skip to content

algo: implement sine/cosine based on Intel SVML for single/double precision #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions npsr/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#ifndef NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_
#define NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_

#include "hwy/highway.h"

#include <cfenv>
#include <type_traits>

namespace npsr {

struct _NoLargeArgument {};
struct _NoSpecialCases {};
struct _NoExceptions {};
struct _LowAccuracy {};
constexpr auto kNoLargeArgument = _NoLargeArgument{};
constexpr auto kNoSpecialCases = _NoSpecialCases{};
constexpr auto kNoExceptions = _NoExceptions{};
constexpr auto kLowAccuracy = _LowAccuracy{};

struct Round {
struct _Force {};
struct _Nearest {};
struct _Down {};
struct _Up {};
struct _Zero {};
static constexpr auto kForce = _Force{};
static constexpr auto kNearest = _Nearest{};
#if 0 // not used yet
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add these when we come across use cases for them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, this good point .. I'm gonna remove them

static constexpr auto kDown = _Down{};
static constexpr auto kUp = _Up{};
static constexpr auto kZero = _Zero{};
#endif
};

struct Subnormal {
struct _DAZ {};
struct _FTZ {};
struct _IEEE754 {};
#if 0 // not used yet
static constexpr auto kDAZ = _DAZ{};
static constexpr auto kFTZ = _FTZ{};
#endif
static constexpr auto kIEEE754 = _IEEE754{};
};

struct FPExceptions {
static constexpr auto kNone = 0;
static constexpr auto kInvalid = FE_INVALID;
static constexpr auto kDivByZero = FE_DIVBYZERO;
static constexpr auto kOverflow = FE_OVERFLOW;
static constexpr auto kUnderflow = FE_UNDERFLOW;
};

/**
* @brief RAII floating-point precision control class
*
* The Precise class provides automatic management of floating-point environment
* settings during its lifetime. It uses RAII principles to save the current
* floating-point state on construction and restore it on destruction.
*
* The class is configured using variadic template arguments that specify
* the desired floating-point behavior through tag types.
*
* **IMPORTANT PERFORMANCE NOTE**: Create the Precise object BEFORE loops,
* not inside them. The constructor and destructor have overhead from saving
* and restoring floating-point state, so it should be done once per
* computational scope, not per iteration.
*
* @tparam Args Variadic template arguments for configuration flags
*
* @example
* ```cpp
* using namespace hwy::HWY_NAMESPACE;
* using namespace npsr;
* using namespace npsr::HWY_NAMESPACE;
*
* Precise precise = {kLowAccuracy, kNoSpecialCases, kNoLargeArgument};
* const ScalableTag<float> d;
* typename V = Vec<DFromV<SclableTag>>;
Copy link
Preview

Copilot AI Jun 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a typo in the documentation comment: 'SclableTag' should be corrected to 'ScalableTag'.

Suggested change
* typename V = Vec<DFromV<SclableTag>>;
* typename V = Vec<DFromV<ScalableTag>>;

Copilot uses AI. Check for mistakes.

* for (size_t i = 0; i < n; i += Lanes(d)) {
* V input = LoadU(d, &input[i]);
* V result = Sin(precise, input);
* StoreU(result, d, &output[i]);
* }
* ```
*/
template <typename... Args> class Precise {
public:
Precise(Args...) {
if constexpr (!kNoExceptions) {
fegetexceptflag(&_exceptions, FE_ALL_EXCEPT);
}
if constexpr (kRoundForce) {
_rounding_mode = fegetround();
int new_mode = _NewRoundingMode();
if (_rounding_mode != new_mode) {
_retrieve_rounding_mode = true;
fesetround(new_mode);
}
}
}
void FlushExceptions() { fesetexceptflag(&_exceptions, FE_ALL_EXCEPT); }

void Raise(int errors) {
static_assert(!kNoExceptions,
"Cannot raise exceptions in NoExceptions mode");
_exceptions |= errors;
}
~Precise() {
FlushExceptions();
if constexpr (kRoundForce) {
if (_retrieve_rounding_mode) {
fesetround(_rounding_mode);
}
}
}
static constexpr bool kNoExceptions =
(std::is_same_v<_NoExceptions, Args> || ...);
static constexpr bool kNoLargeArgument =
(std::is_same_v<_NoLargeArgument, Args> || ...);
static constexpr bool kNoSpecialCases =
(std::is_same_v<_NoSpecialCases, Args> || ...);
static constexpr bool kLowAccuracy =
(std::is_same_v<_LowAccuracy, Args> || ...);
// defaults to high accuracy if no low accuracy flag is set
static constexpr bool kHighAccuracy = !kLowAccuracy;
// defaults to large argument support if no no large argument flag is set
static constexpr bool kLargeArgument = !kNoLargeArgument;
// defaults to special cases support if no no special cases flag is set
static constexpr bool kSpecialCases = !kNoSpecialCases;
// defaults to exception support if no no exception flag is set
static constexpr bool kException = !kNoExceptions;

static constexpr bool kRoundForce =
(std::is_same_v<Round::_Force, Args> || ...);
static constexpr bool _kRoundNearest =
(std::is_same_v<Round::_Nearest, Args> || ...);
static constexpr bool kRoundZero =
(std::is_same_v<Round::_Zero, Args> || ...);
static constexpr bool kRoundDown =
(std::is_same_v<Round::_Down, Args> || ...);
static constexpr bool kRoundUp = (std::is_same_v<Round::_Up, Args> || ...);
// only one rounding mode can be set
static_assert((_kRoundNearest + kRoundDown + kRoundUp + kRoundZero) <= 1,
"Only one rounding mode can be set at a time");
// if no rounding mode is set, default to round nearest
static constexpr bool kRoundNearest =
_kRoundNearest || (!kRoundDown && !kRoundUp && !kRoundZero);

static constexpr bool kDAZ = (std::is_same_v<Subnormal::_DAZ, Args> || ...);
static constexpr bool kFTZ = (std::is_same_v<Subnormal::_FTZ, Args> || ...);
static constexpr bool _kIEEE754 =
(std::is_same_v<Subnormal::_IEEE754, Args> || ...);
static_assert(!_kIEEE754 || !(kDAZ || kFTZ),
"IEEE754 mode cannot be used "
"with Denormals Are Zero (DAZ) or Flush To Zero (FTZ) "
"subnormal handling");
static constexpr bool kIEEE754 = _kIEEE754 || !(kDAZ || kFTZ);

private:
int _NewRoundingMode() const {
if constexpr (kRoundDown) {
return FE_DOWNWARD;
} else if constexpr (kRoundUp) {
return FE_UPWARD;
} else if constexpr (kRoundZero) {
return FE_TOWARDZERO;
} else {
return FE_TONEAREST;
}
}
int _rounding_mode = 0;
bool _retrieve_rounding_mode = false;
fexcept_t _exceptions;
};

} // namespace npsr

#endif // NUMPY_SIMD_ROUTINES_NPSR_COMMON_H_
12 changes: 12 additions & 0 deletions npsr/npsr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// To include them once per target, which is ensured by the toggle check.
// clang-format off
#if defined(_NPSR_NPSR_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
#ifdef _NPSR_NPSR_H_
#undef _NPSR_NPSR_H_
#else
#define _NPSR_NPSR_H_
#endif

#include "npsr/trig/inl.h"

#endif // _NPSR_NPSR_H_
61 changes: 61 additions & 0 deletions npsr/trig/inl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "npsr/common.h"

#include "npsr/sincos/large-inl.h"
#include "npsr/sincos/small-inl.h"

// clang-format off
#if defined(NPSR_TRIG_INL_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be trig-inl.h?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indeed

#ifdef NPSR_TRIG_INL_H_
#undef NPSR_TRIG_INL_H_
#else
#define NPSR_TRIG_INL_H_
#endif

HWY_BEFORE_NAMESPACE();

namespace npsr::HWY_NAMESPACE::sincos {
template <bool IS_COS, typename V, typename Prec>
HWY_API V SinCos(Prec &prec, V x) {
Copy link
Member

@Mousius Mousius Jun 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially worth creating a new macro for HWY_API so it doesn't change behaviour for Highway?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggest a name? HWY_API just combine HWY_ATTR and HWY_INLINE.

using namespace hwy::HWY_NAMESPACE;
constexpr bool kIsSingle = std::is_same_v<TFromV<V>, float>;
const DFromV<V> d;
V ret;
if constexpr (Prec::kLowAccuracy) {
ret = SmallArgLow<IS_COS>(x);
}
else {
ret = SmallArg<IS_COS>(x);
}
if constexpr (Prec::kLargeArgument) {
// Identify inputs requiring extended precision (very large arguments)
auto has_large_arg = Gt(Abs(x), Set(d, kIsSingle ? 10000.0 : 16777216.0));
if (HWY_UNLIKELY(!AllFalse(d, has_large_arg))) {
// Use extended precision algorithm for large arguments
ret = IfThenElse(has_large_arg, LargeArg<IS_COS>(x), ret);
}
}
if constexpr (Prec::kSpecialCases || Prec::kException) {
auto is_finite = IsFinite(x);
ret = IfThenElse(is_finite, ret, NaN(d));
if constexpr (Prec::kException) {
prec.Raise(!AllFalse(d, IsInf(x)) ? FPExceptions::kInvalid : 0);
}
}
return ret;
}
} // namespace npsr::HWY_NAMESPACE::sincos

namespace npsr::HWY_NAMESPACE {
template <typename V, typename Prec>
HWY_API V Sin(Prec &prec, V x) {
return sincos::SinCos<false>(prec, x);
}
template <typename V, typename Prec>
HWY_API V Cos(Prec &prec, V x) {
return sincos::SinCos<true>(prec, x);
}
} // namespace npsr::HWY_NAMESPACE

HWY_AFTER_NAMESPACE();

#endif // NPSR_TRIG_INL_H_
Loading