Skip to content

Commit

Permalink
Fix SSE detection on non-AVX CPUs (google#135)
Browse files Browse the repository at this point in the history
Fixes google#4. This is based on google#115 with a few modifications:
 - Removed use of __builtin_cpu_supports since it relies on cpuid and doesn't improve on the current situation,
 - Added detection for all of sse, sse2, sse3, ssse3, sse4_1 and sse4_2,
 - Added tests for Atom, Nehalem, and P3 processors,

Thx to @gadoofou87 for providing the original PR.
It also removes the need for google#92

* Fix SSE detection on non-AVX CPUs
* Fixes typo
* Mock OSX sysctlbyname in tests
* Also update other tests
* FakeCpu is reset between each tests
* Fix conflicting name on Windows
* Disable pre AVX cpu sse detection tests on Windows
* Guard OS specific code with macros
* Fix missing import for tests
* Fix wrong function prototype
* Fix wrong mocking of P3 on Windows
* Completely guard OS specific parts in x86 tests
* Store DWORD instead unsigned long for x86 tests
  • Loading branch information
gchatelet authored Oct 9, 2020
1 parent 22a5362 commit 4795373
Show file tree
Hide file tree
Showing 6 changed files with 428 additions and 38 deletions.
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ macro(add_cpu_features_headers_and_sources HDRS_LIST_NAME SRCS_LIST_NAME)
endif()
endmacro()

if(UNIX AND PROCESSOR_IS_X86)
check_include_file(sys/utsname.h HAVE_UTSNAME_H)
endif()


#
# library : utils
#
Expand Down Expand Up @@ -148,6 +153,14 @@ set_property(TARGET cpu_features PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}
target_include_directories(cpu_features
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/cpu_features>
)
if(PROCESSOR_IS_X86)
if(HAVE_UTSNAME_H)
target_compile_definitions(cpu_features PRIVATE HAVE_UTSNAME_H)
endif()
if(APPLE)
target_compile_definitions(cpu_features PRIVATE HAVE_SYSCTLBYNAME)
endif()
endif()
add_library(CpuFeature::cpu_features ALIAS cpu_features)

#
Expand Down
4 changes: 4 additions & 0 deletions include/cpu_features_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@
#define CPU_FEATURES_OS_WINDOWS
#endif

#if (defined(__apple__) || defined(__APPLE__) || defined(__MACH__))
#define CPU_FEATURES_OS_DARWIN
#endif

////////////////////////////////////////////////////////////////////////////////
// Compilers
////////////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 2 additions & 1 deletion include/internal/cpuid_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ typedef struct {
uint32_t eax, ebx, ecx, edx;
} Leaf;

Leaf CpuIdEx(uint32_t leaf_id, int ecx);
// Returns the result of a call to the cpuid instruction.
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx);

// Returns the eax value of the XCR0 register.
uint32_t GetXCR0Eax(void);
Expand Down
166 changes: 140 additions & 26 deletions src/cpuinfo_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@
#error "Cannot compile cpuinfo_x86 on a non x86 platform."
#endif

// The following includes are necessary to provide SSE detections on pre-AVX
// microarchitectures.
#if defined(CPU_FEATURES_OS_WINDOWS)
#include <windows.h> // IsProcessorFeaturePresent
#elif defined(HAVE_UTSNAME_H)
#include <sys/utsname.h>

#include "internal/filesystem.h" // Needed to parse /proc/cpuinfo
#include "internal/stack_line_reader.h" // Needed to parse /proc/cpuinfo
#include "internal/string_view.h" // Needed to parse /proc/cpuinfo
#if defined(HAVE_SYSCTLBYNAME)
#include <sys/sysctl.h>
#endif // HAVE_SYSCTLBYNAME
#endif // HAVE_UTSNAME_H

////////////////////////////////////////////////////////////////////////////////
// Definitions for CpuId and GetXCR0Eax.
////////////////////////////////////////////////////////////////////////////////
Expand All @@ -35,7 +50,7 @@

#include <cpuid.h>

Leaf CpuIdEx(uint32_t leaf_id, int ecx) {
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) {
Leaf leaf;
__cpuid_count(leaf_id, ecx, leaf.eax, leaf.ebx, leaf.ecx, leaf.edx);
return leaf;
Expand All @@ -55,7 +70,7 @@ uint32_t GetXCR0Eax(void) {
#include <immintrin.h>
#include <intrin.h> // For __cpuidex()

Leaf CpuIdEx(uint32_t leaf_id, int ecx) {
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) {
Leaf leaf;
int data[4];
__cpuidex(data, leaf_id, ecx);
Expand All @@ -72,13 +87,13 @@ uint32_t GetXCR0Eax(void) { return (uint32_t)_xgetbv(0); }
#error "Unsupported compiler, x86 cpuid requires either GCC, Clang or MSVC."
#endif

static Leaf CpuId(uint32_t leaf_id) { return CpuIdEx(leaf_id, 0); }
static Leaf CpuId(uint32_t leaf_id) { return GetCpuidLeaf(leaf_id, 0); }

static const Leaf kEmptyLeaf;

static Leaf SafeCpuIdEx(uint32_t max_cpuid_leaf, uint32_t leaf_id, int ecx) {
if (leaf_id <= max_cpuid_leaf) {
return CpuIdEx(leaf_id, ecx);
return GetCpuidLeaf(leaf_id, ecx);
} else {
return kEmptyLeaf;
}
Expand Down Expand Up @@ -1082,27 +1097,125 @@ static void ParseLeaf4(const int max_cpuid_leaf, CacheInfo* info) {
// Internal structure to hold the OS support for vector operations.
// Avoid to recompute them since each call to cpuid is ~100 cycles.
typedef struct {
bool have_sse;
bool have_sse_via_os;
bool have_sse_via_cpuid;
bool have_avx;
bool have_avx512;
bool have_amx;
} OsSupport;

static const OsSupport kEmptyOsSupport;

static OsSupport CheckOsSupport(const uint32_t max_cpuid_leaf) {
const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
const bool have_xsave = IsBitSet(leaf_1.ecx, 26);
const bool have_osxsave = IsBitSet(leaf_1.ecx, 27);
const bool have_xcr0 = have_xsave && have_osxsave;

OsSupport os_support = kEmptyOsSupport;

if (have_xcr0) {
// AVX capable cpu will expose XCR0.
const uint32_t xcr0_eax = GetXCR0Eax();
os_support.have_sse_via_cpuid = HasXmmOsXSave(xcr0_eax);
os_support.have_avx = HasYmmOsXSave(xcr0_eax);
os_support.have_avx512 = HasZmmOsXSave(xcr0_eax);
os_support.have_amx = HasTmmOsXSave(xcr0_eax);
} else {
// Atom based or older cpus need to ask the OS for sse support.
os_support.have_sse_via_os = true;
}

return os_support;
}

#if defined(CPU_FEATURES_OS_WINDOWS)
#if defined(CPU_FEATURES_MOCK_CPUID_X86)
extern bool GetWindowsIsProcessorFeaturePresent(DWORD);
#else // CPU_FEATURES_MOCK_CPUID_X86
static bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) {
return IsProcessorFeaturePresent(ProcessorFeature);
}
#endif
#endif // CPU_FEATURES_OS_WINDOWS

#if defined(CPU_FEATURES_OS_DARWIN) && defined(HAVE_SYSCTLBYNAME)
#if defined(CPU_FEATURES_MOCK_CPUID_X86)
extern bool GetDarwinSysCtlByName(const char*);
#else // CPU_FEATURES_MOCK_CPUID_X86
static bool GetDarwinSysCtlByName(const char* name) {
int enabled;
size_t enabled_len = sizeof(enabled);
const int failure = sysctlbyname(name, &enabled, &enabled_len, NULL, 0);
return failure ? false : enabled;
}
#endif
#endif // CPU_FEATURES_OS_DARWIN && HAVE_SYSCTLBYNAME

static void DetectSseViaOs(X86Features* features) {
#if defined(CPU_FEATURES_OS_WINDOWS)
// https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
features->sse =
GetWindowsIsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE);
features->sse2 =
GetWindowsIsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE);
features->sse3 =
GetWindowsIsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE);
#elif defined(HAVE_UTSNAME_H)
struct utsname buf;
uname(&buf);
#if defined(CPU_FEATURES_OS_DARWIN) && defined(HAVE_SYSCTLBYNAME)
if (CpuFeatures_StringView_IsEquals(str(buf.sysname), str("Darwin"))) {
// Handling Darwin platform through sysctlbyname when available.
features->sse = GetDarwinSysCtlByName("hw.optional.sse");
features->sse2 = GetDarwinSysCtlByName("hw.optional.sse2");
features->sse3 = GetDarwinSysCtlByName("hw.optional.sse3");
features->ssse3 = GetDarwinSysCtlByName("hw.optional.supplementalsse3");
features->sse4_1 = GetDarwinSysCtlByName("hw.optional.sse4_1");
features->sse4_2 = GetDarwinSysCtlByName("hw.optional.sse4_2");
}
#elif defined(CPU_FEATURES_OS_LINUX_OR_ANDROID)
if (CpuFeatures_StringView_IsEquals(str(buf.sysname), str("Linux"))) {
// Handling Linux platform through /proc/cpuinfo when available.
const int fd = CpuFeatures_OpenFile("/proc/cpuinfo");
if (fd >= 0) {
StackLineReader reader;
StackLineReader_Initialize(&reader, fd);
for (;;) {
const LineResult result = StackLineReader_NextLine(&reader);
const StringView line = result.line;
StringView key, value;
if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) {
if (CpuFeatures_StringView_IsEquals(key, str("flags"))) {
features->sse = CpuFeatures_StringView_HasWord(value, "sse");
features->sse2 = CpuFeatures_StringView_HasWord(value, "sse2");
features->sse3 = CpuFeatures_StringView_HasWord(value, "sse3");
features->ssse3 = CpuFeatures_StringView_HasWord(value, "ssse3");
features->sse4_1 = CpuFeatures_StringView_HasWord(value, "sse4_1");
features->sse4_2 = CpuFeatures_StringView_HasWord(value, "sse4_2");
break;
}
}
if (result.eof) break;
}
CpuFeatures_CloseFile(fd);
}
}
#else // CPU_FEATURES_OS_DARWIN || CPU_FEATURES_OS_LINUX_OR_ANDROID
#error "Unsupported fallback detection of SSE OS support."
#endif
#else // HAVE_UTSNAME_H
#error "Unsupported fallback detection of SSE OS support."
#endif
}

// Reference https://en.wikipedia.org/wiki/CPUID.
static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
OsSupport* os_support) {
static void ParseCpuId(const uint32_t max_cpuid_leaf,
const OsSupport os_support, X86Info* info) {
const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
const Leaf leaf_7 = SafeCpuId(max_cpuid_leaf, 7);
const Leaf leaf_7_1 = SafeCpuIdEx(max_cpuid_leaf, 7, 1);

const bool have_xsave = IsBitSet(leaf_1.ecx, 26);
const bool have_osxsave = IsBitSet(leaf_1.ecx, 27);
const uint32_t xcr0_eax = (have_xsave && have_osxsave) ? GetXCR0Eax() : 0;
os_support->have_sse = HasXmmOsXSave(xcr0_eax);
os_support->have_avx = HasYmmOsXSave(xcr0_eax);
os_support->have_avx512 = HasZmmOsXSave(xcr0_eax);
os_support->have_amx = HasTmmOsXSave(xcr0_eax);

const uint32_t family = ExtractBitRange(leaf_1.eax, 11, 8);
const uint32_t extended_family = ExtractBitRange(leaf_1.eax, 27, 20);
const uint32_t model = ExtractBitRange(leaf_1.eax, 7, 4);
Expand Down Expand Up @@ -1142,7 +1255,9 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->vaes = IsBitSet(leaf_7.ecx, 9);
features->vpclmulqdq = IsBitSet(leaf_7.ecx, 10);

if (os_support->have_sse) {
if (os_support.have_sse_via_os) {
DetectSseViaOs(features);
} else if (os_support.have_sse_via_cpuid) {
features->sse = IsBitSet(leaf_1.edx, 25);
features->sse2 = IsBitSet(leaf_1.edx, 26);
features->sse3 = IsBitSet(leaf_1.ecx, 0);
Expand All @@ -1151,13 +1266,13 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->sse4_2 = IsBitSet(leaf_1.ecx, 20);
}

if (os_support->have_avx) {
if (os_support.have_avx) {
features->fma3 = IsBitSet(leaf_1.ecx, 12);
features->avx = IsBitSet(leaf_1.ecx, 28);
features->avx2 = IsBitSet(leaf_7.ebx, 5);
}

if (os_support->have_avx512) {
if (os_support.have_avx512) {
features->avx512f = IsBitSet(leaf_7.ebx, 16);
features->avx512cd = IsBitSet(leaf_7.ebx, 28);
features->avx512er = IsBitSet(leaf_7.ebx, 27);
Expand All @@ -1179,7 +1294,7 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->avx512_vp2intersect = IsBitSet(leaf_7.edx, 8);
}

if (os_support->have_amx) {
if (os_support.have_amx) {
features->amx_bf16 = IsBitSet(leaf_7.edx, 22);
features->amx_tile = IsBitSet(leaf_7.edx, 24);
features->amx_int8 = IsBitSet(leaf_7.edx, 25);
Expand All @@ -1195,7 +1310,7 @@ static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {

X86Features* const features = &info->features;

if (os_support.have_sse) {
if (os_support.have_sse_via_cpuid) {
features->sse4a = IsBitSet(leaf_80000001.ecx, 6);
}

Expand All @@ -1205,22 +1320,21 @@ static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {
}

static const X86Info kEmptyX86Info;
static const OsSupport kEmptyOsSupport;
static const CacheInfo kEmptyCacheInfo;

X86Info GetX86Info(void) {
X86Info info = kEmptyX86Info;
OsSupport os_support = kEmptyOsSupport;
const Leaf leaf_0 = CpuId(0);
const bool is_intel = IsVendor(leaf_0, "GenuineIntel");
const bool is_amd = IsVendor(leaf_0, "AuthenticAMD");
SetVendor(leaf_0, info.vendor);
if (is_intel || is_amd) {
const uint32_t max_cpuid_leaf = leaf_0.eax;
ParseCpuId(max_cpuid_leaf, &info, &os_support);
}
if (is_amd) {
ParseExtraAMDCpuId(&info, os_support);
const OsSupport os_support = CheckOsSupport(max_cpuid_leaf);
ParseCpuId(max_cpuid_leaf, os_support, &info);
if (is_amd) {
ParseExtraAMDCpuId(&info, os_support);
}
}
return info;
}
Expand Down
6 changes: 6 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ add_test(NAME unix_features_aggregator_test COMMAND unix_features_aggregator_tes
if(PROCESSOR_IS_X86)
add_executable(cpuinfo_x86_test cpuinfo_x86_test.cc ../src/cpuinfo_x86.c)
target_compile_definitions(cpuinfo_x86_test PUBLIC CPU_FEATURES_MOCK_CPUID_X86)
if(HAVE_UTSNAME_H)
target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_UTSNAME_H)
endif()
if(APPLE)
target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_SYSCTLBYNAME)
endif()
target_link_libraries(cpuinfo_x86_test all_libraries)
add_test(NAME cpuinfo_x86_test COMMAND cpuinfo_x86_test)
endif()
Expand Down
Loading

0 comments on commit 4795373

Please sign in to comment.