Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SSE detection on non-AVX CPUs #135

Merged
merged 13 commits into from
Oct 9, 2020
13 changes: 13 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ macro(add_cpu_features_headers_and_sources HDRS_LIST_NAME SRCS_LIST_NAME)
endif()
endmacro()

if(UNIX AND PROCESSOR_IS_X86)
check_include_file(sys/utsname.h HAVE_UTSNAME_H)
endif()


#
# library : utils
#
Expand Down Expand Up @@ -148,6 +153,14 @@ set_property(TARGET cpu_features PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}
target_include_directories(cpu_features
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/cpu_features>
)
if(PROCESSOR_IS_X86)
if(HAVE_UTSNAME_H)
target_compile_definitions(cpu_features PRIVATE HAVE_UTSNAME_H)
endif()
if(APPLE)
target_compile_definitions(cpu_features PRIVATE HAVE_SYSCTLBYNAME)
endif()
endif()
add_library(CpuFeature::cpu_features ALIAS cpu_features)

#
Expand Down
3 changes: 2 additions & 1 deletion include/internal/cpuid_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ typedef struct {
uint32_t eax, ebx, ecx, edx;
} Leaf;

Leaf CpuIdEx(uint32_t leaf_id, int ecx);
// Returns the result of a call to the cpuid instruction.
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx);

// Returns the eax value of the XCR0 register.
uint32_t GetXCR0Eax(void);
Expand Down
151 changes: 125 additions & 26 deletions src/cpuinfo_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,21 @@
#error "Cannot compile cpuinfo_x86 on a non x86 platform."
#endif

// The following includes are necessary to provide SSE detections on pre-AVX
// microarchitectures.
#if defined(CPU_FEATURES_COMPILER_MSC)
#include <windows.h> // IsProcessorFeaturePresent
gchatelet marked this conversation as resolved.
Show resolved Hide resolved
#elif defined(HAVE_UTSNAME_H)
#include <sys/utsname.h>

#include "internal/filesystem.h" // Needed to parse /proc/cpuinfo
#include "internal/stack_line_reader.h" // Needed to parse /proc/cpuinfo
#include "internal/string_view.h" // Needed to parse /proc/cpuinfo
#if defined(HAVE_SYSCTLBYNAME)
#include <sys/sysctl.h>
#endif // HAVE_SYSCTLBYNAME
#endif // HAVE_UTSNAME_H

////////////////////////////////////////////////////////////////////////////////
// Definitions for CpuId and GetXCR0Eax.
////////////////////////////////////////////////////////////////////////////////
Expand All @@ -35,7 +50,7 @@

#include <cpuid.h>

Leaf CpuIdEx(uint32_t leaf_id, int ecx) {
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) {
Leaf leaf;
__cpuid_count(leaf_id, ecx, leaf.eax, leaf.ebx, leaf.ecx, leaf.edx);
return leaf;
Expand All @@ -55,7 +70,7 @@ uint32_t GetXCR0Eax(void) {
#include <immintrin.h>
#include <intrin.h> // For __cpuidex()

Leaf CpuIdEx(uint32_t leaf_id, int ecx) {
Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) {
Leaf leaf;
int data[4];
__cpuidex(data, leaf_id, ecx);
Expand All @@ -72,13 +87,13 @@ uint32_t GetXCR0Eax(void) { return (uint32_t)_xgetbv(0); }
#error "Unsupported compiler, x86 cpuid requires either GCC, Clang or MSVC."
#endif

static Leaf CpuId(uint32_t leaf_id) { return CpuIdEx(leaf_id, 0); }
static Leaf CpuId(uint32_t leaf_id) { return GetCpuidLeaf(leaf_id, 0); }

static const Leaf kEmptyLeaf;

static Leaf SafeCpuIdEx(uint32_t max_cpuid_leaf, uint32_t leaf_id, int ecx) {
if (leaf_id <= max_cpuid_leaf) {
return CpuIdEx(leaf_id, ecx);
return GetCpuidLeaf(leaf_id, ecx);
} else {
return kEmptyLeaf;
}
Expand Down Expand Up @@ -1082,27 +1097,110 @@ static void ParseLeaf4(const int max_cpuid_leaf, CacheInfo* info) {
// Internal structure to hold the OS support for vector operations.
// Avoid to recompute them since each call to cpuid is ~100 cycles.
typedef struct {
bool have_sse;
bool have_sse_via_os;
bool have_sse_via_cpuid;
bool have_avx;
bool have_avx512;
bool have_amx;
} OsSupport;

static const OsSupport kEmptyOsSupport;

static OsSupport CheckOsSupport(const uint32_t max_cpuid_leaf) {
const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
const bool have_xsave = IsBitSet(leaf_1.ecx, 26);
const bool have_osxsave = IsBitSet(leaf_1.ecx, 27);
const bool have_xcr0 = have_xsave && have_osxsave;

OsSupport os_support = kEmptyOsSupport;

if (have_xcr0) {
// AVX capable cpu will expose XCR0.
const uint32_t xcr0_eax = GetXCR0Eax();
os_support.have_sse_via_cpuid = HasXmmOsXSave(xcr0_eax);
os_support.have_avx = HasYmmOsXSave(xcr0_eax);
os_support.have_avx512 = HasZmmOsXSave(xcr0_eax);
os_support.have_amx = HasTmmOsXSave(xcr0_eax);
} else {
// Atom based or older cpus need to ask the OS for sse support.
os_support.have_sse_via_os = true;
}

return os_support;
}

#if defined(HAVE_SYSCTLBYNAME)
#if defined(CPU_FEATURES_MOCK_CPUID_X86)
extern bool SysCtlByName(const char* name);
#else
static bool SysCtlByName(const char* name) {
int enabled;
size_t enabled_len = sizeof(enabled);
const int failure = sysctlbyname(name, &enabled, &enabled_len, NULL, 0);
return failure ? false : enabled;
}
#endif // CPU_FEATURES_MOCK_CPUID_X86
#endif // HAVE_SYSCTLBYNAME

static void DetectSseViaOs(X86Features* features) {
#if defined(CPU_FEATURES_COMPILER_MSC)
// https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
features->sse = IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE);
features->sse2 = IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE);
features->sse3 = IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE);
#elif defined(HAVE_UTSNAME_H)
struct utsname buf;
uname(&buf);
if (CpuFeatures_StringView_IsEquals(str(buf.sysname), str("Darwin"))) {
#if defined(HAVE_SYSCTLBYNAME)
// Handling Darwin platform through sysctlbyname when available.
features->sse = SysCtlByName("hw.optional.sse");
features->sse2 = SysCtlByName("hw.optional.sse2");
features->sse3 = SysCtlByName("hw.optional.sse3");
features->ssse3 = SysCtlByName("hw.optional.supplementalsse3");
features->sse4_1 = SysCtlByName("hw.optional.sse4_1");
features->sse4_2 = SysCtlByName("hw.optional.sse4_2");
gchatelet marked this conversation as resolved.
Show resolved Hide resolved
#endif // HAVE_SYSCTLBYNAME
} else if (CpuFeatures_StringView_IsEquals(str(buf.sysname), str("Linux"))) {
// Handling Linux platform through /proc/cpuinfo when available.
const int fd = CpuFeatures_OpenFile("/proc/cpuinfo");
if (fd >= 0) {
StackLineReader reader;
StackLineReader_Initialize(&reader, fd);
for (;;) {
const LineResult result = StackLineReader_NextLine(&reader);
const StringView line = result.line;
StringView key, value;
if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) {
if (CpuFeatures_StringView_IsEquals(key, str("flags"))) {
features->sse = CpuFeatures_StringView_HasWord(value, "sse");
features->sse2 = CpuFeatures_StringView_HasWord(value, "sse2");
features->sse3 = CpuFeatures_StringView_HasWord(value, "sse3");
features->ssse3 = CpuFeatures_StringView_HasWord(value, "ssse3");
features->sse4_1 = CpuFeatures_StringView_HasWord(value, "sse4_1");
features->sse4_2 = CpuFeatures_StringView_HasWord(value, "sse4_2");
break;
}
}
if (result.eof) break;
}
CpuFeatures_CloseFile(fd);
}
} else {
// Failed to probe the system.
}
#else // HAVE_UTSNAME_H
#error "Unsupported fallback detection of SSE OS support."
#endif
}

// Reference https://en.wikipedia.org/wiki/CPUID.
static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
OsSupport* os_support) {
static void ParseCpuId(const uint32_t max_cpuid_leaf,
const OsSupport os_support, X86Info* info) {
const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
const Leaf leaf_7 = SafeCpuId(max_cpuid_leaf, 7);
const Leaf leaf_7_1 = SafeCpuIdEx(max_cpuid_leaf, 7, 1);

const bool have_xsave = IsBitSet(leaf_1.ecx, 26);
const bool have_osxsave = IsBitSet(leaf_1.ecx, 27);
const uint32_t xcr0_eax = (have_xsave && have_osxsave) ? GetXCR0Eax() : 0;
os_support->have_sse = HasXmmOsXSave(xcr0_eax);
os_support->have_avx = HasYmmOsXSave(xcr0_eax);
os_support->have_avx512 = HasZmmOsXSave(xcr0_eax);
os_support->have_amx = HasTmmOsXSave(xcr0_eax);

const uint32_t family = ExtractBitRange(leaf_1.eax, 11, 8);
const uint32_t extended_family = ExtractBitRange(leaf_1.eax, 27, 20);
const uint32_t model = ExtractBitRange(leaf_1.eax, 7, 4);
Expand Down Expand Up @@ -1142,7 +1240,9 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->vaes = IsBitSet(leaf_7.ecx, 9);
features->vpclmulqdq = IsBitSet(leaf_7.ecx, 10);

if (os_support->have_sse) {
if (os_support.have_sse_via_os) {
DetectSseViaOs(features);
} else if (os_support.have_sse_via_cpuid) {
features->sse = IsBitSet(leaf_1.edx, 25);
features->sse2 = IsBitSet(leaf_1.edx, 26);
features->sse3 = IsBitSet(leaf_1.ecx, 0);
Expand All @@ -1151,13 +1251,13 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->sse4_2 = IsBitSet(leaf_1.ecx, 20);
}

if (os_support->have_avx) {
if (os_support.have_avx) {
features->fma3 = IsBitSet(leaf_1.ecx, 12);
features->avx = IsBitSet(leaf_1.ecx, 28);
features->avx2 = IsBitSet(leaf_7.ebx, 5);
}

if (os_support->have_avx512) {
if (os_support.have_avx512) {
features->avx512f = IsBitSet(leaf_7.ebx, 16);
features->avx512cd = IsBitSet(leaf_7.ebx, 28);
features->avx512er = IsBitSet(leaf_7.ebx, 27);
Expand All @@ -1179,7 +1279,7 @@ static void ParseCpuId(const uint32_t max_cpuid_leaf, X86Info* info,
features->avx512_vp2intersect = IsBitSet(leaf_7.edx, 8);
}

if (os_support->have_amx) {
if (os_support.have_amx) {
features->amx_bf16 = IsBitSet(leaf_7.edx, 22);
features->amx_tile = IsBitSet(leaf_7.edx, 24);
features->amx_int8 = IsBitSet(leaf_7.edx, 25);
Expand All @@ -1195,7 +1295,7 @@ static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {

X86Features* const features = &info->features;

if (os_support.have_sse) {
if (os_support.have_sse_via_cpuid) {
features->sse4a = IsBitSet(leaf_80000001.ecx, 6);
}

Expand All @@ -1205,22 +1305,21 @@ static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {
}

static const X86Info kEmptyX86Info;
static const OsSupport kEmptyOsSupport;
static const CacheInfo kEmptyCacheInfo;

X86Info GetX86Info(void) {
X86Info info = kEmptyX86Info;
OsSupport os_support = kEmptyOsSupport;
const Leaf leaf_0 = CpuId(0);
const bool is_intel = IsVendor(leaf_0, "GenuineIntel");
const bool is_amd = IsVendor(leaf_0, "AuthenticAMD");
SetVendor(leaf_0, info.vendor);
if (is_intel || is_amd) {
const uint32_t max_cpuid_leaf = leaf_0.eax;
ParseCpuId(max_cpuid_leaf, &info, &os_support);
}
if (is_amd) {
ParseExtraAMDCpuId(&info, os_support);
const OsSupport os_support = CheckOsSupport(max_cpuid_leaf);
ParseCpuId(max_cpuid_leaf, os_support, &info);
if (is_amd) {
ParseExtraAMDCpuId(&info, os_support);
}
}
return info;
}
Expand Down
6 changes: 6 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ add_test(NAME unix_features_aggregator_test COMMAND unix_features_aggregator_tes
if(PROCESSOR_IS_X86)
add_executable(cpuinfo_x86_test cpuinfo_x86_test.cc ../src/cpuinfo_x86.c)
target_compile_definitions(cpuinfo_x86_test PUBLIC CPU_FEATURES_MOCK_CPUID_X86)
if(HAVE_UTSNAME_H)
target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_UTSNAME_H)
endif()
if(APPLE)
target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_SYSCTLBYNAME)
endif()
target_link_libraries(cpuinfo_x86_test all_libraries)
add_test(NAME cpuinfo_x86_test COMMAND cpuinfo_x86_test)
endif()
Expand Down
Loading