Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SSE41 path for streamvbyte_compressedbytes. #57

Merged
merged 5 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 2 additions & 30 deletions include/streamvbyte.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,43 +43,15 @@ static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) {
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
// the compressed data: the user needs to ensure that this region is allocated, and it
// is not included by streamvbyte_compressedbytes.
static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
  // Number of control bytes: one per group of up to four integers, i.e.
  // ceil(length / 4). Written without forming `length + 3`, which is
  // evaluated in 32-bit unsigned arithmetic and wraps around to a tiny
  // value when length > UINT32_MAX - 3.
  size_t cb = (size_t)(length >> 2) + ((length & 3) ? 1 : 0);
  // Number of data bytes: each value contributes the fewest whole bytes
  // (1 to 4) that can hold it.
  size_t db = 0;
  for (uint32_t c = 0; c < length; c++) {
    uint32_t val = in[c];

    if (val < (1 << 8)) {
      db += 1;
    } else if (val < (1 << 16)) {
      db += 2;
    } else if (val < (1 << 24)) {
      db += 3;
    } else {
      db += 4;
    }
  }
  return cb + db;
}
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length);

// return the exact number of compressed bytes given length input integers
// runtime in O(n) wrt. in; use streamvbyte_max_compressedbytes if you
// care about speed more than potentially over-allocating memory
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
// the compressed data: the user needs to ensure that this region is allocated, and it
// is not included by streamvbyte_compressedbytes_0124.
static inline size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
  // Number of control bytes: one per group of up to four integers, i.e.
  // ceil(length / 4). Written without forming `length + 3`, which is
  // evaluated in 32-bit unsigned arithmetic and wraps around to a tiny
  // value when length > UINT32_MAX - 3.
  size_t cb = (size_t)(length >> 2) + ((length & 3) ? 1 : 0);
  // Number of data bytes: in the 0/1/2/4 scheme a zero takes no data
  // bytes, and anything that does not fit in two bytes takes four.
  size_t db = 0;
  for (uint32_t c = 0; c < length; c++) {
    uint32_t val = in[c];

    if (val == 0) {
      db += 0;
    } else if (val < (1 << 8)) {
      db += 1;
    } else if (val < (1 << 16)) {
      db += 2;
    } else {
      db += 4;
    }
  }
  return cb + db;
}
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length);

// Read "length" 32-bit integers in varint format from in, storing the result in out.
// Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes
Expand Down
40 changes: 40 additions & 0 deletions src/streamvbyte_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,46 @@ static uint8_t *svb_encode_scalar(const uint32_t *in,
#include "streamvbyte_arm_encode.c"
#endif

// Total number of data bytes (control bytes excluded) needed to store the
// first `length` integers of `in`, where each integer occupies the fewest
// whole bytes (1, 2, 3 or 4) that can represent it.
static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) {
  size_t total = 0;
  for (uint32_t i = 0; i != length; ++i) {
    const uint32_t v = in[i];
    size_t width;

    if (v <= 0x000000FF) {
      width = 1;
    } else if (v <= 0x0000FFFF) {
      width = 2;
    } else if (v <= 0x00FFFFFF) {
      width = 3;
    } else {
      width = 4;
    }
    total += width;
  }
  return total;
}

// Total number of data bytes (control bytes excluded) needed to store the
// first `length` integers of `in` under the 0/1/2/4 scheme: zero takes no
// bytes, values up to 0xFF take one, values up to 0xFFFF take two, and
// everything larger takes four.
static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) {
  size_t total = 0;
  for (uint32_t i = 0; i != length; ++i) {
    const uint32_t v = in[i];
    size_t width;

    if (v == 0x00000000) {
      width = 0;
    } else if (v <= 0x000000FF) {
      width = 1;
    } else if (v <= 0x0000FFFF) {
      width = 2;
    } else {
      width = 4;
    }
    total += width;
  }
  return total;
}

// Returns the exact number of bytes (control bytes plus data bytes) taken by
// the `length` integers at `in` in the 1/2/3/4-byte format. When
// STREAMVBYTE_X64 is defined and streamvbyte_sse41() reports support, the
// data bytes are counted by svb_data_bytes_SSE41; otherwise the scalar
// routine is used.
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
  // Number of control bytes: ceil(length / 4). Written without forming
  // `length + 3`, which is evaluated in 32-bit unsigned arithmetic and wraps
  // around to a tiny value when length > UINT32_MAX - 3.
  size_t cb = (size_t)(length >> 2) + ((length & 3) ? 1 : 0);

#ifdef STREAMVBYTE_X64
  if (streamvbyte_sse41()) {
    return cb + svb_data_bytes_SSE41(in, length);
  }
#endif
  return cb + svb_data_bytes_scalar(in, length);
}

// Returns the exact number of bytes (control bytes plus data bytes) taken by
// the `length` integers at `in` in the 0/1/2/4-byte format. Only a scalar
// implementation is used for this variant.
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
  // Number of control bytes: ceil(length / 4). Written without forming
  // `length + 3`, which is evaluated in 32-bit unsigned arithmetic and wraps
  // around to a tiny value when length > UINT32_MAX - 3.
  size_t cb = (size_t)(length >> 2) + ((length & 3) ? 1 : 0);

  return cb + svb_data_bytes_0124_scalar(in, length);
}


// Encode an array of a given length read from in to bout in streamvbyte format.
Expand Down
53 changes: 44 additions & 9 deletions src/streamvbyte_x64_encode.c
Original file line number Diff line number Diff line change
@@ -1,15 +1,55 @@
#include "streamvbyte_isadetection.h"
#ifdef STREAMVBYTE_X64
// contributed by aqrit

static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length);

// Computes the 16 key bits for eight 32-bit integers: `lo` holds the first
// four integers and `hi` the next four. The result carries two bits per
// integer (bits 0-7 describe `lo`, bits 8-15 describe `hi`); each 2-bit code
// is 0, 1, 2 or 3 for a value that needs 1, 2, 3 or 4 bytes respectively.
// Callers use each 8-bit half as an index into lookup tables.
STREAMVBYTE_TARGET_SSE41
static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) {
const __m128i mask_01 = _mm_set1_epi8(0x01);
const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);

__m128i m0, m1;
size_t keys;

// Clamp every byte to 0 or 1: 1 marks a nonzero input byte.
m0 = _mm_min_epu8(mask_01, lo);
m1 = _mm_min_epu8(mask_01, hi);
// Narrow each 16-bit half-word to one byte with unsigned saturation:
// 0x0000 -> 0x00, 0x0001 -> 0x01, and anything with a nonzero high byte
// (0x0100, 0x0101) -> 0xFF. Each input integer is now one 16-bit lane.
m0 = _mm_packus_epi16(m0, m1);
// Signed 16-bit minimum: lanes whose high byte is 0xFF are negative and
// are left untouched.
m0 = _mm_min_epi16(m0, mask_01); // convert 0x01FF to 0x0101
// Saturating add moves the information into the top bit of each byte so
// that movemask can collect it.
m0 = _mm_adds_epu16(m0, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
// Gather the most significant bit of each of the 16 bytes.
keys = (size_t)_mm_movemask_epi8(m0);
return keys;
}
STREAMVBYTE_UNTARGET_REGION

// Returns the number of data bytes (control bytes excluded) needed for the
// first `count` integers of `in`. Full groups of eight integers are handled
// with SSE4.1: the 16 key bits from svb_control_SSE41 are split into two
// 8-bit halves and each half is looked up in len_lut (defined elsewhere;
// presumably the summed data length of the four integers that key byte
// describes -- confirm against the table). The remaining `count & 7`
// integers go through svb_data_bytes_scalar.
STREAMVBYTE_TARGET_SSE41
size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) {
  size_t dataLen = 0;

  // `in` is advanced in place, so after the loop it points at the tail.
  for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
  {
    __m128i r0, r1;
    size_t keys;

    // Unaligned loads; the casts keep the const qualifier of the input
    // instead of discarding it.
    r0 = _mm_loadu_si128((const __m128i *) &in[0]);
    r1 = _mm_loadu_si128((const __m128i *) &in[4]);

    keys = svb_control_SSE41(r0, r1);
    dataLen += len_lut[keys & 0xFF];
    dataLen += len_lut[keys >> 8];
  }

  dataLen += svb_data_bytes_scalar(in, count & 7);
  return dataLen;
}
STREAMVBYTE_UNTARGET_REGION

STREAMVBYTE_TARGET_SSE41
size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) {
uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundry
uint8_t *restrict keyPtr = &out[0];
uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys

const __m128i mask_01 = _mm_set1_epi8(0x01);
const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);

for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
{
__m128i r0, r1, r2, r3;
Expand All @@ -18,12 +58,7 @@ size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* ou
r0 = _mm_loadu_si128((__m128i*)&in[0]);
r1 = _mm_loadu_si128((__m128i*)&in[4]);

r2 = _mm_min_epu8(mask_01, r0);
r3 = _mm_min_epu8(mask_01, r1);
r2 = _mm_packus_epi16(r2, r3);
r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101
r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
keys = (size_t)_mm_movemask_epi8(r2);
keys = svb_control_SSE41(r0, r1);

r2 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys << 4) & 0x03F0]);
r3 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys >> 4) & 0x03F0]);
Expand Down