Skip to content

Commit

Permalink
Bug #24823885: PERFORMANCE REGRESSION WHEN CHANGING CHARACTER SET TO …
Browse files Browse the repository at this point in the history
…UTF8MB4

Unroll the ASCII fast path to check four bytes at a time. This is a tradeoff:
the fast path now only accepts four-byte blocks in which every byte is in the
0x20..0x7e range, so we lose out on blocks with mixed ASCII/non-ASCII (e.g. text
that is mostly ASCII but has some accents) and on some relatively common ASCII
code points outside that range, such as newlines.

  BM_SimpleUTF8MB4          232 -> 146 ns/iter  [+58.9%]
  BM_MixedUTF8MB4           230 -> 276 ns/iter  [-16.7%]
  BM_MixedUTF8MB4_AS_CS     759 -> 828 ns/iter  [ -8.3%]
  BM_NewlineFilledUTF8MB4   123 -> 231 ns/iter  [-46.8%]
  BM_HashSimpleUTF8MB4      299 -> 306 ns/iter  [ -2.3%]

Change-Id: I64dc2fa06482809cc2e530f2434e5c8890a4edb2
  • Loading branch information
Steinar H. Gunderson committed Nov 22, 2016
1 parent d68887b commit cd89578
Show file tree
Hide file tree
Showing 2 changed files with 263 additions and 30 deletions.
116 changes: 87 additions & 29 deletions strings/ctype-uca.cc
Original file line number Diff line number Diff line change
Expand Up @@ -795,25 +795,30 @@ class uca_scanner_900 : public my_uca_scanner

/**
For each weight in sequence, call "func", which should have
a function signature of "bool func(int weight)". Stops the
iteration early if "func" returns false.
a function signature of "bool func(int weight, bool is_level_separator)".
Stops the iteration early if "func" returns false.
This is morally equivalent to
int weight;
while ((weight= next()) >= 0)
{
if (!func(weight)) break;
if (!func(weight, weight == 0)) break;
}
except that it might employ optimizations internally to speed up
the process. These optimizations will not modify the number of calls
to func() (or their order), but might affect the internal scanner
state during the calls, so func() should not try to read from
the scanner except by calling public member functions.
As a special optimization, if "bool preaccept_data(int num_weights)"
returns true, the next "num_weights" calls to func() _must_ return
true. This is so that bounds checking costs can be amortized
over fewer calls.
*/
template<class T>
ALWAYS_INLINE(void for_each_weight(T func));
template<class T, class U>
ALWAYS_INLINE(void for_each_weight(T func, U preaccept_data));

private:
const Mb_wc mb_wc;
Expand Down Expand Up @@ -1646,16 +1651,17 @@ inline int uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::next_raw_single_level()
}

template<class Mb_wc, int LEVELS_FOR_COMPARE>
template<class T>
inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
template<class T, class U>
inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(
T func, U preaccept_data)
{
if (cs->tailoring || cs->mbminlen != 1)
{
// Slower, generic path.
int s_res;
while ((s_res= next()) >= 0)
{
if (!func(s_res)) return;
if (!func(s_res, s_res == 0)) return;
}
return;
}
Expand All @@ -1676,27 +1682,48 @@ inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
int s_res;
while ((s_res= more_weight()) >= 0)
{
if (!func(s_res)) return;
if (!func(s_res, s_res == 0)) return;
}

/*
Loop in a simple fast path as long as we only have ASCII characters.
ASCII characters always have just a single weight and consist of
only a single byte, so we can skip a lot of the checks we'd otherwise
have to do.
Loop in a simple fast path as long as we only have non-ignorable
ASCII characters. These characters always have exactly a single weight
and consist of only a single byte, so we can skip a lot of the checks
we'd otherwise have to do.
*/
const uchar *sbeg_copy= sbeg;
const uchar *sbeg_local= sbeg;
const uchar *send_local=
std::min(send, sbeg + (max_char_toscan - char_index));
while (sbeg < send_local && *sbeg < 0x80)
std::min(send, sbeg + (max_char_toscan - char_index)) - (sizeof(uint32) - 1);
while (sbeg_local < send_local && preaccept_data(sizeof(uint32)))
{
const int s_res= ascii_wpage[*sbeg++];
if (s_res && !func(s_res))
{
char_index+= sbeg - sbeg_copy;
return;
}
/*
Check if all four bytes are in the range 0x20..0x7e, inclusive.
These have exactly one weight. Note that this unfortunately does not
include tab and newline, which would otherwise be legal candidates.
See the FastOutOfRange unit test for verification that the bitfiddling
trick used here is correct.
*/
uint32 four_bytes;
memcpy(&four_bytes, sbeg_local, sizeof(four_bytes));
if (((four_bytes + 0x01010101u) & 0x80808080) ||
((four_bytes - 0x20202020u) & 0x80808080)) break;
const int s_res0= ascii_wpage[sbeg_local[0]];
const int s_res1= ascii_wpage[sbeg_local[1]];
const int s_res2= ascii_wpage[sbeg_local[2]];
const int s_res3= ascii_wpage[sbeg_local[3]];
DBUG_ASSERT(s_res0 != 0);
DBUG_ASSERT(s_res1 != 0);
DBUG_ASSERT(s_res2 != 0);
DBUG_ASSERT(s_res3 != 0);
func(s_res0, /*is_level_separator=*/false);
func(s_res1, /*is_level_separator=*/false);
func(s_res2, /*is_level_separator=*/false);
func(s_res3, /*is_level_separator=*/false);
sbeg_local+= sizeof(uint32);
}
sbeg= sbeg_local;
char_index+= sbeg - sbeg_copy;

// Do a single character in the generic path.
Expand All @@ -1706,7 +1733,7 @@ inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
// Level separator, so we have to update our page pointer.
ascii_wpage+= UCA900_DISTANCE_BETWEEN_LEVELS;
}
if (s_res < 0 || !func(s_res)) return;
if (s_res < 0 || !func(s_res, s_res == 0)) return;
}
}

Expand Down Expand Up @@ -5188,11 +5215,37 @@ static void my_hash_sort_uca_900_tmpl(const CHARSET_INFO *cs,
uint64 h= *n1;
h^= 14695981039346656037ULL;

scanner.for_each_weight([&](int s_res) {
h^= s_res;
h*= 1099511628211ULL;
/*
We don't want any 0x0001 weights before level markers or end-of-string
to count (see comments on my_strnxfrm_uca_900_tmpl for rationale).
Thus, whenever we see a 0x0001 weight, we keep updating pending_hash,
but we don't actually update h before we see something else. This way,
we can roll back the effect of these weights (similar to
strip_space_weights()) if we need to.
*/
uint64 pending_hash= h;

scanner.for_each_weight([&](int s_res, bool is_level_separator) {
if (is_level_separator)
{
/*
Level marker; roll back the hash to the last point we saw
a non-0x0001 weight, effectively doing space stripping.
*/
pending_hash= h;
}

pending_hash^= s_res;
pending_hash*= 1099511628211ULL;

if (s_res != 0x0001)
{
// Commit any pending 0x0001 weights.
h= pending_hash;
}

return true;
});
}, [](int num_weights) { return true; });

*n1= static_cast<ulong>(h);
}
Expand Down Expand Up @@ -5369,14 +5422,19 @@ static size_t my_strnxfrm_uca_900_tmpl(const CHARSET_INFO *cs,
restart:
if (dst != dst_end)
{
scanner.for_each_weight([&](int s_res) {
scanner.for_each_weight([&dst, d0, dst_end, flags]
(int s_res, bool is_level_separator) {
DBUG_ASSERT(is_level_separator == (s_res == 0));
if (LEVELS_FOR_COMPARE == 1)
DBUG_ASSERT(s_res != 0); // Level separator should never happen.
else if (s_res == 0 && (flags & MY_STRXFRM_PAD_WITH_SPACE))
DBUG_ASSERT(!is_level_separator);
else if (is_level_separator && (flags & MY_STRXFRM_PAD_WITH_SPACE))
dst= strip_space_weights(d0, dst);

dst= store16be(dst, s_res);
return (dst < dst_end);
},
[&dst, dst_end](int num_weights) {
return (dst < dst_end - num_weights * 2);
});
}

Expand Down
177 changes: 176 additions & 1 deletion unittest/gunit/strings_strnxfrm-t.cc
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,70 @@ static void BM_MixedUTF8MB4_AS_CS(size_t num_iterations)
}
BENCHMARK(BM_MixedUTF8MB4_AS_CS);

/*
A benchmark that illustrates the potential perils of not including the
range [0x00,0x20) in our fast path; newlines throw us off the fast path
and reduce speed.
The newlines are spaced a bit randomly in order not to create a perfectly
predictable pattern for the branch predictor (benchmark paranoia).
*/
static void BM_NewlineFilledUTF8MB4(size_t num_iterations)
{
StopBenchmarkTiming();

const char *content= "This is a\n prett\ny unrealist\nic case; a\nn "
"Eng\nlish sente\nnce where\n we'\nve added a new\nline every te\nn "
"bytes or\n so.\n";
const int len= strlen(content);

// Just recorded from a trial run on the string above.
static constexpr uchar expected[]= {
0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71,
0x00, 0x01, 0x1d, 0x32, 0x1e, 0x71, 0x00, 0x01,
0x1c, 0x47, 0x02, 0x02, 0x00, 0x01, 0x1e, 0x0c,
0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95,
0x02, 0x02, 0x1f, 0x0b, 0x00, 0x01, 0x1e, 0xb5,
0x1d, 0xb9, 0x1e, 0x33, 0x1c, 0xaa, 0x1c, 0x47,
0x1d, 0x77, 0x1d, 0x32, 0x1e, 0x71, 0x1e, 0x95,
0x02, 0x02, 0x1d, 0x32, 0x1c, 0x7a, 0x00, 0x01,
0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0xaa,
0x02, 0x34, 0x00, 0x01, 0x1c, 0x47, 0x02, 0x02,
0x1d, 0xb9, 0x00, 0x01, 0x1c, 0xaa, 0x1d, 0xb9,
0x1c, 0xf4, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32,
0x1e, 0x71, 0x1d, 0x18, 0x00, 0x01, 0x1e, 0x71,
0x1c, 0xaa, 0x1d, 0xb9, 0x1e, 0x95, 0x1c, 0xaa,
0x02, 0x02, 0x1d, 0xb9, 0x1c, 0x7a, 0x1c, 0xaa,
0x00, 0x01, 0x1e, 0xf5, 0x1d, 0x18, 0x1c, 0xaa,
0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x02, 0x00, 0x01,
0x1e, 0xf5, 0x1c, 0xaa, 0x03, 0x05, 0x02, 0x02,
0x1e, 0xe3, 0x1c, 0xaa, 0x00, 0x01, 0x1c, 0x47,
0x1c, 0x8f, 0x1c, 0x8f, 0x1c, 0xaa, 0x1c, 0x8f,
0x00, 0x01, 0x1c, 0x47, 0x00, 0x01, 0x1d, 0xb9,
0x1c, 0xaa, 0x1e, 0xf5, 0x02, 0x02, 0x1d, 0x77,
0x1d, 0x32, 0x1d, 0xb9, 0x1c, 0xaa, 0x00, 0x01,
0x1c, 0xaa, 0x1e, 0xe3, 0x1c, 0xaa, 0x1e, 0x33,
0x1f, 0x0b, 0x00, 0x01, 0x1e, 0x95, 0x1c, 0xaa,
0x02, 0x02, 0x1d, 0xb9, 0x00, 0x01, 0x1c, 0x60,
0x1f, 0x0b, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x71,
0x00, 0x01, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x02,
0x00, 0x01, 0x1e, 0x71, 0x1d, 0xdd, 0x02, 0x77,
0x02, 0x02
};
uchar dest[sizeof(expected)];

StartBenchmarkTiming();
for (size_t i= 0; i < num_iterations; ++i)
{
my_strnxfrm(&my_charset_utf8mb4_0900_ai_ci, dest, sizeof(dest),
reinterpret_cast<const uchar *>(content), len);
}
StopBenchmarkTiming();

expect_arrays_equal(expected, dest, sizeof(dest));
}
BENCHMARK(BM_NewlineFilledUTF8MB4);

static void BM_HashSimpleUTF8MB4(size_t num_iterations)
{
StopBenchmarkTiming();
Expand Down Expand Up @@ -779,4 +843,115 @@ TEST(PadCollationTest, Strxfrm)
&my_charset_utf8mb4_0900_as_cs, "", "\t"), 0);
}

} // namespace
/*
  Exhaustively checks, over all 2^32 combinations of four bytes, that the
  branch-free "is any byte outside 0x20..0x7e?" bit trick gives the same
  answer as a plain byte-by-byte comparison.

  Disabled by default, since it needs ~10 seconds to run, even in
  optimized mode.
*/
TEST(BitfiddlingTest, DISABLED_FastOutOfRange)
{
  // Reference predicate: true if a single byte value is outside 0x20..0x7e.
  const auto outside= [](int value) { return value < 0x20 || value > 0x7e; };

  unsigned char quad[4];
  for (int b0= 0; b0 < 256; ++b0)
  {
    quad[0]= b0;
    for (int b1= 0; b1 < 256; ++b1)
    {
      quad[1]= b1;
      for (int b2= 0; b2 < 256; ++b2)
      {
        quad[2]= b2;
        for (int b3= 0; b3 < 256; ++b3)
        {
          quad[3]= b3;
          const bool reference=
            outside(b0) || outside(b1) || outside(b2) || outside(b3);

          // Load the four bytes as one word, in memory order.
          uint32 packed;
          memcpy(&packed, quad, sizeof(packed));

          // Adding 0x01 to each byte sets its high bit for bytes >= 0x7f;
          // subtracting 0x20 sets it (via borrow) for bytes < 0x20.
          const bool too_high= ((packed + 0x01010101u) & 0x80808080) != 0;
          const bool too_low= ((packed - 0x20202020u) & 0x80808080) != 0;

          EXPECT_EQ(reference, too_high || too_low);
        }
      }
    }
  }
}

/*
  Two-byte counterpart of the four-byte out-of-range check: runs the
  analogous bit trick on a 16-bit integer and compares it against a plain
  byte-by-byte range test for all 65536 byte pairs. Much, much faster than
  the exhaustive 32-bit test.
*/
TEST(BitfiddlingTest, FastOutOfRange16)
{
  // Reference predicate: true if a single byte value is outside 0x20..0x7e.
  const auto outside= [](int value) { return value < 0x20 || value > 0x7e; };

  unsigned char pair[2];
  for (int b0= 0; b0 < 256; ++b0)
  {
    pair[0]= b0;
    for (int b1= 0; b1 < 256; ++b1)
    {
      pair[1]= b1;
      const bool reference= outside(b0) || outside(b1);

      // Load the two bytes as one word, in memory order.
      uint16 packed;
      memcpy(&packed, pair, sizeof(packed));

      const bool too_high=
        ((packed + uint16{0x0101}) & uint16{0x8080}) != 0;
      const bool too_low=
        ((packed - uint16{0x2020}) & uint16{0x8080}) != 0;

      EXPECT_EQ(reference, too_high || too_low);
    }
  }
}

/*
  Runs the NUL-terminated string "str" through the hash_sort function of
  the collation handler of "cs", starting from fixed seeds (1 and 4), and
  returns the resulting first hash value. The second value is discarded.
*/
ulong hash(CHARSET_INFO *cs, const char *str)
{
  ulong hash_value= 1;
  ulong secondary= 4;
  const uchar *key= pointer_cast<const uchar *>(str);
  cs->coll->hash_sort(cs, key, strlen(str), &hash_value, &secondary);
  return hash_value;
}

/*
  Checks how hash_sort treats spaces for the utf8mb4 0900 ai_ci and as_cs
  collations: trailing spaces must not influence the hash, while spaces in
  the middle of the string must.

  NOTE: Every "not equal" expectation in this test has an infinitesimal
  chance of failing by pure accident, since two different inputs can
  still end up with the same hash value.
*/
TEST(PadCollationTest, HashSort)
{
  CHARSET_INFO *const insensitive= &my_charset_utf8mb4_0900_ai_ci;
  CHARSET_INFO *const sensitive= &my_charset_utf8mb4_0900_as_cs;

  // Shorthands that hash a string under one of the two collations.
  const auto ai= [insensitive](const char *s) { return hash(insensitive, s); };
  const auto as= [sensitive](const char *s) { return hash(sensitive, s); };

  // Basic sanity checks.
  EXPECT_EQ(ai("abc"), ai("abc"));
  EXPECT_NE(ai("abc"), ai("def"));

  // Spaces at the end should not matter, no matter the collation.
  EXPECT_EQ(ai("abc"), ai("abc "));
  EXPECT_EQ(as("abc"), as("abc "));
  EXPECT_NE(as("abc"), as("Abc "));

  // Same with other types of spaces.
  EXPECT_EQ(ai("abc"), ai(u8"abc \u00a0"));

  // Non-breaking space should compare _equal_ to space in ai_ci,
  // but _unequal_ in as_cs.
  EXPECT_EQ(ai("abc "), ai(u8"abc\u00a0"));
  EXPECT_NE(as("abc "), as(u8"abc\u00a0"));
  EXPECT_NE(as("abc"), as(u8"abc\u00a0"));

  // Also in the middle of the string.
  EXPECT_EQ(ai("a c"), ai(u8"a\u00a0c"));
  EXPECT_NE(as("a c"), as(u8"a\u00a0c"));

  // Verify that space in the middle of the string isn't stripped.
  EXPECT_NE(ai("ab c"), ai("abc"));
  EXPECT_NE(as("ab c"), as("abc"));
}

} // namespace strnxfrm_unittest

0 comments on commit cd89578

Please sign in to comment.