Bug #24823885: PERFORMANCE REGRESSION WHEN CHANGING CHARACTER SET TO UTF8MB4

Unroll the ASCII fast path to check four bytes at a time. This is a tradeoff; we lose out on the cases where we have four-byte blocks with mixed ASCII/non-ASCII (e.g. in text with mostly ASCII but some accents) and on some relatively common ASCII code points outside the 0x20..0x7e range, such as newlines.

BM_SimpleUTF8MB4          232 -> 146 ns/iter  [+58.9%]
BM_MixedUTF8MB4           230 -> 276 ns/iter  [-16.7%]
BM_MixedUTF8MB4_AS_CS     759 -> 828 ns/iter  [ -8.3%]
BM_NewlineFilledUTF8MB4   123 -> 231 ns/iter  [-46.8%]
BM_HashSimpleUTF8MB4      299 -> 306 ns/iter  [ -2.3%]

Change-Id: I64dc2fa06482809cc2e530f2434e5c8890a4edb2
mysql · Nov 22, 2016 · cd89578 · cd89578
1 parent d68887b
commit cd89578
Show file tree

Hide file tree

Showing 2 changed files with 263 additions and 30 deletions.
diff --git a/strings/ctype-uca.cc b/strings/ctype-uca.cc
@@ -795,25 +795,30 @@ class uca_scanner_900 : public my_uca_scanner
 
   /**
     For each weight in sequence, call "func", which should have
-    a function signature of "bool func(int weight)". Stops the
-    iteration early if "func" returns false.
+    a function signature of "bool func(int weight, bool is_level_separator)".
+    Stops the iteration early if "func" returns false.
 
     This is morally equivalent to
 
       int weight;
       while ((weight= next()) >= 0)
       {
-        if (!func(weight)) break;
+        if (!func(weight, weight == 0)) break;
       }
 
     except that it might employ optimizations internally to speed up
     the process. These optimizations will not modify the number of calls
     to func() (or their order), but might affect the internal scanner
     state during the calls, so func() should not try to read from
     the scanner except by calling public member functions.
+
+    As a special optimization, if "bool preaccept_data(int num_weights)"
+    returns true, the next "num_weights" calls to func() _must_ return
+    true; the fast path does not check their return values. This lets the
+    cost of bounds checking be amortized over fewer calls.
   */
-  template<class T>
-  ALWAYS_INLINE(void for_each_weight(T func));
+  template<class T, class U>
+  ALWAYS_INLINE(void for_each_weight(T func, U preaccept_data));
 
 private:
   const Mb_wc mb_wc;
@@ -1646,16 +1651,17 @@ inline int uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::next_raw_single_level()
 }
 
 template<class Mb_wc, int LEVELS_FOR_COMPARE>
-template<class T>
-inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
+template<class T, class U>
+inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(
+  T func, U preaccept_data)
 {
   if (cs->tailoring || cs->mbminlen != 1)
   {
     // Slower, generic path.
     int s_res;
     while ((s_res= next()) >= 0)
     {
-      if (!func(s_res)) return;
+      if (!func(s_res, s_res == 0)) return;
     }
     return;
   }
@@ -1676,27 +1682,48 @@ inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
     int s_res;
     while ((s_res= more_weight()) >= 0)
     {
-      if (!func(s_res)) return;
+      if (!func(s_res, s_res == 0)) return;
     }
 
     /*
-      Loop in a simple fast path as long as we only have ASCII characters.
-      ASCII characters always have just a single weight and consist of
-      only a single byte, so we can skip a lot of the checks we'd otherwise
-      have to do.
+      Loop in a simple fast path as long as we only have non-ignorable
+      ASCII characters. These characters always have exactly a single weight
+      and consist of only a single byte, so we can skip a lot of the checks
+      we'd otherwise have to do.
     */
     const uchar *sbeg_copy= sbeg;
+    const uchar *sbeg_local= sbeg;
     const uchar *send_local=
-      std::min(send, sbeg + (max_char_toscan - char_index));
-    while (sbeg < send_local && *sbeg < 0x80)
+      std::min(send, sbeg + (max_char_toscan - char_index)) - (sizeof(uint32) - 1);
+    while (sbeg_local < send_local && preaccept_data(sizeof(uint32)))
     {
-      const int s_res= ascii_wpage[*sbeg++];
-      if (s_res && !func(s_res))
-      {
-        char_index+= sbeg - sbeg_copy;
-        return;
-      }
+      /*
+        Check if all four bytes are in the range 0x20..0x7e, inclusive.
+        These have exactly one weight. Note that this unfortunately does not
+        include tab and newline, which would otherwise be legal candidates.
+
+        See the FastOutOfRange unit test for verification that the bitfiddling
+        trick used here is correct.
+      */
+      uint32 four_bytes;
+      memcpy(&four_bytes, sbeg_local, sizeof(four_bytes));
+      if (((four_bytes + 0x01010101u) & 0x80808080) ||
+          ((four_bytes - 0x20202020u) & 0x80808080)) break;
+      const int s_res0= ascii_wpage[sbeg_local[0]];
+      const int s_res1= ascii_wpage[sbeg_local[1]];
+      const int s_res2= ascii_wpage[sbeg_local[2]];
+      const int s_res3= ascii_wpage[sbeg_local[3]];
+      DBUG_ASSERT(s_res0 != 0);
+      DBUG_ASSERT(s_res1 != 0);
+      DBUG_ASSERT(s_res2 != 0);
+      DBUG_ASSERT(s_res3 != 0);
+      func(s_res0, /*is_level_separator=*/false);
+      func(s_res1, /*is_level_separator=*/false);
+      func(s_res2, /*is_level_separator=*/false);
+      func(s_res3, /*is_level_separator=*/false);
+      sbeg_local+= sizeof(uint32);
     }
+    sbeg= sbeg_local;
     char_index+= sbeg - sbeg_copy;
 
     // Do a single character in the generic path.
@@ -1706,7 +1733,7 @@ inline void uca_scanner_900<Mb_wc, LEVELS_FOR_COMPARE>::for_each_weight(T func)
       // Level separator, so we have to update our page pointer.
       ascii_wpage+= UCA900_DISTANCE_BETWEEN_LEVELS;
     }
-    if (s_res < 0 || !func(s_res)) return;
+    if (s_res < 0 || !func(s_res, s_res == 0)) return;
   }
 }
 
@@ -5188,11 +5215,37 @@ static void my_hash_sort_uca_900_tmpl(const CHARSET_INFO *cs,
   uint64 h= *n1;
   h^= 14695981039346656037ULL;
 
-  scanner.for_each_weight([&](int s_res) {
-    h^= s_res;
-    h*= 1099511628211ULL;
+  /*
+    We don't want any 0x0001 weights immediately before a level separator
+    or end-of-string to count (see comments on my_strnxfrm_uca_900_tmpl for
+    rationale). Thus, every weight is folded into pending_hash, but h is
+    only updated from it when we see a weight other than 0x0001. This way,
+    we can roll back the effect of trailing 0x0001 weights (similar to
+    strip_space_weights()) if we need to.
+  */
+  uint64 pending_hash= h;
+
+  scanner.for_each_weight([&](int s_res, bool is_level_separator) {
+    if (is_level_separator)
+    {
+      /*
+        Level marker; roll back the hash to the last point we saw
+        a non-0x0001 weight, effectively doing space stripping.
+      */
+      pending_hash= h;
+    }
+
+    pending_hash^= s_res;
+    pending_hash*= 1099511628211ULL;
+
+    if (s_res != 0x0001)
+    {
+      // Commit any pending 0x0001 weights.
+      h= pending_hash;
+    }
+
     return true;
-  });
+  }, [](int num_weights) { return true; });
 
   *n1= static_cast<ulong>(h);
 }
@@ -5369,14 +5422,19 @@ static size_t my_strnxfrm_uca_900_tmpl(const CHARSET_INFO *cs,
 restart:
   if (dst != dst_end)
   {
-    scanner.for_each_weight([&](int s_res) {
+    scanner.for_each_weight([&dst, d0, dst_end, flags]
+                              (int s_res, bool is_level_separator) {
+      DBUG_ASSERT(is_level_separator == (s_res == 0));
       if (LEVELS_FOR_COMPARE == 1)
-        DBUG_ASSERT(s_res != 0);  // Level separator should never happen.
-      else if (s_res == 0 && (flags & MY_STRXFRM_PAD_WITH_SPACE))
+        DBUG_ASSERT(!is_level_separator);
+      else if (is_level_separator && (flags & MY_STRXFRM_PAD_WITH_SPACE))
         dst= strip_space_weights(d0, dst);
 
       dst= store16be(dst, s_res);
       return (dst < dst_end);
+    },
+    [&dst, dst_end](int num_weights) {
+      return (dst < dst_end - num_weights * 2);
     });
   }
 

diff --git a/unittest/gunit/strings_strnxfrm-t.cc b/unittest/gunit/strings_strnxfrm-t.cc
@@ -619,6 +619,70 @@ static void BM_MixedUTF8MB4_AS_CS(size_t num_iterations)
 }
 BENCHMARK(BM_MixedUTF8MB4_AS_CS);
 
+/*
+  A benchmark that illustrates the potential perils of not including the
+  range [0x00,0x20) in our fast path; newlines throw us off the fast path
+  and reduce speed.
+
+  The newlines are spaced a bit randomly in order not to create a perfectly
+  predictable pattern for the branch predictor (benchmark paranoia).
+*/
+static void BM_NewlineFilledUTF8MB4(size_t num_iterations)
+{
+  StopBenchmarkTiming();
+
+  const char *content= "This is a\n prett\ny unrealist\nic case; a\nn "
+    "Eng\nlish sente\nnce where\n we'\nve added a new\nline every te\nn "
+    "bytes or\n so.\n";
+  const int len= strlen(content);
+
+  // Just recorded from a trial run on the string above.
+  static constexpr uchar expected[]= {
+    0x1e, 0x95, 0x1d, 0x18, 0x1d, 0x32, 0x1e, 0x71,
+    0x00, 0x01, 0x1d, 0x32, 0x1e, 0x71, 0x00, 0x01,
+    0x1c, 0x47, 0x02, 0x02, 0x00, 0x01, 0x1e, 0x0c,
+    0x1e, 0x33, 0x1c, 0xaa, 0x1e, 0x95, 0x1e, 0x95,
+    0x02, 0x02, 0x1f, 0x0b, 0x00, 0x01, 0x1e, 0xb5,
+    0x1d, 0xb9, 0x1e, 0x33, 0x1c, 0xaa, 0x1c, 0x47,
+    0x1d, 0x77, 0x1d, 0x32, 0x1e, 0x71, 0x1e, 0x95,
+    0x02, 0x02, 0x1d, 0x32, 0x1c, 0x7a, 0x00, 0x01,
+    0x1c, 0x7a, 0x1c, 0x47, 0x1e, 0x71, 0x1c, 0xaa,
+    0x02, 0x34, 0x00, 0x01, 0x1c, 0x47, 0x02, 0x02,
+    0x1d, 0xb9, 0x00, 0x01, 0x1c, 0xaa, 0x1d, 0xb9,
+    0x1c, 0xf4, 0x02, 0x02, 0x1d, 0x77, 0x1d, 0x32,
+    0x1e, 0x71, 0x1d, 0x18, 0x00, 0x01, 0x1e, 0x71,
+    0x1c, 0xaa, 0x1d, 0xb9, 0x1e, 0x95, 0x1c, 0xaa,
+    0x02, 0x02, 0x1d, 0xb9, 0x1c, 0x7a, 0x1c, 0xaa,
+    0x00, 0x01, 0x1e, 0xf5, 0x1d, 0x18, 0x1c, 0xaa,
+    0x1e, 0x33, 0x1c, 0xaa, 0x02, 0x02, 0x00, 0x01,
+    0x1e, 0xf5, 0x1c, 0xaa, 0x03, 0x05, 0x02, 0x02,
+    0x1e, 0xe3, 0x1c, 0xaa, 0x00, 0x01, 0x1c, 0x47,
+    0x1c, 0x8f, 0x1c, 0x8f, 0x1c, 0xaa, 0x1c, 0x8f,
+    0x00, 0x01, 0x1c, 0x47, 0x00, 0x01, 0x1d, 0xb9,
+    0x1c, 0xaa, 0x1e, 0xf5, 0x02, 0x02, 0x1d, 0x77,
+    0x1d, 0x32, 0x1d, 0xb9, 0x1c, 0xaa, 0x00, 0x01,
+    0x1c, 0xaa, 0x1e, 0xe3, 0x1c, 0xaa, 0x1e, 0x33,
+    0x1f, 0x0b, 0x00, 0x01, 0x1e, 0x95, 0x1c, 0xaa,
+    0x02, 0x02, 0x1d, 0xb9, 0x00, 0x01, 0x1c, 0x60,
+    0x1f, 0x0b, 0x1e, 0x95, 0x1c, 0xaa, 0x1e, 0x71,
+    0x00, 0x01, 0x1d, 0xdd, 0x1e, 0x33, 0x02, 0x02,
+    0x00, 0x01, 0x1e, 0x71, 0x1d, 0xdd, 0x02, 0x77,
+    0x02, 0x02
+  };
+  uchar dest[sizeof(expected)];
+
+  StartBenchmarkTiming();
+  for (size_t i= 0; i < num_iterations; ++i)
+  {
+    my_strnxfrm(&my_charset_utf8mb4_0900_ai_ci, dest, sizeof(dest),
+      reinterpret_cast<const uchar *>(content), len);
+  }
+  StopBenchmarkTiming();
+
+  expect_arrays_equal(expected, dest, sizeof(dest));
+}
+BENCHMARK(BM_NewlineFilledUTF8MB4);
+
 static void BM_HashSimpleUTF8MB4(size_t num_iterations)
 {
   StopBenchmarkTiming();
@@ -779,4 +843,115 @@ TEST(PadCollationTest, Strxfrm)
     &my_charset_utf8mb4_0900_as_cs, "", "\t"), 0);
 }
 
-}  // namespace
+/*
+  This test is disabled by default since it needs ~10 seconds to run,
+  even in optimized mode.
+*/
+TEST(BitfiddlingTest, DISABLED_FastOutOfRange)
+{
+  unsigned char bytes[4];
+  for (int a= 0; a < 256; ++a)
+  {
+    bytes[0]= a;
+    for (int b= 0; b < 256; ++b)
+    {
+      bytes[1]= b;
+      for (int c= 0; c < 256; ++c)
+      {
+        bytes[2]= c;
+        for (int d= 0; d < 256; ++d)
+        {
+          bytes[3]= d;
+          bool any_out_of_range_slow=
+            (a < 0x20 || a > 0x7e) ||
+            (b < 0x20 || b > 0x7e) ||
+            (c < 0x20 || c > 0x7e) ||
+            (d < 0x20 || d > 0x7e);
+
+          uint32 four_bytes;
+          memcpy(&four_bytes, bytes, sizeof(four_bytes));
+          bool any_out_of_range_fast=
+            (((four_bytes + 0x01010101u) & 0x80808080) ||
+             ((four_bytes - 0x20202020u) & 0x80808080));
+
+          EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast);
+        }
+      }
+    }
+  }
+}
+
+/*
+  A version of FastOutOfRange that tests the analogous trick for 16-bit
+  integers instead (much, much faster).
+*/
+TEST(BitfiddlingTest, FastOutOfRange16)
+{
+  unsigned char bytes[2];
+  for (int a= 0; a < 256; ++a)
+  {
+    bytes[0]= a;
+    for (int b= 0; b < 256; ++b)
+    {
+      bytes[1]= b;
+      bool any_out_of_range_slow=
+        (a < 0x20 || a > 0x7e) ||
+        (b < 0x20 || b > 0x7e);
+
+      uint16 two_bytes;
+      memcpy(&two_bytes, bytes, sizeof(two_bytes));
+      bool any_out_of_range_fast=
+        (((two_bytes + uint16{0x0101}) & uint16{0x8080}) ||
+         ((two_bytes - uint16{0x2020}) & uint16{0x8080}));
+
+      EXPECT_EQ(any_out_of_range_slow, any_out_of_range_fast);
+    }
+  }
+}
+
+ulong hash(CHARSET_INFO *cs, const char *str)
+{
+  ulong nr1=1, nr2= 4;
+  cs->coll->hash_sort(
+    cs, pointer_cast<const uchar *>(str), strlen(str), &nr1, &nr2);
+  return nr1;
+}
+
+/*
+  NOTE: In this entire test, there's an infinitesimal chance
+  that two hashes we expect to differ (EXPECT_NE) still match
+  by pure accident, i.e. through a hash collision.
+*/
+TEST(PadCollationTest, HashSort)
+{
+  CHARSET_INFO *ai_ci= &my_charset_utf8mb4_0900_ai_ci;
+  CHARSET_INFO *as_cs= &my_charset_utf8mb4_0900_as_cs;
+
+  // Basic sanity checks.
+  EXPECT_EQ(hash(ai_ci, "abc"), hash(ai_ci, "abc"));
+  EXPECT_NE(hash(ai_ci, "abc"), hash(ai_ci, "def"));
+
+  // Spaces from the end should not matter, no matter the collation.
+  EXPECT_EQ(hash(ai_ci, "abc"), hash(ai_ci, "abc  "));
+  EXPECT_EQ(hash(as_cs, "abc"), hash(as_cs, "abc  "));
+  EXPECT_NE(hash(as_cs, "abc"), hash(as_cs, "Abc  "));
+
+  // Same with other types of spaces.
+  EXPECT_EQ(hash(ai_ci, "abc"), hash(ai_ci, u8"abc \u00a0"));
+
+  // Non-breaking space should compare _equal_ to space in ai_ci,
+  // but _unequal_ in as_cs.
+  EXPECT_EQ(hash(ai_ci, "abc "), hash(ai_ci, u8"abc\u00a0"));
+  EXPECT_NE(hash(as_cs, "abc "), hash(as_cs, u8"abc\u00a0"));
+  EXPECT_NE(hash(as_cs, "abc"), hash(as_cs, u8"abc\u00a0"));
+
+  // Also in the middle of the string.
+  EXPECT_EQ(hash(ai_ci, "a c"), hash(ai_ci, u8"a\u00a0c"));
+  EXPECT_NE(hash(as_cs, "a c"), hash(as_cs, u8"a\u00a0c"));
+
+  // Verify that space in the middle of the string isn't stripped.
+  EXPECT_NE(hash(ai_ci, "ab  c"), hash(ai_ci, "abc"));
+  EXPECT_NE(hash(as_cs, "ab  c"), hash(as_cs, "abc"));
+}
+
+}  // namespace strnxfrm_unittest