Move MatchPattern to its own header and the base namespace.

BUG= Review URL: https://codereview.chromium.org/1226673003 Cr-Commit-Position: refs/heads/master@{#337488}
hylschwy · Jul 6, 2015 · d97eede · d97eede
1 parent 7d47f7a
commit d97eede
Show file tree

Hide file tree

Showing 48 changed files with 482 additions and 396 deletions.
diff --git a/base/BUILD.gn b/base/BUILD.gn
@@ -425,6 +425,8 @@ component("base") {
     "strings/latin1_string_conversions.h",
     "strings/nullable_string16.cc",
     "strings/nullable_string16.h",
+    "strings/pattern.cc",
+    "strings/pattern.h",
     "strings/safe_sprintf.cc",
     "strings/safe_sprintf.h",
     "strings/string16.cc",
@@ -1276,6 +1278,7 @@ test("base_unittests") {
     "sha1_unittest.cc",
     "stl_util_unittest.cc",
     "strings/nullable_string16_unittest.cc",
+    "strings/pattern_unittest.cc",
     "strings/safe_sprintf_unittest.cc",
     "strings/string16_unittest.cc",
     "strings/string_number_conversions_unittest.cc",

diff --git a/base/base.gyp b/base/base.gyp
@@ -580,6 +580,7 @@
         'sha1_unittest.cc',
         'stl_util_unittest.cc',
         'strings/nullable_string16_unittest.cc',
+        'strings/pattern_unittest.cc',
         'strings/safe_sprintf_unittest.cc',
         'strings/string16_unittest.cc',
         'strings/string_number_conversions_unittest.cc',

diff --git a/base/base.gypi b/base/base.gypi
@@ -544,6 +544,8 @@
           'strings/latin1_string_conversions.h',
           'strings/nullable_string16.cc',
           'strings/nullable_string16.h',
+          'strings/pattern.cc',
+          'strings/pattern.h',
           'strings/safe_sprintf.cc',
           'strings/safe_sprintf.h',
           'strings/string16.cc',

diff --git a/base/strings/pattern.cc b/base/strings/pattern.cc
@@ -0,0 +1,169 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/pattern.h"
+
+#include "base/third_party/icu/icu_utf.h"
+
+namespace base {
+
+namespace {
+
+static bool IsWildcard(base_icu::UChar32 character) {
+  return character == '*' || character == '?';
+}
+
+// Move the strings pointers to the point where they start to differ.
+template <typename CHAR, typename NEXT>
+static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
+                         const CHAR** string, const CHAR* string_end,
+                         NEXT next) {
+  const CHAR* escape = NULL;
+  while (*pattern != pattern_end && *string != string_end) {
+    if (!escape && IsWildcard(**pattern)) {
+      // We don't want to match wildcard here, except if it's escaped.
+      return;
+    }
+
+    // Check if the escapement char is found. If so, skip it and move to the
+    // next character.
+    if (!escape && **pattern == '\\') {
+      escape = *pattern;
+      next(pattern, pattern_end);
+      continue;
+    }
+
+    // Check if the chars match, if so, increment the ptrs.
+    const CHAR* pattern_next = *pattern;
+    const CHAR* string_next = *string;
+    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
+    if (pattern_char == next(&string_next, string_end) &&
+        pattern_char != CBU_SENTINEL) {
+      *pattern = pattern_next;
+      *string = string_next;
+    } else {
+      // Uh oh, it did not match, we are done. If the last char was an
+      // escapement, that means that it was an error to advance the ptr here,
+      // let's put it back where it was. This also mean that the MatchPattern
+      // function will return false because if we can't match an escape char
+      // here, then no one will.
+      if (escape) {
+        *pattern = escape;
+      }
+      return;
+    }
+
+    escape = NULL;
+  }
+}
+
+template <typename CHAR, typename NEXT>
+static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
+  while (*pattern != end) {
+    if (!IsWildcard(**pattern))
+      return;
+    next(pattern, end);
+  }
+}
+
+template <typename CHAR, typename NEXT>
+static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
+                          const CHAR* pattern, const CHAR* pattern_end,
+                          int depth,
+                          NEXT next) {
+  const int kMaxDepth = 16;
+  if (depth > kMaxDepth)
+    return false;
+
+  // Eat all the matching chars.
+  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
+
+  // If the string is empty, then the pattern must be empty too, or contains
+  // only wildcards.
+  if (eval == eval_end) {
+    EatWildcard(&pattern, pattern_end, next);
+    return pattern == pattern_end;
+  }
+
+  // Pattern is empty but not string, this is not a match.
+  if (pattern == pattern_end)
+    return false;
+
+  // If this is a question mark, then we need to compare the rest with
+  // the current string or the string with one character eaten.
+  const CHAR* next_pattern = pattern;
+  next(&next_pattern, pattern_end);
+  if (pattern[0] == '?') {
+    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
+      return true;
+    const CHAR* next_eval = eval;
+    next(&next_eval, eval_end);
+    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
+      return true;
+  }
+
+  // This is a *, try to match all the possible substrings with the remainder
+  // of the pattern.
+  if (pattern[0] == '*') {
+    // Collapse duplicate wild cards (********** into *) so that the
+    // method does not recurse unnecessarily. http://crbug.com/52839
+    EatWildcard(&next_pattern, pattern_end, next);
+
+    while (eval != eval_end) {
+      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                        depth + 1, next))
+        return true;
+      eval++;
+    }
+
+    // We reached the end of the string, let see if the pattern contains only
+    // wildcards.
+    if (eval == eval_end) {
+      EatWildcard(&pattern, pattern_end, next);
+      if (pattern != pattern_end)
+        return false;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+struct NextCharUTF8 {
+  base_icu::UChar32 operator()(const char** p, const char* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU8_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+struct NextCharUTF16 {
+  base_icu::UChar32 operator()(const char16** p, const char16* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU16_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+}  // namespace
+
+bool MatchPattern(const StringPiece& eval, const StringPiece& pattern) {
+  return MatchPatternT(eval.data(), eval.data() + eval.size(),
+                       pattern.data(), pattern.data() + pattern.size(),
+                       0, NextCharUTF8());
+}
+
+bool MatchPattern(const StringPiece16& eval, const StringPiece16& pattern) {
+  return MatchPatternT(eval.data(), eval.data() + eval.size(),
+                       pattern.data(), pattern.data() + pattern.size(),
+                       0, NextCharUTF16());
+}
+
+}  // namespace base
diff --git a/base/strings/pattern.h b/base/strings/pattern.h
@@ -0,0 +1,26 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_STRINGS_PATTERN_H_
+#define BASE_STRINGS_PATTERN_H_
+
+#include "base/base_export.h"
+#include "base/strings/string_piece.h"
+
+namespace base {
+
+// Returns true if the string passed in matches the pattern. The pattern
+// string can contain wildcards like * and ?
+//
+// The backslash character (\) is an escape character for * and ?
+// We limit the patterns to having a max of 16 * or ? characters.
+// ? matches 0 or 1 character, while * matches 0 or more characters.
+BASE_EXPORT bool MatchPattern(const StringPiece& string,
+                              const StringPiece& pattern);
+BASE_EXPORT bool MatchPattern(const StringPiece16& string,
+                              const StringPiece16& pattern);
+
+}  // namespace base
+
+#endif  // BASE_STRINGS_PATTERN_H_
diff --git a/base/strings/pattern_unittest.cc b/base/strings/pattern_unittest.cc
@@ -0,0 +1,50 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/strings/pattern.h"
+#include "base/strings/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace base {
+
+TEST(StringUtilTest, MatchPatternTest) {
+  EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
+  EXPECT_TRUE(MatchPattern("www.google.com", "*"));
+  EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
+  EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
+  EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
+  EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));
+  EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));
+  EXPECT_FALSE(MatchPattern("", "*.*"));
+  EXPECT_TRUE(MatchPattern("", "*"));
+  EXPECT_TRUE(MatchPattern("", "?"));
+  EXPECT_TRUE(MatchPattern("", ""));
+  EXPECT_FALSE(MatchPattern("Hello", ""));
+  EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
+  // Stop after a certain recursion depth.
+  EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*"));
+
+  // Test UTF8 matching.
+  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
+  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
+  EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
+  // Invalid sequences should be handled as a single invalid character.
+  EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
+  // If the pattern has invalid characters, it shouldn't match anything.
+  EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));
+
+  // Test UTF16 character matching.
+  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
+                           UTF8ToUTF16("*.com")));
+  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
+                           UTF8ToUTF16("He??o\\*1*")));
+
+  // This test verifies that consecutive wild cards are collapsed into 1
+  // wildcard (when this doesn't occur, MatchPattern reaches it's maximum
+  // recursion depth).
+  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello"),
+                           UTF8ToUTF16("He********************************o")));
+}
+
+}  // namespace base