Skip to content

Commit

Permalink
Move MatchPattern to its own header and the base namespace.
Browse files Browse the repository at this point in the history
BUG=

Review URL: https://codereview.chromium.org/1226673003

Cr-Commit-Position: refs/heads/master@{#337488}
  • Loading branch information
brettw authored and Commit bot committed Jul 6, 2015
1 parent 7d47f7a commit d97eede
Show file tree
Hide file tree
Showing 48 changed files with 482 additions and 396 deletions.
3 changes: 3 additions & 0 deletions base/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,8 @@ component("base") {
"strings/latin1_string_conversions.h",
"strings/nullable_string16.cc",
"strings/nullable_string16.h",
"strings/pattern.cc",
"strings/pattern.h",
"strings/safe_sprintf.cc",
"strings/safe_sprintf.h",
"strings/string16.cc",
Expand Down Expand Up @@ -1276,6 +1278,7 @@ test("base_unittests") {
"sha1_unittest.cc",
"stl_util_unittest.cc",
"strings/nullable_string16_unittest.cc",
"strings/pattern_unittest.cc",
"strings/safe_sprintf_unittest.cc",
"strings/string16_unittest.cc",
"strings/string_number_conversions_unittest.cc",
Expand Down
1 change: 1 addition & 0 deletions base/base.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@
'sha1_unittest.cc',
'stl_util_unittest.cc',
'strings/nullable_string16_unittest.cc',
'strings/pattern_unittest.cc',
'strings/safe_sprintf_unittest.cc',
'strings/string16_unittest.cc',
'strings/string_number_conversions_unittest.cc',
Expand Down
2 changes: 2 additions & 0 deletions base/base.gypi
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,8 @@
'strings/latin1_string_conversions.h',
'strings/nullable_string16.cc',
'strings/nullable_string16.h',
'strings/pattern.cc',
'strings/pattern.h',
'strings/safe_sprintf.cc',
'strings/safe_sprintf.h',
'strings/string16.cc',
Expand Down
169 changes: 169 additions & 0 deletions base/strings/pattern.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/pattern.h"

#include "base/third_party/icu/icu_utf.h"

namespace base {

namespace {

// Tells whether |character| is one of the two pattern metacharacters:
// '*' or '?'.
static bool IsWildcard(base_icu::UChar32 character) {
  switch (character) {
    case '*':
    case '?':
      return true;
    default:
      return false;
  }
}

// Move the strings pointers to the point where they start to differ.
template <typename CHAR, typename NEXT>
static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
const CHAR** string, const CHAR* string_end,
NEXT next) {
const CHAR* escape = NULL;
while (*pattern != pattern_end && *string != string_end) {
if (!escape && IsWildcard(**pattern)) {
// We don't want to match wildcard here, except if it's escaped.
return;
}

// Check if the escapement char is found. If so, skip it and move to the
// next character.
if (!escape && **pattern == '\\') {
escape = *pattern;
next(pattern, pattern_end);
continue;
}

// Check if the chars match, if so, increment the ptrs.
const CHAR* pattern_next = *pattern;
const CHAR* string_next = *string;
base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
if (pattern_char == next(&string_next, string_end) &&
pattern_char != CBU_SENTINEL) {
*pattern = pattern_next;
*string = string_next;
} else {
// Uh oh, it did not match, we are done. If the last char was an
// escapement, that means that it was an error to advance the ptr here,
// let's put it back where it was. This also mean that the MatchPattern
// function will return false because if we can't match an escape char
// here, then no one will.
if (escape) {
*pattern = escape;
}
return;
}

escape = NULL;
}
}

// Moves |*pattern| forward past a run of consecutive wildcard characters
// ('*' or '?'), stopping at the first non-wildcard or at |end|. |next| is
// used to step over each wildcard.
template <typename CHAR, typename NEXT>
static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
  while (*pattern != end && IsWildcard(**pattern))
    next(pattern, end);
}

// Recursive matcher shared by the UTF-8 and UTF-16 entry points.
//
// [eval, eval_end) is the string under test and [pattern, pattern_end) the
// pattern. |next| is a functor that decodes one character at the given
// position and advances the pointer past it. |depth| is the current
// recursion depth: one level is added for every '?' or '*' that has to be
// explored, and matching is abandoned (returning false) once the depth
// exceeds kMaxDepth, which bounds the work done for pathological patterns.
//
// Returns true only if the entire string matches the entire pattern.
template <typename CHAR, typename NEXT>
static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
                          const CHAR* pattern, const CHAR* pattern_end,
                          int depth,
                          NEXT next) {
  const int kMaxDepth = 16;
  if (depth > kMaxDepth)
    return false;

  // Eat all the matching chars.
  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);

  // If the string is empty, then the pattern must be empty too, or contain
  // only wildcards (both '*' and '?' may match nothing).
  if (eval == eval_end) {
    EatWildcard(&pattern, pattern_end, next);
    return pattern == pattern_end;
  }

  // Pattern is empty but not string, this is not a match.
  if (pattern == pattern_end)
    return false;

  const CHAR* next_pattern = pattern;
  next(&next_pattern, pattern_end);

  // A question mark matches zero or one character: compare the rest of the
  // pattern with the current string, then with the string minus one
  // character.
  if (pattern[0] == '?') {
    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                      depth + 1, next))
      return true;
    const CHAR* next_eval = eval;
    next(&next_eval, eval_end);
    return MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
                         depth + 1, next);
  }

  // A star matches any run of characters: try the remainder of the pattern
  // against every suffix of the string.
  if (pattern[0] == '*') {
    // Collapse duplicate wild cards (********** into *) so that the
    // method does not recurse unnecessarily. http://crbug.com/52839
    EatWildcard(&next_pattern, pattern_end, next);

    // Only wildcards were left, so whatever remains of the string matches.
    if (next_pattern == pattern_end)
      return true;

    while (eval != eval_end) {
      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
                        depth + 1, next))
        return true;
      // Step by one decoded character, not one code unit. A retry that
      // starts in the middle of a multi-unit sequence is wasted work in
      // UTF-8, and in UTF-16 it could match the trailing half of a
      // surrogate pair against an unpaired surrogate in the pattern.
      next(&eval, eval_end);
    }

    // The string ran out while the pattern still has a non-wildcard
    // character to match.
    return false;
  }

  // The pattern starts with a literal that EatSameChars could not match.
  return false;
}

// Character-stepping functor used when matching UTF-8 (char) strings.
struct NextCharUTF8 {
  // Decodes the character that starts at |*p| using CBU8_NEXT, with the
  // distance to |end| as the available length, advances |*p| past the code
  // units consumed and returns the decoded value.
  base_icu::UChar32 operator()(const char** p, const char* end) {
    int consumed = 0;
    base_icu::UChar32 code_point;
    CBU8_NEXT(*p, consumed, end - *p, code_point);
    *p += consumed;
    return code_point;
  }
};

// Character-stepping functor used when matching UTF-16 (char16) strings.
struct NextCharUTF16 {
  // Decodes the character that starts at |*p| using CBU16_NEXT, with the
  // distance to |end| as the available length, advances |*p| past the code
  // units consumed and returns the decoded value.
  base_icu::UChar32 operator()(const char16** p, const char16* end) {
    int consumed = 0;
    base_icu::UChar32 code_point;
    CBU16_NEXT(*p, consumed, end - *p, code_point);
    *p += consumed;
    return code_point;
  }
};

} // namespace

// Common driver for the public overloads: turns the two string pieces into
// [begin, end) pointer ranges and starts the recursive matcher at depth 0,
// using |next| to step through characters.
template <typename PIECE, typename NEXT>
static bool MatchPieces(const PIECE& eval, const PIECE& pattern, NEXT next) {
  return MatchPatternT(eval.data(), eval.data() + eval.size(),
                       pattern.data(), pattern.data() + pattern.size(),
                       0, next);
}

// 8-bit overload; characters are decoded with NextCharUTF8.
bool MatchPattern(const StringPiece& eval, const StringPiece& pattern) {
  return MatchPieces(eval, pattern, NextCharUTF8());
}

// 16-bit overload; characters are decoded with NextCharUTF16.
bool MatchPattern(const StringPiece16& eval, const StringPiece16& pattern) {
  return MatchPieces(eval, pattern, NextCharUTF16());
}

} // namespace base
26 changes: 26 additions & 0 deletions base/strings/pattern.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_STRINGS_PATTERN_H_
#define BASE_STRINGS_PATTERN_H_

#include "base/base_export.h"
#include "base/strings/string_piece.h"

namespace base {

// Returns true if the whole of |string| matches |pattern|. Comparison is
// done character by character with no case folding, so matching is
// case-sensitive.
//
// The pattern may contain two wildcards:
//   *  matches 0 or more characters.
//   ?  matches 0 or 1 character.
// The backslash character (\) is the escape character: the character that
// follows it is matched literally, which is how a real * or ? is matched.
//
// Wildcards are resolved recursively and the recursion depth is capped at
// 16, so a pattern that needs more nested wildcard decisions than that is
// reported as not matching. A run of wildcards that follows a * is collapsed
// and counts once.
//
// The StringPiece overload decodes both arguments as UTF-8 and the
// StringPiece16 overload as UTF-16, so ? stands for one decoded character
// rather than one code unit.
BASE_EXPORT bool MatchPattern(const StringPiece& string,
const StringPiece& pattern);
BASE_EXPORT bool MatchPattern(const StringPiece16& string,
const StringPiece16& pattern);

} // namespace base

#endif // BASE_STRINGS_PATTERN_H_
50 changes: 50 additions & 0 deletions base/strings/pattern_unittest.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/pattern.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {

// Exercises both MatchPattern overloads: basic wildcard handling, escaping,
// empty inputs, the recursion limit, and UTF-8 / UTF-16 decoding.
TEST(StringUtilTest, MatchPatternTest) {
EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
EXPECT_TRUE(MatchPattern("www.google.com", "*"));
EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
// Matching is case-sensitive.
EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));
// A backslash makes the following * a literal character.
EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));
// Wildcards may match nothing, but the literal '.' still needs a character.
EXPECT_FALSE(MatchPattern("", "*.*"));
EXPECT_TRUE(MatchPattern("", "*"));
// ? matches zero or one character, so it also matches the empty string.
EXPECT_TRUE(MatchPattern("", "?"));
EXPECT_TRUE(MatchPattern("", ""));
EXPECT_FALSE(MatchPattern("Hello", ""));
// An unescaped * in the pattern can also match a literal * in the string.
EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
// Stop after a certain recursion depth.
EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*"));

// Test UTF8 matching.
EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
// ? consumes a whole multi-byte character, not a single byte.
EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
// Invalid sequences should be handled as a single invalid character.
EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
// If the pattern has invalid characters, it shouldn't match anything.
EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));

// Test UTF16 character matching.
EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
UTF8ToUTF16("*.com")));
EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
UTF8ToUTF16("He??o\\*1*")));

// This test verifies that consecutive wild cards are collapsed into 1
// wildcard (when this doesn't occur, MatchPattern reaches its maximum
// recursion depth).
EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello"),
UTF8ToUTF16("He********************************o")));
}

} // namespace base
Loading

0 comments on commit d97eede

Please sign in to comment.