From 10a23ff2ea4bbc936562a2ce1da04f3231b50088 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 5 May 2015 23:29:34 -0700 Subject: [PATCH] Use ICU for correct Unicode handling --- README.md | 4 +++- setup.py | 4 +++- src/matcher.cc | 6 +++--- src/str_util.cc | 22 ++++++++++++++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0122f5e..8cada05 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,9 @@ Requirements - A C++ compiler supporting C++11. -- Boost headers (Ubuntu: package `libboost-all-dev`). +- Boost (Ubuntu: package `libboost-all-dev`). + +- ICU (Ubuntu: package `libicu-dev`). Installation ------------ diff --git a/setup.py b/setup.py index f723563..e68acb4 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,9 @@ "src/python_extension_main.cc", ] -cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"]) +cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"], + define_macros=[("CPSM_CONFIG_ICU", "1")], + libraries=["icudata", "icuuc"]) setup(name="cpsm", version="0.1", description="A path matcher.", ext_modules=[cpsm]) diff --git a/src/matcher.cc b/src/matcher.cc index d2cce09..08f24f9 100644 --- a/src/matcher.cc +++ b/src/matcher.cc @@ -52,7 +52,7 @@ Matcher::Matcher(std::string query, MatcherOpts opts) query_parts_chars_.emplace_back(std::move(query_chars)); } // Queries are smartcased (case-sensitive only if any uppercase appears in the - // query). Casing only applies to ASCII letters. + // query). 
is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase); cur_file_parts_= path_components_of(opts_.cur_file); // Keeping the filename in cur_file_parts_ causes the path distance metric to @@ -111,8 +111,8 @@ bool Matcher::append_match(boost::string_ref const item, key_part_chars_.clear(); decompose_utf8_string(key_part, key_part_chars_); if (!is_case_sensitive_) { - // The query must not contain any uppercase ASCII letters since otherwise - // the query would be case-sensitive. + // The query must not contain any uppercase letters since otherwise the + // query would be case-sensitive. for (char32_t& c : key_part_chars_) { if (is_uppercase(c)) { c = to_lowercase(c); diff --git a/src/str_util.cc b/src/str_util.cc index a1a08a7..36c1928 100644 --- a/src/str_util.cc +++ b/src/str_util.cc @@ -13,6 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#if CPSM_CONFIG_ICU +#include <unicode/uchar.h> +#endif + #include "str_util.h" namespace cpsm { @@ -91,6 +95,22 @@ void decompose_utf8_string(boost::string_ref str, } } +#if CPSM_CONFIG_ICU + +bool is_alphanumeric(char32_t const c) { + return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM); +} + +bool is_uppercase(char32_t const c) { + return u_hasBinaryProperty(c, UCHAR_UPPERCASE); +} + +char32_t to_lowercase(char32_t const c) { + return u_tolower(c); +} + +#else // CPSM_CONFIG_ICU + bool is_alphanumeric(char32_t const c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); @@ -104,4 +124,6 @@ char32_t to_lowercase(char32_t c) { return c + ('a' - 'A'); } +#endif // CPSM_CONFIG_ICU + } // namespace cpsm