From 10a23ff2ea4bbc936562a2ce1da04f3231b50088 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jliu@nixprime.com>
Date: Tue, 5 May 2015 23:29:34 -0700
Subject: [PATCH] Use ICU for correct Unicode handling

---
 README.md       |  4 +++-
 setup.py        |  4 +++-
 src/matcher.cc  |  6 +++---
 src/str_util.cc | 22 ++++++++++++++++++++++
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 0122f5e..8cada05 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,9 @@ Requirements
 
 - A C++ compiler supporting C++11.
 
-- Boost headers (Ubuntu: package `libboost-all-dev`).
+- Boost (Ubuntu: package `libboost-all-dev`).
+
+- ICU (Ubuntu: package `libicu-dev`).
 
 Installation
 ------------
diff --git a/setup.py b/setup.py
index f723563..e68acb4 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,9 @@
         "src/python_extension_main.cc",
 ]
 
-cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"])
+cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"],
+                 define_macros=[("CPSM_CONFIG_ICU", "1")],  # gates ICU code
+                 libraries=["icudata", "icuuc"])  # link against ICU
 
 setup(name="cpsm", version="0.1", description="A path matcher.",
       ext_modules=[cpsm])
diff --git a/src/matcher.cc b/src/matcher.cc
index d2cce09..08f24f9 100644
--- a/src/matcher.cc
+++ b/src/matcher.cc
@@ -52,7 +52,7 @@ Matcher::Matcher(std::string query, MatcherOpts opts)
     query_parts_chars_.emplace_back(std::move(query_chars));
   }
   // Queries are smartcased (case-sensitive only if any uppercase appears in the
-  // query). Casing only applies to ASCII letters.
+  // query).
   is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase);
   cur_file_parts_= path_components_of(opts_.cur_file);
   // Keeping the filename in cur_file_parts_ causes the path distance metric to
@@ -111,8 +111,8 @@ bool Matcher::append_match(boost::string_ref const item,
     key_part_chars_.clear();
     decompose_utf8_string(key_part, key_part_chars_);
     if (!is_case_sensitive_) {
-      // The query must not contain any uppercase ASCII letters since otherwise
-      // the query would be case-sensitive.
+      // The query must not contain any uppercase letters since otherwise the
+      // query would be case-sensitive.
       for (char32_t& c : key_part_chars_) {
         if (is_uppercase(c)) {
           c = to_lowercase(c);
diff --git a/src/str_util.cc b/src/str_util.cc
index a1a08a7..36c1928 100644
--- a/src/str_util.cc
+++ b/src/str_util.cc
@@ -13,6 +13,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#if CPSM_CONFIG_ICU
+#include <unicode/uchar.h>
+#endif
+
 #include "str_util.h"
 
 namespace cpsm {
@@ -91,6 +95,22 @@ void decompose_utf8_string(boost::string_ref str,
   }
 }
 
+#if CPSM_CONFIG_ICU
+
+bool is_alphanumeric(char32_t const c) {  // True if ICU says c is POSIX alnum.
+  return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM);
+}
+
+bool is_uppercase(char32_t const c) {  // True if c has the Uppercase property.
+  return u_hasBinaryProperty(c, UCHAR_UPPERCASE);
+}
+
+char32_t to_lowercase(char32_t const c) {  // Lowercase of c per ICU u_tolower.
+  return u_tolower(c);
+}
+
+#else // CPSM_CONFIG_ICU
+
 bool is_alphanumeric(char32_t const c) {
   return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
          (c >= 'A' && c <= 'Z');
@@ -104,4 +124,6 @@ char32_t to_lowercase(char32_t c) {
   return c + ('a' - 'A');
 }
 
+#endif // CPSM_CONFIG_ICU
+
 } // namespace cpsm