Skip to content

Commit

Permalink
Use ICU for correct Unicode handling
Browse files Browse the repository at this point in the history
  • Loading branch information
nixprime committed May 6, 2015
1 parent 5a6c130 commit 10a23ff
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 5 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ Requirements

- A C++ compiler supporting C++11.

- Boost headers (Ubuntu: package `libboost-all-dev`).
- Boost (Ubuntu: package `libboost-all-dev`).

- ICU (Ubuntu: package `libicu-dev`).

Installation
------------
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
"src/python_extension_main.cc",
]

cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"])
cpsm = Extension("cpsm", sources=srcs, extra_compile_args=["-std=c++11"],
define_macros=[("CPSM_CONFIG_ICU", "1")],
libraries=["icudata", "icuuc"])

setup(name="cpsm", version="0.1", description="A path matcher.",
ext_modules=[cpsm])
6 changes: 3 additions & 3 deletions src/matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ Matcher::Matcher(std::string query, MatcherOpts opts)
query_parts_chars_.emplace_back(std::move(query_chars));
}
// Queries are smartcased (case-sensitive only if any uppercase appears in the
// query). Casing only applies to ASCII letters.
// query).
is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase);
cur_file_parts_= path_components_of(opts_.cur_file);
// Keeping the filename in cur_file_parts_ causes the path distance metric to
Expand Down Expand Up @@ -111,8 +111,8 @@ bool Matcher::append_match(boost::string_ref const item,
key_part_chars_.clear();
decompose_utf8_string(key_part, key_part_chars_);
if (!is_case_sensitive_) {
// The query must not contain any uppercase ASCII letters since otherwise
// the query would be case-sensitive.
// The query must not contain any uppercase letters since otherwise the
// query would be case-sensitive.
for (char32_t& c : key_part_chars_) {
if (is_uppercase(c)) {
c = to_lowercase(c);
Expand Down
22 changes: 22 additions & 0 deletions src/str_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if CPSM_CONFIG_ICU
#include <unicode/uchar.h>
#endif

#include "str_util.h"

namespace cpsm {
Expand Down Expand Up @@ -91,6 +95,22 @@ void decompose_utf8_string(boost::string_ref str,
}
}

#if CPSM_CONFIG_ICU

// Reports whether the code point `c` carries ICU's UCHAR_POSIX_ALNUM binary
// property. This is the ICU-backed variant, compiled when CPSM_CONFIG_ICU is
// set.
bool is_alphanumeric(char32_t const c) {
  auto const has_property = u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM);
  // Collapse ICU's boolean-like result into a plain C++ bool.
  return has_property ? true : false;
}

// Reports whether the code point `c` carries ICU's UCHAR_UPPERCASE binary
// property. This is the ICU-backed variant, compiled when CPSM_CONFIG_ICU is
// set.
bool is_uppercase(char32_t const c) {
  if (u_hasBinaryProperty(c, UCHAR_UPPERCASE)) {
    return true;
  }
  return false;
}

// Maps the code point `c` through ICU's u_tolower and returns the result as a
// char32_t. This is the ICU-backed variant, compiled when CPSM_CONFIG_ICU is
// set.
char32_t to_lowercase(char32_t const c) {
  auto const lowered = u_tolower(c);
  // Make the conversion from ICU's return type back to char32_t explicit.
  return static_cast<char32_t>(lowered);
}

#else // CPSM_CONFIG_ICU

bool is_alphanumeric(char32_t const c) {
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z');
Expand All @@ -104,4 +124,6 @@ char32_t to_lowercase(char32_t c) {
return c + ('a' - 'A');
}

#endif // CPSM_CONFIG_ICU

} // namespace cpsm

0 comments on commit 10a23ff

Please sign in to comment.