diff --git a/README.md b/README.md index 8cada05..8a1e260 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,25 @@ Installation to your `vimrc`. +Options +------- + +- By default, cpsm will automatically detect the number of matcher threads + based on the available hardware concurrency. To limit the number of threads + that cpsm can use, add + + let g:cpsm_max_threads = (maximum number of threads) + + to your .vimrc. + +- To enable Unicode support, add + + let g:cpsm_unicode = 1 + + to your .vimrc. Unicode support is currently very limited, and consists + mostly of parsing input strings as UTF-8 and handling the case of non-ASCII + letters correctly. + Algorithm --------- diff --git a/autoload/cpsm.py b/autoload/cpsm.py index 8eae156..acf3549 100644 --- a/autoload/cpsm.py +++ b/autoload/cpsm.py @@ -28,7 +28,9 @@ def ctrlp_match(): limit=int(vim.eval("a:limit")), mmode=vim.eval("a:mmode"), ispath=int(vim.eval("a:ispath")), - crfile=vim.eval("a:crfile")) + crfile=vim.eval("a:crfile"), + max_threads=int(vim.eval("g:cpsm_max_threads")), + unicode=int(vim.eval("g:cpsm_unicode"))) # Escape backslashes and ". vim.command("let s:results = [%s]" % ",".join( '"%s"' % r.replace("\\", "\\\\").replace('"', '\\"') diff --git a/autoload/cpsm.vim b/autoload/cpsm.vim index 90e3cd7..cf00889 100644 --- a/autoload/cpsm.vim +++ b/autoload/cpsm.vim @@ -21,3 +21,7 @@ function cpsm#CtrlPMatch(items, str, limit, mmode, ispath, crfile, regex) py ctrlp_match() return s:results endfunction + +" Default settings +let g:cpsm_max_threads = 0 +let g:cpsm_unicode = 0 diff --git a/python/bench_cpsm.py b/python/bench_cpsm.py index a9a857b..a7fcb71 100755 --- a/python/bench_cpsm.py +++ b/python/bench_cpsm.py @@ -36,7 +36,7 @@ for _ in xrange(args.iterations): start = linuxclock.monotonic() results = cpsm.ctrlp_match(bench.ITEMS, query, limit=bench.LIMIT, - ispath=True, nr_threads=args.threads) + ispath=True, max_threads=args.threads) finish = linuxclock.monotonic() times.append(finish - start) print("Query '%s': avg time %fs, results: [%s]" % ( diff --git a/src/matcher.cc b/src/matcher.cc index f40f528..04bed77 100644 --- a/src/matcher.cc +++ b/src/matcher.cc @@ -27,9 +27,10 @@ namespace cpsm { -Matcher::Matcher(boost::string_ref const query, MatcherOpts opts) - : opts_(std::move(opts)) { - decompose_utf8_string(query, query_); +Matcher::Matcher(boost::string_ref const query, MatcherOpts opts, + StringHandler strings) + : opts_(std::move(opts)), strings_(std::move(strings)) { + strings_.decompose(query, query_); if (opts_.is_path) { // Store the index of the first character after the rightmost path // separator in the query. (Store an index rather than an iterator to keep @@ -56,24 +57,22 @@ Matcher::Matcher(boost::string_ref const query, MatcherOpts opts) // Queries are smartcased (case-sensitive only if any uppercase appears in the // query). - is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase); + is_case_sensitive_ = + std::any_of(query_.begin(), query_.end(), + [&](char32_t const c) { return strings_.is_uppercase(c); }); cur_file_parts_ = path_components_of(opts_.cur_file); } bool Matcher::match_base(boost::string_ref const item, MatchBase& m, - std::vector* key_chars, - std::vector* temp_chars) const { + std::vector* const buf, + std::vector* const buf2) const { m = MatchBase(); std::vector key_chars_local; - if (!key_chars) { - key_chars = &key_chars_local; - } + std::vector& key_chars = buf ? *buf : key_chars_local; std::vector temp_chars_local; - if (!temp_chars) { - temp_chars = &temp_chars_local; - } + std::vector& temp_chars = buf2 ? *buf2 : temp_chars_local; std::vector item_parts; if (opts_.is_path) { @@ -103,15 +102,15 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m, break; } - std::vector* const item_part_chars = + std::vector& item_part_chars = part_index ? temp_chars : key_chars; - item_part_chars->clear(); - decompose_utf8_string(item_part, *item_part_chars); + item_part_chars.clear(); + strings_.decompose(item_part, item_part_chars); // Since path components are matched right-to-left, query characters must be // consumed greedily right-to-left. auto query_prev = query_it; - for (char32_t const c : boost::adaptors::reverse(*item_part_chars)) { + for (char32_t const c : boost::adaptors::reverse(item_part_chars)) { if (match_char(c, *query_it)) { ++query_it; if (query_it == query_end) { @@ -145,7 +144,7 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m, // Now do more refined matching on the key (the rightmost path component of // the item for a path match, and just the full item otherwise). - match_key(*key_chars, query_key_begin, m); + match_key(key_chars, query_key_begin, m); return true; } @@ -175,10 +174,11 @@ void Matcher::match_key(std::vector const& key, if (i == 0) { return true; } - if (is_alphanumeric(key[i]) && !is_alphanumeric(key[i - 1])) { + if (strings_.is_alphanumeric(key[i]) && + !strings_.is_alphanumeric(key[i - 1])) { return true; } - if (is_uppercase(key[i]) && !is_uppercase(key[i - 1])) { + if (strings_.is_uppercase(key[i]) && !strings_.is_uppercase(key[i - 1])) { return true; } return false; @@ -213,7 +213,7 @@ void Matcher::match_key(std::vector const& key, at_word_start = true; word_matched = false; } - if (pass == 0 && is_alphanumeric(*query_key) && !at_word_start) { + if (pass == 0 && strings_.is_alphanumeric(*query_key) && !at_word_start) { is_full_prefix = false; continue; } @@ -249,8 +249,8 @@ bool Matcher::match_char(char32_t item, char32_t const query) const { // The query must not contain any uppercase letters since otherwise the // query would be case-sensitive, so just force all uppercase characters to // lowercase. - if (is_uppercase(item)) { - item = to_lowercase(item); + if (strings_.is_uppercase(item)) { + item = strings_.to_lowercase(item); } } return item == query; diff --git a/src/matcher.h b/src/matcher.h index a6b8f34..a38219d 100644 --- a/src/matcher.h +++ b/src/matcher.h @@ -45,9 +45,10 @@ struct MatcherOpts { QueryPathMode query_path_mode = QueryPathMode::AUTO; }; -class Matcher { +class Matcher final { public: - explicit Matcher(boost::string_ref query, MatcherOpts opts = MatcherOpts()); + explicit Matcher(boost::string_ref query, MatcherOpts opts = MatcherOpts(), + StringHandler strings = StringHandler()); // If the query represented by this matcher matches the given item, fills the // given match object with information about the match and returns true. @@ -77,9 +78,10 @@ class Matcher { bool match_char(char32_t item, char32_t query) const; + MatcherOpts opts_; + StringHandler strings_; std::vector query_; std::size_t query_key_begin_index_; - MatcherOpts opts_; bool is_case_sensitive_; bool require_full_part_; std::vector cur_file_parts_; diff --git a/src/python_extension_main.cc b/src/python_extension_main.cc index 8f45eae..777af24 100644 --- a/src/python_extension_main.cc +++ b/src/python_extension_main.cc @@ -86,7 +86,8 @@ extern "C" { static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args, PyObject* kwargs) { static char const* kwlist[] = {"items", "query", "limit", "mmode", - "ispath", "crfile", "nr_threads", nullptr}; + "ispath", "crfile", "max_threads", "unicode", + nullptr}; PyObject* items_obj; char const* query_data; Py_ssize_t query_size; @@ -96,11 +97,13 @@ static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args, int is_path = 0; char const* cur_file_data = nullptr; Py_ssize_t cur_file_size = 0; - int nr_threads_int = 0; + int max_threads_int = 0; + int unicode = 0; if (!PyArg_ParseTupleAndKeywords( - args, kwargs, "Os#|is#is#i", const_cast(kwlist), &items_obj, + args, kwargs, "Os#|is#is#ii", const_cast(kwlist), &items_obj, &query_data, &query_size, &limit_int, &mmode_data, &mmode_size, - &is_path, &cur_file_data, &cur_file_size, &nr_threads_int)) { + &is_path, &cur_file_data, &cur_file_size, &max_threads_int, + &unicode)) { return nullptr; } @@ -113,12 +116,15 @@ static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args, cpsm::MatcherOpts mopts; mopts.cur_file = std::string(cur_file_data, cur_file_size); mopts.is_path = is_path; - cpsm::Matcher matcher(std::move(query), std::move(mopts)); + cpsm::StringHandlerOpts sopts; + sopts.unicode = unicode; + cpsm::Matcher matcher(std::move(query), std::move(mopts), + cpsm::StringHandler(sopts)); auto item_substr_fn = cpsm::match_mode_item_substr_fn( boost::string_ref(mmode_data, mmode_size)); std::size_t const limit = (limit_int >= 0) ? std::size_t(limit_int) : 0; unsigned int const max_threads = - (nr_threads_int >= 0) ? static_cast(nr_threads_int) : 0; + (max_threads_int >= 0) ? static_cast(max_threads_int) : 0; unsigned int nr_threads; std::size_t items_per_batch; diff --git a/src/str_util.cc b/src/str_util.cc index 1be8598..261ad3c 100644 --- a/src/str_util.cc +++ b/src/str_util.cc @@ -13,14 +13,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "str_util.h" + #if CPSM_CONFIG_ICU #include #endif -#include "str_util.h" +#include namespace cpsm { +namespace { + void decompose_utf8_string(boost::string_ref str, std::vector& chars) { // Even though most of this function deals with byte-sized quantities, use @@ -94,29 +98,54 @@ void decompose_utf8_string(boost::string_ref str, } } -#if CPSM_CONFIG_ICU +} // namespace -bool is_alphanumeric(char32_t const c) { - return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM); +StringHandler::StringHandler(StringHandlerOpts opts) : opts_(std::move(opts)) { +#if !CPSM_CONFIG_ICU + if (opts_.unicode) { + throw Error("cpsm built without Unicode support"); + } +#endif } -bool is_uppercase(char32_t const c) { - return u_hasBinaryProperty(c, UCHAR_UPPERCASE); +void StringHandler::decompose(boost::string_ref const str, + std::vector& chars) const { + if (opts_.unicode) { + decompose_utf8_string(str, chars); + } else { + chars.reserve(str.size()); + for (char const c : str) { + chars.push_back(c); + } + } } -char32_t to_lowercase(char32_t const c) { return u_tolower(c); } - -#else // CPSM_CONFIG_ICU - -bool is_alphanumeric(char32_t const c) { +bool StringHandler::is_alphanumeric(char32_t const c) const { +#if CPSM_CONFIG_ICU + if (opts_.unicode) { + return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM); + } +#endif return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } -bool is_uppercase(char32_t const c) { return c >= 'A' && c <= 'Z'; } - -char32_t to_lowercase(char32_t const c) { return c + ('a' - 'A'); } +bool StringHandler::is_uppercase(char32_t const c) const { +#if CPSM_CONFIG_ICU + if (opts_.unicode) { + return u_hasBinaryProperty(c, UCHAR_UPPERCASE); + } +#endif + return c >= 'A' && c <= 'Z'; +} -#endif // CPSM_CONFIG_ICU +char32_t StringHandler::to_lowercase(char32_t const c) const { +#if CPSM_CONFIG_ICU + if (opts_.unicode) { + return u_tolower(c); + } +#endif + return c + ('a' - 'A'); +} } // namespace cpsm diff --git a/src/str_util.h b/src/str_util.h index 7b5ae88..680d42e 100644 --- a/src/str_util.h +++ b/src/str_util.h @@ -76,20 +76,35 @@ inline std::string copy_string_ref(boost::string_ref const sref) { return std::string(sref.data(), sref.size()); } -// Splits a UTF-8-encoded string into code points and append them to the given -// vector. If the string is not a valid UTF-8 encoded string, invalid bytes are -// are replaced by the invalid code point 0xdc00+(byte). (This is so that a -// match can still be attempted.) -void decompose_utf8_string(boost::string_ref str, std::vector& chars); +struct StringHandlerOpts { + bool unicode = false; +}; + +class StringHandler final { + public: + explicit StringHandler(StringHandlerOpts opts = StringHandlerOpts()); + + // If opts.unicode is false, appends each byte in the given string to the + // given vector. + // + // If opts.unicode is true, attempts to parse the given string as UTF-8, + // appending each code point in the string to the given vector. Non-UTF-8 + // bytes are appended to the given vector as the invalid code point + // 0xdc00+(byte) so that a match can still be attempted. + void decompose(boost::string_ref str, std::vector& chars) const; -// Returns true if the given code point represents a letter or number. -bool is_alphanumeric(char32_t c); + // Returns true if the given code point represents a letter or number. + bool is_alphanumeric(char32_t c) const; -// Returns true if the given code point represents a uppercase letter. -bool is_uppercase(char32_t c); + // Returns true if the given code point represents an uppercase letter. + bool is_uppercase(char32_t c) const; -// Returns the lowercased version of c. c must be an uppercase letter. -char32_t to_lowercase(char32_t c); + // Returns the lowercase version of c. c must be an uppercase letter. + char32_t to_lowercase(char32_t c) const; + + private: + StringHandlerOpts opts_; +}; } // namespace cpsm