Skip to content

Commit

Permalink
Improve configurability
Browse files Browse the repository at this point in the history
- Add g:cpsm_max_threads and g:cpsm_unicode options.

- CtrlP is mostly used for finding files in source code; source code
  filenames almost always consist only of characters that are 7-bit
  clean. Disable Unicode support by default for performance.
  • Loading branch information
nixprime committed Jun 2, 2015
1 parent 2abec32 commit ad8c3c9
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 59 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,25 @@ Installation

to your `vimrc`.

Options
-------

- By default, cpsm will automatically detect the number of matcher threads
based on the available hardware concurrency. To limit the number of threads
that cpsm can use, add

let g:cpsm_max_threads = (maximum number of threads)

to your .vimrc.

- To enable Unicode support, add

let g:cpsm_unicode = 1

to your .vimrc. Unicode support is currently very limited, and consists
mostly of parsing input strings as UTF-8 and handling the case of non-ASCII
letters correctly.

Algorithm
---------

Expand Down
4 changes: 3 additions & 1 deletion autoload/cpsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def ctrlp_match():
limit=int(vim.eval("a:limit")),
mmode=vim.eval("a:mmode"),
ispath=int(vim.eval("a:ispath")),
crfile=vim.eval("a:crfile"))
crfile=vim.eval("a:crfile"),
max_threads=int(vim.eval("g:cpsm_max_threads")),
unicode=int(vim.eval("g:cpsm_unicode")))
# Escape backslashes and ".
vim.command("let s:results = [%s]" % ",".join(
'"%s"' % r.replace("\\", "\\\\").replace('"', '\\"')
Expand Down
4 changes: 4 additions & 0 deletions autoload/cpsm.vim
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,7 @@ function cpsm#CtrlPMatch(items, str, limit, mmode, ispath, crfile, regex)
py ctrlp_match()
return s:results
endfunction

" Default settings
" Only install a default when the user has not already set the option (for
" example in their vimrc). This autoload script is sourced on first use of a
" cpsm# function, i.e. after the vimrc has run, so an unconditional `let`
" would silently overwrite the user's configuration.
if !exists('g:cpsm_max_threads')
  " 0 means no user-imposed limit on the number of matcher threads.
  let g:cpsm_max_threads = 0
endif
if !exists('g:cpsm_unicode')
  " Unicode support is off unless explicitly enabled.
  let g:cpsm_unicode = 0
endif
2 changes: 1 addition & 1 deletion python/bench_cpsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
for _ in xrange(args.iterations):
start = linuxclock.monotonic()
results = cpsm.ctrlp_match(bench.ITEMS, query, limit=bench.LIMIT,
ispath=True, nr_threads=args.threads)
ispath=True, max_threads=args.threads)
finish = linuxclock.monotonic()
times.append(finish - start)
print("Query '%s': avg time %fs, results: [%s]" % (
Expand Down
44 changes: 22 additions & 22 deletions src/matcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,10 @@

namespace cpsm {

Matcher::Matcher(boost::string_ref const query, MatcherOpts opts)
: opts_(std::move(opts)) {
decompose_utf8_string(query, query_);
Matcher::Matcher(boost::string_ref const query, MatcherOpts opts,
StringHandler strings)
: opts_(std::move(opts)), strings_(std::move(strings)) {
strings_.decompose(query, query_);
if (opts_.is_path) {
// Store the index of the first character after the rightmost path
// separator in the query. (Store an index rather than an iterator to keep
Expand All @@ -56,24 +57,22 @@ Matcher::Matcher(boost::string_ref const query, MatcherOpts opts)

// Queries are smartcased (case-sensitive only if any uppercase appears in the
// query).
is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase);
is_case_sensitive_ =
std::any_of(query_.begin(), query_.end(),
[&](char32_t const c) { return strings_.is_uppercase(c); });

cur_file_parts_ = path_components_of(opts_.cur_file);
}

bool Matcher::match_base(boost::string_ref const item, MatchBase& m,
std::vector<char32_t>* key_chars,
std::vector<char32_t>* temp_chars) const {
std::vector<char32_t>* const buf,
std::vector<char32_t>* const buf2) const {
m = MatchBase();

std::vector<char32_t> key_chars_local;
if (!key_chars) {
key_chars = &key_chars_local;
}
std::vector<char32_t>& key_chars = buf ? *buf : key_chars_local;
std::vector<char32_t> temp_chars_local;
if (!temp_chars) {
temp_chars = &temp_chars_local;
}
std::vector<char32_t>& temp_chars = buf2 ? *buf2 : temp_chars_local;

std::vector<boost::string_ref> item_parts;
if (opts_.is_path) {
Expand Down Expand Up @@ -103,15 +102,15 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m,
break;
}

std::vector<char32_t>* const item_part_chars =
std::vector<char32_t>& item_part_chars =
part_index ? temp_chars : key_chars;
item_part_chars->clear();
decompose_utf8_string(item_part, *item_part_chars);
item_part_chars.clear();
strings_.decompose(item_part, item_part_chars);

// Since path components are matched right-to-left, query characters must be
// consumed greedily right-to-left.
auto query_prev = query_it;
for (char32_t const c : boost::adaptors::reverse(*item_part_chars)) {
for (char32_t const c : boost::adaptors::reverse(item_part_chars)) {
if (match_char(c, *query_it)) {
++query_it;
if (query_it == query_end) {
Expand Down Expand Up @@ -145,7 +144,7 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m,

// Now do more refined matching on the key (the rightmost path component of
// the item for a path match, and just the full item otherwise).
match_key(*key_chars, query_key_begin, m);
match_key(key_chars, query_key_begin, m);
return true;
}

Expand Down Expand Up @@ -175,10 +174,11 @@ void Matcher::match_key(std::vector<char32_t> const& key,
if (i == 0) {
return true;
}
if (is_alphanumeric(key[i]) && !is_alphanumeric(key[i - 1])) {
if (strings_.is_alphanumeric(key[i]) &&
!strings_.is_alphanumeric(key[i - 1])) {
return true;
}
if (is_uppercase(key[i]) && !is_uppercase(key[i - 1])) {
if (strings_.is_uppercase(key[i]) && !strings_.is_uppercase(key[i - 1])) {
return true;
}
return false;
Expand Down Expand Up @@ -213,7 +213,7 @@ void Matcher::match_key(std::vector<char32_t> const& key,
at_word_start = true;
word_matched = false;
}
if (pass == 0 && is_alphanumeric(*query_key) && !at_word_start) {
if (pass == 0 && strings_.is_alphanumeric(*query_key) && !at_word_start) {
is_full_prefix = false;
continue;
}
Expand Down Expand Up @@ -249,8 +249,8 @@ bool Matcher::match_char(char32_t item, char32_t const query) const {
// The query must not contain any uppercase letters since otherwise the
// query would be case-sensitive, so just force all uppercase characters to
// lowercase.
if (is_uppercase(item)) {
item = to_lowercase(item);
if (strings_.is_uppercase(item)) {
item = strings_.to_lowercase(item);
}
}
return item == query;
Expand Down
8 changes: 5 additions & 3 deletions src/matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,10 @@ struct MatcherOpts {
QueryPathMode query_path_mode = QueryPathMode::AUTO;
};

class Matcher {
class Matcher final {
public:
explicit Matcher(boost::string_ref query, MatcherOpts opts = MatcherOpts());
explicit Matcher(boost::string_ref query, MatcherOpts opts = MatcherOpts(),
StringHandler strings = StringHandler());

// If the query represented by this matcher matches the given item, fills the
// given match object with information about the match and returns true.
Expand Down Expand Up @@ -77,9 +78,10 @@ class Matcher {

bool match_char(char32_t item, char32_t query) const;

MatcherOpts opts_;
StringHandler strings_;
std::vector<char32_t> query_;
std::size_t query_key_begin_index_;
MatcherOpts opts_;
bool is_case_sensitive_;
bool require_full_part_;
std::vector<boost::string_ref> cur_file_parts_;
Expand Down
18 changes: 12 additions & 6 deletions src/python_extension_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ extern "C" {
static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args,
PyObject* kwargs) {
static char const* kwlist[] = {"items", "query", "limit", "mmode",
"ispath", "crfile", "nr_threads", nullptr};
"ispath", "crfile", "max_threads", "unicode",
nullptr};
PyObject* items_obj;
char const* query_data;
Py_ssize_t query_size;
Expand All @@ -96,11 +97,13 @@ static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args,
int is_path = 0;
char const* cur_file_data = nullptr;
Py_ssize_t cur_file_size = 0;
int nr_threads_int = 0;
int max_threads_int = 0;
int unicode = 0;
if (!PyArg_ParseTupleAndKeywords(
args, kwargs, "Os#|is#is#i", const_cast<char**>(kwlist), &items_obj,
args, kwargs, "Os#|is#is#ii", const_cast<char**>(kwlist), &items_obj,
&query_data, &query_size, &limit_int, &mmode_data, &mmode_size,
&is_path, &cur_file_data, &cur_file_size, &nr_threads_int)) {
&is_path, &cur_file_data, &cur_file_size, &max_threads_int,
&unicode)) {
return nullptr;
}

Expand All @@ -113,12 +116,15 @@ static PyObject* cpsm_ctrlp_match(PyObject* self, PyObject* args,
cpsm::MatcherOpts mopts;
mopts.cur_file = std::string(cur_file_data, cur_file_size);
mopts.is_path = is_path;
cpsm::Matcher matcher(std::move(query), std::move(mopts));
cpsm::StringHandlerOpts sopts;
sopts.unicode = unicode;
cpsm::Matcher matcher(std::move(query), std::move(mopts),
cpsm::StringHandler(sopts));
auto item_substr_fn = cpsm::match_mode_item_substr_fn(
boost::string_ref(mmode_data, mmode_size));
std::size_t const limit = (limit_int >= 0) ? std::size_t(limit_int) : 0;
unsigned int const max_threads =
(nr_threads_int >= 0) ? static_cast<unsigned int>(nr_threads_int) : 0;
(max_threads_int >= 0) ? static_cast<unsigned int>(max_threads_int) : 0;

unsigned int nr_threads;
std::size_t items_per_batch;
Expand Down
59 changes: 44 additions & 15 deletions src/str_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "str_util.h"

#if CPSM_CONFIG_ICU
#include <unicode/uchar.h>
#endif

#include "str_util.h"
#include <utility>

namespace cpsm {

namespace {

void decompose_utf8_string(boost::string_ref str,
std::vector<char32_t>& chars) {
// Even though most of this function deals with byte-sized quantities, use
Expand Down Expand Up @@ -94,29 +98,54 @@ void decompose_utf8_string(boost::string_ref str,
}
}

#if CPSM_CONFIG_ICU
} // namespace

bool is_alphanumeric(char32_t const c) {
return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM);
// Constructs a handler that follows the given options.
//
// When cpsm is compiled without ICU (CPSM_CONFIG_ICU is false), asking for
// Unicode handling is rejected by throwing Error.
StringHandler::StringHandler(StringHandlerOpts opts)
    : opts_(std::move(opts)) {
#if !CPSM_CONFIG_ICU
  bool const unicode_requested = opts_.unicode;
  if (unicode_requested) {
    throw Error("cpsm built without Unicode support");
  }
#endif
}

bool is_uppercase(char32_t const c) {
return u_hasBinaryProperty(c, UCHAR_UPPERCASE);
// Appends the characters of `str` to `chars`; elements already present in
// `chars` are kept.
//
// With opts_.unicode set, the string is handed to decompose_utf8_string().
// Otherwise each byte of `str` becomes exactly one element whose value is the
// byte value (0x00..0xff).
void StringHandler::decompose(boost::string_ref const str,
                              std::vector<char32_t>& chars) const {
  if (opts_.unicode) {
    decompose_utf8_string(str, chars);
    return;
  }
  // This function appends, so reserve room for the new bytes on top of
  // whatever the caller already stored.
  chars.reserve(chars.size() + str.size());
  for (char const c : str) {
    // Convert through unsigned char: where plain char is signed, a direct
    // char -> char32_t conversion would turn bytes >= 0x80 into values such as
    // 0xffffff80 instead of the byte value.
    chars.push_back(static_cast<unsigned char>(c));
  }
}

char32_t to_lowercase(char32_t const c) { return u_tolower(c); }

#else // CPSM_CONFIG_ICU

bool is_alphanumeric(char32_t const c) {
// Reports whether `c` is a letter or a digit.
//
// In Unicode mode (only available when built with ICU) the answer comes from
// ICU's UCHAR_POSIX_ALNUM binary property; otherwise only ASCII 0-9, a-z and
// A-Z qualify.
bool StringHandler::is_alphanumeric(char32_t const c) const {
#if CPSM_CONFIG_ICU
  if (opts_.unicode) {
    return u_hasBinaryProperty(c, UCHAR_POSIX_ALNUM);
  }
#endif
  bool const ascii_digit = c >= '0' && c <= '9';
  bool const ascii_lower = c >= 'a' && c <= 'z';
  bool const ascii_upper = c >= 'A' && c <= 'Z';
  return ascii_digit || ascii_lower || ascii_upper;
}

bool is_uppercase(char32_t const c) { return c >= 'A' && c <= 'Z'; }

char32_t to_lowercase(char32_t const c) { return c + ('a' - 'A'); }
// Reports whether `c` is an uppercase letter.
//
// In Unicode mode (only available when built with ICU) the answer comes from
// ICU's UCHAR_UPPERCASE binary property; otherwise only ASCII A-Z qualify.
bool StringHandler::is_uppercase(char32_t const c) const {
#if CPSM_CONFIG_ICU
  if (opts_.unicode) {
    return u_hasBinaryProperty(c, UCHAR_UPPERCASE);
  }
#endif
  // Inside the closed range ['A', 'Z'] <=> not below 'A' and not above 'Z'.
  bool const outside_ascii_upper = c < 'A' || c > 'Z';
  return !outside_ascii_upper;
}

#endif // CPSM_CONFIG_ICU
// Maps `c` to lowercase.
//
// In Unicode mode (only available when built with ICU) this calls u_tolower().
// Otherwise the ASCII offset between 'a' and 'A' is added to `c`
// unconditionally, with no check that `c` is an uppercase letter.
char32_t StringHandler::to_lowercase(char32_t const c) const {
#if CPSM_CONFIG_ICU
  if (opts_.unicode) {
    return u_tolower(c);
  }
#endif
  constexpr char32_t kAsciiCaseGap = 'a' - 'A';
  return c + kAsciiCaseGap;
}

} // namespace cpsm
37 changes: 26 additions & 11 deletions src/str_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,35 @@ inline std::string copy_string_ref(boost::string_ref const sref) {
return std::string(sref.data(), sref.size());
}

// Splits a UTF-8-encoded string into code points and appends them to the given
// vector. If the string is not a valid UTF-8 encoded string, invalid bytes
// are replaced by the invalid code point 0xdc00+(byte). (This is so that a
// match can still be attempted.)
void decompose_utf8_string(boost::string_ref str, std::vector<char32_t>& chars);
// Options controlling how a StringHandler interprets strings.
struct StringHandlerOpts {
  // Selects Unicode-aware string handling when true; off by default.
  bool unicode{false};
};

class StringHandler final {
public:
explicit StringHandler(StringHandlerOpts opts = StringHandlerOpts());

// If opts.unicode is false, appends each byte in the given string to the
// given vector.
//
// If opts.unicode is true, attempts to parse the given string as UTF-8,
// appending each code point in the string to the given vector. Non-UTF-8
// bytes are appended to the given vector as the invalid code point
// 0xdc00+(byte) so that a match can still be attempted.
void decompose(boost::string_ref str, std::vector<char32_t>& chars) const;

// Returns true if the given code point represents a letter or number.
bool is_alphanumeric(char32_t c);
// Returns true if the given code point represents a letter or number.
bool is_alphanumeric(char32_t c) const;

// Returns true if the given code point represents a uppercase letter.
bool is_uppercase(char32_t c);
// Returns true if the given code point represents an uppercase letter.
bool is_uppercase(char32_t c) const;

// Returns the lowercased version of c. c must be an uppercase letter.
char32_t to_lowercase(char32_t c);
// Returns the lowercase version of c. c must be an uppercase letter.
char32_t to_lowercase(char32_t c) const;

private:
StringHandlerOpts opts_;
};

} // namespace cpsm

Expand Down

0 comments on commit ad8c3c9

Please sign in to comment.