Add cur_file_prefix_len

cur_file_prefix_len gives a low-priority bonus to files whose name shares a prefix with the name of the currently open file. This favors closely related files (e.g. the .h counterpart of a .cc file, or its _test.cc file), especially for empty queries.
nixprime · Jun 2, 2015 · 2abec32 · 2abec32
1 parent 54c497d
commit 2abec32
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 32 deletions.
diff --git a/src/match.h b/src/match.h
@@ -55,6 +55,11 @@ struct MatchBase {
   // indicate higher confidence that the matches are correct.
   CharCount part_index_sum = 0;
 
+  // The number of bytes that are shared between the beginning of the rightmost
+  // path component of the match and the rightmost path component of the
+  // current file.
+  CharCount cur_file_prefix_len = 0;
+
   // The number of path components that must be traversed between the query and
   // item paths.
   CharCount path_distance = 0;
@@ -68,6 +73,7 @@ struct MatchBase {
   std::string debug_string() const {  // Comma-separated name=value dump.
     return str_cat("prefix_score=", prefix_score, ", word_prefix_len=",
                    word_prefix_len, ", part_index_sum=", part_index_sum,
+                   ", cur_file_prefix_len=", cur_file_prefix_len,
                    ", path_distance=", path_distance, ", unmatched_len=",
                    unmatched_len);
   }
@@ -95,6 +101,9 @@ bool operator<(Match<T> const& lhs, Match<T> const& rhs) {
   if (lhs.part_index_sum != rhs.part_index_sum) {
     return lhs.part_index_sum < rhs.part_index_sum;
   }
+  if (lhs.cur_file_prefix_len != rhs.cur_file_prefix_len) {
+    return lhs.cur_file_prefix_len > rhs.cur_file_prefix_len;
+  }
   if (lhs.path_distance != rhs.path_distance) {
     return lhs.path_distance < rhs.path_distance;
   }

diff --git a/src/matcher.cc b/src/matcher.cc
@@ -29,15 +29,14 @@ namespace cpsm {
 
 Matcher::Matcher(boost::string_ref const query, MatcherOpts opts)
     : opts_(std::move(opts)) {
-  decompose_utf8_string(query, query_chars_);
+  decompose_utf8_string(query, query_);
   if (opts_.is_path) {
     // Store the index of the first character after the rightmost path
     // separator in the query. (Store an index rather than an iterator to keep
     // Matcher copyable/moveable.)
     query_key_begin_index_ =
-        std::find(query_chars_.crbegin(), query_chars_.crend(),
-                  path_separator()).base() -
-        query_chars_.cbegin();
+        std::find(query_.crbegin(), query_.crend(), path_separator()).base() -
+        query_.cbegin();
     switch (opts_.query_path_mode) {
       case MatcherOpts::QueryPathMode::NORMAL:
         require_full_part_ = false;
@@ -57,17 +56,9 @@ Matcher::Matcher(boost::string_ref const query, MatcherOpts opts)
 
   // Queries are smartcased (case-sensitive only if any uppercase appears in the
   // query).
-  is_case_sensitive_ =
-      std::any_of(query_chars_.begin(), query_chars_.end(), is_uppercase);
+  is_case_sensitive_ = std::any_of(query_.begin(), query_.end(), is_uppercase);
 
   cur_file_parts_ = path_components_of(opts_.cur_file);
-  // Keeping the filename in cur_file_parts_ causes the path distance metric to
-  // favor the currently open file. While we don't want to exclude the
-  // currently open file from being matched, it shouldn't be favored over its
-  // siblings on path distance.
-  if (!cur_file_parts_.empty()) {
-    cur_file_parts_.pop_back();
-  }
 }
 
 bool Matcher::match_base(boost::string_ref const item, MatchBase& m,
@@ -87,23 +78,23 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m,
   std::vector<boost::string_ref> item_parts;
   if (opts_.is_path) {
     item_parts = path_components_of(item);
-    m.path_distance = path_distance_between(cur_file_parts_, item_parts);
   } else {
     item_parts.push_back(item);
   }
   if (!item_parts.empty()) {
     m.unmatched_len = item_parts.back().size();
   }
 
-  if (query_chars_.empty()) {
+  if (query_.empty()) {
+    match_path(item_parts, m);
     return true;
   }
 
   // Since for paths (the common case) we prefer rightmost path components, we
   // scan path components right-to-left.
-  auto query_it = query_chars_.crbegin();
-  auto const query_end = query_chars_.crend();
-  auto query_key_begin = query_chars_.cend();
+  auto query_it = query_.crbegin();
+  auto const query_end = query_.crend();
+  auto query_key_begin = query_.cend();
   // Index into item_parts, counting from the right.
   CharCount part_index = 0;
   for (boost::string_ref const item_part :
@@ -149,22 +140,37 @@ bool Matcher::match_base(boost::string_ref const item, MatchBase& m,
     return false;
   }
 
+  // Fill path match data.
+  match_path(item_parts, m);
+
   // Now do more refined matching on the key (the rightmost path component of
   // the item for a path match, and just the full item otherwise).
   match_key(*key_chars, query_key_begin, m);
   return true;
 }
 
+void Matcher::match_path(std::vector<boost::string_ref> const& item_parts,
+                         MatchBase& m) const {  // Sets m's path-based fields.
+  if (!opts_.is_path) {
+    return;  // Not matching paths: m is left unchanged.
+  }
+  m.path_distance = path_distance_between(cur_file_parts_, item_parts);
+  if (!cur_file_parts_.empty() && !item_parts.empty()) {  // Guards back().
+    m.cur_file_prefix_len =  // Shared prefix of the two last components.
+        common_prefix(cur_file_parts_.back(), item_parts.back());
+  }
+}
+
 void Matcher::match_key(std::vector<char32_t> const& key,
                         std::vector<char32_t>::const_iterator query_key,
                         MatchBase& m) const {
-  auto const query_key_end = query_chars_.cend();
+  auto const query_key_end = query_.cend();
   if (query_key == query_key_end) {
     return;
   }
   bool const query_key_at_begin =
-      (query_key == (query_chars_.cbegin() + query_key_begin_index_));
-  // key can't be empty since [query_key, query_chars_.end()) is non-empty.
+      (query_key == (query_.cbegin() + query_key_begin_index_));
+  // key can't be empty since [query_key, query_.end()) is non-empty.
   const auto is_word_prefix = [&](std::size_t const i) -> bool {
     if (i == 0) {
       return true;

diff --git a/src/matcher.h b/src/matcher.h
@@ -68,13 +68,16 @@ class Matcher {
                   std::vector<char32_t>* buf,
                   std::vector<char32_t>* buf2) const;
 
+  void match_path(std::vector<boost::string_ref> const& item_parts,
+                  MatchBase& m) const;
+
   void match_key(std::vector<char32_t> const& key,
                  std::vector<char32_t>::const_iterator query_key,
                  MatchBase& m) const;
 
   bool match_char(char32_t item, char32_t query) const;
 
-  std::vector<char32_t> query_chars_;
+  std::vector<char32_t> query_;
   std::size_t query_key_begin_index_;
   MatcherOpts opts_;
   bool is_case_sensitive_;

diff --git a/src/path_util.cc b/src/path_util.cc
@@ -15,8 +15,6 @@
 
 #include "path_util.h"
 
-#include <algorithm>
-
 // TODO: Support non-Unix non-UTF-8 paths.
 
 namespace cpsm {
@@ -47,14 +45,7 @@ std::vector<boost::string_ref> path_components_of(boost::string_ref path) {
 
 CharCount path_distance_between(std::vector<boost::string_ref> const& x,
                                 std::vector<boost::string_ref> const& y) {
-  auto const end = std::min(x.size(), y.size());
-  CharCount common_ancestors;
-  for (common_ancestors = 0; common_ancestors < end; common_ancestors++) {
-    if (x[common_ancestors] != y[common_ancestors]) {
-      break;
-    }
-  }
-  return x.size() + y.size() - (2 * common_ancestors);
+  return x.size() + y.size() - (2 * common_prefix(x, y));  // Components outside the shared prefix.
 }
 
 }  // namespace cpsm
diff --git a/src/path_util.h b/src/path_util.h
@@ -16,6 +16,7 @@
 #ifndef CPSM_PATH_UTIL_H_
 #define CPSM_PATH_UTIL_H_
 
+#include <algorithm>
 #include <vector>
 
 #include <boost/utility/string_ref.hpp>
@@ -44,6 +45,26 @@ std::vector<boost::string_ref> path_components_of(boost::string_ref path);
 CharCount path_distance_between(std::vector<boost::string_ref> const& x,
                                 std::vector<boost::string_ref> const& y);
 
+// Returns the length of the longest common prefix of x and y, i.e. how many
+// leading elements are pairwise equal. T must provide cbegin() and cend().
+template <typename T>
+std::size_t common_prefix(T const& x, T const& y) {
+  auto x_it = x.cbegin();
+  auto y_it = y.cbegin();
+  auto const x_end = x.cend();
+  auto const y_end = y.cend();
+  std::size_t common_ancestors = 0;  // Count of matching elements so far.
+  while (x_it != x_end && y_it != y_end) {  // Stop when either one runs out.
+    if (*x_it != *y_it) {
+      break;  // The first differing pair ends the prefix.
+    }
+    ++x_it;
+    ++y_it;
+    common_ancestors++;
+  }
+  return common_ancestors;
+}
+
 }  // namespace cpsm
 
 #endif /* CPSM_PATH_UTIL_H_ */