Skip to content

Commit 277e165

Browse files
initial commit
0 parents  commit 277e165

File tree

6 files changed

+269
-0
lines changed

6 files changed

+269
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/build*

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Build script for the "guess" library and its optional example program.
cmake_minimum_required(VERSION 2.6)
2+
project(guess)
3+
4+
# Library target built from the single implementation file.
add_library(guess src/guesser.cc)
5+
# The sources use C++11 features (auto, lambdas, range-for).
set_target_properties(guess PROPERTIES COMPILE_FLAGS "-std=c++11")
6+
7+
# Makes "guess/guesser.h" and "guess/string_util.h" resolvable.
include_directories(include)
8+
# Example program; EXCLUDE_FROM_ALL keeps it out of the default build,
# so it has to be requested explicitly (e.g. `make guess-example`).
add_executable(guess-example EXCLUDE_FROM_ALL example/main.cc)
9+
target_link_libraries(guess-example guess)
10+
set_target_properties(guess-example PROPERTIES COMPILE_FLAGS "-std=c++11")

example/main.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#include <iostream>
2+
#include <fstream>
3+
#include <chrono>
4+
5+
#include "guess/guesser.h"
6+
7+
using namespace guess;
8+
9+
// ctype facet whose classification table marks only '\n' as whitespace.
// Imbued into a stream, it makes `operator>>` on std::string split the
// input at newlines only, so each extracted "word" is a whole line.
struct new_line_tokenizer : std::ctype<char> {
10+
new_line_tokenizer() : std::ctype<char>(get_table()) {}
11+
// Returns the classification table handed to std::ctype<char>.
static mask const* get_table() {
12+
// Static storage: zero-initialized, and outlives every facet that uses it.
static mask rc[table_size];
13+
rc['\n'] = std::ctype_base::space;
14+
return &rc[0];
15+
}
16+
};
17+
18+
int main() {
19+
std::vector<std::string> stations;
20+
21+
try {
22+
std::fstream in("stations.txt");
23+
in.exceptions(std::ifstream::failbit);
24+
in.imbue(std::locale(std::cin.getloc(), new new_line_tokenizer));
25+
std::copy(std::istream_iterator<std::string>(in),
26+
std::istream_iterator<std::string>(),
27+
std::back_inserter(stations));
28+
} catch (std::exception const& e) {
29+
std::cout << "unable to read file stations.txt\n";
30+
return 1;
31+
}
32+
33+
guesser g(stations);
34+
35+
std::string input;
36+
while (std::cout << "$ " && std::getline(std::cin, input)) {
37+
using std::chrono::system_clock;
38+
using std::chrono::duration_cast;
39+
using std::chrono::milliseconds;
40+
auto start = system_clock::now();
41+
auto candidates = g.guess(input);
42+
auto duration = duration_cast<milliseconds>(system_clock::now() - start);
43+
std::cout << "\n\t" << duration.count() << "ms\n\n";
44+
45+
for (auto const& guess : candidates) {
46+
std::cout << " " << stations[guess] << "\n";
47+
}
48+
std::cout << "\n";
49+
}
50+
}

include/guess/guesser.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#ifndef GUESS_GUESSER_H_
2+
#define GUESS_GUESSER_H_
3+
4+
#include <string>
5+
#include <vector>
6+
7+
namespace guess {
8+
9+
// Fuzzy matcher that ranks a fixed list of candidate strings against a
// free-text query using trigram overlap plus a bonus for exact word matches.
struct guesser {
  // Stores a normalized copy of every candidate. Indices returned by guess()
  // refer to positions in `candidates`.
  guesser(std::vector<std::string> const& candidates);

  // Returns the indices of the best matching candidates for `in`, best match
  // first. `count` is the requested number of results.
  std::vector<int> guess(std::string in, int count = 10) const;

private:
  // One scored candidate.
  struct match {
    match() = default;
    explicit match(int index) : index(index), cos_sim(0) {}
    // Inverted on purpose: sorting ascending with this operator puts the
    // highest score first.
    bool operator<(match const& o) const { return cos_sim > o.cos_sim; }
    // Default member initializers keep default-constructed matches (e.g.
    // created by vector::resize) from carrying indeterminate values.
    int index = 0;         // position of the candidate in candidates_
    double cos_sim = 0.0;  // similarity score, larger is better
  };

  // Normalizes `in` in place and returns all candidates sorted by trigram
  // similarity to it.
  std::vector<match> match_trigrams(std::string& in) const;

  // Boosts matches that share a whole word with `in` and re-sorts `matches`.
  void score_exact_word_matches(std::string& in,
                                std::vector<match>& matches) const;

  // Canonicalizes `s` in place so that query and candidates are comparable.
  static void normalize(std::string& s);

  std::vector<std::string> candidates_;  // normalized candidate strings
};
32+
33+
} // namespace guess
34+
35+
#endif // GUESS_GUESSER_H_

include/guess/string_util.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#ifndef GUESS_STRING_UTIL_H_
2+
#define GUESS_STRING_UTIL_H_
3+
4+
#include <string>
5+
6+
namespace guess {
7+
8+
// Splits the first `length` characters of `s` at every single space and calls
// `f` once per token, passing a pointer to the NUL-terminated token.
//
// The delimiting space is overwritten with '\0' only while `f` runs and is
// restored afterwards, so `s` must point to writable storage and `s[length]`
// must be '\0' (it terminates the last token). Consecutive, leading or
// trailing spaces yield empty tokens.
//
// `f` must return something convertible to bool; returning true stops the
// iteration early. The return value of the call for the last token is
// ignored.
template <typename Function>
inline void for_each_token(char* s, int length, Function f) {
  int base = 0;  // start offset of the token currently being scanned
  for (int i = 0; i < length; ++i) {
    if (s[i] != ' ') {
      continue;
    }

    s[i] = '\0';  // terminate the token in place for the callback
    bool const stop = f(s + base);
    s[i] = ' ';  // restore the delimiter

    if (stop) {
      return;
    }

    base = i + 1;
  }
  f(s + base);
}

// Convenience overload for std::string.
//
// Tokenizes a private copy of `in`: writing through the pointer returned by
// c_str() is undefined behaviour, and callers pass strings they are not
// allowed to modify. The token pointers handed to `f` are therefore only
// valid for the duration of the respective call.
template <typename Function>
inline void for_each_token(std::string const& in, Function f) {
  std::string scratch(in);
  for_each_token(&scratch[0], static_cast<int>(scratch.size()), f);
}
34+
35+
// Replaces occurrences of `from` in `s` with `to`, in place.
//
// If `to` does not contain `from`, replacement is repeated until `s` holds no
// occurrence at all, so occurrences created by a replacement are replaced as
// well (e.g. replacing two spaces by one collapses runs of any length).
// If `to` contains `from`, repeating would never terminate; in that case a
// single left-to-right pass over non-overlapping occurrences is made.
// An empty `from` leaves `s` untouched.
//
// Declared inline because it is defined in a header that may be included
// from several translation units.
inline void replace_all(std::string& s,
                        std::string const& from,
                        std::string const& to) {
  if (from.empty()) {
    return;  // find("") matches everywhere; nothing sensible to replace
  }

  bool const to_contains_from = to.find(from) != std::string::npos;
  // A replacement at `pos` can only create new occurrences that overlap the
  // replaced text, i.e. that start at most from.size() - 1 characters earlier.
  std::string::size_type const backtrack = from.size() - 1;

  std::string::size_type pos = 0;
  while ((pos = s.find(from, pos)) != std::string::npos) {
    s.replace(pos, from.size(), to);
    if (to_contains_from) {
      pos += to.size();  // skip the inserted text to guarantee progress
    } else {
      pos = pos > backtrack ? pos - backtrack : 0;
    }
  }
}
43+
44+
} // namespace guess
45+
46+
#endif // GUESS_STRING_UTIL_H_

src/guesser.cc

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#include "guess/guesser.h"
2+
3+
#include <cmath>
4+
#include <cinttypes>
5+
#include <fstream>
6+
#include <string>
7+
#include <vector>
8+
9+
#include "guess/string_util.h"
10+
11+
namespace guess {
12+
13+
// Copies the candidate list and normalizes every stored entry in place, so
// that later comparisons against a normalized query are like-for-like.
guesser::guesser(std::vector<std::string> const& candidates)
    : candidates_(candidates) {
  for (auto it = candidates_.begin(); it != candidates_.end(); ++it) {
    normalize(*it);
  }
}
19+
20+
std::vector<int> guesser::guess(std::string in, int count) const {
21+
auto matches = match_trigrams(in);
22+
matches.resize(std::min(100ul, matches.size()));
23+
24+
score_exact_word_matches(in, matches);
25+
matches.resize(count);
26+
27+
std::vector<int> ret(matches.size());
28+
for (int i = 0; i < matches.size(); ++i) {
29+
ret[i] = matches[i].index;
30+
}
31+
32+
return ret;
33+
}
34+
35+
std::vector<guesser::match> guesser::match_trigrams(std::string& in) const {
36+
std::vector<match> matches;
37+
matches.reserve(candidates_.size());
38+
for (int i = 0; i < candidates_.size(); ++i) {
39+
matches.emplace_back(i);
40+
}
41+
42+
normalize(in);
43+
44+
char const* input = in.c_str();
45+
double sqrt_len_vec_input = std::sqrt(in.size() - 2);
46+
47+
char trigram_input[4] = {0};
48+
char trigram_candidate[4] = {0};
49+
50+
for (int i = 0; i < candidates_.size(); ++i) {
51+
int match_count = 0;
52+
const int len_vec_candidate = candidates_[i].length() - 2;
53+
54+
char const* substr_input = input;
55+
while (substr_input[2] != '\0') {
56+
trigram_input[0] = substr_input[0];
57+
trigram_input[1] = substr_input[1];
58+
trigram_input[2] = substr_input[2];
59+
++substr_input;
60+
61+
char const* substr_candidate = candidates_[i].c_str();
62+
while (substr_candidate[2] != '\0') {
63+
trigram_candidate[0] = substr_candidate[0];
64+
trigram_candidate[1] = substr_candidate[1];
65+
trigram_candidate[2] = substr_candidate[2];
66+
++substr_candidate;
67+
68+
if (*(uint32_t*) trigram_input == *(uint32_t*) trigram_candidate) {
69+
++match_count;
70+
break;
71+
}
72+
}
73+
}
74+
75+
double denominator = sqrt_len_vec_input * std::sqrt(len_vec_candidate);
76+
matches[i].cos_sim = match_count / denominator;
77+
}
78+
std::sort(std::begin(matches), std::end(matches));
79+
80+
return matches;
81+
}
82+
83+
void guesser::score_exact_word_matches(std::string& in,
84+
std::vector<match>& matches) const {
85+
for (int i = 0; i < matches.size(); ++i) {
86+
auto& candidate = candidates_[matches[i].index];
87+
for_each_token(in, [&](char* input_token) {
88+
for_each_token(candidate, [&](char* candidate_token) {
89+
if (strcmp(candidate_token, input_token) == 0) {
90+
matches[i].cos_sim *= 1.33;
91+
return true;
92+
}
93+
return false;
94+
});
95+
return false;
96+
});
97+
}
98+
std::sort(std::begin(matches), std::end(matches));
99+
}
100+
101+
// Canonicalizes `s` in place:
//  - German umlauts and sharp s are transliterated (ä -> a, ö -> o, ü -> u,
//    ß -> ss),
//  - every character that is not alphanumeric becomes a space,
//  - runs of spaces are collapsed into a single space,
//  - letters are converted to lower case.
// A leading or trailing space is kept (as a single space).
void guesser::normalize(std::string& s) {
  // Multi-byte sequences have to be handled before the per-byte pass below.
  replace_all(s, "Ä", "a");
  replace_all(s, "ä", "a");
  replace_all(s, "Ö", "o");
  replace_all(s, "ö", "o");
  replace_all(s, "Ü", "u");
  replace_all(s, "ü", "u");
  replace_all(s, "ß", "ss");

  std::string cleaned;
  cleaned.reserve(s.size());
  for (char const raw : s) {
    // The <cctype> functions require a value representable as unsigned char;
    // passing a negative char (any non-ASCII byte) is undefined behaviour.
    auto const ch = static_cast<unsigned char>(raw);
    if (isalnum(ch)) {
      cleaned.push_back(static_cast<char>(::tolower(ch)));
    } else if (cleaned.empty() || cleaned.back() != ' ') {
      // Punctuation and whitespace alike turn into one separating space.
      cleaned.push_back(' ');
    }
  }
  s.swap(cleaned);
}
126+
127+
} // namespace guess

0 commit comments

Comments
 (0)