Skip to content

Commit

Permalink
+ method longest_prefix
Browse files Browse the repository at this point in the history
+ trienode_get_next_UCS2, trienode_get_next_UCS4
+ missing 'store' member
* bug: missing Py_DECREF


git-svn-id: file:///mnt/data/SVNrepo@2294 a13746d7-5020-4d46-8679-245a07f888a7
  • Loading branch information
wojtek committed Apr 2, 2011
1 parent 2edf309 commit b2d1477
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 26 deletions.
41 changes: 41 additions & 0 deletions Automaton.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ automaton_match(PyObject* self, PyObject* args) {
else
node = trie_find(automaton->root, word, wordlen);

Py_DECREF(py_word);
if (node)
Py_RETURN_TRUE;
else
Expand All @@ -379,6 +380,37 @@ automaton_match(PyObject* self, PyObject* args) {
}


#define automaton_longest_prefix_doc \
	"longest_prefix(word) => integer - length of longest prefix"

/*
 * Python method longest_prefix(word).
 *
 * Extracts the word from the first positional argument, walks the trie
 * from the automaton's root and returns, as a Python integer, the value
 * computed by the matching trie_longest* routine.
 *
 * Returns NULL when the argument could not be obtained
 * (pymod_get_string_from_tuple returned NULL).
 */
static PyObject*
automaton_longest_prefix(PyObject* self, PyObject* args) {
#define automaton ((Automaton*)self)
	PyObject* py_word;
	char* word;
	ssize_t wordlen;
	bool unicode;
	int prefix_len;

	py_word = pymod_get_string_from_tuple(args, 0, &word, &wordlen, &unicode);
	if (py_word == NULL) {
		return NULL;
	}

	if (!unicode) {
		prefix_len = trie_longest(automaton->root, word, wordlen);
	}
	else {
		/* the width of a unicode unit is fixed at compile time */
#ifdef Py_UNICODE_WIDE
		prefix_len = trie_longest_UCS4(automaton->root, (uint32_t*)word, wordlen);
#else
		prefix_len = trie_longest_UCS2(automaton->root, (uint16_t*)word, wordlen);
#endif
	}

	/* the word buffer is no longer needed; release the held reference */
	Py_DECREF(py_word);
	return Py_BuildValue("i", prefix_len);
#undef automaton
}


#define automaton_get_doc \
"get(word, [def]) => obj - returns object associated with given word; " \
"if word isn't present, then def is returned, when def isn't defined, " \
Expand Down Expand Up @@ -806,6 +838,7 @@ PyMethodDef automaton_methods[] = {
method(clear, METH_NOARGS),
method(exists, METH_VARARGS),
method(match, METH_VARARGS),
method(longest_prefix, METH_VARARGS),
method(get, METH_VARARGS),
method(make_automaton, METH_NOARGS),
method(find_all, METH_VARARGS),
Expand Down Expand Up @@ -835,6 +868,14 @@ PyMemberDef automaton_members[] = {
"current kind of automaton"
},

{
"store",
T_INT,
offsetof(Automaton, store),
READONLY,
"type of values (ahocorasick.STORE_ANY/STORE_INTS/STORE_LEN)"
},

{NULL}
};

Expand Down
85 changes: 59 additions & 26 deletions trie.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,9 @@ trie_find_UCS2(TrieNode* root, const uint16_t* word, const size_t wordlen) {
node = root;
ssize_t i;
for (i=0; i < wordlen; i++) {
const uint16_t w = word[i];
node = trienode_get_next(node, w & 0xff);
node = trienode_get_next_UCS2(node, word[i]);
if (node == NULL)
return NULL;

if (w < 0x0100) {
node = trienode_get_next(node, (w >> 8) & 0xff);
if (node == NULL)
return NULL;
}
}

return node;
Expand All @@ -202,29 +195,69 @@ trie_find_UCS4(TrieNode* root, const uint32_t* word, const size_t wordlen) {
node = root;
ssize_t i;
for (i=0; i < wordlen; i++) {
#define NEXT(byte) \
node = trienode_get_next(node, (byte)); \
if (node == NULL) \
node = trienode_get_next_UCS4(node, word[i]);
if (node == NULL)
return NULL;
}

return node;
}

uint32_t w = word[i];

NEXT(w & 0xff);
if (w > 0x000000ff) {
NEXT((w >> 8) & 0xff);
static int PURE
trie_longest(TrieNode* root, const char* word, const size_t wordlen) {
TrieNode* node;
int len = 0;

if (w < 0x0000ffff) {
NEXT((w >> 16) & 0xff);
node = root;
ssize_t i;
for (i=0; i < wordlen; i++) {
node = trienode_get_next(node, word[i]);
if (node == NULL)
break;
else
len += 1;
}

if (w > 0x00ffffff) {
NEXT((w >> 24) & 0xff);
}
}
}
#undef NEXT
} // for

return node;
return len;
}


/*
 * Counts how many leading 16-bit units of `word` can be followed in the
 * trie starting at `root`. The walk stops at the first unit for which
 * trienode_get_next_UCS2 yields NULL; the number of units consumed so
 * far (0..wordlen) is returned.
 */
static int PURE
trie_longest_UCS2(TrieNode* root, const uint16_t* word, const size_t wordlen) {
	TrieNode* current = root;
	int matched = 0;
	ssize_t pos = 0;

	while (pos < wordlen) {
		current = trienode_get_next_UCS2(current, word[pos]);
		if (current == NULL)
			return matched;

		matched += 1;
		pos += 1;
	}

	return matched;
}


/*
 * Counts how many leading 32-bit units of `word` can be followed in the
 * trie starting at `root`. The walk stops at the first unit for which
 * trienode_get_next_UCS4 yields NULL; the number of units consumed so
 * far (0..wordlen) is returned.
 */
static int PURE
trie_longest_UCS4(TrieNode* root, const uint32_t* word, const size_t wordlen) {
	TrieNode* current = root;
	int matched = 0;
	ssize_t pos = 0;

	while (pos < wordlen) {
		current = trienode_get_next_UCS4(current, word[pos]);
		if (current == NULL)
			return matched;

		matched += 1;
		pos += 1;
	}

	return matched;
}


Expand Down
12 changes: 12 additions & 0 deletions trie.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,22 @@
static TrieNode*
trie_add_word(Automaton* automaton, char* word, size_t wordlen, bool* new_word);

/* variant of trie_add_word taking the word as an array of 16-bit units
   (NOTE(review): definition not visible here -- presumably mirrors
   trie_add_word; confirm) */
static TrieNode*
trie_add_word_UCS2(Automaton* automaton, uint16_t* word, size_t wordlen, bool* new_word);

/* variant of trie_add_word taking the word as an array of 32-bit units
   (NOTE(review): definition not visible here -- presumably mirrors
   trie_add_word; confirm) */
static TrieNode*
trie_add_word_UCS4(Automaton* automaton, uint32_t* word, size_t wordlen, bool* new_word);

/* returns last node on a path for given word */
static TrieNode* PURE
trie_find(TrieNode* root, const char* word, const size_t wordlen);

/* as trie_find, for a word given as 16-bit units: returns NULL as soon as
   a unit cannot be followed, otherwise the node reached after the last
   unit */
static TrieNode* PURE
trie_find_UCS2(TrieNode* root, const uint16_t* word, const size_t wordlen);

/* as trie_find, for a word given as 32-bit units: returns NULL as soon as
   a unit cannot be followed, otherwise the node reached after the last
   unit */
static TrieNode* PURE
trie_find_UCS4(TrieNode* root, const uint32_t* word, const size_t wordlen);

/* returns node linked by edge labeled with byte including paths going
through fail links */
static TrieNode* PURE
Expand Down
52 changes: 52 additions & 0 deletions trienode.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,58 @@ trienode_get_next(TrieNode* node, const uint8_t byte) {
}
}


/*
 * Follows the edges for a single 16-bit unit starting at `node`.
 *
 * The unit is consumed one byte at a time, low byte first; the high byte
 * is followed only when the value does not fit in one byte (word > 0xff),
 * matching the scheme used by trienode_get_next_UCS4.
 *
 * Returns the node reached, or NULL if any required edge is missing.
 */
static TrieNode* PURE
trienode_get_next_UCS2(TrieNode* node, const uint16_t word) {
	uint8_t byte = word & 0xff;
	TrieNode* tmp = trienode_get_next(node, byte);
	if (tmp == NULL)
		return NULL;

	if (word > 0xff) {
		byte = (word >> 8) & 0xff;
		return trienode_get_next(tmp, byte);
	}

	/* single-byte value: the low-byte edge is the whole path, so the
	   node just reached is the result (previously NULL was returned
	   here, which made every unit <= 0xff fail to match) */
	return tmp;
}


/*
 * Follows the edges for a single 32-bit unit starting at `node`.
 *
 * The unit is consumed one byte at a time, low byte first. The lowest
 * byte is always followed; each further byte is followed only while the
 * not-yet-consumed upper part of the value is non-zero, so values up to
 * 0xff take one edge, up to 0xffff two, up to 0xffffff three, and larger
 * values four.
 *
 * Returns the node reached, or NULL if any required edge is missing.
 */
static TrieNode* PURE
trienode_get_next_UCS4(TrieNode* node, const uint32_t dword) {
	TrieNode* current = node;
	uint32_t remaining = dword;

	do {
		const uint8_t byte = remaining & 0xff;

		current = trienode_get_next(current, byte);
		if (current == NULL)
			return NULL;

		remaining >>= 8;
	} while (remaining != 0);

	return current;
}



int
trienode_sort_cmp(const void* a, const void* b) {
#define A ((TrieNode*)a)
Expand Down
6 changes: 6 additions & 0 deletions trienode.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ trienode_new(char byte, char eow);
static TrieNode* PURE ALWAYS_INLINE
trienode_get_next(TrieNode* node, const uint8_t byte);

/* follows the edges for one 16-bit unit: the low byte first, then, for
   values above 0xff, the high byte; NULL if an edge is missing
   (the parameter is named `byte` but carries the whole 16-bit unit) */
static TrieNode* PURE ALWAYS_INLINE
trienode_get_next_UCS2(TrieNode* node, const uint16_t byte);

/* follows the edges for one 32-bit unit: the low byte first, then each
   higher byte only if the value does not fit in the bytes already
   consumed; NULL if an edge is missing
   (the parameter is named `byte` but carries the whole 32-bit unit) */
static TrieNode* PURE ALWAYS_INLINE
trienode_get_next_UCS4(TrieNode* node, const uint32_t byte);

/* link with child node by edge labeled with byte */
static TrieNode*
trienode_set_next(TrieNode* node, const uint8_t byte, TrieNode* child);
Expand Down
14 changes: 14 additions & 0 deletions unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ def test_get3(self):
with self.assertRaises(KeyError):
A.get(w)


# Checks Automaton.longest_prefix() against a trie filled from self.words.
def test_longest_prefix(self):
A = self.A
# store every word with a distinct value (its 1-based position)
for i, w in enumerate(self.words):
A.add_word(w, i+1)

# there is "word"
# expected value = number of leading characters of the argument that
# can be followed in the trie
self.assertEqual(A.longest_prefix(b"wo"), 2)
self.assertEqual(A.longest_prefix(b"working"), 3)
self.assertEqual(A.longest_prefix(b"word"), 4)
self.assertEqual(A.longest_prefix(b"wordbook"), 4)
# no match at all, and the empty-input boundary
self.assertEqual(A.longest_prefix(b"void"), 0)
self.assertEqual(A.longest_prefix(b""), 0)


def test_stats1(self):
A = self.A
Expand Down

0 comments on commit b2d1477

Please sign in to comment.