Skip to content

Commit

Permalink
#49 Allow to specify Payload with Keyword (#68)
Browse files Browse the repository at this point in the history
* #49: Allow to fix Payload with Keyword
  • Loading branch information
danbeck authored and Dave Jarvis committed Aug 20, 2019
1 parent b7cc113 commit 9f80565
Show file tree
Hide file tree
Showing 25 changed files with 1,572 additions and 292 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
language: java
install: mvn install -DskipTests=true -Dgpg.skip=true
jdk:
- oraclejdk8
- openjdk8
after_success:
- bash <(curl -s https://codecov.io/bash)
- bash <(curl -s https://codecov.io/bash)
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,26 @@ matches as soon as you encounter them. Let's look at an example where we want to
System.out.println(html);
```

You can also emit custom payloads along with the matched keywords. This can be useful, for example,
to implement a trivial named entity recognizer. In this case, use a PayloadTrie instead of a Trie:

```java
class Word {
private final String gender;
public Word(String gender) {
this.gender = gender;
}
}

PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f")
.addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m"))
.build();
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
```

Releases
--------
Information on the aho-corasick [releases](https://github.com/robert-bor/aho-corasick/releases).
Expand Down
21 changes: 21 additions & 0 deletions src/main/java/org/ahocorasick/trie/DefaultToken.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package org.ahocorasick.trie;

/**
 * Adapter that exposes a {@code PayloadToken<String>} through the plain
 * {@link Token} API.
 * <p>
 * The fragment and the match flag are taken from the wrapped payload token.
 * Its emit is converted into an {@link Emit} built from the start, end and
 * keyword of the payload emit.
 */
public class DefaultToken extends Token {

    /** The wrapped token every call is delegated to. */
    private final PayloadToken<String> payloadToken;

    /**
     * Wraps the given payload token.
     *
     * @param payloadToken token to delegate to; must not be {@code null}
     *                     because its fragment is read immediately.
     */
    public DefaultToken(PayloadToken<String> payloadToken) {
        super(payloadToken.getFragment());
        this.payloadToken = payloadToken;
    }

    /**
     * Reports whether the wrapped token is a match.
     *
     * @return the result of the wrapped token's {@code isMatch()}.
     */
    public boolean isMatch() {
        return payloadToken.isMatch();
    }

    /**
     * Converts the emit of the wrapped token into a plain {@link Emit}.
     *
     * @return the converted emit, or {@code null} when the wrapped token has
     *         no emit.
     */
    public Emit getEmit() {
        PayloadEmit<String> emit = payloadToken.getEmit();
        if (emit == null) {
            // PayloadFragmentToken#getEmit() returns null; mirror that here
            // instead of failing with a NullPointerException.
            return null;
        }
        return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
    }

}
4 changes: 2 additions & 2 deletions src/main/java/org/ahocorasick/trie/Emit.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
import org.ahocorasick.interval.Intervalable;

public class Emit extends Interval implements Intervalable {

private final String keyword;

public Emit(final int start, final int end, final String keyword) {
public Emit(final int start, final int end, String keyword) {
super(start, end);
this.keyword = keyword;
}
Expand All @@ -20,4 +19,5 @@ public String getKeyword() {
public String toString() {
return super.toString() + "=" + this.keyword;
}

}
1 change: 1 addition & 0 deletions src/main/java/org/ahocorasick/trie/FragmentToken.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ public boolean isMatch() {
public Emit getEmit() {
return null;
}

}
1 change: 1 addition & 0 deletions src/main/java/org/ahocorasick/trie/MatchToken.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ public class MatchToken extends Token {
public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;

}

@Override
Expand Down
33 changes: 33 additions & 0 deletions src/main/java/org/ahocorasick/trie/Payload.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package org.ahocorasick.trie;

/**
 * Couples a keyword with the payload data that belongs to it.
 * <p>
 * Instances are ordered by their keyword alone; the payload data takes no
 * part in {@link #compareTo(Payload)}.
 *
 * @author Daniel Beck
 *
 * @param <T> The type of the wrapped payload data.
 */
public class Payload<T> implements Comparable<Payload<T>> {

    private final String keyword;
    private final T data;

    /**
     * Creates a payload for a keyword.
     *
     * @param keyword The keyword this payload belongs to.
     * @param data    The payload data carried along with the keyword.
     */
    public Payload(final String keyword, final T data) {
        this.keyword = keyword;
        this.data = data;
    }

    /**
     * @return the payload data passed to the constructor.
     */
    public T getData() {
        return this.data;
    }

    /**
     * @return the keyword passed to the constructor.
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * Compares two payloads by keyword, using {@link String#compareTo(String)}.
     *
     * @param other The payload to compare against.
     * @return a negative value, zero or a positive value, as defined by the
     *         string comparison of both keywords.
     */
    @Override
    public int compareTo(Payload<T> other) {
        final String otherKeyword = other.getKeyword();
        return this.keyword.compareTo(otherKeyword);
    }
}
50 changes: 50 additions & 0 deletions src/main/java/org/ahocorasick/trie/PayloadEmit.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.ahocorasick.trie;

import org.ahocorasick.interval.Interval;
import org.ahocorasick.interval.Intervalable;

/**
 * A match reported for a text: the interval that matched, the keyword that
 * matched there, and the payload data associated with that keyword.
 *
 * @param <T> Type of the wrapped payload-data.
 * @author Daniel Beck
 */
public class PayloadEmit<T> extends Interval implements Intervalable {

    private final String keyword;

    private final T payload;

    /**
     * Creates a PayloadEmit.
     *
     * @param start   Start of the matched search term.
     * @param end     End of the matched search term.
     * @param keyword Keyword that matched.
     * @param payload Emitted payload data.
     */
    public PayloadEmit(final int start, final int end, final String keyword, final T payload) {
        super(start, end);
        this.keyword = keyword;
        this.payload = payload;
    }

    /**
     * Returns the keyword that matched.
     *
     * @return the matched keyword
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * Returns the payload associated with this emit.
     *
     * @return the associated payload
     */
    public T getPayload() {
        return this.payload;
    }

    /**
     * Renders the interval followed by {@code =keyword}; when a payload is
     * present, {@code ->payload} is appended as well.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append(super.toString()).append("=").append(this.keyword);
        if (this.payload != null) {
            text.append("->").append(this.payload);
        }
        return text.toString();
    }
}
31 changes: 31 additions & 0 deletions src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.ahocorasick.trie;

/**
 * PayloadFragmentToken holds a piece of text ("the fragment") that is not a
 * match.
 * <p>
 * Its <code>isMatch</code> method always returns false and its
 * <code>getEmit</code> method always returns null.
 *
 * @author Daniel Beck
 *
 * @param <T> The Type of the emitted payloads.
 */
public class PayloadFragmentToken<T> extends PayloadToken<T> {

    /**
     * Creates a token for the given fragment.
     *
     * @param fragment The text held by this token.
     */
    public PayloadFragmentToken(final String fragment) {
        super(fragment);
    }

    /**
     * Always false.
     */
    @Override
    public boolean isMatch() {
        return false;
    }

    /**
     * Always null.
     */
    @Override
    public PayloadEmit<T> getEmit() {
        return null;
    }
}
31 changes: 31 additions & 0 deletions src/main/java/org/ahocorasick/trie/PayloadMatchToken.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package org.ahocorasick.trie;

/**
 * PayloadMatchToken holds a piece of text ("the fragment") together with the
 * emit that was produced for it.
 * <p>
 * Its <code>isMatch</code> method always returns true.
 *
 * @author Daniel Beck
 *
 * @param <T> The Type of the emitted payloads.
 */
public class PayloadMatchToken<T> extends PayloadToken<T> {

    private final PayloadEmit<T> emit;

    /**
     * Creates a token for a fragment and its emit.
     *
     * @param fragment The text held by this token.
     * @param emit     The emit handed out by {@link #getEmit()}.
     */
    public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
        super(fragment);
        this.emit = emit;
    }

    /**
     * Always true.
     */
    @Override
    public boolean isMatch() {
        return true;
    }

    /**
     * Returns the emit passed to the constructor.
     */
    @Override
    public PayloadEmit<T> getEmit() {
        return this.emit;
    }
}
156 changes: 156 additions & 0 deletions src/main/java/org/ahocorasick/trie/PayloadState.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
package org.ahocorasick.trie;

import java.util.*;

/**
 * A state has various important tasks it must attend to:
 * <ul>
 * <li>success; when a character points to another state, it must return that
 * state</li>
 * <li>failure; when a character has no matching state, the algorithm must be
 * able to fall back on a state with less depth</li>
 * <li>emits; when this state is passed and keywords have been matched, the
 * matches and their payloads must be 'emitted' so that they can be used later
 * on.</li>
 * </ul>
 * <p>
 * The root state (depth 0) is special: a lookup for a character without a
 * transition yields the root state itself, unless the caller explicitly asks
 * to ignore the root state. For every other state such a lookup yields
 * {@code null}.
 * </p>
 *
 * @param <T> The type of the payload data emitted by this state.
 * @author Daniel Beck
 */
public class PayloadState<T> {

    /**
     * Number of transitions between the root and this state; the root has
     * depth 0 and every state created by {@link #addState(Character)} is one
     * deeper than the state it was added to.
     */
    private final int depth;

    /**
     * Refers to this state itself when it is the root (depth 0); null for all
     * other states.
     */
    private final PayloadState<T> rootState;

    /**
     * Referred to in the white paper as the 'goto' structure: maps a character
     * to the state that is reached through it.
     */
    private final Map<Character, PayloadState<T>> success = new HashMap<>();

    /**
     * The failure state of this state, as assigned through
     * {@link #setFailure(PayloadState)}.
     */
    private PayloadState<T> failure;

    /**
     * Payloads to emit when this state is reached; created lazily on the
     * first {@link #addEmit(Payload)}.
     */
    private Set<Payload<T>> emits;

    /**
     * Creates a root state (depth 0).
     */
    public PayloadState() {
        this(0);
    }

    /**
     * Creates a state of the given depth. A state of depth 0 is treated as
     * root state.
     *
     * @param depth Depth of the new state.
     */
    public PayloadState(final int depth) {
        this.depth = depth;
        if (depth == 0) {
            this.rootState = this;
        } else {
            this.rootState = null;
        }
    }

    /**
     * Looks up the transition for a character.
     *
     * @param character       Character to follow.
     * @param ignoreRootState When true, a missing transition always yields
     *                        null, even on the root state.
     * @return the target state; if there is no transition, the root state when
     *         this is the root and it is not ignored, null otherwise.
     */
    private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
        final PayloadState<T> target = this.success.get(character);

        if (target != null || ignoreRootState) {
            return target;
        }

        // No transition: the root falls back to itself, any other state has
        // a null rootState and therefore yields null.
        return this.rootState;
    }

    /**
     * Looks up the transition for a character; on the root state a missing
     * transition yields the root state itself.
     *
     * @param character Character to follow.
     * @return the next state, or null if there is none.
     */
    public PayloadState<T> nextState(final Character character) {
        return nextState(character, false);
    }

    /**
     * Looks up the transition for a character without falling back to the
     * root state.
     *
     * @param character Character to follow.
     * @return the next state, or null if there is no transition.
     */
    public PayloadState<T> nextStateIgnoreRootState(Character character) {
        return nextState(character, true);
    }

    /**
     * Adds the states for all characters of a keyword, starting at this state
     * and reusing transitions that already exist.
     *
     * @param keyword Keyword whose characters are added.
     * @return the state reached after the last character; this state for an
     *         empty keyword.
     */
    public PayloadState<T> addState(String keyword) {
        PayloadState<T> current = this;

        final int length = keyword.length();
        for (int index = 0; index < length; index++) {
            current = current.addState(Character.valueOf(keyword.charAt(index)));
        }

        return current;
    }

    /**
     * Returns the state reached through a character, creating it (one level
     * deeper than this state) when no such transition exists yet.
     *
     * @param character Character of the transition.
     * @return the existing or newly created target state.
     */
    public PayloadState<T> addState(Character character) {
        final PayloadState<T> existing = nextStateIgnoreRootState(character);
        if (existing != null) {
            return existing;
        }

        final PayloadState<T> created = new PayloadState<T>(this.depth + 1);
        this.success.put(character, created);
        return created;
    }

    /**
     * @return the depth of this state.
     */
    public int getDepth() {
        return this.depth;
    }

    /**
     * Adds a payload to be emitted for this state.
     *
     * @param payload Payload to be emitted.
     */
    public void addEmit(Payload<T> payload) {
        if (this.emits == null) {
            this.emits = new TreeSet<>();
        }
        this.emits.add(payload);
    }

    /**
     * Adds a collection of payloads to be emitted for this state.
     *
     * @param emits Collection of payloads to be emitted.
     */
    public void addEmit(Collection<Payload<T>> emits) {
        for (Payload<T> payload : emits) {
            addEmit(payload);
        }
    }

    /**
     * Returns a collection of emitted payloads for this state.
     *
     * @return the payloads added so far (the internal set itself, not a
     *         copy), or an empty list when none have been added.
     */
    public Collection<Payload<T>> emit() {
        if (this.emits == null) {
            return Collections.<Payload<T>>emptyList();
        }
        return this.emits;
    }

    /**
     * @return the failure state, or null if none has been set.
     */
    public PayloadState<T> failure() {
        return this.failure;
    }

    /**
     * Sets the failure state.
     *
     * @param failState State returned by {@link #failure()} from now on.
     */
    public void setFailure(PayloadState<T> failState) {
        this.failure = failState;
    }

    /**
     * @return the states directly reachable from this state.
     */
    public Collection<PayloadState<T>> getStates() {
        return this.success.values();
    }

    /**
     * @return the characters for which this state has a transition.
     */
    public Collection<Character> getTransitions() {
        return this.success.keySet();
    }
}
Loading

0 comments on commit 9f80565

Please sign in to comment.