Skip to content

Commit fe02f18

Browse files
committed
fixed trie building, closes #9
1 parent 6be74b1 commit fe02f18

File tree

6 files changed

+121
-134
lines changed

6 files changed

+121
-134
lines changed

pom.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,12 @@
4444
<dependency>
4545
<groupId>net.amygdalum</groupId>
4646
<artifactId>compilerutils</artifactId>
47-
<version>0.2.2</version>
47+
<version>0.2.3</version>
4848
</dependency>
4949
<dependency>
5050
<groupId>junit</groupId>
5151
<artifactId>junit</artifactId>
52-
<version>4.12</version>
52+
<version>4.13</version>
5353
<scope>test</scope>
5454
</dependency>
5555
<dependency>
@@ -67,13 +67,13 @@
6767
<dependency>
6868
<groupId>net.amygdalum</groupId>
6969
<artifactId>xrayinterface</artifactId>
70-
<version>0.3.2</version>
70+
<version>0.3.3</version>
7171
<scope>test</scope>
7272
</dependency>
7373
<dependency>
7474
<groupId>org.mockito</groupId>
7575
<artifactId>mockito-core</artifactId>
76-
<version>2.23.4</version>
76+
<version>2.28.2</version>
7777
<scope>test</scope>
7878
</dependency>
7979
</dependencies>

src/main/java/net/amygdalum/stringsearchalgorithms/search/bytes/SetBackwardOracleMatching.java

Lines changed: 37 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import java.util.Arrays;
1313
import java.util.Collection;
1414
import java.util.IdentityHashMap;
15-
import java.util.Iterator;
1615
import java.util.LinkedList;
1716
import java.util.List;
1817
import java.util.Map;
@@ -41,23 +40,23 @@
4140
*/
4241
public class SetBackwardOracleMatching implements StringSearchAlgorithm {
4342

44-
private ByteWordSet<List<byte[]>> trie;
43+
private ByteWordSet<byte[][]> trie;
4544
private int minLength;
4645

4746
public SetBackwardOracleMatching(Collection<String> patterns, Charset charset) {
48-
List<byte[]> bytepatterns = toByteArray(patterns, charset);
47+
byte[][] bytepatterns = toByteArray(patterns, charset).toArray(new byte[0][]);
4948
this.minLength = minLength(bytepatterns);
5049
this.trie = computeTrie(bytepatterns, minLength);
5150
}
5251

53-
private static ByteWordSet<List<byte[]>> computeTrie(List<byte[]> bytepatterns, int length) {
54-
ByteWordSetBuilder<List<byte[]>, ByteDawg<List<byte[]>>> builder = new ByteWordSetBuilder<>(new LinkedByteDawgCompiler<List<byte[]>>(), new MergePatterns());
52+
private static ByteWordSet<byte[][]> computeTrie(byte[][] bytepatterns, int length) {
53+
ByteWordSetBuilder<byte[][], ByteDawg<byte[][]>> builder = new ByteWordSetBuilder<>(new LinkedByteDawgCompiler<byte[][]>(), new MergePatterns());
5554

5655
for (byte[] pattern : bytepatterns) {
5756
byte[] prefix = copyOfRange(pattern, 0, length);
5857
byte[] reversePrefix = revert(prefix);
5958
byte[] suffix = copyOfRange(pattern, length, pattern.length);
60-
builder.extend(reversePrefix, asList(prefix, suffix));
59+
builder.extend(reversePrefix, new byte[][] {prefix, suffix});
6160
}
6261
builder.work(new BuildOracle());
6362

@@ -79,43 +78,53 @@ public String toString() {
7978
return getClass().getSimpleName();
8079
}
8180

82-
public static class MergePatterns implements JoinStrategy<List<byte[]>> {
81+
public static class MergePatterns implements JoinStrategy<byte[][]> {
8382

8483
@Override
85-
public List<byte[]> join(List<byte[]> existing, List<byte[]> next) {
84+
public byte[][] join(byte[][] existing, byte[][] next) {
8685
if (existing == null) {
87-
return new ArrayList<>(next);
86+
return next;
8887
} else {
89-
existing.add(next.get(1));
90-
return existing;
88+
byte[][] result = new byte[existing.length + 1][];
89+
byte[] insert = next[1];
90+
int i = 1;
91+
while (i < existing.length && existing[i].length > insert.length) {
92+
i++;
93+
}
94+
System.arraycopy(existing, 0, result, 0, i);
95+
result[i] = insert;
96+
if (i < existing.length) {
97+
System.arraycopy(existing, i, result, i + 1, existing.length - i);
98+
}
99+
return result;
91100
}
92101
}
93102

94103
}
95104

96-
public static class BuildOracle implements ByteTask<List<byte[]>> {
97-
private Map<ByteNode<List<byte[]>>, ByteNode<List<byte[]>>> oracle;
98-
private ByteNode<List<byte[]>> init;
105+
public static class BuildOracle implements ByteTask<byte[][]> {
106+
private Map<ByteNode<byte[][]>, ByteNode<byte[][]>> oracle;
107+
private ByteNode<byte[][]> init;
99108

100109
public BuildOracle() {
101110
oracle = new IdentityHashMap<>();
102111
}
103112

104113
@Override
105-
public List<ByteNode<List<byte[]>>> init(ByteNode<List<byte[]>> root) {
114+
public List<ByteNode<byte[][]>> init(ByteNode<byte[][]> root) {
106115
this.init = root;
107116
return asList(root);
108117
}
109118

110119
@Override
111-
public List<ByteNode<List<byte[]>>> process(ByteNode<List<byte[]>> node) {
112-
List<ByteNode<List<byte[]>>> nexts = new ArrayList<>();
120+
public List<ByteNode<byte[][]>> process(ByteNode<byte[][]> node) {
121+
List<ByteNode<byte[][]>> nexts = new ArrayList<>();
113122
for (byte b : node.getAlternatives()) {
114-
ByteNode<List<byte[]>> current = node.nextNode(b);
123+
ByteNode<byte[][]> current = node.nextNode(b);
115124

116-
ByteNode<List<byte[]>> down = oracle.get(node);
125+
ByteNode<byte[][]> down = oracle.get(node);
117126
while (down != null) {
118-
ByteNode<List<byte[]>> next = down.nextNode(b);
127+
ByteNode<byte[][]> next = down.nextNode(b);
119128
if (next != null) {
120129
oracle.put(current, next);
121130
break;
@@ -133,8 +142,8 @@ public List<ByteNode<List<byte[]>>> process(ByteNode<List<byte[]>> node) {
133142
}
134143

135144
@SuppressWarnings("unchecked")
136-
private void addNextNode(ByteNode<List<byte[]>> node, byte b, ByteNode<List<byte[]>> next) {
137-
((ByteConnectionAdaptor<List<byte[]>>) node).addNextNode(b, next);
145+
private void addNextNode(ByteNode<byte[][]> node, byte b, ByteNode<byte[][]> next) {
146+
((ByteConnectionAdaptor<byte[][]>) node).addNextNode(b, next);
138147
}
139148
}
140149

@@ -143,10 +152,10 @@ private static class Finder extends AbstractStringFinder {
143152
private final int minLength;
144153
private final int lookahead;
145154
private ByteProvider bytes;
146-
private ByteAutomaton<List<byte[]>> cursor;
155+
private ByteAutomaton<byte[][]> cursor;
147156
private Queue<StringMatch> buffer;
148157

149-
public Finder(ByteWordSet<List<byte[]>> trie, int minLength, ByteProvider bytes, StringFinderOption... options) {
158+
public Finder(ByteWordSet<byte[][]> trie, int minLength, ByteProvider bytes, StringFinderOption... options) {
150159
super(options);
151160
this.minLength = minLength;
152161
this.lookahead = minLength - 1;
@@ -181,12 +190,11 @@ public StringMatch findNext() {
181190
long currentWindowEnd = currentWindowStart + minLength;
182191
byte[] matchedPrefix = bytes.between(currentPos, currentWindowEnd);
183192
if (success && j < 0) {
184-
List<byte[]> patterns = cursor.iterator().next();
185-
Iterator<byte[]> iPatterns = patterns.iterator();
186-
byte[] prefix = iPatterns.next();
193+
byte[][] patterns = cursor.iterator().next();
194+
byte[] prefix = patterns[0];
187195
if (Arrays.equals(prefix, matchedPrefix)) {
188-
while (iPatterns.hasNext()) {
189-
byte[] suffix = iPatterns.next();
196+
for (int i = 1; i < patterns.length; i++) {
197+
byte[] suffix = patterns[i];
190198
long currentWordEnd = currentWindowEnd + suffix.length;
191199
if (!bytes.finished((int) (currentWordEnd - currentWindowStart - 1))) {
192200
byte[] matchedSuffix = bytes.between(currentWindowEnd, currentWordEnd);

src/main/java/net/amygdalum/stringsearchalgorithms/search/chars/SetBackwardOracleMatching.java

Lines changed: 46 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import java.util.Collections;
1313
import java.util.HashSet;
1414
import java.util.IdentityHashMap;
15-
import java.util.Iterator;
1615
import java.util.LinkedList;
1716
import java.util.List;
1817
import java.util.Map;
@@ -44,7 +43,7 @@
4443
public class SetBackwardOracleMatching implements StringSearchAlgorithm {
4544

4645
private CharMapping mapping;
47-
private CharWordSet<List<char[]>> trie;
46+
private CharWordSet<char[][]> trie;
4847
private int minLength;
4948

5049
public SetBackwardOracleMatching(Collection<String> patterns) {
@@ -58,22 +57,22 @@ public SetBackwardOracleMatching(Collection<String> patterns, CharMapping mappin
5857
this.trie = computeTrie(normalized(mapping, charpatterns), minLength, mapping);
5958
}
6059

61-
private List<char[]> normalized(CharMapping mapping, List<char[]> charpatterns) {
60+
private char[][] normalized(CharMapping mapping, List<char[]> charpatterns) {
6261
List<char[]> normalized = new ArrayList<>(charpatterns.size());
6362
for (char[] cs : charpatterns) {
6463
normalized.add(mapping.normalized(cs));
6564
}
66-
return normalized;
65+
return normalized.toArray(new char[0][]);
6766
}
6867

69-
private static CharWordSet<List<char[]>> computeTrie(List<char[]> charpatterns, int length, CharMapping mapping) {
70-
CharWordSetBuilder<List<char[]>, CharDawg<List<char[]>>> builder = new CharWordSetBuilder<>(new LinkedCharDawgCompiler<List<char[]>>(), new MergePatterns());
68+
private static CharWordSet<char[][]> computeTrie(char[][] charpatterns, int length, CharMapping mapping) {
69+
CharWordSetBuilder<char[][], CharDawg<char[][]>> builder = new CharWordSetBuilder<>(new LinkedCharDawgCompiler<char[][]>(), new MergePatterns());
7170

7271
for (char[] pattern : charpatterns) {
7372
char[] prefix = copyOfRange(pattern, 0, length);
7473
char[] reversePrefix = revert(prefix);
7574
char[] suffix = copyOfRange(pattern, length, pattern.length);
76-
builder.extend(reversePrefix, asList(prefix, suffix));
75+
builder.extend(reversePrefix, new char[][] {prefix, suffix});
7776
}
7877
builder.work(new BuildOracle());
7978
builder.work(new UseCharClasses(mapping));
@@ -96,43 +95,53 @@ public String toString() {
9695
return getClass().getSimpleName();
9796
}
9897

99-
public static class MergePatterns implements JoinStrategy<List<char[]>> {
98+
public static class MergePatterns implements JoinStrategy<char[][]> {
10099

101100
@Override
102-
public List<char[]> join(List<char[]> existing, List<char[]> next) {
101+
public char[][] join(char[][] existing, char[][] next) {
103102
if (existing == null) {
104-
return new ArrayList<>(next);
103+
return next;
105104
} else {
106-
existing.add(next.get(1));
107-
return existing;
105+
char[][] result = new char[existing.length + 1][];
106+
char[] insert = next[1];
107+
int i = 1;
108+
while (i < existing.length && existing[i].length > insert.length) {
109+
i++;
110+
}
111+
System.arraycopy(existing, 0, result, 0, i);
112+
result[i] = insert;
113+
if (i < existing.length) {
114+
System.arraycopy(existing, i, result, i + 1, existing.length - i);
115+
}
116+
return result;
108117
}
109118
}
110119

111120
}
112121

113-
public static class BuildOracle implements CharTask<List<char[]>> {
114-
private Map<CharNode<List<char[]>>, CharNode<List<char[]>>> oracle;
115-
private CharNode<List<char[]>> init;
122+
public static class BuildOracle implements CharTask<char[][]> {
123+
private Map<CharNode<char[][]>, CharNode<char[][]>> oracle;
124+
private CharNode<char[][]> init;
116125

117126
public BuildOracle() {
118127
oracle = new IdentityHashMap<>();
119128
}
120129

121130
@Override
122-
public List<CharNode<List<char[]>>> init(CharNode<List<char[]>> root) {
131+
public List<CharNode<char[][]>> init(CharNode<char[][]> root) {
123132
this.init = root;
124133
return asList(root);
125134
}
126135

127136
@Override
128-
public List<CharNode<List<char[]>>> process(CharNode<List<char[]>> node) {
129-
List<CharNode<List<char[]>>> nexts = new ArrayList<>();
137+
public List<CharNode<char[][]>> process(CharNode<char[][]> node) {
138+
List<CharNode<char[][]>> nexts = new ArrayList<>();
130139
for (char c : node.getAlternatives()) {
131-
CharNode<List<char[]>> current = node.nextNode(c);
140+
CharNode<char[][]> current = node.nextNode(c);
132141

133-
CharNode<List<char[]>> down = oracle.get(node);
142+
CharNode<char[][]> down = oracle.get(node);
134143
while (down != null) {
135-
CharNode<List<char[]>> next = down.nextNode(c);
144+
CharNode<char[][]> next = down.nextNode(c);
136145
if (next != null) {
137146
oracle.put(current, next);
138147
break;
@@ -150,35 +159,35 @@ public List<CharNode<List<char[]>>> process(CharNode<List<char[]>> node) {
150159
}
151160

152161
@SuppressWarnings("unchecked")
153-
private void addNextNode(CharNode<List<char[]>> node, char c, CharNode<List<char[]>> next) {
154-
((CharConnectionAdaptor<List<char[]>>) node).addNextNode(c, next);
162+
private void addNextNode(CharNode<char[][]> node, char c, CharNode<char[][]> next) {
163+
((CharConnectionAdaptor<char[][]>) node).addNextNode(c, next);
155164
}
156165
}
157166

158-
public static class UseCharClasses implements CharTask<List<char[]>> {
167+
public static class UseCharClasses implements CharTask<char[][]> {
159168

160169
private CharMapping mapping;
161-
private Set<CharNode<List<char[]>>> done;
170+
private Set<CharNode<char[][]>> done;
162171

163172
public UseCharClasses(CharMapping mapping) {
164173
this.mapping = mapping;
165174
this.done = new HashSet<>();
166175
}
167176

168177
@Override
169-
public List<CharNode<List<char[]>>> init(CharNode<List<char[]>> root) {
178+
public List<CharNode<char[][]>> init(CharNode<char[][]> root) {
170179
if (mapping == CharMapping.IDENTITY) {
171180
return Collections.emptyList();
172181
}
173182
return asList(root);
174183
}
175184

176185
@Override
177-
public List<CharNode<List<char[]>>> process(CharNode<List<char[]>> node) {
178-
List<CharNode<List<char[]>>> nexts = new ArrayList<>();
186+
public List<CharNode<char[][]>> process(CharNode<char[][]> node) {
187+
List<CharNode<char[][]>> nexts = new ArrayList<>();
179188

180189
for (char c : node.getAlternatives()) {
181-
CharNode<List<char[]>> next = node.nextNode(c);
190+
CharNode<char[][]> next = node.nextNode(c);
182191
for (char cc : mapping.map(c)) {
183192
addNextNode(node, cc, next);
184193
}
@@ -191,8 +200,8 @@ public List<CharNode<List<char[]>>> process(CharNode<List<char[]>> node) {
191200
}
192201

193202
@SuppressWarnings("unchecked")
194-
private void addNextNode(CharNode<List<char[]>> node, char c, CharNode<List<char[]>> next) {
195-
((CharConnectionAdaptor<List<char[]>>) node).addNextNode(c, next);
203+
private void addNextNode(CharNode<char[][]> node, char c, CharNode<char[][]> next) {
204+
((CharConnectionAdaptor<char[][]>) node).addNextNode(c, next);
196205
}
197206
}
198207

@@ -202,10 +211,10 @@ private static class Finder extends AbstractStringFinder {
202211
private final int lookahead;
203212
private final CharMapping mapping;
204213
private CharProvider chars;
205-
private CharAutomaton<List<char[]>> cursor;
214+
private CharAutomaton<char[][]> cursor;
206215
private Queue<StringMatch> buffer;
207216

208-
public Finder(CharWordSet<List<char[]>> trie, int minLength, CharMapping mapping, CharProvider chars, StringFinderOption... options) {
217+
public Finder(CharWordSet<char[][]> trie, int minLength, CharMapping mapping, CharProvider chars, StringFinderOption... options) {
209218
super(options);
210219
this.minLength = minLength;
211220
this.lookahead = minLength - 1;
@@ -241,12 +250,11 @@ public StringMatch findNext() {
241250
long currentWindowEnd = currentWindowStart + minLength;
242251
char[] matchedPrefix = chars.between(currentPos, currentWindowEnd);
243252
if (success && j < 0) {
244-
List<char[]> patterns = cursor.iterator().next();
245-
Iterator<char[]> iPatterns = patterns.iterator();
246-
char[] prefix = iPatterns.next();
253+
char[][] patterns = cursor.iterator().next();
254+
char[] prefix = patterns[0];
247255
if (Arrays.equals(prefix, mapping.normalized(matchedPrefix))) {
248-
while (iPatterns.hasNext()) {
249-
char[] suffix = iPatterns.next();
256+
for (int i = 1; i < patterns.length; i++) {
257+
char[] suffix = patterns[i];
250258
long currentWordEnd = currentWindowEnd + suffix.length;
251259
if (!chars.finished((int) (currentWordEnd - currentWindowStart - 1))) {
252260
char[] matchedSuffix = chars.between(currentWindowEnd, currentWordEnd);

0 commit comments

Comments
 (0)