Skip to content

Commit f718f71

Browse files
authored
Merge pull request #5 from myibu/master
Master
2 parents 63d7b7c + 72a21d0 commit f718f71

File tree

19 files changed

+2309
-14
lines changed

19 files changed

+2309
-14
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ tasks.withType(JavaCompile) { options.encoding = "UTF-8" }
2626

2727
group = 'com.github.myibu'
2828
archivesBaseName = "algorithm-java"
29-
version = "1.0.0a"
29+
version = "1.0.0c"
3030

3131
repositories {
3232
mavenCentral()

docs/DictionaryTree.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
hi
2+
hello
3+
nihao
4+
see
5+
hey
6+
7+
start: 初始状态,end:结束状态
8+
State {
9+
sides
10+
next-State
11+
}
12+
13+
14+
1.初始化
15+
State root = State.start;
16+
17+
2.插入字符串序列
18+
- 遍历字符串序列,判断当前遍历的字符是否已经存在起始状态的边
19+
- 如果存在,状态转移至已存在的边指向的后一个状态;否则新建一个状态,并将当前状态指向新建的状态;
20+
- 如果已经到了字符串末尾,则指向结束状态
21+
```
22+
start -> [h] -> 1 -> [i] -> 2 -> end
23+
start -> [h] -> 1 -> [e] -> 3 -> [l] -> 4 -> [l] -> 5 -> [o] -> 6 -> end
24+
start -> [n] -> 7 -> [i] -> 8 -> [h] -> 9 -> [a] -> 10 -> [o] -> 11 -> end
25+
start -> [s] -> 12 -> [e] -> 13 -> [e] -> 14 -> end
26+
start -> [h] -> 1 -> [e] -> 3 -> [y] -> 15 -> end
27+
```
28+
29+
3.查看文本包含哪些字典单词
30+
- 遍历文本,判断当前遍历的字符是否已经存在起始状态的边
31+
- 如果存在,状态转移至已存在的边指向的后一个状态;否则回到起始状态,从文本的下一个字符继续匹配
32+
- 如果已经到了字符串末尾,找到并记录
33+

docs/HoffmanAndGolombCoding.pdf

947 KB
Binary file not shown.

docs/LZ77.pdf

169 KB
Binary file not shown.

readme.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,22 @@ Reference to: [LinearCongruence.pdf](./docs/LinearCongruence.pdf)
3434
### MersenneTwisterRandom
3535
Reference to: [MersenneTwister.pdf](./docs/MersenneTwister.pdf)
3636

37+
### DFASensitiveWordFilter
38+
39+
### AhoCorasickSensitiveWordFilter
40+
41+
### LZ77Compressor
42+
Reference to: [LZ77.pdf](./docs/LZ77.pdf)
43+
44+
### GolombEncoder
45+
Reference to: [HoffmanAndGolombCoding.pdf](./docs/HoffmanAndGolombCoding.pdf)
46+
3747
## Installation
3848
```xml
3949
<dependency>
4050
<groupId>com.github.myibu</groupId>
4151
<artifactId>algorithm-java</artifactId>
42-
<version>1.0.0a</version>
52+
<version>1.0.0c</version>
4353
</dependency>
4454
```
4555

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.github.myibu.algorithm.compress;
2+
3+
/**
4+
* Contract for byte-array codecs that can both compress and decompress data.
5+
* @author myibu
6+
* Created on 2021/10/11
7+
*/
8+
public interface Compressor extends Debugable {
9+
/**
10+
* Compresses the input bytes into the caller-supplied output buffer.
11+
* @param in_data input bytes to compress
12+
* @param in_len number of bytes of {@code in_data} to read
13+
* @param out_data buffer that receives the compressed bytes
14+
* @return offset in output just past the last byte written (the compressed length when writing starts at index 0)
15+
*/
16+
int compress(byte[] in_data, int in_len, byte[] out_data);
17+
18+
/**
19+
* Decompresses previously compressed bytes into the caller-supplied output buffer.
20+
* @param in_data compressed input bytes
21+
* @param in_len number of bytes of {@code in_data} that hold compressed data
22+
* @param out_data buffer that receives the decompressed bytes
23+
* @return offset in output just past the last byte written (the decompressed length when writing starts at index 0)
24+
*/
25+
int decompress(byte[] in_data, int in_len, byte[] out_data);
26+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package com.github.myibu.algorithm.compress;
2+
3+
/**
4+
* Implemented by components whose debug output can be switched on or off.
5+
* @author myibu
6+
* Created on 2021/10/15
7+
*/
8+
public interface Debugable {
9+
/**
10+
* Enables or disables debug mode; implementations should default to disabled.
11+
* @param isDebug {@code true} to enable debug mode, {@code false} to disable it
12+
*/
13+
void setDebug(boolean isDebug);
14+
}
Lines changed: 256 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,256 @@
1+
package com.github.myibu.algorithm.compress;
2+
3+
import com.github.myibu.algorithm.data.Bits;
4+
import com.github.myibu.algorithm.endode.GolombEncoder;
5+
6+
import java.math.BigDecimal;
7+
import java.math.RoundingMode;
8+
import java.util.*;
9+
import java.util.stream.Collectors;
10+
11+
/**
12+
* LZ77 compress algorithm
13+
* @author myibu
14+
* Created on 2021/10/11
15+
*/
16+
public class LZ77Compressor implements Compressor {
17+
private static int DEFAULT_SEARCH_BUFFER_LENGTH = 7;
18+
private static int DEFAULT_LOOK_AHEAD_WINDOW_LENGTH = 5;
19+
20+
/**
21+
* S is the length of the search buffer
22+
*/
23+
private int s;
24+
/**
25+
* L is the length of the look ahead window
26+
*/
27+
private int l;
28+
29+
public LZ77Compressor() {
30+
s = DEFAULT_SEARCH_BUFFER_LENGTH;
31+
l = DEFAULT_LOOK_AHEAD_WINDOW_LENGTH;
32+
}
33+
34+
/**
35+
* while look-ahead buffer is not empty
36+
* go backwards in search buffer to find longest match of the look-ahead buffer
37+
* if match found
38+
* print: (offset from window boundary, length of match, next symbol in look ahead buffer);
39+
* shift window by length+1;
40+
* else
41+
* print: (0, 0, first symbol in look-ahead buffer);
42+
* shift window by 1;
43+
* fi
44+
* end while
45+
* @param in_data input
46+
* @param in_len length of input
47+
* @param out_data output
48+
* @return offset in output
49+
*/
50+
@Override
51+
public int compress(byte[] in_data, int in_len, byte[] out_data) {
52+
// no need to compress
53+
if (l > in_len) {
54+
System.arraycopy(in_data, 0, out_data, 0, in_len);
55+
return in_len;
56+
}
57+
List<List<Integer>> tuples = new ArrayList<>();
58+
// search buffer
59+
byte[] sBuf = new byte[s];
60+
// look ahead window
61+
byte[] lWindow = new byte[l];
62+
int sp = 0, lp = l, ip = 0, op = 0;
63+
while (lWindow.length > 0 && ip < in_len) {
64+
// update search buffer
65+
int sStart = 0, sEnd = sp < s ? sp : s;
66+
for (int i = sStart; i < sEnd; i++) {
67+
sBuf[i] = in_data[ip - i - 1];
68+
}
69+
// update look ahead window
70+
int lStart = 0, lEnd = ip + l < in_len ? l : in_len - ip;
71+
if (lEnd < l) {
72+
lWindow = new byte[lEnd];
73+
}
74+
for (int i = lStart; i < lEnd; i++) {
75+
lWindow[i] = in_data[ip + i];
76+
}
77+
int llStart = sEnd - 1, rrStart = 0, llEnd = 0, rrEnd = (lp = lEnd);
78+
int minMatched = 1, minIndex = 0;
79+
for (int i = llStart; i >= 0; i--) {
80+
int matched = 0, left = i, right = rrStart;
81+
while (left >= llEnd && right < rrEnd && sBuf[left--] == lWindow[right++]) {
82+
matched++;
83+
}
84+
if (matched >= minMatched) {
85+
minIndex = i;
86+
minMatched = matched;
87+
}
88+
}
89+
int lWindowLen = lWindow.length;
90+
// only one byte in window, set tuple to (0, 0, lWindow[0])
91+
if (lWindowLen == 1) {
92+
minIndex = 0;
93+
}
94+
// matched
95+
if (minIndex > 0) {
96+
tuples.add(Arrays.asList( minIndex + 1, minMatched, (minMatched == lWindowLen) ? null : (int)lWindow[minMatched]));
97+
sp += ((minMatched == lWindowLen) ? minMatched : (minMatched + 1));
98+
ip += ((minMatched == lWindowLen) ? minMatched : (minMatched + 1));
99+
} else {
100+
sp++;
101+
ip++;
102+
tuples.add(Arrays.asList(0, 0, (int)lWindow[0]));
103+
}
104+
if (isDebug) {
105+
System.out.println(", SearchBuffer="
106+
+ new StringBuilder(new String(sBuf)).reverse().toString() + ", LookaheadWindow=" + new String(lWindow)
107+
+ " | " + tuples.get(tuples.size()-1)/* + " | " + (char)(tuples.get(tuples.size()-1).get(2).intValue())*/);
108+
}
109+
}
110+
int compressedLen = doEncode(tuples, out_data);
111+
if (isDebug) {
112+
System.out.println("after encode: compressed rate=" + new BigDecimal(compressedLen * 100.0 / in_len).setScale(2, RoundingMode.HALF_UP) + "%");
113+
}
114+
return compressedLen;
115+
}
116+
117+
private int doEncode(List<List<Integer>> tuples, byte[] out_data) {
118+
Bits finalRes = new Bits();
119+
GolombEncoder encoder = new GolombEncoder();
120+
for (List<Integer> tuple: tuples) {
121+
Bits bits = new Bits();
122+
Bits bits1 = encoder.encodeToBinary(tuple.get(0), (int)(Math.ceil(Math.log(s) / Math.log(2))));
123+
bits.append(bits1);
124+
Bits bits2 = encoder.encode(tuple.get(1), l);
125+
bits.append(bits2);
126+
Bits bits3 = new Bits();
127+
if (tuple.get(2) != null) {
128+
bits3 = Bits.ofByte((byte) tuple.get(2).intValue());
129+
bits.append(bits3);
130+
}
131+
if (isDebug) {
132+
System.out.println(tuple + " encoded result: " + "("+ bits1 + ", "+ bits2 + ", "+ bits3 + ")");
133+
}
134+
finalRes.append(bits);
135+
}
136+
byte[] fr = finalRes.toByteArray();
137+
System.arraycopy(fr, 0, out_data, 0, fr.length);
138+
if (isDebug) {
139+
System.out.println("after encode: bits=" + finalRes);
140+
}
141+
return fr.length;
142+
}
143+
144+
/**
145+
* for each token (offset, length, symbol)
146+
* if offset = 0 then
147+
* print symbol;
148+
* else
149+
* go reverse in previous output by offset characters and copy
150+
* character wise for length symbols;
151+
* print symbol;
152+
* fi
153+
* next
154+
* @param in_data input
155+
* @param in_len length of input
156+
* @param out_data output
157+
* @return offset in output
158+
*/
159+
@Override
160+
public int decompress(byte[] in_data, int in_len, byte[] out_data) {
161+
int e1 = (int)(Math.ceil(Math.log(s) / Math.log(2)));
162+
GolombEncoder encoder = new GolombEncoder();
163+
Set<Bits> allEncodeSeq = new HashSet<>();
164+
for (int i = 0; i <= l; i++) {
165+
allEncodeSeq.add(encoder.encode(i, l));
166+
}
167+
List<Bits> sortedEncodeSeq = allEncodeSeq.stream().sorted(Comparator.comparingInt(Bits::length)).collect(Collectors.toList());
168+
Bits bits = Bits.ofByte(in_data);
169+
if (isDebug) {
170+
System.out.println("before decode: bits=" + bits);
171+
}
172+
int ip = 0;
173+
List<List<Integer>> tuples = new ArrayList<>();
174+
while (ip < bits.length() && ip + e1 <= bits.length()) {
175+
Bits b1 = bits.subBits(ip, ip + e1);
176+
ip = ip + e1;
177+
int offset = encoder.encodeToBinary(b1);
178+
int length = -1;
179+
for (Bits sortedEncode: sortedEncodeSeq) {
180+
if (ip + sortedEncode.length() < bits.length()) {
181+
if (sortedEncode.equals(bits.subBits(ip, ip+sortedEncode.length()))) {
182+
length = encoder.decode(sortedEncode, l);
183+
ip += sortedEncode.length();
184+
break;
185+
}
186+
}
187+
}
188+
if (length == -1 ) {
189+
break;
190+
}
191+
if (length != l && ip + 8 <= bits.length()) {
192+
int symbol = (int) bits.subBits(ip, ip + 8).toByte();
193+
tuples.add(Arrays.asList(offset, length, symbol));
194+
ip += 8;
195+
} else {
196+
tuples.add(Arrays.asList(offset, length, null));
197+
}
198+
}
199+
if (isDebug) {
200+
System.out.println("decode tuples=" + tuples);
201+
}
202+
return doDecode(tuples, out_data);
203+
}
204+
205+
private int doDecode(List<List<Integer>> tuples, byte[] out_data) {
206+
Bits seq = new Bits();
207+
for (List<Integer> tuple: tuples) {
208+
int offset = tuple.get(0), length = tuple.get(1);
209+
if (tuple.get(2) != null) {
210+
int symbol = tuple.get(2);
211+
Bits sb = Bits.ofByte((byte) symbol);
212+
if (offset == 0) {
213+
seq.append(sb);
214+
if (isDebug) {
215+
System.out.println(tuple + ", seq=" + new String(seq.toByteArray()));
216+
}
217+
} else {
218+
int start = seq.byteLength() < s ? seq.byteLength() - offset: s - offset;
219+
int used = seq.byteLength() < s ? 0 : seq.byteLength() - s;
220+
seq.append(seq.subBits((used + start) * 8, (used + start + length) * 8)).append(sb);
221+
if (isDebug) {
222+
System.out.println(tuple + ", seq=" + new String(seq.toByteArray()));
223+
}
224+
}
225+
} else {
226+
int start = seq.byteLength() < s ? seq.byteLength() - offset: s - offset;
227+
int used = seq.byteLength() < s ? 0 : seq.byteLength() - s;
228+
seq.append(seq.subBits((used + start) * 8, (used + start + length) * 8));
229+
if (isDebug) {
230+
System.out.println(tuple + ", seq=" + new String(seq.toByteArray()));
231+
}
232+
}
233+
}
234+
if (isDebug) {
235+
System.out.println("after decode, bits=" + seq);
236+
}
237+
int len = seq.byteLength();
238+
for (int i = 0; i < len; i++) {
239+
out_data[i] = seq.getByte(i).toByte();
240+
}
241+
return len;
242+
}
243+
244+
245+
private boolean isDebug = false;
246+
247+
@Override
248+
public void setDebug(boolean isDebug) {
249+
this.isDebug = isDebug;
250+
}
251+
252+
public void setSL(int s, int l) {
253+
this.s = s;
254+
this.l = l;
255+
}
256+
}

0 commit comments

Comments
 (0)