|
| 1 | +package com.github.myibu.algorithm.compress; |
| 2 | + |
| 3 | +import com.github.myibu.algorithm.data.Bits; |
| 4 | +import com.github.myibu.algorithm.endode.GolombEncoder; |
| 5 | + |
| 6 | +import java.math.BigDecimal; |
| 7 | +import java.math.RoundingMode; |
| 8 | +import java.util.*; |
| 9 | +import java.util.stream.Collectors; |
| 10 | + |
| 11 | +/** |
| 12 | + * LZ77 compress algorithm |
| 13 | + * @author myibu |
| 14 | + * Created on 2021/10/11 |
| 15 | + */ |
| 16 | +public class LZ77Compressor implements Compressor { |
| 17 | + private static int DEFAULT_SEARCH_BUFFER_LENGTH = 7; |
| 18 | + private static int DEFAULT_LOOK_AHEAD_WINDOW_LENGTH = 5; |
| 19 | + |
| 20 | + /** |
| 21 | + * S is the length of the search buffer |
| 22 | + */ |
| 23 | + private int s; |
| 24 | + /** |
| 25 | + * L is the length of the look ahead window |
| 26 | + */ |
| 27 | + private int l; |
| 28 | + |
| 29 | + public LZ77Compressor() { |
| 30 | + s = DEFAULT_SEARCH_BUFFER_LENGTH; |
| 31 | + l = DEFAULT_LOOK_AHEAD_WINDOW_LENGTH; |
| 32 | + } |
| 33 | + |
| 34 | + /** |
| 35 | + * while look-ahead buffer is not empty |
| 36 | + * go backwards in search buffer to find longest match of the look-ahead buffer |
| 37 | + * if match found |
| 38 | + * print: (offset from window boundary, length of match, next symbol in look ahead buffer); |
| 39 | + * shift window by length+1; |
| 40 | + * else |
| 41 | + * print: (0, 0, first symbol in look-ahead buffer); |
| 42 | + * shift window by 1; |
| 43 | + * fi |
| 44 | + * end while |
| 45 | + * @param in_data input |
| 46 | + * @param in_len length of input |
| 47 | + * @param out_data output |
| 48 | + * @return offset in output |
| 49 | + */ |
| 50 | + @Override |
| 51 | + public int compress(byte[] in_data, int in_len, byte[] out_data) { |
| 52 | + // no need to compress |
| 53 | + if (l > in_len) { |
| 54 | + System.arraycopy(in_data, 0, out_data, 0, in_len); |
| 55 | + return in_len; |
| 56 | + } |
| 57 | + List<List<Integer>> tuples = new ArrayList<>(); |
| 58 | + // search buffer |
| 59 | + byte[] sBuf = new byte[s]; |
| 60 | + // look ahead window |
| 61 | + byte[] lWindow = new byte[l]; |
| 62 | + int sp = 0, lp = l, ip = 0, op = 0; |
| 63 | + while (lWindow.length > 0 && ip < in_len) { |
| 64 | + // update search buffer |
| 65 | + int sStart = 0, sEnd = sp < s ? sp : s; |
| 66 | + for (int i = sStart; i < sEnd; i++) { |
| 67 | + sBuf[i] = in_data[ip - i - 1]; |
| 68 | + } |
| 69 | + // update look ahead window |
| 70 | + int lStart = 0, lEnd = ip + l < in_len ? l : in_len - ip; |
| 71 | + if (lEnd < l) { |
| 72 | + lWindow = new byte[lEnd]; |
| 73 | + } |
| 74 | + for (int i = lStart; i < lEnd; i++) { |
| 75 | + lWindow[i] = in_data[ip + i]; |
| 76 | + } |
| 77 | + int llStart = sEnd - 1, rrStart = 0, llEnd = 0, rrEnd = (lp = lEnd); |
| 78 | + int minMatched = 1, minIndex = 0; |
| 79 | + for (int i = llStart; i >= 0; i--) { |
| 80 | + int matched = 0, left = i, right = rrStart; |
| 81 | + while (left >= llEnd && right < rrEnd && sBuf[left--] == lWindow[right++]) { |
| 82 | + matched++; |
| 83 | + } |
| 84 | + if (matched >= minMatched) { |
| 85 | + minIndex = i; |
| 86 | + minMatched = matched; |
| 87 | + } |
| 88 | + } |
| 89 | + int lWindowLen = lWindow.length; |
| 90 | + // only one byte in window, set tuple to (0, 0, lWindow[0]) |
| 91 | + if (lWindowLen == 1) { |
| 92 | + minIndex = 0; |
| 93 | + } |
| 94 | + // matched |
| 95 | + if (minIndex > 0) { |
| 96 | + tuples.add(Arrays.asList( minIndex + 1, minMatched, (minMatched == lWindowLen) ? null : (int)lWindow[minMatched])); |
| 97 | + sp += ((minMatched == lWindowLen) ? minMatched : (minMatched + 1)); |
| 98 | + ip += ((minMatched == lWindowLen) ? minMatched : (minMatched + 1)); |
| 99 | + } else { |
| 100 | + sp++; |
| 101 | + ip++; |
| 102 | + tuples.add(Arrays.asList(0, 0, (int)lWindow[0])); |
| 103 | + } |
| 104 | + if (isDebug) { |
| 105 | + System.out.println(", SearchBuffer=" |
| 106 | + + new StringBuilder(new String(sBuf)).reverse().toString() + ", LookaheadWindow=" + new String(lWindow) |
| 107 | + + " | " + tuples.get(tuples.size()-1)/* + " | " + (char)(tuples.get(tuples.size()-1).get(2).intValue())*/); |
| 108 | + } |
| 109 | + } |
| 110 | + int compressedLen = doEncode(tuples, out_data); |
| 111 | + if (isDebug) { |
| 112 | + System.out.println("after encode: compressed rate=" + new BigDecimal(compressedLen * 100.0 / in_len).setScale(2, RoundingMode.HALF_UP) + "%"); |
| 113 | + } |
| 114 | + return compressedLen; |
| 115 | + } |
| 116 | + |
| 117 | + private int doEncode(List<List<Integer>> tuples, byte[] out_data) { |
| 118 | + Bits finalRes = new Bits(); |
| 119 | + GolombEncoder encoder = new GolombEncoder(); |
| 120 | + for (List<Integer> tuple: tuples) { |
| 121 | + Bits bits = new Bits(); |
| 122 | + Bits bits1 = encoder.encodeToBinary(tuple.get(0), (int)(Math.ceil(Math.log(s) / Math.log(2)))); |
| 123 | + bits.append(bits1); |
| 124 | + Bits bits2 = encoder.encode(tuple.get(1), l); |
| 125 | + bits.append(bits2); |
| 126 | + Bits bits3 = new Bits(); |
| 127 | + if (tuple.get(2) != null) { |
| 128 | + bits3 = Bits.ofByte((byte) tuple.get(2).intValue()); |
| 129 | + bits.append(bits3); |
| 130 | + } |
| 131 | + if (isDebug) { |
| 132 | + System.out.println(tuple + " encoded result: " + "("+ bits1 + ", "+ bits2 + ", "+ bits3 + ")"); |
| 133 | + } |
| 134 | + finalRes.append(bits); |
| 135 | + } |
| 136 | + byte[] fr = finalRes.toByteArray(); |
| 137 | + System.arraycopy(fr, 0, out_data, 0, fr.length); |
| 138 | + if (isDebug) { |
| 139 | + System.out.println("after encode: bits=" + finalRes); |
| 140 | + } |
| 141 | + return fr.length; |
| 142 | + } |
| 143 | + |
| 144 | + /** |
| 145 | + * for each token (offset, length, symbol) |
| 146 | + * if offset = 0 then |
| 147 | + * print symbol; |
| 148 | + * else |
| 149 | + * go reverse in previous output by offset characters and copy |
| 150 | + * character wise for length symbols; |
| 151 | + * print symbol; |
| 152 | + * fi |
| 153 | + * next |
| 154 | + * @param in_data input |
| 155 | + * @param in_len length of input |
| 156 | + * @param out_data output |
| 157 | + * @return offset in output |
| 158 | + */ |
| 159 | + @Override |
| 160 | + public int decompress(byte[] in_data, int in_len, byte[] out_data) { |
| 161 | + int e1 = (int)(Math.ceil(Math.log(s) / Math.log(2))); |
| 162 | + GolombEncoder encoder = new GolombEncoder(); |
| 163 | + Set<Bits> allEncodeSeq = new HashSet<>(); |
| 164 | + for (int i = 0; i <= l; i++) { |
| 165 | + allEncodeSeq.add(encoder.encode(i, l)); |
| 166 | + } |
| 167 | + List<Bits> sortedEncodeSeq = allEncodeSeq.stream().sorted(Comparator.comparingInt(Bits::length)).collect(Collectors.toList()); |
| 168 | + Bits bits = Bits.ofByte(in_data); |
| 169 | + if (isDebug) { |
| 170 | + System.out.println("before decode: bits=" + bits); |
| 171 | + } |
| 172 | + int ip = 0; |
| 173 | + List<List<Integer>> tuples = new ArrayList<>(); |
| 174 | + while (ip < bits.length() && ip + e1 <= bits.length()) { |
| 175 | + Bits b1 = bits.subBits(ip, ip + e1); |
| 176 | + ip = ip + e1; |
| 177 | + int offset = encoder.encodeToBinary(b1); |
| 178 | + int length = -1; |
| 179 | + for (Bits sortedEncode: sortedEncodeSeq) { |
| 180 | + if (ip + sortedEncode.length() < bits.length()) { |
| 181 | + if (sortedEncode.equals(bits.subBits(ip, ip+sortedEncode.length()))) { |
| 182 | + length = encoder.decode(sortedEncode, l); |
| 183 | + ip += sortedEncode.length(); |
| 184 | + break; |
| 185 | + } |
| 186 | + } |
| 187 | + } |
| 188 | + if (length == -1 ) { |
| 189 | + break; |
| 190 | + } |
| 191 | + if (length != l && ip + 8 <= bits.length()) { |
| 192 | + int symbol = (int) bits.subBits(ip, ip + 8).toByte(); |
| 193 | + tuples.add(Arrays.asList(offset, length, symbol)); |
| 194 | + ip += 8; |
| 195 | + } else { |
| 196 | + tuples.add(Arrays.asList(offset, length, null)); |
| 197 | + } |
| 198 | + } |
| 199 | + if (isDebug) { |
| 200 | + System.out.println("decode tuples=" + tuples); |
| 201 | + } |
| 202 | + return doDecode(tuples, out_data); |
| 203 | + } |
| 204 | + |
| 205 | + private int doDecode(List<List<Integer>> tuples, byte[] out_data) { |
| 206 | + Bits seq = new Bits(); |
| 207 | + for (List<Integer> tuple: tuples) { |
| 208 | + int offset = tuple.get(0), length = tuple.get(1); |
| 209 | + if (tuple.get(2) != null) { |
| 210 | + int symbol = tuple.get(2); |
| 211 | + Bits sb = Bits.ofByte((byte) symbol); |
| 212 | + if (offset == 0) { |
| 213 | + seq.append(sb); |
| 214 | + if (isDebug) { |
| 215 | + System.out.println(tuple + ", seq=" + new String(seq.toByteArray())); |
| 216 | + } |
| 217 | + } else { |
| 218 | + int start = seq.byteLength() < s ? seq.byteLength() - offset: s - offset; |
| 219 | + int used = seq.byteLength() < s ? 0 : seq.byteLength() - s; |
| 220 | + seq.append(seq.subBits((used + start) * 8, (used + start + length) * 8)).append(sb); |
| 221 | + if (isDebug) { |
| 222 | + System.out.println(tuple + ", seq=" + new String(seq.toByteArray())); |
| 223 | + } |
| 224 | + } |
| 225 | + } else { |
| 226 | + int start = seq.byteLength() < s ? seq.byteLength() - offset: s - offset; |
| 227 | + int used = seq.byteLength() < s ? 0 : seq.byteLength() - s; |
| 228 | + seq.append(seq.subBits((used + start) * 8, (used + start + length) * 8)); |
| 229 | + if (isDebug) { |
| 230 | + System.out.println(tuple + ", seq=" + new String(seq.toByteArray())); |
| 231 | + } |
| 232 | + } |
| 233 | + } |
| 234 | + if (isDebug) { |
| 235 | + System.out.println("after decode, bits=" + seq); |
| 236 | + } |
| 237 | + int len = seq.byteLength(); |
| 238 | + for (int i = 0; i < len; i++) { |
| 239 | + out_data[i] = seq.getByte(i).toByte(); |
| 240 | + } |
| 241 | + return len; |
| 242 | + } |
| 243 | + |
| 244 | + |
| 245 | + private boolean isDebug = false; |
| 246 | + |
| 247 | + @Override |
| 248 | + public void setDebug(boolean isDebug) { |
| 249 | + this.isDebug = isDebug; |
| 250 | + } |
| 251 | + |
| 252 | + public void setSL(int s, int l) { |
| 253 | + this.s = s; |
| 254 | + this.l = l; |
| 255 | + } |
| 256 | +} |
0 commit comments