Skip to content

Commit 7e5d9d4

Browse files
authored
feat: add HuffmanCoding with fail-fast validation and immutable design (#7289)
1 parent 023f856 commit 7e5d9d4

File tree

2 files changed

+363
-0
lines changed

2 files changed

+363
-0
lines changed
Lines changed: 253 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,253 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.Collections;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
import java.util.PriorityQueue;
7+
8+
/**
 * Huffman Coding Compression Algorithm Implementation.
 * <p>
 * Huffman Coding is a popular greedy algorithm used for lossless data compression.
 * It reduces the overall size of data by assigning variable-length, prefix-free
 * binary codes to input characters, ensuring that more frequent characters receive
 * the shortest possible codes.
 * </p>
 * <p>
 * <strong>Key Features:</strong>
 * </p>
 * <ul>
 * <li>Uses a PriorityQueue (min-heap) to efficiently construct the optimal prefix tree.</li>
 * <li>Fail-fast design throws exceptions for unsupported characters and malformed binary payloads.</li>
 * <li>Immutable state: both the tree root and the code dictionary are assigned once in the constructor.</li>
 * <li>Robust handling of edge cases, including single-character strings and incomplete sequences.</li>
 * </ul>
 * <p>
 * Input is processed per UTF-16 {@code char}; a supplementary code point (surrogate pair)
 * is therefore treated as two independent symbols, which still round-trips correctly.
 * </p>
 *
 * @author Chahat Sandhu, <a href="https://github.com/singhc7">singhc7</a>
 * @see <a href="https://en.wikipedia.org/wiki/Huffman_coding">Huffman Coding (Wikipedia)</a>
 */
public class HuffmanCoding {

    /** Root of the Huffman tree, or {@code null} when the instance was built from null/empty text. */
    private final Node root;

    /** Unmodifiable dictionary mapping each character of the source text to its binary code. */
    private final Map<Character, String> huffmanCodes;

    /**
     * Represents a node within the Huffman Tree.
     * Implements {@link Comparable} to allow sorting by frequency in a PriorityQueue.
     */
    private static class Node implements Comparable<Node> {
        final char ch;
        final int freq;
        final Node left;
        final Node right;

        /**
         * Constructs a leaf node containing a specific character and its frequency.
         *
         * @param ch   The character stored in this leaf.
         * @param freq The frequency of occurrence of the character.
         */
        Node(char ch, int freq) {
            this.ch = ch;
            this.freq = freq;
            this.left = null;
            this.right = null;
        }

        /**
         * Constructs an internal node that merges two child nodes.
         * The character is defaulted to the null character ('\0'); it is never read for
         * internal nodes because decoding only inspects {@code ch} on leaves.
         *
         * @param freq  The combined frequency of the left and right child nodes.
         * @param left  The left child node.
         * @param right The right child node.
         */
        Node(int freq, Node left, Node right) {
            this.ch = '\0';
            this.freq = freq;
            this.left = left;
            this.right = right;
        }

        /**
         * Determines if the current node is a leaf (contains no children).
         *
         * @return {@code true} if both left and right children are null, {@code false} otherwise.
         */
        boolean isLeaf() {
            return left == null && right == null;
        }

        /**
         * Compares this node with another node based on their frequencies.
         * Used by the PriorityQueue to maintain the min-heap property.
         *
         * @param other The other Node to compare against.
         * @return A negative integer, zero, or a positive integer as this node's frequency
         *         is less than, equal to, or greater than the specified node's frequency.
         */
        @Override
        public int compareTo(Node other) {
            return Integer.compare(this.freq, other.freq);
        }
    }

    /**
     * Initializes the Huffman Tree and generates immutable prefix-free codes
     * based on the character frequencies in the provided text.
     *
     * @param text The input string used to calculate frequencies and build the optimal tree.
     *             If null or empty, an empty tree and dictionary are created.
     */
    public HuffmanCoding(String text) {
        if (text == null || text.isEmpty()) {
            this.root = null;
            this.huffmanCodes = Collections.emptyMap();
            return;
        }

        this.root = buildTree(text);

        Map<Character, String> codes = new HashMap<>();
        generateCodes(this.root, new StringBuilder(), codes);
        this.huffmanCodes = Collections.unmodifiableMap(codes);
    }

    /**
     * Computes character frequencies and constructs the Huffman Tree using a min-heap.
     * The optimal tree is built by repeatedly extracting the two lowest-frequency nodes
     * and merging them until a single root node remains.
     *
     * @param text The non-empty input text to analyze.
     * @return The root of the constructed tree; a leaf when the text has one distinct character.
     */
    private static Node buildTree(String text) {
        Map<Character, Integer> freqMap = new HashMap<>();
        for (int i = 0; i < text.length(); i++) {
            freqMap.merge(text.charAt(i), 1, Integer::sum);
        }

        PriorityQueue<Node> pq = new PriorityQueue<>();
        for (Map.Entry<Character, Integer> entry : freqMap.entrySet()) {
            pq.add(new Node(entry.getKey(), entry.getValue()));
        }

        // The first node polled becomes the left ('0') child, the second the right ('1') child.
        while (pq.size() > 1) {
            Node left = pq.poll();
            Node right = pq.poll();
            pq.add(new Node(left.freq + right.freq, left, right));
        }

        return pq.poll();
    }

    /**
     * Recursively traverses the Huffman Tree to generate prefix-free binary codes.
     * Left traversals append a '0' to the code, while right traversals append a '1'.
     * A tree consisting of a single leaf has no edges, so its only character is given
     * the code {@code "0"} to keep every code non-empty.
     *
     * @param node   The current (non-null) node in the traversal.
     * @param prefix The accumulated bits for the path from the root to {@code node};
     *               restored to its incoming content before this method returns.
     * @param codes  The temporary dictionary to populate with the final character-to-code mappings.
     */
    private static void generateCodes(Node node, StringBuilder prefix, Map<Character, String> codes) {
        if (node.isLeaf()) {
            codes.put(node.ch, prefix.length() == 0 ? "0" : prefix.toString());
            return;
        }
        // Internal nodes are only ever created with two children, so both are non-null here.
        prefix.append('0');
        generateCodes(node.left, prefix, codes);
        prefix.setCharAt(prefix.length() - 1, '1');
        generateCodes(node.right, prefix, codes);
        prefix.setLength(prefix.length() - 1);
    }

    /**
     * Encodes the given plaintext string into a binary string using the generated Huffman dictionary.
     *
     * @param text The plaintext string to compress.
     * @return A string of '0's and '1's representing the compressed data.
     *         Returns an empty string if the input is null or empty.
     * @throws IllegalStateException    If attempting to encode when the Huffman tree is empty.
     * @throws IllegalArgumentException If the input text contains a character not present
     *                                  in the original text used to build the tree.
     */
    public String encode(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }
        if (root == null) {
            throw new IllegalStateException("Huffman tree is empty.");
        }

        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            // Codes are never null, so a null result means the character is unknown.
            String code = huffmanCodes.get(c);
            if (code == null) {
                throw new IllegalArgumentException(String.format("Character '%c' (U+%04X) not found in Huffman dictionary.", c, (int) c));
            }
            sb.append(code);
        }
        return sb.toString();
    }

    /**
     * Decodes the given binary string back into the original plaintext using the Huffman Tree.
     * Validates the integrity of the binary payload during traversal.
     *
     * @param encodedText The binary string of '0's and '1's to decompress.
     * @return The reconstructed plaintext string. Returns an empty string if the input is null or empty.
     * @throws IllegalStateException    If attempting to decode when the Huffman tree is empty.
     * @throws IllegalArgumentException If the binary string contains characters other than '0' or '1',
     *                                  or if the sequence ends abruptly without reaching a leaf node.
     */
    public String decode(String encodedText) {
        if (encodedText == null || encodedText.isEmpty()) {
            return "";
        }
        if (root == null) {
            throw new IllegalStateException("Huffman tree is empty.");
        }

        StringBuilder sb = new StringBuilder();

        // Single-character tree: the only valid code is "0", one bit per output character.
        if (root.isLeaf()) {
            for (int i = 0; i < encodedText.length(); i++) {
                if (encodedText.charAt(i) != '0') {
                    throw new IllegalArgumentException("Invalid binary sequence for single-character tree.");
                }
                sb.append(root.ch);
            }
            return sb.toString();
        }

        Node current = root;
        for (int i = 0; i < encodedText.length(); i++) {
            char bit = encodedText.charAt(i);
            if (bit != '0' && bit != '1') {
                throw new IllegalArgumentException("Encoded text contains invalid characters: " + bit);
            }

            current = (bit == '0') ? current.left : current.right;

            if (current.isLeaf()) {
                sb.append(current.ch);
                current = root;
            }
        }

        // Ending anywhere but the root means the last code was cut off mid-path.
        if (current != root) {
            throw new IllegalArgumentException("Malformed encoded string: incomplete sequence ending.");
        }

        return sb.toString();
    }

    /**
     * Retrieves the generated Huffman dictionary mapping characters to their binary codes.
     *
     * @return An unmodifiable map containing the character-to-binary-code mappings to prevent
     *         external mutation of the algorithm's state.
     */
    public Map<Character, String> getHuffmanCodes() {
        return huffmanCodes;
    }
}
Lines changed: 110 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,110 @@
1+
package com.thealgorithms.compression;
2+
3+
import static org.junit.jupiter.api.Assertions.assertEquals;
4+
import static org.junit.jupiter.api.Assertions.assertNotNull;
5+
import static org.junit.jupiter.api.Assertions.assertThrows;
6+
import static org.junit.jupiter.api.Assertions.assertTrue;
7+
8+
import org.junit.jupiter.api.Test;
9+
10+
class HuffmanCodingTest {
11+
12+
@Test
13+
void testStandardLifecycle() {
14+
String input = "efficiency is key";
15+
HuffmanCoding huffman = new HuffmanCoding(input);
16+
17+
String encoded = huffman.encode(input);
18+
assertNotNull(encoded);
19+
assertTrue(encoded.matches("[01]+"));
20+
assertEquals(input, huffman.decode(encoded));
21+
}
22+
23+
@Test
24+
void testNullAndEmptyHandling() {
25+
HuffmanCoding huffman = new HuffmanCoding("");
26+
assertEquals("", huffman.encode(""));
27+
assertEquals("", huffman.decode(""));
28+
29+
HuffmanCoding huffmanNull = new HuffmanCoding(null);
30+
assertEquals("", huffmanNull.encode(null));
31+
assertEquals("", huffmanNull.decode(null));
32+
}
33+
34+
@Test
35+
void testSingleCharacterEdgeCase() {
36+
String input = "aaaaa";
37+
HuffmanCoding huffman = new HuffmanCoding(input);
38+
39+
String encoded = huffman.encode(input);
40+
assertEquals("00000", encoded);
41+
assertEquals(input, huffman.decode(encoded));
42+
}
43+
44+
@Test
45+
void testUnicodeAndSpecialCharacters() {
46+
// Tests spacing, symbols, non-latin alphabets, and surrogate pairs (emojis)
47+
String input = "Hello, World! 🚀\nLine 2: こんにちは";
48+
HuffmanCoding huffman = new HuffmanCoding(input);
49+
50+
String encoded = huffman.encode(input);
51+
assertEquals(input, huffman.decode(encoded));
52+
}
53+
54+
@Test
55+
void testFailFastOnUnseenCharacter() {
56+
HuffmanCoding huffman = new HuffmanCoding("abc");
57+
58+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
59+
() -> huffman.encode("abcd") // 'd' was not in the original tree
60+
);
61+
assertTrue(exception.getMessage().contains("not found in Huffman dictionary"));
62+
}
63+
64+
@Test
65+
void testFailFastOnInvalidBinaryCharacter() {
66+
HuffmanCoding huffman = new HuffmanCoding("abc");
67+
String encoded = huffman.encode("abc");
68+
69+
// Inject a '2' into the binary stream
70+
String corruptedEncoded = encoded + "2";
71+
72+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> huffman.decode(corruptedEncoded));
73+
assertTrue(exception.getMessage().contains("contains invalid characters"));
74+
}
75+
76+
@Test
77+
void testFailFastOnIncompleteSequence() {
78+
HuffmanCoding huffman = new HuffmanCoding("abcd");
79+
String encoded = huffman.encode("abc");
80+
81+
// Truncate the last bit to simulate an incomplete byte/sequence transfer
82+
String truncatedEncoded = encoded.substring(0, encoded.length() - 1);
83+
84+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> huffman.decode(truncatedEncoded));
85+
assertTrue(exception.getMessage().contains("incomplete sequence"));
86+
}
87+
88+
@Test
89+
void testImmutabilityOfDictionary() {
90+
HuffmanCoding huffman = new HuffmanCoding("abc");
91+
var codes = huffman.getHuffmanCodes();
92+
93+
assertThrows(UnsupportedOperationException.class, () -> codes.put('z', "0101"));
94+
}
95+
96+
@Test
97+
void testStressVolume() {
98+
StringBuilder sb = new StringBuilder();
99+
// Generate a 100,000 character string
100+
for (int i = 0; i < 100000; i++) {
101+
sb.append((char) ('a' + (i % 26)));
102+
}
103+
String largeInput = sb.toString();
104+
105+
HuffmanCoding huffman = new HuffmanCoding(largeInput);
106+
String encoded = huffman.encode(largeInput);
107+
108+
assertEquals(largeInput, huffman.decode(encoded));
109+
}
110+
}

0 commit comments

Comments
 (0)