Skip to content

Commit ce78182

Browse files
committed
feat: add HuffmanCoding with fail-fast validation and immutable design
1 parent 023f856 commit ce78182

File tree

2 files changed

+349
-0
lines changed

2 files changed

+349
-0
lines changed
Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.Collections;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
import java.util.PriorityQueue;
7+
8+
/**
9+
* Huffman Coding Compression Algorithm Implementation.
10+
* <p>
11+
* Huffman Coding is a popular greedy algorithm used for lossless data compression.
12+
* It reduces the overall size of data by assigning variable-length, prefix-free
13+
* binary codes to input characters, ensuring that more frequent characters receive
14+
* the shortest possible codes.
15+
* </p>
16+
* <p>
17+
* <strong>Key Features:</strong>
18+
* <ul>
19+
* <li>Uses a PriorityQueue (min-heap) to efficiently construct the optimal prefix tree.</li>
20+
* <li>Fail-fast design throws exceptions for unsupported characters and malformed binary payloads.</li>
21+
* <li>Immutable internal dictionary state prevents external tampering with generated codes.</li>
22+
* <li>Robust handling of edge cases, including single-character strings and incomplete sequences.</li>
23+
* </ul>
24+
* </p>
25+
* @author Chahat Sandhu, <a href="https://github.com/singhc7">singhc7</a>
26+
* @see <a href="https://en.wikipedia.org/wiki/Huffman_coding">Huffman Coding (Wikipedia)</a>
27+
*/
28+
public class HuffmanCoding {
29+
30+
private Node root;
31+
private final Map<Character, String> huffmanCodes;
32+
33+
/**
34+
* Represents a node within the Huffman Tree.
35+
* Implements {@link Comparable} to allow sorting by frequency in a PriorityQueue.
36+
*/
37+
private static class Node implements Comparable<Node> {
38+
final char ch;
39+
final int freq;
40+
final Node left, right;
41+
42+
/**
43+
* Constructs a leaf node containing a specific character and its frequency.
44+
*
45+
* @param ch The character stored in this leaf.
46+
* @param freq The frequency of occurrence of the character.
47+
*/
48+
Node(char ch, int freq) {
49+
this.ch = ch;
50+
this.freq = freq;
51+
this.left = null;
52+
this.right = null;
53+
}
54+
55+
/**
56+
* Constructs an internal node that merges two child nodes.
57+
* The character is defaulted to the null character ('\0').
58+
*
59+
* @param freq The combined frequency of the left and right child nodes.
60+
* @param left The left child node.
61+
* @param right The right child node.
62+
*/
63+
Node(int freq, Node left, Node right) {
64+
this.ch = '\0';
65+
this.freq = freq;
66+
this.left = left;
67+
this.right = right;
68+
}
69+
70+
/**
71+
* Determines if the current node is a leaf (contains no children).
72+
*
73+
* @return {@code true} if both left and right children are null, {@code false} otherwise.
74+
*/
75+
boolean isLeaf() {
76+
return left == null && right == null;
77+
}
78+
79+
/**
80+
* Compares this node with another node based on their frequencies.
81+
* Used by the PriorityQueue to maintain the min-heap property.
82+
*
83+
* @param other The other Node to compare against.
84+
* @return A negative integer, zero, or a positive integer as this node's frequency
85+
* is less than, equal to, or greater than the specified node's frequency.
86+
*/
87+
@Override
88+
public int compareTo(Node other) {
89+
return Integer.compare(this.freq, other.freq);
90+
}
91+
}
92+
93+
/**
94+
* Initializes the Huffman Tree and generates immutable prefix-free codes
95+
* based on the character frequencies in the provided text.
96+
*
97+
* @param text The input string used to calculate frequencies and build the optimal tree.
98+
* If null or empty, an empty tree and dictionary are created.
99+
*/
100+
public HuffmanCoding(String text) {
101+
if (text == null || text.isEmpty()) {
102+
this.huffmanCodes = Collections.emptyMap();
103+
return;
104+
}
105+
106+
Map<Character, String> tempCodes = new HashMap<>();
107+
buildTree(text);
108+
generateCodes(root, "", tempCodes);
109+
110+
if (tempCodes.size() == 1) {
111+
tempCodes.put(root.ch, "0");
112+
}
113+
114+
this.huffmanCodes = Collections.unmodifiableMap(tempCodes);
115+
}
116+
117+
/**
118+
* Computes character frequencies and constructs the Huffman Tree using a min-heap.
119+
* The optimal tree is built by repeatedly extracting the two lowest-frequency nodes
120+
* and merging them until a single root node remains.
121+
*
122+
* @param text The input text to analyze.
123+
*/
124+
private void buildTree(String text) {
125+
Map<Character, Integer> freqMap = new HashMap<>();
126+
for (char c : text.toCharArray()) {
127+
freqMap.put(c, freqMap.getOrDefault(c, 0) + 1);
128+
}
129+
130+
PriorityQueue<Node> pq = new PriorityQueue<>();
131+
for (Map.Entry<Character, Integer> entry : freqMap.entrySet()) {
132+
pq.add(new Node(entry.getKey(), entry.getValue()));
133+
}
134+
135+
while (pq.size() > 1) {
136+
Node left = pq.poll();
137+
Node right = pq.poll();
138+
pq.add(new Node(left.freq + right.freq, left, right));
139+
}
140+
141+
root = pq.poll();
142+
}
143+
144+
/**
145+
* Recursively traverses the Huffman Tree to generate prefix-free binary codes.
146+
* Left traversals append a '0' to the code, while right traversals append a '1'.
147+
*
148+
* @param node The current node in the traversal.
149+
* @param code The accumulated binary string for the current path.
150+
* @param map The temporary dictionary to populate with the final character-to-code mappings.
151+
*/
152+
private void generateCodes(Node node, String code, Map<Character, String> map) {
153+
if (node == null) return;
154+
if (node.isLeaf()) {
155+
map.put(node.ch, code);
156+
return;
157+
}
158+
generateCodes(node.left, code + "0", map);
159+
generateCodes(node.right, code + "1", map);
160+
}
161+
162+
/**
163+
* Encodes the given plaintext string into a binary string using the generated Huffman dictionary.
164+
*
165+
* @param text The plaintext string to compress.
166+
* @return A string of '0's and '1's representing the compressed data.
167+
* Returns an empty string if the input is null or empty.
168+
* @throws IllegalStateException If attempting to encode when the Huffman tree is empty.
169+
* @throws IllegalArgumentException If the input text contains a character not present
170+
* in the original text used to build the tree.
171+
*/
172+
public String encode(String text) {
173+
if (text == null || text.isEmpty()) return "";
174+
if (root == null) throw new IllegalStateException("Huffman tree is empty.");
175+
176+
StringBuilder sb = new StringBuilder();
177+
for (char c : text.toCharArray()) {
178+
if (!huffmanCodes.containsKey(c)) {
179+
throw new IllegalArgumentException(String.format("Character '%c' (U+%04X) not found in Huffman dictionary.", c, (int) c));
180+
}
181+
sb.append(huffmanCodes.get(c));
182+
}
183+
return sb.toString();
184+
}
185+
186+
/**
187+
* Decodes the given binary string back into the original plaintext using the Huffman Tree.
188+
* Validates the integrity of the binary payload during traversal.
189+
*
190+
* @param encodedText The binary string of '0's and '1's to decompress.
191+
* @return The reconstructed plaintext string. Returns an empty string if the input is null or empty.
192+
* @throws IllegalStateException If attempting to decode when the Huffman tree is empty.
193+
* @throws IllegalArgumentException If the binary string contains characters other than '0' or '1',
194+
* or if the sequence ends abruptly without reaching a leaf node.
195+
*/
196+
public String decode(String encodedText) {
197+
if (encodedText == null || encodedText.isEmpty()) return "";
198+
if (root == null) throw new IllegalStateException("Huffman tree is empty.");
199+
200+
StringBuilder sb = new StringBuilder();
201+
202+
if (root.isLeaf()) {
203+
for (char bit : encodedText.toCharArray()) {
204+
if (bit != '0') {
205+
throw new IllegalArgumentException("Invalid binary sequence for single-character tree.");
206+
}
207+
sb.append(root.ch);
208+
}
209+
return sb.toString();
210+
}
211+
212+
Node current = root;
213+
for (char bit : encodedText.toCharArray()) {
214+
if (bit != '0' && bit != '1') {
215+
throw new IllegalArgumentException("Encoded text contains invalid characters: " + bit);
216+
}
217+
218+
current = (bit == '0') ? current.left : current.right;
219+
220+
if (current.isLeaf()) {
221+
sb.append(current.ch);
222+
current = root;
223+
}
224+
}
225+
226+
if (current != root) {
227+
throw new IllegalArgumentException("Malformed encoded string: incomplete sequence ending.");
228+
}
229+
230+
return sb.toString();
231+
}
232+
233+
/**
234+
* Retrieves the generated Huffman dictionary mapping characters to their binary codes.
235+
*
236+
* @return An unmodifiable map containing the character-to-binary-code mappings to prevent
237+
* external mutation of the algorithm's state.
238+
*/
239+
public Map<Character, String> getHuffmanCodes() {
240+
return huffmanCodes;
241+
}
242+
}
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package com.thealgorithms.compression;
2+
3+
import static org.junit.jupiter.api.Assertions.*;
4+
5+
import org.junit.jupiter.api.Test;
6+
7+
class HuffmanCodingTest {
8+
9+
@Test
10+
void testStandardLifecycle() {
11+
String input = "efficiency is key";
12+
HuffmanCoding huffman = new HuffmanCoding(input);
13+
14+
String encoded = huffman.encode(input);
15+
assertNotNull(encoded);
16+
assertTrue(encoded.matches("[01]+"));
17+
assertEquals(input, huffman.decode(encoded));
18+
}
19+
20+
@Test
21+
void testNullAndEmptyHandling() {
22+
HuffmanCoding huffman = new HuffmanCoding("");
23+
assertEquals("", huffman.encode(""));
24+
assertEquals("", huffman.decode(""));
25+
26+
HuffmanCoding huffmanNull = new HuffmanCoding(null);
27+
assertEquals("", huffmanNull.encode(null));
28+
assertEquals("", huffmanNull.decode(null));
29+
}
30+
31+
@Test
32+
void testSingleCharacterEdgeCase() {
33+
String input = "aaaaa";
34+
HuffmanCoding huffman = new HuffmanCoding(input);
35+
36+
String encoded = huffman.encode(input);
37+
assertEquals("00000", encoded);
38+
assertEquals(input, huffman.decode(encoded));
39+
}
40+
41+
@Test
42+
void testUnicodeAndSpecialCharacters() {
43+
// Tests spacing, symbols, non-latin alphabets, and surrogate pairs (emojis)
44+
String input = "Hello, World! 🚀\nLine 2: こんにちは";
45+
HuffmanCoding huffman = new HuffmanCoding(input);
46+
47+
String encoded = huffman.encode(input);
48+
assertEquals(input, huffman.decode(encoded));
49+
}
50+
51+
@Test
52+
void testFailFastOnUnseenCharacter() {
53+
HuffmanCoding huffman = new HuffmanCoding("abc");
54+
55+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class,
56+
() -> huffman.encode("abcd") // 'd' was not in the original tree
57+
);
58+
assertTrue(exception.getMessage().contains("not found in Huffman dictionary"));
59+
}
60+
61+
@Test
62+
void testFailFastOnInvalidBinaryCharacter() {
63+
HuffmanCoding huffman = new HuffmanCoding("abc");
64+
String encoded = huffman.encode("abc");
65+
66+
// Inject a '2' into the binary stream
67+
String corruptedEncoded = encoded + "2";
68+
69+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> huffman.decode(corruptedEncoded));
70+
assertTrue(exception.getMessage().contains("contains invalid characters"));
71+
}
72+
73+
@Test
74+
void testFailFastOnIncompleteSequence() {
75+
HuffmanCoding huffman = new HuffmanCoding("abcd");
76+
String encoded = huffman.encode("abc");
77+
78+
// Truncate the last bit to simulate an incomplete byte/sequence transfer
79+
String truncatedEncoded = encoded.substring(0, encoded.length() - 1);
80+
81+
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> huffman.decode(truncatedEncoded));
82+
assertTrue(exception.getMessage().contains("incomplete sequence"));
83+
}
84+
85+
@Test
86+
void testImmutabilityOfDictionary() {
87+
HuffmanCoding huffman = new HuffmanCoding("abc");
88+
var codes = huffman.getHuffmanCodes();
89+
90+
assertThrows(UnsupportedOperationException.class, () -> codes.put('z', "0101"));
91+
}
92+
93+
@Test
94+
void testStressVolume() {
95+
StringBuilder sb = new StringBuilder();
96+
// Generate a 100,000 character string
97+
for (int i = 0; i < 100000; i++) {
98+
sb.append((char) ('a' + (i % 26)));
99+
}
100+
String largeInput = sb.toString();
101+
102+
HuffmanCoding huffman = new HuffmanCoding(largeInput);
103+
String encoded = huffman.encode(largeInput);
104+
105+
assertEquals(largeInput, huffman.decode(encoded));
106+
}
107+
}

0 commit comments

Comments
 (0)