rustcoreutils · jgarzik · May 5, 2024 · May 5, 2024
diff --git a/xform/src/lzw.rs b/xform/src/lzw.rs
@@ -9,9 +9,6 @@
 // History:  Adapted from posixutils/compress/zopen.cc, which was in turn
 // adapted from FreeBSD's zopen.c.
 //
-// TODO:
-// - FIXME: file tail truncated (data corruption)
-//
 
 use std::io::{self, Error, ErrorKind, Read};
 
@@ -26,37 +23,105 @@ const CLEAR: i32 = 256;
 
 const RMASK: [i32; 9] = [0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff];
 
+/// A wrapper around the Read trait object used
+/// for reading the compressed file or
+/// file to be compressed
+struct CompReader {
+    inner_rdr: Box<dyn Read>,
+}
+
+impl CompReader {
+    fn new(rdr: Box<dyn Read>) -> Self {
+        Self { inner_rdr: rdr }
+    }
+
+    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        let mut total_read = 0;
+
+        while total_read < buf.len() {
+            match self.inner_rdr.read(&mut buf[total_read..]) {
+                Ok(n) => {
+                    if n == 0 {
+                        break;
+                    }
+                    total_read += n;
+                }
+                Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
+                Err(e) => {
+                    return Err(e);
+                }
+            }
+        }
+
+        if total_read == 0 {
+            return Err(io::Error::new(ErrorKind::UnexpectedEof, "Unexpected EOF"));
+        }
+
+        Ok(total_read)
+    }
+}
+
 fn max_code(n_bits: u32) -> u32 {
     (1 << (n_bits)) - 1
 }
 
 pub struct UnixLZWReader {
-    rdr: Box<dyn Read>,
+    /// the reader of the compressed file or the file to be compressed
+    rdr: CompReader,
+
+    /// if the compressed file has header or not
     have_hdr: bool,
+
+    /// if the eof has been reached or not
     eof: bool,
 
+    /// the max no of bits for the code(maxmaxcode is derived from this)
     maxbits: u32,
+
+    /// the current max no of bits for the code
     n_bits: u32,
+
+    /// if BLOCK_MASK is enabled in the compressed file or not
     block_compress: bool,
+
+    /// It indicates if the buffer has to be cleared or not
     clear: bool,
+
     code: i32,
+
+    /// the previously recognized code
     oldcode: i32,
+
     incode: i32,
+
+    /// the max no of codes that can be created of n_bits
     maxcode: i32,
+
+    /// the max value of maxcode
     maxmaxcode: i32,
+
+    /// the next free entry on the table
     free_ent: i32,
+
     finchar: i32,
+
+    /// It's the current read offset
     roffset: i32,
+
     size: i32,
+
+    /// Buffer to fill as we go on read  from the read stream
     gbuf: [u8; BITS as usize],
+
     tab_suffix: [i32; HSIZE],
+
     tab_prefix: [u16; HSIZE],
 }
 
 impl UnixLZWReader {
     pub fn new(rdr: Box<dyn Read>) -> UnixLZWReader {
         UnixLZWReader {
-            rdr,
+            rdr: CompReader::new(rdr),
             have_hdr: false,
             eof: false,
             maxbits: 0,
@@ -80,29 +145,37 @@ impl UnixLZWReader {
 
     fn getcode(&mut self) -> i32 {
         if self.clear || self.roffset >= self.size || self.free_ent > self.maxcode {
+            // as free_ent represents the index of the next available entry that can be made
+            // on the table, so if its more than the self.maxcode (i.e max allowed no of codes),
+            // which is derived from the current no of bits of code, then we need to expand
+            // our entry by increasing the n_bits by 1 and then updating the max_code
+            // from that
             if self.free_ent > self.maxcode {
-                self.n_bits = self.n_bits + 1;
-                if self.n_bits == self.maxcode as u32 {
-                    self.maxcode = self.maxmaxcode;
+                self.n_bits += 1;
+                self.maxcode = if self.n_bits == self.maxbits {
+                    self.maxmaxcode
                 } else {
-                    self.maxcode = max_code(self.n_bits) as i32;
-                }
+                    max_code(self.n_bits) as i32
+                };
             }
 
+            // reset the table entry back to smallest one
             if self.clear {
                 self.n_bits = INIT_BITS;
                 self.maxcode = max_code(self.n_bits) as i32;
                 self.clear = false;
             }
 
+            // the buffer for current max n of bits
             let gbuf = &mut self.gbuf[0..self.n_bits as usize];
 
-            let res = self.rdr.read_exact(gbuf);
-            if res.is_err() {
-                return -1;
+            match self.rdr.read_exact(gbuf) {
+                Ok(n) => {
+                    self.size = n as i32;
+                }
+                Err(_) => return -1,
             }
 
-            self.size = gbuf.len() as i32;
             self.roffset = 0;
             self.size = (self.size << 3) - (self.n_bits - 1) as i32;
         }
@@ -132,11 +205,12 @@ impl UnixLZWReader {
         gcode
     }
 
+    /// Read from the compressed stream of file
     pub fn read(&mut self) -> io::Result<Vec<u8>> {
         let mut outbytes: Vec<u8> = Vec::new();
 
         if !self.have_hdr {
-            // 3-byte header.  2 byte magic, 1 byte a bitmask of options.
+            // 3-byte header: 2 byte magic, 1 byte a bitmask of options.
             let mut header = [0; 3];
             self.rdr.read_exact(&mut header)?;
 
@@ -149,17 +223,33 @@ impl UnixLZWReader {
                 ));
             }
 
+            // the third byte has bitmask of options
+            // (Eg) if it has 10011111
+            // that means the first bit represents the BLOCK_MASK i.e block_compress
+            // has to be enabled or not
+            //
+            // the bit that we get onwards represent the bit position of the value we want as
+            // max no bits
+            // if it's at 5th position then it means 2^4 = 16
             let options = header[2];
+
             self.maxbits = (options & HDR_BIT_MASK) as u32;
             self.block_compress = (options & HDR_BLOCK_MASK) != 0;
 
             if self.maxbits > BITS {
                 return Err(Error::new(ErrorKind::Other, "invalid file header: bits"));
             }
 
+            // the max value that self.maxcode can have, which is derived
+            // from the maxbits that codes can have
+            // hence, 2^(self.maxbits)
             self.maxmaxcode = 1 << self.maxbits;
-            self.n_bits = INIT_BITS;
-            self.maxcode = max_code(self.n_bits) as i32;
+
+            // the no of bits of code that we start with
+            // btw, this no of bits also represent the fact that there can be
+            // 2 ^ (self.n_bits) entries in the table initially
+            self.n_bits = INIT_BITS; // 9
+            self.maxcode = max_code(self.n_bits) as i32; // 511
 
             for code in (0..=255).rev() {
                 let idx: usize = code as usize;
@@ -169,6 +259,8 @@ impl UnixLZWReader {
             }
 
             if self.block_compress {
+                // TODO: understand why we need to skip one index (i.e 256) (initial guess is that
+                // we need that index reserverd for CLEAR)
                 self.free_ent = FIRST;
             } else {
                 self.free_ent = 256;
@@ -196,6 +288,7 @@ impl UnixLZWReader {
             }
 
             if (self.code == CLEAR) && self.block_compress {
+                // clear the table and fill it again with value
                 for code in (0..=255).rev() {
                     let idx: usize = code as usize;
                     self.code = code;
@@ -210,6 +303,7 @@ impl UnixLZWReader {
                     break;
                 }
             }
+
             self.incode = self.code;
 
             if self.code >= self.free_ent {

diff --git a/xform/src/uncompress.rs b/xform/src/uncompress.rs
@@ -7,7 +7,6 @@
 // SPDX-License-Identifier: MIT
 //
 // TODO:
-// - FIXME: file tail truncated (data corruption)
 // - support NOT writing to stdout (but to file.Z, with .Z suffix removed)
 // - support options -f, -v
 //