Skip to content

fix: read upto EOF in uncompress #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 111 additions & 17 deletions xform/src/lzw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@
// History: Adapted from posixutils/compress/zopen.cc, which was in turn
// adapted from FreeBSD's zopen.c.
//
// TODO:
// - FIXME: file tail truncated (data corruption)
//

use std::io::{self, Error, ErrorKind, Read};

Expand All @@ -26,37 +23,105 @@ const CLEAR: i32 = 256;

const RMASK: [i32; 9] = [0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff];

/// A thin wrapper around a boxed [`Read`] trait object, used for reading
/// the compressed file or the file to be compressed.
struct CompReader {
    inner_rdr: Box<dyn Read>,
}

impl CompReader {
    /// Wraps `rdr`; nothing is read from it until [`CompReader::read_exact`]
    /// is called.
    fn new(rdr: Box<dyn Read>) -> Self {
        Self { inner_rdr: rdr }
    }

    /// Fills `buf` with as many bytes as the underlying reader can supply
    /// and returns how many bytes were written.
    ///
    /// Unlike [`Read::read_exact`], a short read at end of input is NOT an
    /// error: if the stream ends after at least one byte has been read, the
    /// partial count is returned so the caller can still consume the tail of
    /// the stream.
    ///
    /// # Errors
    ///
    /// * `ErrorKind::UnexpectedEof` if `buf` is non-empty and the stream is
    ///   already at end of input (zero bytes could be read).
    /// * Any other I/O error reported by the underlying reader, except
    ///   `ErrorKind::Interrupted`, which is retried.
    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Nothing was requested, so reading nothing is a success rather
        // than an "unexpected EOF".
        if buf.is_empty() {
            return Ok(0);
        }

        let mut total_read = 0;

        while total_read < buf.len() {
            match self.inner_rdr.read(&mut buf[total_read..]) {
                // A zero-length read on a non-empty slice means end of input.
                Ok(0) => break,
                Ok(n) => total_read += n,
                // Interrupted reads are transient: just try again.
                Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }

        if total_read == 0 {
            return Err(io::Error::new(ErrorKind::UnexpectedEof, "Unexpected EOF"));
        }

        Ok(total_read)
    }
}

/// Returns the largest code value that fits in `n_bits` bits,
/// i.e. `2^n_bits - 1`.
fn max_code(n_bits: u32) -> u32 {
    let code_count: u32 = 1 << n_bits;
    code_count - 1
}

/// Reader state for decoding Unix `compress`-style LZW data: holds the input
/// reader plus the state that `getcode` and `read` update between calls.
pub struct UnixLZWReader {
// NOTE(review): `rdr` is declared twice in this hunk; this first line looks
// like the removed side of the diff (the field is now a `CompReader`) — confirm.
rdr: Box<dyn Read>,
/// The reader for the compressed file or the file to be compressed.
rdr: CompReader,

/// Whether the header has been handled yet; `read` parses the 3-byte header
/// when this is false. (Presumably set to true afterwards — confirm.)
have_hdr: bool,

/// Whether end of input has been reached.
eof: bool,

/// Maximum code width in bits, taken from the header's options byte
/// (`maxmaxcode` is derived from this).
maxbits: u32,

/// Current code width in bits; starts at `INIT_BITS` and is incremented by
/// `getcode` when `free_ent` exceeds `maxcode`.
n_bits: u32,

/// Whether the `HDR_BLOCK_MASK` bit is set in the header's options byte;
/// `CLEAR` codes are only acted on when this is true.
block_compress: bool,

/// When set, the next `getcode` call resets `n_bits` to `INIT_BITS`,
/// recomputes `maxcode` and clears this flag again.
clear: bool,

/// The code currently being processed by `read`.
code: i32,

/// The previously recognized code.
oldcode: i32,

/// Copy of `code` that `read` saves before comparing `code` with `free_ent`.
incode: i32,

/// Largest code value for the current `n_bits` (`max_code(n_bits)`), or
/// `maxmaxcode` once `n_bits` reaches `maxbits`.
maxcode: i32,

/// Upper bound for `maxcode`: `1 << maxbits`.
maxmaxcode: i32,

/// Index of the next free entry in the table.
free_ent: i32,

// NOTE(review): not referenced in the visible hunks; presumably the first
// character of the most recently decoded string — confirm.
finchar: i32,

/// Current read offset into `gbuf`; reset to 0 after each refill and compared
/// against `size` (so presumably counted in bits — confirm).
roffset: i32,

/// Limit for `roffset`: after a refill it is
/// `(bytes_read << 3) - (n_bits - 1)`.
size: i32,

/// Buffer refilled from the input stream as decoding proceeds; `getcode`
/// reads into its first `n_bits` bytes.
gbuf: [u8; BITS as usize],

// NOTE(review): usage is not visible in this hunk; presumably the suffix
// value stored for each table entry — confirm.
tab_suffix: [i32; HSIZE],

// NOTE(review): usage is not visible in this hunk; presumably the prefix
// code stored for each table entry — confirm.
tab_prefix: [u16; HSIZE],
}

impl UnixLZWReader {
pub fn new(rdr: Box<dyn Read>) -> UnixLZWReader {
UnixLZWReader {
rdr,
rdr: CompReader::new(rdr),
have_hdr: false,
eof: false,
maxbits: 0,
Expand All @@ -80,29 +145,37 @@ impl UnixLZWReader {

fn getcode(&mut self) -> i32 {
if self.clear || self.roffset >= self.size || self.free_ent > self.maxcode {
// free_ent is the index of the next entry that can be added to the table.
// Once it exceeds self.maxcode (the largest code allowed for the current
// code width), the codes no longer fit in n_bits bits, so we widen them:
// increase n_bits by 1 and recompute maxcode from the new width.
if self.free_ent > self.maxcode {
self.n_bits = self.n_bits + 1;
if self.n_bits == self.maxcode as u32 {
self.maxcode = self.maxmaxcode;
self.n_bits += 1;
self.maxcode = if self.n_bits == self.maxbits {
self.maxmaxcode
} else {
self.maxcode = max_code(self.n_bits) as i32;
}
max_code(self.n_bits) as i32
};
}

// reset the code width back to the smallest one (INIT_BITS)
if self.clear {
self.n_bits = INIT_BITS;
self.maxcode = max_code(self.n_bits) as i32;
self.clear = false;
}

// the read buffer, limited to n_bits bytes for the current code width
let gbuf = &mut self.gbuf[0..self.n_bits as usize];

let res = self.rdr.read_exact(gbuf);
if res.is_err() {
return -1;
match self.rdr.read_exact(gbuf) {
Ok(n) => {
self.size = n as i32;
}
Err(_) => return -1,
}

self.size = gbuf.len() as i32;
self.roffset = 0;
self.size = (self.size << 3) - (self.n_bits - 1) as i32;
}
Expand Down Expand Up @@ -132,11 +205,12 @@ impl UnixLZWReader {
gcode
}

/// Read from the compressed stream of file
pub fn read(&mut self) -> io::Result<Vec<u8>> {
let mut outbytes: Vec<u8> = Vec::new();

if !self.have_hdr {
// 3-byte header. 2 byte magic, 1 byte a bitmask of options.
// 3-byte header: 2 byte magic, 1 byte a bitmask of options.
let mut header = [0; 3];
self.rdr.read_exact(&mut header)?;

Expand All @@ -149,17 +223,33 @@ impl UnixLZWReader {
));
}

// The third byte is a bitmask of options:
//   - the bit selected by HDR_BLOCK_MASK says whether block compression
//     (i.e. handling of CLEAR codes) is enabled;
//   - the bits selected by HDR_BIT_MASK hold the maximum number of bits per
//     code, stored as a plain binary number.
//
// (Eg) assuming the usual compress(1) masks (0x80 and 0x1f — confirm against
// the constants), the byte 10010000 means block compression is enabled and
// the max number of bits is 0b10000 = 16.
let options = header[2];

self.maxbits = (options & HDR_BIT_MASK) as u32;
self.block_compress = (options & HDR_BLOCK_MASK) != 0;

if self.maxbits > BITS {
return Err(Error::new(ErrorKind::Other, "invalid file header: bits"));
}

// the max value that self.maxcode can have, which is derived
// from the maxbits that codes can have
// hence, 2^(self.maxbits)
self.maxmaxcode = 1 << self.maxbits;
self.n_bits = INIT_BITS;
self.maxcode = max_code(self.n_bits) as i32;

// the number of bits per code that we start with;
// this width also means the table can initially hold
// 2 ^ (self.n_bits) entries
self.n_bits = INIT_BITS; // 9
self.maxcode = max_code(self.n_bits) as i32; // 511

for code in (0..=255).rev() {
let idx: usize = code as usize;
Expand All @@ -169,6 +259,8 @@ impl UnixLZWReader {
}

if self.block_compress {
// TODO: understand why we need to skip one index (i.e. 256); the initial guess is
// that this index has to stay reserved for CLEAR
self.free_ent = FIRST;
} else {
self.free_ent = 256;
Expand Down Expand Up @@ -196,6 +288,7 @@ impl UnixLZWReader {
}

if (self.code == CLEAR) && self.block_compress {
// clear the table and re-initialise the entries for codes 0..=255
for code in (0..=255).rev() {
let idx: usize = code as usize;
self.code = code;
Expand All @@ -210,6 +303,7 @@ impl UnixLZWReader {
break;
}
}

self.incode = self.code;

if self.code >= self.free_ent {
Expand Down
1 change: 0 additions & 1 deletion xform/src/uncompress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
// SPDX-License-Identifier: MIT
//
// TODO:
// - FIXME: file tail truncated (data corruption)
// - support NOT writing to stdout (but to file.Z, with .Z suffix removed)
// - support options -f, -v
//
Expand Down