Skip to content

Commit ea45d49

Browse files
authored
Merge pull request #91 from rishadbaniya/fix_lzw_uncompres
fix: read upto EOF in uncompress
2 parents df98d85 + 32f6a95 commit ea45d49

File tree

2 files changed

+111
-18
lines changed

2 files changed

+111
-18
lines changed

xform/src/lzw.rs

Lines changed: 111 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,6 @@
99
// History: Adapted from posixutils/compress/zopen.cc, which was in turn
1010
// adapted from FreeBSD's zopen.c.
1111
//
12-
// TODO:
13-
// - FIXME: file tail truncated (data corruption)
14-
//
1512

1613
use std::io::{self, Error, ErrorKind, Read};
1714

@@ -26,37 +23,105 @@ const CLEAR: i32 = 256;
2623

2724
const RMASK: [i32; 9] = [0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff];
2825

26+
/// A wrapper around the Read trait object used
27+
/// for reading the compressed file or
28+
/// file to be compressed
29+
struct CompReader {
30+
inner_rdr: Box<dyn Read>,
31+
}
32+
33+
impl CompReader {
34+
fn new(rdr: Box<dyn Read>) -> Self {
35+
Self { inner_rdr: rdr }
36+
}
37+
38+
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<usize> {
39+
let mut total_read = 0;
40+
41+
while total_read < buf.len() {
42+
match self.inner_rdr.read(&mut buf[total_read..]) {
43+
Ok(n) => {
44+
if n == 0 {
45+
break;
46+
}
47+
total_read += n;
48+
}
49+
Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
50+
Err(e) => {
51+
return Err(e);
52+
}
53+
}
54+
}
55+
56+
if total_read == 0 {
57+
return Err(io::Error::new(ErrorKind::UnexpectedEof, "Unexpected EOF"));
58+
}
59+
60+
Ok(total_read)
61+
}
62+
}
63+
2964
fn max_code(n_bits: u32) -> u32 {
3065
(1 << (n_bits)) - 1
3166
}
3267

3368
pub struct UnixLZWReader {
34-
rdr: Box<dyn Read>,
69+
/// the reader of the compressed file or the file to be compressed
70+
rdr: CompReader,
71+
72+
/// if the compressed file has header or not
3573
have_hdr: bool,
74+
75+
/// if the eof has been reached or not
3676
eof: bool,
3777

78+
/// the max no of bits for a code (maxmaxcode is derived from this)
3879
maxbits: u32,
80+
81+
/// the current max no of bits for the code
3982
n_bits: u32,
83+
84+
/// if BLOCK_MASK is enabled in the compressed file or not
4085
block_compress: bool,
86+
87+
/// It indicates if the buffer has to be cleared or not
4188
clear: bool,
89+
4290
code: i32,
91+
92+
/// the previously recognized code
4393
oldcode: i32,
94+
4495
incode: i32,
96+
97+
/// the max code value for the current n_bits (set to maxmaxcode once n_bits reaches maxbits)
4598
maxcode: i32,
99+
100+
/// the max value of maxcode
46101
maxmaxcode: i32,
102+
103+
/// the next free entry on the table
47104
free_ent: i32,
105+
48106
finchar: i32,
107+
108+
/// It's the current read offset
49109
roffset: i32,
110+
50111
size: i32,
112+
113+
/// Buffer to fill as we go on read from the read stream
51114
gbuf: [u8; BITS as usize],
115+
52116
tab_suffix: [i32; HSIZE],
117+
53118
tab_prefix: [u16; HSIZE],
54119
}
55120

56121
impl UnixLZWReader {
57122
pub fn new(rdr: Box<dyn Read>) -> UnixLZWReader {
58123
UnixLZWReader {
59-
rdr,
124+
rdr: CompReader::new(rdr),
60125
have_hdr: false,
61126
eof: false,
62127
maxbits: 0,
@@ -80,29 +145,37 @@ impl UnixLZWReader {
80145

81146
fn getcode(&mut self) -> i32 {
82147
if self.clear || self.roffset >= self.size || self.free_ent > self.maxcode {
148+
// as free_ent represents the index of the next available entry that can be made
149+
// on the table, so if it's more than self.maxcode (i.e. the max allowed no of codes),
150+
// which is derived from the current no of bits of code, then we need to expand
151+
// our entry by increasing the n_bits by 1 and then updating the max_code
152+
// from that
83153
if self.free_ent > self.maxcode {
84-
self.n_bits = self.n_bits + 1;
85-
if self.n_bits == self.maxcode as u32 {
86-
self.maxcode = self.maxmaxcode;
154+
self.n_bits += 1;
155+
self.maxcode = if self.n_bits == self.maxbits {
156+
self.maxmaxcode
87157
} else {
88-
self.maxcode = max_code(self.n_bits) as i32;
89-
}
158+
max_code(self.n_bits) as i32
159+
};
90160
}
91161

162+
// reset the table entry back to smallest one
92163
if self.clear {
93164
self.n_bits = INIT_BITS;
94165
self.maxcode = max_code(self.n_bits) as i32;
95166
self.clear = false;
96167
}
97168

169+
// the read buffer, sized to the current no of bits (n_bits bytes)
98170
let gbuf = &mut self.gbuf[0..self.n_bits as usize];
99171

100-
let res = self.rdr.read_exact(gbuf);
101-
if res.is_err() {
102-
return -1;
172+
match self.rdr.read_exact(gbuf) {
173+
Ok(n) => {
174+
self.size = n as i32;
175+
}
176+
Err(_) => return -1,
103177
}
104178

105-
self.size = gbuf.len() as i32;
106179
self.roffset = 0;
107180
self.size = (self.size << 3) - (self.n_bits - 1) as i32;
108181
}
@@ -132,11 +205,12 @@ impl UnixLZWReader {
132205
gcode
133206
}
134207

208+
/// Read from the compressed stream of file
135209
pub fn read(&mut self) -> io::Result<Vec<u8>> {
136210
let mut outbytes: Vec<u8> = Vec::new();
137211

138212
if !self.have_hdr {
139-
// 3-byte header. 2 byte magic, 1 byte a bitmask of options.
213+
// 3-byte header: 2 byte magic, 1 byte a bitmask of options.
140214
let mut header = [0; 3];
141215
self.rdr.read_exact(&mut header)?;
142216

@@ -149,17 +223,33 @@ impl UnixLZWReader {
149223
));
150224
}
151225

226+
// the third byte has bitmask of options
227+
// (Eg) if it has 10011111
228+
// that means the first bit represents the BLOCK_MASK i.e block_compress
229+
// has to be enabled or not
230+
//
231+
// the bits selected by HDR_BIT_MASK hold the max no of bits per code,
232+
// stored as a plain number (not as a bit position)
233+
// (Eg) a masked value of 0b10000 means maxbits = 16
152234
let options = header[2];
235+
153236
self.maxbits = (options & HDR_BIT_MASK) as u32;
154237
self.block_compress = (options & HDR_BLOCK_MASK) != 0;
155238

156239
if self.maxbits > BITS {
157240
return Err(Error::new(ErrorKind::Other, "invalid file header: bits"));
158241
}
159242

243+
// the max value that self.maxcode can have, which is derived
244+
// from the maxbits that codes can have
245+
// hence, 2^(self.maxbits)
160246
self.maxmaxcode = 1 << self.maxbits;
161-
self.n_bits = INIT_BITS;
162-
self.maxcode = max_code(self.n_bits) as i32;
247+
248+
// the no of bits of code that we start with
249+
// btw, this no of bits also represent the fact that there can be
250+
// 2 ^ (self.n_bits) entries in the table initially
251+
self.n_bits = INIT_BITS; // 9
252+
self.maxcode = max_code(self.n_bits) as i32; // 511
163253

164254
for code in (0..=255).rev() {
165255
let idx: usize = code as usize;
@@ -169,6 +259,8 @@ impl UnixLZWReader {
169259
}
170260

171261
if self.block_compress {
262+
// TODO: understand why we need to skip one index (i.e 256) (initial guess is that
263+
// we need that index reserved for CLEAR)
172264
self.free_ent = FIRST;
173265
} else {
174266
self.free_ent = 256;
@@ -196,6 +288,7 @@ impl UnixLZWReader {
196288
}
197289

198290
if (self.code == CLEAR) && self.block_compress {
291+
// clear the table and fill it again with value
199292
for code in (0..=255).rev() {
200293
let idx: usize = code as usize;
201294
self.code = code;
@@ -210,6 +303,7 @@ impl UnixLZWReader {
210303
break;
211304
}
212305
}
306+
213307
self.incode = self.code;
214308

215309
if self.code >= self.free_ent {

xform/src/uncompress.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
// SPDX-License-Identifier: MIT
88
//
99
// TODO:
10-
// - FIXME: file tail truncated (data corruption)
1110
// - support NOT writing to stdout (but to file.Z, with .Z suffix removed)
1211
// - support options -f, -v
1312
//

0 commit comments

Comments
 (0)