feat(handler): add geom_uzip handler

rxpha3l · rxpha3l · commit 9592034814f3 · 2025-03-03T16:54:23.000+01:00
geom_uzip is a FreeBSD feature for creating compressed disk images (usually containing UFS). The compression is done in blocks, and the resulting .uzip file can be mounted via the GEOM framework on FreeBSD. The mkuzip header includes a table with block counts and sizes. The header declares the block size (the size of a decompressed block) and the total number of blocks. The block size must be a multiple of 512 and defaults to 16384 in mkuzip. The header has the following structure: > Magic, which is a shebang stored in 10 bytes. > Version, which can change and is stored in 13 bytes. > Command, which can change and is stored in 105 bytes. > Block size, stored in 4 bytes. > Block count, stored in 4 bytes. > Table of contents (TOC), whose size depends on the file length. The TOC is a list of uint64_t offsets into the file, one for each block. To determine the length of a given block, read the next TOC entry and subtract the current offset from the next offset (this is why there is an extra TOC entry at the end). Each block is compressed using zlib. A standard zlib decompressor will decode each of them to a block of size block_size. unblob parses the TOC to determine the start and end offsets of the uzip file. It then locates the compressed blocks, decompresses them using zlib, and concatenates them to recover the decompressed file. Empty blocks are skipped, which is why the file decompressed by unblob can be slightly smaller than the original one. [Sources] https://github.com/mikeryan/unuzip https://www.baeldung.com/linux/filesystem-in-a-file https://docs.python.org/3/library/zlib.html https://github.com/freebsd/freebsd-src/blob/master/sys/geom/uzip/g_uzip.c https://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html
diff --git a/python/unblob/handlers/__init__.py b/python/unblob/handlers/__init__.py
@@ -25,6 +25,7 @@
 from .compression import (
     bzip2,
     compress,
+    geom_uzip,
     gzip,
     lz4,
     lzh,
@@ -116,6 +117,7 @@
     zlib.ZlibHandler,
     engenius.EngeniusHandler,
     ecc.AutelECCHandler,
+    geom_uzip.UZIPHandler,
 )
 
 BUILTIN_DIR_HANDLERS: DirectoryHandlers = (
diff --git a/python/unblob/handlers/compression/geom_uzip.py b/python/unblob/handlers/compression/geom_uzip.py
@@ -0,0 +1,79 @@
+from io import SEEK_SET
+from pathlib import Path
+from typing import Optional
+from zlib import decompress
+
+from structlog import get_logger
+
+from unblob.file_utils import Endian, FileSystem, StructParser, read_until_past
+from unblob.models import (
+    Extractor,
+    ExtractResult,
+    File,
+    HexString,
+    StructHandler,
+    ValidChunk,
+)
+
+"""
+The geom_uzip header has the following structure:
+10 bytes shebang, with newline suffix: #!/bin/sh\n
+13 bytes version, with newline suffix: #V2.0 Format\n or #L3.0 Format\n
+105 bytes command, with null bytes suffix: (kldstat -qm g_uzip||kldload geom_uzip)>&-&&mount_cd9660 /dev/`mdconfig -af $0`.uzip $1\nexit $?\n\x00\x00\x00\x00\x00\x00\x00\x00\x00
+"""
+
+# C-style description of the uzip header. It is handed to StructParser in the
+# extractor and exposed on the handler through its C_DEFINITIONS attribute.
+# The TOC is declared as a variable-length array sized by block_count.
+# NOTE(review): confirm whether the on-disk TOC holds block_count or
+# block_count + 1 entries (a trailing end-of-data offset).
+C_DEFINITIONS = r"""
+    typedef struct uzip_header{
+        char magic[10];             /* '10 bytes '*/
+        char version[13];           /* 13 bytes */
+        char format[105];           /* 105 bytes */
+        uint32_t block_size;        /* 4 bytes */
+        uint32_t block_count;       /* 4 bytes - Number of blocks */
+        uint64_t toc[block_count];  /* table of content */
+    } uzip_header_t;
+"""
+# Name of the struct inside C_DEFINITIONS that represents the header.
+HEADER_STRUCT = "uzip_header_t"
+
+# Module-level structlog logger.
+logger = get_logger()
+
+
+class UZIPExtractor(Extractor):
+    def extract(self, inpath: Path, outdir: Path):
+        infile = File.from_path(inpath)
+        parser = StructParser(C_DEFINITIONS)
+        header = parser.parse(HEADER_STRUCT, infile, Endian.BIG)
+        fs = FileSystem(outdir)
+        outpath = Path(inpath.stem)
+        with fs.open(outpath, "wb+") as outfile:
+            for current_offset, next_offset in zip(header.toc[:-1], header.toc[1:]):
+                compressed_len = next_offset - current_offset
+                if compressed_len == 0:
+                    continue
+                infile.seek(current_offset, SEEK_SET)
+                outfile.write(decompress(infile.read(compressed_len)))
+        return ExtractResult(reports=fs.problems)
+
+
+class UZIPHandler(StructHandler):
+    NAME = "uzip"
+    PATTERNS = [
+        HexString(
+            "23 21 2F 62 69 6E 2F 73 68 0A 23 (56 32 | 4c 33) 2e 30 20 46 6f 72 6d 61 74 0A"
+        )
+    ]
+
+    HEADER_STRUCT = HEADER_STRUCT
+    C_DEFINITIONS = C_DEFINITIONS
+    EXTRACTOR = UZIPExtractor()
+
+    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
+        header = self.parse_header(file, Endian.BIG)
+        # take the last TOC block offset, end of file is that block offset + block size,
+        # starting from the start offset
+        end_offset = start_offset + header.toc[-1]
+        file.seek(end_offset, SEEK_SET)
+        end_offset = read_until_past(file, b"\x00")
+        return ValidChunk(
+            start_offset=start_offset,
+            end_offset=end_offset,
+        )
diff --git a/tests/integration/compression/uzip/__input__/myfs.img.uzip b/tests/integration/compression/uzip/__input__/myfs.img.uzip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e04c83a5444127b9ec58c4b2d8fef904816cee4609d798589d6e8af6086a322
+size 59904
diff --git a/tests/integration/compression/uzip/__output__/myfs.img.uzip_extract/myfs.img b/tests/integration/compression/uzip/__output__/myfs.img.uzip_extract/myfs.img
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328
+size 458752

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:1e04c83a5444127b9ec58c4b2d8fef904816cee4609d798589d6e8af6086a322`
	`3`	`+size 59904`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:e04449191a0c3eab172e5819c5c1e9c10a9cd2e4ffca2abacf065ac1e3bd1328`
	`3`	`+size 458752`