diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dd721da9..fcb7fe1d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - build: [stable, nightly, 1.50.0, macos, windows, mingw] + build: [stable, nightly, 1.56.0, macos, windows, mingw] include: - build: stable os: ubuntu-latest @@ -15,9 +15,9 @@ jobs: - build: nightly os: ubuntu-latest rust: nightly - - build: 1.50.0 + - build: 1.56.0 os: ubuntu-latest - rust: 1.50.0 + rust: 1.56.0 - build: macos os: macos-latest rust: stable @@ -28,42 +28,27 @@ jobs: os: windows-latest rust: stable-x86_64-gnu steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable with: - profile: minimal - override: true toolchain: ${{ matrix.rust }} - run: cargo test --manifest-path ./miniz_oxide/Cargo.toml - run: cargo test --manifest-path ./miniz_oxide/Cargo.toml --features simd - run: cargo test --manifest-path ./miniz_oxide/Cargo.toml --no-default-features - - run: cargo build --manifest-path ./miniz_oxide/Cargo.toml --no-default-features - - run: cargo test - - # rustfmt: - # name: Rustfmt - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # - uses: actions-rs/toolchain@v1 - # with: - # profile: minimal - # components: rustfmt # seems to not work? - # toolchain: stable - # - run: rustup toolchain install stable --component rustfmt - # - run: cargo fmt -p miniz_oxide -- --check + - name: Test minimal + if: ${{ matrix.rust != '1.56.0' }} + run: cargo test wasm: name: WebAssembly runs-on: ubuntu-latest strategy: matrix: - target: [wasm32-unknown-unknown, wasm32-wasi] + target: [wasm32-unknown-unknown, wasm32-wasip1] steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable with: - profile: minimal toolchain: stable - target: ${{ matrix.target }} + targets: ${{ matrix.target }} - run: cargo build -p miniz_oxide --target ${{ matrix.target }} diff --git a/CHANGELOG.md b/CHANGELOG.md index ca43534a..1ad19fbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,62 @@ All notable changes to this project will be documented in this file. 
+--- +## [0.8.5](https://github.com/Frommi/miniz_oxide/compare/0.8.4..0.8.5) - 2025-02-21 + +### Bug Fixes + +- **(deflate)** some cleanups and evade a bounds check in compress_lz_codes - ([4c38ff8](https://github.com/Frommi/miniz_oxide/commit/4c38ff8abb3f8ee1f3708f8facd15d1fe9975fbc)) - oyvindln +- **(deflate)** fix bug causing 0 length stored block to be output incorrectly causing corrupt stream - ([3d62e6b](https://github.com/Frommi/miniz_oxide/commit/3d62e6b6b81441b4a1867bf1504672c835654919)) - oyvindln + + +--- +## [0.8.4](https://github.com/Frommi/miniz_oxide/compare/0.8.3..0.8.4) - 2025-02-11 + +### Bug Fixes + +- **(deflate)** work around upstream rust change causing performance regression - ([7014124](https://github.com/Frommi/miniz_oxide/commit/701412465814a5add1b620c82a7c4eafb1936b45)) - oyvindln +- **(doc)** typo on example code ([#162](https://github.com/Frommi/miniz_oxide/issues/162)) - ([2119168](https://github.com/Frommi/miniz_oxide/commit/2119168eeee4ff8a8b12505755611e00fe6b96cc)) - Iván Izaguirre +- **(inflate)** Guard against edge case with invalid match distance wrapping around too far when using wrapping buffer - ([4037fee](https://github.com/Frommi/miniz_oxide/commit/4037fee77fd5811ea10fe62a9c772942b6b72cb1)) - oyvindln +- **(deflate)** Avoid stack overflow when initializing HashBuffers. ([#164](https://github.com/Frommi/miniz_oxide/issues/164)) - ([921bc2c](https://github.com/Frommi/miniz_oxide/commit/921bc2c51e450f22a2a9405a908c64005caa92fe)) - Lukasz Anforowicz + +--- +## [0.8.3](https://github.com/Frommi/miniz_oxide/compare/0.8.2..0.8.3) - 2025-01-13 + +### Bug Fixes + +- **(bench)** add some basic criterion benchmarks - ([ac03751](https://github.com/Frommi/miniz_oxide/commit/ac03751c43df22b9bb7f47e50b7dbb8fc11ac141)) - oyvindln +- **(deflate)** write directly to output buffer instead of bit buffer to reduce overhead and improve performance of stored blocks a little - ([97ee3f1](https://github.com/Frommi/miniz_oxide/commit/97ee3f1673b0d8bd88f3abcafb6fe392b086e4b7)) - oyvindln +- **(deflate)** split some code into new module and fix panic in pad_to_bytes from prev commit - ([04973ca](https://github.com/Frommi/miniz_oxide/commit/04973cad7b088868e51fd7970d028dad0ef0c5d0)) - oyvindln +- **(deflate)** move stored level to its own function and simplify to improve performance - ([1f829d2](https://github.com/Frommi/miniz_oxide/commit/1f829d2574a7842f4d5e5a3ff9c33f249451f79f)) - oyvindln +- **(deflate)** remove no longer needed checks for raw mode in compress_normal and comment out accidentally enabled criterion dev dep - ([f357aa1](https://github.com/Frommi/miniz_oxide/commit/f357aa1462f8370592d2a23214490a7391c9f9de)) - oyvindln +- **(miniz_oxide)** add richgel99 (original miniz author) as author and add copyright info from orig miniz in license files - ([c8a4485](https://github.com/Frommi/miniz_oxide/commit/c8a448500ccd9ab040a244dd7db37702ab9e6449)) - oyvindln + +--- +## [0.8.2](https://github.com/Frommi/miniz_oxide/compare/0.8.1..0.8.2) - 2024-12-17 + +### Bug Fixes + +- **(deflate)** fix ([#159](https://github.com/Frommi/miniz_oxide/issues/159)) - ([e3536a7](https://github.com/Frommi/miniz_oxide/commit/e3536a779451012db9d6f8d803252a4f30ce6b91)) (fix for bug accidentally introduced in the previous release causing panics in some cases) - Matthew Deville + +--- +## [0.8.1](https://github.com/Frommi/miniz_oxide/compare/0.8.0..0.8.1) - 2024-12-17 + +### Bug Fixes + +- **(fuzzing)** update fuzzing to work again -
([b7a5908](https://github.com/Frommi/miniz_oxide/commit/b7a5908e1b83bde6b60568f6a67952890ab925a9)) - user +- **(deflate)** use built in fill instead of custom memset function - ([c0662f1](https://github.com/Frommi/miniz_oxide/commit/c0662f11528cbc32291bf91d6caa1890774c2729)) - oyvindln +- **(inflate)** use smaller types in inflate struct, split up huffman table arrays to make struct smaller, make zlib level 0 if using rle, other minor tweaks - ([c5f8f76](https://github.com/Frommi/miniz_oxide/commit/c5f8f761148a3a8a0a7f1b42e698c5e630a8cdf6)) - oyvindln +- **(inflate)** use function instead of lookup table for distance extra bits for tiny space/perf saving and fix clippy warnings - ([9f1fc5e](https://github.com/Frommi/miniz_oxide/commit/9f1fc5e5aeee4ce54be3a766e259b030f3b3cfa9)) - oyvindln +- **(inflate)** use inputwrapper struct instead of iter to simplify input reading and change some data types for performance - ([423bdf8](https://github.com/Frommi/miniz_oxide/commit/423bdf84360c087bea6d3e2b463f3c3a2c1a2867)) - oyvindln +- **(inflate)** don't use lookup table on aarch64 and loong since we have bit rev instruction there, fix clippy warnings and fix conditional in tree_lookup that seemed to break perf - ([083e4b3](https://github.com/Frommi/miniz_oxide/commit/083e4b3e66e9e4e45e7c48a56481d62ee6a78bce)) - oyvindln +- **(inflate)** fill fast lookup table with invalid code value instead of zero so we can avoid check in hot code path giving a small performance boost - ([f73e6a4](https://github.com/Frommi/miniz_oxide/commit/f73e6a4600fbfa795d500d45caef4d48f8c85eff)) - oyvindln +- **(inflate)** skip pointlessly clearing unused huffman code length tree - ([b3b1604](https://github.com/Frommi/miniz_oxide/commit/b3b16048bd459782964f10a23aef63bf058389d5)) - oyvindln +- **(inflate)** use built in fill instead of custom memset function - ([e6ee54e](https://github.com/Frommi/miniz_oxide/commit/e6ee54e82c16ddccb6b55d5a20b8aa5cb4669ca0)) - oyvindln +- **(tests)** change workflow to use rust 1.56.0 - ([7258c06](https://github.com/Frommi/miniz_oxide/commit/7258c064bf39cc124210546d535d82c9c6cd1b5f)) - oyvindln +- **(deflate)** set min window bits in inflate header when using rle - ([02a8857](https://github.com/Frommi/miniz_oxide/commit/02a88571dcc58182df15abb5c1b0410bbd5db428)) - oyvindln +- **(inflate)** Derive Clone for InflateState to allow random-access reads ([#157](https://github.com/Frommi/miniz_oxide/issues/157)) - ([0a33eff](https://github.com/Frommi/miniz_oxide/commit/0a33effd414711b379e01b0613ba5ae85a0e14d0)) - Phil Hord + --- ## [0.8.0](https://github.com/Frommi/miniz_oxide/compare/0.7.4..0.8.0) - 2024-08-08 diff --git a/LICENSE-MIT.md b/LICENSE-MIT.md index 64c53792..ec71b31c 100644 --- a/LICENSE-MIT.md +++ b/LICENSE-MIT.md @@ -1,6 +1,9 @@ MIT License +Copyright 2013-2014 RAD Game Tools and Valve Software +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC Copyright (c) 2017 Frommi +Copyright (c) 2017-2024 oyvindln Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index ade0dcb6..1fc5700c 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -9,14 +9,12 @@ publish = false cargo-fuzz = true [dependencies] -libc="0.2.22" +libfuzzer-sys="0.4.0" [dependencies.miniz_oxide_c_api] path = ".."
[dependencies.miniz_oxide] path = "../miniz_oxide" -[dependencies.libfuzzer-sys] -git = "https://github.com/rust-fuzz/libfuzzer-sys.git" # Prevent this from interfering with workspaces [workspace] diff --git a/fuzz/seeds/inflate_nonwrapping/empty_comp b/fuzz/seeds/inflate_nonwrapping/empty_comp new file mode 100644 index 00000000..01e9e398 Binary files /dev/null and b/fuzz/seeds/inflate_nonwrapping/empty_comp differ diff --git a/fuzz/seeds/inflate_nonwrapping/issue_130_table_size.bin b/fuzz/seeds/inflate_nonwrapping/issue_130_table_size.bin new file mode 100644 index 00000000..2e80c50b --- /dev/null +++ b/fuzz/seeds/inflate_nonwrapping/issue_130_table_size.bin @@ -0,0 +1,2 @@ +x$I$IGDDfffVUUUUwwwwwtwwwwWWUUUUffFFD +LfWwuwwOLG=>c? ^ \ No newline at end of file diff --git a/miniz_oxide/Cargo.toml b/miniz_oxide/Cargo.toml index dab4e338..b4949c39 100644 --- a/miniz_oxide/Cargo.toml +++ b/miniz_oxide/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "miniz_oxide" -authors = ["Frommi ", "oyvindln "] -version = "0.8.0" +authors = ["Frommi ", "oyvindln ", "Rich Geldreich <richgel99@gmail.com>"] +version = "0.8.5" license = "MIT OR Zlib OR Apache-2.0" readme = "Readme.md" keywords = ["zlib", "miniz", "deflate", "encoding"] @@ -26,6 +26,15 @@ core = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-core alloc = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-alloc' } compiler_builtins = { version = '0.1.2', optional = true } +[dev-dependencies] +## Messes with the minimum rust version and drags in deps just for running tests, +## so keep it commented out for now and enable it manually when running the benches +#criterion = "0.5" + +[[bench]] +name = "benchmark" +harness = false + [features] default = ["with-alloc"] with-alloc = [] diff --git a/miniz_oxide/LICENSE b/miniz_oxide/LICENSE index 64c53792..a1980025 100644 --- a/miniz_oxide/LICENSE +++ b/miniz_oxide/LICENSE @@ -1,6 +1,10 @@ MIT License +Copyright 2013-2014 RAD Game Tools and Valve Software +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC Copyright (c) 2017 Frommi +Copyright (c) 2017-2024 oyvindln + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/miniz_oxide/LICENSE-MIT.md b/miniz_oxide/LICENSE-MIT.md index 64c53792..ec71b31c 100644 --- a/miniz_oxide/LICENSE-MIT.md +++ b/miniz_oxide/LICENSE-MIT.md @@ -1,6 +1,9 @@ MIT License +Copyright 2013-2014 RAD Game Tools and Valve Software +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC Copyright (c) 2017 Frommi +Copyright (c) 2017-2024 oyvindln Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/miniz_oxide/LICENSE-ZLIB.md b/miniz_oxide/LICENSE-ZLIB.md index 7f513d1a..92871cd2 100644 --- a/miniz_oxide/LICENSE-ZLIB.md +++ b/miniz_oxide/LICENSE-ZLIB.md @@ -1,4 +1,7 @@ +Copyright 2013-2014 RAD Game Tools and Valve Software +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC Copyright (c) 2020 Frommi +Copyright (c) 2017-2024 oyvindln This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
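A usage note for the bench setup above (a sketch; the exact commands are assumptions, not part of the patch): with the commented-out `criterion = "0.5"` dev-dependency re-enabled, `harness = false` hands `main` over to criterion, so the benchmark file added below can be run with `cargo bench -p miniz_oxide`, and its input data can be pointed at another file via the `TEST_FILE` environment variable it reads.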
diff --git a/miniz_oxide/Readme.md b/miniz_oxide/Readme.md index 6c177b0e..6ef5c3a1 100644 --- a/miniz_oxide/Readme.md +++ b/miniz_oxide/Readme.md @@ -25,7 +25,7 @@ Simple compression/decompression: ```rust use miniz_oxide::deflate::compress_to_vec; -use miniz_oxide::inflate::decompress_to_vec; +use miniz_oxide::inflate::decompress_to_vec_with_limit; fn roundtrip(data: &[u8]) { // Compress the input diff --git a/miniz_oxide/benches/benchmark.rs b/miniz_oxide/benches/benchmark.rs new file mode 100644 index 00000000..c6fefe07 --- /dev/null +++ b/miniz_oxide/benches/benchmark.rs @@ -0,0 +1,57 @@ +extern crate criterion; + +use std::hint::black_box; +use std::io::Read; + +use criterion::{criterion_group, criterion_main, Criterion}; +use miniz_oxide::deflate::{compress_to_vec, compress_to_vec_zlib}; +use miniz_oxide::inflate::{decompress_to_vec, decompress_to_vec_zlib}; + +fn get_test_file_data(name: &str) -> Vec<u8> { + use std::fs::File; + let mut input = Vec::new(); + let mut f = File::open(name).unwrap(); + + f.read_to_end(&mut input).unwrap(); + input +} + +fn get_test_data() -> Vec<u8> { + use std::env; + let path = env::var("TEST_FILE").unwrap_or_else(|_| "../miniz/miniz.c".to_string()); + get_test_file_data(&path) +} + +fn bench_inflate(c: &mut Criterion) { + let data = get_test_data(); + let compressed = compress_to_vec(&data, 6); + c.bench_function("inflate_raw", |b| { + b.iter(|| decompress_to_vec(black_box(&compressed))) + }); + let compressed_zlib = compress_to_vec_zlib(&data, 6); + c.bench_function("inflate_zlib", |b| { + b.iter(|| decompress_to_vec_zlib(black_box(&compressed_zlib))) + }); +} + +fn bench_deflate(c: &mut Criterion) { + let data = get_test_data(); + c.bench_function("deflate_l6_raw", |b| { + b.iter(|| compress_to_vec(black_box(&data), 6)) + }); + c.bench_function("deflate_zlib_l6", |b| { + b.iter(|| compress_to_vec_zlib(black_box(&data), 6)) + }); + c.bench_function("deflate_l1_raw", |b| { + b.iter(|| compress_to_vec(black_box(&data), 1)) + }); + c.bench_function("deflate_zlib_l1", |b| { + b.iter(|| compress_to_vec_zlib(black_box(&data), 1)) + }); + c.bench_function("deflate_l0_raw", |b| { + b.iter(|| compress_to_vec(black_box(&data), 0)) + }); +} + +criterion_group!(benches, bench_inflate, bench_deflate); criterion_main!(benches); diff --git a/miniz_oxide/src/deflate/buffer.rs b/miniz_oxide/src/deflate/buffer.rs index f246c07d..c3c4dcb9 100644 --- a/miniz_oxide/src/deflate/buffer.rs +++ b/miniz_oxide/src/deflate/buffer.rs @@ -3,6 +3,8 @@ //! static length info. use crate::deflate::core::{LZ_DICT_SIZE, MAX_MATCH_LEN}; +use alloc::boxed::Box; +use alloc::vec; /// Size of the buffer of lz77 encoded data.
pub const LZ_CODE_BUF_SIZE: usize = 64 * 1024; @@ -23,24 +25,29 @@ pub fn update_hash(current_hash: u16, byte: u8) -> u16 { } pub struct HashBuffers { - pub dict: [u8; LZ_DICT_FULL_SIZE], - pub next: [u16; LZ_DICT_SIZE], - pub hash: [u16; LZ_DICT_SIZE], + pub dict: Box<[u8; LZ_DICT_FULL_SIZE]>, + pub next: Box<[u16; LZ_DICT_SIZE]>, + pub hash: Box<[u16; LZ_DICT_SIZE]>, } impl HashBuffers { #[inline] pub fn reset(&mut self) { - *self = HashBuffers::default(); + self.dict.fill(0); + self.next.fill(0); + self.hash.fill(0); } } impl Default for HashBuffers { fn default() -> HashBuffers { HashBuffers { - dict: [0; LZ_DICT_FULL_SIZE], - next: [0; LZ_DICT_SIZE], - hash: [0; LZ_DICT_SIZE], + dict: vec![0; LZ_DICT_FULL_SIZE] + .into_boxed_slice() + .try_into() + .unwrap(), + next: vec![0; LZ_DICT_SIZE].into_boxed_slice().try_into().unwrap(), + hash: vec![0; LZ_DICT_SIZE].into_boxed_slice().try_into().unwrap(), } } } diff --git a/miniz_oxide/src/deflate/core.rs b/miniz_oxide/src/deflate/core.rs index 3e822a20..dc2a15ab 100644 --- a/miniz_oxide/src/deflate/core.rs +++ b/miniz_oxide/src/deflate/core.rs @@ -11,17 +11,19 @@ use crate::deflate::buffer::{ update_hash, HashBuffers, LocalBuf, LZ_CODE_BUF_SIZE, LZ_DICT_FULL_SIZE, LZ_HASH_BITS, LZ_HASH_SHIFT, LZ_HASH_SIZE, OUT_BUF_SIZE, }; +use crate::deflate::stored::compress_stored; +use crate::deflate::zlib; use crate::shared::{update_adler32, HUFFMAN_LENGTH_ORDER, MZ_ADLER32_INIT}; use crate::DataFormat; // Currently not bubbled up outside this module, so can fill in with more // context eventually if needed. type Result<T = ()> = core::result::Result<T, Error>; -struct Error {} +pub(crate) struct Error {} -const MAX_PROBES_MASK: i32 = 0xFFF; +pub(crate) const MAX_PROBES_MASK: u32 = 0xFFF; -const MAX_SUPPORTED_HUFF_CODESIZE: usize = 32; +const MAX_SUPPORTED_HUFF_CODESIZE: usize = 15; /// Length code for length values. #[rustfmt::skip] @@ -157,7 +159,7 @@ const BITMASKS: [u32; 17] = [ /// The maximum number of checks for matches in the hash table the compressor will make for each /// compression level. -const NUM_PROBES: [u32; 11] = [0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500]; +pub(crate) const NUM_PROBES: [u16; 11] = [0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500]; #[derive(Copy, Clone)] struct SymFreq { @@ -206,6 +208,13 @@ pub enum CompressionStrategy { Fixed = 4, } +impl From<CompressionStrategy> for i32 { + #[inline(always)] + fn from(value: CompressionStrategy) -> Self { + value as i32 + } +} + /// A list of deflate flush types. #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub enum TDEFLFlush { @@ -290,111 +299,13 @@ const MAX_HUFF_SYMBOLS_2: usize = 19; /// Size of the chained hash table. pub(crate) const LZ_DICT_SIZE: usize = 32_768; /// Mask used when stepping through the hash chains. -const LZ_DICT_SIZE_MASK: usize = (LZ_DICT_SIZE as u32 - 1) as usize; +pub(crate) const LZ_DICT_SIZE_MASK: usize = (LZ_DICT_SIZE as u32 - 1) as usize; /// The minimum length of a match. -const MIN_MATCH_LEN: u8 = 3; +pub(crate) const MIN_MATCH_LEN: u8 = 3; /// The maximum length of a match. pub(crate) const MAX_MATCH_LEN: usize = 258; -const DEFAULT_FLAGS: u32 = NUM_PROBES[4] | TDEFL_WRITE_ZLIB_HEADER; - -mod zlib { - const DEFAULT_CM: u8 = 8; - const DEFAULT_CINFO: u8 = 7 << 4; - const _DEFAULT_FDICT: u8 = 0; - const DEFAULT_CMF: u8 = DEFAULT_CM | DEFAULT_CINFO; - /// The 16-bit value consisting of CMF and FLG must be divisible by this to be valid.
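// As a worked example of that divisibility rule (a sketch using the values
// defined just above): DEFAULT_CMF = 8 | (7 << 4) = 0x78, and for zlib
// compression level 2 the FLG byte starts out as 2 << 6 = 0x80. Since
// (0x78 * 256 + 0x80) % 31 == 3, FCHECK becomes 31 - 3 = 28, and the final
// FLG is 0x80 + 28 = 0x9C, giving the familiar zlib header bytes 0x78 0x9C:
//
//     assert_eq!((0x78_usize * 256 + 0x9C) % 31, 0);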
- const FCHECK_DIVISOR: u8 = 31; - - /// Generate FCHECK from CMF and FLG (without FCKECH )so that they are correct according to the - /// specification, i.e (CMF*256 + FCHK) % 31 = 0. - /// Returns flg with the FCHKECK bits added (any existing FCHECK bits are ignored). - fn add_fcheck(cmf: u8, flg: u8) -> u8 { - let rem = ((usize::from(cmf) * 256) + usize::from(flg)) % usize::from(FCHECK_DIVISOR); - - // Clear existing FCHECK if any - let flg = flg & 0b11100000; - - // Casting is safe as rem can't overflow since it is a value mod 31 - // We can simply add the value to flg as (31 - rem) will never be above 2^5 - flg + (FCHECK_DIVISOR - rem as u8) - } - - const fn zlib_level_from_flags(flags: u32) -> u8 { - use super::NUM_PROBES; - - let num_probes = flags & (super::MAX_PROBES_MASK as u32); - if flags & super::TDEFL_GREEDY_PARSING_FLAG != 0 { - if num_probes <= 1 { - 0 - } else { - 1 - } - } else if num_probes >= NUM_PROBES[9] { - 3 - } else { - 2 - } - } - - /// Get the zlib header for the level using the default window size and no - /// dictionary. - fn header_from_level(level: u8) -> [u8; 2] { - let cmf = DEFAULT_CMF; - [cmf, add_fcheck(cmf, level << 6)] - } - - /// Create a zlib header from the given compression flags. - /// Only level is considered. - pub fn header_from_flags(flags: u32) -> [u8; 2] { - let level = zlib_level_from_flags(flags); - header_from_level(level) - } - - #[cfg(test)] - mod test { - #[test] - fn zlib() { - use super::super::*; - use super::*; - - let test_level = |level, expected| { - let flags = create_comp_flags_from_zip_params( - level, - MZ_DEFAULT_WINDOW_BITS, - CompressionStrategy::Default as i32, - ); - assert_eq!(zlib_level_from_flags(flags), expected); - }; - - assert_eq!(zlib_level_from_flags(DEFAULT_FLAGS), 2); - test_level(0, 0); - test_level(1, 0); - test_level(2, 1); - test_level(3, 1); - for i in 4..=8 { - test_level(i, 2) - } - test_level(9, 3); - test_level(10, 3); - } - - #[test] - fn test_header() { - let header = super::header_from_level(3); - assert_eq!( - ((usize::from(header[0]) * 256) + usize::from(header[1])) % 31, - 0 - ); - } - } -} - -fn memset<T: Copy>(slice: &mut [T], val: T) { - for x in slice { - *x = val - } -} +pub(crate) const DEFAULT_FLAGS: u32 = NUM_PROBES[4] as u32 | TDEFL_WRITE_ZLIB_HEADER; #[cfg(test)] #[inline] @@ -405,19 +316,19 @@ fn write_u16_le(val: u16, slice: &mut [u8], pos: usize) { } // Read the two bytes starting at pos and interpret them as an u16. #[inline] -const fn read_u16_le(slice: &[u8], pos: usize) -> u16 { +const fn read_u16_le<const N: usize>(slice: &[u8; N], pos: usize) -> u16 { // The compiler is smart enough to optimize this into an unaligned load. slice[pos] as u16 | ((slice[pos + 1] as u16) << 8) } /// Main compression struct. pub struct CompressorOxide { - lz: LZOxide, - params: ParamsOxide, + pub(crate) lz: LZOxide, + pub(crate) params: ParamsOxide, /// Put HuffmanOxide on the heap with default trick to avoid /// excessive stack copies.
- huff: Box<HuffmanOxide>, - dict: DictOxide, + pub(crate) huff: Box<HuffmanOxide>, + pub(crate) dict: DictOxide, } impl CompressorOxide { @@ -532,7 +443,7 @@ pub struct CallbackFunc<'a> { pub put_buf_func: &'a mut dyn FnMut(&[u8]) -> bool, } -impl<'a> CallbackFunc<'a> { +impl CallbackFunc<'_> { fn flush_output( &mut self, saved_output: SavedOutputBufferOxide, @@ -556,7 +467,7 @@ struct CallbackBuf<'a> { pub out_buf: &'a mut [u8], } -impl<'a> CallbackBuf<'a> { +impl CallbackBuf<'_> { fn flush_output( &mut self, saved_output: SavedOutputBufferOxide, @@ -585,7 +496,7 @@ enum CallbackOut<'a> { Buf(CallbackBuf<'a>), } -impl<'a> CallbackOut<'a> { +impl CallbackOut<'_> { fn new_output_buffer<'b>( &'b mut self, local_buf: &'b mut [u8], @@ -614,7 +525,7 @@ impl<'a> CallbackOut<'a> { } } -struct CallbackOxide<'a> { +pub(crate) struct CallbackOxide<'a> { in_buf: Option<&'a [u8]>, in_buf_size: Option<&'a mut usize>, out_buf_size: Option<&'a mut usize>, @@ -665,6 +576,10 @@ impl<'a> CallbackOxide<'a> { CallbackOut::Buf(ref mut cb) => cb.flush_output(saved_output, params), } } + + pub(crate) fn buf(&mut self) -> Option<&'a [u8]> { + self.in_buf + } } struct OutputBufferOxide<'a> { @@ -676,7 +591,10 @@ struct OutputBufferOxide<'a> { pub bits_in: u32, } -impl<'a> OutputBufferOxide<'a> { +impl OutputBufferOxide<'_> { + /// Write bits to the bit buffer, flushing + /// the bit buffer so that any whole bytes are output + /// to the underlying buffer. fn put_bits(&mut self, bits: u32, len: u32) { // TODO: Removing this assertion worsens performance // Need to figure out why @@ -692,6 +610,14 @@ impl<'a> OutputBufferOxide<'a> { } } + #[inline] + /// Write the provided bits to the bit buffer without flushing + /// anything. Does not check if there is actually space for it. + fn put_bits_no_flush(&mut self, bits: u32, len: u32) { + self.bit_buffer |= bits << self.bits_in; + self.bits_in += len; + } + const fn save(&self) -> SavedOutputBufferOxide { SavedOutputBufferOxide { pos: self.inner_pos, @@ -708,12 +634,22 @@ impl<'a> OutputBufferOxide<'a> { self.local = saved.local; } + #[inline] + /// Pad the bit buffer to a whole byte with + /// zeroes and write that byte to the output buffer. fn pad_to_bytes(&mut self) { if self.bits_in != 0 { let len = 8 - self.bits_in; self.put_bits(0, len); } } + + #[inline] + fn write_bytes(&mut self, bytes: &[u8]) { + debug_assert_eq!(self.bits_in, 0); + self.inner[self.inner_pos..self.inner_pos + bytes.len()].copy_from_slice(bytes); + self.inner_pos += bytes.len(); + } } struct SavedOutputBufferOxide { @@ -757,7 +693,7 @@ impl BitBuffer { /// NOTE: Only the literal/lengths have enough symbols to actually use /// the full array. It's unclear why it's defined like this in miniz, /// it could be for cache/alignment reasons. -struct HuffmanOxide { +pub(crate) struct HuffmanOxide { /// Number of occurrences of each symbol. pub count: [[u16; MAX_HUFF_SYMBOLS]; MAX_HUFF_TABLES], /// The bits of the huffman code assigned to the symbol @@ -776,7 +712,7 @@ const HUFF_CODES_TABLE: usize = 2; /// Status of RLE encoding of huffman code lengths.
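// A minimal standalone sketch of the LSB-first accumulation that `put_bits`
// above performs (the `BitSink` type and names are illustrative, not from
// this crate): new bits are OR-ed in above the bits already held, and any
// completed low bytes are drained to the output, mirroring the flush loop
// in `put_bits`.
//
//     struct BitSink {
//         bit_buffer: u32,
//         bits_in: u32,
//         out: Vec<u8>,
//     }
//
//     impl BitSink {
//         fn put_bits(&mut self, bits: u32, len: u32) {
//             debug_assert!(bits < (1 << len));
//             self.bit_buffer |= bits << self.bits_in;
//             self.bits_in += len;
//             // Flush any whole bytes, low byte first.
//             while self.bits_in >= 8 {
//                 self.out.push(self.bit_buffer as u8);
//                 self.bit_buffer >>= 8;
//                 self.bits_in -= 8;
//             }
//         }
//     }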
struct Rle { pub z_count: u32, - pub repeat_count: u32, + pub repeat_count: u16, pub prev_code_size: u8, } @@ -792,7 +728,7 @@ impl Rle { if self.repeat_count != 0 { if self.repeat_count < 3 { counts[self.prev_code_size as usize] = - counts[self.prev_code_size as usize].wrapping_add(self.repeat_count as u16); + counts[self.prev_code_size as usize].wrapping_add(self.repeat_count); let code = self.prev_code_size; write(&[code, code, code][..self.repeat_count as usize])?; } else { @@ -977,7 +913,7 @@ impl HuffmanOxide { code_size_limit: usize, static_table: bool, ) { - let mut num_codes = [0i32; MAX_SUPPORTED_HUFF_CODESIZE + 1]; + let mut num_codes = [0i32; 32 + 1]; let mut next_code = [0u32; MAX_SUPPORTED_HUFF_CODESIZE + 1]; if static_table { @@ -1017,8 +953,8 @@ impl HuffmanOxide { Self::enforce_max_code_size(&mut num_codes, num_used_symbols, code_size_limit); - memset(&mut self.code_sizes[table_num][..], 0); - memset(&mut self.codes[table_num][..], 0); + self.code_sizes[table_num].fill(0); + self.codes[table_num].fill(0); let mut last = num_used_symbols; for (i, &num_item) in num_codes @@ -1051,25 +987,23 @@ impl HuffmanOxide { continue; } - let mut code = next_code[code_size as usize]; + let code = next_code[code_size as usize]; + next_code[code_size as usize] += 1; - let mut rev_code = 0; - for _ in 0..code_size { - rev_code = (rev_code << 1) | (code & 1); - code >>= 1; - } - *huff_code = rev_code as u16; + let rev_code = (code as u16).reverse_bits() >> (16 - code_size); + + *huff_code = rev_code; } } fn start_static_block(&mut self, output: &mut OutputBufferOxide) { - memset(&mut self.code_sizes[LITLEN_TABLE][0..144], 8); - memset(&mut self.code_sizes[LITLEN_TABLE][144..256], 9); - memset(&mut self.code_sizes[LITLEN_TABLE][256..280], 7); - memset(&mut self.code_sizes[LITLEN_TABLE][280..288], 8); + self.code_sizes[LITLEN_TABLE][0..144].fill(8); + self.code_sizes[LITLEN_TABLE][144..256].fill(9); + self.code_sizes[LITLEN_TABLE][256..280].fill(7); + self.code_sizes[LITLEN_TABLE][280..288].fill(8); - memset(&mut self.code_sizes[DIST_TABLE][..32], 5); + self.code_sizes[DIST_TABLE][..32].fill(5); self.optimize_table(LITLEN_TABLE, 288, 15, true); self.optimize_table(DIST_TABLE, 32, 15, true); @@ -1114,7 +1048,7 @@ impl HuffmanOxide { prev_code_size: 0xFF, }; - memset(&mut self.count[HUFF_CODES_TABLE][..MAX_HUFF_SYMBOLS_2], 0); + self.count[HUFF_CODES_TABLE][..MAX_HUFF_SYMBOLS_2].fill(0); let mut packed_pos = 0; for &code_size in &code_sizes_to_pack[..total_code_sizes_to_pack] { @@ -1149,10 +1083,10 @@ impl HuffmanOxide { self.optimize_table(2, MAX_HUFF_SYMBOLS_2, 7, false); - output.put_bits(2, 2); + output.put_bits_no_flush(2, 2); - output.put_bits((num_lit_codes - 257) as u32, 5); - output.put_bits((num_dist_codes - 1) as u32, 5); + output.put_bits_no_flush((num_lit_codes - 257) as u32, 5); + output.put_bits_no_flush((num_dist_codes - 1) as u32, 5); let mut num_bit_lengths = 18 - HUFFMAN_LENGTH_ORDER @@ -1192,18 +1126,19 @@ impl HuffmanOxide { } } -struct DictOxide { +pub(crate) struct DictOxide { /// The maximum number of checks in the hash chain, for the initial, /// and the lazy match respectively. pub max_probes: [u32; 2], /// Buffer of input data. /// Padded with 1 byte to simplify matching code in `compress_fast`. 
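// A quick standalone sketch (names here are illustrative) of why the
// `reverse_bits` form used above is equivalent to the manual reversal loop
// it replaces: reversing all 16 bits and shifting down leaves exactly the
// low `code_size` bits in reversed order, for any `code_size` in 1..=15 and
// `code < (1 << code_size)`.
//
//     fn rev_manual(mut code: u32, code_size: u32) -> u16 {
//         let mut rev = 0;
//         for _ in 0..code_size {
//             rev = (rev << 1) | (code & 1);
//             code >>= 1;
//         }
//         rev as u16
//     }
//
//     fn rev_shift(code: u32, code_size: u32) -> u16 {
//         (code as u16).reverse_bits() >> (16 - code_size)
//     }
//
//     for code_size in 1..=15u32 {
//         for code in 0..(1u32 << code_size) {
//             assert_eq!(rev_manual(code, code_size), rev_shift(code, code_size));
//         }
//     }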
- pub b: Box<HashBuffers>, + pub b: HashBuffers, pub code_buf_dict_pos: usize, pub lookahead_size: usize, pub lookahead_pos: usize, pub size: usize, + loop_len: u8, } const fn probes_from_flags(flags: u32) -> [u32; 2] { @@ -1217,11 +1152,12 @@ impl DictOxide { fn new(flags: u32) -> Self { DictOxide { max_probes: probes_from_flags(flags), - b: Box::default(), + b: HashBuffers::default(), code_buf_dict_pos: 0, lookahead_size: 0, lookahead_pos: 0, size: 0, + loop_len: 32, } } @@ -1265,13 +1201,6 @@ impl DictOxide { u64::from_le_bytes(bytes) } - /// Do an unaligned read of the data at `pos` in the dictionary and treat it as if it was of - /// type T. - #[inline] - fn read_as_u16(&self, pos: usize) -> u16 { - read_u16_le(&self.b.dict[..], pos) - } - /// Try to find a match for the data at lookahead_pos in the dictionary that is /// longer than `match_len`. /// Returns a tuple containing (match_distance, match_length). Will be equal to the input @@ -1292,20 +1221,24 @@ impl DictOxide { let max_match_len = cmp::min(MAX_MATCH_LEN as u32, max_match_len); match_len = cmp::max(match_len, 1); - let pos = lookahead_pos & LZ_DICT_SIZE_MASK; - let mut probe_pos = pos; - // Number of probes into the hash chains. - let mut num_probes_left = self.max_probes[(match_len >= 32) as usize]; - // If we already have a match of the full length don't bother searching for another one. if max_match_len <= match_len { return (match_dist, match_len); } + let pos = lookahead_pos & LZ_DICT_SIZE_MASK; + let mut probe_pos = pos; + // Number of probes into the hash chains. + let mut num_probes_left = if match_len < 32 { + self.max_probes[0] + } else { + self.max_probes[1] + }; + // Read the last byte of the current match, and the next one, used to compare matches. - let mut c01: u16 = self.read_as_u16(pos + match_len as usize - 1); + let mut c01: u16 = read_u16_le(&self.b.dict, pos + match_len as usize - 1); // Read the two bytes at the end position of the current match. - let s01: u16 = self.read_as_u16(pos); + let s01: u16 = read_u16_le(&self.b.dict, pos); 'outer: loop { let mut dist; @@ -1332,7 +1265,8 @@ impl DictOxide { // position to match against. probe_pos = next_probe_pos & LZ_DICT_SIZE_MASK; - if self.read_as_u16(probe_pos + match_len as usize - 1) == c01 { + // TODO: This bounds check does not get optimized out + if read_u16_le(&self.b.dict, probe_pos + match_len as usize - 1) == c01 { break 'found; } } @@ -1345,14 +1279,17 @@ impl DictOxide { } // Check if the first two bytes match. - if self.read_as_u16(probe_pos) != s01 { + if read_u16_le(&self.b.dict, probe_pos) != s01 { continue; } let mut p = pos + 2; let mut q = probe_pos + 2; // The first two bytes matched, so check the full length of the match. - for _ in 0..32 { + // TODO: This is a workaround for an upstream issue introduced after an LLVM upgrade in Rust 1.82. + // The compiler is too smart and ends up unrolling the loop, which causes the performance to get worse. + // Using a variable instead of a constant here to prevent that seems to at least get back some of the performance loss. + for _ in 0..self.loop_len as i32 { let p_data: u64 = self.read_unaligned_u64(p); let q_data: u64 = self.read_unaligned_u64(q); // Compare of 8 bytes at a time by using unaligned loads of 64-bit integers. @@ -1375,7 +1312,7 @@ impl DictOxide { } // We found a better match, so save the last two bytes for further match // comparisons.
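// A minimal sketch of the 8-bytes-at-a-time comparison used in the loop
// above (the helper name is illustrative): XOR the two little-endian 64-bit
// loads; if they differ, `trailing_zeros` of the XOR gives the bit offset of
// the first mismatch, and dividing by 8 turns that into the number of
// matching leading bytes.
//
//     fn matching_prefix_len(a: [u8; 8], b: [u8; 8]) -> u32 {
//         let xor = u64::from_le_bytes(a) ^ u64::from_le_bytes(b);
//         if xor == 0 {
//             8
//         } else {
//             xor.trailing_zeros() / 8
//         }
//     }
//
//     assert_eq!(matching_prefix_len(*b"deflated", *b"deflates"), 7);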
- c01 = self.read_as_u16(pos + match_len as usize - 1) + c01 = read_u16_le(&self.b.dict, pos + match_len as usize - 1); } continue 'outer; } @@ -1386,7 +1323,7 @@ impl DictOxide { } } -struct ParamsOxide { +pub(crate) struct ParamsOxide { pub flags: u32, pub greedy_parsing: bool, pub block_index: u32, @@ -1461,7 +1398,7 @@ impl ParamsOxide { } } -struct LZOxide { +pub(crate) struct LZOxide { pub codes: [u8; LZ_CODE_BUF_SIZE], pub code_position: usize, pub flag_position: usize, @@ -1521,7 +1458,8 @@ impl LZOxide { fn compress_lz_codes( huff: &HuffmanOxide, output: &mut OutputBufferOxide, - lz_code_buf: &[u8], + lz_code_buf: &[u8; LZ_CODE_BUF_SIZE], + lz_code_buf_used_len: usize, ) -> Result<bool> { let mut flags = 1; let mut bb = BitBuffer { bit_buffer: u64::from(output.bit_buffer), bits_in: output.bits_in, }; + // Help out the compiler know this variable won't be larger than + // the buffer length since the constants won't propagate through the function call. + let lz_code_buf_used_len = cmp::min(lz_code_buf.len(), lz_code_buf_used_len); + let mut i: usize = 0; - while i < lz_code_buf.len() { + while i < lz_code_buf_used_len { if flags == 1 { flags = u32::from(lz_code_buf[i]) | 0x100; i += 1; @@ -1580,7 +1522,7 @@ fn compress_lz_codes( // The lz code was a literal for _ in 0..3 { flags >>= 1; - let lit = lz_code_buf[i]; + let lit = lz_code_buf[i & (LZ_CODE_BUF_SIZE - 1)]; i += 1; debug_assert!(huff.code_sizes[0][lit as usize] != 0); @@ -1589,7 +1531,7 @@ fn compress_lz_codes( u32::from(huff.code_sizes[0][lit as usize]), ); - if flags & 1 == 1 || i >= lz_code_buf.len() { + if flags & 1 == 1 || i >= lz_code_buf_used_len { break; } } @@ -1628,10 +1570,10 @@ fn compress_block( huff.start_dynamic_block(output)?; } - compress_lz_codes(huff, output, &lz.codes[..lz.code_position]) + compress_lz_codes(huff, output, &lz.codes, lz.code_position) } -fn flush_block( +pub(crate) fn flush_block( d: &mut CompressorOxide, callback: &mut CallbackOxide, flush: TDEFLFlush, ) -> Result<i32> { @@ -1644,8 +1586,13 @@ fn flush_block( output.bit_buffer = d.params.saved_bit_buffer; output.bits_in = d.params.saved_bits_in; + // TODO: Don't think this second condition should be here but need to verify. let use_raw_block = (d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0) && (d.dict.lookahead_pos - d.dict.code_buf_dict_pos) <= d.dict.size; + debug_assert_eq!( + use_raw_block, + d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0 + ); assert!(d.params.flush_remaining == 0); d.params.flush_ofs = 0; @@ -1656,7 +1603,7 @@ fn flush_block( // If we are at the start of the stream, write the zlib header if requested. if d.params.flags & TDEFL_WRITE_ZLIB_HEADER != 0 && d.params.block_index == 0 { let header = zlib::header_from_flags(d.params.flags); - output.put_bits(header[0].into(), 8); + output.put_bits_no_flush(header[0].into(), 8); output.put_bits(header[1].into(), 8); } @@ -1692,7 +1639,7 @@ fn flush_block( // Block header. output.put_bits(0, 2); - // Block length has to start on a byte boundary, s opad. + // Block length has to start on a byte boundary, so pad. output.pad_to_bytes(); // Block length and ones complement of block length. output.put_bits(d.lz.total_bytes & 0xFFFF, 16); output.put_bits(!d.lz.total_bytes & 0xFFFF, 16); // Write the actual bytes.
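// As a concrete example of this framing (byte values follow from RFC 1951,
// not from this code): an empty *final* stored block is BFINAL=1 and
// BTYPE=00 padded out to a byte boundary, then LEN and its ones' complement
// NLEN, i.e. the five bytes
//
//     let empty_final_stored_block: [u8; 5] = [0x01, 0x00, 0x00, 0xFF, 0xFF];
//
// The zero-length case is exactly the one the 0.8.5 stored-block fix in the
// changelog above addresses.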
- for i in 0..d.lz.total_bytes { - let pos = (d.dict.code_buf_dict_pos + i as usize) & LZ_DICT_SIZE_MASK; - output.put_bits(u32::from(d.dict.b.dict[pos]), 8); + let start = d.dict.code_buf_dict_pos & LZ_DICT_SIZE_MASK; + let end = (d.dict.code_buf_dict_pos + d.lz.total_bytes as usize) & LZ_DICT_SIZE_MASK; + let dict = &mut d.dict.b.dict; + if start < end { + // The data does not wrap around. + output.write_bytes(&dict[start..end]); + } else if d.lz.total_bytes > 0 { + // The data wraps around and the input was not 0 bytes. + output.write_bytes(&dict[start..LZ_DICT_SIZE]); + output.write_bytes(&dict[..end]); } } else if !comp_success { output.load(saved_buffer); @@ -1729,9 +1683,10 @@ fn flush_block( } } - memset(&mut d.huff.count[0][..MAX_HUFF_SYMBOLS_0], 0); - memset(&mut d.huff.count[1][..MAX_HUFF_SYMBOLS_1], 0); + d.huff.count[0][..MAX_HUFF_SYMBOLS_0].fill(0); + d.huff.count[1][..MAX_HUFF_SYMBOLS_1].fill(0); + // Clear LZ buffer for the next block. d.lz.code_position = 1; d.lz.flag_position = 0; d.lz.num_flags_left = 8; @@ -1748,7 +1703,7 @@ fn flush_block( Ok(callback.flush_output(saved_buffer, &mut d.params)) } -fn record_literal(h: &mut HuffmanOxide, lz: &mut LZOxide, lit: u8) { +pub(crate) fn record_literal(h: &mut HuffmanOxide, lz: &mut LZOxide, lit: u8) { lz.total_bytes += 1; lz.write_code(lit); @@ -1785,12 +1740,12 @@ fn record_match(h: &mut HuffmanOxide, lz: &mut LZOxide, mut match_len: u32, mut } fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool { - let mut src_pos = d.params.src_pos; let in_buf = match callback.in_buf { None => return true, Some(in_buf) => in_buf, }; + let mut src_pos = d.params.src_pos; let mut lookahead_size = d.dict.lookahead_size; let mut lookahead_pos = d.dict.lookahead_pos; let mut saved_lit = d.params.saved_lit; @@ -1872,9 +1827,9 @@ fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> boo u32::from(MIN_MATCH_LEN) - 1 }; let cur_pos = lookahead_pos & LZ_DICT_SIZE_MASK; - if d.params.flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS) != 0 { + if d.params.flags & TDEFL_RLE_MATCHES != 0 { // If TDEFL_RLE_MATCHES is set, we only look for repeating sequences of the current byte. - if d.dict.size != 0 && d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS == 0 { + if d.dict.size != 0 { let c = d.dict.b.dict[(cur_pos.wrapping_sub(1)) & LZ_DICT_SIZE_MASK]; cur_match_len = d.dict.b.dict[cur_pos..(cur_pos + lookahead_size)] .iter() @@ -1949,11 +1904,10 @@ fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> boo d.dict.size = cmp::min(d.dict.size + len_to_move, LZ_DICT_SIZE); let lz_buf_tight = d.lz.code_position > LZ_CODE_BUF_SIZE - 8; - let raw = d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0; let fat = ((d.lz.code_position * 115) >> 7) >= d.lz.total_bytes as usize; - let fat_or_raw = (d.lz.total_bytes > 31 * 1024) && (fat || raw); + let buf_fat = (d.lz.total_bytes > 31 * 1024) && fat; - if lz_buf_tight || fat_or_raw { + if lz_buf_tight || buf_fat { d.params.src_pos = src_pos; // These values are used in flush_block, so we need to write them back here. 
d.dict.lookahead_size = lookahead_size; @@ -2278,13 +2232,15 @@ fn compress_inner( return res; } - let one_probe = d.params.flags & MAX_PROBES_MASK as u32 == 1; + let one_probe = d.params.flags & MAX_PROBES_MASK == 1; let greedy = d.params.flags & TDEFL_GREEDY_PARSING_FLAG != 0; - let filter_or_rle_or_raw = d.params.flags - & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS | TDEFL_RLE_MATCHES) - != 0; + let filter_or_rle = d.params.flags & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS) != 0; - let compress_success = if one_probe && greedy && !filter_or_rle_or_raw { + let raw = d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0; + + let compress_success = if raw { + compress_stored(d, callback) + } else if one_probe && greedy && !filter_or_rle { compress_fast(d, callback) } else { compress_normal(d, callback) @@ -2328,8 +2284,8 @@ fn compress_inner( _ => { d.params.finished = d.params.flush == TDEFLFlush::Finish; if d.params.flush == TDEFLFlush::Full { - memset(&mut d.dict.b.hash[..], 0); - memset(&mut d.dict.b.next[..], 0); + d.dict.b.hash.fill(0); + d.dict.b.next.fill(0); d.dict.size = 0; } } @@ -2365,7 +2321,7 @@ pub fn create_comp_flags_from_zip_params(level: i32, window_bits: i32, strategy: } else { 0 }; - let mut comp_flags = NUM_PROBES[num_probes] | greedy; + let mut comp_flags = u32::from(NUM_PROBES[num_probes]) | greedy; if window_bits > 0 { comp_flags |= TDEFL_WRITE_ZLIB_HEADER; @@ -2376,7 +2332,7 @@ pub fn create_comp_flags_from_zip_params(level: i32, window_bits: i32, strategy: } else if strategy == CompressionStrategy::Filtered as i32 { comp_flags |= TDEFL_FILTER_MATCHES; } else if strategy == CompressionStrategy::HuffmanOnly as i32 { - comp_flags &= !MAX_PROBES_MASK as u32; + comp_flags &= !MAX_PROBES_MASK; } else if strategy == CompressionStrategy::Fixed as i32 { comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS; } else if strategy == CompressionStrategy::RLE as i32 { @@ -2466,4 +2422,45 @@ mod test { let decoded = decompress_to_vec(&encoded[..]).unwrap(); assert_eq!(&decoded[..], &slice[..]); } + + #[test] + fn zlib_window_bits() { + use crate::inflate::stream::{inflate, InflateState}; + use crate::DataFormat; + use alloc::boxed::Box; + let slice = [ + 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 1, 2, 6, 1, 2, 3, 1, 2, 3, 2, 3, 1, 2, 3, 35, 22, 22, 2, + 6, 2, 6, + ]; + let mut encoded = vec![]; + let flags = create_comp_flags_from_zip_params(2, 1, CompressionStrategy::RLE.into()); + let mut d = CompressorOxide::new(flags); + let (status, in_consumed) = + compress_to_output(&mut d, &slice, TDEFLFlush::Finish, |out: &[u8]| { + encoded.extend_from_slice(out); + true + }); + + assert_eq!(status, TDEFLStatus::Done); + assert_eq!(in_consumed, slice.len()); + + let mut output = vec![0; slice.len()]; + + let mut decompressor = Box::new(InflateState::new(DataFormat::Zlib)); + + let mut out_slice = output.as_mut_slice(); + // Feed 1 byte at a time and no back buffer to test that RLE encoding has been used. 
+ for i in 0..encoded.len() { + let result = inflate( + &mut decompressor, + &encoded[i..i + 1], + out_slice, + crate::MZFlush::None, + ); + out_slice = &mut out_slice[result.bytes_written..]; + } + let cmf = decompressor.decompressor().zlib_header().0; + assert_eq!(cmf, 8); + assert_eq!(output, slice) + } } diff --git a/miniz_oxide/src/deflate/mod.rs b/miniz_oxide/src/deflate/mod.rs index f36f28c8..f31c587c 100644 --- a/miniz_oxide/src/deflate/mod.rs +++ b/miniz_oxide/src/deflate/mod.rs @@ -5,7 +5,9 @@ use crate::alloc::vec::Vec; mod buffer; pub mod core; +mod stored; pub mod stream; +mod zlib; use self::core::*; /// How much processing the compressor should do to compress the data. @@ -188,6 +190,15 @@ mod test { assert_eq!(test_data, d.as_slice()); } + #[test] + fn compress_rle() { + let test_data = b"Deflate late"; + + let res = compress_to_vec_inner(test_data, 1, 0, CompressionStrategy::RLE as i32); + let d = decompress_to_vec(res.as_slice()).expect("Failed to decompress!"); + assert_eq!(test_data, d.as_slice()); + } + /// Test that a raw block compresses fine. #[test] fn compress_raw() { diff --git a/miniz_oxide/src/deflate/stored.rs b/miniz_oxide/src/deflate/stored.rs new file mode 100644 index 00000000..166d31a6 --- /dev/null +++ b/miniz_oxide/src/deflate/stored.rs @@ -0,0 +1,305 @@ +use crate::deflate::buffer::{update_hash, LZ_HASH_SHIFT, LZ_HASH_SIZE}; +use crate::deflate::core::{ + flush_block, CallbackOxide, CompressorOxide, TDEFLFlush, TDEFLStatus, LZ_DICT_SIZE, + LZ_DICT_SIZE_MASK, MAX_MATCH_LEN, MIN_MATCH_LEN, +}; +use core::cmp; + +pub(crate) fn compress_stored(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool { + let in_buf = match callback.buf() { + None => return true, + Some(in_buf) => in_buf, + }; + + // Make sure this is cleared in case compression level is switched later. + // TODO: It's possible we don't need this or could do this elsewhere later + // but just do this here to avoid causing issues for now. + d.params.saved_match_len = 0; + let mut bytes_written = d.lz.total_bytes; + let mut src_pos = d.params.src_pos; + let mut lookahead_size = d.dict.lookahead_size; + let mut lookahead_pos = d.dict.lookahead_pos; + + while src_pos < in_buf.len() || (d.params.flush != TDEFLFlush::None && lookahead_size != 0) { + let src_buf_left = in_buf.len() - src_pos; + let num_bytes_to_process = cmp::min(src_buf_left, MAX_MATCH_LEN - lookahead_size); + + if lookahead_size + d.dict.size >= usize::from(MIN_MATCH_LEN) - 1 + && num_bytes_to_process > 0 + { + let dictb = &mut d.dict.b; + + let mut dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + let mut ins_pos = lookahead_pos + lookahead_size - 2; + // Start the hash value from the first two bytes + let mut hash = update_hash( + u16::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]), + dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK], + ); + + lookahead_size += num_bytes_to_process; + + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + // Add byte to input buffer. + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + // Generate hash from the current byte, + hash = update_hash(hash, c); + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + // and insert it into the hash chain. 
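// A minimal standalone sketch of the hash-chain insertion idiom used here
// (the `insert` helper is illustrative): `hash[h]` holds the most recent
// position seen with hash `h`, and `next[pos]` links each position to the
// previous one with the same hash, so pushing `ins_pos` onto the front of
// the chain takes exactly the pair of stores around this point.
//
//     fn insert(hash: &mut [u16], next: &mut [u16], h: usize, ins_pos: usize, mask: usize) {
//         next[ins_pos & mask] = hash[h]; // link to the old chain head
//         hash[h] = ins_pos as u16;       // the new position becomes the head
//     }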
+ dictb.hash[hash as usize] = ins_pos as u16; + dst_pos = (dst_pos + 1) & LZ_DICT_SIZE_MASK; + ins_pos += 1; + } + src_pos += num_bytes_to_process; + } else { + let dictb = &mut d.dict.b; + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + let dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + lookahead_size += 1; + if lookahead_size + d.dict.size >= MIN_MATCH_LEN.into() { + let ins_pos = lookahead_pos + lookahead_size - 3; + let hash = ((u32::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]) + << (LZ_HASH_SHIFT * 2)) + ^ ((u32::from(dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK]) + << LZ_HASH_SHIFT) + ^ u32::from(c))) + & (LZ_HASH_SIZE as u32 - 1); + + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + dictb.hash[hash as usize] = ins_pos as u16; + } + } + + src_pos += num_bytes_to_process; + } + + d.dict.size = cmp::min(LZ_DICT_SIZE - lookahead_size, d.dict.size); + if d.params.flush == TDEFLFlush::None && lookahead_size < MAX_MATCH_LEN { + break; + } + + let len_to_move = 1; + + bytes_written += 1; + + lookahead_pos += len_to_move; + assert!(lookahead_size >= len_to_move); + lookahead_size -= len_to_move; + d.dict.size = cmp::min(d.dict.size + len_to_move, LZ_DICT_SIZE); + + if bytes_written > 31 * 1024 { + d.lz.total_bytes = bytes_written; + + d.params.src_pos = src_pos; + // These values are used in flush_block, so we need to write them back here. + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + + let n = flush_block(d, callback, TDEFLFlush::None) + .unwrap_or(TDEFLStatus::PutBufFailed as i32); + if n != 0 { + return n > 0; + } + bytes_written = d.lz.total_bytes; + } + } + + d.lz.total_bytes = bytes_written; + d.params.src_pos = src_pos; + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + true +} + +/* +fn compress_rle(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool { + let mut src_pos = d.params.src_pos; + let in_buf = match callback.in_buf { + None => return true, + Some(in_buf) => in_buf, + }; + + let mut lookahead_size = d.dict.lookahead_size; + let mut lookahead_pos = d.dict.lookahead_pos; + let mut saved_lit = d.params.saved_lit; + let mut saved_match_dist = d.params.saved_match_dist; + let mut saved_match_len = d.params.saved_match_len; + + while src_pos < in_buf.len() || (d.params.flush != TDEFLFlush::None && lookahead_size != 0) { + let src_buf_left = in_buf.len() - src_pos; + let num_bytes_to_process = cmp::min(src_buf_left, MAX_MATCH_LEN - lookahead_size); + + if lookahead_size + d.dict.size >= usize::from(MIN_MATCH_LEN) - 1 + && num_bytes_to_process > 0 + { + let dictb = &mut d.dict.b; + + let mut dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + let mut ins_pos = lookahead_pos + lookahead_size - 2; + // Start the hash value from the first two bytes + let mut hash = update_hash( + u16::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]), + dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK], + ); + + lookahead_size += num_bytes_to_process; + + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + // Add byte to input buffer. + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + // Generate hash from the current byte, + hash = update_hash(hash, c); + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + // and insert it into the hash chain. 
+ dictb.hash[hash as usize] = ins_pos as u16; + dst_pos = (dst_pos + 1) & LZ_DICT_SIZE_MASK; + ins_pos += 1; + } + src_pos += num_bytes_to_process; + } else { + let dictb = &mut d.dict.b; + for &c in &in_buf[src_pos..src_pos + num_bytes_to_process] { + let dst_pos = (lookahead_pos + lookahead_size) & LZ_DICT_SIZE_MASK; + dictb.dict[dst_pos] = c; + if dst_pos < MAX_MATCH_LEN - 1 { + dictb.dict[LZ_DICT_SIZE + dst_pos] = c; + } + + lookahead_size += 1; + if lookahead_size + d.dict.size >= MIN_MATCH_LEN.into() { + let ins_pos = lookahead_pos + lookahead_size - 3; + let hash = ((u32::from(dictb.dict[ins_pos & LZ_DICT_SIZE_MASK]) + << (LZ_HASH_SHIFT * 2)) + ^ ((u32::from(dictb.dict[(ins_pos + 1) & LZ_DICT_SIZE_MASK]) + << LZ_HASH_SHIFT) + ^ u32::from(c))) + & (LZ_HASH_SIZE as u32 - 1); + + dictb.next[ins_pos & LZ_DICT_SIZE_MASK] = dictb.hash[hash as usize]; + dictb.hash[hash as usize] = ins_pos as u16; + } + } + + src_pos += num_bytes_to_process; + } + + d.dict.size = cmp::min(LZ_DICT_SIZE - lookahead_size, d.dict.size); + if d.params.flush == TDEFLFlush::None && lookahead_size < MAX_MATCH_LEN { + break; + } + + let mut len_to_move = 1; + let mut cur_match_dist = 0; + let mut cur_match_len = if saved_match_len != 0 { + saved_match_len + } else { + u32::from(MIN_MATCH_LEN) - 1 + }; + let cur_pos = lookahead_pos & LZ_DICT_SIZE_MASK; + // If TDEFL_RLE_MATCHES is set, we only look for repeating sequences of the current byte. + if d.dict.size != 0 && d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS == 0 { + let c = d.dict.b.dict[(cur_pos.wrapping_sub(1)) & LZ_DICT_SIZE_MASK]; + cur_match_len = d.dict.b.dict[cur_pos..(cur_pos + lookahead_size)] + .iter() + .take_while(|&x| *x == c) + .count() as u32; + if cur_match_len < MIN_MATCH_LEN.into() { + cur_match_len = 0 + } else { + cur_match_dist = 1 + } + } + + + let far_and_small = cur_match_len == MIN_MATCH_LEN.into() && cur_match_dist >= 8 * 1024; + let filter_small = d.params.flags & TDEFL_FILTER_MATCHES != 0 && cur_match_len <= 5; + if far_and_small || filter_small || cur_pos == cur_match_dist as usize { + cur_match_dist = 0; + cur_match_len = 0; + } + + if saved_match_len != 0 { + if cur_match_len > saved_match_len { + record_literal(&mut d.huff, &mut d.lz, saved_lit); + if cur_match_len >= 128 { + record_match(&mut d.huff, &mut d.lz, cur_match_len, cur_match_dist); + saved_match_len = 0; + len_to_move = cur_match_len as usize; + } else { + saved_lit = d.dict.b.dict[cur_pos]; + saved_match_dist = cur_match_dist; + saved_match_len = cur_match_len; + } + } else { + record_match(&mut d.huff, &mut d.lz, saved_match_len, saved_match_dist); + len_to_move = (saved_match_len - 1) as usize; + saved_match_len = 0; + } + } else if cur_match_dist == 0 { + record_literal( + &mut d.huff, + &mut d.lz, + d.dict.b.dict[cmp::min(cur_pos, d.dict.b.dict.len() - 1)], + ); + } else if d.params.greedy_parsing + || (d.params.flags & TDEFL_RLE_MATCHES != 0) + || cur_match_len >= 128 + { + // If we are using lazy matching, check for matches at the next byte if the current + // match was shorter than 128 bytes. 
+ record_match(&mut d.huff, &mut d.lz, cur_match_len, cur_match_dist); + len_to_move = cur_match_len as usize; + } else { + saved_lit = d.dict.b.dict[cmp::min(cur_pos, d.dict.b.dict.len() - 1)]; + saved_match_dist = cur_match_dist; + saved_match_len = cur_match_len; + } + + lookahead_pos += len_to_move; + assert!(lookahead_size >= len_to_move); + lookahead_size -= len_to_move; + d.dict.size = cmp::min(d.dict.size + len_to_move, LZ_DICT_SIZE); + + let lz_buf_tight = d.lz.code_position > LZ_CODE_BUF_SIZE - 8; + let raw = d.params.flags & TDEFL_FORCE_ALL_RAW_BLOCKS != 0; + let fat = ((d.lz.code_position * 115) >> 7) >= d.lz.total_bytes as usize; + let fat_or_raw = (d.lz.total_bytes > 31 * 1024) && (fat || raw); + + if lz_buf_tight || fat_or_raw { + d.params.src_pos = src_pos; + // These values are used in flush_block, so we need to write them back here. + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + + let n = flush_block(d, callback, TDEFLFlush::None) + .unwrap_or(TDEFLStatus::PutBufFailed as i32); + if n != 0 { + d.params.saved_lit = saved_lit; + d.params.saved_match_dist = saved_match_dist; + d.params.saved_match_len = saved_match_len; + return n > 0; + } + } + } + + d.params.src_pos = src_pos; + d.dict.lookahead_size = lookahead_size; + d.dict.lookahead_pos = lookahead_pos; + d.params.saved_lit = saved_lit; + d.params.saved_match_dist = saved_match_dist; + d.params.saved_match_len = saved_match_len; + true +}*/ diff --git a/miniz_oxide/src/deflate/zlib.rs b/miniz_oxide/src/deflate/zlib.rs new file mode 100644 index 00000000..281c4f17 --- /dev/null +++ b/miniz_oxide/src/deflate/zlib.rs @@ -0,0 +1,112 @@ +use crate::deflate::core::deflate_flags::{ + TDEFL_FORCE_ALL_RAW_BLOCKS, TDEFL_GREEDY_PARSING_FLAG, TDEFL_RLE_MATCHES, +}; + +const DEFAULT_CM: u8 = 8; +const DEFAULT_CINFO: u8 = 7 << 4; +const _DEFAULT_FDICT: u8 = 0; +const DEFAULT_CMF: u8 = DEFAULT_CM | DEFAULT_CINFO; +// CMF used for RLE (technically it uses a window size of 0 but the lowest that can +// be specified in the header corresponds to a window size of 1 << (0 + 8) aka 256). +const MIN_CMF: u8 = DEFAULT_CM; // | 0 +/// The 16-bit value consisting of CMF and FLG must be divisible by this to be valid. +const FCHECK_DIVISOR: u8 = 31; + +/// Generate FCHECK from CMF and FLG (without FCHECK) so that they are correct according to the +/// specification, i.e. (CMF*256 + FLG) % 31 = 0. +/// Returns flg with the FCHECK bits added (any existing FCHECK bits are ignored). +#[inline] +fn add_fcheck(cmf: u8, flg: u8) -> u8 { + let rem = ((usize::from(cmf) * 256) + usize::from(flg)) % usize::from(FCHECK_DIVISOR); + + // Clear existing FCHECK if any + let flg = flg & 0b11100000; + + // Casting is safe as rem can't overflow since it is a value mod 31 + // We can simply add the value to flg as (31 - rem) will never be above 2^5 + flg + (FCHECK_DIVISOR - rem as u8) +} + +#[inline] +const fn zlib_level_from_flags(flags: u32) -> u8 { + use crate::deflate::core::NUM_PROBES; + + let num_probes = flags & super::MAX_PROBES_MASK; + if (flags & TDEFL_GREEDY_PARSING_FLAG != 0) || (flags & TDEFL_RLE_MATCHES != 0) { + if num_probes <= 1 { + 0 + } else { + 1 + } + } else if num_probes >= NUM_PROBES[9] as u32 { + 3 + } else { + 2 + } +} + +#[inline] +const fn cmf_from_flags(flags: u32) -> u8 { + if (flags & TDEFL_RLE_MATCHES == 0) && (flags & TDEFL_FORCE_ALL_RAW_BLOCKS == 0) { + DEFAULT_CMF + // If we are using RLE encoding or no compression the window bits can be set as the + // minimum.
+ } else { + MIN_CMF + } +} + +/// Get the zlib header for the level using the default window size and no +/// dictionary. +#[inline] +fn header_from_level(level: u8, flags: u32) -> [u8; 2] { + let cmf = cmf_from_flags(flags); + [cmf, add_fcheck(cmf, level << 6)] +} + +/// Create a zlib header from the given compression flags. +/// Only level is considered. +#[inline] +pub fn header_from_flags(flags: u32) -> [u8; 2] { + let level = zlib_level_from_flags(flags); + header_from_level(level, flags) +} + +#[cfg(test)] +mod test { + use crate::shared::MZ_DEFAULT_WINDOW_BITS; + #[test] + fn zlib() { + use super::super::*; + use super::*; + + let test_level = |level, expected| { + let flags = create_comp_flags_from_zip_params( + level, + MZ_DEFAULT_WINDOW_BITS, + CompressionStrategy::Default as i32, + ); + assert_eq!(zlib_level_from_flags(flags), expected); + }; + + assert_eq!(zlib_level_from_flags(DEFAULT_FLAGS), 2); + test_level(0, 0); + test_level(1, 0); + test_level(2, 1); + test_level(3, 1); + for i in 4..=8 { + test_level(i, 2) + } + test_level(9, 3); + test_level(10, 3); + } + + #[test] + fn test_header() { + let header = super::header_from_level(3, 0); + assert_eq!( + ((usize::from(header[0]) * 256) + usize::from(header[1])) % 31, + 0 + ); + } +} diff --git a/miniz_oxide/src/inflate/core.rs b/miniz_oxide/src/inflate/core.rs index 738de236..1b6149f6 100644 --- a/miniz_oxide/src/inflate/core.rs +++ b/miniz_oxide/src/inflate/core.rs @@ -4,17 +4,16 @@ use super::*; use crate::shared::{update_adler32, HUFFMAN_LENGTH_ORDER}; use ::core::cell::Cell; +use ::core::cmp; use ::core::convert::TryInto; -use ::core::{cmp, slice}; -use self::output_buffer::OutputBuffer; +use self::output_buffer::{InputWrapper, OutputBuffer}; pub const TINFL_LZ_DICT_SIZE: usize = 32_768; /// A struct containing huffman code lengths and the huffman code tree used by the decompressor. +#[derive(Clone)] struct HuffmanTable { - /// Length of the code at each index. - pub code_size: [u8; MAX_HUFF_SYMBOLS_0], /// Fast lookup table for shorter huffman codes. /// /// See `HuffmanTable::fast_lookup`. @@ -29,7 +28,6 @@ struct HuffmanTable { impl HuffmanTable { const fn new() -> HuffmanTable { HuffmanTable { - code_size: [0; MAX_HUFF_SYMBOLS_0], look_up: [0; FAST_LOOKUP_SIZE as usize], tree: [0; MAX_HUFF_TREE_SIZE], } @@ -46,7 +44,7 @@ impl HuffmanTable { /// Get the symbol and the code length from the huffman tree. #[inline] - fn tree_lookup(&self, fast_symbol: i32, bit_buf: BitBuffer, mut code_len: u32) -> (i32, u32) { + fn tree_lookup(&self, fast_symbol: i32, bit_buf: BitBuffer, mut code_len: u8) -> (i32, u32) { let mut symbol = fast_symbol; // We step through the tree until we encounter a positive value, which indicates a // symbol. @@ -54,17 +52,21 @@ impl HuffmanTable { // symbol here indicates the position of the left (0) node, if the next bit is 1 // we add 1 to the lookup position to get the right node. let tree_index = (!symbol + ((bit_buf >> code_len) & 1) as i32) as usize; + + // Use get here to avoid generating panic code. + // The init_tree code should prevent this from actually going out of bounds + // but if there were somehow a bug with that + // we would at worst end up with corrupted output in release mode.
             debug_assert!(tree_index < self.tree.len());
-            if tree_index >= self.tree.len() {
-                break;
-            }
-            symbol = i32::from(self.tree[tree_index]);
+            symbol = i32::from(self.tree.get(tree_index).copied().unwrap_or(i16::MAX));
             code_len += 1;
             if symbol >= 0 {
                 break;
             }
         }
-        (symbol, code_len)
+        // Note: Using a u8 for code_len inside this function seems to improve performance, but
+        // changing it in LocalVars seems to worsen things, so we convert it to a u32 here.
+        (symbol, u32::from(code_len))
     }
 
     #[inline]
@@ -75,18 +77,14 @@ impl HuffmanTable {
     ///
     /// It's possible we could avoid checking for 0 if we can guarantee a sane table.
     /// TODO: Check if a smaller type for code_len helps performance.
-    fn lookup(&self, bit_buf: BitBuffer) -> Option<(i32, u32)> {
+    fn lookup(&self, bit_buf: BitBuffer) -> (i32, u32) {
         let symbol = self.fast_lookup(bit_buf).into();
         if symbol >= 0 {
-            if (symbol >> 9) as u32 != 0 {
-                Some((symbol, (symbol >> 9) as u32))
-            } else {
-                // Zero-length code.
-                None
-            }
+            let length = (symbol >> 9) as u32;
+            (symbol, length)
         } else {
             // We didn't get a symbol from the fast lookup table, so check the tree instead.
-            Some(self.tree_lookup(symbol, bit_buf, FAST_LOOKUP_BITS.into()))
+            self.tree_lookup(symbol, bit_buf, FAST_LOOKUP_BITS)
         }
     }
 }
@@ -98,7 +96,7 @@
 const MAX_HUFF_SYMBOLS_0: usize = 288;
 /// The length of the second (distance) huffman table.
 const MAX_HUFF_SYMBOLS_1: usize = 32;
 /// The length of the last (huffman code length) huffman table.
-const _MAX_HUFF_SYMBOLS_2: usize = 19;
+const MAX_HUFF_SYMBOLS_2: usize = 19;
 /// The maximum length of a code that can be looked up in the fast lookup table.
 const FAST_LOOKUP_BITS: u8 = 10;
 /// The size of the fast lookup table.
@@ -164,8 +162,16 @@
 type BitBuffer = u64;
 
 #[cfg(not(target_pointer_width = "64"))]
 type BitBuffer = u32;
 
+/*
+enum HuffmanTableType {
+    LiteralLength = 0,
+    Dist = 1,
+    Huffman = 2,
+}*/
+
 /// Main decompression struct.
 ///
+#[derive(Clone)]
 pub struct DecompressorOxide {
     /// Current state of the decompressor.
     state: core::State,
@@ -178,9 +184,11 @@ pub struct DecompressorOxide {
     /// Adler32 checksum from the zlib header.
     z_adler32: u32,
     /// 1 if the current block is the last block, 0 otherwise.
-    finish: u32,
+    finish: u8,
     /// The type of the current block,
-    block_type: u32,
+    /// or, if in a dynamic block, which huffman table we are currently
+    /// initializing.
+    block_type: u8,
     /// 1 if the adler32 value should be checked.
     check_adler32: u32,
     /// Last match distance.
@@ -188,13 +196,16 @@ pub struct DecompressorOxide {
     /// Variable used for match length, symbols, and a number of other things.
     counter: u32,
     /// Number of extra bits for the last length or distance code.
-    num_extra: u32,
+    num_extra: u8,
     /// Number of entries in each huffman table.
-    table_sizes: [u32; MAX_HUFF_TABLES],
+    table_sizes: [u16; MAX_HUFF_TABLES],
     /// Buffer of input data.
    bit_buf: BitBuffer,
     /// Huffman tables.
     tables: [HuffmanTable; MAX_HUFF_TABLES],
+    code_size_literal: [u8; MAX_HUFF_SYMBOLS_0],
+    code_size_dist: [u8; MAX_HUFF_SYMBOLS_1],
+    code_size_huffman: [u8; MAX_HUFF_SYMBOLS_2],
     /// Raw block header.
     raw_header: [u8; 4],
     /// Huffman length codes.
@@ -234,6 +245,21 @@ impl DecompressorOxide {
             None
         }
     }
+
+    // Get zlib header for tests.
+    // Only for tests for now; we may provide a proper function for this later.
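For reference, the fast table entries touched above pack a code length and a symbol into one value, which is why callers mask with `511` and shift by `9`, and why negative entries (tree indices) are handled separately. A small sketch of that assumed packing:

```rust
// Sketch of the (length << 9) | symbol packing used by the fast lookup
// table. Symbols fit in 9 bits (max 287 < 512); the length sits above.
// Negative entries instead encode a tree index, hence the `symbol >= 0`
// checks in the decoder.
fn pack(length: u8, symbol: u16) -> i16 {
    debug_assert!(symbol < 512);
    ((u16::from(length) << 9) | symbol) as i16
}

fn unpack(entry: i16) -> (u8, u16) {
    let e = entry as u16;
    ((e >> 9) as u8, e & 511)
}

fn main() {
    let entry = pack(8, 286);
    assert_eq!(unpack(entry), (8, 286));
}
```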
+    #[cfg(all(test, feature = "with-alloc"))]
+    pub(crate) const fn zlib_header(&self) -> (u32, u32) {
+        (self.z_header0, self.z_header1)
+    }
+
+    /*fn code_size_table(&mut self, table_num: u8) -> &mut [u8] {
+        match table_num {
+            0 => &mut self.code_size_literal,
+            1 => &mut self.code_size_dist,
+            _ => &mut self.code_size_huffman,
+        }
+    }*/
 }
 
 impl Default for DecompressorOxide {
@@ -260,6 +286,9 @@ impl Default for DecompressorOxide {
                 HuffmanTable::new(),
                 HuffmanTable::new(),
             ],
+            code_size_literal: [0; MAX_HUFF_SYMBOLS_0],
+            code_size_dist: [0; MAX_HUFF_SYMBOLS_1],
+            code_size_huffman: [0; MAX_HUFF_SYMBOLS_2],
             raw_header: [0; 4],
             len_codes: [0; MAX_HUFF_SYMBOLS_0 + MAX_HUFF_SYMBOLS_1 + 137],
         }
@@ -307,7 +336,6 @@ enum State {
     BadCodeSizeDistPrevLookup,
     InvalidLitlen,
     InvalidDist,
-    InvalidCodeLen,
 }
 
 impl State {
@@ -335,7 +363,6 @@
 use self::State::*;
 
-// Not sure why miniz uses 32-bit values for these, maybe alignment/cache again?
 // # Optimization
 // We add an extra value at the end and make the tables 32 elements long
 // so we can use a mask to avoid bounds checks.
@@ -360,56 +387,37 @@ const LENGTH_EXTRA: [u8; 32] = [
 /// Base length for each distance code.
 #[rustfmt::skip]
-const DIST_BASE: [u16; 32] = [
+const DIST_BASE: [u16; 30] = [
     1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
     257, 385, 513, 769, 1025, 1537,
-    2049, 3073, 4097, 6145, 8193, 12_289, 16_385, 24_577, 32_768, 32_768
+    2049, 3073, 4097, 6145, 8193, 12_289, 16_385, 24_577
 ];
 
-/// Number of extra bits for each distance code.
-#[rustfmt::skip]
-const DIST_EXTRA: [u8; 32] = [
-    0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
-    7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 13, 13
-];
+/// Get the number of extra bits used for a distance code.
+/// (Code numbers above `NUM_DISTANCE_CODES` will give some garbage
+/// value.)
+#[inline(always)]
+const fn num_extra_bits_for_distance_code(code: u8) -> u8 {
+    // TODO: Need to verify that this is faster on all platforms.
+    // This can be easily calculated without a lookup.
+    let c = code >> 1;
+    c.saturating_sub(1)
+}
 
 /// The mask used when indexing the base/extra arrays.
 const BASE_EXTRA_MASK: usize = 32 - 1;
 
-/// Sets the value of all the elements of the slice to `val`.
-#[inline]
-fn memset<T: Copy>(slice: &mut [T], val: T) {
-    for x in slice {
-        *x = val
-    }
-}
 
 /// Read an le u16 value from the slice iterator.
 ///
 /// # Panics
 /// Panics if there are less than two bytes left.
 #[inline]
-fn read_u16_le(iter: &mut slice::Iter<u8>) -> u16 {
+fn read_u16_le(iter: &mut InputWrapper) -> u16 {
     let ret = {
-        let two_bytes = iter.as_ref()[..2].try_into().unwrap();
+        let two_bytes = iter.as_slice()[..2].try_into().unwrap_or_default();
         u16::from_le_bytes(two_bytes)
     };
-    iter.nth(1);
-    ret
-}
-
-/// Read an le u32 value from the slice iterator.
-///
-/// # Panics
-/// Panics if there are less than four bytes left.
-#[inline(always)]
-#[cfg(target_pointer_width = "64")]
-fn read_u32_le(iter: &mut slice::Iter<u8>) -> u32 {
-    let ret = {
-        let four_bytes: [u8; 4] = iter.as_ref()[..4].try_into().unwrap();
-        u32::from_le_bytes(four_bytes)
-    };
-    iter.nth(3);
+    iter.advance(2);
     ret
 }
 
@@ -420,10 +428,10 @@
 /// This function assumes that there is at least 4 bytes left in the input buffer.
 #[inline(always)]
 #[cfg(target_pointer_width = "64")]
-fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut slice::Iter<u8>) {
+fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut InputWrapper) {
     // Read four bytes into the buffer at once.
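The closed form in `num_extra_bits_for_distance_code` works because, apart from the first four codes, each pair of deflate distance codes adds one extra bit. A hedged sketch of how a distance is reassembled from its code and extra bits (only the first `DIST_BASE` entries are copied here; the helper names are illustrative, not the module's API):

```rust
// Sketch: rebuild a deflate distance from its code and extra bits.
// DIST_BASE copies the first few entries of the table above.
const DIST_BASE: [u16; 8] = [1, 2, 3, 4, 5, 7, 9, 13];

fn extra_bits(code: u8) -> u8 {
    // Same closed form as num_extra_bits_for_distance_code above.
    (code >> 1).saturating_sub(1)
}

fn distance(code: u8, extra: u16) -> u16 {
    DIST_BASE[code as usize] + extra
}

fn main() {
    // Code 6 has base 9 and 2 extra bits, covering distances 9..=12.
    assert_eq!(extra_bits(6), 2);
    assert_eq!(distance(6, 3), 12);
}
```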
     if l.num_bits < 30 {
-        l.bit_buf |= BitBuffer::from(read_u32_le(in_iter)) << l.num_bits;
+        l.bit_buf |= BitBuffer::from(in_iter.read_u32_le()) << l.num_bits;
         l.num_bits += 32;
     }
 }
 
@@ -432,7 +440,7 @@
 /// Ensures at least 16 bits are present, requires at least 2 bytes in the in buffer.
 #[inline(always)]
 #[cfg(not(target_pointer_width = "64"))]
-fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut slice::Iter<u8>) {
+fn fill_bit_buffer(l: &mut LocalVars, in_iter: &mut InputWrapper) {
     // If the buffer is 32-bit wide, read 2 bytes instead.
     if l.num_bits < 15 {
         l.bit_buf |= BitBuffer::from(read_u16_le(in_iter)) << l.num_bits;
@@ -488,7 +496,7 @@ fn decode_huffman_code<F>(
     r: &mut DecompressorOxide,
     l: &mut LocalVars,
     table: usize,
     flags: u32,
-    in_iter: &mut slice::Iter<u8>,
+    in_iter: &mut InputWrapper,
     f: F,
 ) -> Action
 where
@@ -498,7 +506,7 @@
     // ready in the bit buffer to start decoding the next huffman code.
     if l.num_bits < 15 {
         // First, make sure there is enough data in the bit buffer to decode a huffman code.
-        if in_iter.len() < 2 {
+        if in_iter.bytes_left() < 2 {
             // If there is less than 2 bytes left in the input buffer, we try to look up
             // the huffman code with what's available, and return if that doesn't succeed.
             // Original explanation in miniz:
@@ -513,9 +521,9 @@
         // /* bit buffer contains >=15 bits (deflate's max. Huffman code size). */
         loop {
             let mut temp = i32::from(r.tables[table].fast_lookup(l.bit_buf));
-
             if temp >= 0 {
                 let code_len = (temp >> 9) as u32;
+                // TODO: Is there any point to check for code_len != 0 here still?
                 if (code_len != 0) && (l.num_bits >= code_len) {
                     break;
                 }
@@ -578,15 +586,11 @@
         // Mask out the length value.
         symbol &= 511;
     } else {
-        let res = r.tables[table].tree_lookup(symbol, l.bit_buf, u32::from(FAST_LOOKUP_BITS));
+        let res = r.tables[table].tree_lookup(symbol, l.bit_buf, FAST_LOOKUP_BITS);
         symbol = res.0;
         code_len = res.1;
     };
 
-    if code_len == 0 {
-        return Action::Jump(InvalidCodeLen);
-    }
-
     l.bit_buf >>= code_len;
     l.num_bits -= code_len;
     f(r, l, symbol)
@@ -596,13 +600,13 @@
 /// returning the result.
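The `bit_buf`/`num_bits` invariants maintained by `fill_bit_buffer` follow the usual LSB-first deflate convention: bytes are shifted in above the bits already held, and bits are consumed from the bottom. A minimal sketch of a reader of this kind (hypothetical `BitReader`, not the module's API):

```rust
// Minimal LSB-first bit reader of the kind fill_bit_buffer maintains.
struct BitReader<'a> {
    input: &'a [u8],
    bit_buf: u64,
    num_bits: u32,
}

impl<'a> BitReader<'a> {
    fn fill(&mut self) {
        // Shift whole bytes in above the bits we already hold.
        while self.num_bits <= 56 {
            match self.input.split_first() {
                Some((&b, rest)) => {
                    self.bit_buf |= u64::from(b) << self.num_bits;
                    self.num_bits += 8;
                    self.input = rest;
                }
                None => break,
            }
        }
    }

    fn read(&mut self, n: u32) -> u64 {
        self.fill();
        let bits = self.bit_buf & ((1u64 << n) - 1);
        self.bit_buf >>= n;
        self.num_bits -= n;
        bits
    }
}

fn main() {
    let mut r = BitReader { input: &[0b0000_0101], bit_buf: 0, num_bits: 0 };
    assert_eq!(r.read(3), 0b101); // the low bits come out first
}
```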
 /// If reading fails, `Action::End` is returned.
 #[inline]
-fn read_byte<F>(in_iter: &mut slice::Iter<u8>, flags: u32, f: F) -> Action
+fn read_byte<F>(in_iter: &mut InputWrapper, flags: u32, f: F) -> Action
 where
     F: FnOnce(u8) -> Action,
 {
-    match in_iter.next() {
+    match in_iter.read_byte() {
         None => end_of_input(flags),
-        Some(&byte) => f(byte),
+        Some(byte) => f(byte),
     }
 }
 
@@ -615,7 +619,7 @@ fn read_bits<F>(
     l: &mut LocalVars,
     amount: u32,
-    in_iter: &mut slice::Iter<u8>,
+    in_iter: &mut InputWrapper,
     flags: u32,
     f: F,
 ) -> Action
@@ -644,7 +648,7 @@
 }
 
 #[inline]
-fn pad_to_bytes<F>(l: &mut LocalVars, in_iter: &mut slice::Iter<u8>, flags: u32, f: F) -> Action
+fn pad_to_bytes<F>(l: &mut LocalVars, in_iter: &mut InputWrapper, flags: u32, f: F) -> Action
 where
     F: FnOnce(&mut LocalVars) -> Action,
 {
@@ -671,23 +675,36 @@ fn undo_bytes(l: &mut LocalVars, max: u32) -> u32 {
 fn start_static_table(r: &mut DecompressorOxide) {
     r.table_sizes[LITLEN_TABLE] = 288;
     r.table_sizes[DIST_TABLE] = 32;
-    memset(&mut r.tables[LITLEN_TABLE].code_size[0..144], 8);
-    memset(&mut r.tables[LITLEN_TABLE].code_size[144..256], 9);
-    memset(&mut r.tables[LITLEN_TABLE].code_size[256..280], 7);
-    memset(&mut r.tables[LITLEN_TABLE].code_size[280..288], 8);
-    memset(&mut r.tables[DIST_TABLE].code_size[0..32], 5);
+    r.code_size_literal[0..144].fill(8);
+    r.code_size_literal[144..256].fill(9);
+    r.code_size_literal[256..280].fill(7);
+    r.code_size_literal[280..288].fill(8);
+    r.code_size_dist[0..32].fill(5);
 }
 
-#[cfg(feature = "rustc-dep-of-std")]
+#[cfg(any(
+    feature = "rustc-dep-of-std",
+    target_arch = "aarch64",
+    target_arch = "arm64ec",
+    target_arch = "loongarch64"
+))]
 fn reverse_bits(n: u32) -> u32 {
     // Lookup is not used when building as part of std to avoid wasting space
     // for lookup table in every rust binary
     // as it's only used for backtraces in the cold path
     // - see #152
+
+    // armv7 and newer, and loongarch have a CPU instruction for bit reversal, so
+    // it's preferable to just use that on those architectures.
     n.reverse_bits()
 }
 
-#[cfg(not(feature = "rustc-dep-of-std"))]
+#[cfg(not(any(
+    feature = "rustc-dep-of-std",
    target_arch = "aarch64",
    target_arch = "arm64ec",
    target_arch = "loongarch64"
)))]
 fn reverse_bits(n: u32) -> u32 {
     static REVERSED_BITS_LOOKUP: [u32; 512] = {
         let mut table = [0; 512];
@@ -700,27 +717,45 @@
         table
     };
-
     REVERSED_BITS_LOOKUP[n as usize]
 }
 
 fn init_tree(r: &mut DecompressorOxide, l: &mut LocalVars) -> Option<Action> {
     loop {
         let bt = r.block_type as usize;
-        if bt >= r.tables.len() {
-            return None;
-        }
+
+        let code_sizes = match bt {
+            LITLEN_TABLE => &mut r.code_size_literal[..],
+            DIST_TABLE => &mut r.code_size_dist,
+            HUFFLEN_TABLE => &mut r.code_size_huffman,
+            _ => return None,
+        };
         let table = &mut r.tables[bt];
+
+        let mut total_symbols = [0u16; 16];
+        let mut next_code = [0u32; 17];
+        const INVALID_CODE: i16 = 1 << 9 | 286;
+        // Set the values in the fast table to return a
+        // non-zero length and an invalid symbol instead of zero,
+        // so that we do not have to have a check for a zero
+        // code length in the hot code path later
+        // and can instead error out on the invalid symbol check
+        // on bogus input.
+        table.look_up.fill(INVALID_CODE);
+        // If we are initializing the huffman code length table we can skip
+        // this, since these codes can't be longer than 3 bits
+        // and thus only use the fast table, so the tree won't be accessed and
+        // there is no point clearing it.
+        // TODO: Avoid creating this table at all.
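On why `reverse_bits` is needed at all: deflate defines Huffman codes MSB-first, but the stream is read LSB-first, so a code of length `len` must be bit-reversed before it can index the LSB-first lookup table. A standalone sketch (hypothetical helper name):

```rust
// Sketch: bit-reverse a Huffman code of the given length so it can be
// used to index an LSB-first lookup table.
fn reverse_code(code: u32, len: u32) -> u32 {
    code.reverse_bits() >> (32 - len)
}

fn main() {
    // The 3-bit code 0b110 arrives in the stream as 0b011.
    assert_eq!(reverse_code(0b110, 3), 0b011);
}
```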
+        if bt != HUFFLEN_TABLE {
+            table.tree.fill(0);
+        }
+
         let table_size = r.table_sizes[bt] as usize;
-        if table_size > table.code_size.len() {
+        if table_size > code_sizes.len() {
             return None;
         }
 
-        let mut total_symbols = [0u32; 16];
-        let mut next_code = [0u32; 17];
-        memset(&mut table.look_up[..], 0);
-        memset(&mut table.tree[..], 0);
-
-        for &code_size in &table.code_size[..table_size] {
+        for &code_size in &code_sizes[..table_size] {
             let cs = code_size as usize;
             if cs >= total_symbols.len() {
                 return None;
             }
@@ -729,26 +764,35 @@
         }
 
         let mut used_symbols = 0;
-        let mut total = 0;
-        for (ts, next) in total_symbols
-            .iter()
-            .copied()
-            .zip(next_code.iter_mut().skip(1))
-            .skip(1)
-        {
+        let mut total = 0u32;
+        // Count up the total number of used lengths and check that the table is not
+        // under- or over-subscribed.
+        for (&ts, next) in total_symbols.iter().zip(next_code[1..].iter_mut()).skip(1) {
             used_symbols += ts;
-            total += ts;
+            total += u32::from(ts);
             total <<= 1;
             *next = total;
         }
 
-        if total != 65_536 && used_symbols > 1 {
+        //
+        // While it's not explicitly stated in the spec, a hufflen table
+        // with a single length (or none) would be invalid, as there needs to be
+        // at minimum a length for both a non-zero length huffman code for the end of block symbol
+        // and one of the codes to represent 0 to make sense - so just reject that here as well.
+        //
+        // The distance table is allowed to have a single distance code, though according to the
+        // spec it is supposed to be accompanied by a second dummy code. It can also be empty,
+        // indicating no used codes.
+        //
+        // The literal/length table cannot be empty as there has to be an end of block symbol.
+        // The standard doesn't specify that there should be a dummy code in case of a single
+        // symbol (i.e. an empty block). Normally that's not an issue, though the code will have
+        // to take that into account later on in case of malformed input.
+        if total != 65_536 && (used_symbols > 1 || bt == HUFFLEN_TABLE) {
             return Some(Action::Jump(BadTotalSymbols));
         }
 
         let mut tree_next = -1;
         for symbol_index in 0..table_size {
-            let code_size = table.code_size[symbol_index];
+            let code_size = code_sizes[symbol_index];
             if code_size == 0 || usize::from(code_size) >= next_code.len() {
                 continue;
             }
@@ -779,7 +823,7 @@
             }
 
             let mut tree_cur = table.look_up[(rev_code & (FAST_LOOKUP_SIZE - 1)) as usize];
-            if tree_cur == 0 {
+            if tree_cur == INVALID_CODE {
                 table.look_up[(rev_code & (FAST_LOOKUP_SIZE - 1)) as usize] = tree_next;
                 tree_cur = tree_next;
                 tree_next -= 2;
             }
@@ -811,18 +855,19 @@
             table.tree[tree_index] = symbol_index as i16;
         }
 
-        if r.block_type == 2 {
+        if r.block_type == HUFFLEN_TABLE as u8 {
             l.counter = 0;
             return Some(Action::Jump(ReadLitlenDistTablesCodeSize));
         }
 
-        if r.block_type == 0 {
+        if r.block_type == LITLEN_TABLE as u8 {
             break;
         }
         r.block_type -= 1;
     }
 
     l.counter = 0;
+
     Some(Action::Jump(DecodeLitlen))
 }
 
@@ -851,7 +896,7 @@ struct LocalVars {
     pub num_bits: u32,
     pub dist: u32,
     pub counter: u32,
-    pub num_extra: u32,
+    pub num_extra: u8,
 }
 
 #[inline]
@@ -955,6 +1000,9 @@ fn apply_match(
         transfer(out_slice, source_pos, out_pos, match_len, out_buf_size_mask);
     } else if match_len <= dist && source_pos + match_len < out_slice.len() {
         // Destination and source segments does not intersect and source does not wrap.
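The `next_code` loop above is the canonical-code construction from RFC 1951 §3.2.2: count the codes of each length, then derive the first code of each length. A standalone sketch (assumes code lengths are at most 15, as in deflate; `assign_codes` is an illustrative helper, not the crate's API):

```rust
// RFC 1951 §3.2.2 canonical Huffman code assignment.
fn assign_codes(lengths: &[u8]) -> Vec<u32> {
    // Count how many codes there are of each length (lengths <= 15 assumed).
    let mut count = [0u32; 16];
    for &l in lengths {
        count[l as usize] += 1;
    }
    count[0] = 0;

    // Compute the first (smallest) code of each length.
    let mut next_code = [0u32; 16];
    let mut code = 0;
    for len in 1..16 {
        code = (code + count[len - 1]) << 1;
        next_code[len] = code;
    }

    // Hand out consecutive codes within each length, in symbol order.
    lengths
        .iter()
        .map(|&l| {
            if l == 0 {
                0
            } else {
                let c = next_code[l as usize];
                next_code[l as usize] += 1;
                c
            }
        })
        .collect()
}

fn main() {
    // The worked example from the RFC: lengths (3,3,3,3,3,2,4,4)
    // yield codes 010,011,100,101,110,00,1110,1111.
    let codes = assign_codes(&[3, 3, 3, 3, 3, 2, 4, 4]);
    assert_eq!(
        codes,
        vec![0b010, 0b011, 0b100, 0b101, 0b110, 0b00, 0b1110, 0b1111]
    );
}
```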
+        // TODO: An invalid match wrapping around before the start of the data reached here
+        // before it was fixed (it wrapped around and ended up overlapping again) - we need
+        // to check that we are not wrapping here.
         if source_pos < out_pos {
             let (from_slice, to_slice) = out_slice.split_at_mut(out_pos);
             to_slice[..match_len].copy_from_slice(&from_slice[source_pos..source_pos + match_len]);
@@ -978,7 +1026,7 @@
 /// and already improves decompression speed a fair bit.
 fn decompress_fast(
     r: &mut DecompressorOxide,
-    in_iter: &mut slice::Iter<u8>,
+    in_iter: &mut InputWrapper,
     out_buf: &mut OutputBuffer,
     flags: u32,
     local_vars: &mut LocalVars,
@@ -998,50 +1046,42 @@
             // + 29 + 32 (left in bit buf, including last 13 dist extra) = 111 bits < 14 bytes
             // We need the one extra byte as we may write one length and one full match
             // before checking again.
-            if out_buf.bytes_left() < 259 || in_iter.len() < 14 {
+            if out_buf.bytes_left() < 259 || in_iter.bytes_left() < 14 {
                 state = State::DecodeLitlen;
                 break 'o TINFLStatus::Done;
             }
 
             fill_bit_buffer(&mut l, in_iter);
 
-            if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) {
-                l.counter = symbol as u32;
+            let (symbol, code_len) = r.tables[LITLEN_TABLE].lookup(l.bit_buf);
+            l.counter = symbol as u32;
+            l.bit_buf >>= code_len;
+            l.num_bits -= code_len;
+
+            if (l.counter & 256) != 0 {
+                // The symbol is not a literal.
+                break;
+            } else {
+                // If we have a 32-bit buffer we need to read another two bytes now
+                // to have enough bits to keep going.
+                if cfg!(not(target_pointer_width = "64")) {
+                    fill_bit_buffer(&mut l, in_iter);
+                }
+
+                let (symbol, code_len) = r.tables[LITLEN_TABLE].lookup(l.bit_buf);
                 l.bit_buf >>= code_len;
                 l.num_bits -= code_len;
-
-                if (l.counter & 256) != 0 {
-                    // The symbol is not a literal.
+                // The previous symbol was a literal, so write it directly and check
+                // the next one.
+                out_buf.write_byte(l.counter as u8);
+                if (symbol & 256) != 0 {
+                    l.counter = symbol as u32;
+                    // The symbol is a length value.
                     break;
                 } else {
-                    // If we have a 32-bit buffer we need to read another two bytes now
-                    // to have enough bits to keep going.
-                    if cfg!(not(target_pointer_width = "64")) {
-                        fill_bit_buffer(&mut l, in_iter);
-                    }
-
-                    if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) {
-                        l.bit_buf >>= code_len;
-                        l.num_bits -= code_len;
-                        // The previous symbol was a literal, so write it directly and check
-                        // the next one.
-                        out_buf.write_byte(l.counter as u8);
-                        if (symbol & 256) != 0 {
-                            l.counter = symbol as u32;
-                            // The symbol is a length value.
-                            break;
-                        } else {
-                            // The symbol is a literal, so write it directly and continue.
-                            out_buf.write_byte(symbol as u8);
-                        }
-                    } else {
-                        state.begin(InvalidCodeLen);
-                        break 'o TINFLStatus::Failed;
-                    }
+                    // The symbol is a literal, so write it directly and continue.
+                    out_buf.write_byte(symbol as u8);
                 }
-            } else {
-                state.begin(InvalidCodeLen);
-                break 'o TINFLStatus::Failed;
             }
         }
 
@@ -1060,18 +1100,19 @@
         // The symbol was a length code.
         // # Optimization
         // Mask the value to avoid bounds checks
-        // We could use get_unchecked later if can statically verify that
-        // this will never go out of bounds.
-        l.num_extra = u32::from(LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK]);
+        // While the maximum is checked, the compiler isn't able to know that the
+        // value won't wrap around here.
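The wrapping concern in the TODO above exists because an LZ77 back-reference may overlap its own output (distance shorter than length), which a plain `copy_from_slice` cannot express. A minimal sketch of the copy semantics `apply_match` implements (simplified to a growable buffer, no wrapping dictionary):

```rust
// Sketch: LZ77 match copy. When dist < len the source overlaps the
// destination, so bytes must be copied one at a time - this is how
// run-length-style matches work in deflate.
fn apply_match(out: &mut Vec<u8>, dist: usize, len: usize) {
    assert!(dist >= 1 && dist <= out.len());
    let start = out.len() - dist;
    for i in 0..len {
        let b = out[start + i];
        out.push(b);
    }
}

fn main() {
    let mut out = b"ab".to_vec();
    apply_match(&mut out, 2, 6); // overlapping match repeats "ab"
    assert_eq!(out, b"abababab");
}
```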
+        l.num_extra = LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK];
         l.counter = u32::from(LENGTH_BASE[(l.counter - 257) as usize & BASE_EXTRA_MASK]);
         // Length and distance codes have a number of extra bits depending on
         // the base, which together with the base gives us the exact value.
+        // We need to make sure we have at least 33 bits (so min. 5 bytes) in the buffer at this spot.
         fill_bit_buffer(&mut l, in_iter);
         if l.num_extra != 0 {
             let extra_bits = l.bit_buf & ((1 << l.num_extra) - 1);
             l.bit_buf >>= l.num_extra;
-            l.num_bits -= l.num_extra;
+            l.num_bits -= u32::from(l.num_extra);
             l.counter += extra_bits as u32;
         }
@@ -1081,33 +1122,30 @@
             fill_bit_buffer(&mut l, in_iter);
         }
 
-        if let Some((mut symbol, code_len)) = r.tables[DIST_TABLE].lookup(l.bit_buf) {
-            symbol &= 511;
-            l.bit_buf >>= code_len;
-            l.num_bits -= code_len;
-            if symbol > 29 {
-                state.begin(InvalidDist);
-                break 'o TINFLStatus::Failed;
-            }
-
-            l.num_extra = u32::from(DIST_EXTRA[symbol as usize]);
-            l.dist = u32::from(DIST_BASE[symbol as usize]);
-        } else {
-            state.begin(InvalidCodeLen);
+        let (mut symbol, code_len) = r.tables[DIST_TABLE].lookup(l.bit_buf);
+        symbol &= 511;
+        l.bit_buf >>= code_len;
+        l.num_bits -= code_len;
+        if symbol > 29 {
+            state.begin(InvalidDist);
             break 'o TINFLStatus::Failed;
         }
 
+        l.num_extra = num_extra_bits_for_distance_code(symbol as u8);
+        l.dist = u32::from(DIST_BASE[symbol as usize]);
+
         if l.num_extra != 0 {
             fill_bit_buffer(&mut l, in_iter);
             let extra_bits = l.bit_buf & ((1 << l.num_extra) - 1);
             l.bit_buf >>= l.num_extra;
-            l.num_bits -= l.num_extra;
+            l.num_bits -= u32::from(l.num_extra);
             l.dist += extra_bits as u32;
         }
 
         let position = out_buf.position();
-        if l.dist as usize > out_buf.position()
-            && (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0)
+        if (l.dist as usize > out_buf.position()
+            && (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0))
+            || (l.dist as usize > out_buf.get_ref().len())
         {
             // We encountered a distance that refers a position before
             // the start of the decoded data, so we can't continue.
@@ -1147,18 +1185,18 @@
 ///
 /// * The offset given by `out_pos` indicates where in the output buffer slice writing should start.
 /// * If [`TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF`] is not set, the output buffer is used in a
-/// wrapping manner, and it's size is required to be a power of 2.
+///   wrapping manner, and its size is required to be a power of 2.
 /// * The decompression function normally needs access to 32KiB of the previously decompressed data
-///(or to the beginning of the decompressed data if less than 32KiB has been decompressed.)
+///   (or to the beginning of the decompressed data if less than 32KiB has been decompressed.)
 ///   - If this data is not available, decompression may fail.
 ///   - Some deflate compressors allow specifying a window size which limits match distances to
-/// less than this, or alternatively an RLE mode where matches will only refer to the previous byte
-/// and thus allows a smaller output buffer. The window size can be specified in the zlib
-/// header structure, however, the header data should not be relied on to be correct.
+///     less than this, or alternatively an RLE mode where matches will only refer to the previous byte
+///     and thus allows a smaller output buffer. The window size can be specified in the zlib
+///     header structure; however, the header data should not be relied on to be correct.
 ///
 /// `flags` indicates settings and status to the decompression function.
 /// * The [`TINFL_FLAG_HAS_MORE_INPUT`] has to be specified if more compressed data is to be provided
-/// in a subsequent call to this function.
+///   in a subsequent call to this function.
 /// * See the [`inflate_flags`] module for details on other flags.
 ///
 /// # Returns
@@ -1175,7 +1213,7 @@ pub fn decompress(
     flags: u32,
 ) -> (TINFLStatus, usize, usize) {
     let out_buf_size_mask = if flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0 {
-        usize::max_value()
+        usize::MAX
     } else {
         // In the case of zero len, any attempt to write would produce HasMoreOutput,
         // so to gracefully process the case of there really being no output,
@@ -1191,7 +1229,7 @@
         return (TINFLStatus::BadParam, 0, 0);
     }
 
-    let mut in_iter = in_buf.iter();
+    let mut in_iter = InputWrapper::from_slice(in_buf);
 
     let mut state = r.state;
 
@@ -1242,8 +1280,8 @@
             // Read the block header and jump to the relevant section depending on the block type.
             ReadBlockHeader => generate_state!(state, 'state_machine, {
                 read_bits(&mut l, 3, &mut in_iter, flags, |l, bits| {
-                    r.finish = (bits & 1) as u32;
-                    r.block_type = (bits >> 1) as u32 & 3;
+                    r.finish = (bits & 1) as u8;
+                    r.block_type = ((bits >> 1) & 3) as u8;
                     match r.block_type {
                         0 => Action::Jump(BlockTypeNoCompression),
                         1 => {
@@ -1348,20 +1386,20 @@
             }),
 
             RawMemcpy2 => generate_state!(state, 'state_machine, {
-                if in_iter.len() > 0 {
+                if in_iter.bytes_left() > 0 {
                     // Copy as many raw bytes as possible from the input to the output using memcpy.
                     // Raw block lengths are limited to 64 * 1024, so casting through usize and u32
                     // is not an issue.
                     let space_left = out_buf.bytes_left();
                     let bytes_to_copy = cmp::min(cmp::min(
                         space_left,
-                        in_iter.len()),
+                        in_iter.bytes_left()),
                         l.counter as usize
                     );
 
                     out_buf.write_slice(&in_iter.as_slice()[..bytes_to_copy]);
 
-                    in_iter.nth(bytes_to_copy - 1);
+                    in_iter.advance(bytes_to_copy);
                     l.counter -= bytes_to_copy as u32;
                     Action::Jump(RawMemcpy1)
                 } else {
@@ -1375,12 +1413,12 @@
                     let num_bits = [5, 5, 4][l.counter as usize];
                     read_bits(&mut l, num_bits, &mut in_iter, flags, |l, bits| {
                         r.table_sizes[l.counter as usize] =
-                            bits as u32 + u32::from(MIN_TABLE_SIZES[l.counter as usize]);
+                            bits as u16 + MIN_TABLE_SIZES[l.counter as usize];
                         l.counter += 1;
                         Action::None
                     })
                 } else {
-                    memset(&mut r.tables[HUFFLEN_TABLE].code_size[..], 0);
+                    r.code_size_huffman.fill(0);
                     l.counter = 0;
                     // Check that the litlen and distance are within spec.
                     // litlen table should be <=286 acc to the RFC and
@@ -1400,25 +1438,24 @@
             // Read the 3-bit lengths of the huffman codes describing the huffman code lengths used
             // to decode the lengths of the main tables.
             ReadHufflenTableCodeSize => generate_state!(state, 'state_machine, {
-                if l.counter < r.table_sizes[HUFFLEN_TABLE] {
+                if l.counter < r.table_sizes[HUFFLEN_TABLE].into() {
                     read_bits(&mut l, 3, &mut in_iter, flags, |l, bits| {
                         // These lengths are not stored in a normal ascending order, but rather one
                         // specified by the deflate specification intended to put the most used
                         // values at the front, as trailing zero lengths do not have to be stored.
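That storage order is fixed by RFC 1951 §3.2.7. A sketch of how the permutation is undone when reading (the constant matches the spec; `unpermute` is an illustrative helper, not the crate's API):

```rust
// RFC 1951 §3.2.7: the order in which the 3-bit code lengths for the
// code-length alphabet are stored in the stream.
const HUFFMAN_LENGTH_ORDER: [u8; 19] = [
    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
];

// Place lengths read from the stream into their natural symbol positions.
// Assumes at most 19 stored lengths.
fn unpermute(stored: &[u8]) -> [u8; 19] {
    let mut out = [0u8; 19];
    for (i, &len) in stored.iter().enumerate() {
        out[HUFFMAN_LENGTH_ORDER[i] as usize] = len;
    }
    out
}

fn main() {
    // Trailing entries may be omitted from the stream; they stay zero.
    let lens = unpermute(&[4, 0, 6]);
    assert_eq!(lens[16], 4);
    assert_eq!(lens[18], 6);
}
```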
-                        r.tables[HUFFLEN_TABLE]
-                            .code_size[HUFFMAN_LENGTH_ORDER[l.counter as usize] as usize] =
+                        r.code_size_huffman[HUFFMAN_LENGTH_ORDER[l.counter as usize] as usize] =
                             bits as u8;
                         l.counter += 1;
                         Action::None
                     })
                 } else {
-                    r.table_sizes[HUFFLEN_TABLE] = 19;
+                    r.table_sizes[HUFFLEN_TABLE] = MAX_HUFF_SYMBOLS_2 as u16;
                     init_tree(r, &mut l).unwrap_or(Action::End(TINFLStatus::Failed))
                 }
             }),
 
             ReadLitlenDistTablesCodeSize => generate_state!(state, 'state_machine, {
-                if l.counter < r.table_sizes[LITLEN_TABLE] + r.table_sizes[DIST_TABLE] {
+                if l.counter < u32::from(r.table_sizes[LITLEN_TABLE]) + u32::from(r.table_sizes[DIST_TABLE]) {
                     decode_huffman_code(
                         r, &mut l, HUFFLEN_TABLE, flags, &mut in_iter,
                         |r, l, symbol| {
                             l.dist = symbol as u32;
                             if l.dist < 16 {
                                 r.len_codes[l.counter as usize] = l.dist as u8;
                                 l.counter += 1;
                                 Action::None
                             } else if l.dist == 16 && l.counter == 0 {
                                 Action::Jump(BadCodeSizeDistPrevLookup)
                             } else {
                                 l.num_extra = [2, 3, 7][l.dist as usize - 16];
                                 Action::Jump(ReadExtraBitsCodeSize)
                             }
                         }
                     )
-                } else if l.counter != r.table_sizes[LITLEN_TABLE] + r.table_sizes[DIST_TABLE] {
+                } else if l.counter != u32::from(r.table_sizes[LITLEN_TABLE]) + u32::from(r.table_sizes[DIST_TABLE]) {
                     Action::Jump(BadCodeSizeSum)
                 } else {
-                    r.tables[LITLEN_TABLE].code_size[..r.table_sizes[LITLEN_TABLE] as usize]
+                    r.code_size_literal[..r.table_sizes[LITLEN_TABLE] as usize]
                         .copy_from_slice(&r.len_codes[..r.table_sizes[LITLEN_TABLE] as usize]);
 
                     let dist_table_start = r.table_sizes[LITLEN_TABLE] as usize;
                     let dist_table_end =
                         (r.table_sizes[LITLEN_TABLE] + r.table_sizes[DIST_TABLE]) as usize;
-                    r.tables[DIST_TABLE].code_size[..r.table_sizes[DIST_TABLE] as usize]
+                    r.code_size_dist[..r.table_sizes[DIST_TABLE] as usize]
                        .copy_from_slice(&r.len_codes[dist_table_start..dist_table_end]);
 
                     r.block_type -= 1;
@@ -1453,7 +1490,7 @@
             }),
 
             ReadExtraBitsCodeSize => generate_state!(state, 'state_machine, {
-                let num_extra = l.num_extra;
+                let num_extra = l.num_extra.into();
                 read_bits(&mut l, num_extra, &mut in_iter, flags, |l, mut extra_bits| {
                     // Mask to avoid a bounds check.
                     extra_bits += [3, 3, 11][(l.dist as usize - 16) & 3];
                     let val = if l.dist == 16 {
                         r.len_codes[l.counter as usize - 1]
                     } else {
                         0
                     };
 
-                    memset(
-                        &mut r.len_codes[
+                    r.len_codes[
                         l.counter as usize..l.counter as usize + extra_bits as usize
-                        ],
-                        val,
-                    );
+                    ].fill(val);
                     l.counter += extra_bits as u32;
                     Action::Jump(ReadLitlenDistTablesCodeSize)
                 })
             }),
 
             DecodeLitlen => generate_state!(state, 'state_machine, {
-                if in_iter.len() < 4 || out_buf.bytes_left() < 2 {
+                if in_iter.bytes_left() < 4 || out_buf.bytes_left() < 2 {
                     // See if we can decode a literal with the data we have left.
                     // Jumps to next state (WriteSymbol) if successful.
                     decode_huffman_code(
@@ -1493,7 +1527,7 @@
                 // If there is enough space, use the fast inner decompression
                 // function.
                     out_buf.bytes_left() >= 259 &&
-                    in_iter.len() >= 14
+                    in_iter.bytes_left() >= 14
                 {
                     let (status, new_state) = decompress_fast(
                         r,
                         &mut in_iter,
                         &mut out_buf,
                         flags,
                         &mut l,
                         out_buf_size_mask,
                     );
 
                     state = new_state;
                     if status == TINFLStatus::Done {
                         Action::Jump(new_state)
                     } else {
                         break 'state_machine Action::End(status);
                     }
                 } else {
                     fill_bit_buffer(&mut l, &mut in_iter);
 
-                    if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) {
+                    let (symbol, code_len) = r.tables[LITLEN_TABLE].lookup(l.bit_buf);
 
                     l.counter = symbol as u32;
                     l.bit_buf >>= code_len;
@@ -1529,7 +1563,7 @@
                             fill_bit_buffer(&mut l, &mut in_iter);
                         }
 
-                        if let Some((symbol, code_len)) = r.tables[LITLEN_TABLE].lookup(l.bit_buf) {
+                        let (symbol, code_len) = r.tables[LITLEN_TABLE].lookup(l.bit_buf);
 
                         l.bit_buf >>= code_len;
                         l.num_bits -= code_len;
@@ -1545,13 +1579,9 @@
                             out_buf.write_byte(symbol as u8);
                             Action::None
                         }
-                        } else {
-                            Action::Jump(InvalidCodeLen)
-                        }
-                    }
-                    } else {
-                        Action::Jump(InvalidCodeLen)
+                    }
                     }
             }),
 
@@ -1584,7 +1614,7 @@
                 // We could use get_unchecked later if can statically verify that
                 // this will never go out of bounds.
                 l.num_extra =
-                    u32::from(LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK]);
+                    LENGTH_EXTRA[(l.counter - 257) as usize & BASE_EXTRA_MASK];
                 l.counter = u32::from(LENGTH_BASE[(l.counter - 257) as usize & BASE_EXTRA_MASK]);
                 // Length and distance codes have a number of extra bits depending on
                 // the base, which together with the base gives us the exact value.
@@ -1597,7 +1627,7 @@
             }),
 
             ReadExtraBitsLitlen => generate_state!(state, 'state_machine, {
-                let num_extra = l.num_extra;
+                let num_extra = l.num_extra.into();
                 read_bits(&mut l, num_extra, &mut in_iter, flags, |l, extra_bits| {
                     l.counter += extra_bits as u32;
                     Action::Jump(DecodeDistance)
                 })
             }),
 
             DecodeDistance => generate_state!(state, 'state_machine, {
                 // Try to read a huffman code from the input buffer and look up what
                 // length code the decoded symbol refers to.
                 decode_huffman_code(r, &mut l, DIST_TABLE, flags, &mut in_iter, |_r, l, symbol| {
+                    // # Optimization - transform the value into usize here before the check so
+                    // the compiler can optimize the bounds check later - ideally it should
+                    // know that the value can't be negative from earlier in the
+                    // decode_huffman_code function, but it seems it may not be able
+                    // to make the assumption that it can't be negative and thus
+                    // overflow if it's converted after the check.
+                    let symbol = symbol as usize;
                     if symbol > 29 {
                         // Invalid distance code.
                         return Action::Jump(InvalidDist)
                     }
-                    // # Optimization
-                    // Mask the value to avoid bounds checks
-                    // We could use get_unchecked later if can statically verify that
-                    // this will never go out of bounds.
-                    l.num_extra = u32::from(DIST_EXTRA[symbol as usize & BASE_EXTRA_MASK]);
-                    l.dist = u32::from(DIST_BASE[symbol as usize & BASE_EXTRA_MASK]);
+                    l.num_extra = num_extra_bits_for_distance_code(symbol as u8);
+                    l.dist = u32::from(DIST_BASE[symbol]);
                     if l.num_extra != 0 {
                         // ReadExtraBitsDistance
                         Action::Jump(ReadExtraBitsDistance)
                     } else {
                         Action::Jump(HuffDecodeOuterLoop2)
                     }
                 })
             }),
 
            ReadExtraBitsDistance => generate_state!(state, 'state_machine, {
-                let num_extra = l.num_extra;
+                let num_extra = l.num_extra.into();
                 read_bits(&mut l, num_extra, &mut in_iter, flags, |l, extra_bits| {
                     l.dist += extra_bits as u32;
                     Action::Jump(HuffDecodeOuterLoop2)
                 })
             }),
 
             HuffDecodeOuterLoop2 => generate_state!(state, 'state_machine, {
-                if l.dist as usize > out_buf.position() &&
-                    (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0)
+                if (l.dist as usize > out_buf.position() &&
+                    (flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF != 0))
+                    || (l.dist as usize > out_buf.get_ref().len())
                 {
                     // We encountered a distance that refers a position before
                     // the start of the decoded data, so we can't continue.
@@ -1704,9 +1737,9 @@
             if r.finish != 0 {
                 pad_to_bytes(&mut l, &mut in_iter, flags, |_| Action::None);
 
-                let in_consumed = in_buf.len() - in_iter.len();
+                let in_consumed = in_buf.len() - in_iter.bytes_left();
                 let undo = undo_bytes(&mut l, in_consumed as u32) as usize;
-                in_iter = in_buf[in_consumed - undo..].iter();
+                in_iter = InputWrapper::from_slice(in_buf[in_consumed - undo..].iter().as_slice());
 
                 l.bit_buf &= ((1 as BitBuffer) << l.num_bits) - 1;
                 debug_assert_eq!(l.num_bits, 0);
@@ -1759,7 +1792,7 @@
     let in_undo = if status != TINFLStatus::NeedsMoreInput
         && status != TINFLStatus::FailedCannotMakeProgress
     {
-        undo_bytes(&mut l, (in_buf.len() - in_iter.len()) as u32) as usize
+        undo_bytes(&mut l, (in_buf.len() - in_iter.bytes_left()) as u32) as usize
     } else {
         0
     };
@@ -1810,7 +1843,7 @@
     (
         status,
-        in_buf.len() - in_iter.len() - in_undo,
+        in_buf.len() - in_iter.bytes_left() - in_undo,
         out_buf.position() - out_pos,
     )
 }
@@ -1891,7 +1924,7 @@ mod test {
     }
 
     fn masked_lookup(table: &HuffmanTable, bit_buf: BitBuffer) -> (i32, u32) {
-        let ret = table.lookup(bit_buf).unwrap();
+        let ret = table.lookup(bit_buf);
         (ret.0 & 511, ret.1)
     }
@@ -2049,4 +2082,49 @@
         let res = decompress(&mut r, &encoded, &mut output_buf, 0, flags);
         assert_eq!(res, (TINFLStatus::HasMoreOutput, 2, 0));
     }
+
+    #[test]
+    fn dist_extra_bits() {
+        use self::num_extra_bits_for_distance_code;
+        // Number of extra bits for each distance code.
+        const DIST_EXTRA: [u8; 29] = [
+            0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+            12, 13,
+        ];
+
+        for (i, &dist) in DIST_EXTRA.iter().enumerate() {
+            assert_eq!(dist, num_extra_bits_for_distance_code(i as u8));
+        }
+    }
+
+    #[test]
+    fn check_tree() {
+        let mut r = DecompressorOxide::new();
+        let mut l = LocalVars {
+            bit_buf: 0,
+            num_bits: 0,
+            dist: 0,
+            counter: 0,
+            num_extra: 0,
+        };
+
+        r.code_size_huffman[0] = 1;
+        r.code_size_huffman[1] = 1;
+        //r.code_size_huffman[2] = 3;
+        //r.code_size_huffman[3] = 3;
+        //r.code_size_huffman[1] = 4;
+        r.block_type = HUFFLEN_TABLE as u8;
+        r.table_sizes[HUFFLEN_TABLE] = 4;
+        let res = init_tree(&mut r, &mut l).unwrap();
+
+        let status = match res {
+            Action::Jump(s) => s,
+            _ => {
+                //println!("issue");
+                return;
+            }
+        };
+        //println!("status {:?}", status);
+        assert!(status != BadTotalSymbols);
+    }
 }
diff --git a/miniz_oxide/src/inflate/mod.rs b/miniz_oxide/src/inflate/mod.rs
index 3f787e72..cbf41ee7 100644
--- a/miniz_oxide/src/inflate/mod.rs
+++ b/miniz_oxide/src/inflate/mod.rs
@@ -2,7 +2,6 @@
 #[cfg(feature = "with-alloc")]
 use crate::alloc::{boxed::Box, vec, vec::Vec};
-use ::core::usize;
 #[cfg(all(feature = "std", feature = "with-alloc"))]
 use std::error::Error;
@@ -123,7 +122,7 @@ fn decompress_error(status: TINFLStatus, output: Vec<u8>) -> Result<Vec<u8>, DecompressError> {
 #[inline]
 #[cfg(feature = "with-alloc")]
 pub fn decompress_to_vec(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
-    decompress_to_vec_inner(input, 0, usize::max_value())
+    decompress_to_vec_inner(input, 0, usize::MAX)
 }
 
 /// Decompress the deflate-encoded data (with a zlib wrapper) in `input` to a vector.
@@ -139,7 +138,7 @@ pub fn decompress_to_vec_zlib(input: &[u8]) -> Result<Vec<u8>, DecompressError> {
     decompress_to_vec_inner(
         input,
         inflate_flags::TINFL_FLAG_PARSE_ZLIB_HEADER,
-        usize::max_value(),
+        usize::MAX,
     )
 }
diff --git a/miniz_oxide/src/inflate/output_buffer.rs b/miniz_oxide/src/inflate/output_buffer.rs
index 5218a807..ce0ccd61 100644
--- a/miniz_oxide/src/inflate/output_buffer.rs
+++ b/miniz_oxide/src/inflate/output_buffer.rs
@@ -14,12 +14,12 @@ impl<'a> OutputBuffer<'a> {
         OutputBuffer { slice, position }
     }
 
-    #[inline]
+    #[inline(always)]
     pub const fn position(&self) -> usize {
         self.position
     }
 
-    #[inline]
+    #[inline(always)]
     pub fn set_position(&mut self, position: usize) {
         self.position = position;
     }
@@ -48,13 +48,64 @@
         self.slice.len() - self.position
     }
 
-    #[inline]
+    #[inline(always)]
     pub const fn get_ref(&self) -> &[u8] {
         self.slice
     }
 
-    #[inline]
+    #[inline(always)]
     pub fn get_mut(&mut self) -> &mut [u8] {
         self.slice
     }
 }
+
+/// A wrapper for the input slice used when decompressing.
+///
+/// Using this rather than `Cursor` lets us implement the reading methods directly on
+/// the buffer and lets us use a usize rather than u64 for the position, which helps with
+/// performance on 32-bit systems.
+#[derive(Copy, Clone)]
+pub struct InputWrapper<'a> {
+    slice: &'a [u8],
+}
+
+impl<'a> InputWrapper<'a> {
+    #[inline(always)]
+    pub const fn as_slice(&self) -> &[u8] {
+        self.slice
+    }
+
+    #[inline(always)]
+    pub const fn from_slice(slice: &'a [u8]) -> InputWrapper<'a> {
+        InputWrapper { slice }
+    }
+
+    #[inline(always)]
+    pub fn advance(&mut self, steps: usize) {
+        self.slice = &self.slice[steps..];
+    }
+
+    #[inline]
+    pub fn read_byte(&mut self) -> Option<u8> {
+        self.slice.first().map(|n| {
+            self.advance(1);
+            *n
+        })
+    }
+
+    #[inline]
+    #[cfg(target_pointer_width = "64")]
+    pub fn read_u32_le(&mut self) -> u32 {
+        let ret = {
+            let four_bytes: [u8; 4] = self.slice[..4].try_into().unwrap_or_default();
+            u32::from_le_bytes(four_bytes)
+        };
+        self.advance(4);
+        ret
+    }
+
+    #[inline(always)]
+    pub const fn bytes_left(&self) -> usize {
+        self.slice.len()
+    }
+}
diff --git a/miniz_oxide/src/inflate/stream.rs b/miniz_oxide/src/inflate/stream.rs
index 5463ab0f..39b41e1c 100644
--- a/miniz_oxide/src/inflate/stream.rs
+++ b/miniz_oxide/src/inflate/stream.rs
@@ -57,6 +57,7 @@ impl ResetPolicy for FullReset {
 /// A struct that combines a decompressor with extra data for streaming decompression.
 ///
+#[derive(Clone)]
 pub struct InflateState {
     /// Inner decompressor struct
     decomp: DecompressorOxide,
@@ -226,6 +227,9 @@ pub fn inflate(
     if (flush == MZFlush::Finish) && first_call {
         decomp_flags |= inflate_flags::TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
 
+        // The caller is indicating that they want to finish the decompression and this is the
+        // first call with the current stream, so we can simply write directly to the output buffer.
+        // If there is not enough space for all of the decompressed data we will end up with a failure regardless.
         let status = decompress(&mut state.decomp, next_in, next_out, 0, decomp_flags);
         let in_bytes = status.1;
         let out_bytes = status.2;
@@ -420,4 +424,78 @@
         // Should still have the checksum read from the header file.
         assert_eq!(state.decompressor().adler32_header(), Some(459605011))
     }
+
+    #[test]
+    fn test_partial_continue() {
+        let encoded = [
+            120u8, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4,
+            19,
+        ];
+
+        // Feed input bytes one at a time to the decompressor
+        let mut out = vec![0; 50];
+        let mut state = InflateState::new_boxed(DataFormat::Zlib);
+        let mut part_in = 0;
+        let mut part_out = 0;
+        for i in 1..=encoded.len() {
+            let res = inflate(
+                &mut state,
+                &encoded[part_in..i],
+                &mut out[part_out..],
+                MZFlush::None,
+            );
+            let status = res.status.expect("Failed to decompress!");
+            if i == encoded.len() {
+                assert_eq!(status, MZStatus::StreamEnd);
+            } else {
+                assert_eq!(status, MZStatus::Ok);
+            }
+            part_out += res.bytes_written as usize;
+            part_in += res.bytes_consumed;
+        }
+
+        assert_eq!(out[..part_out as usize], b"Hello, zlib!"[..]);
+        assert_eq!(part_in, encoded.len());
+        assert_eq!(state.decompressor().adler32(), Some(459605011));
+    }
+
+    // Inflate part of a stream and clone the inflate state.
+    // Discard the original state and resume the stream from the clone.
+    #[test]
+    fn test_rewind_and_resume() {
+        let encoded = [
+            120u8, 156, 243, 72, 205, 201, 201, 215, 81, 168, 202, 201, 76, 82, 4, 0, 27, 101, 4,
+            19,
+        ];
+        let decoded = b"Hello, zlib!";
+
+        // Feed partial input bytes to the decompressor
+        let mut out = vec![0; 50];
+        let mut state = InflateState::new_boxed(DataFormat::Zlib);
+        let res1 = inflate(&mut state, &encoded[..10], &mut out, MZFlush::None);
+        let status = res1.status.expect("Failed to decompress!");
+        assert_eq!(status, MZStatus::Ok);
+
+        // Clone the state and discard the original
+        let mut resume = state.clone();
+        drop(state);
+
+        // Resume the stream using the cloned state
+        let res2 = inflate(
+            &mut resume,
+            &encoded[res1.bytes_consumed..],
+            &mut out[res1.bytes_written..],
+            MZFlush::Finish,
+        );
+        let status = res2.status.expect("Failed to decompress!");
+        assert_eq!(status, MZStatus::StreamEnd);
+
+        assert_eq!(res1.bytes_consumed + res2.bytes_consumed, encoded.len());
+        assert_eq!(res1.bytes_written + res2.bytes_written, decoded.len());
+        assert_eq!(
+            &out[..res1.bytes_written + res2.bytes_written as usize],
+            decoded
+        );
+        assert_eq!(resume.decompressor().adler32(), Some(459605011));
+    }
 }
diff --git a/miniz_oxide/tests/test.rs b/miniz_oxide/tests/test.rs
index dcb93874..cd30a341 100644
--- a/miniz_oxide/tests/test.rs
+++ b/miniz_oxide/tests/test.rs
@@ -250,6 +250,88 @@ fn issue_143_return_buf_error_on_finish_without_end_header() {
     assert_eq!(inflate_result.status.unwrap_err(), MZError::Buf)
 }
 
+#[test]
+fn decompress_empty_dynamic() {
+    // Empty block with dynamic huffman codes.
+    let enc = vec![5, 192, 129, 8, 0, 0, 0, 0, 32, 127, 235, 0b011, 0, 0, 0];
+
+    let res = decompress_to_vec(enc.as_slice()).unwrap();
+    assert!(res.is_empty());
+
+    let enc = vec![5, 192, 129, 8, 0, 0, 0, 0, 32, 127, 235, 0b1111011, 0, 0, 0];
+
+    let res = decompress_to_vec(enc.as_slice());
+    assert!(res.is_err());
+}
+
+fn decode_hex(s: &str) -> Vec<u8> {
+    (0..s.len())
+        .step_by(2)
+        .map(|i| u8::from_str_radix(&s[i..i + 2], 16).unwrap())
+        .collect::<Vec<u8>>()
+}
+
+#[test]
+fn issue_161_index_out_of_range_apply_match() {
+    // This data contains a match with a distance before the start of the data,
+    // which resulted in an edge case causing a panic instead of returning with an
+    // error when using a smaller wrapping buffer.
+    let content_hex = "fa99fff4f37fef5bbff9bb6ccb9ab4e47f66d9875cebf9ffe6eb6fbdf6e24b773f72ebe5175f62ff26bf78eec57bafdd78ee6b5f7efeee2b2f5b1d2bfe5100";
+    let content = decode_hex(&content_hex);
+
+    let mut decompressor = miniz_oxide::inflate::core::DecompressorOxide::new();
+
+    let mut buf2 = vec![0; 2048];
+    let _ = miniz_oxide::inflate::core::decompress(&mut decompressor, &content, &mut buf2, 0, 0);
+}
+
+#[test]
+fn empty_stored() {
+    // Compress empty input using the stored compression level.
+    // There was a logic error causing this to output zeroes
+    // from the empty data buffer instead of outputting an empty stored block.
+    let data = vec![];
+    let enc = compress_to_vec_zlib(&data, 0);
+    let _ = decompress_to_vec_zlib(&enc).unwrap();
+}
+
+/*
+#[test]
+fn partial_decompression_imap_issue_158() {
+    use miniz_oxide::inflate::stream::{inflate, InflateState};
+    use miniz_oxide::{DataFormat, MZFlush};
+    use std::string;
+
+    // Decompresses to
+    // "* QUOTAROOT INBOX \"User quota\"\r\n* QUOTA \"User quota\" (STORAGE 76 307200)\r\nA0001 OK Getquotaroot completed (0.001 + 0.000 secs).\r\n"
+    let input = vec![
+        210, 82, 8, 12, 245, 15, 113, 12, 242, 247, 15, 81, 240, 244, 115, 242, 143, 80, 80, 10,
+        45, 78, 45, 82, 40, 44, 205, 47, 73, 84, 226, 229, 210, 130, 200, 163, 136, 42, 104, 4,
+        135, 248, 7, 57, 186, 187, 42, 152, 155, 41, 24, 27, 152, 27, 25, 24, 104, 242, 114, 57,
+        26, 24, 24, 24, 42, 248, 123, 43, 184, 167, 150, 128, 213, 21, 229, 231, 151, 40, 36, 231,
+        231, 22, 228, 164, 150, 164, 166, 40, 104, 24, 232, 129, 20, 104, 43, 128, 104, 3, 133,
+        226, 212, 228, 98, 77, 61, 94, 46, 0, 0, 0, 0, 255, 255,
+    ];
+
+    let mut inflate_stream = InflateState::new(DataFormat::Raw);
+    let mut output = vec![0; 8];
+    let result = inflate(&mut inflate_stream, &input, &mut output, MZFlush::None);
+
+    let out_string: String = string::String::from_utf8(output).unwrap();
+
+    println!("{}", out_string);
+    println!("written {}", result.bytes_written);
+
+    assert!(result.status.is_ok());
+    // Should not consume everything, there is not enough space in the buffer for the output.
+    assert!(
+        result.bytes_consumed < input.len(),
+        "bytes consumed {:?}, input.len() {}",
+        result.bytes_consumed,
+        input.len()
+    )
+}*/
+
 /*
 #[test]
 fn large_file() {
diff --git a/src/c_export.rs b/src/c_export.rs
index 97e3be21..1bee6999 100644
--- a/src/c_export.rs
+++ b/src/c_export.rs
@@ -253,7 +253,6 @@ impl<'io, ST: StateType> StreamOxide<'io, ST> {
     }
 }
 
-#[cfg(not(no_c_export))]
 unmangle!(
     /// Default allocation function using `malloc`.
     pub unsafe extern "C" fn miniz_def_alloc_func(
diff --git a/src/lib.rs b/src/lib.rs
index 438fddb7..d71d0330 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -297,13 +297,13 @@ unmangle!(
     }
 );
 
-#[cfg(target_bit_width = "64")]
+#[cfg(target_pointer_width = "64")]
 #[inline]
 fn buffer_too_large(source_len: c_ulong, dest_len: c_ulong) -> bool {
     (source_len | dest_len) > 0xFFFFFFFF
 }
 
-#[cfg(not(target_bit_width = "64"))]
+#[cfg(not(target_pointer_width = "64"))]
 #[inline]
 fn buffer_too_large(_source_len: c_ulong, _dest_len: c_ulong) -> bool {
     false
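A note on the last hunk: `target_bit_width` is not a cfg key rustc defines, so the 64-bit overflow check was silently compiled out on every platform; `target_pointer_width` is the key rustc actually sets. A hedged sketch of the intended gating (standalone, for illustration only):

```rust
// `target_pointer_width` is defined by rustc for every target; the old
// `target_bit_width` matched nothing, so its branch was never built.
#[cfg(target_pointer_width = "64")]
fn pointer_width() -> u32 {
    64
}

#[cfg(not(target_pointer_width = "64"))]
fn pointer_width() -> u32 {
    32
}

fn main() {
    println!("compiled for {}-bit pointers", pointer_width());
}
```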