Skip to content

Support for lz4 compression #163 #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ jobs:
steps:
- name: Install optional tools macOS
if: runner.os == 'macOS' && matrix.optional-deps
run: brew install pigz pbzip2 isa-l zstd
run: brew install pigz pbzip2 isa-l zstd lz4
- name: Install optional tools Linux
if: runner.os == 'Linux' && matrix.optional-deps
run: sudo apt-get install pigz pbzip2 isal zstd
run: sudo apt-get install pigz pbzip2 isal zstd lz4
- name: Remove xz
if: runner.os == 'Linux' && !matrix.optional-deps
run: while which xz; do sudo rm $(which xz); done
Expand Down
10 changes: 7 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Supported compression formats are:
- gzip (``.gz``)
- bzip2 (``.bz2``)
- xz (``.xz``)
- lz4 (``.lz4``)
- Zstandard (``.zst``) (optional)


Expand Down Expand Up @@ -71,7 +72,7 @@ The function opens the file using a function suitable for the detected
file format and returns an open file-like object.

When writing, the file format is chosen based on the file name extension:
``.gz``, ``.bz2``, ``.xz``, ``.zst``. This can be overriden with ``format``.
``.gz``, ``.bz2``, ``.xz``, ``.zst``, ``.lz4``. This can be overridden with ``format``.
If the extension is not recognized, no compression is used.

When reading and a file name extension is available, the format is detected
Expand Down Expand Up @@ -99,13 +100,13 @@ preferred locale encoding.
**compresslevel**:
The compression level for writing to gzip, xz, Zstandard and lz4 files.
If set to None, a default depending on the format is used:
gzip: 1, xz: 6, Zstandard: 3.
gzip: 1, xz: 6, Zstandard: 3, lz4: 0.

This parameter is ignored for other compression formats.

**format**:
Override the autodetection of the input or output format.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``.
Possible values are: ``"gz"``, ``"xz"``, ``"bz2"``, ``"zst"``, ``"lz4"``.

**threads**:
Set the number of additional threads spawned for compression or decompression.
Expand Down Expand Up @@ -138,6 +139,9 @@ built-in support for multithreaded compression.

For bz2 files, `pbzip2 (parallel bzip2) <http://compression.great-site.net/pbzip2/>`_ is used.

For lz4 files, the `python lz4 <https://python-lz4.readthedocs.io/en/stable/index.html>`_
package is used.

``xopen`` falls back to Python’s built-in functions
(``gzip.open``, ``lzma.open``, ``bz2.open``)
if none of the other methods can be used.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ requires-python = ">=3.9"
dynamic = ["version"]
dependencies = [
'isal>=1.6.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"'
'zlib-ng>=0.4.1; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"',
'lz4>4.3.1; platform_python_implementation != "PyPy"',
]

[project.urls]
Expand Down
62 changes: 56 additions & 6 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
XOPEN_DEFAULT_BZ2_COMPRESSION = 9
XOPEN_DEFAULT_XZ_COMPRESSION = 6
XOPEN_DEFAULT_ZST_COMPRESSION = 3
XOPEN_DEFAULT_LZ4_COMPRESSION = 0

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
Expand Down Expand Up @@ -70,6 +71,11 @@
except ImportError:
zstandard = None # type: ignore

try:
import lz4.frame # type: ignore
except ImportError:
lz4 = None

try:
import fcntl

Expand Down Expand Up @@ -120,6 +126,7 @@ class _ProgramSettings:
"zstd": _ProgramSettings(("zstd",), tuple(range(1, 20)), "-T"),
"pigz": _ProgramSettings(("pigz", "--no-name"), tuple(range(0, 10)) + (11,), "-p"),
"gzip": _ProgramSettings(("gzip", "--no-name"), tuple(range(1, 10))),
"lz4": _ProgramSettings(("lz4",), tuple(range(0, 17))),
}


Expand Down Expand Up @@ -551,6 +558,42 @@ def _open_zst(
return io.BufferedWriter(f) # mode "ab" and "wb"


def _open_lz4(
    filename: FileOrPath,
    mode: str,
    compresslevel: Optional[int],
    threads: Optional[int],
):
    """
    Open an lz4-compressed file in binary mode ("rb", "ab" or "wb").

    If the python-lz4 bindings are importable, they are used for reading and,
    when threads == 0, also for writing and appending. In all other cases the
    external ``lz4`` program is run through _PipedCompressionProgram. If
    running that program raises OSError, the bindings are used as a fallback
    when they are available; otherwise the OSError is re-raised.

    compresslevel set to None selects XOPEN_DEFAULT_LZ4_COMPRESSION.
    """
    assert mode in ("rb", "ab", "wb")
    if compresslevel is None:
        compresslevel = XOPEN_DEFAULT_LZ4_COMPRESSION

    prefer_bindings = lz4 is not None and (mode == "rb" or threads == 0)
    if not prefer_bindings:
        # use CLI program
        try:
            return _PipedCompressionProgram(
                filename,
                mode,
                compresslevel,
                threads,
                program_settings=_PROGRAM_SETTINGS["lz4"],
            )
        except OSError:
            # Do not touch the shared _PROGRAM_SETTINGS entry here: it is
            # module-level state, and retrying the same command would fail
            # the same way.
            if lz4 is None:
                # No fallback available
                raise

    # use Python bindings
    return lz4.frame.LZ4FrameFile(filename, mode, compression_level=compresslevel)


def _open_gz(
filename: FileOrPath,
mode: str,
Expand Down Expand Up @@ -683,6 +726,10 @@ def _detect_format_from_content(filename: FileOrPath) -> Optional[str]:
elif bs[:4] == b"\x28\xb5\x2f\xfd":
# https://datatracker.ietf.org/doc/html/rfc8478#section-3.1.1
return "zst"
elif bs[:4] == b"\x04\x22\x4d\x18":
# https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
return "lz4"

return None
finally:
if closefd:
Expand All @@ -694,7 +741,7 @@ def _detect_format_from_extension(filename: Union[str, bytes]) -> Optional[str]:
Attempt to detect file format from the filename extension.
Return None if no format could be detected.
"""
for ext in ("bz2", "xz", "gz", "zst"):
for ext in ("bz2", "xz", "gz", "zst", "lz4"):
if isinstance(filename, bytes):
if filename.endswith(b"." + ext.encode()):
return ext
Expand All @@ -717,7 +764,7 @@ def _file_or_path_to_binary_stream(
# object is not binary, this will crash at a later point.
return file_or_path, False # type: ignore
raise TypeError(
f"Unsupported type for {file_or_path}, " f"{file_or_path.__class__.__name__}."
f"Unsupported type for {file_or_path}, {file_or_path.__class__.__name__}."
)


Expand Down Expand Up @@ -797,6 +844,7 @@ def xopen( # noqa: C901
- .bz2 uses bzip2 compression
- .xz uses xz/lzma compression
- .zst uses zstandard compression
- .lz4 uses lz4 compression
- otherwise, no compression is used

When reading, if a file name extension is available, the format is detected
Expand All @@ -808,7 +856,7 @@ def xopen( # noqa: C901
compresslevel is the compression level for writing to gzip, xz, zst and lz4 files.
This parameter is ignored for the other compression formats.
If set to None, a default depending on the format is used:
gzip: 6, xz: 6, zstd: 3.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self: didn't we change the gzip level to 1?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we did.

gzip: 1, xz: 6, zstd: 3, lz4: 0.

When threads is None (the default), compressed file formats are read or written
using a pipe to a subprocess running an external tool such as,
Expand All @@ -828,7 +876,7 @@ def xopen( # noqa: C901

format overrides the autodetection of input and output formats. This can be
useful when compressed output needs to be written to a file without an
extension. Possible values are "gz", "xz", "bz2", "zst".
extension. Possible values are "gz", "xz", "bz2", "zst", "lz4".
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
Expand All @@ -844,10 +892,10 @@ def xopen( # noqa: C901
elif _file_is_a_socket_or_pipe(filename):
filename = open(filename, binary_mode) # type: ignore

if format not in (None, "gz", "xz", "bz2", "zst"):
if format not in (None, "gz", "xz", "bz2", "zst", "lz4"):
raise ValueError(
f"Format not supported: {format}. "
f"Choose one of: 'gz', 'xz', 'bz2', 'zst'"
f"Choose one of: 'gz', 'xz', 'bz2', 'zst', 'lz4'."
)
detected_format = format or _detect_format_from_extension(filepath)
if detected_format is None and "r" in mode:
Expand All @@ -861,6 +909,8 @@ def xopen( # noqa: C901
opened_file = _open_bz2(filename, binary_mode, compresslevel, threads)
elif detected_format == "zst":
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
elif detected_format == "lz4":
opened_file = _open_lz4(filename, binary_mode, compresslevel, threads)
else:
opened_file, _ = _file_or_path_to_binary_stream(filename, binary_mode)

Expand Down
Binary file added tests/file.txt.lz4
Binary file not shown.
10 changes: 9 additions & 1 deletion tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
_ProgramSettings,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
extensions = ["", ".gz", ".bz2", ".xz", ".zst", ".lz4"]

try:
import fcntl
Expand Down Expand Up @@ -57,16 +57,24 @@ def available_zstd_programs():
return []


def available_lz4_programs():
    """Return a one-element list with the lz4 program settings if an ``lz4``
    executable is found on PATH, otherwise an empty list."""
    lz4_executable = shutil.which("lz4")
    return [_PROGRAM_SETTINGS["lz4"]] if lz4_executable else []


# Program settings reported as available by the helper functions, per format.
PIPED_GZIP_PROGRAMS = available_gzip_programs()
PIPED_BZIP2_PROGRAMS = available_bzip2_programs()
PIPED_XZ_PROGRAMS = available_xz_programs()
PIPED_ZST_PROGRAMS = available_zstd_programs()
PIPED_LZ4_PROGRAMS = available_lz4_programs()

# Flat list of (program settings, file extension) pairs across all formats.
ALL_PROGRAMS_WITH_EXTENSION = [
    pair
    for programs, extension in (
        (PIPED_GZIP_PROGRAMS, ".gz"),
        (PIPED_BZIP2_PROGRAMS, ".bz2"),
        (PIPED_XZ_PROGRAMS, ".xz"),
        (PIPED_ZST_PROGRAMS, ".zst"),
        (PIPED_LZ4_PROGRAMS, ".lz4"),
    )
    for pair in zip(programs, cycle([extension]))
]


Expand Down
Loading
Loading