Skip to content

Commit ab6b745

Browse files
committed
Merge branch 'main' of https://github.com/rmoralespp/jsonl
2 parents 0339e37 + 375c07b commit ab6b745

File tree

5 files changed

+146
-17
lines changed

5 files changed

+146
-17
lines changed

jsonl.py

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,48 @@
4242
_logger = logging.getLogger(__name__)
4343
_logger.addHandler(logging.NullHandler())
4444

45+
ext_jsonl = ".jsonl"
46+
ext_gz = ".gz"
47+
ext_bz2 = ".bz2"
48+
ext_xz = ".xz"
49+
extensions = frozenset((ext_jsonl, ext_gz, ext_bz2, ext_xz))
50+
51+
52+
def _get_fileobj_extension(fileobj, /):
53+
"""Get the file extension based on the initial bytes of a file-like object."""
54+
55+
fd_position = fileobj.tell() # Save current position
56+
fileobj.seek(0) # Go to the start of the file
57+
bytes_ = fileobj.read(6) # Read enough bytes to detect compression
58+
fileobj.seek(fd_position) # Restore the original position
59+
60+
if bytes_[:2] == b"\x1f\x8b":
61+
# https://tools.ietf.org/html/rfc1952#page-6
62+
return ext_gz
63+
elif bytes_[:3] == b"\x42\x5a\x68":
64+
# https://en.wikipedia.org/wiki/List_of_file_signatures
65+
return ext_bz2
66+
elif bytes_[:6] == b"\xfd\x37\x7a\x58\x5a\x00":
67+
# https://tukaani.org/xz/xz-file-format.txt
68+
return ext_xz
69+
else:
70+
return None
71+
72+
73+
def _get_file_extension(name, mode, /, *, fileobj=None):
    """
    Get the file extension based on the filename or file-like object.

    The suffix of `name` is used when it is one of the known `extensions`.
    Otherwise, for read modes, the extension is detected from the content.

    :param name: Path of the file; its suffix is examined first.
    :param str mode: Mode the file is (or will be) opened with.
    :param fileobj: Optional, already opened file-like object; inspected only when `mode` is "rb".
    :return: A known extension, or None when it cannot be determined.
    :rtype: Optional[str]
    :raises OSError: If `name` has to be opened for detection and cannot be
        (e.g. `FileNotFoundError` when it does not exist).
    """

    extension = os.path.splitext(name)[1]
    if extension in extensions:
        return extension
    # Test identity, not truthiness: a file-like object may be falsy (e.g. one
    # defining `__len__` that is empty) and must still be preferred over
    # opening `name` from disk.
    if mode == "rb" and fileobj is not None:
        return _get_fileobj_extension(fileobj)
    if "r" in mode:  # if not fileobj, try to open the file and detect from content
        with open(name, "rb") as fd:
            return _get_fileobj_extension(fd)
    # Write/append modes: there is no existing content to inspect.
    return None
86+
4587

4688
def _looks_like_url(value, /):
4789
if isinstance(value, (str, urllib.request.Request)):
@@ -79,12 +121,12 @@ def _xopen(name, /, *, mode="rb", encoding=None):
79121

80122
default = open
81123
openers = {
82-
".jsonl": default,
83-
".gz": gzip.open,
84-
".bz2": bz2.open,
85-
".xz": lzma.open,
124+
ext_jsonl: default,
125+
ext_gz: gzip.open,
126+
ext_bz2: bz2.open,
127+
ext_xz: lzma.open,
86128
}
87-
extension = os.path.splitext(name)[1]
129+
extension = _get_file_extension(name, mode)
88130
opener = openers.get(extension, default)
89131
return opener(name, mode=mode, encoding=encoding or _get_encoding(mode))
90132

@@ -99,15 +141,15 @@ def _xfile(name, obj, /):
99141
:param obj: File-like an object.
100142
"""
101143

102-
if name.endswith(".gz"):
144+
ext = _get_file_extension(name, "rb", fileobj=obj)
145+
if ext == ext_gz:
103146
file = gzip.GzipFile(fileobj=obj)
104-
elif name.endswith(".bz2"):
147+
elif ext == ext_bz2:
105148
file = bz2.BZ2File(obj)
106-
elif name.endswith(".xz"):
149+
elif ext == ext_xz:
107150
file = lzma.LZMAFile(obj) # noqa: SIM115
108151
else:
109152
file = obj
110-
111153
try:
112154
yield file
113155
finally:
@@ -344,7 +386,7 @@ def load_archive(
344386
:param bool broken: If true, skip broken lines (only logging a warning).
345387
:param Optional[Callable] json_loads: Custom function to deserialize JSON strings. By default, `json.loads` is used.
346388
:param Unpack[dict] json_loads_kwargs: Additional keywords to pass to `loads` of `json` provider.
347-
:rtype: Generator[tuple[str, Generator[Any]]]
389+
:rtype: Iterator[tuple[str, Iterator[Any]]]
348390
"""
349391

350392
# If a URL or Request object is provided, download the archive first.

tests/conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import pytest
1414
import ujson
1515

16+
import jsonl
17+
1618
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
1719

1820

@@ -39,6 +41,8 @@ def log_message(self, fmt, *args): # pragma: no cover
3941

4042
server_thread = threading.Thread(target=server.serve_forever, name="http_server")
4143
server_thread.start()
44+
# Fix: warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg))
45+
threading.excepthook = lambda args: server.shutdown()
4246

4347
try:
4448
with server.socket:
@@ -64,7 +68,7 @@ def pathlike(request):
6468
return request.param
6569

6670

67-
@pytest.fixture(scope="package", params=(".jsonl", ".gz", ".bz2", ".xz", ".unknown"))
71+
@pytest.fixture(scope="package", params=jsonl.extensions)
6872
def file_extension(request):
6973
return request.param
7074

tests/test_load.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ def test_invalid_json_lines(broken):
2424
"[4]\n"
2525
"suffix\n" # bad JSON line
2626
)
27-
with contextlib.closing(io.StringIO(lines)) as iofile:
28-
result = jsonl.load(iofile, broken=broken)
27+
with contextlib.closing(io.StringIO(lines)) as fd:
28+
result = jsonl.load(fd, broken=broken)
2929
if broken:
3030
assert tuple(result) == ([1, 2], [3], [4])
3131
else:
@@ -34,8 +34,8 @@ def test_invalid_json_lines(broken):
3434

3535

3636
def test_invalid_utf8(broken):
37-
with contextlib.closing(io.BytesIO(b"\xff\xff\n[1, 2]")) as iofile:
38-
result = jsonl.load(iofile, broken=broken)
37+
with contextlib.closing(io.BytesIO(b"\xff\xff\n[1, 2]")) as fd:
38+
result = jsonl.load(fd, broken=broken)
3939
if broken:
4040
assert tuple(result) == ([1, 2],)
4141
else:
@@ -78,6 +78,17 @@ def test_filepath(filepath, json_loads, pathlike):
7878
assert result == expected
7979

8080

81+
def test_filepath_unknown_extension_but_detected_by_signature(filepath, json_loads):
    """Loading a file whose name ends in `.unknown` still yields the data written to it."""

    wanted = tuple(tests.data)
    # Write through the original name first (presumably compressed according to
    # its extension -- see `tests.write_text`), then hide that extension.
    tests.write_text(filepath, content=tests.string_data)
    renamed = filepath + ".unknown"
    os.rename(filepath, renamed)

    loaded = jsonl.load(renamed, json_loads=json_loads)
    assert tuple(loaded) == wanted
90+
91+
8192
@pytest.mark.parametrize("opener", (open, None))
8293
def test_filepath_using_opener(opener):
8394
expected = tuple(tests.data)

tests/test_load_archive.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22

33
import operator
4+
import os
45
import shutil
56

67
import pytest
@@ -14,9 +15,11 @@
1415
("file1", ["file1"]),
1516
("file*", ["file1", "file2"]),
1617
])
18+
@pytest.mark.parametrize("with_unknown_ext", [True, False])
1719
@pytest.mark.parametrize("archive_format", ["tar", "zip"])
18-
def test_load_archive(pattern, match_members, tmp_dir, archive_format, file_extension):
20+
def test_load_archive(pattern, match_members, tmp_dir, archive_format, file_extension, with_unknown_ext):
1921
pattern += file_extension
22+
2023
if archive_format == "tar":
2124
pattern = "./" + pattern # tar requires a leading slash for patterns
2225
match_members = [f"./{member}" for member in match_members]
@@ -31,13 +34,24 @@ def test_load_archive(pattern, match_members, tmp_dir, archive_format, file_exte
3134
content = tests.string_data.encode(jsonl._utf_8)
3235
fp.write(content)
3336

37+
if with_unknown_ext:
38+
# Rename files to have an unknown extension after writing valid data
39+
for member in members:
40+
new_member = member + ".unknown"
41+
os.rename(root_dir / member, root_dir / new_member)
42+
# Adjust pattern to match the new extension
43+
pattern += ".unknown"
44+
3445
archivepath = str(root_dir / "myarchive")
3546
archivepath = shutil.make_archive(archivepath, archive_format, root_dir=root_dir, base_dir=".")
3647

3748
order_by = operator.itemgetter(0)
3849
expected = sorted(((name, tests.data) for name in match_members), key=order_by)
3950
result = jsonl.load_archive(archivepath, pattern=pattern)
4051
result = sorted(((name, list(data)) for name, data in result), key=order_by)
52+
if with_unknown_ext:
53+
# Adjust expected names to have the .unknown extension
54+
expected = [(name + ".unknown", data) for name, data in expected]
4155
assert result == expected
4256

4357

tests/test_xfile.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
# -*- coding: utf-8 -*-
2+
23
import bz2
34
import gzip
45
import io
56
import lzma
67
import unittest.mock
78

9+
import pytest
10+
811
import jsonl
12+
import tests
913

1014

1115
def test_xfile_object(filepath):
12-
obj = unittest.mock.Mock()
16+
obj = unittest.mock.MagicMock() # subscribable object
1317
with jsonl._xfile(filepath, obj) as result:
1418
if filepath.endswith(".gz"):
1519
assert isinstance(result, gzip.GzipFile)
@@ -36,3 +40,57 @@ def test_xfile_close(filepath):
3640
assert not buffer.closed
3741

3842

43+
@pytest.mark.parametrize(
    "file_content, expected",
    [
        (b"\x1f\x8b\x08\x00\x00\x00", jsonl.ext_gz),  # Gzip signature
        (b"\x42\x5a\x68\x31\x31\x39", jsonl.ext_bz2),  # Bzip2 signature
        (b"\xfd\x37\x7a\x58\x5a\x00", jsonl.ext_xz),  # XZ signature
        (b"\x00\x00\x00\x00\x00\x00", None),  # No matching signature
        (b"", None),  # Empty file content
    ],
)
def test_get_fileobj_extension_ok(file_content, expected):
    """Each leading byte sequence maps to the expected extension (or to None)."""

    detected = jsonl._get_fileobj_extension(io.BytesIO(file_content))
    assert detected == expected
56+
57+
58+
def test_get_fileobj_extension_restores_file_pointer():
    """Detection must leave the stream at the position it had before the call."""

    file_content = b"\x1f\x8b\x08\x00\x00\x00"
    fd = io.BytesIO(file_content)
    # Start away from offset 0: otherwise "restored the original position" is
    # indistinguishable from "rewound to the start of the file".
    fd.seek(3)
    initial_position = fd.tell()
    # The signature is still read from the start of the stream, not from offset 3.
    assert jsonl._get_fileobj_extension(fd) == jsonl.ext_gz
    assert fd.tell() == initial_position
64+
65+
66+
@pytest.mark.parametrize(
    "name, mode, fileobj, expected",
    [
        # Known extensions
        ("file.jsonl", "r", None, jsonl.ext_jsonl),
        ("file.gz", "rb", io.BytesIO(b"\x1f\x8b"), jsonl.ext_gz),
        ("file.bz2", "rb", io.BytesIO(b"\x42\x5a\x68"), jsonl.ext_bz2),
        ("file.xz", "rb", io.BytesIO(b"\xfd\x37\x7a\x58\x5a\x00"), jsonl.ext_xz),
        # Unknown extensions but detected by signature
        ("file.unknown", "rb", io.BytesIO(b"\x1f\x8b"), jsonl.ext_gz),
        ("file.unknown", "rb", io.BytesIO(b"\x42\x5a\x68"), jsonl.ext_bz2),
        ("file.unknown", "rb", io.BytesIO(b"\xfd\x37\x7a\x58\x5a\x00"), jsonl.ext_xz),
        # Unknown extension but text mode
        ("file.unknown", "r", None, None),
        # Unknown extension but write/append binary mode
        ("file.unknown", "wb", None, None),
        # Unknown extensions and no fileobj or undetectable fileobj
        ("file", "r", None, None),  # No extension
        ("file", "rb", io.BytesIO(b""), None),  # No extension, empty fileobj
    ],
)
def test_get_file_extension_ok(name, mode, fileobj, expected, tmp_dir):
    """Check the extension resolved for each name / mode / file-object combination."""

    target = tmp_dir / name
    if not fileobj:
        # Without a file object the function may open the path itself, so the
        # file has to exist on disk.
        tests.write_text(target)
    detected = jsonl._get_file_extension(target, mode, fileobj=fileobj)
    assert detected == expected
92+
93+
94+
def test_get_file_extension_ko():
    """An unknown suffix in read mode opens the file, so a missing path must raise."""

    missing, read_mode = "nonexistent.file", "r"
    with pytest.raises(FileNotFoundError):
        jsonl._get_file_extension(missing, read_mode)

0 commit comments

Comments
 (0)