Skip to content

Commit ef986d2

Browse files
Reorder requirements file decoding
This changes the decoding process to be more in line with what was previously documented. The new process is outlined in the updated docs. The `auto_decode` function was removed and all decoding logic moved to the `pip._internal.req.req_file` module because: * This function was only ever used to decode requirements files. * It was never really a generic 'util' function; it was always tied to the idiosyncrasies of decoding requirements files. * The module lived under `_internal`, so I felt comfortable removing it. A warning was added when we _do_ fall back to using the locale-defined encoding, to encourage users to move to an explicit encoding definition via a coding style comment. This fixes two existing bugs. Firstly, when: * a requirements file is encoded as UTF-8, and * some bytes in the file are incompatible with the system locale. Previously, assuming no BOM or PEP-263 style comment, we would default to using the encoding from the system locale, which would then fail (see issue #12771). Secondly, when decoding a file starting with a UTF-32 little endian Byte Order Marker. Previously this would always fail, since `codecs.BOM_UTF32_LE` is `codecs.BOM_UTF16_LE` followed by two null bytes, and because of the ordering of the list of BOMs the UTF-16 case would be run first and match the file prefix, so we would incorrectly deduce that the file was UTF-16 little endian encoded. I can't imagine this is a popular encoding for a requirements file. Fixes: #12771
1 parent dd6c4ad commit ef986d2

File tree

6 files changed

+176
-86
lines changed

6 files changed

+176
-86
lines changed

docs/html/reference/requirements-file-format.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,14 @@ examples of all these forms, see {ref}`pip install Examples`.
5656

5757
### Encoding
5858

59-
Requirements files are `utf-8` encoding by default and also support
60-
{pep}`263` style comments to change the encoding (i.e.
61-
`# -*- coding: <encoding name> -*-`).
59+
The default encoding for requirement files is `UTF-8` unless a different
60+
encoding is specified using a {pep}`263` style comment (e.g. `# -*- coding:
61+
<encoding name> -*-`).
62+
63+
```{warning}
64+
pip will fall back to the locale-defined encoding if `UTF-8` decoding fails. This is a quirk
65+
of pip's parser. This behaviour is *deprecated* and should not be relied upon.
66+
```
6267

6368
### Line continuations
6469

news/12771.feature.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Reorder the encoding detection when decoding a requirements file, relying on
2+
UTF-8 over the locale encoding by default.

src/pip/_internal/req/req_file.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
Requirements file parsing
33
"""
44

5+
import codecs
6+
import locale
57
import logging
68
import optparse
79
import os
810
import re
911
import shlex
12+
import sys
1013
import urllib.parse
1114
from dataclasses import dataclass
1215
from optparse import Values
@@ -26,7 +29,6 @@
2629
from pip._internal.cli import cmdoptions
2730
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
2831
from pip._internal.models.search_scope import SearchScope
29-
from pip._internal.utils.encoding import auto_decode
3032

3133
if TYPE_CHECKING:
3234
from pip._internal.index.package_finder import PackageFinder
@@ -82,6 +84,21 @@
8284
str(o().dest) for o in SUPPORTED_OPTIONS_EDITABLE_REQ
8385
]
8486

87+
# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
88+
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
89+
BOMS: List[Tuple[bytes, str]] = [
90+
(codecs.BOM_UTF8, "utf-8"),
91+
(codecs.BOM_UTF32, "utf-32"),
92+
(codecs.BOM_UTF32_BE, "utf-32-be"),
93+
(codecs.BOM_UTF32_LE, "utf-32-le"),
94+
(codecs.BOM_UTF16, "utf-16"),
95+
(codecs.BOM_UTF16_BE, "utf-16-be"),
96+
(codecs.BOM_UTF16_LE, "utf-16-le"),
97+
]
98+
99+
PEP263_ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
100+
DEFAULT_ENCODING = "utf-8"
101+
85102
logger = logging.getLogger(__name__)
86103

87104

@@ -568,7 +585,39 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
568585
# Assume this is a bare path.
569586
try:
570587
with open(url, "rb") as f:
571-
content = auto_decode(f.read())
588+
raw_content = f.read()
572589
except OSError as exc:
573590
raise InstallationError(f"Could not open requirements file: {exc}")
591+
592+
content = _decode_req_file(raw_content, url)
593+
574594
return url, content
595+
596+
597+
def _decode_req_file(data: bytes, url: str) -> str:
    """Decode the raw bytes of a requirements file into text.

    The encoding is selected by the first rule that applies:

    1. A leading byte order mark listed in ``BOMS``: the mark is stripped
       and the remainder is decoded with the associated codec.
    2. A PEP 263 style ``coding`` comment on one of the first two lines.
    3. ``DEFAULT_ENCODING``.
    4. If decoding with ``DEFAULT_ENCODING`` fails, the locale's preferred
       encoding (or the interpreter default when the locale reports none).
       A warning is logged in this case.

    :param data: Raw content of the requirements file.
    :param url: Where ``data`` was read from; only used in the warning.
    :return: The decoded text.
    :raises UnicodeDecodeError: If ``data`` cannot be decoded with the
        selected encoding.
    :raises LookupError: If a coding comment names an unknown codec.
    """
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return data[len(bom) :].decode(encoding)

    # Like PEP 263, only the first two lines may carry a coding comment.
    for line in data.split(b"\n")[:2]:
        if line[0:1] == b"#":
            result = PEP263_ENCODING_RE.search(line)
            if result is not None:
                encoding = result.groups()[0].decode("ascii")
                return data.decode(encoding)

    try:
        return data.decode(DEFAULT_ENCODING)
    except UnicodeDecodeError:
        locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
        # Use the module logger rather than the root-level logging.warning():
        # the record is then attributed to this module, and no implicit
        # logging.basicConfig() call is made when the root logger has no
        # handlers.
        logger.warning(
            "unable to decode data from %s with default encoding %s, "
            "falling back to encoding from locale: %s. "
            "If this is intentional you should specify the encoding with a "
            "PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
            url,
            DEFAULT_ENCODING,
            locale_encoding,
            locale_encoding,
        )
        return data.decode(locale_encoding)

src/pip/_internal/utils/encoding.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

tests/unit/test_req_file.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
import collections
23
import logging
34
import os
@@ -955,3 +956,116 @@ def test_install_requirements_with_options(
955956
)
956957

957958
assert req.global_options == [global_option]
959+
960+
@pytest.mark.parametrize(
961+
"raw_req_file,expected_name,expected_spec",
962+
[
963+
pytest.param(
964+
b"Django==1.4.2",
965+
"Django",
966+
"==1.4.2",
967+
id="defaults to UTF-8",
968+
),
969+
pytest.param(
970+
"# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
971+
"Django",
972+
"==1.4.2",
973+
id="decodes based on PEP-263 style headers",
974+
),
975+
],
976+
)
977+
def test_general_decoding(
978+
self,
979+
raw_req_file: bytes,
980+
expected_name: str,
981+
expected_spec: str,
982+
tmpdir: Path,
983+
session: PipSession,
984+
) -> None:
985+
req_file = tmpdir / "requirements.txt"
986+
req_file.write_bytes(raw_req_file)
987+
988+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
989+
990+
assert len(reqs) == 1
991+
assert reqs[0].name == expected_name
992+
assert reqs[0].specifier == expected_spec
993+
994+
@pytest.mark.parametrize(
995+
"bom,encoding",
996+
[
997+
(codecs.BOM_UTF8, "utf-8"),
998+
(codecs.BOM_UTF16_BE, "utf-16-be"),
999+
(codecs.BOM_UTF16_LE, "utf-16-le"),
1000+
(codecs.BOM_UTF32_BE, "utf-32-be"),
1001+
(codecs.BOM_UTF32_LE, "utf-32-le"),
1002+
# BOM automatically added when encoding byte-order dependent encodings
1003+
(b"", "utf-16"),
1004+
(b"", "utf-32"),
1005+
],
1006+
)
1007+
def test_decoding_with_BOM(
1008+
self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
1009+
) -> None:
1010+
req_name = "Django"
1011+
req_specifier = "==1.4.2"
1012+
encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
1013+
req_file = tmpdir / "requirements.txt"
1014+
req_file.write_bytes(encoded_contents)
1015+
1016+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
1017+
1018+
assert len(reqs) == 1
1019+
assert reqs[0].name == req_name
1020+
assert reqs[0].specifier == req_specifier
1021+
1022+
def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
1023+
self,
1024+
tmpdir: Path,
1025+
session: PipSession,
1026+
caplog: pytest.LogCaptureFixture,
1027+
) -> None:
1028+
# \xff is valid in latin-1 but not UTF-8
1029+
data = b"pip<=24.0 # some comment\xff\n"
1030+
locale_encoding = "latin-1"
1031+
req_file = tmpdir / "requirements.txt"
1032+
req_file.write_bytes(data)
1033+
1034+
# it's hard to rely on a locale definitely existing for testing
1035+
# so patch things out for simplicity
1036+
with caplog.at_level(logging.WARNING), mock.patch(
1037+
"locale.getpreferredencoding", return_value=locale_encoding
1038+
):
1039+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
1040+
1041+
assert len(caplog.records) == 1
1042+
assert (
1043+
caplog.records[0].msg
1044+
== "unable to decode data from %s with default encoding %s, "
1045+
"falling back to encoding from locale: %s. "
1046+
"If this is intentional you should specify the encoding with a "
1047+
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
1048+
)
1049+
assert caplog.records[0].args == (
1050+
str(req_file),
1051+
"utf-8",
1052+
locale_encoding,
1053+
locale_encoding,
1054+
)
1055+
1056+
assert len(reqs) == 1
1057+
assert reqs[0].name == "pip"
1058+
assert str(reqs[0].specifier) == "<=24.0"
1059+
1060+
@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
1061+
def test_errors_on_non_decodable_data(
1062+
self, encoding: str, tmpdir: Path, session: PipSession
1063+
) -> None:
1064+
data = b"\xff"
1065+
req_file = tmpdir / "requirements.txt"
1066+
req_file.write_bytes(data)
1067+
1068+
with pytest.raises(UnicodeDecodeError), mock.patch(
1069+
"locale.getpreferredencoding", return_value=encoding
1070+
):
1071+
next(parse_reqfile(req_file.resolve(), session=session))

tests/unit/test_utils.py

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
44
"""
55

6-
import codecs
76
import os
87
import shutil
98
import stat
@@ -12,7 +11,7 @@
1211
from io import BytesIO
1312
from pathlib import Path
1413
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
15-
from unittest.mock import Mock, patch
14+
from unittest.mock import Mock
1615

1716
import pytest
1817

@@ -21,7 +20,6 @@
2120
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
2221
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
2322
from pip._internal.utils.egg_link import egg_link_path_from_location
24-
from pip._internal.utils.encoding import BOMS, auto_decode
2523
from pip._internal.utils.glibc import (
2624
glibc_version_string,
2725
glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
445443
assert not empty_hashes.has_one_of({"sha256": "xyzt"})
446444

447445

448-
class TestEncoding:
449-
"""Tests for pip._internal.utils.encoding"""
450-
451-
def test_auto_decode_utf_16_le(self) -> None:
452-
data = (
453-
b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
454-
b"=\x001\x00.\x004\x00.\x002\x00"
455-
)
456-
assert data.startswith(codecs.BOM_UTF16_LE)
457-
assert auto_decode(data) == "Django==1.4.2"
458-
459-
def test_auto_decode_utf_16_be(self) -> None:
460-
data = (
461-
b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
462-
b"\x00=\x001\x00.\x004\x00.\x002"
463-
)
464-
assert data.startswith(codecs.BOM_UTF16_BE)
465-
assert auto_decode(data) == "Django==1.4.2"
466-
467-
def test_auto_decode_no_bom(self) -> None:
468-
assert auto_decode(b"foobar") == "foobar"
469-
470-
def test_auto_decode_pep263_headers(self) -> None:
471-
latin1_req = "# coding=latin1\n# Pas trop de café"
472-
assert auto_decode(latin1_req.encode("latin1")) == latin1_req
473-
474-
def test_auto_decode_no_preferred_encoding(self) -> None:
475-
om, em = Mock(), Mock()
476-
om.return_value = "ascii"
477-
em.return_value = None
478-
data = "data"
479-
with patch("sys.getdefaultencoding", om):
480-
with patch("locale.getpreferredencoding", em):
481-
ret = auto_decode(data.encode(sys.getdefaultencoding()))
482-
assert ret == data
483-
484-
@pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
485-
def test_all_encodings_are_valid(self, encoding: str) -> None:
486-
# we really only care that there is no LookupError
487-
assert "".encode(encoding).decode(encoding) == ""
488-
489-
490446
def raises(error: Type[Exception]) -> NoReturn:
491447
raise error
492448

0 commit comments

Comments
 (0)