Skip to content

Commit

Permalink
Reorder requirements file decoding
Browse files Browse the repository at this point in the history
This changes the decoding process to be more in line with what was
previously documented. The new process is outlined in the updated docs.

The `auto_decode` function was removed and all decoding logic moved to
the `pip._internal.req.req_file` module because:

* This function was only ever used to decode requirements file
* It was never really a generic 'util' function, it was always tied to
  the idiosyncrasies of decoding requirements files.
* The module lived under `_internal` so I felt comfortable removing it

A warning was added when we _do_ fall back to using the locale-defined
encoding, to encourage users to move to an explicit encoding definition
via a coding style comment.

This fixes two existing bugs. Firstly, when:

* a requirements file is encoded as UTF-8, and
* some bytes in the file are incompatible with the system locale

Previously, assuming no BOM or PEP-263 style comment, we would default
to using the encoding from the system locale, which would then fail (see
issue pypa#12771)

Secondly, when decoding a file starting with a UTF-32 little endian Byte
Order Marker. Previously this would always fail since
`codecs.BOM_UTF32_LE` is `codecs.BOM_UTF16_LE` followed by two null
bytes, and because of the ordering of the list of BOMs the UTF-16 case
would be run first and match the file prefix so we would incorrectly
deduce that the file was UTF-16 little endian encoded. I can't imagine
this is a popular encoding for a requirements file.

Fixes: pypa#12771
  • Loading branch information
matthewhughes934 committed Dec 26, 2024
1 parent 7c218b9 commit f2eb413
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 86 deletions.
12 changes: 9 additions & 3 deletions docs/html/reference/requirements-file-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,15 @@ examples of all these forms, see {ref}`pip install Examples`.

### Encoding

Requirements files are `utf-8` encoding by default and also support
{pep}`263` style comments to change the encoding (i.e.
`# -*- coding: <encoding name> -*-`).
It is simplest to encode your requirements files with UTF-8.
The process for decoding requirements files is:

- Check for any Byte Order Mark at the start of the file and if found use
the corresponding encoding to decode the file.
- Check for any {pep}`263` style comment (e.g. `# -*- coding: <encoding name> -*-`)
and if found decode with the given encoding.
- Try to decode with UTF-8, and if that fails,
- fall back to trying to decode using the locale-defined encoding.

### Line continuations

Expand Down
2 changes: 2 additions & 0 deletions news/12771.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Reorder the encoding detection when decoding a requirements file, relying on
UTF-8 over the locale encoding by default.
55 changes: 53 additions & 2 deletions src/pip/_internal/req/req_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
Requirements file parsing
"""

import codecs
import locale
import logging
import optparse
import os
import re
import shlex
import sys
import urllib.parse
from dataclasses import dataclass
from optparse import Values
Expand All @@ -26,7 +29,6 @@
from pip._internal.cli import cmdoptions
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
from pip._internal.models.search_scope import SearchScope
from pip._internal.utils.encoding import auto_decode

if TYPE_CHECKING:
from pip._internal.index.package_finder import PackageFinder
Expand Down Expand Up @@ -568,7 +570,56 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
# Assume this is a bare path.
try:
with open(url, "rb") as f:
content = auto_decode(f.read())
raw_content = f.read()
except OSError as exc:
raise InstallationError(f"Could not open requirements file: {exc}")

content = _decode_req_file(raw_content, url)

return url, content


BOMS: List[Tuple[bytes, str]] = [
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF32, "utf-32"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
(codecs.BOM_UTF16, "utf-16"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
]

ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
DEFAULT_ENCODING = "utf-8"


def _decode_req_file(data: bytes, url: str) -> str:
# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
for bom, encoding in BOMS:
if data.startswith(bom):
return data[len(bom) :].decode(encoding)

# PEP-263 style comments
for line in data.split(b"\n")[:2]:
if line[0:1] == b"#" and ENCODING_RE.search(line):
result = ENCODING_RE.search(line)
assert result is not None
encoding = result.groups()[0].decode("ascii")
return data.decode(encoding)

try:
return data.decode(DEFAULT_ENCODING)
except UnicodeDecodeError:
locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
logging.warning(
"unable to decode data from %s with default encoding %s, "
"falling back to encoding from locale: %s. "
"If this is intentional you should specify the encoding with a "
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
url,
DEFAULT_ENCODING,
locale_encoding,
locale_encoding,
)
return data.decode(locale_encoding)
36 changes: 0 additions & 36 deletions src/pip/_internal/utils/encoding.py

This file was deleted.

114 changes: 114 additions & 0 deletions tests/unit/test_req_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import collections
import logging
import os
Expand Down Expand Up @@ -955,3 +956,116 @@ def test_install_requirements_with_options(
)

assert req.global_options == [global_option]

@pytest.mark.parametrize(
    "raw_req_file,expected_name,expected_spec",
    [
        pytest.param(
            b"Django==1.4.2",
            "Django",
            "==1.4.2",
            id="defaults to UTF-8",
        ),
        pytest.param(
            "# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
            "Django",
            "==1.4.2",
            id="decodes based on PEP-263 style headers",
        ),
    ],
)
def test_general_decoding(
    self,
    raw_req_file: bytes,
    expected_name: str,
    expected_spec: str,
    tmpdir: Path,
    session: PipSession,
) -> None:
    """Requirements parse correctly regardless of the file's byte encoding."""
    path = tmpdir / "requirements.txt"
    path.write_bytes(raw_req_file)

    parsed = list(parse_reqfile(path.resolve(), session=session))

    assert len(parsed) == 1
    (req,) = parsed
    assert req.name == expected_name
    assert req.specifier == expected_spec

@pytest.mark.parametrize(
    "bom,encoding",
    [
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        # BOM automatically added when encoding byte-order dependent encodings
        (b"", "utf-16"),
        (b"", "utf-32"),
    ],
)
def test_decoding_with_BOM(
    self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """A leading Byte Order Mark selects the encoding used to decode the file."""
    name = "Django"
    specifier = "==1.4.2"
    contents = bom + (name + specifier).encode(encoding)
    path = tmpdir / "requirements.txt"
    path.write_bytes(contents)

    parsed = list(parse_reqfile(path.resolve(), session=session))

    assert len(parsed) == 1
    (req,) = parsed
    assert req.name == name
    assert req.specifier == specifier

def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
    self,
    tmpdir: Path,
    session: PipSession,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """When UTF-8 decoding fails, a warning is emitted and the locale's
    preferred encoding is used instead."""
    # \xff is valid in latin-1 but not UTF-8
    data = b"pip<=24.0 # some comment\xff\n"
    locale_encoding = "latin-1"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    # it's hard to rely on a locale definitely existing for testing
    # so patch things out for simplicity
    with caplog.at_level(logging.WARNING), mock.patch(
        "locale.getpreferredencoding", return_value=locale_encoding
    ):
        reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

    # Exactly one warning, with the raw %-style template and its args kept
    # separate (lazy logging), so both are asserted independently.
    assert len(caplog.records) == 1
    assert (
        caplog.records[0].msg
        == "unable to decode data from %s with default encoding %s, "
        "falling back to encoding from locale: %s. "
        "If this is intentional you should specify the encoding with a "
        "PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
    )
    assert caplog.records[0].args == (
        str(req_file),
        "utf-8",
        locale_encoding,
        locale_encoding,
    )

    # The file still parsed successfully via the latin-1 fallback.
    assert len(reqs) == 1
    assert reqs[0].name == "pip"
    assert str(reqs[0].specifier) == "<=24.0"

@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
def test_errors_on_non_decodable_data(
    self, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """If neither UTF-8 nor the (patched) locale encoding can decode the
    file, the UnicodeDecodeError propagates to the caller.

    Note: renamed from ``test_erorrs_on_non_decodable_data`` to fix the
    typo; pytest still discovers it via the ``test_`` prefix.
    """
    # A lone \xff byte is invalid in both UTF-8 and GBK.
    data = b"\xff"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    # Patch the locale so the fallback decode also fails deterministically.
    with pytest.raises(UnicodeDecodeError), mock.patch(
        "locale.getpreferredencoding", return_value=encoding
    ):
        next(parse_reqfile(req_file.resolve(), session=session))
46 changes: 1 addition & 45 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"""

import codecs
import os
import shutil
import stat
Expand All @@ -12,7 +11,7 @@
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
from unittest.mock import Mock, patch
from unittest.mock import Mock

import pytest

Expand All @@ -21,7 +20,6 @@
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
from pip._internal.utils.egg_link import egg_link_path_from_location
from pip._internal.utils.encoding import BOMS, auto_decode
from pip._internal.utils.glibc import (
glibc_version_string,
glibc_version_string_confstr,
Expand Down Expand Up @@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
assert not empty_hashes.has_one_of({"sha256": "xyzt"})


class TestEncoding:
    """Tests for pip._internal.utils.encoding

    NOTE(review): this diff deletes these tests along with the
    ``auto_decode`` helper they exercise; equivalent coverage moved to
    tests/unit/test_req_file.py.
    """

    def test_auto_decode_utf_16_le(self) -> None:
        # "Django==1.4.2" encoded as UTF-16-LE, preceded by its BOM
        data = (
            b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
            b"=\x001\x00.\x004\x00.\x002\x00"
        )
        assert data.startswith(codecs.BOM_UTF16_LE)
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_utf_16_be(self) -> None:
        # "Django==1.4.2" encoded as UTF-16-BE, preceded by its BOM
        data = (
            b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
            b"\x00=\x001\x00.\x004\x00.\x002"
        )
        assert data.startswith(codecs.BOM_UTF16_BE)
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_no_bom(self) -> None:
        # With no BOM and no coding comment, plain ASCII decodes unchanged
        assert auto_decode(b"foobar") == "foobar"

    def test_auto_decode_pep263_headers(self) -> None:
        # A PEP-263 coding comment selects the declared encoding
        latin1_req = "# coding=latin1\n# Pas trop de café"
        assert auto_decode(latin1_req.encode("latin1")) == latin1_req

    def test_auto_decode_no_preferred_encoding(self) -> None:
        # With no locale preferred encoding, falls back to
        # sys.getdefaultencoding() (patched here to ascii)
        om, em = Mock(), Mock()
        om.return_value = "ascii"
        em.return_value = None
        data = "data"
        with patch("sys.getdefaultencoding", om):
            with patch("locale.getpreferredencoding", em):
                ret = auto_decode(data.encode(sys.getdefaultencoding()))
        assert ret == data

    @pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
    def test_all_encodings_are_valid(self, encoding: str) -> None:
        # we really only care that there is no LookupError
        assert "".encode(encoding).decode(encoding) == ""


def raises(error: Type[Exception]) -> NoReturn:
raise error

Expand Down

0 comments on commit f2eb413

Please sign in to comment.