Skip to content

Commit

Permalink
Reorder requirements file decoding
Browse files Browse the repository at this point in the history
This changes the decoding process to be more in line with what was
previously documented. The new process is outlined in the updated docs.

The `auto_decode` function was removed and all decoding logic moved to
the `pip._internal.req.req_file` module because:

* This function was only ever used to decode requirements file
* It was never really a generic 'util' function, it was always tied to
  the idiosyncrasies of decoding requirements files.
* The module lived under `_internal` so I felt comfortable removing it

A warning was added when we _do_ fall back to using the locale-defined
encoding, to encourage users to move to an explicit encoding definition
via a coding style comment.

This fixes two existing bugs. Firstly, when:

* a requirements file is encoded as UTF-8, and
* some bytes in the file are incompatible with the system locale

Previously, assuming no BOM or PEP-263 style comment, we would default
to using the encoding from the system locale, which would then fail (see
issue pypa#12771)

Secondly, when decoding a file starting with a UTF-32 little endian Byte
Order Marker. Previously this would always fail since
`codecs.BOM_UTF32_LE` is `codecs.BOM_UTF16_LE` followed by two null
bytes, and because of the ordering of the list of BOMs the UTF-16 case
would be run first and match the file prefix so we would incorrectly
deduce that the file was UTF-16 little endian encoded. I can't imagine
this is a popular encoding for a requirements file.

Fixes: pypa#12771
  • Loading branch information
matthewhughes934 committed Dec 26, 2024
1 parent 7c218b9 commit f2eb413
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 86 deletions.
12 changes: 9 additions & 3 deletions docs/html/reference/requirements-file-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,15 @@ examples of all these forms, see {ref}`pip install Examples`.

### Encoding

Requirements files are `utf-8` encoding by default and also support
{pep}`263` style comments to change the encoding (i.e.
`# -*- coding: <encoding name> -*-`).
It is simplest to encode your requirements files with UTF-8.
The process for decoding requirements files is:

- Check for any Byte Order Mark at the start of the file and if found use
the corresponding encoding to decode the file.
- Check for any {pep}`263` style comment (e.g. `# -*- coding: <encoding name> -*-`)
and if found decode with the given encoding.
- Try to decode with UTF-8, and if that fails,
- fall back to trying to decode using the locale-defined encoding.

### Line continuations

Expand Down
2 changes: 2 additions & 0 deletions news/12771.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Reorder the encoding detection when decoding a requirements file, relying on
UTF-8 over the locale encoding by default.
55 changes: 53 additions & 2 deletions src/pip/_internal/req/req_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
Requirements file parsing
"""

import codecs
import locale
import logging
import optparse
import os
import re
import shlex
import sys
import urllib.parse
from dataclasses import dataclass
from optparse import Values
Expand All @@ -26,7 +29,6 @@
from pip._internal.cli import cmdoptions
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
from pip._internal.models.search_scope import SearchScope
from pip._internal.utils.encoding import auto_decode

if TYPE_CHECKING:
from pip._internal.index.package_finder import PackageFinder
Expand Down Expand Up @@ -568,7 +570,56 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
# Assume this is a bare path.
try:
with open(url, "rb") as f:
content = auto_decode(f.read())
raw_content = f.read()
except OSError as exc:
raise InstallationError(f"Could not open requirements file: {exc}")

content = _decode_req_file(raw_content, url)

return url, content


BOMS: List[Tuple[bytes, str]] = [
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF32, "utf-32"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
(codecs.BOM_UTF16, "utf-16"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
]

ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
DEFAULT_ENCODING = "utf-8"


def _decode_req_file(data: bytes, url: str) -> str:
# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
for bom, encoding in BOMS:
if data.startswith(bom):
return data[len(bom) :].decode(encoding)

# PEP-263 style comments
for line in data.split(b"\n")[:2]:
if line[0:1] == b"#" and ENCODING_RE.search(line):
result = ENCODING_RE.search(line)
assert result is not None
encoding = result.groups()[0].decode("ascii")
return data.decode(encoding)

try:
return data.decode(DEFAULT_ENCODING)
except UnicodeDecodeError:
locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
logging.warning(
"unable to decode data from %s with default encoding %s, "
"falling back to encoding from locale: %s. "
"If this is intentional you should specify the encoding with a "
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
url,
DEFAULT_ENCODING,
locale_encoding,
locale_encoding,
)
return data.decode(locale_encoding)
36 changes: 0 additions & 36 deletions src/pip/_internal/utils/encoding.py

This file was deleted.

114 changes: 114 additions & 0 deletions tests/unit/test_req_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import collections
import logging
import os
Expand Down Expand Up @@ -955,3 +956,116 @@ def test_install_requirements_with_options(
)

assert req.global_options == [global_option]

@pytest.mark.parametrize(
    "raw_req_file,expected_name,expected_spec",
    [
        pytest.param(
            b"Django==1.4.2",
            "Django",
            "==1.4.2",
            id="defaults to UTF-8",
        ),
        pytest.param(
            "# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
            "Django",
            "==1.4.2",
            id="decodes based on PEP-263 style headers",
        ),
    ],
)
def test_general_decoding(
    self,
    raw_req_file: bytes,
    expected_name: str,
    expected_spec: str,
    tmpdir: Path,
    session: PipSession,
) -> None:
    """Requirements parse correctly regardless of the file's byte encoding."""
    path = tmpdir / "requirements.txt"
    path.write_bytes(raw_req_file)

    parsed = list(parse_reqfile(path.resolve(), session=session))

    assert len(parsed) == 1
    (req,) = parsed
    assert req.name == expected_name
    assert req.specifier == expected_spec

@pytest.mark.parametrize(
    "bom,encoding",
    [
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        # BOM automatically added when encoding byte-order dependent encodings
        (b"", "utf-16"),
        (b"", "utf-32"),
    ],
)
def test_decoding_with_BOM(
    self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """A leading Byte Order Mark selects the encoding used to decode the file."""
    name = "Django"
    specifier = "==1.4.2"
    contents = bom + (name + specifier).encode(encoding)
    path = tmpdir / "requirements.txt"
    path.write_bytes(contents)

    parsed = list(parse_reqfile(path.resolve(), session=session))

    assert len(parsed) == 1
    (req,) = parsed
    assert req.name == name
    assert req.specifier == specifier

def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
    self,
    tmpdir: Path,
    session: PipSession,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """When UTF-8 decoding fails, a warning is emitted and the locale's
    preferred encoding is used instead."""
    # \xff is valid in latin-1 but not UTF-8
    data = b"pip<=24.0 # some comment\xff\n"
    locale_encoding = "latin-1"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    # it's hard to rely on a locale definitely existing for testing
    # so patch things out for simplicity
    with caplog.at_level(logging.WARNING), mock.patch(
        "locale.getpreferredencoding", return_value=locale_encoding
    ):
        reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

    # Exactly one warning, with the raw %-style template and its args kept
    # separate (lazy logging), so both are asserted independently.
    assert len(caplog.records) == 1
    assert (
        caplog.records[0].msg
        == "unable to decode data from %s with default encoding %s, "
        "falling back to encoding from locale: %s. "
        "If this is intentional you should specify the encoding with a "
        "PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
    )
    assert caplog.records[0].args == (
        str(req_file),
        "utf-8",
        locale_encoding,
        locale_encoding,
    )

    # The file still parsed successfully via the latin-1 fallback.
    assert len(reqs) == 1
    assert reqs[0].name == "pip"
    assert str(reqs[0].specifier) == "<=24.0"

@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
def test_errors_on_non_decodable_data(
    self, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """If neither UTF-8 nor the (patched) locale encoding can decode the
    file, the UnicodeDecodeError propagates to the caller.

    Note: renamed from ``test_erorrs_on_non_decodable_data`` to fix the
    typo; pytest still discovers it via the ``test_`` prefix.
    """
    # A lone \xff byte is invalid in both UTF-8 and GBK.
    data = b"\xff"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    # Patch the locale so the fallback decode also fails deterministically.
    with pytest.raises(UnicodeDecodeError), mock.patch(
        "locale.getpreferredencoding", return_value=encoding
    ):
        next(parse_reqfile(req_file.resolve(), session=session))
46 changes: 1 addition & 45 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"""

import codecs
import os
import shutil
import stat
Expand All @@ -12,7 +11,7 @@
from io import BytesIO
from pathlib import Path
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
from unittest.mock import Mock, patch
from unittest.mock import Mock

import pytest

Expand All @@ -21,7 +20,6 @@
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
from pip._internal.utils.egg_link import egg_link_path_from_location
from pip._internal.utils.encoding import BOMS, auto_decode
from pip._internal.utils.glibc import (
glibc_version_string,
glibc_version_string_confstr,
Expand Down Expand Up @@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
assert not empty_hashes.has_one_of({"sha256": "xyzt"})


class TestEncoding:
    """Tests for pip._internal.utils.encoding

    NOTE(review): this diff deletes these tests along with the
    ``auto_decode`` helper they exercise; equivalent coverage moved to
    tests/unit/test_req_file.py.
    """

    def test_auto_decode_utf_16_le(self) -> None:
        # "Django==1.4.2" encoded as UTF-16-LE, preceded by its BOM
        data = (
            b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
            b"=\x001\x00.\x004\x00.\x002\x00"
        )
        assert data.startswith(codecs.BOM_UTF16_LE)
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_utf_16_be(self) -> None:
        # "Django==1.4.2" encoded as UTF-16-BE, preceded by its BOM
        data = (
            b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
            b"\x00=\x001\x00.\x004\x00.\x002"
        )
        assert data.startswith(codecs.BOM_UTF16_BE)
        assert auto_decode(data) == "Django==1.4.2"

    def test_auto_decode_no_bom(self) -> None:
        # With no BOM and no coding comment, plain ASCII decodes unchanged
        assert auto_decode(b"foobar") == "foobar"

    def test_auto_decode_pep263_headers(self) -> None:
        # A PEP-263 coding comment selects the declared encoding
        latin1_req = "# coding=latin1\n# Pas trop de café"
        assert auto_decode(latin1_req.encode("latin1")) == latin1_req

    def test_auto_decode_no_preferred_encoding(self) -> None:
        # With no locale preferred encoding, falls back to
        # sys.getdefaultencoding() (patched here to ascii)
        om, em = Mock(), Mock()
        om.return_value = "ascii"
        em.return_value = None
        data = "data"
        with patch("sys.getdefaultencoding", om):
            with patch("locale.getpreferredencoding", em):
                ret = auto_decode(data.encode(sys.getdefaultencoding()))
        assert ret == data

    @pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
    def test_all_encodings_are_valid(self, encoding: str) -> None:
        # we really only care that there is no LookupError
        assert "".encode(encoding).decode(encoding) == ""


def raises(error: Type[Exception]) -> NoReturn:
raise error

Expand Down

0 comments on commit f2eb413

Please sign in to comment.