From 18c1e2493be381f986b44f04502be7b2adb59049 Mon Sep 17 00:00:00 2001 From: Matt Harbison Date: Fri, 10 Jan 2025 00:41:18 -0500 Subject: [PATCH] avoid encoding errors with unicode content piped through stdio on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider this trivial file (with a trailing LF): print('This is a unicode character: ≠'.encode("UTF-8")) This command worked in cmd.exe or an MSYS terminal, and printed ≠ correctly: $ cat test.py | pyupgrade.exe --py38-plus - This crashed with an encoding error: $ cat test.py | pyupgrade.exe --py38-plus - > reformated.py Traceback (most recent call last): File "C:\hgdev\python39-x64\lib\runpy.py", line 197, in _run_module_as_main return _run_code(code, main_globals, None, File "C:\hgdev\python39-x64\lib\runpy.py", line 87, in _run_code exec(code, run_globals) File "c:\Users\Matt\.local\bin\pyupgrade.exe\__main__.py", line 7, in <module> File "C:\Users\Matt\pipx\venvs\pyupgrade\lib\site-packages\pyupgrade\_main.py", line 389, in main ret |= _fix_file(filename, args) File "C:\Users\Matt\pipx\venvs\pyupgrade\lib\site-packages\pyupgrade\_main.py", line 330, in _fix_file print(contents_text, end='') File "C:\hgdev\python39-x64\lib\encodings\cp1252.py", line 19, in encode return codecs.charmap_encode(input,self.errors,encoding_table)[0] UnicodeEncodeError: 'charmap' codec can't encode character '\u2260' in position 36: character maps to <undefined> Since bytes are read from `stdin.buffer` and decoded as UTF-8 when the input file is '-', it makes sense to write UTF-8 bytes to `stdout.buffer`, and avoid using the default codepage. The use case here is wiring this up to the `hg fix` extension, which writes content to the tool's stdin and reads it back from its stdout to reformat files. That shouldn't change the encoding. A workaround using the existing code is to set `PYTHONUTF8=1` in the environment, but that's not obvious or always easily done. 
This change also has the nice side effect of no longer changing LF input to CRLF output. (`print(..., end='')` only suppresses the line terminator that `print` itself appends; the conversion of LF characters already in the text to CRLF is newline translation performed by the text-mode `sys.stdout` object on Windows, and is not something the print function can override.) --- pyupgrade/_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyupgrade/_main.py b/pyupgrade/_main.py index e52b9c66..1b5d1ea2 100644 --- a/pyupgrade/_main.py +++ b/pyupgrade/_main.py @@ -327,7 +327,7 @@ def _fix_file(filename: str, args: argparse.Namespace) -> int: contents_text = _fix_tokens(contents_text) if filename == '-': - print(contents_text, end='') + sys.stdout.buffer.write(contents_text.encode()) elif contents_text != contents_text_orig: print(f'Rewriting {filename}', file=sys.stderr) with open(filename, 'w', encoding='UTF-8', newline='') as f: