From 603dffc71543dbab4d02f777e3c3b0ab8058e445 Mon Sep 17 00:00:00 2001 From: William Rusnack Date: Wed, 29 May 2024 20:13:20 -0400 Subject: [PATCH 1/4] integrate utf8 hPutStr to standard hPutStr --- src/Data/Text/IO.hs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/IO.hs b/src/Data/Text/IO.hs index a4569bc3..ba054acd 100644 --- a/src/Data/Text/IO.hs +++ b/src/Data/Text/IO.hs @@ -49,14 +49,17 @@ import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout, withFile) import qualified Control.Exception as E import Control.Monad (liftM2, when) +import qualified Data.ByteString as B import Data.IORef (readIORef, writeIORef) import qualified Data.Text as T +import Data.Text.Encoding (encodeUtf8) import Data.Text.Internal.Fusion (stream) import Data.Text.Internal.Fusion.Types (Step(..), Stream(..)) import Data.Text.Internal.IO (hGetLineWith, readChunk) import GHC.IO.Buffer (Buffer(..), BufferState(..), RawCharBuffer, CharBuffer, emptyBuffer, isEmptyBuffer, newCharBuffer) import qualified GHC.IO.Buffer +import GHC.IO.Encoding (textEncodingName) import GHC.IO.Exception (IOException(ioe_type), IOErrorType(InappropriateType)) import GHC.IO.Handle.Internals (augmentIOError, hClose_help, wantReadableHandle, wantWritableHandle) @@ -176,12 +179,14 @@ hGetLine = hGetLineWith T.concat hPutStr :: Handle -> Text -> IO () -- This function is lifted almost verbatim from GHC.IO.Handle.Text. hPutStr h t = do - (buffer_mode, nl) <- + (buffer_mode, nl, isUTF8) <- wantWritableHandle "hPutStr" h $ \h_ -> do bmode <- getSpareBuffer h_ - return (bmode, haOutputNL h_) + let isUTF8 = maybe False ((== "UTF-8") . textEncodingName) $ haCodec h_ + return (bmode, haOutputNL h_, isUTF8) let str = stream t case buffer_mode of + _ | nl == LF && isUTF8 -> B.hPutStr h $ encodeUtf8 t (NoBuffering, _) -> hPutChars h str (LineBuffering, buf) -> writeLines h nl buf str (BlockBuffering _, buf) -> writeBlocks (nl == CRLF) h buf str From b56f07280c1463c59bfcaa6be2f55a9f601d1407 Mon Sep 17 00:00:00 2001 From: William Rusnack Date: Sat, 1 Jun 2024 09:24:40 -0400 Subject: [PATCH 2/4] comparing encoding pointers instead of strings of encoding names --- src/Data/Text/IO.hs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Data/Text/IO.hs b/src/Data/Text/IO.hs index ba054acd..c6fa7706 100644 --- a/src/Data/Text/IO.hs +++ b/src/Data/Text/IO.hs @@ -1,5 +1,6 @@ {-# LANGUAGE BangPatterns, CPP, RecordWildCards, ScopedTypeVariables #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE MagicHash #-} -- | -- Module : Data.Text.IO -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, @@ -56,10 +57,11 @@ import Data.Text.Encoding (encodeUtf8) import Data.Text.Internal.Fusion (stream) import Data.Text.Internal.Fusion.Types (Step(..), Stream(..)) import Data.Text.Internal.IO (hGetLineWith, readChunk) +import GHC.Exts (reallyUnsafePtrEquality#, isTrue#) import GHC.IO.Buffer (Buffer(..), BufferState(..), RawCharBuffer, CharBuffer, emptyBuffer, isEmptyBuffer, newCharBuffer) import qualified GHC.IO.Buffer -import GHC.IO.Encoding (textEncodingName) +import GHC.IO.Encoding (utf8) import GHC.IO.Exception (IOException(ioe_type), IOErrorType(InappropriateType)) import GHC.IO.Handle.Internals (augmentIOError, hClose_help, wantReadableHandle, wantWritableHandle) @@ -182,8 +184,7 @@ hPutStr h t = do (buffer_mode, nl, isUTF8) <- wantWritableHandle "hPutStr" h $ \h_ -> do bmode <- getSpareBuffer h_ - let isUTF8 = maybe False ((== "UTF-8") . textEncodingName) $ haCodec h_ - return (bmode, haOutputNL h_, isUTF8) + return (bmode, haOutputNL h_, eqUTF8 h_) let str = stream t case buffer_mode of _ | nl == LF && isUTF8 -> B.hPutStr h $ encodeUtf8 t @@ -191,6 +192,9 @@ hPutStr h t = do (LineBuffering, buf) -> writeLines h nl buf str (BlockBuffering _, buf) -> writeBlocks (nl == CRLF) h buf str + where + eqUTF8 = maybe False (\enc -> isTrue# (reallyUnsafePtrEquality# utf8 enc)) . haCodec + hPutChars :: Handle -> Stream Char -> IO () hPutChars h (Stream next0 s0 _len) = loop s0 where From e1068f177437ae5849c033e126310f03ac379fb5 Mon Sep 17 00:00:00 2001 From: Li-yao Xia Date: Mon, 3 Jun 2024 17:22:29 +0200 Subject: [PATCH 3/4] Simplify import of utf8 and add a comment about pointer comparison in hPutStr --- src/Data/Text/Internal/IO.hs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/Internal/IO.hs b/src/Data/Text/Internal/IO.hs index 1262be39..9badb3a6 100644 --- a/src/Data/Text/Internal/IO.hs +++ b/src/Data/Text/Internal/IO.hs @@ -40,12 +40,11 @@ import GHC.IO.Buffer (Buffer(..), BufferState(..), CharBuffer, RawCharBuffer, bufferAdjustL, bufferElems, charSize, emptyBuffer, isEmptyBuffer, newCharBuffer, readCharBuf, withRawBuffer, writeCharBuf) -import GHC.IO.Encoding (utf8) import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_, wantWritableHandle) import GHC.IO.Handle.Text (commitBuffer') import GHC.IO.Handle.Types (BufferList(..), BufferMode(..), Handle__(..), Newline(..)) -import System.IO (Handle, hPutChar) +import System.IO (Handle, hPutChar, utf8) import System.IO.Error (isEOFError) import qualified Data.Text as T @@ -204,6 +203,10 @@ hPutStreamOrUtf8 h str mPutUtf8 = do (BlockBuffering _, buf) -> writeBlocks (nl == CRLF) h buf str where + -- If the encoding is UTF-8, it's most likely pointer-equal to + -- 'System.IO.utf8', letting us avoid a String comparison. + -- If it is somehow UTF-8 but not pointer-equal to 'utf8', + -- we will just take a slower branch, but the result is still correct. eqUTF8 = maybe False (\enc -> isTrue# (reallyUnsafePtrEquality# utf8 enc)) . haCodec {-# INLINE hPutStreamOrUtf8 #-} From fd62de72a444e21962f3e35f4290b82fd017e21e Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 22 Jun 2024 23:53:47 +0100 Subject: [PATCH 4/4] Add a comment about non-atomic B.hPutStrLn --- src/Data/Text/Internal/IO.hs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Data/Text/Internal/IO.hs b/src/Data/Text/Internal/IO.hs index 9badb3a6..546d3ab6 100644 --- a/src/Data/Text/Internal/IO.hs +++ b/src/Data/Text/Internal/IO.hs @@ -180,12 +180,16 @@ hPutStream h str = hPutStreamOrUtf8 h str Nothing -- | Write a string to a handle. hPutStr :: Handle -> Text -> IO () hPutStr h t = hPutStreamOrUtf8 h (stream t) (Just putUtf8) - where putUtf8 = B.hPutStr h (encodeUtf8 t) + where + putUtf8 = B.hPutStr h (encodeUtf8 t) -- | Write a string to a handle, followed by a newline. hPutStrLn :: Handle -> Text -> IO () hPutStrLn h t = hPutStreamOrUtf8 h (streamLn t) (Just putUtf8) - where putUtf8 = hPutBuilder h (encodeUtf8Builder t <> charUtf8 '\n') + where + -- Not using B.hPutStrLn because it's not necessarily atomic: + -- https://github.com/haskell/bytestring/issues/200 + putUtf8 = hPutBuilder h (encodeUtf8Builder t <> charUtf8 '\n') -- | 'hPutStream' with an optional special case when the output encoding is -- UTF-8 and without newline conversion.