Skip to content

Commit

Permalink
Use safe isValidUtf8 for large inputs (#470)
Browse files Browse the repository at this point in the history
* Use safe isValidUtf8 for large inputs

* Use compat function for checking pinnedness

* Touch byte array after passing it to safe FFI call
  • Loading branch information
noughtmare authored Feb 24, 2022
1 parent df1706a commit 38889aa
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 19 deletions.
20 changes: 18 additions & 2 deletions Data/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1523,13 +1523,29 @@ isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)
--
-- @since 0.11.2.0
isValidUtf8 :: ByteString -> Bool
isValidUtf8 (BS ptr len) = accursedUnutterablePerformIO $ unsafeWithForeignPtr ptr $ \p -> do
i <- cIsValidUtf8 p (fromIntegral len)
isValidUtf8 (BS ptr len) = accursedUnutterablePerformIO $ unsafeWithForeignPtr ptr $ \p -> do
-- Use a safe FFI call for large inputs to avoid GC synchronization pauses
-- in multithreaded contexts.
-- This specific limit was chosen based on results of a simple benchmark, see:
-- https://github.com/haskell/bytestring/issues/451#issuecomment-991879338
-- When changing this function, also consider changing the related function:
-- Data.ByteString.Short.Internal.isValidUtf8
i <- if len < 1000000
then cIsValidUtf8 p (fromIntegral len)
else cIsValidUtf8Safe p (fromIntegral len)
pure $ i /= 0

-- We import bytestring_is_valid_utf8 both unsafe and safe. For small inputs
-- we can use the unsafe version to get a bit more performance, but for large
-- inputs the safe version should be used to avoid GC synchronization pauses
-- in multithreaded contexts.

foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
:: Ptr Word8 -> CSize -> IO CInt

foreign import ccall safe "bytestring_is_valid_utf8" cIsValidUtf8Safe
:: Ptr Word8 -> CSize -> IO CInt

-- | Break a string on a substring, returning a pair of the part of the
-- string prior to the match, and the rest of the string.
--
Expand Down
60 changes: 43 additions & 17 deletions Data/ByteString/Short/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,11 @@ import GHC.Exts
,indexWord8ArrayAsWord64#
#endif
, setByteArray#
)
, sizeofByteArray#
, indexWord8Array#, indexCharArray#
, writeWord8Array#
, unsafeFreezeByteArray#
, touch# )
import GHC.IO
import GHC.ForeignPtr
( ForeignPtr(ForeignPtr)
Expand Down Expand Up @@ -482,6 +486,13 @@ createAndTrim'' l fill =
return (SBS ba#)
{-# INLINE createAndTrim'' #-}

isPinned :: ByteArray# -> Bool
#if MIN_VERSION_base(4,10,0)
isPinned ba# = isTrue# (isByteArrayPinned# ba#)
#else
isPinned _ = False
#endif

------------------------------------------------------------------------
-- Conversion to and from ByteString

Expand All @@ -501,18 +512,15 @@ toShortIO (BS fptr len) = do
BA# ba# <- stToIO (unsafeFreezeByteArray mba)
return (SBS ba#)


-- | /O(n)/. Convert a 'ShortByteString' into a 'ByteString'.
--
fromShort :: ShortByteString -> ByteString
#if MIN_VERSION_base(4,10,0)
fromShort (SBS ba#)
| isTrue# (isByteArrayPinned# ba#) = BS fp len
fromShort (SBS b#)
| isPinned b# = BS fp len
where
addr# = byteArrayContents# ba#
fp = ForeignPtr addr# (PlainPtr (unsafeCoerce# ba#))
len = I# (sizeofByteArray# ba#)
#endif
addr# = byteArrayContents# b#
fp = ForeignPtr addr# (PlainPtr (unsafeCoerce# b#))
len = I# (sizeofByteArray# b#)
fromShort !sbs = unsafeDupablePerformIO (fromShortIO sbs)

fromShortIO :: ShortByteString -> IO ByteString
Expand Down Expand Up @@ -675,7 +683,7 @@ infixr 5 `cons` --same as list (:)
infixl 5 `snoc`

-- | /O(n)/ Append a byte to the end of a 'ShortByteString'
--
--
-- Note: copies the entire byte array
--
-- @since 0.11.3.0
Expand Down Expand Up @@ -718,7 +726,7 @@ last = \sbs -> case null sbs of
--
-- @since 0.11.3.0
tail :: HasCallStack => ShortByteString -> ShortByteString
tail = \sbs ->
tail = \sbs ->
let l = length sbs
nl = l - 1
in case null sbs of
Expand Down Expand Up @@ -814,7 +822,7 @@ reverse = \sbs ->
where
go :: forall s. BA -> MBA s -> Int -> ST s ()
go !ba !mba !l = do
-- this is equivalent to: (q, r) = l `quotRem` 8
-- this is equivalent to: (q, r) = l `quotRem` 8
let q = l `shiftR` 3
r = l .&. 7
i' <- goWord8Chunk 0 r
Expand Down Expand Up @@ -1038,7 +1046,7 @@ drop = \n -> \sbs ->
let len = length sbs
in if | n <= 0 -> sbs
| n >= len -> empty
| otherwise ->
| otherwise ->
let newLen = len - n
in create newLen $ \mba -> copyByteArray (asBA sbs) n mba 0 newLen

Expand Down Expand Up @@ -1139,7 +1147,7 @@ splitAt n = \sbs -> if
| otherwise ->
let slen = length sbs
in if | n >= length sbs -> (sbs, empty)
| otherwise ->
| otherwise ->
let llen = min slen (max 0 n)
rlen = max 0 (slen - max 0 n)
lsbs = create llen $ \mba -> copyByteArray (asBA sbs) 0 mba 0 llen
Expand Down Expand Up @@ -1198,7 +1206,7 @@ stripSuffix :: ShortByteString -> ShortByteString -> Maybe ShortByteString
stripSuffix sbs1 = \sbs2 -> do
let l1 = length sbs1
l2 = length sbs2
if | isSuffixOf sbs1 sbs2 ->
if | isSuffixOf sbs1 sbs2 ->
if null sbs1
then Just sbs2
else Just $! create (l2 - l1) $ \dst -> do
Expand Down Expand Up @@ -1679,7 +1687,7 @@ compareByteArraysOff (BA# ba1#) ba1off (BA# ba2#) ba2off len =
ba2#
ba2off
(fromIntegral len)


foreign import ccall unsafe "static sbs_memcmp_off"
c_memcmp_ByteArray :: ByteArray# -> Int -> ByteArray# -> Int -> CSize -> IO CInt
Expand Down Expand Up @@ -1767,12 +1775,30 @@ useAsCStringLen sbs action =
-- @since 0.11.3.0
isValidUtf8 :: ShortByteString -> Bool
isValidUtf8 sbs@(SBS ba#) = accursedUnutterablePerformIO $ do
i <- cIsValidUtf8 ba# (fromIntegral (length sbs))
let n = length sbs
-- Use a safe FFI call for large inputs to avoid GC synchronization pauses
-- in multithreaded contexts.
-- This specific limit was chosen based on results of a simple benchmark, see:
-- https://github.com/haskell/bytestring/issues/451#issuecomment-991879338
-- When changing this function, also consider changing the related function:
-- Data.ByteString.isValidUtf8
i <- if n < 1000000 || not (isPinned ba#)
then cIsValidUtf8 ba# (fromIntegral n)
else cIsValidUtf8Safe ba# (fromIntegral n)
IO (\s -> (# touch# ba# s, () #))
return $ i /= 0

-- We import bytestring_is_valid_utf8 both unsafe and safe. For small inputs
-- we can use the unsafe version to get a bit more performance, but for large
-- inputs the safe version should be used to avoid GC synchronization pauses
-- in multithreaded contexts.

foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
:: ByteArray# -> CSize -> IO CInt

foreign import ccall safe "bytestring_is_valid_utf8" cIsValidUtf8Safe
:: ByteArray# -> CSize -> IO CInt

-- ---------------------------------------------------------------------
-- Internal utilities

Expand Down

0 comments on commit 38889aa

Please sign in to comment.