Use safe isValidUtf8 for large inputs (#470)

* Use safe isValidUtf8 for large inputs * Use compat function for checking pinnedness * Touch byte array after passing it to safe FFI call
haskell · Feb 24, 2022 · 38889aa · 38889aa
1 parent df1706a
commit 38889aa
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 19 deletions.
diff --git a/Data/ByteString.hs b/Data/ByteString.hs
@@ -1523,13 +1523,29 @@ isInfixOf p s = null p || not (null $ snd $ breakSubstring p s)
 --
 -- @since 0.11.2.0
 isValidUtf8 :: ByteString -> Bool
-isValidUtf8 (BS ptr len) = accursedUnutterablePerformIO $ unsafeWithForeignPtr ptr $ \p -> do 
-  i <- cIsValidUtf8 p (fromIntegral len)
+isValidUtf8 (BS ptr len) = accursedUnutterablePerformIO $ unsafeWithForeignPtr ptr $ \p -> do
+  -- Use a safe FFI call for large inputs to avoid GC synchronization pauses
+  -- in multithreaded contexts.
+  -- This specific limit was chosen based on results of a simple benchmark, see:
+  -- https://github.com/haskell/bytestring/issues/451#issuecomment-991879338
+  -- When changing this function, also consider changing the related function:
+  -- Data.ByteString.Short.Internal.isValidUtf8
+  i <- if len < 1000000
+     then cIsValidUtf8 p (fromIntegral len)
+     else cIsValidUtf8Safe p (fromIntegral len)
   pure $ i /= 0
 
+-- We import bytestring_is_valid_utf8 both unsafe and safe. For small inputs
+-- we can use the unsafe version to get a bit more performance, but for large
+-- inputs the safe version should be used to avoid GC synchronization pauses
+-- in multithreaded contexts.
+
 foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
   :: Ptr Word8 -> CSize -> IO CInt
 
+foreign import ccall safe "bytestring_is_valid_utf8" cIsValidUtf8Safe
+  :: Ptr Word8 -> CSize -> IO CInt
+
 -- | Break a string on a substring, returning a pair of the part of the
 -- string prior to the match, and the rest of the string.
 --

diff --git a/Data/ByteString/Short/Internal.hs b/Data/ByteString/Short/Internal.hs
@@ -233,7 +233,11 @@ import GHC.Exts
   ,indexWord8ArrayAsWord64#
 #endif
   , setByteArray#
-  )
+  , sizeofByteArray#
+  , indexWord8Array#, indexCharArray#
+  , writeWord8Array#
+  , unsafeFreezeByteArray#
+  , touch# )
 import GHC.IO
 import GHC.ForeignPtr
   ( ForeignPtr(ForeignPtr)
@@ -482,6 +486,13 @@ createAndTrim'' l fill =
             return (SBS ba#)
 {-# INLINE createAndTrim'' #-}
 
+isPinned :: ByteArray# -> Bool
+#if MIN_VERSION_base(4,10,0)
+isPinned ba# = isTrue# (isByteArrayPinned# ba#)
+#else
+isPinned _ = False
+#endif
+
 ------------------------------------------------------------------------
 -- Conversion to and from ByteString
 
@@ -501,18 +512,15 @@ toShortIO (BS fptr len) = do
     BA# ba# <- stToIO (unsafeFreezeByteArray mba)
     return (SBS ba#)
 
-
 -- | /O(n)/. Convert a 'ShortByteString' into a 'ByteString'.
 --
 fromShort :: ShortByteString -> ByteString
-#if MIN_VERSION_base(4,10,0)
-fromShort (SBS ba#)
-  | isTrue# (isByteArrayPinned# ba#) = BS fp len
+fromShort (SBS b#)
+  | isPinned b# = BS fp len
   where
-    addr# = byteArrayContents# ba#
-    fp    = ForeignPtr addr# (PlainPtr (unsafeCoerce# ba#))
-    len   = I# (sizeofByteArray# ba#)
-#endif
+    addr# = byteArrayContents# b#
+    fp = ForeignPtr addr# (PlainPtr (unsafeCoerce# b#))
+    len = I# (sizeofByteArray# b#)
 fromShort !sbs = unsafeDupablePerformIO (fromShortIO sbs)
 
 fromShortIO :: ShortByteString -> IO ByteString
@@ -675,7 +683,7 @@ infixr 5 `cons` --same as list (:)
 infixl 5 `snoc`
 
 -- | /O(n)/ Append a byte to the end of a 'ShortByteString'
--- 
+--
 -- Note: copies the entire byte array
 --
 -- @since 0.11.3.0
@@ -718,7 +726,7 @@ last = \sbs -> case null sbs of
 --
 -- @since 0.11.3.0
 tail :: HasCallStack => ShortByteString -> ShortByteString
-tail = \sbs -> 
+tail = \sbs ->
   let l  = length sbs
       nl = l - 1
   in case null sbs of
@@ -814,7 +822,7 @@ reverse = \sbs ->
   where
     go :: forall s. BA -> MBA s -> Int -> ST s ()
     go !ba !mba !l = do
-      -- this is equivalent to: (q, r) = l `quotRem` 8 
+      -- this is equivalent to: (q, r) = l `quotRem` 8
       let q = l `shiftR` 3
           r = l .&. 7
       i' <- goWord8Chunk 0 r
@@ -1038,7 +1046,7 @@ drop = \n -> \sbs ->
   let len = length sbs
   in if | n <= 0    -> sbs
         | n >= len  -> empty
-        | otherwise -> 
+        | otherwise ->
             let newLen = len - n
             in create newLen $ \mba -> copyByteArray (asBA sbs) n mba 0 newLen
 
@@ -1139,7 +1147,7 @@ splitAt n = \sbs -> if
   | otherwise ->
       let slen = length sbs
       in if | n >= length sbs -> (sbs, empty)
-            | otherwise -> 
+            | otherwise ->
                 let llen = min slen (max 0 n)
                     rlen = max 0 (slen - max 0 n)
                     lsbs = create llen $ \mba -> copyByteArray (asBA sbs) 0 mba 0 llen
@@ -1198,7 +1206,7 @@ stripSuffix :: ShortByteString -> ShortByteString -> Maybe ShortByteString
 stripSuffix sbs1 = \sbs2 -> do
   let l1 = length sbs1
       l2 = length sbs2
-  if | isSuffixOf sbs1 sbs2 -> 
+  if | isSuffixOf sbs1 sbs2 ->
          if null sbs1
          then Just sbs2
          else Just $! create (l2 - l1) $ \dst -> do
@@ -1679,7 +1687,7 @@ compareByteArraysOff (BA# ba1#) ba1off (BA# ba2#) ba2off len =
                        ba2#
                        ba2off
                        (fromIntegral len)
-  
+
 
 foreign import ccall unsafe "static sbs_memcmp_off"
   c_memcmp_ByteArray :: ByteArray# -> Int -> ByteArray# -> Int -> CSize -> IO CInt
@@ -1767,12 +1775,30 @@ useAsCStringLen sbs action =
 -- @since 0.11.3.0
 isValidUtf8 :: ShortByteString -> Bool
 isValidUtf8 sbs@(SBS ba#) = accursedUnutterablePerformIO $ do
-  i <- cIsValidUtf8 ba# (fromIntegral (length sbs))
+  let n = length sbs
+  -- Use a safe FFI call for large inputs to avoid GC synchronization pauses
+  -- in multithreaded contexts.
+  -- This specific limit was chosen based on results of a simple benchmark, see:
+  -- https://github.com/haskell/bytestring/issues/451#issuecomment-991879338
+  -- When changing this function, also consider changing the related function:
+  -- Data.ByteString.isValidUtf8
+  i <- if n < 1000000 || not (isPinned ba#)
+     then cIsValidUtf8 ba# (fromIntegral n)
+     else cIsValidUtf8Safe ba# (fromIntegral n)
+  IO (\s -> (# touch# ba# s, () #))
   return $ i /= 0
 
+-- We import bytestring_is_valid_utf8 both unsafe and safe. For small inputs
+-- we can use the unsafe version to get a bit more performance, but for large
+-- inputs the safe version should be used to avoid GC synchronization pauses
+-- in multithreaded contexts.
+
 foreign import ccall unsafe "bytestring_is_valid_utf8" cIsValidUtf8
   :: ByteArray# -> CSize -> IO CInt
 
+foreign import ccall safe "bytestring_is_valid_utf8" cIsValidUtf8Safe
+  :: ByteArray# -> CSize -> IO CInt
+
 -- ---------------------------------------------------------------------
 -- Internal utilities