Merge pull request #39 from snoyberg/fix37
Fix #37 Handle multi-chunk pax extended header data
mpilgrem authored Jan 6, 2024
2 parents d989656 + 6e29e59 commit 76e07d0
Showing 4 changed files with 87 additions and 16 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
@@ -1,5 +1,10 @@
# ChangeLog for tar-conduit

## 0.4.1 - 2024-01-06

* Fix bug in the parsing of pax extended header data when provided in more than
one sequential chunk [#37](https://github.com/snoyberg/tar-conduit/issues/37)

## 0.4.0 - 2023-08-07

* `untarChunks` and `untar` now provide partial support for the pax interchange
28 changes: 24 additions & 4 deletions src/Data/Conduit/Tar.hs
@@ -57,7 +57,12 @@ import qualified Data.ByteString.Short as SS
import qualified Data.ByteString.Unsafe as BU
import Data.Foldable (foldr')
import qualified Data.Map as Map
import Data.Monoid ((<>), mempty)
#if !MIN_VERSION_base(4,11,0)
import Data.Monoid ((<>))
#endif
#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (mempty)
#endif
import Data.Word (Word8)
import Foreign.C.Types (CTime (..))
import Foreign.Storable
@@ -494,9 +499,24 @@ applyPax p h =
updateUname = update "uname" $ \v h' -> h' { headerOwnerName = toShort v }

parsePax :: Monad m => ConduitM TarChunk TarChunk (StateT PaxState m) PaxHeader
parsePax = await >>= \case
Just (ChunkPayload _ b) -> pure $ paxParser b
_ -> pure mempty
parsePax = paxParser <$> combineChunkPayloads mempty
where
combineChunkPayloads bs = await >>= \case
Nothing -> pure bs
Just (ChunkPayload _ b) ->
-- This uses <> (Data.ByteString.Internal.Type.append) rather than, say,
-- [ByteString] (created in reverse order) and
-- Data.ByteString.Internal.Type.concat on the reverse of the list. The
-- reason for doing so is an expectation that, in practice, the pax
-- extended header data will be received as a single chunk in the very
-- great majority of cases and, when it is not, in the great majority of
-- remaining cases it will be received as two sequential chunks. This is
-- optimised for that expectation, rather than the receipt of the data in
-- a large number of small chunks.
combineChunkPayloads $ bs <> b
Just other -> do
leftover other
pure bs

-- | A pax extended header comprises one or more records. If the pax extended
-- header is empty or does not parse, yields an empty 'Pax'.
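-- A minimal standalone sketch (not part of this diff) of the behaviour the new
-- parsePax relies on: a single pax record may straddle payload chunks, so the
-- chunks are appended left to right with <> before parsing. The example chunks
-- below are hypothetical.
import qualified Data.ByteString.Char8 as S8

splitRecordChunks :: [S8.ByteString]
splitRecordChunks = ["17 path=fil", "epath\n"] -- one 17-byte record, split in two

recombinedRecord :: S8.ByteString
recombinedRecord = foldl (<>) mempty splitRecordChunks -- "17 path=filepath\n"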
2 changes: 1 addition & 1 deletion tar-conduit.cabal
@@ -1,5 +1,5 @@
name: tar-conduit
version: 0.4.0
version: 0.4.1
synopsis: Extract and create tar files using conduit for streaming
description: Please see README.md. This is just filler to avoid warnings.
homepage: https://github.com/snoyberg/tar-conduit#readme
68 changes: 57 additions & 11 deletions tests/Spec.hs
@@ -7,9 +7,10 @@ module Main where

import Conduit
import Control.Exception
import Control.Monad (void, when, zipWithM_)
import Control.Monad (forM_, void, when, zipWithM_)
import Data.ByteString as S
import Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy.Char8 as LS8
import Data.ByteString.Short (fromShort)
import Data.Conduit.List
import Data.Conduit.Tar
@@ -295,45 +296,75 @@ collectContent dir =
-- | This test uses untar to process a simple example in the pax interchange
-- format.
paxSpec :: Spec
paxSpec = do
paxSpec = assert payloadSizeCheck $ do
it "untarChunksRaw, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery bigChopSize
.| untarChunksRaw
.| processTarChunks
.| sinkList
pure res `shouldReturn`
[ "/pax-global-header"
, "payload: 19 comment=Example\n"
, "/pax-extended-header"
, "payload: 17 path=filepath\n"
, "payload: " <> bigPayload1
, "payload: " <> bigPayload2
, "original-dir/original-filepath"
, "payload: payload"
, "payload: " <> smallPayload
]
it "untar, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery smallChopSize
.| untar process
.| sinkList
pure res `shouldReturn` [("filepath", "payload")]
pure res `shouldReturn`
[ (veryLongFilepath, smallPayload1)
, (veryLongFilepath, smallPayload2)
]
it "untarRaw, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery smallChopSize
.| untarRaw process
.| sinkList
pure res `shouldReturn` [("original-dir/original-filepath", "payload")]
pure res `shouldReturn`
[ ("original-dir/original-filepath", smallPayload1)
, ("original-dir/original-filepath", smallPayload2)
]
where
process fi = awaitForever $ \bs -> yield (filePath fi, bs)
processTarChunks = awaitForever $ \tc -> yield $ case tc of
ChunkHeader h -> fromShort $
headerFileNamePrefix h <> "/" <> headerFileNameSuffix h
ChunkPayload _ bs -> "payload: " <> bs
ChunkException e -> "exception: " <> S8.pack (show e)
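-- chopEvery n re-chops each incoming ByteString into pieces of at most n
-- bytes, to simulate tar payloads arriving split across several conduit chunks.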
chopEvery :: (MonadIO m) => Int -> ConduitT ByteString ByteString m ()
chopEvery n = chop
where
chop = await >>= \case
Nothing -> pure ()
Just val -> do
forM_ (split (S.unpack val)) $ \chunk -> yield (S.pack chunk)
chop
split = P.takeWhile (not . P.null)
. P.map (P.take n)
. P.iterate (P.drop n)
bigChopSize = 512
(bigPayload1, bigPayload2) = S.splitAt bigChopSize veryLongFilepathRecord
smallChopSize = 4
(smallPayload1, smallPayload2) = S.splitAt smallChopSize smallPayload
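-- payloadSizeCheck (used via assert above) guards the splitting assumptions:
-- each payload is longer than its chop size but at most twice it, so splitAt
-- yields exactly two non-empty pieces.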
moreThanHalf s l = l > s && l <= 2 * s
payloadSizeCheck =
moreThanHalf bigChopSize (S.length veryLongFilepathRecord)
&& moreThanHalf smallChopSize (S.length smallPayload)


-- | Produces a simple example in the pax interchange format. It has a pax
-- \'global\' header block providing a comment, a pax \'next\' header block
-- providing a path (@\"filepath\"@), and a normal file with filepath
-- @\"original-filepath\"@ and payload @\"payload\".
-- providing a very long path (@\"very/very/.../very/long/filepath\"@), and a
-- normal file with filepath @\"original-filepath\"@ and payload @\"payload\"@.
paxExample :: MonadThrow m => ConduitM a ByteString m ()
paxExample = void $
yieldMany
@@ -366,7 +397,9 @@ paxExample = void $
, headerFileNamePrefix = mempty
}
nextOffset :: Header -> FileOffset
nextOffset h = headerPayloadOffset h + ((headerPayloadSize h + 512) `div` 512)
nextOffset h =
let payloadRecordCount = (headerPayloadSize h + 511) `div` 512
in headerPayloadOffset h + 512 + payloadRecordCount * 512
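-- Worked example: for the 663-byte extended payload below,
-- payloadRecordCount = (663 + 511) `div` 512 = 2, so the next entry's payload
-- starts 512 + 2 * 512 = 1536 bytes after this entry's headerPayloadOffset
-- (two 512-byte payload records plus the next 512-byte header block).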
globalHeader = (defaultHeader 0x0)
{ headerFileNameSuffix = "pax-global-header"
, headerPayloadSize = fromIntegral $ S.length globalPayload
@@ -380,11 +413,24 @@
}
-- The path in the pax extended header should override the filepath
-- specified in the ustar header.
extendedPayload = "17 path=filepath\n"
extendedPayload = veryLongFilepathRecord
ustarHeader = (defaultHeader $ nextOffset extendedHeader)
{ headerFileNameSuffix = "original-filepath"
, headerPayloadSize = fromIntegral $ S.length ustarPayload
, headerLinkIndicator = 0x30 -- UTF-8 '0'
, headerFileNamePrefix = "original-dir"
}
ustarPayload = "payload"
ustarPayload = smallPayload

-- | A very/very/.../very/long/filepath with 653 bytes.
veryLongFilepath :: ByteString
veryLongFilepath =
S8.toStrict (LS8.take 640 $ LS8.cycle "very/") <> "long/filepath"

-- | A very, very, ..., very, long filepath record with 663 bytes.
veryLongFilepathRecord :: ByteString
veryLongFilepathRecord = "663 path=" <> veryLongFilepath <> "\n"

-- | A small payload.
smallPayload :: ByteString
smallPayload = "payload"
