Merge pull request #39 from snoyberg/fix37
Fix #37 Handle multi-chunk pax extended header data
mpilgrem authored Jan 6, 2024
2 parents d989656 + 6e29e59 commit 76e07d0
Showing 4 changed files with 87 additions and 16 deletions.
5 changes: 5 additions & 0 deletions ChangeLog.md
@@ -1,5 +1,10 @@
# ChangeLog for tar-conduit

## 0.4.1 - 2024-01-06

* Fix bug in the parsing of pax extended header data when provided in more than
one sequential chunk [#37](https://github.com/snoyberg/tar-conduit/issues/37)

## 0.4.0 - 2023-08-07

* `untarChunks` and `untar` now provide partial support for the pax interchange
28 changes: 24 additions & 4 deletions src/Data/Conduit/Tar.hs
@@ -57,7 +57,12 @@ import qualified Data.ByteString.Short as SS
import qualified Data.ByteString.Unsafe as BU
import Data.Foldable (foldr')
import qualified Data.Map as Map
import Data.Monoid ((<>), mempty)
#if !MIN_VERSION_base(4,11,0)
import Data.Monoid ((<>))
#endif
#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (mempty)
#endif
import Data.Word (Word8)
import Foreign.C.Types (CTime (..))
import Foreign.Storable
@@ -494,9 +499,24 @@ applyPax p h =
updateUname = update "uname" $ \v h' -> h' { headerOwnerName = toShort v }

parsePax :: Monad m => ConduitM TarChunk TarChunk (StateT PaxState m) PaxHeader
parsePax = await >>= \case
Just (ChunkPayload _ b) -> pure $ paxParser b
_ -> pure mempty
parsePax = paxParser <$> combineChunkPayloads mempty
where
combineChunkPayloads bs = await >>= \case
Nothing -> pure bs
Just (ChunkPayload _ b) ->
-- This uses <> (Data.ByteString.Internal.Type.append) rather than, say,
-- [ByteString] (created in reverse order) and
-- Data.ByteString.Internal.Type.concat on the reverse of the list. The
-- reason for doing so is an expectation that, in practice, the pax
-- extended header data will be received as a single chunk in the very
-- great majority of cases and, when it is not, in the great majority of
-- remaining cases it will be received as two sequential chunks. This is
-- optimised for that expectation, rather than the receipt of the data in
-- a large number of small chunks.
combineChunkPayloads $ bs <> b
Just other -> do
leftover other
pure bs

-- | A pax extended header comprises one or more records. If the pax extended
-- header is empty or does not parse, yields an empty 'Pax'.
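-- A minimal standalone sketch (not part of this diff) of the behaviour the new
-- parsePax relies on: a single pax record may straddle payload chunks, so the
-- chunks are appended left to right with <> before parsing. The example chunks
-- below are hypothetical.
import qualified Data.ByteString.Char8 as S8

splitRecordChunks :: [S8.ByteString]
splitRecordChunks = ["17 path=fil", "epath\n"] -- one 17-byte record, split in two

recombinedRecord :: S8.ByteString
recombinedRecord = foldl (<>) mempty splitRecordChunks -- "17 path=filepath\n"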
2 changes: 1 addition & 1 deletion tar-conduit.cabal
@@ -1,5 +1,5 @@
name: tar-conduit
version: 0.4.0
version: 0.4.1
synopsis: Extract and create tar files using conduit for streaming
description: Please see README.md. This is just filler to avoid warnings.
homepage: https://github.com/snoyberg/tar-conduit#readme
68 changes: 57 additions & 11 deletions tests/Spec.hs
@@ -7,9 +7,10 @@ module Main where

import Conduit
import Control.Exception
import Control.Monad (void, when, zipWithM_)
import Control.Monad (forM_, void, when, zipWithM_)
import Data.ByteString as S
import Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy.Char8 as LS8
import Data.ByteString.Short (fromShort)
import Data.Conduit.List
import Data.Conduit.Tar
@@ -295,45 +296,75 @@ collectContent dir =
-- | This test uses untar to process a simple example in the pax interchange
-- format.
paxSpec :: Spec
paxSpec = do
paxSpec = assert payloadSizeCheck $ do
it "untarChunksRaw, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery bigChopSize
.| untarChunksRaw
.| processTarChunks
.| sinkList
pure res `shouldReturn`
[ "/pax-global-header"
, "payload: 19 comment=Example\n"
, "/pax-extended-header"
, "payload: 17 path=filepath\n"
, "payload: " <> bigPayload1
, "payload: " <> bigPayload2
, "original-dir/original-filepath"
, "payload: payload"
, "payload: " <> smallPayload
]
it "untar, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery smallChopSize
.| untar process
.| sinkList
pure res `shouldReturn` [("filepath", "payload")]
pure res `shouldReturn`
[ (veryLongFilepath, smallPayload1)
, (veryLongFilepath, smallPayload2)
]
it "untarRaw, pax interchange format" $ do
res <- runConduitRes $
paxExample
.| chopEvery smallChopSize
.| untarRaw process
.| sinkList
pure res `shouldReturn` [("original-dir/original-filepath", "payload")]
pure res `shouldReturn`
[ ("original-dir/original-filepath", smallPayload1)
, ("original-dir/original-filepath", smallPayload2)
]
where
process fi = awaitForever $ \bs -> yield (filePath fi, bs)
processTarChunks = awaitForever $ \tc -> yield $ case tc of
ChunkHeader h -> fromShort $
headerFileNamePrefix h <> "/" <> headerFileNameSuffix h
ChunkPayload _ bs -> "payload: " <> bs
ChunkException e -> "exception: " <> S8.pack (show e)
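-- chopEvery n re-chops each incoming ByteString into pieces of at most n
-- bytes, to simulate tar payloads arriving split across several conduit chunks.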
chopEvery :: (MonadIO m) => Int -> ConduitT ByteString ByteString m ()
chopEvery n = chop
where
chop = await >>= \case
Nothing -> pure ()
Just val -> do
forM_ (split (S.unpack val)) $ \chunk -> yield (S.pack chunk)
chop
split = P.takeWhile (not . P.null)
. P.map (P.take n)
. P.iterate (P.drop n)
bigChopSize = 512
(bigPayload1, bigPayload2) = S.splitAt bigChopSize veryLongFilepathRecord
smallChopSize = 4
(smallPayload1, smallPayload2) = S.splitAt smallChopSize smallPayload
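-- payloadSizeCheck (used via assert above) guards the splitting assumptions:
-- each payload is longer than its chop size but at most twice it, so splitAt
-- yields exactly two non-empty pieces.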
moreThanHalf s l = l > s && l <= 2 * s
payloadSizeCheck =
moreThanHalf bigChopSize (S.length veryLongFilepathRecord)
&& moreThanHalf smallChopSize (S.length smallPayload)


-- | Produces a simple example in the pax interchange format. It has a pax
-- \'global\' header block providing a comment, a pax \'next\' header block
-- providing a path (@\"filepath\"@), and a normal file with filepath
-- @\"original-filepath\"@ and payload @\"payload\".
-- providing a very long path (@\"very/very/.../very/long/filepath\"@), and a
-- normal file with filepath @\"original-filepath\"@ and payload @\"payload\"@.
paxExample :: MonadThrow m => ConduitM a ByteString m ()
paxExample = void $
yieldMany
@@ -366,7 +397,9 @@ paxExample = void $
, headerFileNamePrefix = mempty
}
nextOffset :: Header -> FileOffset
nextOffset h = headerPayloadOffset h + ((headerPayloadSize h + 512) `div` 512)
nextOffset h =
let payloadRecordCount = (headerPayloadSize h + 511) `div` 512
in headerPayloadOffset h + 512 + payloadRecordCount * 512
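-- Worked example: for the 663-byte extended payload below,
-- payloadRecordCount = (663 + 511) `div` 512 = 2, so the next entry's payload
-- starts 512 + 2 * 512 = 1536 bytes after this entry's headerPayloadOffset
-- (two 512-byte payload records plus the next 512-byte header block).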
globalHeader = (defaultHeader 0x0)
{ headerFileNameSuffix = "pax-global-header"
, headerPayloadSize = fromIntegral $ S.length globalPayload
@@ -380,11 +413,24 @@
}
-- The path in the pax extended header should override the filepath
-- specified in the ustar header.
extendedPayload = "17 path=filepath\n"
extendedPayload = veryLongFilepathRecord
ustarHeader = (defaultHeader $ nextOffset extendedHeader)
{ headerFileNameSuffix = "original-filepath"
, headerPayloadSize = fromIntegral $ S.length ustarPayload
, headerLinkIndicator = 0x30 -- UTF-8 '0'
, headerFileNamePrefix = "original-dir"
}
ustarPayload = "payload"
ustarPayload = smallPayload

-- | A very/very/.../very/long/filepath with 653 bytes.
veryLongFilepath :: ByteString
veryLongFilepath =
S8.toStrict (LS8.take 640 $ LS8.cycle "very/") <> "long/filepath"

-- | A very, very, ..., very, long filepath record with 663 bytes.
veryLongFilepathRecord :: ByteString
veryLongFilepathRecord = "663 path=" <> veryLongFilepath <> "\n"

-- | A small payload.
smallPayload :: ByteString
smallPayload = "payload"
