From c4b28fb638b993971e2ee74d3cb29e0f4dc5735d Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Sat, 17 Apr 2021 12:44:49 -0400 Subject: [PATCH] support for hardlinks: extract, tree_hash, rewrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for hardlinks, including: - extracting them by copying the linked file (no hardlink created) - tree hashing them as they are extracted - rewriting by duplicating the linked file This only supports hardlinks whose target is a plain file that has already been seen in the tarball that is being processed. You cannot have a hardlink that appears before the file that is linked. If the target of a hardlink is overwritten later, the link copies the current version of the file at the time of extraction. Tree hashing and rewrite are both consistent with this behavior. It is not supported to extract hardlinks where the link involves symlinks, even if the link refers to a path that would be a file — the target must be a plain file. Close #101. --- README.md | 31 ++++++++++++++---- src/Tar.jl | 21 ++++++++++++ src/create.jl | 15 +++++++-- src/extract.jl | 87 +++++++++++++++++++++++++++++++++++++------------- src/header.jl | 12 +++++-- test/setup.jl | 12 ++++++- 6 files changed, 143 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 9a6ac8e..e62cba5 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,13 @@ is encountered while extracting `tarball` and the entry is only extracted if the an archive, to skip entries that cause `extract` to throw an error, or to record what is extracted during the extraction process. +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). + If the `skeleton` keyword is passed then a "skeleton" of the extracted tarball is written to the file or IO handle given. This skeleton file can be used to recreate an identical tarball by passing the `skeleton` keyword to the `create` @@ -156,6 +163,13 @@ is encountered while extracting `old_tarball` and the entry is skipped unless an archive, to skip entries that would cause `extract` to throw an error, or to record what content is encountered during the rewrite process. +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). + ### Tar.tree_hash ```jl @@ -187,6 +201,13 @@ is encountered while processing `tarball` and an entry is only hashed if archive, to skip entries that cause `extract` to throw an error, or to record what is extracted during the hashing process. +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). + Currently supported values for `algorithm` are `git-sha1` (the default) and `git-sha256`, which uses the same basic algorithm as `git-sha1` but replaces the SHA1 hash function with SHA2-256, the hash function that git will transition to @@ -362,18 +383,16 @@ supports only the following file types: * plain files * directories * symlinks +* hardlinks (extracted as copies) The `Tar` package does not support other file types that the TAR format can -represent, including: hard links, character devices, block devices, and FIFOs. -If you attempt to create or extract an archive that contains any of these kinds -of entries, `Tar` will raise an error. You can, however, list the contents of a +represent, including: character devices, block devices, and FIFOs. If you +attempt to create or extract an archive that contains any of these kinds of +entries, `Tar` will raise an error. You can, however, list the contents of a tarball containing other kinds of entries by passing the `strict=false` flag to the `list` function; without this option, `list` raises the same error as `extract` would. -In the future, optional support may be added for using hard links within -archives to avoid duplicating identical files. - ### Time Stamps Also in accordance with its design goal as a data transfer tool, the `Tar` diff --git a/src/Tar.jl b/src/Tar.jl index b265f38..5b4f5a9 100644 --- a/src/Tar.jl +++ b/src/Tar.jl @@ -175,6 +175,13 @@ is encountered while extracting `tarball` and the entry is only extracted if the an archive, to skip entries that cause `extract` to throw an error, or to record what is extracted during the extraction process. +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). + If the `skeleton` keyword is passed then a "skeleton" of the extracted tarball is written to the file or IO handle given. This skeleton file can be used to recreate an identical tarball by passing the `skeleton` keyword to the `create` @@ -251,6 +258,13 @@ is encountered while extracting `old_tarball` and the entry is skipped unless `predicate(hdr)` is true. This can be used to selectively rewrite only parts of an archive, to skip entries that would cause `extract` to throw an error, or to record what content is encountered during the rewrite process. + +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). """ function rewrite( predicate::Function, @@ -301,6 +315,13 @@ is encountered while processing `tarball` and an entry is only hashed if archive, to skip entries that cause `extract` to throw an error, or to record what is extracted during the hashing process. +Before it is passed to the predicate function, the `Header` object is somewhat +modified from the raw header in the tarball: the `path` field is normalized to +remove `.` entries and replace multiple consecutive slashes with a single slash. +If the entry has type `:hardlink`, the link target path is normalized the same +way so that it will match the path of the target entry; the size field is set to +the size of the target path (which must be an already-seen file). + Currently supported values for `algorithm` are `git-sha1` (the default) and `git-sha256`, which uses the same basic algorithm as `git-sha1` but replaces the SHA1 hash function with SHA2-256, the hash function that git will transition to diff --git a/src/create.jl b/src/create.jl index 3fb7013..90ba4db 100644 --- a/src/create.jl +++ b/src/create.jl @@ -54,10 +54,19 @@ function rewrite_tarball( end node = node′ end - if !(hdr.type == :directory && get(node, name, nothing) isa Dict) - node[name] = (hdr, position(old_tar)) + if hdr.type == :hardlink + node′ = tree + for part in split(hdr.link, '/') + node′ = node′[part] + end + hdr′ = Header(node′[1], path=hdr.path, mode=hdr.mode) + node[name] = (hdr′, node′[2]) + else + if !(hdr.type == :directory && get(node, name, nothing) isa Dict) + node[name] = (hdr, position(old_tar)) + end + skip_data(old_tar, hdr.size) end - skip_data(old_tar, hdr.size) end write_tarball(new_tar, tree, buf=buf) do node, tar_path if node isa Dict diff --git a/src/extract.jl b/src/extract.jl index 56e0a82..bae4f65 100644 --- a/src/extract.jl +++ b/src/extract.jl @@ -79,8 +79,16 @@ function extract_tarball( mkdir(sys_path) elseif hdr.type == :symlink copy_symlinks || symlink(hdr.link, sys_path) + elseif hdr.type == :hardlink + src_path = joinpath(root, hdr.link) + cp(src_path, sys_path) elseif hdr.type == :file read_data(tar, sys_path, size=hdr.size, buf=buf) + else # should already be caught by check_header + error("unsupported tarball entry type: $(hdr.type)") + end + # apply tarball permissions + if hdr.type in (:file, :hardlink) exec = 0o100 & hdr.mode != 0 tar_mode = exec ? 0o755 : 0o644 sys_mode = filemode(sys_path) @@ -93,21 +101,19 @@ function extract_tarball( # we don't have a way to do that afaik end chmod(sys_path, tar_mode & sys_mode) - else # should already be caught by check_header - error("unsupported tarball entry type: $(hdr.type)") end end copy_symlinks || return # resolve the internal targets of symlinks for (path, what) in paths - what isa AbstractString || continue + what isa String || continue target = link_target(paths, path, what) paths[path] = something(target, :symlink) end # follow chains of symlinks - follow(seen::Vector, what::Symbol) = + follow(seen::Vector, what::Any) = what == :symlink ? what : seen[end] follow(seen::Vector, what::String) = what in seen ? :symlink : follow(push!(seen, what), paths[what]) @@ -159,7 +165,7 @@ end # resolve symlink target or nothing if not valid function link_target( - paths::Dict{String,Union{String,Symbol}}, + paths::Dict{String}, path::AbstractString, link::AbstractString, ) @@ -220,12 +226,18 @@ function git_tree_hash( node[name] = Dict{String,Any}() end return - end - if hdr.type == :symlink + elseif hdr.type == :symlink mode = "120000" hash = git_object_hash("blob", HashType) do io write(io, hdr.link) end + elseif hdr.type == :hardlink + mode = iszero(hdr.mode & 0o100) ? "100644" : "100755" + node′ = tree + for part in split(hdr.link, '/') + node′ = node′[part] + end + hash = node′[2] # hash of linked file elseif hdr.type == :file mode = iszero(hdr.mode & 0o100) ? "100644" : "100755" hash = git_file_hash(tar, hdr.size, HashType, buf=buf) @@ -332,31 +344,62 @@ function read_tarball( ) write_skeleton_header(skeleton, buf=buf) # symbols for path types except symlinks store the link - paths = Dict{String,Union{Symbol,String}}() + paths = Dict{String,Any}() globals = Dict{String,String}() while !eof(tar) hdr = read_header(tar, globals=globals, buf=buf, tee=skeleton) hdr === nothing && break - # check if we should extract or skip - if !predicate(hdr) - skip_data(tar, hdr.size) - continue - end - check_header(hdr) + err = nothing # normalize path and check for symlink attacks path = "" for part in split(hdr.path, '/') + # check_header checks for ".." later (isempty(part) || part == ".") && continue - # check_header doesn't allow ".." in path - get(paths, path, nothing) isa String && error(""" - Refusing to extract path with symlink prefix, possible attack - * path to extract: $(repr(hdr.path)) - * symlink prefix: $(repr(path)) - """) - isempty(path) || (paths[path] = :directory) + if err === nothing && get(paths, path, nothing) isa String + err = """ + Tarball contains path with symlink prefix: + - path = $(repr(hdr.path)) + - prefix = $(repr(path)) + Refusing to extract — possible attack! + """ + end path = isempty(path) ? part : "$path/$part" end - paths[path] = hdr.type == :symlink ? hdr.link : hdr.type + hdr′ = Header(hdr, path=path) + # check that hardlinks refer to already-seen files + if err === nothing && hdr.type == :hardlink + parts = filter!(split(hdr.link, '/')) do part + # check_header checks for ".." later + !isempty(part) && part != "." + end + link = join(parts, '/') + hdr = Header(hdr, link=link) + hdr′ = Header(hdr′, link=link) + what = get(paths, link, Symbol("non-existent")) + if what isa Integer # plain file + hdr′ = Header(hdr′, size=what) + else + err = """ + Tarball contains hardlink with $what target: + - path = $(repr(hdr.path)) + - target = $(repr(hdr.link)) + Refusing to extract — possible attack! + """ + end + end + # check if we should extract or skip + if !predicate(hdr′) # pass normalized header + skip_data(tar, hdr.size) + continue + end + check_header(hdr) + err === nothing || error(err) + # record info about path + paths[path] = + hdr.type == :symlink ? hdr.link : + hdr.type == :file ? hdr.size : + hdr.type + # apply callback, checking that it consumes IO correctly before = applicable(position, tar) ? position(tar) : 0 callback(hdr, split(path, '/', keepempty=false)) applicable(position, tar) || continue diff --git a/src/header.jl b/src/header.jl index 49db676..d1e3601 100644 --- a/src/header.jl +++ b/src/header.jl @@ -99,12 +99,18 @@ function check_header(hdr::Header) err("path is absolute") occursin(r"(^|/)\.\.(/|$)", hdr.path) && err("path contains '..' component") - hdr.type in (:file, :symlink, :directory) || + hdr.type in (:file, :hardlink, :symlink, :directory) || err("unsupported entry type") hdr.type ∉ (:hardlink, :symlink) && !isempty(hdr.link) && err("non-link with link path") - hdr.type == :symlink && hdr.size != 0 && - err("symlink with non-zero size") + hdr.type ∈ (:hardlink, :symlink) && isempty(hdr.link) && + err("$(hdr.type) with empty link path") + hdr.type ∈ (:hardlink, :symlink) && hdr.size != 0 && + err("$(hdr.type) with non-zero size") + hdr.type == :hardlink && hdr.link[1] == '/' && + err("hardlink with absolute link path") + hdr.type == :hardlink && occursin(r"(^|/)\.\.(/|$)", hdr.link) && + err("hardlink contains '..' component") hdr.type == :directory && hdr.size != 0 && err("directory with non-zero size") hdr.type != :directory && endswith(hdr.path, "/") && diff --git a/test/setup.jl b/test/setup.jl index b73c261..c09bca3 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -61,8 +61,9 @@ function make_test_tarball(tar_create::Function = Tar.create) dir′ = joinpath(dir, "s"^b) mkpath(dir′) push!(paths, dir′) + path = paths[i += 1] link = joinpath(dir, "l"^b) - target = relpath(paths[i += 1], link) + target = relpath(path, link) symlink(target, link) push!(paths, link) broken = joinpath(dir, "b"^b) @@ -70,6 +71,15 @@ function make_test_tarball(tar_create::Function = Tar.create) symlink(chop(target), broken) push!(paths, broken) end + isfile(path) || continue + hard = joinpath(dir, "h"^b) + mode = isodd(i) ? 0o755 : 0o644 + if Sys.which("ln") !== nothing + run(`ln $path $hard`) + else + cp(path, hard) + end + chmod(hard, mode) end end end