advanced chunking

moonshadow565 · Oct 13, 2022 · 7a4a2e5 · 7a4a2e5
1 parent 5fb623d
commit 7a4a2e5
Show file tree

Hide file tree

Showing 12 changed files with 554 additions and 60 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,6 +8,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 add_subdirectory(dep)
 
 add_library(rlib STATIC
+    lib/rlib/ar.hpp
+    lib/rlib/ar.cpp
+    lib/rlib/ar/bnk.hpp
+    lib/rlib/ar/bnk.cpp
+    lib/rlib/ar/wad.hpp
+    lib/rlib/ar/wad.cpp
+    lib/rlib/ar/wpk.hpp
+    lib/rlib/ar/wpk.cpp
     lib/rlib/common.hpp
     lib/rlib/common.cpp
     lib/rlib/iofile.cpp

diff --git a/lib/rlib/ar.cpp b/lib/rlib/ar.cpp
@@ -0,0 +1,81 @@
+#include "ar.hpp"
+
+#include "ar/bnk.hpp"
+#include "ar/wad.hpp"
+#include "ar/wpk.hpp"
+
+using namespace rlib;
+using namespace rlib::ar;
+
+// Entry point: splits the whole of `io`, reporting every resulting piece through `cb`.
+auto ArSplit::operator()(IO const& io, offset_cb cb) const -> void {
+    // The root entry spans the full input; depth 0 lets process() probe it for archives.
+    auto const whole = Entry{.offset = 0, .size = io.size()};
+    process(io, cb, 0, whole);
+}
+
+// Parses `top_entry` as an archive of type T and walks it front to back.
+// Every archive entry, and every run of bytes not covered by an entry, is
+// forwarded to process(). A failed T::read is reported through rlib_error.
+template <typename T>
+auto ArSplit::process_ar(IO const& io, offset_cb cb, Entry top_entry) const -> void {
+    T parsed = {};
+    if (auto error = parsed.read(io, top_entry.offset, top_entry.size)) rlib_error(error);
+
+    // Order by ascending offset; among equal offsets the larger entry comes first.
+    std::sort(parsed.entries.begin(), parsed.entries.end(), [](auto const& a, auto const& b) {
+        return a.offset != b.offset ? a.offset < b.offset : a.size > b.size;
+    });
+
+    // Bytes that belong to no entry are never probed for archives (depth -1)
+    // and inherit the compression flag of the enclosing entry.
+    auto const emit_gap = [&](std::size_t from, std::size_t length) {
+        process(io, cb, -1, {.offset = from, .size = length, .compressed = top_entry.compressed});
+    };
+
+    auto const end = top_entry.offset + top_entry.size;
+    auto pos = top_entry.offset;
+    for (auto const& item : parsed.entries) {
+        // Empty entries carry no data; entries starting before `pos` are
+        // duplicates of, or overlap with, something already emitted.
+        if (item.size == 0 || item.offset < pos) continue;
+
+        // Data skipped between the previous entry and this one.
+        if (item.offset != pos) emit_gap(pos, item.offset - pos);
+
+        // Only uncompressed entries of nestable archives are probed again.
+        int const depth = (T::can_nest && !no_nest && !item.compressed) ? 1 : -1;
+        process(io,
+                cb,
+                depth,
+                {
+                    .offset = item.offset,
+                    .size = item.size,
+                    .compressed = item.compressed,
+                });
+
+        pos = item.offset + item.size;
+    }
+
+    // Trailing data after the last entry.
+    if (pos != end) emit_gap(pos, end - pos);
+}
+
+// Emits `top_entry` through `cb`, either by descending into a recognised
+// archive or by cutting the range into pieces of at most `chunk_size` bytes.
+//
+//   depth < 0  : never probe for archives, chunk directly.
+//   depth >= 0 : entries of at least 64 bytes have their first 8 bytes checked
+//                against the BNK, WAD and WPK magics (WAD only while depth < 1);
+//                no_bnk / no_wad / no_wpk disable the respective format.
+//
+// Every emitted piece carries the `compressed` flag of `top_entry`.
+auto ArSplit::process(IO const& io, offset_cb cb, int depth, Entry top_entry) const -> void {
+    if (depth >= 0 && top_entry.size >= 64) {
+        char buffer[8] = {};
+        rlib_assert(io.read(top_entry.offset, buffer));
+        if (!no_bnk && BNK::check_magic(buffer)) {
+            return process_ar<BNK>(io, cb, top_entry);
+        }
+        if (!no_wad && depth < 1 && WAD::check_magic(buffer)) {
+            return process_ar<WAD>(io, cb, top_entry);
+        }
+        if (!no_wpk && WPK::check_magic(buffer)) {
+            return process_ar<WPK>(io, cb, top_entry);
+        }
+    }
+
+    // A chunk_size of zero would make std::min() yield 0 on every iteration, so
+    // the loop would never advance and `cb` would be called forever. Treat zero
+    // as "do not split" and emit the range as a single piece instead.
+    auto const step = chunk_size != 0 ? chunk_size : top_entry.size;
+    for (auto i = top_entry.offset, remain = top_entry.size; remain;) {
+        auto size = std::min(step, remain);
+        cb({.offset = i, .size = size, .compressed = top_entry.compressed});
+        i += size;
+        remain -= size;
+    }
+}
diff --git a/lib/rlib/ar.hpp b/lib/rlib/ar.hpp
@@ -0,0 +1,28 @@
+#pragma once
+#include <rlib/common.hpp>
+#include <rlib/iofile.hpp>
+
+namespace rlib {
+    // Splits an IO source into consecutive byte ranges, optionally following the
+    // structure of BNK / WAD / WPK archives found inside it.
+    struct ArSplit {
+        // One byte range reported to the callback.
+        struct Entry {
+            // Position of the range's first byte within the IO source.
+            std::size_t offset;
+            // Length of the range in bytes.
+            std::size_t size;
+            // Flag taken from the archive entry this range belongs to
+            // (left unset for the top-level range).
+            bool compressed;
+        };
+        // Callback invoked once per emitted range.
+        using offset_cb = function_ref<void(Entry)>;
+
+        // Upper bound, in bytes, for a single emitted range.
+        std::size_t chunk_size;
+        // Skip BNK detection.
+        bool no_bnk;
+        // Skip WAD detection.
+        bool no_wad;
+        // Skip WPK detection.
+        bool no_wpk;
+        // Do not probe archive entries for further archives.
+        bool no_nest;
+
+        // Splits the whole of `io` and calls `cb` for every resulting range.
+        auto operator()(IO const& io, offset_cb cb) const -> void;
+
+    private:
+        // Emits `top_entry`, probing it for archives only when depth >= 0.
+        auto process(IO const& io, offset_cb cb, int depth, Entry top_entry) const -> void;
+
+        // Parses `top_entry` as archive type T and emits its entries and gaps.
+        template <typename T>
+        auto process_ar(IO const& io, offset_cb cb, Entry top_entry) const -> void;
+    };
+}
diff --git a/lib/rlib/ar/bnk.cpp b/lib/rlib/ar/bnk.cpp
@@ -0,0 +1,89 @@
+#include "bnk.hpp"
+
+#include <map>
+
+// Validation helper for BNK::read: when the condition is false, returns from
+// the enclosing function with a string literal naming the failed condition.
+#define ar_assert(...)                                          \
+    do {                                                        \
+        if (!(__VA_ARGS__)) return " BNK::read: " #__VA_ARGS__; \
+    } while (false)
+
+using namespace rlib;
+using namespace rlib::ar;
+
+// Section header as read straight from the file by BNK::read.
+struct BNK::Entry::Raw {
+    // Four-character section tag (e.g. BKHD, DIDX, DATA).
+    std::array<char, 4> type;
+    // Number of payload bytes that follow this header.
+    std::uint32_t size;
+};
+
+// One record of the DIDX section, read in bulk by BNK::read.
+struct BNK::Entry::DIDX {
+    // Record identifier; not used by the visible code.
+    std::uint32_t id;
+    // Start of the referenced data, relative to the DATA section payload.
+    std::uint32_t offset;
+    // Length of the referenced data in bytes.
+    std::uint32_t size;
+};
+
+// Reports whether `data` begins with the four-byte "BKHD" tag.
+auto BNK::check_magic(std::span<char const> data) noexcept -> bool {
+    static constexpr char magic[4] = {'B', 'K', 'H', 'D'};
+    if (data.size() < sizeof(magic)) return false;
+    return std::memcmp(data.data(), magic, sizeof(magic)) == 0;
+}
+
+// Parses the BNK archive occupying [offset, offset + size) of `io` and fills
+// `entries` with absolute byte ranges.
+//
+// The archive is a sequence of sections, each an 8-byte header (tag + payload
+// size) followed by its payload. When both a DIDX and a DATA section exist,
+// every DIDX record becomes its own entry (flagged compressed) pointing into
+// the DATA payload, and the two sections themselves shrink to their headers.
+// Every section is then added as an entry covering header + payload.
+//
+// Returns nullptr on success, otherwise a static string naming the check
+// that failed.
+auto BNK::read(IO const& io, std::size_t offset, std::size_t size) -> char const* {
+    using TYPE = std::array<char, 4>;
+    static constexpr auto BKHD = TYPE{'B', 'K', 'H', 'D'};
+    static constexpr auto DIDX = TYPE{'D', 'I', 'D', 'X'};
+    static constexpr auto DATA = TYPE{'D', 'A', 'T', 'A'};
+
+    auto magic = TYPE{};
+    ar_assert(size >= 8);
+    ar_assert(io.read(offset, magic));
+    ar_assert(magic == BKHD);
+
+    // `i` is an absolute position, so it has to be bounded by the absolute end
+    // of the archive rather than by the relative `size`; otherwise an archive
+    // that does not start at offset 0 is rejected or under-checked.
+    auto const end = offset + size;
+    auto sections = std::map<TYPE, Entry>{};
+    for (std::size_t i = offset; i != end;) {
+        // Invariant: offset <= i <= end, so `end - i` cannot wrap.
+        Entry::Raw raw = {};
+        ar_assert(end - i >= sizeof(raw));
+        ar_assert(io.read(i, {(char*)&raw, sizeof(raw)}));
+
+        i += sizeof(Entry::Raw);
+        ar_assert(end - i >= raw.size);
+
+        // A repeated tag replaces the earlier section of the same type.
+        sections[raw.type] = Entry{.offset = i, .size = raw.size};
+
+        i += raw.size;
+    }
+
+    entries.clear();
+    entries.reserve(sections.size());
+
+    auto i_didx = sections.find(DIDX);
+    auto i_data = sections.find(DATA);
+    if (i_didx != sections.end() && i_data != sections.end()) {
+        auto const didx_base = i_didx->second;
+        auto const data_base = i_data->second;
+
+        ar_assert(didx_base.size % sizeof(Entry::DIDX) == 0);
+        auto didx_list = std::vector<Entry::DIDX>(didx_base.size / sizeof(Entry::DIDX));
+        ar_assert(io.read(didx_base.offset, {(char*)didx_list.data(), didx_base.size}));
+
+        entries.reserve(sections.size() + didx_list.size());
+        for (auto const& didx : didx_list) {
+            // DIDX offsets are relative to the DATA payload.
+            ar_assert(data_base.size >= didx.offset);
+            ar_assert(data_base.size - didx.offset >= didx.size);
+            entries.push_back(Entry{
+                .offset = data_base.offset + didx.offset,
+                .size = didx.size,
+                .compressed = true,
+            });
+        }
+
+        // The payloads are now covered by the entries above; keep only the
+        // 8-byte section headers (re-added by the loop below).
+        i_didx->second.size = 0;
+        i_data->second.size = 0;
+    }
+
+    for (auto const& [key, section] : sections) {
+        // Widen each section so that it includes its own header.
+        auto entry = section;
+        entry.offset -= sizeof(Entry::Raw);
+        entry.size += sizeof(Entry::Raw);
+        entries.push_back(entry);
+    }
+
+    return nullptr;
+}
diff --git a/lib/rlib/ar/bnk.hpp b/lib/rlib/ar/bnk.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include <rlib/common.hpp>
+#include <rlib/iofile.hpp>
+
+namespace rlib::ar {
+    // Reader for the section layout of BNK archives.
+    struct BNK {
+        // Entries of this archive type are never probed for nested archives.
+        static constexpr bool can_nest = false;
+
+        // Byte range of one section or one DIDX-referenced item.
+        struct Entry {
+            // File layout helpers, defined in bnk.cpp.
+            struct Raw;
+            struct DIDX;
+            // Absolute position of the range within the IO source.
+            std::size_t offset;
+            // Length of the range in bytes.
+            std::size_t size;
+            // Set for items referenced through the DIDX section.
+            bool compressed;
+        };
+        // Filled by read(); not sorted.
+        std::vector<Entry> entries;
+
+        // Reports whether `data` starts with the "BKHD" tag.
+        static auto check_magic(std::span<char const> data) noexcept -> bool;
+        // Parses the archive at [offset, offset + size) of `io`.
+        // Returns nullptr on success, otherwise an error message.
+        auto read(IO const& io, std::size_t offset, std::size_t size) -> char const*;
+    };
+}
diff --git a/lib/rlib/ar/wad.cpp b/lib/rlib/ar/wad.cpp
@@ -0,0 +1,126 @@
+#include "wad.hpp"
+
+using namespace rlib;
+using namespace rlib::ar;
+
+// Validation helper for WAD::read: when the condition is false, returns from
+// the enclosing function with a string literal naming the failed condition.
+#define ar_assert(...)                                          \
+    do {                                                        \
+        if (!(__VA_ARGS__)) return " WAD::read: " #__VA_ARGS__; \
+    } while (false)
+
+// Version-independent summary of a WAD header, filled in by WAD::read from
+// whichever versioned layout the file uses.
+struct WAD::Header {
+    // File layouts, one per supported major version.
+    struct Base;
+    struct V1;
+    struct V2;
+    struct V3;
+
+    // Size in bytes of one table-of-contents record.
+    std::size_t entry_size;
+    // Number of table-of-contents records.
+    std::size_t entry_count;
+    // Position of the table of contents.
+    std::size_t toc_start;
+    // entry_size * entry_count.
+    std::size_t toc_size;
+};
+
+// Leading bytes shared by every header version.
+struct WAD::Header::Base {
+    // Expected to be 'R', 'W'.
+    std::array<char, 2> magic;
+    // version[0] selects the header layout in WAD::read.
+    std::uint8_t version[2];
+};
+
+// Header layout used when version[0] is 0 or 1.
+struct WAD::Header::V1 : Base {
+    // Position of the table of contents, relative to the archive start.
+    std::uint16_t toc_start;
+    // Size in bytes of one table-of-contents record.
+    std::uint16_t entry_size;
+    // Number of table-of-contents records.
+    std::uint32_t entry_count;
+};
+
+// Header layout used when version[0] is 2.
+struct WAD::Header::V2 : Base {
+    // Not interpreted by the visible code.
+    std::array<std::uint8_t, 84> signature;
+    // Not interpreted by the visible code.
+    std::array<std::uint8_t, 8> checksum;
+    // Position of the table of contents, relative to the archive start.
+    std::uint16_t toc_start;
+    // Size in bytes of one table-of-contents record.
+    std::uint16_t entry_size;
+    // Number of table-of-contents records.
+    std::uint32_t entry_count;
+};
+
+// Header layout used when version[0] is 3.
+struct WAD::Header::V3 : Base {
+    // Not interpreted by the visible code.
+    std::uint8_t signature[256];
+    // Not interpreted by the visible code.
+    std::array<std::uint8_t, 8> checksum;
+    // Unlike V1/V2 these two values are fixed constants, not stored fields.
+    static constexpr std::uint16_t toc_start = 272;
+    static constexpr std::uint16_t entry_size = 32;
+    // Number of table-of-contents records.
+    std::uint32_t entry_count;
+};
+
+// One table-of-contents record as read from the file by WAD::read.
+struct WAD::Entry::Raw {
+    // Not interpreted by the visible code (presumably a path hash — TODO confirm).
+    std::uint64_t path;
+    // Start of the entry data, relative to the archive start.
+    std::uint32_t offset;
+    // Number of bytes the entry occupies in the archive.
+    std::uint32_t size_compressed;
+    // Not interpreted by the visible code.
+    std::uint32_t size_uncompressed;
+    // Non-zero marks the entry as compressed.
+    std::uint8_t type : 4;
+    // Not interpreted by the visible code.
+    std::uint8_t subchunks : 4;
+    std::uint8_t pad[3];
+};
+
+// Reports whether `data` looks like a WAD header: at least four bytes, the
+// tag "RW", and a third byte (major version) no greater than 10.
+auto WAD::check_magic(std::span<char const> data) noexcept -> bool {
+    if (data.size() < 4) return false;
+    if (data[0] != 'R' || data[1] != 'W') return false;
+    return static_cast<uint8_t>(data[2]) <= 10;
+}
+
+// Parses the WAD archive occupying [offset, offset + size) of `io` and fills
+// `entries` with absolute byte ranges: first the table of contents itself,
+// then one entry per table-of-contents record.
+//
+// Header versions 0-3 are understood. Every record must lie after the table
+// of contents and inside the archive.
+//
+// Returns nullptr on success, otherwise a static string describing the
+// failed check.
+auto WAD::read(IO const& io, std::size_t offset, std::size_t size) -> char const* {
+    static constexpr auto MAGIC = std::array{'R', 'W'};
+
+    Header::Base header_base = {};
+    ar_assert(size >= sizeof(header_base));
+    ar_assert(io.read(offset, {(char*)&header_base, sizeof(header_base)}));
+    ar_assert(header_base.magic == MAGIC);
+
+    Header header = {};
+    switch (header_base.version[0]) {
+// The size check must use the versioned on-disk layout (v_header), not the
+// in-memory summary (header), or V2/V3 headers are read past a short archive.
+#define read_header($V)                                                   \
+    do {                                                                  \
+        Header::V##$V v_header = {};                                      \
+        ar_assert(size >= sizeof(v_header));                              \
+        ar_assert(io.read(offset, {(char*)&v_header, sizeof(v_header)})); \
+        header.entry_size = v_header.entry_size;                          \
+        header.entry_count = v_header.entry_count;                        \
+        header.toc_start = v_header.toc_start;                            \
+        header.toc_size = header.entry_size * header.entry_count;         \
+    } while (false)
+        case 0:
+        case 1:
+            read_header(1);
+            break;
+        case 2:
+            read_header(2);
+            break;
+        case 3:
+            read_header(3);
+            break;
+#undef read_header
+        default:
+            return "Unknown wad version";
+    }
+    // A record must be large enough to hold every field decoded below. This
+    // also bounds entry_count through the toc_size check, so a zero entry_size
+    // cannot smuggle in an arbitrarily large record count.
+    ar_assert(header.entry_size >= sizeof(Entry::Raw));
+    ar_assert(size >= header.toc_start);
+    ar_assert(size - header.toc_start >= header.toc_size);
+    header.toc_start += offset;
+
+    entries.clear();
+    entries.reserve(header.entry_count + 1);
+
+    // The table of contents is reported as an uncompressed entry of its own.
+    entries.push_back(Entry{
+        .offset = header.toc_start,
+        .size = header.toc_size,
+        .compressed = false,
+    });
+    for (std::size_t i = 0; i != header.entry_count; ++i) {
+        auto raw_entry = Entry::Raw{};
+        // Records may be larger than Entry::Raw (V3 uses 32 bytes); read only
+        // what fits, since copying entry_size bytes would overrun raw_entry.
+        ar_assert(io.read(header.toc_start + i * header.entry_size, {(char*)&raw_entry, sizeof(raw_entry)}));
+
+        auto entry = Entry{
+            .offset = offset + raw_entry.offset,
+            .size = raw_entry.size_compressed,
+            .compressed = raw_entry.type != 0,
+        };
+        ar_assert(entry.offset >= header.toc_start + header.toc_size);
+        // `size` is relative to the archive start, so compare it with the
+        // relative record offset rather than the absolute entry.offset.
+        ar_assert(size >= raw_entry.offset);
+        ar_assert(size - raw_entry.offset >= entry.size);
+        entries.push_back(entry);
+    }
+
+    return nullptr;
+}
diff --git a/lib/rlib/ar/wad.hpp b/lib/rlib/ar/wad.hpp
@@ -0,0 +1,21 @@
+#pragma once
+#include <rlib/common.hpp>
+#include <rlib/iofile.hpp>
+
+namespace rlib::ar {
+    // Reader for the header and table of contents of WAD archives.
+    struct WAD {
+        // Uncompressed entries of this archive type may be probed for nested archives.
+        static constexpr bool can_nest = true;
+
+        // Version-independent header summary, defined in wad.cpp.
+        struct Header;
+        // Byte range of the table of contents or of one archived item.
+        struct Entry {
+            // File layout of a table-of-contents record, defined in wad.cpp.
+            struct Raw;
+            // Absolute position of the range within the IO source.
+            std::size_t offset;
+            // Length of the range in bytes.
+            std::size_t size;
+            // Set when the record's type field is non-zero.
+            bool compressed;
+        };
+        // Filled by read(); the table of contents comes first.
+        std::vector<Entry> entries;
+
+        // Reports whether `data` starts with "RW" followed by a version byte <= 10.
+        static auto check_magic(std::span<char const> data) noexcept -> bool;
+        // Parses the archive at [offset, offset + size) of `io`.
+        // Returns nullptr on success, otherwise an error message.
+        auto read(IO const& io, std::size_t offset, std::size_t size) -> char const*;
+    };
+}