Skip to content

Commit

Permalink
GH-5 Use different algorithm that produces more efficient diffs
Browse files Browse the repository at this point in the history
  • Loading branch information
heifner committed May 10, 2024
1 parent c7b60aa commit c9528ef
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 50 deletions.
104 changes: 55 additions & 49 deletions libraries/libfc/include/fc/container/ordered_diff.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
#pragma once

#include <algorithm>
#include <concepts>
#include <cstddef>
#include <iterator>
#include <limits>
#include <type_traits>
#include <vector>
#include <utility>

Expand All @@ -9,6 +15,8 @@ namespace fc {
* @class ordered_diff
* @brief Provides ability to generate and apply diff of containers of type T
*
* Minimizes the number of inserts to transform source to target.
*
* Example use:
* std::vector<char> source = { 'a', 'b', 'f', 'c', 'd' };
* std::vector<char> target = { 'b', 'f', 'c', 'd', 'e', 'h' };
Expand All @@ -31,59 +39,57 @@ class ordered_diff {

/// Generate a diff_result such that `apply_diff(source, diff_result)` modifies source to be equal to target.
static diff_result diff(const Container<T>& source, const Container<T>& target) {
size_t s = 0;
size_t t = 0;

diff_result result;
while (s < source.size() || t < target.size()) {
if (s < source.size() && t < target.size()) {
if (source[s] == target[t]) {
// nothing to do, skip over
assert(s <= std::numeric_limits<SizeType>::max());
assert(t <= std::numeric_limits<SizeType>::max());
++s;
++t;
} else { // not equal
if (s == source.size() - 1 && t == target.size() - 1) {
// both at end, insert target and remove source
assert(s <= std::numeric_limits<SizeType>::max());
assert(t <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s);
result.insert_indexes.emplace_back(t, target[t]);
++s;
++t;
} else if (s + 1 < source.size() && t + 1 < target.size() && source[s + 1] == target[t + 1]) {
// misalignment, but next value equal, insert and remove
assert(s <= std::numeric_limits<SizeType>::max());
assert(t <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s);
result.insert_indexes.emplace_back(t, target[t]);
++s;
++t;
} else if (t + 1 < target.size() && source[s] == target[t + 1]) {
// source equals next target, insert current target
assert(t <= std::numeric_limits<SizeType>::max());
result.insert_indexes.emplace_back(t, target[t]);
++t;
} else { // source[s + 1] == target[t]
// target matches next source, remove current source
assert(s <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s);
++s;
}

// longest common subsequence table using single row to minimize memory usage
// See https://www.geeksforgeeks.org/minimum-number-deletions-insertions-transform-one-string-another/
Container<size_t> lcs_row(target.size() + 1, 0);

// Compute LCS
for (size_t s = 1; s <= source.size(); ++s) {
size_t prev = 0;
for (size_t t = 1; t <= target.size(); ++t) {
size_t curr = lcs_row[t];
if (source[s - 1] == target[t - 1]) {
lcs_row[t] = prev + 1;
} else {
lcs_row[t] = std::max(lcs_row[t], lcs_row[t - 1]);
}
} else if (s < source.size()) {
// remove extra in source
assert(s <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s);
++s;
} else if (t < target.size()) {
// insert extra in target
assert(t <= std::numeric_limits<SizeType>::max());
result.insert_indexes.emplace_back(t, target[t]);
++t;
prev = curr;
}
}

// Use LCS to generate diff
size_t s = source.size();
size_t t = target.size();
while (s > 0 && t > 0) {
if (source[s - 1] == target[t - 1]) {
--s;
--t;
} else if (lcs_row[t] > lcs_row[t - 1]) {
assert(s - 1 <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s - 1);
--s;
} else {
assert(t - 1 <= std::numeric_limits<SizeType>::max());
result.insert_indexes.emplace_back(t - 1, target[t - 1]);
--t;
}
}
// handle remaining elements
while (s > 0) {
assert(s - 1 <= std::numeric_limits<SizeType>::max());
result.remove_indexes.push_back(s - 1);
--s;
}
while (t > 0) {
assert(t - 1 <= std::numeric_limits<SizeType>::max());
result.insert_indexes.emplace_back(t - 1, target[t - 1]);
--t;
}

std::reverse(result.remove_indexes.begin(), result.remove_indexes.end());
std::reverse(result.insert_indexes.begin(), result.insert_indexes.end());

return result;
}
Expand Down
74 changes: 73 additions & 1 deletion libraries/libfc/test/test_ordered_diff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,36 @@ using namespace fc;

BOOST_AUTO_TEST_SUITE(ordered_diff_tests)

// Debugging aid for the test cases: meant to dump a diff's remove_indexes and its
// insert_indexes (as index|value pairs) to std::cout.
// All of the printing statements are currently commented out, so calling this function
// has no effect; uncomment the body when investigating a failing case.
void print_diff(const auto& diff) {
// std::cout << "Remove: [";
// for (const auto& e : diff.remove_indexes) {
// std::cout << e << ", ";
// }
// std::cout << "]\n";
// std::cout << "Add: [";
// for (const auto& e : diff.insert_indexes) {
// std::cout << e.first << "|" << e.second << ", ";
// }
// std::cout << "]\n";
}

// Check that diff.insert_indexes holds exactly the expected inserted values, in order.
//   diff    - result of ordered_diff<...>::diff(); only its insert_indexes are inspected
//   inserts - indexable container of the values expected to be inserted
// The element counts must match (required check) before the values are compared one by
// one against the `.second` member of each insert_indexes entry.
void verify_inserted(const auto& diff, const auto& inserts) {
   BOOST_TEST_REQUIRE(diff.insert_indexes.size() == inserts.size());
   size_t i = 0;
   while (i < inserts.size()) {
      BOOST_TEST(diff.insert_indexes[i].second == inserts[i]);
      ++i;
   }
}

BOOST_AUTO_TEST_CASE(ordered_diff_test) try {
using namespace std;

{ // Basic case
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'a', 'c', 'e', 'f'};
auto result = ordered_diff<char>::diff(source, target);
print_diff(result);
verify_inserted(result, std::vector{'f'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
Expand All @@ -22,83 +45,132 @@ BOOST_AUTO_TEST_CASE(ordered_diff_test) try {
deque<char> source = {'a', 'x', 'c', 'd', 'e'};
deque<char> target = {'z', 'c', 'y', 'f'};
auto result = ordered_deque_char_diff::diff(source, target);
print_diff(result);
verify_inserted(result, std::vector{'z', 'y', 'f'});
source = ordered_deque_char_diff::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Empty vectors
vector<char> source;
vector<char> target;
ordered_diff<char, uint8_t>::diff_result result = ordered_diff<char, uint8_t>::diff(source, target);
print_diff(result);
verify_inserted(result, target);
source = ordered_diff<char, uint8_t>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // All elements removed
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target;
auto result = ordered_diff<char, int>::diff(source, target);
verify_inserted(result, target);
source = ordered_diff<char, int>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // All elements inserted
vector<char> source;
vector<char> target = {'a', 'b', 'c', 'd', 'e'};
auto result = ordered_diff<char>::diff(source, target);
print_diff(result);
verify_inserted(result, target);
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // No change
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = source;
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector<char>{});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Mix of removals and inserts
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'a', 'c', 'e', 'f', 'g', 'h'};
ordered_diff<char>::diff_result result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'f', 'g', 'h'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Mix of removals and inserts
vector<int> source = {1, 2, 3, 4, 5};
vector<int> target = {3, 4, 6, 2, 0};
auto result = ordered_diff<int>::diff(source, target);
// 2 is inserted because its order relative to 3,4 changed
verify_inserted(result, std::vector{6, 2, 0});
source = ordered_diff<int>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Complete change
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'f', 'g', 'h', 'i'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, target);
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Diff order
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'e', 'd', 'c', 'b', 'a'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'d', 'c', 'b', 'a'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // shift left
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'b', 'c', 'd', 'e', 'f'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'f'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // shift right
vector<char> source = {'a', 'b', 'c', 'd', 'e'};
vector<char> target = {'z', 'a', 'b', 'c', 'd'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'z'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // non-unique
vector<char> source = {'a', 'b', 'c', 'd', 'e', 'c', 'a', 'q'};
vector<char> target = {'z', 'a', 'b', 'c', 'd', 'a'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'z'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Long diff
vector<char> source = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
vector<char> target = {'x', 'y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'x', 'y'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Longer diff
vector<char> source = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
vector<char> target = {'x', 'y', 'z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'x', 'y', 'z'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Longer still diff
vector<char> source = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
vector<char> target = {'t', 'u', 'v', 'w', 'x', 'y', 'z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
auto result = ordered_diff<char>::diff(source, target);
verify_inserted(result, std::vector{'t', 'u', 'v', 'w', 'x', 'y', 'z'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
{ // Longer still diff with additional diff
vector<char> source = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
vector<char> target = {'t', 'u', 'v', 'w', 'x', 'y', 'z', 'a', 'b', 'c', 'd', 'x', 'e', 'f', 'g', 'h'};
auto result = ordered_diff<char>::diff(source, target);
print_diff(result);
verify_inserted(result, std::vector{'t', 'u', 'v', 'w', 'x', 'y', 'z', 'x'});
source = ordered_diff<char>::apply_diff(std::move(source), result);
BOOST_TEST(source == target);
}
Expand Down Expand Up @@ -192,7 +264,7 @@ BOOST_AUTO_TEST_CASE(ordered_diff_moveable_test) try {
auto result = ordered_diff<count_moves>::diff(source, target);
source = ordered_diff<count_moves>::apply_diff(std::move(source), std::move(result));
BOOST_TEST(source == target);
BOOST_TEST(count_moves::num_moves == 1);
BOOST_TEST(count_moves::num_moves == 2); // one move is for std::reverse
}

} FC_LOG_AND_RETHROW();
Expand Down

0 comments on commit c9528ef

Please sign in to comment.