Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GAIAPLAT-1752] INDEX: eliminate memory allocation when garbage collecting offsets #1124

Merged
merged 10 commits into from
Dec 20, 2021
96 changes: 53 additions & 43 deletions production/db/core/src/index_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

#include "gaia_internal/db/index_builder.hpp"

#include <unordered_set>
#include <array>
#include <utility>
#include <vector>

#include "gaia/exceptions.hpp"

Expand Down Expand Up @@ -381,7 +382,7 @@ void index_builder_t::update_indexes_from_txn_log(
// table is created or dropped in the txn.
// Keep track of dropped tables.
bool has_cleared_cache = false;
std::unordered_set<gaia_type_t> dropped_types;
std::vector<gaia_type_t> dropped_types;

for (size_t i = 0; i < records.record_count; ++i)
{
Expand All @@ -403,7 +404,7 @@ void index_builder_t::update_indexes_from_txn_log(
if (log_record.operation == gaia_operation_t::remove)
{
auto table_view = table_view_t(offset_to_ptr(log_record.old_offset));
dropped_types.insert(table_view.table_type());
dropped_types.push_back(table_view.table_type());
}
}
}
Expand Down Expand Up @@ -448,7 +449,7 @@ void index_builder_t::update_indexes_from_txn_log(
// The operation is from a dropped table.
// Skip if catalog verification disabled and type not found in the catalog.
if (is_system_object(obj->type)
|| dropped_types.find(obj->type) != dropped_types.end()
|| std::find(dropped_types.begin(), dropped_types.end(), obj->type) != dropped_types.end()
|| (skip_catalog_integrity_check && type_record_id == c_invalid_gaia_id))
{
continue;
Expand All @@ -462,62 +463,71 @@ void index_builder_t::update_indexes_from_txn_log(
}

template <class T_index>
void remove_entries_with_offsets(base_index_t* base_index, const std::unordered_set<gaia_offset_t>& offsets, gaia_txn_id_t txn_id)
void remove_entries_with_offsets(base_index_t* base_index, const index_offset_buffer_t& offsets, gaia_txn_id_t txn_id)
{
auto index = static_cast<T_index*>(base_index);
index->remove_index_entry_with_offsets(offsets, txn_id);
}

void index_builder_t::gc_indexes_from_txn_log(const txn_log_t& records, bool deallocate_new_offsets)
{
std::unordered_set<gaia_offset_t> collected_offsets;
std::unordered_set<gaia_type_t> offset_types;

for (size_t i = 0; i < records.record_count; ++i)
size_t records_index = 0;
while (records_index < records.record_count)
{
const auto& log_record = records.log_records[i];
gaia_offset_t offset = deallocate_new_offsets ? log_record.new_offset : log_record.old_offset;

// If no action is needed, move on to the next log record.
if (offset != c_invalid_gaia_offset)
index_offset_buffer_t collected_offsets;
// Fill the offset buffer for garbage collection.
// Exit the loop when we either have run out of records to process or the offsets buffer is full.
for (; records_index < records.record_count && collected_offsets.size() < c_offset_buffer_size; ++records_index)
{
auto obj = offset_to_ptr(offset);
const auto& log_record = records.log_records[records_index];

// We do not index system objects, so we can move on.
if (is_system_object(obj->type))
{
continue;
}
gaia_offset_t offset = deallocate_new_offsets ? log_record.new_offset : log_record.old_offset;

collected_offsets.insert(offset);
offset_types.insert(obj->type);
}
}
// If no action is needed, move on to the next log record.
if (offset != c_invalid_gaia_offset)
{
auto obj = offset_to_ptr(offset);

// Nothing to do here.
if (collected_offsets.size() == 0)
{
return;
}
// We do not index system objects, so we can move on.
if (is_system_object(obj->type))
{
continue;
}

for (auto it : *get_indexes())
{
gaia_type_t indexed_type = it.second->table_type();
// Add the offset to the buffers and advance the buffer index.
collected_offsets.insert(offset, obj->type);
}
}

// This index does not contain any of the deleted offsets.
if (offset_types.find(indexed_type) == offset_types.end())
// When we reach this point, either we have 1) run out of records to iterate over or 2) the offsets buffer is now considered full.
// We know that 2) is false when the offsets buffer is empty and there is no garbage to collect.
// Therefore we can safely return here.
if (collected_offsets.empty())
LaurentiuCristofor marked this conversation as resolved.
Show resolved Hide resolved
{
continue;
return;
}

switch (it.second->type())
// Garbage collect the offsets in the buffer.
for (const auto& it : *get_indexes())
{
case catalog::index_type_t::range:
remove_entries_with_offsets<range_index_t>(it.second.get(), collected_offsets, records.begin_ts);
break;
case catalog::index_type_t::hash:
remove_entries_with_offsets<hash_index_t>(it.second.get(), collected_offsets, records.begin_ts);
break;
gaia_type_t indexed_type = it.second->table_type();

// This index does not contain any of the deleted offsets.
// Skip the expensive remove_entries_with_offsets operation.
if (!collected_offsets.has_type(indexed_type))
{
continue;
}

switch (it.second->type())
{
case catalog::index_type_t::range:
remove_entries_with_offsets<range_index_t>(it.second.get(), collected_offsets, records.begin_ts);
Copy link
Contributor

@LaurentiuCristofor LaurentiuCristofor Dec 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit confused - doesn't collected_offsets contain offsets of different types? Why would we pass the full array to an index of a specific type?

Perhaps a better approach is to build a map of types to offset arrays/vectors so that we can only pass relevant offsets in these calls. Building that map can be done with trivial changes to the code that fills the 2 arrays.

Copy link
Contributor

@senderista senderista Dec 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, if the offset array is small enough it would be reasonable to just scan the whole array (assuming it contains offset/type pairs) and filter out offsets with inapplicable types. This would be really concise with C++20 ranges but still wouldn't be much code in a for loop.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The offset array is small — limited to 32 records. Passing the type does not provide any benefit over checking straight against the offset: if the type matches, we still need to do another comparison; if it does not match, we have already done one comparison. Comparing straight against the offset is one comparison regardless.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a fragile assumption. If you rely on the size being small, then that should be statically asserted somewhere, otherwise someone can easily edit the constant value and bump it to a much larger value than you expect to have to deal with in this code.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

32 is a somewhat arbitrary size set by me. I have not done any benchmarking and picked this value because it seems reasonable. I think it's fine if someone bumps it up (in fact my guess is the performance at 128 or 256 elements is acceptable). The important thing is that the array is bounded at some size.

The dangerous case for gc still remains on 'hot' indexes with many, many entries, and I have not addressed this danger. That case would require a different strategy altogether.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add more comments to the constant declaration and, if possible, some static_asserts to document our expectations on the size of these arrays. Their length is declared in one place and the reliance on that length being small happens in a different place. It's very easy for someone to update the constant without being aware of the reliance on a small value.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fact that the offset array is small is rather irrelevant given that the number of arrays you can batch here is not really bounded.

Furthermore, any comparison that you avoid once when you build your array will add overhead to the find operation executed for all indexes of a different type. The penalty of including offsets of wrong type gets multiplied by the number of indexes of different type.

I.e.: Let's say you collect 2 values each for 8 different types. You avoided 16 type comparisons during this collection. But now for each index, you'll perform a linear search through a 16 element array instead of performing a search through a 2 element array (or no search at all for indexes of different types). Your performance is 8 times worse in this example!

This is why I'd rather see an implementation done with no regard to memory allocation and then look to see if it can be optimized further to remove that memory allocation. As it is now, you're probably doing more operations doing linear array searches than you'd do allocating memory for some vectors. This is premature optimization.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the important worst-case bound here is not on the size of a single array, but on the work done by a thread in a single GC pass. So far I've avoided bounding per-thread GC work in favor of approximating fairness by GC handoff on contention. But if there's no opportunity for contention detection and backoff in this case, we may need to design fairness explicitly into the protocol, which implies batching work across GC threads. I think I need to understand the possible worst-case scenarios better here before commenting further.

I'm not terribly worried about optimizing comparisons, given that basically any computation done in an already-resident cache line is free. Indirections from a mapping structure could easily swamp the overhead of unnecessary comparisons. But of course we don't want to scan thousands or millions of entries for a single entry of a given type. We could possibly optimize this without introducing expensive mappings, e.g. by sorting the (offset, type) pairs by type and binary searching for the first occurrence of a type. Seems like maybe this would be better discussed in a meeting, though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So to clarify, I'm not worried at all about linear searches of an array that fits into 1 or 2 cache lines. Those comparisons are effectively free (once you've taken the cache miss). Performance will definitely not be "8 times worse" for a 16-element array than a 2-element array. But I do think we could safely use much larger arrays if we sorted (offset, type) pairs by type and binary-searched for the first occurrence of a type.

break;
case catalog::index_type_t::hash:
remove_entries_with_offsets<hash_index_t>(it.second.get(), collected_offsets, records.begin_ts);
break;
}
}
}
}
Expand All @@ -531,7 +541,7 @@ void mark_index_entries(base_index_t* base_index, gaia_txn_id_t txn_id)

void index_builder_t::mark_index_entries_committed(gaia_txn_id_t txn_id)
{
for (auto it : *get_indexes())
for (const auto& it : *get_indexes())
{
// Optimization: only mark index entries committed for UNIQUE indexes, as we only look up the flags on that path.
if (it.second->is_unique())
Expand Down
25 changes: 23 additions & 2 deletions production/db/inc/index/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#pragma once

#include <unordered_set>
#include <array>

#include "gaia_internal/db/db_types.hpp"

Expand Down Expand Up @@ -48,6 +48,27 @@ class index_writer_guard_t
T_structure& m_data;
};

// Capacity of index_offset_buffer_t's fixed-size backing arrays, i.e. the
// maximum number of offsets garbage-collected per batch.
// Deliberately small: membership checks (has_offset()/has_type()) are linear
// scans over this buffer, which is only cheap while the arrays span a few
// cache lines. Revisit those scans before raising this value significantly.
constexpr size_t c_offset_buffer_size = 32;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There shouldn't be any need to hardcode the array size if you just use std::vector.


/*
 * Fixed-capacity buffer of (offset, type) pairs collected for garbage
 * collection of index entries. Backed by two parallel std::arrays of
 * c_offset_buffer_size elements each, so filling and draining the buffer
 * performs no heap allocation.
 */

class index_offset_buffer_t
{
public:
    // Appends an (offset, type) pair at the next free slot.
    // Capacity is c_offset_buffer_size; callers are expected to check size()
    // against that capacity before inserting.
    void insert(gaia_offset_t offset, common::gaia_type_t type);

    // Linear scan of the filled prefix for the given offset.
    bool has_offset(gaia_offset_t offset) const;

    // Linear scan of the filled prefix for the given table type.
    bool has_type(common::gaia_type_t type) const;

    // True when no pairs have been inserted yet.
    bool empty() const;

    // Number of pairs currently stored.
    size_t size() const;

private:
    // Parallel arrays: m_offsets[i] pairs with m_offset_types[i].
    // Only the first m_size entries of each array are valid.
    std::array<gaia_offset_t, c_offset_buffer_size> m_offsets = {};
    std::array<common::gaia_type_t, c_offset_buffer_size> m_offset_types = {};
    size_t m_size = 0;
};

/**
* Abstract in-memory index type:
* T_structure is the underlying backing data structure of the index.
Expand All @@ -71,7 +92,7 @@ class index_t : public base_index_t

// Index structure maintenance.
void insert_index_entry(index_key_t&& key, index_record_t record);
void remove_index_entry_with_offsets(const std::unordered_set<gaia_offset_t>& offsets, gaia_txn_id_t gc_txn_id);
void remove_index_entry_with_offsets(const index_offset_buffer_t& offsets, gaia_txn_id_t gc_txn_id);

// This method will mark all entries below a specified txn_id as committed.
// This must only be called after all aborted/terminated index entries below the txn_id are garbage collected.
Expand Down
5 changes: 2 additions & 3 deletions production/db/inc/index/index.inc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
/////////////////////////////////////////////

// Index structure maintenance.

template <typename T_structure, typename T_iterator>
void index_t<T_structure, T_iterator>::insert_index_entry(index_key_t&& key, index_record_t record)
{
Expand All @@ -29,7 +28,7 @@ void index_t<T_structure, T_iterator>::insert_index_entry(index_key_t&& key, ind
}

template <typename T_structure, typename T_iterator>
void index_t<T_structure, T_iterator>::remove_index_entry_with_offsets(const std::unordered_set<gaia_offset_t>& offsets, gaia_txn_id_t gc_txn_id)
void index_t<T_structure, T_iterator>::remove_index_entry_with_offsets(const index_offset_buffer_t& offsets, gaia_txn_id_t gc_txn_id)
{
std::lock_guard lock(m_index_lock);

Expand All @@ -42,7 +41,7 @@ void index_t<T_structure, T_iterator>::remove_index_entry_with_offsets(const std

for (auto it = m_data.begin(); it != m_data.end();)
{
if (offsets.find(it->second.offset) != offsets.end())
if (it->second.txn_id <= gc_txn_id && offsets.has_offset(it->second.offset))
{
it = m_data.erase(it);
}
Expand Down
3 changes: 2 additions & 1 deletion production/db/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ add_library(gaia_index STATIC
src/base_index.cpp
src/index.cpp
src/hash_index.cpp
src/range_index.cpp)
src/range_index.cpp
src/index_offset_buffer.cpp)
configure_gaia_target(gaia_index)
target_include_directories(gaia_index PUBLIC ${GAIA_INDEX_INCLUDES})
target_link_libraries(gaia_index PRIVATE gaia_common gaia_payload_types)
Expand Down
45 changes: 45 additions & 0 deletions production/db/index/src/index_offset_buffer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/////////////////////////////////////////////
// Copyright (c) Gaia Platform LLC
// All rights reserved.
/////////////////////////////////////////////

#include "index.hpp"

#include <algorithm>
#include <cassert>

namespace gaia
{
namespace db
{
namespace index
{

// Reports whether the given offset was collected into this buffer.
// Only the filled prefix [0, m_size) of the backing array is inspected.
bool index_offset_buffer_t::has_offset(gaia_offset_t offset) const
{
    for (size_t slot = 0; slot < m_size; ++slot)
    {
        if (m_offsets[slot] == offset)
        {
            return true;
        }
    }
    return false;
}

// Reports whether any collected entry carries the given table type.
// Only the filled prefix [0, m_size) of the backing array is inspected.
bool index_offset_buffer_t::has_type(common::gaia_type_t type) const
{
    for (size_t slot = 0; slot < m_size; ++slot)
    {
        if (m_offset_types[slot] == type)
        {
            return true;
        }
    }
    return false;
}

// Returns the number of (offset, type) pairs currently stored in the buffer.
size_t index_offset_buffer_t::size() const
{
    return m_size;
}

bool index_offset_buffer_t::empty() const
{
return m_size == 0;
}

// Appends an (offset, type) pair at the next free slot of the parallel arrays.
//
// Callers are expected to check size() against c_offset_buffer_size before
// inserting (as the batching loop in gc_indexes_from_txn_log does). Writing
// past the end of the fixed-capacity std::arrays would be undefined behavior,
// so a debug-build assert catches any caller that fails to honor the contract.
void index_offset_buffer_t::insert(gaia_offset_t offset, common::gaia_type_t type)
{
    assert(m_size < m_offsets.size() && "index_offset_buffer_t::insert() called on a full buffer!");

    m_offsets[m_size] = offset;
    m_offset_types[m_size] = type;
    ++m_size;
}
} // namespace index
} // namespace db
} // namespace gaia