From c9f223bf3bd8eccad89ad27dc8f32d53e14ba8ec Mon Sep 17 00:00:00 2001
From: Arnur Nigmetov <anigmetov@lbl.gov>
Date: Thu, 12 Dec 2024 00:02:08 -0800
Subject: [PATCH] Refactor (#3)

* Compute only necessary matrices in KICR

* Leave python detection to pybind11, bump version

* get rid of double/float, improve naming

* Remove sorted_id_to_value_ vector from Filtration

* Use combinatorial enumeration as simplex Uid (as in Ripser)

* Check for Ctrl-C in Python

* Add threading to boundary matrix computation

* Breaking: id of cell is always reset to its position in vector
To allow multi-threaded filtration initialization.
Also add multithreading to more places (copying matrix in parallel reduction,
setting pivots, etc).
---
 CMakeLists.txt                                |   3 -
 bindings/python/CMakeLists.txt                |  16 +-
 bindings/python/example_cone_diff.py          | 110 ---
 bindings/python/oineus.cpp                    |  21 +-
 bindings/python/oineus/__init__.py            | 358 ++++----
 bindings/python/oineus/diff/__init__.py       | 132 ++-
 bindings/python/oineus_common.cpp             | 139 +++-
 bindings/python/oineus_decomposition.cpp      |  36 +-
 .../python/oineus_fil_dgm_simplex_double.cpp  |   6 -
 .../python/oineus_fil_dgm_simplex_float.cpp   |   6 -
 bindings/python/oineus_functions.cpp          |  74 ++
 bindings/python/oineus_functions_double.cpp   |   6 -
 bindings/python/oineus_functions_float.cpp    |   6 -
 bindings/python/oineus_index_diagram.cpp      |   6 -
 bindings/python/oineus_persistence_bindings.h | 774 +++---------------
 bindings/python/oineus_top_optimizer.cpp      |  22 +-
 doc/tutorial.md                               | 268 ++++--
 examples/kernel-example-2.py                  |  22 -
 examples/kernel-example-3.py                  |  22 -
 examples/kernel-example-4.py                  |  22 -
 examples/oineus_v_opt.cpp                     |  19 +
 {bindings => examples}/python/example.py      |   5 +-
 {bindings => examples}/python/example_cone.py |  22 +-
 examples/python/example_cone_diff.py          |  65 ++
 examples/{ => python}/example_diff_cyl.py     |   9 +-
 .../{ => python}/example_diff_vr_dists.py     |   4 +-
 examples/{ => python}/example_diff_vr_pts.py  |   7 +-
 .../example_kernel.py}                        |  10 +-
 .../example_kernel_cyl.py}                    |  31 +-
 .../python/example_manual.py                  |  16 +-
 {bindings => examples}/python/example_opt.py  |   4 +-
 .../python/example_opt_vr.py                  |   8 +-
 {bindings => examples}/python/example_vr.py   |  47 +-
 .../python/example_wasserstein_opt.py         |   0
 .../visualize_critical_set_alpha.py           |   0
 .../{ => python}/visualize_critical_set_vr.py |  10 +-
 extern/taskflow/core/atomic_notifier.hpp      |  22 +-
 extern/taskflow/core/executor.hpp             |  12 +-
 extern/taskflow/core/graph.hpp                | 131 +--
 extern/taskflow/core/nonblocking_notifier.hpp |   2 -
 extern/taskflow/core/tsq.hpp                  |  12 +-
 extern/taskflow/core/worker.hpp               |   2 +-
 extern/taskflow/taskflow.hpp                  |   6 +-
 include/oineus/cell_with_value.h              |   8 +-
 include/oineus/common_defs.h                  |   9 +-
 include/oineus/decomposition.h                | 327 +++++---
 include/oineus/filtration.h                   | 260 +++---
 include/oineus/inclusion_filtration.h         |   2 +-
 include/oineus/kernel.h                       | 155 ++--
 include/oineus/lower_star.h                   |  51 +-
 include/oineus/mem_reclamation.h              |   2 +-
 include/oineus/params.h                       |  10 +-
 include/oineus/product_cell.h                 |   1 +
 include/oineus/simpl_map_filtration.h         |   2 -
 include/oineus/simplex.h                      | 119 +--
 include/oineus/sparse_matrix.h                |  77 +-
 include/oineus/top_optimizer.h                |  17 +-
 include/oineus/vietoris_rips.h                | 223 ++++-
 version.txt                                   |   2 +-
 59 files changed, 1837 insertions(+), 1921 deletions(-)
 delete mode 100644 bindings/python/example_cone_diff.py
 delete mode 100644 bindings/python/oineus_fil_dgm_simplex_double.cpp
 delete mode 100644 bindings/python/oineus_fil_dgm_simplex_float.cpp
 create mode 100644 bindings/python/oineus_functions.cpp
 delete mode 100644 bindings/python/oineus_functions_double.cpp
 delete mode 100644 bindings/python/oineus_functions_float.cpp
 delete mode 100644 bindings/python/oineus_index_diagram.cpp
 delete mode 100644 examples/kernel-example-2.py
 delete mode 100644 examples/kernel-example-3.py
 delete mode 100644 examples/kernel-example-4.py
 rename {bindings => examples}/python/example.py (90%)
 rename {bindings => examples}/python/example_cone.py (56%)
 create mode 100644 examples/python/example_cone_diff.py
 rename examples/{ => python}/example_diff_cyl.py (87%)
 rename examples/{ => python}/example_diff_vr_dists.py (93%)
 rename examples/{ => python}/example_diff_vr_pts.py (90%)
 rename examples/{kernel-example-1.py => python/example_kernel.py} (68%)
 rename examples/{kernel-cyl.py => python/example_kernel_cyl.py} (81%)
 rename {bindings => examples}/python/example_manual.py (80%)
 rename {bindings => examples}/python/example_opt.py (82%)
 rename {bindings => examples}/python/example_opt_vr.py (91%)
 rename {bindings => examples}/python/example_vr.py (50%)
 rename {bindings => examples}/python/example_wasserstein_opt.py (100%)
 rename examples/{ => python}/visualize_critical_set_alpha.py (100%)
 rename examples/{ => python}/visualize_critical_set_vr.py (97%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02354d8..0e6bb80 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,9 +21,6 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
 find_package (Threads REQUIRED)
 find_package (Boost REQUIRED)
-find_package (Python3 COMPONENTS Interpreter Development REQUIRED)
-
-message(WARNING "executable ${Python3_EXECUTABLE}, ${Python_EXECUTABLE} ")
 
 include_directories(${Boost_INCLUDE_DIRS})
 
diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt
index 019f510..0494f45 100644
--- a/bindings/python/CMakeLists.txt
+++ b/bindings/python/CMakeLists.txt
@@ -13,17 +13,23 @@ add_custom_target           (oineus ALL ${CMAKE_COMMAND} -E copy_directory
                                        ${CMAKE_CURRENT_SOURCE_DIR}/oineus ${MODULE_OUTPUT_DIRECTORY}/oineus
                                        DEPENDS ${OINEUS_PYTHON})
 
+set(OINEUS_PYTHON_INT "long int" CACHE STRING "Choose the Int type in Python bindings, options are: int, long int, long long int." FORCE)
+set_property(CACHE OINEUS_PYTHON_INT PROPERTY STRINGS "int" "long int" "long long int")
+
+set(OINEUS_PYTHON_REAL "double" CACHE STRING "Choose the Real type in Python bindings, options are: float, double." FORCE)
+set_property(CACHE OINEUS_PYTHON_REAL PROPERTY STRINGS "float" "double")
 
 pybind11_add_module         (_oineus oineus.cpp
                                      oineus_common.cpp
                                      oineus_decomposition.cpp
-                                     oineus_index_diagram.cpp
-                                     oineus_fil_dgm_simplex_double.cpp
-                                     oineus_fil_dgm_simplex_float.cpp
-                                     oineus_functions_double.cpp
-                                     oineus_functions_float.cpp
+                                     oineus_diagram.cpp
+                                     oineus_cells.cpp
+                                     oineus_filtration.cpp
+                                     oineus_kicr.cpp
+                                     oineus_functions.cpp
                                      oineus_top_optimizer.cpp
                             )
+target_compile_definitions(_oineus PRIVATE OINEUS_PYTHON_REAL=${OINEUS_PYTHON_REAL} OINEUS_PYTHON_INT=${OINEUS_PYTHON_INT})
 target_link_libraries       (_oineus PRIVATE ${libraries})
 target_include_directories  (_oineus PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../include"
                                               "${CMAKE_CURRENT_SOURCE_DIR}/../../extern")
diff --git a/bindings/python/example_cone_diff.py b/bindings/python/example_cone_diff.py
deleted file mode 100644
index d07a1b0..0000000
--- a/bindings/python/example_cone_diff.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#!python3
-
-from icecream import ic
-import numpy as np
-import oineus as oin
-import torch
-
-n_pts = 50
-
-np.random.seed(1)
-
-pts_1 = np.random.uniform(low=0.0, high=5.0, size=n_pts * 2).reshape(n_pts, 2).astype(np.float64)
-pts_2 = pts_1 + np.random.uniform(size=n_pts *2, low=-0.01, high=0.01).reshape(pts_1.shape)
-
-# input: pts
-# output: VR filtration and torch Tensor with the lengths
-# of critical edges in filtration order, that is:
-# fil.simplices()[sorted_idx].value == lengths[sorted_idx]
-def get_vr_filtration_and_vals(pts, max_dim, max_radius):
-    fil, edges = oin.get_vr_filtration_and_critical_edges(pts, max_dim=max_dim, max_radius=max_radius, n_threads=1)
-    fil.reset_ids_to_sorted_ids()
-    pts = torch.Tensor(pts)
-    edge_start, edge_end = edges[:, 0], edges[:, 1]
-    lengths = torch.sum((pts[edge_start, :] - pts[edge_end, :])**2, axis=1) ** 0.5
-
-    return fil, lengths
-
-max_dim = 2
-max_radius = 1000.0
-
-fil_1, lengths_1 = get_vr_filtration_and_vals(pts_1, max_dim, max_radius)
-fil_2, lengths_2 = get_vr_filtration_and_vals(pts_2, max_dim, max_radius)
-
-# create fil_min
-
-simplices_1 = fil_1.simplices()
-simplices_2 = fil_2.simplices()
-
-fil_min_simplices = []
-
-for sigma in simplices_1:
-    min_sigma = oin.Simplex_double(sigma.sorted_id, sigma.vertices,
-                                   min(sigma.value, fil_2.simplex_value_by_vertices(sigma.vertices)))
-    fil_min_simplices.append(min_sigma)
-
-fil_min = oin.Filtration_double(fil_min_simplices, set_ids=False)
-
-# now simplices became reshuffled
-# we must permute lengths_1 and lengths_2 accordingly
-# id of simplex in fil_min is it's sorted_id in the original filtration
-
-min_sorted_id_to_sorted_id_1 = fil_min.get_inv_sorting_permutation()
-
-lengths_1_in_min_order = lengths_1[torch.LongTensor(min_sorted_id_to_sorted_id_1)]
-
-min_simplices = fil_min.simplices()
-
-
-min_sorted_id_to_sorted_id_2 = [ fil_2.get_sorted_id_by_vertices(min_sigma.vertices) for min_sigma in min_simplices ]
-
-lengths_2_in_min_order = lengths_2[torch.LongTensor(min_sorted_id_to_sorted_id_2)]
-
-for i, sigma in enumerate(min_simplices):
-    s1 = simplices_1[fil_1.get_sorted_id_by_vertices(sigma.vertices)]
-    s2 = simplices_2[fil_2.get_sorted_id_by_vertices(sigma.vertices)]
-    assert np.abs(s1.value - lengths_1_in_min_order[i]) < 0.0001
-    assert np.abs(s2.value - lengths_2_in_min_order[i]) < 0.0001
-
-
-min_critical_values = torch.min(torch.stack((lengths_1_in_min_order, lengths_2_in_min_order)), axis=0)[0]
-
-# verify that we got the critical values correctly
-for sigma in min_simplices:
-    assert torch.abs(sigma.value - min_critical_values[sigma.sorted_id]) < 0.00001
-
-
-min_and_coned_simplices = min_simplices[:]
-cone_v = fil_min.n_vertices()
-
-# append cone vertex
-# TODO: how to ensure that cone_vertex appears first?
-# for now we can ignore it
-min_and_coned_simplices.append(oin.Simplex_double(cone_v, [cone_v], 0.0))
-
-simplex_id = len(min_and_coned_simplices)
-
-# take cones over simplices from fil_1
-for sigma in simplices_1:
-    coned_sigma = sigma.join(new_vertex=cone_v, value=sigma.value, new_id=simplex_id)
-    simplex_id += 1
-    min_and_coned_simplices.append(coned_sigma)
-
-fil_min_and_coned = oin.Filtration_double(min_and_coned_simplices, set_ids=False)
-
-# in filtration order
-min_and_coned_simplices = fil_min_and_coned.simplices()
-
-for sigma in min_and_coned_simplices:
-    if cone_v in sigma.vertices:
-        if sigma.dim() == 0:
-            continue
-        # subtract additional 1 for cone vertex
-        orig_sorted_sigma_id = sigma.id - len(fil_1) - 1
-        assert(np.abs(lengths_1[orig_sorted_sigma_id] - sigma.value) < 0.00001)
-    else:
-        orig_sigma_id = sigma.id
-        orig_sorted_sigma_id = fil_min.get_sorted_id_by_id(orig_sigma_id)
-        assert(np.abs(min_critical_values[orig_sorted_sigma_id] - sigma.value) < 0.00001)
-
-print(fil_min_and_coned)
diff --git a/bindings/python/oineus.cpp b/bindings/python/oineus.cpp
index 3180419..54f4a37 100644
--- a/bindings/python/oineus.cpp
+++ b/bindings/python/oineus.cpp
@@ -8,19 +8,12 @@ PYBIND11_MODULE(_oineus, m)
 {
     m.doc() = "Oineus python bindings";
 
-//    std::string float_suffix = "_float";
-    std::string double_suffix = "_double";
-//    std::string double_suffix = "";
-
-    init_oineus_common_int(m);
-    init_oineus_common_diagram_int(m);
-    init_oineus_common_decomposition_int(m);
-
-//    init_oineus_functions_float(m, float_suffix);
-//    init_oineus_fil_dgm_simplex_float(m, float_suffix);
-
-    init_oineus_functions_double(m, double_suffix);
-    init_oineus_fil_dgm_simplex_double(m, double_suffix);
-
+    init_oineus_common(m);
+    init_oineus_cells(m);
+    init_oineus_filtration(m);
+    init_oineus_common_decomposition(m);
+    init_oineus_diagram(m);
+    init_oineus_functions(m);
+    init_oineus_kicr(m);
     init_oineus_top_optimizer(m);
 }
diff --git a/bindings/python/oineus/__init__.py b/bindings/python/oineus/__init__.py
index 41b39b4..5b7e14e 100755
--- a/bindings/python/oineus/__init__.py
+++ b/bindings/python/oineus/__init__.py
@@ -6,17 +6,24 @@
 
 from . import _oineus
 
-from ._oineus import *
-
-import warnings
-
-try:
-    from . import diff
-except:
-    warnings.warn("oineus.diff import failed, probably, because eagerpy is not installed")
-
+from ._oineus import ConflictStrategy, DenoiseStrategy, VREdge
+from ._oineus import CombinatorialProdSimplex, CombinatorialSimplex,Simplex, ProdSimplex
+from ._oineus import Filtration, ProdFiltration
+from ._oineus import Decomposition, IndexDiagramPoint, DiagramPoint, Diagrams
+from ._oineus import ReductionParams, KICRParams, KerImCokReduced, KerImCokReducedProd
+from ._oineus import IndicesValues, IndicesValuesProd, TopologyOptimizer, TopologyOptimizerProd
+from ._oineus import compute_relative_diagrams, get_boundary_matrix, get_denoise_target, get_induced_matching
+from ._oineus import get_nth_persistence, get_permutation_dtv, list_to_filtration
+
+# import warnings
+#
+# try:
+#     from . import diff
+# except:
+#     warnings.warn("oineus.diff import failed, probably, because eagerpy is not installed")
+#
 
-# __all__ = ["compute_diagrams_ls", "compute_diagrams_and_v_ls", "get_boundary_matrix", "to_scipy_matrix", "is_reduced", "get_freudenthal_filtration"]
+__all__ = ["compute_diagrams_ls", "compute_diagrams_vr", "get_boundary_matrix", "is_reduced"]
 
 
 def to_scipy_matrix(sparse_cols, shape=None):
@@ -29,205 +36,172 @@ def to_scipy_matrix(sparse_cols, shape=None):
     return scipy.sparse.csc_matrix((data, (row_ind, col_ind)), shape=shape)
 
 
-def is_reduced(a):
-    lowest_ones = []
-    for col_idx in range(a.shape[1]):
-        if np.any(a[:, col_idx] % 2 == 1):
-            lowest_ones.append(np.max(np.where(a[:, col_idx] % 2 == 1)))
-    return len(lowest_ones) == len(set(lowest_ones))
-
-
-def get_real_type(fil):
-    if "_double" in str(type(fil)):
-        return "double"
-    elif "_float" in str(type(fil)):
-        return "float"
+def max_distance(data: np.ndarray, from_pwdists: bool=False):
+    if from_pwdists:
+        return 1.00001 * np.min(np.max(data, axis=1))
     else:
-        raise RuntimeError(f"Unknown type: {type(fil)}")
-
-
-def get_type_dim(data: np.ndarray, points=False):
-    if data.dtype == np.float32:
-        type_part = "float"
-    elif data.dtype == np.float64:
-        type_part = "double"
+        assert data.ndim == 2 and data.shape[0] >= 2
+        diff = data[:, np.newaxis, :] - data[np.newaxis, :, :]
+        squared_distances = np.sum(diff**2, axis=2)
+        return 1.00001 * np.sqrt(np.min(np.max(squared_distances, axis=1)))
+
+
+def freudenthal_filtration(data: np.ndarray,
+                           negate: bool=False,
+                           wrap: bool=False,
+                           max_dim: int = 3,
+                           with_critical_vertices: bool=False,
+                           n_threads: int=1):
+    max_dim = min(max_dim, data.ndim)
+    if with_critical_vertices:
+        fil, vertices = _oineus.get_freudenthal_filtration_and_crit_vertices(data=data, negate=negate, wrap=wrap, max_dim=max_dim, n_threads=n_threads)
+        vertices = np.array(vertices, dtype=np.int64)
+        return fil, vertices
     else:
-        raise RuntimeError(f"Type not supported: {data.dtype}")
-
-    if points:
-        if data.ndim == 2 and data.shape[1] in [1, 2, 3, 4]:
-            dim_part = str(data.shape[1])
-        else:
-            raise RuntimeError(f"Dimension not supported: shape = {data.shape}")
-    else:
-        if data.ndim in [1, 2, 3]:
-            dim_part = str(data.ndim)
-        else:
-            raise RuntimeError(f"Dimension not supported: shape = {data.shape}")
-
-    return type_part, dim_part
-
-
-def get_freudenthal_filtration(data, negate, wrap, max_dim, n_threads):
-    type_part, dim_part = get_type_dim(data)
-    func = getattr(_oineus, f"get_fr_filtration_{type_part}_{dim_part}")
-    return func(data, negate, wrap, max_dim, n_threads)
-
-def get_freudenthal_filtration_and_critical_vertices(data, negate, wrap, max_dim, n_threads):
-    type_part, dim_part = get_type_dim(data)
-    func = getattr(_oineus, f"get_fr_filtration_and_critical_vertices_{type_part}_{dim_part}")
-    fil, cv = func(data, negate, wrap, max_dim, n_threads)
-    return fil, np.array(cv)
-
-
-def get_vr_filtration_from_pwdists(pwdists, max_dim, max_radius, n_threads):
-    type_part, _ = get_type_dim(pwdists, False)
-    func = getattr(_oineus, f"get_vr_filtration_from_pwdists_{type_part}")
-    return func(pwdists, max_dim, max_radius, n_threads)
-
-def get_vr_filtration_and_critical_edges_from_pwdists(pwdists, max_dim, max_radius, n_threads):
-    type_part, _ = get_type_dim(pwdists, False)
-    func = getattr(_oineus, f"get_vr_filtration_and_critical_edges_from_pwdists_{type_part}")
-    fil, edges = func(pwdists, max_dim, max_radius, n_threads)
-    edges = np.array([[e.x, e.y] for e in edges])
-    return fil, edges
-
-
-def get_vr_filtration(points, max_dim, max_radius, n_threads):
-    type_part, dim_part = get_type_dim(points, True)
-    func = getattr(_oineus, f"get_vr_filtration_{type_part}_{dim_part}")
-    return func(points, max_dim, max_radius, n_threads)
-
-def get_vr_filtration_and_critical_edges(points, max_dim, max_radius, n_threads):
-    type_part, dim_part = get_type_dim(points, True)
-    func = getattr(_oineus, f"get_vr_filtration_and_critical_edges_{type_part}_{dim_part}")
-    fil, edges = func(points, max_dim, max_radius, n_threads)
-    edges = np.array([[e.x, e.y] for e in edges])
-    return fil, edges
-
-
-def get_boundary_matrix(data, negate, wrap, max_dim, n_threads):
-    type_part, dim_part = get_type_dim(data)
-    func = getattr(_oineus, f"get_boundary_matrix_{type_part}_{dim_part}")
-    bm = func(data, negate, wrap, max_dim, n_threads)
-    return to_scipy_matrix(bm)
-
-
-def compute_diagrams_ls(data, negate, wrap, max_dim, params, include_inf_points, dualize):
-    type_part, dim_part = get_type_dim(data)
-    func = getattr(_oineus, f"compute_diagrams_ls_{type_part}_{dim_part}")
-    return func(data, negate, wrap, max_dim, params, include_inf_points, dualize)
+        return _oineus.get_freudenthal_filtration(data=data, negate=negate, wrap=wrap, max_dim=max_dim, n_threads=n_threads)
 
 
-def get_denoise_target(d, fil, rv, eps, strat):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_denoise_target_{type_part}")
-    return func(d, fil, rv, eps, strat)
+def vr_filtration(data: np.ndarray,
+                  from_pwdists: bool = False,
+                  max_dim: int = -1,
+                  max_diameter: float = -1.0,
+                  with_critical_edges: bool = False,
+                  n_threads: int = 1):
+    assert data.ndim == 2
 
+    if from_pwdists:
+        assert data.shape[0] == data.shape[1]
 
-def get_well_group_target(d, fil, rv, t):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_well_group_target_{type_part}")
-    return func(d, fil, rv, t)
+    if max_diameter < 0:
+        max_diameter = max_distance(data, from_pwdists)
 
+    if max_dim < 0:
+        if from_pwdists:
+            raise RuntimeError("vr_filtration: if input is pairwise distance matrix, max_dim must be specified")
+        else:
+            max_dim = data.shape[1]
 
-def get_ls_target_values_diagram_loss(d, dtv, fil):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_target_values_diagram_loss_{type_part}")
-    return func(dtv, False)
-
-
-def get_vr_target_values_diagram_loss(d, dtv, fil):
-    type_part = get_real_type(fil)
-    death_only = d == 0
-    func = getattr(_oineus, f"get_target_values_diagram_loss_{type_part}")
-    return func(dtv, death_only)
-
+    if from_pwdists:
+        if with_critical_edges:
+            func = _oineus.get_vr_filtration_and_critical_edges_from_pwdists
+        else:
+            func = _oineus.get_vr_filtration_from_pwdists
+    else:
+        if with_critical_edges:
+            func = _oineus.get_vr_filtration_and_critical_edges
+        else:
+            func = _oineus.get_vr_filtration
+
+    result = func(data, max_dim=max_dim, max_diameter=max_diameter, n_threads=n_threads)
+    if with_critical_edges:
+        # convert list of VREdges to numpy array
+        edges = [ [ e.x, e.y] for e in result[1] ]
+        edges = np.array(edges, dtype=np.int64)
+        return result[0], edges
+    else:
+        return result
 
 
-def get_ls_target_values_x(d, dtv, fil, decmp, decmp_coh, conflict_strategy):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_ls_target_values_x_{type_part}")
-    return func(d, dtv, fil, decmp, decmp_coh, conflict_strategy, False)
+def is_reduced(a):
+    lowest_ones = []
+    for col_idx in range(a.shape[1]):
+        if np.any(a[:, col_idx] % 2 == 1):
+            lowest_ones.append(np.max(np.where(a[:, col_idx] % 2 == 1)))
+    return len(lowest_ones) == len(set(lowest_ones))
 
+def mapping_cylinder(fil_domain, fil_codomain, v_domain, v_codomain, with_indices=False):
+    if isinstance(v_codomain, _oineus.Simplex) or isinstance(v_codomain, _oineus.ProdSimplex):
+        v_codomain = v_codomain.combinatorial_cell()
+    if isinstance(v_domain, _oineus.Simplex) or isinstance(v_domain, _oineus.ProdSimplex):
+        v_domain = v_domain.combinatorial_cell()
+    if with_indices:
+        return _oineus._mapping_cylinder_with_indices(fil_domain, fil_codomain, v_domain, v_codomain)
+    else:
+        return _oineus._mapping_cylinder(fil_domain, fil_codomain, v_domain, v_codomain)
 
-def get_vr_target_values_x(d, dtv, fil, decmp, decmp_coh, conflict_strategy):
-    type_part = get_real_type(fil)
-    death_only = d == 0
-    func = getattr(_oineus, f"get_vr_target_values_x_{type_part}")
-    return func(d, dtv, fil, decmp, decmp_coh, conflict_strategy, death_only)
+def multiply_filtration(fil, sigma):
+    if isinstance(sigma, _oineus.Simplex):
+        sigma = sigma.combinatorial_cell()
+    return _oineus._multiply_filtration(fil, sigma)
 
+def min_filtration(fil_1, fil_2, with_indices=False):
+    if with_indices:
+        return _oineus._min_filtration_with_indices(fil_1, fil_2)
+    else:
+        return _oineus._min_filtration(fil_1, fil_2)
+
+def compute_diagrams_ls(data: np.ndarray, negate: bool=False, wrap: bool=False,
+                        max_dim: typing.Optional[int]=None, params: typing.Optional[ReductionParams]=None,
+                        include_inf_points: bool=True, dualize: bool=False):
+    if max_dim is None:
+        max_dim = data.ndim - 1
+    if params is None:
+        params = _oineus.ReductionParams()
+    # max_dim is maximal dimension of the _diagram_, we need simplices one dimension higher, hence +1
+    fil = freudenthal_filtration(data=data, negate=negate, wrap=wrap, max_dim=max_dim + 1, n_threads=params.n_threads)
+    dcmp = _oineus.Decomposition(fil, dualize)
+    dcmp.reduce(params)
+    return dcmp.diagram(fil=fil, include_inf_points=include_inf_points)
+
+
+def compute_diagrams_vr(data: np.ndarray, from_pwdists: bool=False, max_dim: int=-1, max_diameter: float = -1.0, params: typing.Optional[ReductionParams]=None, include_inf_points: bool=True, dualize: bool=True):
+    if params is None:
+        params = _oineus.ReductionParams()
+    # max_dim is maximal dimension of the _diagram_, we need simplices one dimension higher, hence +1
+    fil = vr_filtration(data, from_pwdists, max_dim=max_dim, max_diameter=max_diameter, with_critical_edges=False, n_threads=params.n_threads)
+    dcmp = _oineus.Decomposition(fil, dualize)
+    dcmp.reduce(params)
+    return dcmp.diagram(fil=fil, include_inf_points=include_inf_points)
 
 
 def get_ls_wasserstein_matching_target_values(dgm, fil, rv, d: int, q: float, mip: bool, mdp: bool):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_ls_wasserstein_matching_target_values_{type_part}")
+    func = getattr(_oineus, f"get_ls_wasserstein_matching_target_values")
 
     if type(dgm) is np.ndarray:
-        DgmPt = getattr(_oineus, f"DiagramPoint_{type_part}")
         dgm_1 = []
         assert len(dgm.shape) == 2 and dgm.shape[1] == 2
         for p in dgm:
-            dgm_1.append(DgmPt(p[0], p[1]))
+            dgm_1.append(DiagramPoint(p[0], p[1]))
         dgm = dgm_1
 
     return func(dgm, fil, rv, d, q, mip, mdp)
 
 
-
-def get_bruelle_target(fil, rv, p, q, i_0, d, minimize, min_birth, max_death):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_bruelle_target_{type_part}")
-    return func(fil, rv, p, q, i_0, d, minimize, min_birth, max_death)
-
-
-def get_barycenter_target(fil, rv, d):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_barycenter_target_{type_part}")
-    return func(fil, rv, d)
-
-
-def get_nth_persistence(fil, rv, d, n):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_nth_persistence_{type_part}")
-    return func(fil, rv, d, n)
-
-
-def get_permutation(target_values, fil):
-    if len(target_values) == 0:
-        return {}
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"get_permutation_{type_part}")
-    return func(target_values, fil)
-
-
-def list_to_filtration(simplex_list): #take a list which contains data for simplices and convert it to a filtration
-    string_type = str(type(simplex_list[0][2]))
-    if "int" in string_type:
-        func = getattr(_oineus, f"list_to_filtration_int")
-        return func(simplex_list)
-    elif "float" in string_type:
-        func = getattr(_oineus, f"list_to_filtration_float")
-        return func(simplex_list)
-    elif "double" in string_type:
-        func = getattr(_oineus, f"list_to_filtration_double")
-        return func(simplex_list)
-
-def compute_kernel_image_cokernel_reduction(K_, L_, IdMap, n_threads): #
-    string_type = str(type(K_[0][2]))
-    func = getattr(_oineus, f"compute_kernel_image_cokernel_reduction_float")
-    return func(K_, L_, IdMap, n_threads)
-
-
-def get_ls_filtration(simplices: typing.List[typing.List[int]], vertex_values: np.ndarray, negate: bool, n_threads: int):
-    if vertex_values.dtype == np.float32:
-        func = getattr(_oineus, f"get_ls_filtration_float")
-    elif vertex_values.dtype == np.float64:
-        func = getattr(_oineus, f"get_ls_filtration_double")
-    return func(simplices, vertex_values, negate, n_threads)
-
-
-def compute_ker_im_cok_reduction_cyl(fil_2, fil_3):
+def compute_kernel_image_cokernel_reduction(K, L, params=None, reduction_params=None):
+    # simplicial filtrations can be supplied as lists,
+    # convert to Oineus filtrations if necessary
+    if isinstance(K, list):
+        K = _oineus.list_to_filtration(K)
+    if isinstance(L, list):
+        L = _oineus.list_to_filtration(L)
+
+    # KICR class is templatized by cell type in C++
+    # different instantiations have different class names in Python
+    # figure out the right one by type
+    if isinstance(K[0], _oineus.Simplex):
+        KICR_Class = _oineus.KerImCokReduced
+    elif isinstance(K[0], _oineus.ProdSimplex):
+        KICR_Class = _oineus.KerImCokReducedProd
+
+    if params is None:
+        # compute all by default
+        params = _oineus.KICRParams()
+        params.kernel = True
+        params.cokernel = True
+        params.image = True
+
+    if reduction_params != None:
+        assert type(params) == _oineus.ReductionParams
+        params.params_f = reduction_params
+        params.params_g = reduction_params
+        params.params_ker = reduction_params
+        params.params_im = reduction_params
+        params.params_cok = reduction_params
+
+    return KICR_Class(K, L, params)
+
+
+def compute_ker_cok_reduction_cyl(fil_2, fil_3):
     fil_min = _oineus.min_filtration(fil_2, fil_3)
 
     id_domain = fil_3.size() + fil_min.size() + 1
@@ -246,22 +220,10 @@ def compute_ker_im_cok_reduction_cyl(fil_2, fil_3):
     # to get a subcomplex, we multiply each fil_3 with id_domain
     fil_3_prod = _oineus.multiply_filtration(fil_3, v0)
 
-    params = _oineus.ReductionParams()
+    params = _oineus.KICRParams()
     params.kernel = params.cokernel = True
     params.image = False
 
-    kicr_reduction = _oineus.KerImCokReducedProd_double(fil_cyl, fil_3_prod, params)
-    return kicr_reduction
-
-def compute_relative_diagrams(fil, rel, include_inf_points=True):
-    type_part = get_real_type(fil)
-    func = getattr(_oineus, f"compute_relative_diagrams_{type_part}")
-    return func(fil, rel, include_inf_points)
-
-#def compute_cokernel_diagrams(K_, L_, IdMap, n_threads): #
-#    string_type = str(type(K_[0][2]))
-#    func = getattr(_oineus, f"compute_cokernel_diagrams_float")
-#    return func(K_, L_, IdMap, n_threads)
-#
+    kicr_reduction = _oineus.KerImCokReducedProd(fil_cyl, fil_3_prod, params)
 
-#def lists_to_paired_filtrations(simplex_list_1, simplex_list_2)
+    return kicr_reduction
\ No newline at end of file
diff --git a/bindings/python/oineus/diff/__init__.py b/bindings/python/oineus/diff/__init__.py
index f86cd38..fec6838 100644
--- a/bindings/python/oineus/diff/__init__.py
+++ b/bindings/python/oineus/diff/__init__.py
@@ -4,6 +4,7 @@
 import eagerpy as epy
 
 from .. import _oineus
+from .. import vr_filtration as non_diff_vr_filtration
 
 # to copy docstring and name from the wrapped _oineus (C++) methods
 
@@ -19,77 +20,81 @@ def __len__(self):
     def __repr__(self):
         return f"DiffFil(under_fil={self.under_fil}, values={self.values})"
 
-    #@ft.wraps(_oineus.Filtration_double.max_dim)
+    def __iter__(self):
+        return iter(self.under_fil)
+
+    #@ft.wraps(_oineus.Filtration.max_dim)
     def max_dim(self):
         return self.under_fil.max_dim()
 
-    #@ft.wraps(_oineus.Filtration_double.size)
+    #@ft.wraps(_oineus.Filtration.size)
     def size(self):
         return self.under_fil.size()
 
-    #@ft.wraps(_oineus.Filtration_double.size_in_dimension)
+    #@ft.wraps(_oineus.Filtration.size_in_dimension)
     def size_in_dimension(self, dim):
         return self.under_fil.size(dim)
 
-    #@ft.wraps(_oineus.Filtration_double.n_vertices)
+    #@ft.wraps(_oineus.Filtration.n_vertices)
     def n_vertices(self):
         return self.under_fil.n_vertices()
 
-    #@ft.wraps(_oineus.Filtration_double.cells)
+    #@ft.wraps(_oineus.Filtration.cells)
     def cells(self):
         return self.under_fil.cells()
 
-    #@ft.wraps(_oineus.Filtration_double.get_id_by_sorted_id)
-    def get_id_by_sorted_id(self, sorted_id):
-        return self.under_fil.get_id_by_sorted_id(sorted_id)
+    #@ft.wraps(_oineus.Filtration.get_id_by_sorted_id)
+    def id_by_sorted_id(self, sorted_id):
+        return self.under_fil.id_by_sorted_id(sorted_id)
 
-    #@ft.wraps(_oineus.Filtration_double.get_sorted_id_by_id)
-    def get_sorted_id_by_id(self, id):
-        return self.under_fil.get_sorted_id_by_id(id)
+    #@ft.wraps(_oineus.Filtration.get_sorted_id_by_id)
+    def sorted_id_by_id(self, id):
+        return self.under_fil.sorted_id_by_id(id)
 
-    #@ft.wraps(_oineus.Filtration_double.get_cell)
-    def get_cell(self, sorted_idx):
-        return self.under_fil.get_cell(sorted_idx)
+    #@ft.wraps(_oineus.Filtration.get_cell)
+    def cell(self, sorted_idx):
+        return self.under_fil.cell(sorted_idx)
 
-    #@ft.wraps(_oineus.Filtration_double.get_simplex)
-    def get_simplex(self, sorted_idx):
-        return self.under_fil.get_simplex(sorted_idx)
+    #@ft.wraps(_oineus.Filtration.get_simplex)
+    def simplex(self, sorted_idx):
+        return self.under_fil.simplex(sorted_idx)
 
-    #@ft.wraps(_oineus.Filtration_double.get_sorting_permutation)
-    def get_sorting_permutation(self):
-        return self.under_fil.get_sorting_permutation()
+    #@ft.wraps(_oineus.Filtration.get_sorting_permutation)
+    def sorting_permutation(self):
+        return self.under_fil.sorting_permutation()
 
-    #@ft.wraps(_oineus.Filtration_double.get_inv_sorting_permutation)
+    #@ft.wraps(_oineus.Filtration.get_inv_sorting_permutation)
     def get_inv_sorting_permutation(self):
-        return self.under_fil.get_inv_sorting_permutation()
+        return self.under_fil.inv_sorting_permutation()
 
-    #@ft.wraps(_oineus.Filtration_double.cell_by_uid)
+    #@ft.wraps(_oineus.Filtration.cell_by_uid)
     def cell_by_uid(self, uid):
         return self.under_fil.cell_by_uid(uid)
 
-    #@ft.wraps(_oineus.Filtration_double.boundary_matrix)
+    #@ft.wraps(_oineus.Filtration.boundary_matrix)
     def boundary_matrix(self, uid):
         return self.under_fil.boundary_matrix(uid)
 
-    #@ft.wraps(_oineus.Filtration_double.simplex_value_by_sorted_id)
+    #@ft.wraps(_oineus.Filtration.simplex_value_by_sorted_id)
     def simplex_value_by_sorted_id(self, sorted_id):
         return self.under_fil.simplex_value_by_sorted_id(sorted_id)
 
-    #@ft.wraps(_oineus.Filtration_double.simplex_value_by_vertices)
-    def simplex_value_by_vertices(self, vertices):
-        return self.under_fil.simplex_value_by_vertices(vertices)
+    #@ft.wraps(_oineus.Filtration.simplex_value_by_vertices)
+    def simplex_value_by_uid(self, uid):
+        return self.under_fil.simplex_value_by_uid(uid)
+
+    def cell_value_by_uid(self, uid):
+        return self.under_fil.cell_value_by_uid(uid)
 
-    #@ft.wraps(_oineus.Filtration_double.get_sorted_id_by_vertices)
-    def get_sorted_id_by_vertices(self, vertices):
-        return self.under_fil.get_sorted_id_by_vertices(vertices)
+    #@ft.wraps(_oineus.Filtration.get_sorted_id_by_vertices)
+    def sorted_id_by_uid(self, uid):
+        return self.under_fil.sorted_id_by_uid(uid)
 
-    #@ft.wraps(_oineus.Filtration_double.reset_ids_to_sorted_ids)
+    #@ft.wraps(_oineus.Filtration.reset_ids_to_sorted_ids)
     def reset_ids_to_sorted_ids(self):
         self.under_fil.reset_ids_to_sorted_ids()
 
 
-
-
 class TopologyOptimizer:
     """
     A wrapper around C++ topology optimizer
@@ -98,11 +103,11 @@ class TopologyOptimizer:
     not to worry about the type of a filtration
     """
     def __init__(self, fil):
-        if isinstance(fil,DiffFiltration): 
+        if isinstance(fil, DiffFiltration):
             fil = fil.under_fil
-        if type(fil) is _oineus.Filtration_double:
+        if type(fil) is _oineus.Filtration:
             self.under_opt = _oineus.TopologyOptimizer(fil)
-        elif type(fil) is _oineus.ProdFiltration_double:
+        elif type(fil) is _oineus.ProdFiltration:
             self.under_opt = _oineus.TopologyOptimizerProd(fil)
         else:
             raise RuntimeError("unknown filtration type in oineus.diff.TopologyOptimizer constructor")
@@ -173,7 +178,7 @@ def min_filtration(fil_1: DiffFiltration, fil_2: DiffFiltration) -> DiffFiltrati
     fil_1_under = fil_1.under_fil
     fil_2_under = fil_2.under_fil
 
-    min_fil_under, inds_1, inds_2 = _oineus.min_filtration_with_indices(fil_1_under, fil_2_under)
+    min_fil_under, inds_1, inds_2 = _oineus._min_filtration_with_indices(fil_1_under, fil_2_under)
 
     inds_1 = np.array(inds_1)
     inds_2 = np.array(inds_2)
@@ -186,7 +191,7 @@ def min_filtration(fil_1: DiffFiltration, fil_2: DiffFiltration) -> DiffFiltrati
     return DiffFiltration(min_fil_under, min_fil_values)
 
 
-def lower_star_freudenthal(data, negate, wrap, max_dim, n_threads):
+def freudenthal_filtration(data, negate, wrap, max_dim, n_threads):
     data = epy.astensor(data)
     dim_part = data.ndim
     np_data = data.float64().numpy()
@@ -198,47 +203,40 @@ def lower_star_freudenthal(data, negate, wrap, max_dim, n_threads):
     return DiffFiltration(fil, values)
 
 
-def vietoris_rips_pts(pts, max_dim, max_radius, eps=1e-6, n_threads=1) -> DiffFiltration:
-    pts = epy.astensor(pts)
-
-    pts_np = pts.float64().numpy()
-
-    type_part = "double"
-    assert(pts.ndim == 2)
-    dim_part = pts.shape[1]
+def vr_filtration(data, from_pwdists: bool=False, max_dim: int=-1, max_diameter: float=-1.0, eps=1e-6, n_threads=8) -> DiffFiltration:
 
-    func = getattr(_oineus, f"get_vr_filtration_and_critical_edges_{type_part}_{dim_part}")
-
-    fil, edges = func(pts_np, max_dim, max_radius, n_threads)
-    edges_s = np.array([e.x for e in edges])
-    edges_t = np.array([e.y for e in edges])
-
-    sqdists = epy.sum((pts[edges_s] - pts[edges_t]) ** 2 + eps, axis=1)
-    dists = epy.sqrt(sqdists).raw
+    data = epy.astensor(data)
+    data_np = data.float64().numpy()
+    assert(data.ndim == 2)
 
-    return DiffFiltration(fil, dists)
+    fil, edges = non_diff_vr_filtration(data=data_np, from_pwdists=from_pwdists, with_critical_edges=True,
+                                        max_dim=max_dim, max_diameter=max_diameter, n_threads=n_threads)
 
+    if not from_pwdists:
+        sqdists = epy.sum((data[edges[:, 0].flatten()] - data[edges[:, 1].flatten()]) ** 2 + eps, axis=1)
+        diff_dists = epy.sqrt(sqdists).raw
 
-def vietoris_rips_pwdists(pwdists, max_dim, max_radius, eps=1e-6, n_threads=1):
-    pwdists = epy.astensor(pwdists)
-    assert(pwdists.ndim == 2 and pwdists.shape[0] == pwdists.shape[1])
-    pwdists_np = pwdists.float64().numpy()
-    type_part = "double"
-    func = getattr(_oineus, f"get_vr_filtration_and_critical_edges_from_pwdists_{type_part}")
-    fil, edges = func(pwdists_np, max_dim, max_radius, n_threads)
-    edges = epy.astensor(np.array([[e.x, e.y] for e in edges], dtype=np.int64))
-    dists = pwdists[edges[:, 0], edges[:, 1]].raw
-    return DiffFiltration(fil, dists)
+        return DiffFiltration(fil, diff_dists)
+    else:
+        edges = epy.astensor(edges)
+        diff_dists = data[edges[:, 0], edges[:, 1]].raw
+        return DiffFiltration(fil, diff_dists)
 
 
 def mapping_cylinder_filtration(fil_domain: DiffFiltration, fil_codomain: DiffFiltration, v_domain, v_codomain) -> DiffFiltration:
     assert(type(fil_domain) is DiffFiltration)
     assert(type(fil_codomain) is DiffFiltration)
 
+    if isinstance(v_domain, _oineus.Simplex):
+        v_domain = v_domain.combinatorial_cell()
+
+    if isinstance(v_codomain, _oineus.Simplex):
+        v_codomain = v_codomain.combinatorial_cell()
+
     under_fil_dom = fil_domain.under_fil
     under_fil_cod = fil_codomain.under_fil
 
-    under_cyl_fil, cyl_val_inds = _oineus.mapping_cylinder_with_indices(under_fil_dom, under_fil_cod, v_domain, v_codomain)
+    under_cyl_fil, cyl_val_inds = _oineus._mapping_cylinder_with_indices(under_fil_dom, under_fil_cod, v_domain, v_codomain)
 
     cyl_val_inds = epy.astensor(np.array(cyl_val_inds, dtype=np.int64))
 
diff --git a/bindings/python/oineus_common.cpp b/bindings/python/oineus_common.cpp
index 0914e3a..7653c1f 100644
--- a/bindings/python/oineus_common.cpp
+++ b/bindings/python/oineus_common.cpp
@@ -1,6 +1,139 @@
 #include "oineus_persistence_bindings.h"
 
-void init_oineus_common_int(py::module& m)
+void init_oineus_common(py::module& m)
 {
-    init_oineus_common<int>(m);
-}
+    using namespace pybind11::literals;
+
+    using VREdge = oin::VREdge<oin_int>;
+
+    using oin::DenoiseStrategy;
+    using oin::ConflictStrategy;
+    using ReductionParams = oin::Params;
+    using KICRParams = oin::KICRParams;
+    std::string vr_edge_name = "VREdge";
+
+    py::class_<VREdge>(m, vr_edge_name.c_str())
+            .def(py::init<oin_int>())
+            .def_readwrite("x", &VREdge::x)
+            .def_readwrite("y", &VREdge::y)
+            .def("__getitem__", [](const VREdge& p, int i) {
+              if (i == 0)
+                  return p.x;
+              else if (i == 1)
+                  return p.y;
+              else
+                  throw std::out_of_range("i must be 0 or 1");
+            })
+            .def("__repr__", [](const VREdge& p) {
+              std::stringstream ss;
+              ss << p;
+              return ss.str();
+            });
+
+    py::class_<ReductionParams>(m, "ReductionParams")
+            .def(py::init<>())
+            .def_readwrite("n_threads", &ReductionParams::n_threads)
+            .def_readwrite("chunk_size", &ReductionParams::chunk_size)
+            .def_readwrite("write_dgms", &ReductionParams::write_dgms)
+            .def_readwrite("sort_dgms", &ReductionParams::sort_dgms)
+            .def_readwrite("clearing_opt", &ReductionParams::clearing_opt)
+            .def_readwrite("acq_rel", &ReductionParams::acq_rel)
+            .def_readwrite("print_time", &ReductionParams::print_time)
+            .def_readwrite("elapsed", &ReductionParams::elapsed)
+            .def_readwrite("compute_v", &ReductionParams::compute_v)
+            .def_readwrite("compute_u", &ReductionParams::compute_u)
+            .def_readwrite("do_sanity_check", &ReductionParams::do_sanity_check)
+            .def_readwrite("verbose", &ReductionParams::verbose)
+            .def("__repr__", [](const ReductionParams& self) { std::stringstream ss; ss << self; return ss.str(); })
+            .def(py::pickle(
+                    // __getstate__
+
+                    [](const ReductionParams& p) {
+                      return py::make_tuple(p.n_threads, p.chunk_size, p.write_dgms,
+                              p.sort_dgms, p.clearing_opt, p.acq_rel, p.print_time, p.compute_v, p.compute_u,
+                              p.do_sanity_check, p.elapsed, p.verbose);
+                    },
+                    // __setstate__
+                    [](py::tuple t) {
+                      if (t.size() != 12)
+                          throw std::runtime_error("Invalid tuple for ReductionParams");
+
+                      ReductionParams p;
+
+                      int i = 0;
+
+                      p.n_threads = t[i++].cast<decltype(p.n_threads)>();
+                      p.chunk_size = t[i++].cast<decltype(p.chunk_size)>();
+                      p.write_dgms = t[i++].cast<decltype(p.write_dgms)>();
+                      p.sort_dgms = t[i++].cast<decltype(p.sort_dgms)>();
+                      p.clearing_opt = t[i++].cast<decltype(p.clearing_opt)>();
+                      p.acq_rel = t[i++].cast<decltype(p.acq_rel)>();
+                      p.print_time = t[i++].cast<decltype(p.print_time)>();
+                      p.compute_v = t[i++].cast<decltype(p.compute_v)>();
+                      p.compute_u = t[i++].cast<decltype(p.compute_u)>();
+                      p.do_sanity_check = t[i++].cast<decltype(p.do_sanity_check)>();
+
+                      p.elapsed = t[i++].cast<decltype(p.elapsed)>();
+                      p.verbose = t[i++].cast<decltype(p.verbose)>();
+
+                      return p;
+                    }));
+
+    py::class_<KICRParams>(m, "KICRParams")
+            .def(py::init<>())
+            .def_readwrite("kernel", &KICRParams::kernel)
+            .def_readwrite("image", &KICRParams::image)
+            .def_readwrite("cokernel", &KICRParams::cokernel)
+            .def_readwrite("include_zero_persistence", &KICRParams::include_zero_persistence)
+            .def_readwrite("verbose", &KICRParams::verbose)
+            .def_readwrite("params_f", &KICRParams::params_f)
+            .def_readwrite("params_g", &KICRParams::params_g)
+            .def_readwrite("params_ker", &KICRParams::params_ker)
+            .def_readwrite("params_im", &KICRParams::params_ker)
+            .def_readwrite("params_cok", &KICRParams::params_ker)
+            .def("__repr__", [](const KICRParams& self) { std::stringstream ss; ss << self; return ss.str(); })
+            .def(py::pickle(
+                    // __getstate__
+
+                    [](const KICRParams& p) {
+                      return py::make_tuple(p.kernel, p.image, p.cokernel, p.include_zero_persistence, p.verbose,
+                              p.params_f, p.params_g, p.params_ker, p.params_im, p.params_cok);
+                    },
+                    // __setstate__
+                    [](py::tuple t) {
+                      if (t.size() != 10)
+                          throw std::runtime_error("Invalid tuple for KICRParams");
+
+                      KICRParams p;
+
+                      int i = 0;
+
+                      p.kernel = t[i++].cast<decltype(p.kernel)>();
+                      p.image = t[i++].cast<decltype(p.image)>();
+                      p.cokernel = t[i++].cast<decltype(p.cokernel)>();
+                      p.include_zero_persistence = t[i++].cast<decltype(p.include_zero_persistence)>();
+                      p.verbose = t[i++].cast<decltype(p.verbose)>();
+                      p.params_f = t[i++].cast<decltype(p.params_f)>();
+                      p.params_g = t[i++].cast<decltype(p.params_g)>();
+                      p.params_ker = t[i++].cast<decltype(p.params_ker)>();
+                      p.params_im = t[i++].cast<decltype(p.params_im)>();
+                      p.params_cok = t[i++].cast<decltype(p.params_cok)>();
+
+                      return p;
+                    }));
+
+    py::enum_<DenoiseStrategy>(m, "DenoiseStrategy", py::arithmetic())
+            .value("BirthBirth", DenoiseStrategy::BirthBirth, "(b, d) maps to (b, b)")
+            .value("DeathDeath", DenoiseStrategy::DeathDeath, "(b, d) maps to (d, d)")
+            .value("Midway", DenoiseStrategy::Midway, "((b, d) maps to ((b+d)/2, (b+d)/2)")
+            .def("as_str", [](const DenoiseStrategy& self) { return denoise_strategy_to_string(self); });
+
+    py::enum_<ConflictStrategy>(m, "ConflictStrategy", py::arithmetic())
+            .value("Max", ConflictStrategy::Max, "choose maximal displacement")
+            .value("Avg", ConflictStrategy::Avg, "average gradients")
+            .value("Sum", ConflictStrategy::Sum, "sum gradients")
+            .value("FixCritAvg", ConflictStrategy::FixCritAvg, "use matching on critical, average gradients on other cells")
+            .def("as_str", [](const ConflictStrategy& self) { return conflict_strategy_to_string(self); });
+
+
+}
\ No newline at end of file
diff --git a/bindings/python/oineus_decomposition.cpp b/bindings/python/oineus_decomposition.cpp
index 8011bc2..3047616 100644
--- a/bindings/python/oineus_decomposition.cpp
+++ b/bindings/python/oineus_decomposition.cpp
@@ -1,6 +1,36 @@
 #include "oineus_persistence_bindings.h"
 
-void init_oineus_common_decomposition_int(py::module& m)
+void init_oineus_common_decomposition(py::module& m)
 {
-    init_oineus_common_decomposition<int>(m);
-}
+    using namespace pybind11::literals;
+
+    using Decomposition = oin::VRUDecomposition<oin_int>;
+    using Simplex = oin::Simplex<oin_int>;
+    using SimplexFiltration = oin::Filtration<Simplex, oin_real>;
+    using ProdSimplex = oin::ProductCell<Simplex, Simplex>;
+    using ProdSimplexFiltration = oin::Filtration<ProdSimplex, oin_real>;
+
+    py::class_<Decomposition>(m, "Decomposition")
+            .def(py::init<const SimplexFiltration&, bool, int>(), py::arg("filtration"), py::arg("dualize"), py::arg("n_threads")=4)
+            .def(py::init<const ProdSimplexFiltration&, bool, int>(), py::arg("filtration"), py::arg("dualize"), py::arg("n_threads")=4)
+            .def(py::init<const typename Decomposition::MatrixData&, size_t, bool>())
+            .def_readwrite("r_data", &Decomposition::r_data)
+            .def_readwrite("v_data", &Decomposition::v_data)
+            .def_readwrite("u_data_t", &Decomposition::u_data_t)
+            .def_readwrite("d_data", &Decomposition::d_data)
+            .def("reduce", &Decomposition::reduce, py::arg("params")=oin::Params(), py::call_guard<py::gil_scoped_release>())
+            .def("sanity_check", &Decomposition::sanity_check, py::call_guard<py::gil_scoped_release>())
+            .def("diagram", [](const Decomposition& self, const SimplexFiltration& fil, bool include_inf_points)
+                            { return PyOineusDiagrams<oin_real>(self.diagram(fil, include_inf_points)); },
+                    py::arg("fil"), py::arg("include_inf_points") = true)
+            .def("diagram", [](const Decomposition& self, const ProdSimplexFiltration& fil, bool include_inf_points) { return PyOineusDiagrams<oin_real>(self.diagram(fil,
+                    include_inf_points)); },
+                    py::arg("fil"), py::arg("include_inf_points") = true)
+            .def("zero_pers_diagram", [](const Decomposition& self, const SimplexFiltration& fil) { return PyOineusDiagrams<oin_real>(self.zero_persistence_diagram(fil)); },
+                    py::arg("fil"))
+            .def("zero_pers_diagram", [](const Decomposition& self, const ProdSimplexFiltration& fil) { return PyOineusDiagrams<oin_real>(self.zero_persistence_diagram(fil)); },
+                    py::arg("fil"))
+            .def("filtration_index", &Decomposition::filtration_index, py::arg("matrix_index"))
+                    ;
+
+}
\ No newline at end of file
diff --git a/bindings/python/oineus_fil_dgm_simplex_double.cpp b/bindings/python/oineus_fil_dgm_simplex_double.cpp
deleted file mode 100644
index 4a14930..0000000
--- a/bindings/python/oineus_fil_dgm_simplex_double.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "oineus_persistence_bindings.h"
-
-void init_oineus_fil_dgm_simplex_double(py::module& m, std::string suffix)
-{
-    init_oineus_fil_dgm_simplex<int, double>(m, suffix);
-}
diff --git a/bindings/python/oineus_fil_dgm_simplex_float.cpp b/bindings/python/oineus_fil_dgm_simplex_float.cpp
deleted file mode 100644
index bad60ce..0000000
--- a/bindings/python/oineus_fil_dgm_simplex_float.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "oineus_persistence_bindings.h"
-
-void init_oineus_fil_dgm_simplex_float(py::module& m, std::string suffix)
-{
-    init_oineus_fil_dgm_simplex<int, float>(m, suffix);
-}
diff --git a/bindings/python/oineus_functions.cpp b/bindings/python/oineus_functions.cpp
new file mode 100644
index 0000000..ee31e2e
--- /dev/null
+++ b/bindings/python/oineus_functions.cpp
@@ -0,0 +1,74 @@
+#include "oineus_persistence_bindings.h"
+
+void init_oineus_functions(py::module& m)
+{
+    using namespace pybind11::literals;
+
+    using Simp = oin::Simplex<oin_int>;
+    using SimpProd = oin::ProductCell<Simp, Simp>;
+    using Filtration = oin::Filtration<Simp, oin_real>;
+
+    using oin::VREdge;
+
+    std::string func_name;
+
+    // Lower-star Freudenthal filtration
+    func_name = "get_freudenthal_filtration";
+    m.def(func_name.c_str(), &get_fr_filtration<oin_int, oin_real>,
+            py::arg("data"), py::arg("negate") = false, py::arg("wrap") = false, py::arg("max_dim") = 3, py::arg("n_threads") = 1);
+
+    func_name = "get_freudenthal_filtration_and_crit_vertices";
+    m.def(func_name.c_str(), &get_fr_filtration_and_critical_vertices<oin_int, oin_real>,
+            py::arg("data"), py::arg("negate") = false, py::arg("wrap") = false, py::arg("max_dim") = 3, py::arg("n_threads") = 1);
+
+    // Vietoris--Rips filtration
+    // Reasonable default (dimension of points) for max_dim is provided in Python
+    // in C++, max_diameter is +\infty by default
+    // Reasonable default (diameter of point cloud)
+    // is provided in __init__.py on pure Python level
+    func_name = "get_vr_filtration";
+    m.def(func_name.c_str(), &get_vr_filtration<oin_int, oin_real>,
+            py::arg("points"), py::arg("max_dim"), py::arg("max_diameter")=std::numeric_limits<oin_real>::max(), py::arg("n_threads")=1);
+
+    func_name = "get_vr_filtration_and_critical_edges";
+    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges<oin_int, oin_real>,
+            py::arg("points"), py::arg("max_dim"), py::arg("max_diameter")=std::numeric_limits<oin_real>::max(), py::arg("n_threads")=1);
+
+    func_name = "get_vr_filtration_from_pwdists";
+    m.def(func_name.c_str(), &get_vr_filtration_from_pwdists<oin_int, oin_real>,
+            py::arg("pwdists"), py::arg("max_dim"), py::arg("max_diameter")=std::numeric_limits<oin_real>::max(), py::arg("n_threads")=1);
+
+    func_name = "get_vr_filtration_and_critical_edges_from_pwdists";
+    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges_from_pwdists<oin_int, oin_real>,
+            py::arg("pwdists"), py::arg("max_dim"), py::arg("max_diameter")=std::numeric_limits<oin_real>::max(), py::arg("n_threads")=1);
+
+    // boundary matrix as vector of columns
+    func_name = "get_boundary_matrix";
+    m.def(func_name.c_str(), &get_boundary_matrix<oin_int, oin_real>);
+
+    // target values
+    func_name = "get_denoise_target";
+    m.def(func_name.c_str(), &oin::get_denoise_target<Simp, oin_real>);
+
+    func_name = "get_nth_persistence";
+    m.def(func_name.c_str(), &oin::get_nth_persistence<Simp, oin_real>);
+
+    // to get permutation for Warm Starts
+    func_name = "get_permutation";
+    m.def(func_name.c_str(), &oin::targets_to_permutation<Simp, oin_real>);
+
+    func_name = "get_permutation_dtv";
+    m.def(func_name.c_str(), &oin::targets_to_permutation_dtv<Simp, oin_real>);
+
+    func_name = "list_to_filtration";
+    m.def(func_name.c_str(), &list_to_filtration<oin_int, oin_real>);
+
+    func_name = "get_ls_filtration";
+    m.def(func_name.c_str(), &get_ls_filtration<oin_int, oin_real>);
+
+    func_name = "compute_relative_diagrams";
+    m.def(func_name.c_str(), &compute_relative_diagrams<Simp, oin_real>, py::arg("fil"), py::arg("rel"), py::arg("include_inf_points")=true);
+
+    func_name = "compute_relative_diagrams";
+    m.def(func_name.c_str(), &compute_relative_diagrams<SimpProd, oin_real>, py::arg("fil"), py::arg("rel"), py::arg("include_inf_points")=true);
+}
diff --git a/bindings/python/oineus_functions_double.cpp b/bindings/python/oineus_functions_double.cpp
deleted file mode 100644
index 4a7e5a1..0000000
--- a/bindings/python/oineus_functions_double.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "oineus_persistence_bindings.h"
-
-void init_oineus_functions_double(py::module& m, std::string suffix)
-{
-    init_oineus_functions<int, double>(m, suffix);
-}
diff --git a/bindings/python/oineus_functions_float.cpp b/bindings/python/oineus_functions_float.cpp
deleted file mode 100644
index c45a68e..0000000
--- a/bindings/python/oineus_functions_float.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "oineus_persistence_bindings.h"
-
-void init_oineus_functions_float(py::module& m, std::string suffix)
-{
-    init_oineus_functions<int, float>(m, suffix);
-}
diff --git a/bindings/python/oineus_index_diagram.cpp b/bindings/python/oineus_index_diagram.cpp
deleted file mode 100644
index 4ff084b..0000000
--- a/bindings/python/oineus_index_diagram.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "oineus_persistence_bindings.h"
-
-void init_oineus_common_diagram_int(py::module& m)
-{
-    init_oineus_common_diagram<int>(m);
-}
diff --git a/bindings/python/oineus_persistence_bindings.h b/bindings/python/oineus_persistence_bindings.h
index 34a510f..983052f 100644
--- a/bindings/python/oineus_persistence_bindings.h
+++ b/bindings/python/oineus_persistence_bindings.h
@@ -6,6 +6,7 @@
 #include <sstream>
 #include <variant>
 #include <functional>
+#include <type_traits>
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -13,9 +14,33 @@
 
 namespace py = pybind11;
 
+#define OINEUS_CHECK_FOR_PYTHON_INTERRUPT {if (PyErr_CheckSignals() != 0) throw py::error_already_set();}
+#define OINEUS_CHECK_FOR_PYTHON_INTERRUPT_WITH_GIL { py::gil_scoped_acquire acq; {if (PyErr_CheckSignals() != 0) throw py::error_already_set();} }
+
 #include <oineus/timer.h>
 #include <oineus/oineus.h>
 
+#ifndef OINEUS_PYTHON_INT
+#define OINEUS_PYTHON_INT long int
+#endif
+
+#ifndef OINEUS_PYTHON_REAL
+#define OINEUS_PYTHON_REAL double
+#endif
+
+using oin_int = OINEUS_PYTHON_INT;
+using oin_real = OINEUS_PYTHON_REAL;
+
+
+static_assert(std::is_same<oin_int, int>::value ||
+              std::is_same<oin_int, long int>::value ||
+              std::is_same<oin_int, long long int>::value,
+              "OINEUS_PYTHON_INT must be one of the int, long, long long");
+
+static_assert(std::is_same<oin_real, float>::value ||
+              std::is_same<oin_real, double>::value,
+              "OINEUS_PYTHON_REAL must be one of the float, double");
+
 namespace oin = oineus;
 using dim_type = oin::dim_type;
 
@@ -107,20 +132,39 @@ get_grid(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool
     return Grid(dims, wrap, pdata);
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 decltype(auto)
 get_fr_filtration(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool negate, bool wrap, dim_type max_dim, int n_threads)
 {
-    auto grid = get_grid<Int, Real, D>(data, wrap);
-    return grid.freudenthal_filtration(max_dim, negate, n_threads);
+    dim_type d = data.ndim();
+    if (d == 1) {
+        return get_grid<Int, Real, 1>(data, wrap).freudenthal_filtration(max_dim, negate, n_threads);
+    } else if (d == 2) {
+        return get_grid<Int, Real, 2>(data, wrap).freudenthal_filtration(max_dim, negate, n_threads);
+    } else if (d == 3) {
+        return get_grid<Int, Real, 3>(data, wrap).freudenthal_filtration(max_dim, negate, n_threads);
+    } else if (d == 4) {
+        return get_grid<Int, Real, 4>(data, wrap).freudenthal_filtration(max_dim, negate, n_threads);
+    } else {
+        throw std::runtime_error("get_fr_filtration: dimension not supported by default, manual modification needed");
+    }
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 decltype(auto)
 get_fr_filtration_and_critical_vertices(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool negate, bool wrap, dim_type max_dim, int n_threads)
-{
-    auto grid = get_grid<Int, Real, D>(data, wrap);
-    return grid.freudenthal_filtration_and_critical_vertices(max_dim, negate, n_threads);
+{    dim_type d = data.ndim();
+    if (d == 1) {
+        return get_grid<Int, Real, 1>(data, wrap).freudenthal_filtration_and_critical_vertices(max_dim, negate, n_threads);
+    } else if (d == 2) {
+        return get_grid<Int, Real, 2>(data, wrap).freudenthal_filtration_and_critical_vertices(max_dim, negate, n_threads);
+    } else if (d == 3) {
+        return get_grid<Int, Real, 3>(data, wrap).freudenthal_filtration_and_critical_vertices(max_dim, negate, n_threads);
+    } else if (d == 4) {
+        return get_grid<Int, Real, 4>(data, wrap).freudenthal_filtration_and_critical_vertices(max_dim, negate, n_threads);
+    } else {
+        throw std::runtime_error("get_fr_filtration: dimension not supported by default, manual modification needed");
+    }
 }
 
 template<class Real, size_t D>
@@ -226,15 +270,32 @@ get_ls_filtration(const py::list& simplices, const py::array_t<Real>& vertex_val
     return Fil(std::move(fil_simplices), negate, n_threads);
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 decltype(auto)
-get_vr_filtration(py::array_t<Real, py::array::c_style | py::array::forcecast> points, dim_type max_dim, Real max_radius, int n_threads)
+get_vr_filtration(py::array_t<Real, py::array::c_style | py::array::forcecast> points, dim_type max_dim, Real max_diameter, int n_threads)
 {
-    return oin::get_vr_filtration<Int, Real, D>(numpy_to_point_vector<Real, D>(points), max_dim, max_radius, n_threads);
+     if (points.ndim() != 2)
+        throw std::runtime_error("get_vr_filtration: expected 2D array");
+
+    dim_type d = points.shape(1);
+
+    if (d == 1) {
+        return oin::get_vr_filtration<Int, Real, 1>(numpy_to_point_vector<Real, 1>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 2) {
+        return oin::get_vr_filtration<Int, Real, 2>(numpy_to_point_vector<Real, 2>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 3) {
+        return oin::get_vr_filtration<Int, Real, 3>(numpy_to_point_vector<Real, 3>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 4) {
+        return oin::get_vr_filtration<Int, Real, 4>(numpy_to_point_vector<Real, 4>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 5) {
+        return oin::get_vr_filtration<Int, Real, 5>(numpy_to_point_vector<Real, 5>(points), max_dim, max_diameter, n_threads);
+    } else {
+        throw std::runtime_error("get_vr_filtration: dimension not supported by default, recompilation needed");
+    }
 }
 
 template<class Int, class Real>
-decltype(auto) get_vr_filtration_and_critical_edges_from_pwdists(py::array_t<Real, py::array::c_style | py::array::forcecast> pw_dists, dim_type max_dim, Real max_radius, int n_threads)
+decltype(auto) get_vr_filtration_and_critical_edges_from_pwdists(py::array_t<Real, py::array::c_style | py::array::forcecast> pw_dists, dim_type max_dim, Real max_diameter, int n_threads)
 {
     if (pw_dists.ndim() != 2 or pw_dists.shape(0) != pw_dists.shape(1))
         throw std::runtime_error("Dimension mismatch");
@@ -247,13 +308,13 @@ decltype(auto) get_vr_filtration_and_critical_edges_from_pwdists(py::array_t<Rea
 
     oin::DistMatrix<Real> dist_matrix {pdata, n_points};
 
-    return oin::get_vr_filtration_and_critical_edges<Int, Real>(dist_matrix, max_dim, max_radius, n_threads);
+    return oin::get_vr_filtration_and_critical_edges<Int, Real>(dist_matrix, max_dim, max_diameter, n_threads);
 }
 
 template<class Int, class Real>
-decltype(auto) get_vr_filtration_from_pwdists(py::array_t<Real, py::array::c_style | py::array::forcecast> pw_dists, dim_type max_dim, Real max_radius, int n_threads)
+decltype(auto) get_vr_filtration_from_pwdists(py::array_t<Real, py::array::c_style | py::array::forcecast> pw_dists, dim_type max_dim, Real max_diameter, int n_threads)
 {
-    return get_vr_filtration_and_critical_edges_from_pwdists<Int, Real>(pw_dists, max_dim, max_radius, n_threads).first;
+    return get_vr_filtration_and_critical_edges_from_pwdists<Int, Real>(pw_dists, max_dim, max_diameter, n_threads).first;
 }
 
 template<class Cell, class Real>
@@ -285,7 +346,7 @@ compute_relative_diagrams(const oineus::Filtration<Cell, Real>& fil, const oineu
         relative_.insert(sigma.get_uid());
     }
 
-    auto rel_matrix = fil.boundary_matrix_full_rel(relative_);
+    auto rel_matrix = fil.boundary_matrix_rel(relative_);
     oineus::VRUDecomposition<Int> d_matrix {rel_matrix, false};
 
     oineus::Params params;
@@ -299,19 +360,36 @@ compute_relative_diagrams(const oineus::Filtration<Cell, Real>& fil, const oineu
     return PyOineusDiagrams<Real>(d_matrix.diagram(fil, relative_, include_inf_points));
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 decltype(auto)
-get_vr_filtration_and_critical_edges(py::array_t<Real, py::array::c_style | py::array::forcecast> points, dim_type max_dim, Real max_radius, int n_threads)
+get_vr_filtration_and_critical_edges(py::array_t<Real, py::array::c_style | py::array::forcecast> points, dim_type max_dim, Real max_diameter, int n_threads)
 {
-    return oin::get_vr_filtration_and_critical_edges<Int, Real, D>(numpy_to_point_vector<Real, D>(points), max_dim, max_radius, n_threads);
+    if (points.ndim() != 2)
+        throw std::runtime_error("get_vr_filtration_and_critical_edges: expected 2D array");
+
+    dim_type d = points.shape(1);
+
+    if (d == 1) {
+        return oin::get_vr_filtration_and_critical_edges<Int, Real, 1>(numpy_to_point_vector<Real, 1>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 2) {
+        return oin::get_vr_filtration_and_critical_edges<Int, Real, 2>(numpy_to_point_vector<Real, 2>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 3) {
+        return oin::get_vr_filtration_and_critical_edges<Int, Real, 3>(numpy_to_point_vector<Real, 3>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 4) {
+        return oin::get_vr_filtration_and_critical_edges<Int, Real, 4>(numpy_to_point_vector<Real, 4>(points), max_dim, max_diameter, n_threads);
+    } else if (d == 5) {
+        return oin::get_vr_filtration_and_critical_edges<Int, Real, 5>(numpy_to_point_vector<Real, 5>(points), max_dim, max_diameter, n_threads);
+    } else {
+        throw std::runtime_error("get_vr_filtration_and_critical_edges: dimension " + std::to_string(d) + " not supported by default, recompilation needed");
+    }
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 typename oin::VRUDecomposition<Int>::MatrixData
 get_boundary_matrix(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool negate, bool wrap, dim_type max_dim, int n_threads)
 {
-    auto fil = get_fr_filtration<Int, Real, D>(data, negate, wrap, max_dim, n_threads);
-    return fil.boundary_matrix_full();
+    auto fil = get_fr_filtration<Int, Real>(data, negate, wrap, max_dim, n_threads);
+    return fil.boundary_matrix();
 }
 
 template<class Int, class Real, size_t D>
@@ -319,17 +397,17 @@ typename oin::VRUDecomposition<Int>::MatrixData
 get_coboundary_matrix(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool negate, bool wrap, dim_type max_dim, int n_threads)
 {
     auto fil = get_fr_filtration<Int, Real, D>(data, negate, wrap, max_dim, n_threads);
-    auto bm = fil.boundary_matrix_full();
+    auto bm = fil.boundary_matrix();
     return oin::antitranspose(bm);
 }
 
-template<class Int, class Real, size_t D>
+template<class Int, class Real>
 PyOineusDiagrams<Real>
 compute_diagrams_ls_freudenthal(py::array_t<Real, py::array::c_style | py::array::forcecast> data, bool negate, bool wrap, dim_type max_dim, oin::Params& params, bool include_inf_points, bool dualize)
 {
     // for diagram in dimension d, we need (d+1)-cells
     Timer timer;
-    auto fil = get_fr_filtration<Int, Real, D>(data, negate, wrap, max_dim + 1, params.n_threads);
+    auto fil = get_fr_filtration<Int, Real>(data, negate, wrap, max_dim + 1, params.n_threads);
     auto elapsed_fil = timer.elapsed_reset();
     oin::VRUDecomposition<Int> decmp {fil, dualize};
     auto elapsed_decmp_ctor = timer.elapsed_reset();
@@ -346,654 +424,34 @@ compute_diagrams_ls_freudenthal(py::array_t<Real, py::array::c_style | py::array
 
 
 template<typename C, typename Real>
-decltype(auto) compute_kernel_image_cokernel_reduction(py::list K_, py::list L_, py::list IdMap_,
-        oin::Params& params) //take a list of cells and turn it into a filtration for oineus. The list should contain cells in the form '[id, [boundary], filtration value].
+oin::KerImCokReduced<C, Real, 2> compute_kernel_image_cokernel_reduction(const oin::Filtration<C, Real>& K, const oin::Filtration<C, Real>& L, oin::Params& params)
 {
     using Int = typename C::Int;
     using Filtration = oin::Filtration<C, Real>;
     using KICR = oin::KerImCokReduced<C, Real, 2>;
 
-    std::cout << "======================================" << std::endl;
-    std::cout << std::endl;
-    std::cout << "You have called \'compute_kernel_image_cokernel_reduction\', it takes as input a complex K, and a subcomplex L, as lists of cells in the format:" << std::endl;
-    std::cout << "          [id, [boundary], filtration value]" << std::endl;
-    std::cout << "and a mapping from L to K, which takes the id of a cell in L and returns the id of the cell in K, as well as an integer, telling oineus how many threads to use." << std::endl;
-    std::cout << std::endl;
-    std::cout << "======================================" << std::endl;
-    std::cout << std::endl;
-
-    std::cout << "------------ Importing K ------------" << std::endl;
-    Filtration K = list_to_filtration<Int, Real>(K_);
-    std::cout << "------------ Importing L ------------" << std::endl;
-    Filtration L = list_to_filtration<Int, Real>(L_);
-
-    int n_L = IdMap_.size();
-    std::vector<int> IdMapping;
-
-    for(int i = 0 ; i < n_L ; i++) {
-        int i_map = IdMap_[i].cast<int>();
-        IdMapping.push_back(i_map);
-    }
-
-    if (params.verbose) {
-        std::cout << "---------- Map from L to K ----------" << std::endl;
-        for(int i = 0 ; i < n_L ; i++) {
-            std::cout << "Cell " << i << " in L is mapped to cell " << IdMapping[i] << " in K." << std::endl;
-        }
-    }
-
     params.sort_dgms = false;
     params.clearing_opt = false;
 
-    params.kernel = params.image = params.cokernel = true;
+    oin::KICRParams kicr_params;
+    kicr_params.verbose = params.verbose;
+    kicr_params.kernel = kicr_params.image = kicr_params.cokernel = true;
+    kicr_params.params_f = kicr_params.params_g = params;
+    kicr_params.params_ker = kicr_params.params_cok = kicr_params.params_im = params;
 
-    KICR result { K, L, params };
+    KICR result { K, L, kicr_params };
 
     return result;
 }
 
 
-template<class Int>
-void init_oineus_common(py::module& m)
-{
-    using namespace pybind11::literals;
-
-    using oin::VREdge;
-
-    using oin::DenoiseStrategy;
-    using oin::ConflictStrategy;
-    using Decomposition = oin::VRUDecomposition<Int>;
-    using ReductionParams = oin::Params;
-    std::string vr_edge_name = "VREdge";
-
-    py::class_<VREdge>(m, vr_edge_name.c_str())
-            .def(py::init<Int>())
-            .def_readwrite("x", &VREdge::x)
-            .def_readwrite("y", &VREdge::y)
-            .def("__getitem__", [](const VREdge& p, int i) {
-              if (i == 0)
-                  return p.x;
-              else if (i == 1)
-                  return p.y;
-              else
-                  throw std::out_of_range("i must be 0 or 1");
-            })
-            .def("__repr__", [](const VREdge& p) {
-              std::stringstream ss;
-              ss << p;
-              return ss.str();
-            });
-
-    py::class_<ReductionParams>(m, "ReductionParams")
-            .def(py::init<>())
-            .def_readwrite("n_threads", &ReductionParams::n_threads)
-            .def_readwrite("chunk_size", &ReductionParams::chunk_size)
-            .def_readwrite("write_dgms", &ReductionParams::write_dgms)
-            .def_readwrite("sort_dgms", &ReductionParams::sort_dgms)
-            .def_readwrite("clearing_opt", &ReductionParams::clearing_opt)
-            .def_readwrite("acq_rel", &ReductionParams::acq_rel)
-            .def_readwrite("print_time", &ReductionParams::print_time)
-            .def_readwrite("elapsed", &ReductionParams::elapsed)
-            .def_readwrite("compute_v", &ReductionParams::compute_v)
-            .def_readwrite("compute_u", &ReductionParams::compute_u)
-            .def_readwrite("do_sanity_check", &ReductionParams::do_sanity_check)
-            .def_readwrite("kernel", &ReductionParams::kernel)
-            .def_readwrite("image", &ReductionParams::image)
-            .def_readwrite("cokernel", &ReductionParams::cokernel)
-            .def_readwrite("verbose", &ReductionParams::verbose)
-            .def("__repr__", [](const ReductionParams& self) { std::stringstream ss; ss << self; return ss.str(); })
-            .def(py::pickle(
-                    // __getstate__
-
-                    [](const ReductionParams& p) {
-                      return py::make_tuple(p.n_threads, p.chunk_size, p.write_dgms,
-                              p.sort_dgms, p.clearing_opt, p.acq_rel, p.print_time, p.compute_v, p.compute_u,
-                              p.do_sanity_check, p.elapsed, p.kernel, p.image, p.cokernel, p.verbose);
-                    },
-                    // __setstate__
-                    [](py::tuple t) {
-                      if (t.size() != 15)
-                          throw std::runtime_error("Invalid tuple for ReductionParams");
-
-                      ReductionParams p;
-
-                      int i = 0;
-
-                      p.n_threads = t[i++].cast<decltype(p.n_threads)>();
-                      p.chunk_size = t[i++].cast<decltype(p.chunk_size)>();
-                      p.write_dgms = t[i++].cast<decltype(p.write_dgms)>();
-                      p.sort_dgms = t[i++].cast<decltype(p.sort_dgms)>();
-                      p.clearing_opt = t[i++].cast<decltype(p.clearing_opt)>();
-                      p.acq_rel = t[i++].cast<decltype(p.acq_rel)>();
-                      p.print_time = t[i++].cast<decltype(p.print_time)>();
-                      p.compute_v = t[i++].cast<decltype(p.compute_v)>();
-                      p.compute_u = t[i++].cast<decltype(p.compute_u)>();
-                      p.do_sanity_check = t[i++].cast<decltype(p.do_sanity_check)>();
-
-                      p.elapsed = t[i++].cast<decltype(p.elapsed)>();
-                      p.kernel = t[i++].cast<decltype(p.kernel)>();
-                      p.image = t[i++].cast<decltype(p.image)>();
-                      p.cokernel = t[i++].cast<decltype(p.cokernel)>();
-                      p.verbose = t[i++].cast<decltype(p.verbose)>();
-
-                      return p;
-                    }));
-
-    py::enum_<DenoiseStrategy>(m, "DenoiseStrategy", py::arithmetic())
-            .value("BirthBirth", DenoiseStrategy::BirthBirth, "(b, d) maps to (b, b)")
-            .value("DeathDeath", DenoiseStrategy::DeathDeath, "(b, d) maps to (d, d)")
-            .value("Midway", DenoiseStrategy::Midway, "((b, d) maps to ((b+d)/2, (b+d)/2)")
-            .def("as_str", [](const DenoiseStrategy& self) { return denoise_strategy_to_string(self); });
-
-    py::enum_<ConflictStrategy>(m, "ConflictStrategy", py::arithmetic())
-            .value("Max", ConflictStrategy::Max, "choose maximal displacement")
-            .value("Avg", ConflictStrategy::Avg, "average gradients")
-            .value("Sum", ConflictStrategy::Sum, "sum gradients")
-            .value("FixCritAvg", ConflictStrategy::FixCritAvg, "use matching on critical, average gradients on other cells")
-            .def("as_str", [](const ConflictStrategy& self) { return conflict_strategy_to_string(self); });
-
-    using Simp = oin::Simplex<Int>;
-
-    py::class_<Simp>(m, "Simplex")
-            .def(py::init([](typename Simp::IdxVector vs) -> Simp {
-                      return Simp({vs});
-                    }),
-                    py::arg("vertices"))
-            .def(py::init([](Int id, typename Simp::IdxVector vs) -> Simp {
-              return Simp(id, vs);
-            }), py::arg("id"), py::arg("vertices"))
-            .def_property("id", &Simp::get_id, &Simp::set_id)
-            .def_property("vertices", &Simp::get_uid, &Simp::set_uid)
-            .def("get_uid", &Simp::get_uid)
-            .def("dim", &Simp::dim)
-            .def("boundary", &Simp::boundary)
-            .def("join", [](const Simp& sigma, Int new_vertex, Int new_id) {
-                      return sigma.join(new_id, new_vertex);
-                    },
-                    py::arg("new_vertex"),
-                    py::arg("new_id") = Simp::k_invalid_id)
-            .def("__repr__", [](const Simp& sigma) {
-              std::stringstream ss;
-              ss << sigma;
-              return ss.str();
-            });
-
-    using ProdSimplex = oin::ProductCell<Simp, Simp>;
-
-    py::class_<ProdSimplex>(m, "SimplexProduct")
-            .def(py::init([](const Simp& s1, const Simp& s2) -> ProdSimplex {
-                      return {s1, s2};
-                    }),
-                    py::arg("simplex_1"), py::arg("simplex_2"))
-            .def_property("id", &ProdSimplex::get_id, &ProdSimplex::set_id)
-            .def("get_uid", &ProdSimplex::get_uid)
-            .def("dim", &ProdSimplex::dim)
-            .def("boundary", &ProdSimplex::boundary)
-            .def("__repr__", [](const ProdSimplex& sigma) {
-              std::stringstream ss;
-              ss << sigma;
-              return ss.str();
-            });
-
-}
-
-void init_oineus_common_int(py::module& m);
-
-template<class Int>
-void init_oineus_common_decomposition(py::module& m)
-{
-    using namespace pybind11::literals;
-
-    using oin::VREdge;
-    using oin::DenoiseStrategy;
-    using oin::ConflictStrategy;
-    using Decomposition = oin::VRUDecomposition<Int>;
-    using Simplex = oin::Simplex<Int>;
-    using SimplexFiltrationDouble = oin::Filtration<Simplex, double>;
-    using SimplexFiltrationFloat = oin::Filtration<Simplex, float>;
-    using ProdSimplex = oin::ProductCell<Simplex, Simplex>;
-    using ProdSimplexFiltrationDouble = oin::Filtration<ProdSimplex, double>;
-    using ProdSimplexFiltrationFloat = oin::Filtration<ProdSimplex, float>;
-    std::string vr_edge_name = "VREdge";
-
-    py::class_<Decomposition>(m, "Decomposition")
-            .def(py::init<const SimplexFiltrationDouble&, bool>())
-            .def(py::init<const SimplexFiltrationFloat&, bool>())
-            .def(py::init<const ProdSimplexFiltrationDouble&, bool>())
-            .def(py::init<const ProdSimplexFiltrationFloat&, bool>())
-            .def(py::init<const typename Decomposition::MatrixData&, size_t, bool>())
-            .def_readwrite("r_data", &Decomposition::r_data)
-            .def_readwrite("v_data", &Decomposition::v_data)
-            .def_readwrite("u_data_t", &Decomposition::u_data_t)
-            .def_readwrite("d_data", &Decomposition::d_data)
-            .def("reduce", &Decomposition::reduce, py::call_guard<py::gil_scoped_release>())
-            .def("sanity_check", &Decomposition::sanity_check, py::call_guard<py::gil_scoped_release>())
-            .def("diagram", [](const Decomposition& self, const SimplexFiltrationDouble& fil, bool include_inf_points) { return PyOineusDiagrams<double>(self.diagram(fil, include_inf_points)); },
-                    py::arg("fil"), py::arg("include_inf_points") = true)
-            .def("diagram", [](const Decomposition& self, const SimplexFiltrationFloat& fil, bool include_inf_points) { return PyOineusDiagrams<float>(self.diagram(fil, include_inf_points)); },
-                    py::arg("fil"), py::arg("include_inf_points") = true)
-            .def("diagram", [](const Decomposition& self, const ProdSimplexFiltrationDouble& fil, bool include_inf_points) { return PyOineusDiagrams<double>(self.diagram(fil, include_inf_points)); },
-                    py::arg("fil"), py::arg("include_inf_points") = true)
-            .def("diagram", [](const Decomposition& self, const ProdSimplexFiltrationFloat& fil, bool include_inf_points) { return PyOineusDiagrams<float>(self.diagram(fil, include_inf_points)); },
-                    py::arg("fil"), py::arg("include_inf_points") = true)
-            .def("zero_pers_diagram", [](const Decomposition& self, const SimplexFiltrationFloat& fil) { return PyOineusDiagrams<float>(self.zero_persistence_diagram(fil)); },
-                    py::arg("fil"))
-            .def("zero_pers_diagram", [](const Decomposition& self, const SimplexFiltrationDouble& fil) { return PyOineusDiagrams<double>(self.zero_persistence_diagram(fil)); },
-                    py::arg("fil"))
-            .def("zero_pers_diagram", [](const Decomposition& self, const ProdSimplexFiltrationFloat& fil) { return PyOineusDiagrams<float>(self.zero_persistence_diagram(fil)); },
-                    py::arg("fil"))
-            .def("zero_pers_diagram", [](const Decomposition& self, const ProdSimplexFiltrationDouble& fil) { return PyOineusDiagrams<double>(self.zero_persistence_diagram(fil)); },
-                    py::arg("fil"))
-            .def("filtration_index", &Decomposition::filtration_index, py::arg("matrix_index"))
-                    ;
-
-}
-
-void init_oineus_common_decomposition_int(py::module& m);
-
-template<class Int>
-void init_oineus_common_diagram(py::module& m)
-{
-    using namespace pybind11::literals;
-
-    using oin::VREdge;
-    using oin::DenoiseStrategy;
-    using oin::ConflictStrategy;
-
-    using DgmPointInt = typename oin::DgmPoint<Int>;
-    using DgmPointSizet = typename oin::DgmPoint<size_t>;
-
-    py::class_<DgmPointInt>(m, "DgmPoint_int")
-            .def(py::init<Int, Int>())
-            .def_readwrite("birth", &DgmPointInt::birth)
-            .def_readwrite("death", &DgmPointInt::death)
-            .def("__getitem__", [](const DgmPointInt& p, int i) { return p[i]; })
-            .def("__hash__", [](const DgmPointInt& p) { return std::hash<DgmPointInt>()(p); })
-            .def("__eq__", [](const DgmPointInt& p, const DgmPointInt& q) { return p == q; })
-            .def("__repr__", [](const DgmPointInt& p) {
-              std::stringstream ss;
-              ss << p;
-              return ss.str();
-            });
-
-    py::class_<DgmPointSizet>(m, "DgmPoint_Sizet")
-            .def(py::init<size_t, size_t>())
-            .def_readwrite("birth", &DgmPointSizet::birth)
-            .def_readwrite("death", &DgmPointSizet::death)
-            .def("__getitem__", [](const DgmPointSizet& p, int i) { return p[i]; })
-            .def("__hash__", [](const DgmPointSizet& p) { return std::hash<DgmPointSizet>()(p); })
-            .def("__eq__", [](const DgmPointSizet& p, const DgmPointSizet& q) { return p == q; })
-            .def("__repr__", [](const DgmPointSizet& p) {
-              std::stringstream ss;
-              ss << p;
-              return ss.str();
-            });
-}
-
-void init_oineus_common_diagram_int(py::module& m);
-
-template<class Int, class Real>
-void init_oineus_functions(py::module& m, std::string suffix)
-{
-    using namespace pybind11::literals;
-
-    using Simp = oin::Simplex<Int>;
-    using SimpProd = oin::ProductCell<Simp, Simp>;
-    using Filtration = oin::Filtration<Simp, Real>;
-
-    using oin::VREdge;
-
-    std::string func_name;
-
-    // diagrams
-    func_name = "compute_diagrams_ls" + suffix + "_1";
-    m.def(func_name.c_str(), &compute_diagrams_ls_freudenthal<Int, Real, 1>);
-
-    func_name = "compute_diagrams_ls" + suffix + "_2";
-    m.def(func_name.c_str(), &compute_diagrams_ls_freudenthal<Int, Real, 2>);
-
-    func_name = "compute_diagrams_ls" + suffix + "_3";
-    m.def(func_name.c_str(), &compute_diagrams_ls_freudenthal<Int, Real, 3>);
-
-    // Lower-star Freudenthal filtration
-    func_name = "get_fr_filtration" + suffix + "_1";
-    m.def(func_name.c_str(), &get_fr_filtration<Int, Real, 1>);
-
-    func_name = "get_fr_filtration" + suffix + "_2";
-    m.def(func_name.c_str(), &get_fr_filtration<Int, Real, 2>);
-
-    func_name = "get_fr_filtration" + suffix + "_3";
-    m.def(func_name.c_str(), &get_fr_filtration<Int, Real, 3>);
-
-    func_name = "get_fr_filtration_and_critical_vertices" + suffix + "_1";
-    m.def(func_name.c_str(), &get_fr_filtration_and_critical_vertices<Int, Real, 1>);
-
-    func_name = "get_fr_filtration_and_critical_vertices" + suffix + "_2";
-    m.def(func_name.c_str(), &get_fr_filtration_and_critical_vertices<Int, Real, 2>);
-
-    func_name = "get_fr_filtration_and_critical_vertices" + suffix + "_3";
-    m.def(func_name.c_str(), &get_fr_filtration_and_critical_vertices<Int, Real, 3>);
-
-    // Vietoris--Rips filtration
-    func_name = "get_vr_filtration" + suffix + "_1";
-    m.def(func_name.c_str(), &get_vr_filtration<Int, Real, 1>);
-
-    func_name = "get_vr_filtration" + suffix + "_2";
-    m.def(func_name.c_str(), &get_vr_filtration<Int, Real, 2>);
-
-    func_name = "get_vr_filtration" + suffix + "_3";
-    m.def(func_name.c_str(), &get_vr_filtration<Int, Real, 3>);
-
-    func_name = "get_vr_filtration" + suffix + "_4";
-    m.def(func_name.c_str(), &get_vr_filtration<Int, Real, 4>);
-
-    func_name = "get_vr_filtration_and_critical_edges" + suffix + "_1";
-    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges<Int, Real, 1>);
-
-    func_name = "get_vr_filtration_and_critical_edges" + suffix + "_2";
-    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges<Int, Real, 2>);
-
-    func_name = "get_vr_filtration_and_critical_edges" + suffix + "_3";
-    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges<Int, Real, 3>);
-
-    func_name = "get_vr_filtration_and_critical_edges" + suffix + "_4";
-    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges<Int, Real, 4>);
-
-    func_name = "get_vr_filtration_from_pwdists" + suffix;
-    m.def(func_name.c_str(), &get_vr_filtration_from_pwdists<Int, Real>);
-
-    func_name = "get_vr_filtration_and_critical_edges_from_pwdists" + suffix;
-    m.def(func_name.c_str(), &get_vr_filtration_and_critical_edges_from_pwdists<Int, Real>);
-
-
-    // boundary matrix as vector of columns
-    func_name = "get_boundary_matrix" + suffix + "_1";
-    m.def(func_name.c_str(), &get_boundary_matrix<Int, Real, 1>);
-
-    func_name = "get_boundary_matrix" + suffix + "_2";
-    m.def(func_name.c_str(), &get_boundary_matrix<Int, Real, 2>);
-
-    func_name = "get_boundary_matrix" + suffix + "_3";
-    m.def(func_name.c_str(), &get_boundary_matrix<Int, Real, 3>);
-
-    // target values
-    func_name = "get_denoise_target" + suffix;
-    m.def(func_name.c_str(), &oin::get_denoise_target<Simp, Real>);
-
-    // target values -- diagram loss
-    func_name = "get_target_values_diagram_loss" + suffix;
-    m.def(func_name.c_str(), &oin::get_prescribed_simplex_values_diagram_loss<Real>);
-
-    // target values --- X set
-    func_name = "get_target_values_x" + suffix;
-    m.def(func_name.c_str(), &oin::get_prescribed_simplex_values_set_x<Simp, Real>);
-
-    // to reproduce "Well group loss" experiments
-    func_name = "get_well_group_target" + suffix;
-    m.def(func_name.c_str(), &oin::get_well_group_target<Simp, Real>);
-
-    func_name = "get_nth_persistence" + suffix;
-    m.def(func_name.c_str(), &oin::get_nth_persistence<Simp, Real>);
-
-    // to get permutation for Warm Starts
-    func_name = "get_permutation" + suffix;
-    m.def(func_name.c_str(), &oin::targets_to_permutation<Simp, Real>);
-
-    func_name = "get_permutation_dtv" + suffix;
-    m.def(func_name.c_str(), &oin::targets_to_permutation_dtv<Simp, Real>);
-
-    func_name = "list_to_filtration" + suffix;
-    m.def(func_name.c_str(), &list_to_filtration<Int, Real>);
-
-    func_name = "compute_kernel_image_cokernel_reduction" + suffix;
-    m.def(func_name.c_str(), &compute_kernel_image_cokernel_reduction<Simp, Real>);
-
-    func_name = "get_ls_filtration" + suffix;
-    m.def(func_name.c_str(), &get_ls_filtration<Int, Real>);
-
-    func_name = "compute_relative_diagrams" + suffix;
-    m.def(func_name.c_str(), &compute_relative_diagrams<Simp, Real>, py::arg("fil"), py::arg("rel"), py::arg("include_inf_points")=true);
-
-    func_name = "compute_relative_diagrams" + suffix;
-    m.def(func_name.c_str(), &compute_relative_diagrams<SimpProd, Real>, py::arg("fil"), py::arg("rel"), py::arg("include_inf_points")=true);
-}
-
-void init_oineus_functions_double(py::module& m, std::string suffix);
-void init_oineus_functions_float(py::module& m, std::string suffix);
-
-template<class Int, class Real>
-void init_oineus_fil_dgm_simplex(py::module& m, std::string suffix)
-{
-    using namespace pybind11::literals;
-
-    using DgmPoint = typename oin::Diagrams<Real>::Point;
-    using DgmPtVec = typename oin::Diagrams<Real>::Dgm;
-    using IndexDgmPtVec = typename oin::Diagrams<Real>::IndexDgm;
-    using Diagram = PyOineusDiagrams<Real>;
-
-    using Simplex = oin::Simplex<Int>;
-    using SimplexValue = oin::CellWithValue<oin::Simplex<Int>, Real>;
-    using Filtration = oin::Filtration<Simplex, Real>;
-
-    using ProdSimplex = oin::ProductCell<Simplex, Simplex>;
-    using ProdSimplexValue = oin::CellWithValue<ProdSimplex, Real>;
-    using ProdFiltration = oin::Filtration<ProdSimplex, Real>;
-
-    using oin::VREdge;
-
-    using VRUDecomp = oin::VRUDecomposition<Int>;
-    using KerImCokRedSimplex = oin::KerImCokReduced<Simplex, Real>;
-    using KerImCokRedProdSimplex = oin::KerImCokReduced<ProdSimplex, Real>;
-
-    std::string filtration_class_name = "Filtration" + suffix;
-    std::string prod_filtration_class_name = "ProdFiltration" + suffix;
-    std::string simplex_class_name = "Simplex" + suffix;
-    std::string prod_simplex_class_name = "ProdSimplex" + suffix;
-
-    std::string dgm_point_name = "DiagramPoint" + suffix;
-    std::string dgm_class_name = "Diagrams" + suffix;
-
-    std::string ker_im_cok_reduced_class_name = "KerImCokReduced" + suffix;
-    std::string ker_im_cok_reduced_prod_class_name = "KerImCokReducedProd" + suffix;
-
-    py::class_<DgmPoint>(m, dgm_point_name.c_str())
-            .def(py::init<Real, Real>(), py::arg("birth"), py::arg("death"))
-            .def_readwrite("birth", &DgmPoint::birth)
-            .def_readwrite("death", &DgmPoint::death)
-            .def_readwrite("birth_index", &DgmPoint::birth_index)
-            .def_readwrite("death_index", &DgmPoint::death_index)
-            .def("__getitem__", [](const DgmPoint& p, int i) { return p[i]; })
-            .def("__hash__", [](const DgmPoint& p) { return std::hash<DgmPoint>()(p); })
-            .def("__repr__", [](const DgmPoint& p) {
-              std::stringstream ss;
-              ss << p;
-              return ss.str();
-            });
-
-    py::class_<Diagram>(m, dgm_class_name.c_str())
-            .def(py::init<dim_type>())
-            .def("in_dimension", [](Diagram& self, dim_type dim, bool as_numpy) -> std::variant<pybind11::array_t<Real>, DgmPtVec> {
-                      if (as_numpy)
-                          return self.get_diagram_in_dimension_as_numpy(dim);
-                      else
-                          return self.get_diagram_in_dimension(dim);
-                    }, "return persistence diagram in dimension dim: if as_numpy is False (default), the diagram is returned as list of DgmPoints, else as NumPy array",
-                    py::arg("dim"), py::arg("as_numpy") = true)
-            .def("index_diagram_in_dimension", [](Diagram& self, dim_type dim, bool as_numpy, bool sorted) -> std::variant<pybind11::array_t<size_t>, IndexDgmPtVec> {
-                      if (as_numpy)
-                          return self.get_index_diagram_in_dimension_as_numpy(dim, sorted);
-                      else
-                          return self.get_index_diagram_in_dimension(dim, sorted);
-                    }, "return persistence pairing (index diagram) in dimension dim: if as_numpy is False (default), the diagram is returned as list of DgmPoints, else as NumPy array",
-                    py::arg("dim"), py::arg("as_numpy") = true, py::arg("sorted") =  true)
-            .def("__getitem__", &Diagram::get_diagram_in_dimension_as_numpy);
-
-    py::class_<SimplexValue>(m, simplex_class_name.c_str())
-            .def(py::init([](typename Simplex::IdxVector vs, Real value) -> SimplexValue {
-                      return SimplexValue({vs}, value);
-                    }),
-                    py::arg("vertices"),
-                    py::arg("value"))
-            .def(py::init([](Int id, typename Simplex::IdxVector vs, Real value) -> SimplexValue { return SimplexValue({id, vs}, value); }), py::arg("id"), py::arg("vertices"), py::arg("value"))
-            .def_property("id", &SimplexValue::get_id, &SimplexValue::set_id)
-            .def_readwrite("sorted_id", &SimplexValue::sorted_id_)
-            .def_property("vertices", &SimplexValue::get_uid, &SimplexValue::set_uid)
-            .def_readwrite("value", &SimplexValue::value_)
-            .def("dim", &SimplexValue::dim)
-            .def("boundary", &SimplexValue::boundary)
-            .def("join", [](const SimplexValue& sigma, Int new_vertex, Real value, Int new_id) {
-                      return sigma.join(new_id, new_vertex, value);
-                    },
-                    py::arg("new_vertex"),
-                    py::arg("value"),
-                    py::arg("new_id") = SimplexValue::k_invalid_id)
-            .def("__repr__", [](const SimplexValue& sigma) {
-              std::stringstream ss;
-              ss << sigma;
-              return ss.str();
-            });
-
-    py::class_<ProdSimplexValue>(m, prod_simplex_class_name.c_str())
-            .def(py::init([](const SimplexValue& sigma, const SimplexValue& tau, Real value) -> ProdSimplexValue {
-                      return ProdSimplexValue(ProdSimplex(sigma.get_cell(), tau.get_cell()), value);
-                    }),
-                    py::arg("cell_1"),
-                    py::arg("cell_2"),
-                    py::arg("value"))
-            .def(py::init([](const Simplex& sigma, const Simplex& tau, Real value) -> ProdSimplexValue {
-                      return ProdSimplexValue(ProdSimplex(sigma, tau), value);
-                    }),
-                    py::arg("cell_1"),
-                    py::arg("cell_2"),
-                    py::arg("value"))
-            .def_property("id", &ProdSimplexValue::get_id, &ProdSimplexValue::set_id)
-            .def_readwrite("sorted_id", &ProdSimplexValue::sorted_id_)
-            .def_property_readonly("cell_1", &ProdSimplexValue::get_factor_1)
-            .def_property_readonly("cell_2", &ProdSimplexValue::get_factor_2)
-            .def_property_readonly("uid", &ProdSimplexValue::get_uid)
-            .def_readwrite("value", &ProdSimplexValue::value_)
-            .def("dim", &ProdSimplexValue::dim)
-            .def("boundary", &ProdSimplexValue::boundary)
-            .def("__repr__", [](const ProdSimplexValue& sigma) {
-              std::stringstream ss;
-              ss << sigma;
-              return ss.str();
-            });
-
-    py::class_<Filtration>(m, filtration_class_name.c_str())
-            .def(py::init<typename Filtration::CellVector, bool, int, bool, bool>(),
-                    py::arg("cells"),
-                    py::arg("negate") = false,
-                    py::arg("n_threads") = 1,
-                    py::arg("sort_only_by_dimension") = false,
-                    py::arg("set_ids") = true)
-            .def("max_dim", &Filtration::max_dim, "maximal dimension of a cell in filtration")
-            .def("cells", &Filtration::cells_copy, "copy of all cells in filtration order")
-            .def("simplices", &Filtration::cells_copy, "copy of all simplices (cells) in filtration order")
-            .def("size", &Filtration::size, "number of cells in filtration")
-            .def("__len__", &Filtration::size)
-            .def("size_in_dimension", &Filtration::size_in_dimension, py::arg("dim"), "number of cells of dimension dim")
-            .def("n_vertices", &Filtration::n_vertices)
-            .def("simplex_value_by_sorted_id", &Filtration::value_by_sorted_id, py::arg("sorted_id"))
-            .def("get_id_by_sorted_id", &Filtration::get_id_by_sorted_id, py::arg("sorted_id"))
-            .def("get_sorted_id_by_id", &Filtration::get_sorted_id, py::arg("id"))
-            .def("get_cell", &Filtration::get_cell, py::arg("i"))
-            .def("get_simplex", &Filtration::get_cell, py::arg("i"))
-            .def("get_sorting_permutation", &Filtration::get_sorting_permutation)
-            .def("get_inv_sorting_permutation", &Filtration::get_inv_sorting_permutation)
-            .def("simplex_value_by_vertices", &Filtration::value_by_vertices, py::arg("vertices"))
-            .def("get_sorted_id_by_vertices", &Filtration::get_sorted_id_by_vertices, py::arg("vertices"))
-            .def("cell_by_uid", &Filtration::get_cell_by_uid, py::arg("uid"))
-            .def("boundary_matrix", &Filtration::boundary_matrix_full)
-            .def("coboundary_matrix", &Filtration::coboundary_matrix)
-            .def("boundary_matrix_rel", &Filtration::boundary_matrix_full_rel)
-            .def("reset_ids_to_sorted_ids", &Filtration::reset_ids_to_sorted_ids)
-            .def("set_values", &Filtration::set_values)
-            .def("__iter__", [](Filtration& fil) { return py::make_iterator(fil.begin(), fil.end()); }, py::keep_alive<0, 1>())
-            .def("subfiltration", [](Filtration& self, const py::function& py_pred) {
-                auto pred = [&py_pred](const typename Filtration::Cell& s) -> bool { return py_pred(s).template cast<bool>(); };
-                Filtration result = self.subfiltration(pred);
-                return result;
-            }, py::arg("predicate"), py::return_value_policy::move)
-            .def("__repr__", [](const Filtration& fil) {
-              std::stringstream ss;
-              ss << fil;
-              return ss.str();
-            });
-
-    py::class_<ProdFiltration>(m, prod_filtration_class_name.c_str())
-            .def(py::init<typename ProdFiltration::CellVector, bool, int, bool, bool>(),
-                    py::arg("cells"),
-                    py::arg("negate") = false,
-                    py::arg("n_threads") = 1,
-                    py::arg("sort_only_by_dimension") = false,
-                    py::arg("set_ids") = true)
-            .def("max_dim", &ProdFiltration::max_dim, "maximal dimension of a cell in filtration")
-            .def("cells", &ProdFiltration::cells_copy, "copy of all cells in filtration order")
-            .def("size", &ProdFiltration::size, "number of cells in filtration")
-            .def("__len__", &ProdFiltration::size)
-            .def("__iter__", [](ProdFiltration& fil) { return py::make_iterator(fil.begin(), fil.end()); }, py::keep_alive<0, 1>())
-            .def("size_in_dimension", &ProdFiltration::size_in_dimension, py::arg("dim"), "number of cells of dimension dim")
-            .def("cell_value_by_sorted_id", &ProdFiltration::value_by_sorted_id, py::arg("sorted_id"))
-            .def("get_id_by_sorted_id", &ProdFiltration::get_id_by_sorted_id, py::arg("sorted_id"))
-            .def("get_sorted_id_by_id", &ProdFiltration::get_sorted_id, py::arg("id"))
-            .def("get_cell", &ProdFiltration::get_cell, py::arg("i"))
-            .def("get_sorting_permutation", &ProdFiltration::get_sorting_permutation)
-            .def("get_inv_sorting_permutation", &ProdFiltration::get_inv_sorting_permutation)
-            .def("boundary_matrix", &ProdFiltration::boundary_matrix_full)
-            .def("reset_ids_to_sorted_ids", &ProdFiltration::reset_ids_to_sorted_ids)
-            .def("set_values", &ProdFiltration::set_values)
-            .def("__repr__", [](const ProdFiltration& fil) {
-              std::stringstream ss;
-              ss << fil;
-              return ss.str();
-            });
-
-    m.def("mapping_cylinder", &oin::build_mapping_cylinder<Simplex, Real>, py::arg("fil_domain"), py::arg("fil_codomain"), py::arg("v_domain"), py::arg("v_codomain"), "return mapping cylinder filtration of inclusion fil_domain -> fil_codomain, fil_domain is multiplied by v_domain and fil_codomain is multiplied by v_codomain");
-    m.def("mapping_cylinder_with_indices", &oin::build_mapping_cylinder_with_indices<Simplex, Real>, py::arg("fil_domain"), py::arg("fil_codomain"), py::arg("v_domain"), py::arg("v_codomain"), "return mapping cylinder filtration of inclusion fil_domain -> fil_codomain, fil_domain is multiplied by v_domain and fil_codomain is multiplied by v_codomain");
-    m.def("multiply_filtration", &oin::multiply_filtration<Simplex, Real>, py::arg("fil"), py::arg("sigma"), "return a filtration with each simplex in fil multiplied by simplex sigma");
-//    m.def("kernel_diagrams", &compute_kernel_diagrams<ProdSimplex, Real>, py::arg("fil_L"), py::arg("fil_K"), "return kernel persistence diagrams of inclusion fil_L -> fil_K");
-//    m.def("kernel_cokernel_diagrams", &compute_kernel_cokernel_diagrams<ProdSimplex, Real>, py::arg("fil_L"), py::arg("fil_K"), "return kernel and cokernel persistence diagrams of inclusion fil_L -> fil_K");
-
-    m.def("min_filtration", &oin::min_filtration<Simplex, Real>, py::arg("fil_1"), py::arg("fil_2"), "return a filtration where each simplex has minimal value from fil_1, fil_2");
-    m.def("min_filtration", &oin::min_filtration<ProdSimplex, Real>, py::arg("fil_1"), py::arg("fil_2"), "return a filtration where each cell has minimal value from fil_1, fil_2");
-
-    // helper for differentiable filtration
-    m.def("min_filtration_with_indices", &oin::min_filtration_with_indices<Simplex, Real>, py::arg("fil_1"), py::arg("fil_2"), "return a tuple (filtration, inds_1, inds_2) where each simplex has minimal value from fil_1, fil_2 and inds_1, inds_2 are its indices in fil_1, fil_2");
-    m.def("min_filtration_with_indices", &oin::min_filtration_with_indices<ProdSimplex, Real>, py::arg("fil_1"), py::arg("fil_2"), "return a tuple (filtration, inds_1, inds_2) where each simplex has minimal value from fil_1, fil_2 and inds_1, inds_2 are its indices in fil_1, fil_2");
-
-    py::class_<KerImCokRedSimplex>(m, ker_im_cok_reduced_class_name.c_str())
-            .def(py::init<const Filtration&, const Filtration&, oin::Params&, bool>(), py::arg("K"), py::arg("L"), py::arg("params"), py::arg("include_zero_persistence")=false)
-            .def("domain_diagrams", [](const KerImCokRedSimplex& self) { return PyOineusDiagrams<Real>(self.get_domain_diagrams()); })
-            .def("codomain_diagrams", [](const KerImCokRedSimplex& self) { return PyOineusDiagrams<Real>(self.get_codomain_diagrams()); })
-            .def("kernel_diagrams", [](const KerImCokRedSimplex& self) { return PyOineusDiagrams<Real>(self.get_kernel_diagrams()); })
-            .def("cokernel_diagrams", [](const KerImCokRedSimplex& self) { return PyOineusDiagrams<Real>(self.get_cokernel_diagrams()); })
-            .def("image_diagrams", [](const KerImCokRedSimplex& self) { return PyOineusDiagrams<Real>(self.get_image_diagrams()); })
-            // decomposition objects provide access to their R/V/U matrices
-            .def_readonly("decomposition_f", &KerImCokRedSimplex::dcmp_F_)
-            .def_readonly("decomposition_g", &KerImCokRedSimplex::dcmp_G_)
-            .def_readonly("decomposition_im", &KerImCokRedSimplex::dcmp_im_)
-            .def_readonly("decomposition_ker", &KerImCokRedSimplex::dcmp_ker_)
-            .def_readonly("decomposition_cok", &KerImCokRedSimplex::dcmp_cok_)
-            ;
-
-     py::class_<KerImCokRedProdSimplex>(m, ker_im_cok_reduced_prod_class_name.c_str())
-            .def(py::init<const ProdFiltration&, const ProdFiltration&, oin::Params&, bool>(), py::arg("K"), py::arg("L"), py::arg("params"), py::arg("include_zero_persistence")=false)
-            .def("kernel_diagrams", [](const KerImCokRedProdSimplex& self) { return PyOineusDiagrams<Real>(self.get_kernel_diagrams()); })
-            .def("cokernel_diagrams", [](const KerImCokRedProdSimplex& self) { return PyOineusDiagrams<Real>(self.get_cokernel_diagrams()); })
-            .def("image_diagrams", [](const KerImCokRedProdSimplex& self) { return PyOineusDiagrams<Real>(self.get_image_diagrams()); })
-            // decomposition objects provide access to their R/V/U matrices
-            .def_readonly("decomposition_f", &KerImCokRedProdSimplex::dcmp_F_)
-            .def_readonly("decomposition_g", &KerImCokRedProdSimplex::dcmp_G_)
-            .def_readonly("decomposition_im", &KerImCokRedProdSimplex::dcmp_im_)
-            .def_readonly("decomposition_ker", &KerImCokRedProdSimplex::dcmp_ker_)
-            .def_readonly("decomposition_cok", &KerImCokRedProdSimplex::dcmp_cok_)
-            ;
-}
-
-void init_oineus_fil_dgm_simplex_float(py::module& m, std::string suffix);
-void init_oineus_fil_dgm_simplex_double(py::module& m, std::string suffix);
-
+void init_oineus_common(py::module& m);
+void init_oineus_common_decomposition(py::module& m);
+void init_oineus_diagram(py::module& m);
+void init_oineus_functions(py::module& m);
+void init_oineus_filtration(py::module& m);
+void init_oineus_cells(py::module& m);
+void init_oineus_kicr(py::module& m);
 void init_oineus_top_optimizer(py::module& m);
 
 #endif //OINEUS_OINEUS_PERSISTENCE_BINDINGS_H
diff --git a/bindings/python/oineus_top_optimizer.cpp b/bindings/python/oineus_top_optimizer.cpp
index 6b5df07..8ea0b14 100644
--- a/bindings/python/oineus_top_optimizer.cpp
+++ b/bindings/python/oineus_top_optimizer.cpp
@@ -1,18 +1,14 @@
 #include "oineus_persistence_bindings.h"
 
-
 template<class Cell>
 void init_oineus_top_optimizer_class(py::module& m, std::string opt_name, std::string ind_vals_name)
 {
-    using Real = double;
-    using Int = int;
-
-    using Filtration = oin::Filtration<Cell, Real>;
-    using TopologyOptimizer = oin::TopologyOptimizer<Cell, Real>;
+    using Filtration = oin::Filtration<Cell, oin_real>;
+    using TopologyOptimizer = oin::TopologyOptimizer<Cell, oin_real>;
     using IndicesValues = typename TopologyOptimizer::IndicesValues;
     using CriticalSets = typename TopologyOptimizer::CriticalSets;
     using ConflictStrategy = oin::ConflictStrategy;
-    using Diagram = typename oin::Diagrams<Real>::Dgm;
+    using Diagram = typename oin::Diagrams<oin_real>::Dgm;
     using Indices = typename TopologyOptimizer::Indices;
     using Values = typename TopologyOptimizer::Values;
     using IndicesValues = typename TopologyOptimizer::IndicesValues;
@@ -37,7 +33,7 @@ void init_oineus_top_optimizer_class(py::module& m, std::string opt_name, std::s
     // optimization
     py::class_<TopologyOptimizer>(m, opt_name.c_str())
             .def(py::init<const Filtration&>())
-            .def("compute_diagram", [](TopologyOptimizer& opt, bool include_inf_points) { return PyOineusDiagrams<Real>(opt.compute_diagram(include_inf_points)); },
+            .def("compute_diagram", [](TopologyOptimizer& opt, bool include_inf_points) { return PyOineusDiagrams<oin_real>(opt.compute_diagram(include_inf_points)); },
                     py::arg("include_inf_points"),
                     "compute diagrams in all dimensions")
             .def("simplify", &TopologyOptimizer::simplify,
@@ -46,7 +42,7 @@ void init_oineus_top_optimizer_class(py::module& m, std::string opt_name, std::s
                     py::arg("dim"), "make points with persistence less than epsilon go to the diagonal")
             .def("get_nth_persistence", &TopologyOptimizer::get_nth_persistence,
                     py::arg("dim"), py::arg("n"), "return n-th persistence value in d-dimensional persistence diagram")
-            .def("match", [](TopologyOptimizer& opt, Diagram& template_dgm, dim_type d, Real wasserstein_q, bool return_wasserstein_distance) -> std::variant<IndicesValues, std::pair<IndicesValues, Real>>
+            .def("match", [](TopologyOptimizer& opt, Diagram& template_dgm, dim_type d, oin_real wasserstein_q, bool return_wasserstein_distance) -> std::variant<IndicesValues, std::pair<IndicesValues, oin_real>>
                     {
                       if (return_wasserstein_distance)
                           return opt.match_and_distance(template_dgm, d, wasserstein_q);
@@ -78,18 +74,14 @@ void init_oineus_top_optimizer_class(py::module& m, std::string opt_name, std::s
 
 void init_oineus_top_optimizer(py::module& m)
 {
-
-    using Real = double;
-    using Int = int;
-
-    using Simp = oin::Simplex<Int>;
+    using Simp = oin::Simplex<oin_int>;
     using SimpProd = oin::ProductCell<Simp, Simp>;
 
     init_oineus_top_optimizer_class<Simp>(m, "TopologyOptimizer", "IndicesValues");
     init_oineus_top_optimizer_class<SimpProd>(m, "TopologyOptimizerProd", "IndicesValuesProd");
 
     // induced matching
-    m.def("get_induced_matching", &oin::get_induced_matching<Simp, Real>,
+    m.def("get_induced_matching", &oin::get_induced_matching<Simp, oin_real>,
             "Compute induced matching for two filtrations of the same complex",
             py::arg("included_filtration"),
             py::arg("containing_filtration"),
diff --git a/doc/tutorial.md b/doc/tutorial.md
index 9f96635..e04718c 100644
--- a/doc/tutorial.md
+++ b/doc/tutorial.md
@@ -13,17 +13,17 @@ we need to create simplices first. We have a filtration of a triangle,
 and we use `double` to store filtration values (second argument to the constructor):
 ```python
 # vertices
-v0 = oin.Simplex_double([0], 0.1)
-v1 = oin.Simplex_double([1], 0.2)
-v2 = oin.Simplex_double([2], 0.3)
+v0 = oin.Simplex([0], 0.1)
+v1 = oin.Simplex([1], 0.2)
+v2 = oin.Simplex([2], 0.3)
 
 # edges
-e1 = oin.Simplex_double([0, 1], 1.2)
-e2 = oin.Simplex_double([0, 2], 1.4)
-e3 = oin.Simplex_double([1, 2], 2.1)
+e1 = oin.Simplex([0, 1], 1.2)
+e2 = oin.Simplex([0, 2], 1.4)
+e3 = oin.Simplex([1, 2], 2.1)
 
 # triangle
-t1 = oin.Simplex_double([0, 1, 2], 4.0)
+t1 = oin.Simplex([0, 1, 2], 4.0)
 ```
 
 We now put simplices into a list and create a parallel list
@@ -38,7 +38,7 @@ Now we create a filtration.
 
 ```python
 # constructor will sort simplices and assign sorted_ids
-fil = oin.Filtration_double(simplices)
+fil = oin.Filtration(simplices)
 
 print(fil)
 ```
@@ -55,7 +55,7 @@ by the `sorted_id`.
 The constructor of a filtration has some additional arguments:
 * `set_ids` is `True` by default, that is why `id`s are overwritten. Set it to `False` to preserve original `id`s.
 Caveat: vertex `i` must still have `id == i` and the `id`s must be unique. You can specify
-the `id` as the first argument to constructor: `oin.Simplex_double(3, [0, 1], 0.5)` or assign to it: `sigma.id = 3`.
+the `id` as the first argument to constructor: `oin.Simplex(3, [0, 1], 0.5)` or assign to it: `sigma.id = 3`.
 * `sort_only_by_dimension` is `False` by default. If you know that your simplices are already in the correct
 order, you can set it to `True`: the simplices will be arranged by dimension, but the order
 of simplices of same dimension will be preserved.
@@ -82,17 +82,21 @@ n_points = 20
 dim = 3
 points = np.random.uniform(size=(n_points, dim))
 
-fil = oin.get_vr_filtration(points, max_dim=3, max_radius=2)
+fil = oin.vr_filtration(points, max_dim=3, max_diameter=2)
 print(fil)
 ```
 
 The parameters are:
 * `points`: coordinates of points in the point cloud.
 * `max_dim`: the resulting filtration will contain simplices up to and including `max_dim`.
-If you want to compute persistence diagrams in dimension `d`, you need `max_dim >= d+1`.
-* `max_radius`: only consider balls up to this radius.
+If you want to compute persistence diagrams in dimension `d`, you need `max_dim >= d+1`. Default value:
+dimension of the points.
+* `max_diameter`: only consider simplices up to this diameter. Default value:
+a minimax that guarantees contractibility after that value, as in Ripser.
 
-For distance matrix:
+For distance matrix you use the same function, just
+specify the parameter `from_pwdists` to be True.
+In this case, `max_dim` must be supplied.
 
 ```python
 import numpy as np
@@ -109,7 +113,7 @@ points = np.random.uniform(size=(n_points, dim))
 
 distances = scipy.spatial.distance.cdist(points, points, 'euclidean')
 
-fil = oin.get_vr_filtration_from_pwdists(distances, max_dim=3, max_radius=2)
+fil = oin.vr_filtration(distances, from_pwdists=True, max_dim=3)
 print(fil)
 ```
 
@@ -122,18 +126,26 @@ for D = 1 , 2, or 3. Function values are represented as an D-dimensional NumPy a
 # create scalar function on 8x8x8 grid
 f = np.random.uniform(size=(8, 8, 8))
 
-fil = oin.get_freudenthal_filtration(data=f, max_dim=3)
+fil = oin.freudenthal_filtration(data=f, max_dim=3)
 ```
 
 If you want upper-star filtration, set `negate` to `True`:
 ```python
-fil = oin.get_freudenthal_filtration(data=f, negate=True, max_dim=3)
+fil = oin.freudenthal_filtration(data=f, negate=True)
 ```
 If you want periodic boundary conditions (D-dimensional torus instead
 of D-dimensional cube), set `wrap` to `True`:
 ```python
-fil = oin.get_freudenthal_filtration(data=f, wrap=True, max_dim=3)
+fil = oin.freudenthal_filtration(data=f, wrap=True)
 ```
+If your data is D-dimensional, but you only need lower-dimensional
+simplices, you can specify `max_dim` parameter (it defaults to `data.ndim`):
+```python
+fil = oin.freudenthal_filtration(data=f, max_dim=2)
+```
+Note that it is the dimension of maximal simplex in the filtration;
+if you want to compute persistence diagram in dimension k,
+you need to have simplices of dimension k+1 (negative ones).
 
 ## Persistence Diagrams
 
@@ -236,29 +248,23 @@ sigma_sorted_idx, tau_sorted_idx = ind_dgm_2[0, :]
 `sigma_sorted_idx` is the index of the birth simplex (triangle) in filtration order.
 `tau_sorted_idx` is the index of the death simplex (tetrahedron) in filtration order.
 There are many ways to get the original simplex:
-* `sigma = fil.get_simplex(sigma_sorted_idx)` will return a simplex itself. So, `fil.get_simplex`
+* `sigma = fil.simplex(sigma_sorted_idx)` will return a simplex itself. So, `fil.simplex`
 takes the `sorted_id` of a simplex and just accesses the vector of simplices at this index,
 so it is cheap.
-* `sigma_idx = fil.get_id_by_sorted_id(sigma_sorted_idx)` will return the `id` of `sigma`.
-Recall that, by default, it is the index of `sigma` in the original list of simplices that was used to create the filtration.
+* `sigma_idx = fil.id_by_sorted_id(sigma_sorted_idx)` will return the `id` of `sigma`.
+Recall that it is the index of `sigma` in the original list of simplices that was used to create the filtration.
 This is convenient, if you have a parallel array of some information, one entry per simplex,
 which you want to access.
 
 
 ## Topology Optimization
 
-Topology optimization is performed by the `TopologyOptimizer` class.
-It is created from a filtration.
 
-```python
-opt = oin.TopologyOptimizer(fil)
-```
-In order to specify the target (where some points
-in the diagram should go), we use indices and values.
+There are two ways to perform topological optimization.  We consider two examples, both use the following function to generate differentiable data.
+
 
-Let us consider an example. Here is a helper function to generate data:
 ```python
-def sample_data(num_points: int=100, noise_std_dev=0.1):
+def sample_data(num_points: int=50, noise_std_dev=0.1):
     # sample points from the unit circle and add noise
     # num_points: number of points to sample
     # noise_std_dev: standard deviation of Gaussian noise
@@ -277,15 +283,113 @@ def sample_data(num_points: int=100, noise_std_dev=0.1):
     pts.requires_grad_(True)
 
     return pts
+
+pts = sample_data()
+```
+
+
+We also set up a PyTorch optimizer that acts on the points:
+
+
+```python
+opt = torch.optim.SGD([pts], lr=0.1)
+```
+
+
+### Differentiable filtrations
+
+
+This method requires `eagerpy` package. It provides wrappers around tensors from PyTorch, Jax and Tensorflow.  Differentiable filtrations are defined in oineus.diff subpackage.  To create a differentiable Vietoris-Rips, we can use the function `oineus.diff.vr_filtration` with essentially the same signature as `oineus.vr_filtration`.
+
+
+```python
+import oineus as oin
+import oineus.diff
+
+
+fil = oin.diff.vr_filtration(pts)
 ```
-Suppose that we want to push all but the first n-1 points
-in the PD in dimension dim to move to the diagonal.
-The corresponding loss function is computed by this function:
+
+
+Now `fil` is an object that contains the standard filtration (accessible as `fil.under_fil`) and a differentiable tensor of critical values `fil.values`.  We also need to create a TopologyOptimizer object:
+
+
+```python
+top_opt = oin.diff.TopologyOptimizer(fil)
+```
+
+
+Suppose that we want to keep only the most persistent point in the 1-dimensional persistence diagram, and all other points should go to the diagonal vertically (that is, a point (b, d) ideally should go to point (b, b)).
+
+
+```python
+dim = 1
+n = 2
+dgm = top_opt.compute_diagram(include_inf_points=False)
+# eps is the threshold: all points whose persistence does not exceed eps will move
+# if we want to remove all but one, the most persistent point, eps must be the
+# second biggest persistence
+eps = top_opt.get_nth_persistence(dim, n)
+```
+
+
+TopologyOptimizer provides a helper function `simplify` which returns a tuple of indices and values. This is key concept: this tuple encodes prescribed targets for individual diagram points. The interpretation is: for every `i`, simplex `fil[indices[i]]` must take new critical value `values[i]`. In this case, we want to modify only the death value, so every simplex in `indices` will be a negative simplex that produces an off-diagonal point in the diagram.
+
+
+```python
+indices, values = top_opt.simplify(eps, oin.DenoiseStrategy.BirthBirth, dim)
+```
+
+
+Now, if we want to use the traditional method of back-propagation through a persistence diagram, we simply need to define the topological loss and use standard PyTorch approach like that:
+
+
+```python
+# zeros gradients on pts
+opt.zero_grad()
+
+top_loss = torch.mean((fil.values[indices] - values) ** 2)
+
+# populates gradients on pts from top_loss
+top_loss.backward()
+
+# updates pts using gradient descent
+opt.step()
+```
+
+However, if we want to operate through a bigger set of simplices at once, we can do the following (see 'Topological Optimization with Big Steps' for details):
+
+
+
+```python
+# returns a list of critical sets for each singleton loss defined by indices[i], values[i]
+critical_sets = top_opt.singletons(indices, values)
+
+# resolve conflicts between critical sets
+crit_indices, crit_values = top_opt.combine_loss(critical_sets, oin.ConflictStrategy.Max)
+
+# convert lists to something torch can understand
+crit_indices = np.array(crit_indices, dtype=np.int32)
+crit_values = torch.Tensor(crit_values)
+
+opt.zero_grad()
+top_loss = torch.mean((fil.values[crit_indices] - crit_values) ** 2)
+top_loss.backward()
+opt.step()
+```
+
+
+### Manual
+
+
+This is a more complicated way, which gives you more control, you do all the steps by hand.  The corresponding loss function is computed by this function:
 ```python
 def topological_loss(pts: torch.Tensor, dim: int=1, n: int=2):
     pts_as_numpy = pts.clone().detach().numpy().astype(np.float64)
-    fil, longest_edges = oin.get_vr_filtration_and_critical_edges(pts_as_numpy, max_dim=2, max_radius=9.0, n_threads=1)
-    
+    # we need critical edges for each simplex, so we specify
+    # with_critical_edges
+    fil, longest_edges = oin.vr_filtration(pts_as_numpy, with_critical_edges=True)
+
     top_opt = oin.TopologyOptimizer(fil)
 
     eps = top_opt.get_nth_persistence(dim, n)
@@ -308,34 +412,21 @@ def topological_loss(pts: torch.Tensor, dim: int=1, n: int=2):
         top_loss.requires_grad_(True)
     return top_loss
 ```
-First, we need to convert `pts` to a NumPy array, because that is the type
-that `oin.get_vr_filtration_and_critical_edges` expects as the first argument.
-Then we create the `TopologyOptimizer` object that provides access to all
-optimization-related functions.
+
+
+First, we need to convert `pts` to a NumPy array, because that is the type that `oin.vr_filtration` expects as the first argument.  Then we create the `TopologyOptimizer` object that provides access to all optimization-related functions.
+
+
 ```python
     eps = top_opt.get_nth_persistence(dim, n)
 ```
-Since we want to preserve all but the first `n-1` points of the diagram,
-we compute the persistence of the `n`-th point in the diagram. All points
-with persistence at most `eps` will be driven to the diagonal.
-There are 3 natural choices: a point `(b, d)` can be moved to 
-`(b, b)`, `(d, d)` or `((b+d)/2, (b+d)/2)`.
-They correspond to 3 members of the enum `oin.DenoiseStrategy`: `BirthBirth`, `DeathDeath`, `Midway`.
 
+
+Since we want to preserve all but the first `n-1` points of the diagram, we compute the persistence of the `n`-th point in the diagram. All points with persistence at most `eps` will be driven to the diagonal.  There are 3 natural choices: a point `(b, d)` can be moved to `(b, b)`, `(d, d)` or `((b+d)/2, (b+d)/2)`.  They correspond to 3 members of the enum `oin.DenoiseStrategy`: `BirthBirth`, `DeathDeath`, `Midway`.
 ```python
     indices, values = top_opt.simplify(eps, oin.DenoiseStrategy.BirthBirth, dim)
 ```
-`indices` and `values` encode what we call the *matching loss*.  
-It is easier to explain the meaning of this by example.
-Say, we want to move two points of the persistence diagram, `(b_1, d_1)` and `(b_2, d_2)` to the destinations
-`(target_b_1, target_d_1)` and `(target_b_2, target_d_2)` respectively.
-Recall that `b_1` is the filtration value of some simplex in filtration,
-say, `sigma_1`. Similarly, `d_1` corresponds to `sigma_2`, `b_2` corresponds
-to `sigma_3` and `d_2` corresponds to `sigma_4`.
-In this case, `indices = [i_1, i_2, i_3, i_4]` and 
-`values = [target_b_1, target_d_1, target_b_2, target_d_2]`,
-where `i_1` is the index (`sorted_id`) of `sigma_1` in the filtration, `i_2` is
-the index of `sigma_2`, etc.
+`indices` and `values` encode what we call the *matching loss*.  It is easier to explain the meaning of this by example.  Say, we want to move two points of the persistence diagram, `(b_1, d_1)` and `(b_2, d_2)` to the destinations `(target_b_1, target_d_1)` and `(target_b_2, target_d_2)` respectively.  Recall that `b_1` is the filtration value of some simplex in filtration, say, `sigma_1`. Similarly, `d_1` corresponds to `sigma_2`, `b_2` corresponds to `sigma_3` and `d_2` corresponds to `sigma_4`.  In this case, `indices = [i_1, i_2, i_3, i_4]` and `values = [target_b_1, target_d_1, target_b_2, target_d_2]`, where `i_1` is the index (`sorted_id`) of `sigma_1` in the filtration, `i_2` is the index of `sigma_2`, etc.
 
 
 Note that each pair, like `(i_2, target_d_1)` defines
@@ -343,6 +434,8 @@ the *singleton loss*. The line
 ```python
     critical_sets = top_opt.singletons(indices, values)
 ```
+
+
 computes all the critical sets at once. `critical_sets`
 is a `list` of the same length as `indices`.
 Each element of the list is a pair `(value, simplex_indices)`,
@@ -350,52 +443,37 @@ where `simplex_indices` contains the critical set
 (indices of simplices in filtration order) and
 `value` is the target value that should be assigned to all of them.
 
-There can be conflicts: different terms in the matching loss
-can send the same simplex to different values.
-In order to resolve them, we use the function `combine_loss`:
+
+There can be conflicts: different terms in the matching loss can send the same simplex to different values.  In order to resolve them, we use the function `combine_loss`:
 ```python
     crit_indices, crit_values = top_opt.combine_loss(critical_sets, oin.ConflictStrategy.Max)
 ```
-The meaning of the output is the same as in `simplify`:
-`crit_indices` contains the indices of simplices and `crit_values`
-contains the values that we want these simplices to have.
-The conflicts are resolved according to the `oin.ConflictStrategy` enum:
-`Max` means that we choose the value that is the farthest one from the current
-filtration value of the given simplex, `Avg` means that we take the average.
+The meaning of the output is the same as in `simplify`: `crit_indices` contains the indices of simplices and `crit_values` contains the values that we want these simplices to have.  The conflicts are resolved according to the `oin.ConflictStrategy` enum: `Max` means that we choose the value that is the farthest one from the current filtration value of the given simplex, `Avg` means that we take the average.
+
 
+In this example we use Vietoris--Rips. The simplex `crit_indices[k]` has the longest edge, and `crit_values[k]` is the length that we want this edge to have. It remains to express this in the differentiable (known to Torch) way.
 
-In this example we use Vietoris--Rips. The simplex
-`crit_indices[k]` has the longest edge, and `crit_values[k]` is the length
-that we want this edge to have. It remains to express
-this in the differentiable (known to Torch) way.
 
 First, for each critical simplex, let us extract
 the endpoints of its longest edge.
 ```python
-    # convert from list of ints to np.array, so that subscription works
-    crit_indices = np.array(crit_indices, dtype=np.int32)
-    
-    # we get only the edges of the critical simplices
-    crit_edges = longest_edges[crit_indices, :]
-    # split them into start and end points
-    # for critical simplex sigma that appears in position k in crit_indices,
-    # crit_edges_x[k] and crit_edges_y[k] give the indices (in pts)
-    # of the endpoints of its longest edge.
-    crit_edges_x, crit_edges_y = crit_edges[:, 0], crit_edges[:, 1]
+# convert from list of ints to np.array, so that subscription works
+crit_indices = np.array(crit_indices, dtype=np.int32)
+
+# we get only the edges of the critical simplices
+crit_edges = longest_edges[crit_indices, :]
+# split them into start and end points
+# for critical simplex sigma that appears in position k in crit_indices,
+# crit_edges_x[k] and crit_edges_y[k] give the indices (in pts)
+# of the endpoints of its longest edge.
+crit_edges_x, crit_edges_y = crit_edges[:, 0], crit_edges[:, 1]
 ```
 
 ```python
     top_loss = torch.sum(torch.abs(torch.sum((pts[crit_edges_x, :] - pts[crit_edges_y, :])**2, axis=1) - crit_values ** 2))
 ```
 
-The expression `torch.sum((pts[crit_edges_x, :] - pts[crit_edges_y, :])**2, axis=1)`
-performs summation over all coordinates, so the resulting tensor
-contains the squared lengths of critical edges computed in a differentiable
-way. Do not take the square root of these lenghts,
-this will lead to NaN-s in your gradients (see https://github.com/pytorch/pytorch/issues/15506,
-but this is not Torch-specific, Jax has the same behavior).
-Instead, use the squares of target lengths, as in this example.
-
+The expression `torch.sum((pts[crit_edges_x, :] - pts[crit_edges_y, :])**2, axis=1)` performs summation over all coordinates, so the resulting tensor contains the squared lengths of critical edges computed in a differentiable way. Do not take the square root of these lenghts, this will lead to NaN-s in your gradients (see https://github.com/pytorch/pytorch/issues/15506, but this is not Torch-specific, Jax has the same behavior).  Instead, either use the squares of target lengths, as in this example, or add a small number before taking the square root.
 
 
 Remarks:
@@ -421,12 +499,14 @@ themselves. To be able to differentiate the distances, use `torch.cdist` or anal
 to compute your distances in the differentiable way.
 
 
-
 ## Kernel, image and cokernel persistence
+
+
 Oineus can compute the kernel, image and cokernel persistence diagrams as in ["Persistent Homology for Kernels, Images, and Cokernels"](https://doi.org/10.1137/1.9781611973068.110) by D. Cohen-Steiner, H. Edelsbrunner, D. Morozov. We first perform the required reductions using `compute_kernel_image_cokernel_diagrams`, which has arguments:
 * `K` the simplicial complex with function values, as a list with an element per simplex in the format `[simplex_id, vertices, value]`, where `vertices` is a list containing the ids of the vertices, and value is the value under the function f.
 * `L` the simplicial sub-complex with function values, as a list with an element per simplex in the format `[simplex_id, vertices, value]`, where `vertices` is a list containing the ids of the vertices, and value is the value under the function g.
-* `L_to_K` a list which maps the cells in L to their corresponding cells in K,
+* `params` specifies which of the three components (image, kernel, cokernel)
+    you need. Type: KICRParams.
 * `n_threads` the number of threads you want to use,
 * `return` an object which contains the kernel, image and cokernel diagrams, as well as the reduced matrices.
 
@@ -434,7 +514,10 @@ To obtain the different diagrams, we use `kernel()`, `image()`, `cokernel()`, an
 
 **Note:** aside from the number of threads, all other parameters are set already. 
 
+
 #### Example
+
+
 Suppose we have a simplicial complex $K$ with a function $f$ on it, and a subcomplex $L \subset K$ with a function $g$ on it. In this example, $g = f|_L$. We then perform the 5 necessary reductions and compute the persistence diagrams using `compute_kernel_image_cokernel_diagrams`, and then access the 3 sets of diagrams using `kernel()`, `image()`, `cokernel()` respectively. After which we can obtain a diagram in a specific dimension $i$ using `in_dimension(i)`.
 
 ```python
@@ -442,11 +525,14 @@ import oineus as oin
 n_threads = 4
 K = [[0, [0], 10], [1,[1],50], [2,[2], 10], [3, [3], 10], [4,[0,1], 50], [5, [1,2], 50], [6,[0,3], 10], [7, [2,3], 10]]
 L = [[0, [0], 10], [1,[1],50], [2,[2], 10], [3, [0,1], 50], [4,[1,2],50]]
-L_to_K = [0,1,2,4,5]
-ker_im_cok_dgms = oin.compute_kernel_image_cokernel_diagrams(K, L, L_to_K, n_threads)
+ker_im_cok_dgms = oin.compute_kernel_image_cokernel_diagrams(K, L)
+# by default, all 3 diagrams are computed
 ker_dgms = ker_im_cok_dgms.kernel()
 im_dgms = ker_im_cok_dgms.image()
 cok_dgms = ker_im_cok_dgms.cokernel()
 ker_dgms.in_dimension(0)
+# if you only want, e.g., kernel:
+params = oin.KICRParams()
+params.image = params.cokernel = False
+ker_im_cok_dgms = oin.compute_kernel_image_cokernel_diagrams(K, L, params)
 ```
- 
diff --git a/examples/kernel-example-2.py b/examples/kernel-example-2.py
deleted file mode 100644
index 144bf37..0000000
--- a/examples/kernel-example-2.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import oineus as oin
-
-params=oin.ReductionParams()
-params.n_threads=4
-params.kernel=True
-params.image=True
-params.cokernel=True
-
-K = [ [0, [0], 10], [1,[1],50], [2,[2], 10], [3, [3], 10], [4,[0,1], 50], [5, [1,2], 50], [6,[0,3], 10], [7, [2,3], 10] ]
-L = [ [0, [0], 10], [1,[1],50], [2,[2], 10], [3, [0,1], 50], [4,[1,2],50] ]
-IdMapping = [0,1,2,4,5]
-
-kicr = oin.compute_kernel_image_cokernel_reduction(K, L, IdMapping, params)
-
-kernel_dgms = kicr.kernel_diagrams()
-kernel_dgms = kicr.image_diagrams()
-cokernel_dgms = kicr.cokernel_diagrams()
-print(kernel_dgms.in_dimension(0))
-print(image_dgms.in_dimension(0))
-print(cokernel_dgms.in_dimension(0))
-
-
diff --git a/examples/kernel-example-3.py b/examples/kernel-example-3.py
deleted file mode 100644
index bf38964..0000000
--- a/examples/kernel-example-3.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import oineus as oin
-
-params=oin.ReductionParams()
-params.n_threads=4
-params.kernel=True
-params.image=True
-params.cokernel=True
-
-K = [ [0, [0], 10], [1,[1],50], [2,[2], 20], [3, [3], 0], [4,[0,1], 60], [5, [1,2], 70], [6,[0,3], 30], [7, [2,3], 40] ]
-L = [ [0, [0], 10], [1,[1],50], [2,[2], 20], [3, [0,1], 60], [4,[1,2],70] ]
-IdMapping = [0,1,2,4,5]
-
-kicr = oin.compute_kernel_image_cokernel_reduction(K, L, IdMapping, params)
-
-kernel_dgms = kicr.kernel_diagrams()
-iamge_dgms = kicr.image_diagrams()
-cokernel_dgms = kicr.cokernel_diagrams()
-print(kernel_dgms.in_dimension(0))
-print(image_dgms.in_dimension(0))
-print(cokernel_dgms.in_dimension(0))
-
-
diff --git a/examples/kernel-example-4.py b/examples/kernel-example-4.py
deleted file mode 100644
index 7199158..0000000
--- a/examples/kernel-example-4.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import oineus as oin
-
-params=oin.ReductionParams()
-params.n_threads=4
-params.kernel=True
-params.image=True
-params.cokernel=True
-
-K = [ [0, [0], 10], [1,[1],50], [2,[2], 20], [3, [3], 50], [4,[4], 15], [5, [5], 12], [6,[0,1], 50], [7, [1,2], 60], [8,[2,3], 70], [9, [3,4], 80], [10, [0,5], 30], [11,[4,5], 20]]
-L = [ [0, [0], 10], [1,[1],50], [2,[2], 20], [3, [3], 50], [4,[4], 15], [5,[0,1], 50], [6, [1,2], 60], [7,[2,3], 70], [8, [3,4], 80] ]
-IdMapping = [0,1,2,3,4,6,7,8,9]
-
-kicr = oin.compute_kernel_image_cokernel_reduction(K, L, IdMapping, params)
-
-kernel_dgms = kicr.kernel_diagrams()
-image_dgms = kicr.image_diagrams()
-cokernel_dgms = kicr.cokernel_diagrams()
-print(kernel_dgms.in_dimension(0))
-print(image_dgms.in_dimension(0))
-print(cokernel_dgms.in_dimension(0))
-
-
diff --git a/examples/oineus_v_opt.cpp b/examples/oineus_v_opt.cpp
index d82334c..e4e8f24 100644
--- a/examples/oineus_v_opt.cpp
+++ b/examples/oineus_v_opt.cpp
@@ -4,6 +4,9 @@
 #include <random>
 
 #include <oineus/oineus.h>
+#include <taskflow/taskflow.hpp>
+#include <taskflow/core/flow_builder.hpp>
+#include <taskflow/algorithm/for_each.hpp>
 #include <opts/opts.h>
 
 using namespace oineus;
@@ -88,6 +91,7 @@ int main(int argc, char** argv)
     bool negate {false};
 
     Params params;
+    params.n_threads = 10;
     ops
             >> Option('d', "dim", top_d, "top dimension")
             >> Option('c', "chunk-size", params.chunk_size, "chunk_size")
@@ -108,12 +112,27 @@ int main(int argc, char** argv)
 
     spd::info("Reading file {}", fname_in);
 
+
     auto grid_func = read_function<Int, Real, 3>(fname_in, wrap);
     auto& grid = grid_func.first;
 
     auto fil = grid.freudenthal_filtration(top_d, negate, params.n_threads);
     VRUDecomposition<Int> rv { fil, false };
 
+    {
+        tf::Executor executor;
+        tf::Taskflow taskflow;
+        oineus::Filtration<oineus::Simplex<Int>, Real> fil1;
+        oineus::HelperFilCallable<oineus::Simplex<Int>, Real> aa(&fil1);
+        std::vector<Int> bb(10000);
+        tf::Task task1 = taskflow.for_each(bb.begin(), bb.end(), [] (auto& i) { i = 100; });
+//        tf::Task t1 = taskflow.for_each_index<Int, Int, Int, oineus::HelperFilCallable<oineus::Simplex<Int>, Real>> (Int(0), Int(0), Int(1), aa);
+//        tf::Task t2 = taskflow.for_each_index(Int(0), Int(0), Int(1), [](Int i) { std::cout << i << std::endl; });
+        executor.run(taskflow);
+        executor.wait_for_all();
+
+    }
+//
     spd::info("Matrix read");
 
     fname_dgm = fname_in + "_t_" + std::to_string(params.n_threads) + "_c_" + std::to_string(params.chunk_size);
diff --git a/bindings/python/example.py b/examples/python/example.py
similarity index 90%
rename from bindings/python/example.py
rename to examples/python/example.py
index 2c47a7c..1a9bdb6 100755
--- a/bindings/python/example.py
+++ b/examples/python/example.py
@@ -10,7 +10,7 @@
 # triangulate domain via Freudenthal and create lower star filtration
 # negate: set to True to get upper-star filtration
 # wrap: set to True to work on torus (periodic boundary conditions)
-fil = oin.get_freudenthal_filtration(data=f, negate=False, wrap=False, max_dim=3, n_threads=1)
+fil = oin.freudenthal_filtration(data=f, negate=False, wrap=False)
 
 cells = fil.cells()
 
@@ -25,10 +25,11 @@
 # reduction parameters
 # relevant members:
 # rp.clearing_opt --- whether you want to use clearing, True by default
-# rp.compute_v: True by default
+# rp.compute_v: False by default
 # rp.n_threads: number of threads to use, default is 1
 # rp. compute_u: False by default (cannot do it in multi-threaded mode, so switch off just to be on the safe side)
 rp = oin.ReductionParams()
+rp.compute_v = True
 
 # perform reduction
 dcmp.reduce(rp)
diff --git a/bindings/python/example_cone.py b/examples/python/example_cone.py
similarity index 56%
rename from bindings/python/example_cone.py
rename to examples/python/example_cone.py
index 634ae01..4f3b6c1 100644
--- a/bindings/python/example_cone.py
+++ b/examples/python/example_cone.py
@@ -1,40 +1,38 @@
 #!python3
 
-from icecream import ic
 import numpy as np
 import oineus as oin
 
-n_pts = 5
+n_pts = 10
 
 pts = np.random.uniform(size=n_pts * 2).reshape(n_pts, 2).astype(np.float64)
 
-fil_1 = oin.get_vr_filtration(pts, 1, 2.0, 1)
-fil_2 = oin.get_vr_filtration(pts, 1, 2.0, 1)
-
+fil_1 = oin.vr_filtration(pts)
+fil_2 = oin.vr_filtration(pts)
 
 simplices = fil_1.simplices()
 
 fil_min_simplices = []
 
 for sigma in simplices:
-    min_sigma = oin.Simplex_double(sigma.vertices, min(sigma.value, fil_2.simplex_value_by_vertices(sigma.vertices)))
+    min_sigma = oin.Simplex(sigma.vertices, min(sigma.value, fil_2.value_by_uid(sigma.uid)))
     fil_min_simplices.append(min_sigma)
 
-fil_min = oin.Filtration_double(fil_min_simplices)
+fil_min = oin.Filtration(fil_min_simplices)
 
 cone_v = fil_min.n_vertices()
 
 # must make copy here! otherwise iteration in for loop below will never end,
 # since we keep adding to the same list over which we iterate
 coned_simplices = simplices[:]
-simplex_id = fil_min.size()
+
+# append cone vertex, force it to be the first vertex in the list
+coned_simplices.append(oin.Simplex([cone_v], -0.000000001))
 
 for sigma in simplices:
-    coned_sigma = sigma.join(new_vertex=cone_v, value=sigma.value, new_id=simplex_id)
-    simplex_id += 1
+    coned_sigma = sigma.join(new_vertex=cone_v, value=sigma.value)
     coned_simplices.append(coned_sigma)
 
-
-fil_coned = oin.Filtration_double(coned_simplices, sort_only_by_dimension=True, set_ids=False)
+fil_coned = oin.Filtration(coned_simplices)
 
 print(fil_coned)
diff --git a/examples/python/example_cone_diff.py b/examples/python/example_cone_diff.py
new file mode 100644
index 0000000..32f4b06
--- /dev/null
+++ b/examples/python/example_cone_diff.py
@@ -0,0 +1,65 @@
+#!python3
+
+from icecream import ic
+import numpy as np
+import oineus as oin
+import oineus.diff
+import torch
+
+n_pts = 50
+
+np.random.seed(1)
+
+pts_1 = np.random.uniform(low=0.0, high=5.0, size=n_pts * 2).reshape(n_pts, 2).astype(np.float64)
+pts_2 = pts_1 + np.random.uniform(size=n_pts *2, low=-0.01, high=0.01).reshape(pts_1.shape)
+
+pts_1 = torch.DoubleTensor(pts_1)
+pts_2 = torch.DoubleTensor(pts_2)
+
+pts_1.requires_grad_()
+pts_2.requires_grad_()
+
+
+fil_1 = oin.diff.vr_filtration(pts_1, eps=1e-9)
+fil_2 = oin.diff.vr_filtration(pts_1, eps=1e-0)
+
+fil_min = oin.diff.min_filtration(fil_1, fil_2)
+
+# we will append coned simplices later
+min_and_coned_simplices = fil_min.cells()
+
+cone_v = fil_min.n_vertices()
+
+# append cone vertex
+# TODO: how to ensure that cone_vertex appears first?
+# for now we can ignore it
+min_and_coned_simplices.append(oin.Simplex(cone_v, [cone_v], -0.0))
+
+# take cones over simplices from fil_1
+for sigma in fil_1:
+    coned_sigma = sigma.join(new_vertex=cone_v, value=sigma.value)
+    min_and_coned_simplices.append(coned_sigma)
+
+fil_min_and_coned = oin.Filtration(min_and_coned_simplices)
+
+# in filtration order
+min_and_coned_simplices = fil_min_and_coned.simplices()
+
+lengths_1 = fil_1.values.detach().cpu().numpy()
+lengths_2 = fil_1.values.detach().cpu().numpy()
+lengths_min = np.min(np.stack((lengths_1, lengths_2)), axis=0)
+
+for sigma in min_and_coned_simplices:
+    if sigma.dim() == 0:
+        continue
+    # get cone base, we only need it's uid, so value does not matter
+    base_vertices = [v for v in sigma if v != cone_v]
+    sigma_base = oin.Simplex(base_vertices)
+    # here is how we can get the index of sigma_base in the original min_filtration:
+    # (note that uid is uniquely determined by simplex vertices)
+    orig_sorted_sigma_id = fil_min.sorted_id_by_uid(sigma_base.uid)
+    if sigma_base.dim() == sigma.dim():
+        assert(np.abs(lengths_1[orig_sorted_sigma_id] - sigma.value) < 0.001)
+    else:
+        assert(np.abs(lengths_min[orig_sorted_sigma_id] - sigma.value) < 0.001)
+
diff --git a/examples/example_diff_cyl.py b/examples/python/example_diff_cyl.py
similarity index 87%
rename from examples/example_diff_cyl.py
rename to examples/python/example_diff_cyl.py
index d7af459..713e016 100755
--- a/examples/example_diff_cyl.py
+++ b/examples/python/example_diff_cyl.py
@@ -4,6 +4,7 @@
 import torch
 
 import oineus as oin
+import oineus.diff
 
 # sample points from the unit circle
 np.random.seed(1)
@@ -29,14 +30,14 @@
 
 # start with topological part
 
-fil_dom = oin.diff.vietoris_rips_pts(pts_1, max_dim=2, max_radius=20.0, n_threads=1)
-fil_cod = oin.diff.vietoris_rips_pts(pts_2, max_dim=2, max_radius=20.0, n_threads=1)
+fil_dom = oin.diff.vr_filtration(pts_1, max_dim=2, max_diameter=20.0, n_threads=1)
+fil_cod = oin.diff.vr_filtration(pts_2, max_dim=2, max_diameter=20.0, n_threads=1)
 
 v_dom_id = fil_dom.size() + fil_cod.size()
 v_cod_id = v_dom_id + 1
 
-v_dom = oin.Simplex(v_dom_id, [v_dom_id])
-v_cod = oin.Simplex(v_cod_id, [v_cod_id])
+v_dom = oin.Simplex([v_dom_id])
+v_cod = oin.Simplex([v_cod_id])
 
 fil = oin.diff.mapping_cylinder_filtration(fil_dom, fil_cod, v_dom, v_cod)
 
diff --git a/examples/example_diff_vr_dists.py b/examples/python/example_diff_vr_dists.py
similarity index 93%
rename from examples/example_diff_vr_dists.py
rename to examples/python/example_diff_vr_dists.py
index f9150c2..b804941 100644
--- a/examples/example_diff_vr_dists.py
+++ b/examples/python/example_diff_vr_dists.py
@@ -4,6 +4,7 @@
 import torch
 
 import oineus as oin
+import oineus.diff
 
 # sample points from the unit circle
 np.random.seed(1)
@@ -33,7 +34,7 @@
 
 # start with topological part
 
-fil = oin.diff.vietoris_rips_pwdists(dists, max_dim=2, max_radius=2.0, n_threads=1)
+fil = oin.diff.vr_filtration(dists, from_pwdists=True, max_dim=2, n_threads=4)
 top_opt = oin.diff.TopologyOptimizer(fil)
 
 print(f"{len(fil) = }")
@@ -49,7 +50,6 @@
 crit_indices = np.array(crit_indices, dtype=np.int32)
 crit_values = torch.Tensor(crit_values)
 
-help(top_opt.simplify)
 
 print(f"{indices=}, {values=}, {crit_indices=}, {crit_values=}")
 
diff --git a/examples/example_diff_vr_pts.py b/examples/python/example_diff_vr_pts.py
similarity index 90%
rename from examples/example_diff_vr_pts.py
rename to examples/python/example_diff_vr_pts.py
index a4096d3..7f0227b 100755
--- a/examples/example_diff_vr_pts.py
+++ b/examples/python/example_diff_vr_pts.py
@@ -4,6 +4,9 @@
 import torch
 
 import oineus as oin
+import oineus.diff
+
+from icecream import ic
 
 # sample points from the unit circle
 np.random.seed(1)
@@ -23,7 +26,7 @@
 
 # start with topological part
 
-fil = oin.diff.vietoris_rips_pts(pts, max_dim=2, max_radius=2.0, n_threads=1)
+fil = oin.diff.vr_filtration(pts, max_dim=2, n_threads=4)
 top_opt = oin.diff.TopologyOptimizer(fil)
 
 dim = 1
@@ -41,3 +44,5 @@
 
 # let Torch figure the gradient on the coordinates
 top_loss.backward()
+ic(top_loss)
+
diff --git a/examples/kernel-example-1.py b/examples/python/example_kernel.py
similarity index 68%
rename from examples/kernel-example-1.py
rename to examples/python/example_kernel.py
index 904aa1f..fe492e4 100644
--- a/examples/kernel-example-1.py
+++ b/examples/python/example_kernel.py
@@ -1,20 +1,14 @@
 import oineus as oin
 
-params=oin.ReductionParams()
-params.n_threads=4
-params.kernel=True
-params.image=True
-params.cokernel=True
-
 K = [ [0, [0], 10], [1,[1],10], [2,[2], 10], [3, [3], 10], [4,[0,1], 10], [5, [1,2], 10], [6,[0,3], 10], [7, [2,3], 10] ]
 L = [ [0, [0], 10], [1,[1],10], [2,[2], 10], [3, [0,1], 10], [4,[1,2],10] ]
-IdMapping = [0,1,2,4,5]
 
-kicr = oin.compute_kernel_image_cokernel_reduction(K, L, IdMapping, params)
+kicr = oin.compute_kernel_image_cokernel_reduction(K, L)
 
 kernel_dgms = kicr.kernel_diagrams()
 image_dgms = kicr.image_diagrams()
 cokernel_dgms = kicr.cokernel_diagrams()
+
 print(kernel_dgms.in_dimension(0))
 print(image_dgms.in_dimension(0))
 print(cokernel_dgms.in_dimension(0))
diff --git a/examples/kernel-cyl.py b/examples/python/example_kernel_cyl.py
similarity index 81%
rename from examples/kernel-cyl.py
rename to examples/python/example_kernel_cyl.py
index 3b02592..f0236b4 100644
--- a/examples/kernel-cyl.py
+++ b/examples/python/example_kernel_cyl.py
@@ -36,10 +36,11 @@ def sample_data(num_points, noise_std_dev=0.1):
 # points in plane, orthogonal embedding in space, embedding matrix
 pts_2, pts_3, B = sample_data(10)
 
-max_radius = 3.0
-fil_2, edges_2 = oin.get_vr_filtration_and_critical_edges(pts_2, max_dim = 2, max_radius=max_radius, n_threads=1)
-fil_3, edges_3 = oin.get_vr_filtration_and_critical_edges(pts_3, max_dim = 2, max_radius = max_radius, n_threads=1)
-print("fils ok")
+max_diameter = 4.0
+
+fil_2, edges_2 = oin.vr_filtration(pts_2, max_dim=2, with_critical_edges=True, max_diameter=max_diameter)
+fil_3, edges_3 = oin.vr_filtration(pts_3, max_dim=2, with_critical_edges=True, max_diameter=max_diameter)
+print(f"fils ok, {len(fil_2)=}, {len(fil_3)=}")
 
 fil_min = oin.min_filtration(fil_2, fil_3)
 print("min fil ok")
@@ -52,8 +53,8 @@ def sample_data(num_points, noise_std_dev=0.1):
 # id_codomain: id of vertex at the bottom of the cylinder
 # i.e, we multiply fil_min with id_codomain
 
-v0 = oin.Simplex(id_domain, [id_domain])
-v1 = oin.Simplex(id_codomain, [id_codomain])
+v0 = oin.Simplex([id_domain])
+v1 = oin.Simplex([id_codomain])
 fil_cyl = oin.mapping_cylinder(fil_3, fil_min, v0, v1)
 
 print("cyl ok")
@@ -61,10 +62,10 @@ def sample_data(num_points, noise_std_dev=0.1):
 fil_3_prod = oin.multiply_filtration(fil_3, v0)
 
 print("prod ok")
-params = oin.ReductionParams()
+params = oin.KICRParams()
 params.kernel = params.cokernel = True
 params.image = False
-params.verbose = True
+params.verbose = False
 
 print(f"{fil_cyl.max_dim() = }")
 print(f"{fil_cyl.size() = }")
@@ -72,7 +73,7 @@ def sample_data(num_points, noise_std_dev=0.1):
 print(f"{fil_3_prod.size() = }")
 
 hdim = 1
-kicr_reduction = oin.KerImCokReducedProd_double(fil_cyl, fil_3_prod, params)
+kicr_reduction = oin.compute_kernel_image_cokernel_reduction(fil_cyl, fil_3_prod, params)
 print("kicr ok")
 ker_dgms = kicr_reduction.kernel_diagrams()
 coker_dgms = kicr_reduction.cokernel_diagrams()
@@ -83,19 +84,19 @@ def sample_data(num_points, noise_std_dev=0.1):
 
     # birth: comes from codomain (cylinder) complex
     # can be either in fil_3 or in fil_cyl
-    c_birth = fil_cyl.get_cell(b)
+    c_birth = fil_cyl[b]
     birth_simplex = c_birth.cell_1
     if c_birth.cell_2 == v1:
-        true_birth_simplex = fil_min.cell_by_uid(birth_simplex.get_uid())
+        true_birth_simplex = fil_min.cell_by_uid(birth_simplex.uid)
     else:
-        true_birth_simplex = fil_3.cell_by_uid(birth_simplex.get_uid())
+        true_birth_simplex = fil_3.cell_by_uid(birth_simplex.uid)
 
     # death: comes from included complex L, i.e., fil_3
     if d < fil_cyl.size():
         # point is finite
         c_death = fil_cyl.get_cell(d)
         death_simplex = c_death.cell_1
-        true_death_simplex = fil_3.cell_by_uid(death_simplex.get_uid())
+        true_death_simplex = fil_3.cell_by_uid(death_simplex.uid)
         print(f"{true_birth_simplex = }, {true_death_simplex = }")
 
     # true_birth_simplex.sorted_id and true_death_simplex.sorted_id are indices
@@ -109,7 +110,7 @@ def sample_data(num_points, noise_std_dev=0.1):
 
     # birth: comes from codomain (cylinder) complex
     # can be either in fil_3 or in fil_cyl
-    c_birth = fil_cyl.get_cell(b)
+    c_birth = fil_cyl[b]
     birth_simplex = c_birth.cell_1
     if c_birth.cell_2 == v1:
         true_birth_simplex = fil_min.cell_by_uid(birth_simplex.get_uid())
@@ -119,7 +120,7 @@ def sample_data(num_points, noise_std_dev=0.1):
     # death: comes from included complex L, i.e., fil_3
     if d < fil_cyl.size():
         # point is finite
-        c_death = fil_cyl.get_cell(d)
+        c_death = fil_cyl[d]
         death_simplex = c_death.cell_1
         if c_death.cell_2 == v1:
             true_death_simplex = fil_min.cell_by_uid(death_simplex.get_uid())
diff --git a/bindings/python/example_manual.py b/examples/python/example_manual.py
similarity index 80%
rename from bindings/python/example_manual.py
rename to examples/python/example_manual.py
index afb07e0..7b68825 100755
--- a/bindings/python/example_manual.py
+++ b/examples/python/example_manual.py
@@ -10,16 +10,16 @@
 # edges: e1 = [v0, v1]
 #
 
-v0 = oin.Simplex_double([0], 0.2)
-v1 = oin.Simplex_double([1], 0.1)
-v2 = oin.Simplex_double([2], 0.3)
+v0 = oin.Simplex([0], 0.2)
+v1 = oin.Simplex([1], 0.1)
+v2 = oin.Simplex([2], 0.3)
 
 # indices of vertices should be sorted
-e1 = oin.Simplex_double([0, 1], 0.9)
-e2 = oin.Simplex_double([0, 2], 0.5)
-e3 = oin.Simplex_double([1, 2], 0.8)
+e1 = oin.Simplex([0, 1], 0.9)
+e2 = oin.Simplex([0, 2], 0.5)
+e3 = oin.Simplex([1, 2], 0.8)
 
-t = oin.Simplex_double([0, 1, 2], 1.0)
+t = oin.Simplex([0, 1, 2], 1.0)
 
 simplices = [v0, v1, v2, e1, e2, e3, t]
 
@@ -27,7 +27,7 @@
 n_threads = 1
 
 # constructor will sort simplices and assign sorted_ids
-fil = oin.Filtration_double(simplices, negate, n_threads)
+fil = oin.Filtration(simplices, negate, n_threads)
 
 fil_simplices = fil.simplices()
 
diff --git a/bindings/python/example_opt.py b/examples/python/example_opt.py
similarity index 82%
rename from bindings/python/example_opt.py
rename to examples/python/example_opt.py
index 8a9697f..c0543a3 100644
--- a/bindings/python/example_opt.py
+++ b/examples/python/example_opt.py
@@ -21,7 +21,7 @@ def evaluate_func(x, y, z):
 # Evaluate the function on the grid
 f = evaluate_func(X, Y, Z)
 
-fil, max_value_vertices = oin.get_freudenthal_filtration_and_critical_vertices(f, negate=False, wrap=False, max_dim=2, n_threads=8)
+fil, max_value_vertices = oin.freudenthal_filtration(f, with_critical_vertices=True)
 
 top_opt = oin.TopologyOptimizer(fil)
 
@@ -35,4 +35,4 @@ def evaluate_func(x, y, z):
 crit_indices, crit_values = top_opt.combine_loss(critical_sets, oin.ConflictStrategy.Max)
 
 crit_indices = np.array(crit_indices, dtype=np.int32)
-crit_vertices = max_value_vertices[crit_indices]
\ No newline at end of file
+crit_vertices = max_value_vertices[crit_indices]
diff --git a/bindings/python/example_opt_vr.py b/examples/python/example_opt_vr.py
similarity index 91%
rename from bindings/python/example_opt_vr.py
rename to examples/python/example_opt_vr.py
index f6a4bca..91bfa40 100644
--- a/bindings/python/example_opt_vr.py
+++ b/examples/python/example_opt_vr.py
@@ -29,7 +29,7 @@ def sample_data():
 
 def topological_loss(pts: torch.Tensor, dim: int=1, n: int=2):
     pts_as_numpy = pts.clone().detach().numpy().astype(np.float64)
-    fil, longest_edges = oin.get_vr_filtration_and_critical_edges(pts_as_numpy, max_dim=2, max_radius=9.0, n_threads=1)
+    fil, edges = oin.vr_filtration(pts_as_numpy, with_critical_edges=True, n_threads=4)
     top_opt = oin.TopologyOptimizer(fil)
 
     # dgm = top_opt.compute_diagram(include_inf_points=False)
@@ -45,12 +45,12 @@ def topological_loss(pts: torch.Tensor, dim: int=1, n: int=2):
 
     crit_indices = np.array(crit_indices, dtype=np.int32)
 
-    dgm_method_edges = longest_edges[indices, :]
+    dgm_method_edges = edges[indices, :]
     dgm_method_edges_x, dgm_method_edges_y = dgm_method_edges[:, 0], dgm_method_edges[:, 1]
 
     dgm_method_values = torch.Tensor(values)
 
-    crit_method_edges = longest_edges[crit_indices, :]
+    crit_method_edges = edges[crit_indices, :]
     crit_method_edges_x, crit_method_edges_y = crit_method_edges[:, 0], crit_method_edges[:, 1]
 
     crit_method_values = torch.Tensor(crit_method_values)
@@ -69,7 +69,7 @@ def topological_loss(pts: torch.Tensor, dim: int=1, n: int=2):
 
 if __name__ == "__main__":
     draw = False
-    use_critical_sets = False
+    use_critical_sets = True
 
     pts = sample_data()
 
diff --git a/bindings/python/example_vr.py b/examples/python/example_vr.py
similarity index 50%
rename from bindings/python/example_vr.py
rename to examples/python/example_vr.py
index 5cc87d6..ad64c9c 100755
--- a/bindings/python/example_vr.py
+++ b/examples/python/example_vr.py
@@ -2,35 +2,62 @@
 
 import numpy as np
 import oineus as oin
+import time
+import sys
+
+from icecream import ic
+
 
 # random points in space
 np.random.seed(1)
-n_points = 20
+n_points = 50
 dim = 3
 points = np.random.uniform(size=(n_points, dim))
 
-# triangulate domain via Freudenthal and create lower star filtration
-# negate: set to True to get upper-star filtration
-# wrap: set to True to work on torus (periodic boundary conditions)
-fil = oin.get_vr_filtration(points=points, max_dim=3, max_radius=2, n_threads=1)
+# create Vietoris--Rips filtration
+n_threads = 8
+fil = oin.vr_filtration(points, n_threads=n_threads)
+
+# we can specify max_dim and max_diameter manually,
+# if we need to.
+# max_dim default value is the dimension of the points,
+# max_diameter default value is same minimax as in Ripser (after that the complex becomes contractible)
+
+fil = oin.vr_filtration(points, max_dim=1, max_diameter=0.5)
+
+# if we want to get parallel array of critical edges,
+# just specify with_critical_edges = True
+# function will return a tuple: (filtration, edges)
 
+fil, edges = oin.vr_filtration(points, with_critical_edges=True, n_threads=n_threads)
+
+# edges is NumPy array of shape (N, 2)
+# it contains indices of simplices in the filtration order (sorted_ids)
+
+# we can subscript filtration
+print(f"Filtration with {len(fil)} cells created,\nvertex 0: {fil[0]}, edge {edges[0]},\nlast simplex: {fil[-1]}, edge {edges[-1]}")
+
+# we can also get a copy of sorted cells:
 cells = fil.cells()
 
-# Vertices in cells are ids, not sorted_ids
-print(f"Filtration with {len(cells)} cells created,\nvertex 0: {cells[0]},\nlast simplex: {cells[-1]}")
+# we can access boundary matrix
+bm = fil.boundary_matrix()
 
-# no cohomology
-dualize = False
 # create VRU decomposition object, does not perform reduction yet
+# we want to use cohomology:
+dualize = True
 dcmp = oin.Decomposition(fil, dualize)
 
+
 # reduction parameters
 # relevant members:
 # rp.clearing_opt --- whether you want to use clearing, True by default
-# rp.compute_v: True by default
+# rp.compute_v: False by default
 # rp.n_threads: number of threads to use, default is 1
 # rp. compute_u: False by default (cannot do it in multi-threaded mode, so switch off just to be on the safe side)
 rp = oin.ReductionParams()
+rp.compute_v = True
+rp.n_threads = 4
 
 # perform reduction
 dcmp.reduce(rp)
diff --git a/bindings/python/example_wasserstein_opt.py b/examples/python/example_wasserstein_opt.py
similarity index 100%
rename from bindings/python/example_wasserstein_opt.py
rename to examples/python/example_wasserstein_opt.py
diff --git a/examples/visualize_critical_set_alpha.py b/examples/python/visualize_critical_set_alpha.py
similarity index 100%
rename from examples/visualize_critical_set_alpha.py
rename to examples/python/visualize_critical_set_alpha.py
diff --git a/examples/visualize_critical_set_vr.py b/examples/python/visualize_critical_set_vr.py
similarity index 97%
rename from examples/visualize_critical_set_vr.py
rename to examples/python/visualize_critical_set_vr.py
index b94265c..4c42293 100644
--- a/examples/visualize_critical_set_vr.py
+++ b/examples/python/visualize_critical_set_vr.py
@@ -51,13 +51,11 @@ def sample_epycycles(R=5.0, r_min=0.6, r_max=0.8, num_small_circles=6, num_point
 
 print(f"{input_points.shape = }")
 
-max_dim = 2
-max_radius = 20.0 * high_r
-n_threads = 1
+n_threads = 4
 
 init_dim = 1
 
-fil, crit_edges = oin.get_vr_filtration_and_critical_edges(input_points, max_dim, max_radius, n_threads)
+fil, crit_edges = oin.vr_filtration(input_points, with_critical_edges=True, n_threads=n_threads)
 
 print(f"{fil.size() = }")
 
@@ -122,7 +120,7 @@ def get_triangles_crit(x, y, change_coord, dim):
 
     triangles = []
     for t_idx in simplex_inds:
-        t_vertices = fil.get_cell(t_idx).vertices
+        t_vertices = fil[t_idx].vertices
         triangles.append((input_points[t_vertices[0]], input_points[t_vertices[1]], input_points[t_vertices[2]]))
 
     return triangles
@@ -141,7 +139,7 @@ def get_triangles_dgm(x, y, change_coord, dim):
     triangles = []
 
     for t_idx in [death_idx]:
-        t_vertices = fil.get_cell(t_idx).vertices
+        t_vertices = fil[t_idx].vertices
         triangles.append((input_points[t_vertices[0]], input_points[t_vertices[1]], input_points[t_vertices[2]]))
 
     return triangles
diff --git a/extern/taskflow/core/atomic_notifier.hpp b/extern/taskflow/core/atomic_notifier.hpp
index 6fcc18b..db8cb1e 100644
--- a/extern/taskflow/core/atomic_notifier.hpp
+++ b/extern/taskflow/core/atomic_notifier.hpp
@@ -55,14 +55,14 @@ class AtomicNotifier {
 
 inline void AtomicNotifier::notify_one() noexcept {
   uint64_t prev = _state.fetch_add(EPOCH_INC, std::memory_order_acq_rel);
-  if((prev & WAITER_MASK))  { // has waiter (typically unlikely)
+  if(TF_UNLIKELY(prev & WAITER_MASK))  { // has waiter (typically unlikely)
     _state.notify_one();
   }
 }
 
 inline void AtomicNotifier::notify_all() noexcept {
   uint64_t prev = _state.fetch_add(EPOCH_INC, std::memory_order_acq_rel);
-  if((prev & WAITER_MASK))  { // has waiter (typically unlikely)
+  if(TF_UNLIKELY(prev & WAITER_MASK))  { // has waiter (typically unlikely)
     _state.notify_all();
   }
 }
@@ -148,20 +148,22 @@ class AtomicNotifierV2 {
   std::vector<Waiter> _waiters;
 
   static constexpr uint64_t WAITER_INC  {1};
-  static constexpr size_t   EPOCH_SHIFT {32};
+  static constexpr uint64_t EPOCH_SHIFT {32};
   static constexpr uint64_t EPOCH_INC   {uint64_t(1) << EPOCH_SHIFT};
   static constexpr uint64_t WAITER_MASK {EPOCH_INC - 1};
 };
 
 inline void AtomicNotifierV2::notify_one() noexcept {
-  if((_state.load(std::memory_order_seq_cst) & WAITER_MASK) != 0) {
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  if((_state.load(std::memory_order_acquire) & WAITER_MASK) != 0) {
     _state.fetch_add(EPOCH_INC, std::memory_order_release);
     _state.notify_one(); 
   }
 }
 
 inline void AtomicNotifierV2::notify_all() noexcept {
-  if((_state.load(std::memory_order_seq_cst) & WAITER_MASK) != 0) {
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  if((_state.load(std::memory_order_acquire) & WAITER_MASK) != 0) {
     _state.fetch_add(EPOCH_INC, std::memory_order_release);
     _state.notify_all(); 
   }
@@ -183,8 +185,10 @@ inline size_t AtomicNotifierV2::size() const noexcept {
 }
 
 inline void AtomicNotifierV2::prepare_wait(Waiter* waiter) noexcept {
-  uint64_t prev = _state.fetch_add(WAITER_INC, std::memory_order_acquire);
-  waiter->epoch = prev + WAITER_INC; 
+  //uint64_t prev = _state.fetch_add(WAITER_INC, std::memory_order_acquire);
+  //waiter->epoch = prev + WAITER_INC; 
+  waiter->epoch = _state.fetch_add(WAITER_INC, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
 }
 
 inline void AtomicNotifierV2::cancel_wait(Waiter*) noexcept {
@@ -192,12 +196,12 @@ inline void AtomicNotifierV2::cancel_wait(Waiter*) noexcept {
 }
 
 inline void AtomicNotifierV2::commit_wait(Waiter* waiter) noexcept {
-  _state.wait(waiter->epoch, std::memory_order_relaxed);
+  _state.wait(waiter->epoch, std::memory_order_seq_cst);
   _state.fetch_sub(WAITER_INC, std::memory_order_relaxed);
 }
 
 
 
-} // namespace taskflow
+} // namespace taskflow -------------------------------------------------------
 
 #endif
diff --git a/extern/taskflow/core/executor.hpp b/extern/taskflow/core/executor.hpp
index b2be9be..bba8243 100644
--- a/extern/taskflow/core/executor.hpp
+++ b/extern/taskflow/core/executor.hpp
@@ -1408,7 +1408,7 @@ inline size_t Executor::num_observers() const noexcept {
 // Procedure: _schedule
 inline void Executor::_schedule(Worker& worker, Node* node) {
   
-  node->_state.fetch_or(Node::READY, std::memory_order_release);
+  //node->_state.fetch_or(Node::READY, std::memory_order_release);
 
   // caller is a worker to this pool - starting at v3.5 we do not use
   // any complicated notification mechanism as the experimental result
@@ -1426,7 +1426,7 @@ inline void Executor::_schedule(Worker& worker, Node* node) {
 // Procedure: _schedule
 inline void Executor::_schedule(Node* node) {
   
-  node->_state.fetch_or(Node::READY, std::memory_order_release);
+  //node->_state.fetch_or(Node::READY, std::memory_order_release);
 
   _freelist.push(node);
   _notifier.notify_one();
@@ -1448,7 +1448,7 @@ inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes)
   // has shown no significant advantage.
   if(worker._executor == this) {
     for(size_t i=0; i<num_nodes; ++i) {
-      nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+      //nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
       worker._wsq.push(nodes[i], [&](){ _freelist.push(worker._id, nodes[i]); });
       _notifier.notify_one();
     }
@@ -1456,7 +1456,7 @@ inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes)
   }
 
   for(size_t k=0; k<num_nodes; ++k) {
-    nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+    //nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
     _freelist.push(nodes[k]);
   }
   _notifier.notify_n(num_nodes);
@@ -1473,7 +1473,7 @@ inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
   }
 
   for(size_t k=0; k<num_nodes; ++k) {
-    nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+    //nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
     _freelist.push(nodes[k]);
   }
 
@@ -1484,7 +1484,7 @@ inline void Executor::_schedule(const SmallVector<Node*>& nodes) {
 inline void Executor::_invoke(Worker& worker, Node* node) {
 
   // synchronize all outstanding memory operations caused by reordering
-  while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+  //while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
 
   begin_invoke:
   
diff --git a/extern/taskflow/core/graph.hpp b/extern/taskflow/core/graph.hpp
index 7ab77f6..4ab32dd 100644
--- a/extern/taskflow/core/graph.hpp
+++ b/extern/taskflow/core/graph.hpp
@@ -398,18 +398,8 @@ class Runtime {
   @brief co-runs the given target and waits until it completes
   
   A target can be one of the following forms:
-    + a subflow task to spawn a subflow or
     + a composable graph object with `tf::Graph& T::graph()` defined
 
-  @code{.cpp}
-  // co-run a subflow and wait until all tasks complete
-  taskflow.emplace([](tf::Runtime& rt){
-    rt.corun([](tf::Subflow& sf){
-      tf::Task A = sf.emplace([](){});
-      tf::Task B = sf.emplace([](){});
-    }); 
-  });
-  
   // co-run a taskflow and wait until all tasks complete
   tf::Taskflow taskflow1, taskflow2;
   taskflow1.emplace([](){ std::cout << "running taskflow1\n"; });
@@ -725,8 +715,7 @@ class Node {
   // state bit flag
   constexpr static int CONDITIONED = 1;
   constexpr static int DETACHED    = 2;
-  constexpr static int READY       = 4;
-  constexpr static int EXCEPTION   = 8;
+  constexpr static int EXCEPTION   = 4;
 
   using Placeholder = std::monostate;
 
@@ -1234,124 +1223,6 @@ Node* Graph::_emplace_back(ArgsT&&... args) {
   return _nodes.back();
 }
 
-// ============================================================================
-// Freelist
-// ============================================================================
-//
-///**
-//@private
-//*/
-//template <typename T>
-//class Freelist {
-//
-//  struct HeadPtr {
-//    T* ptr  {nullptr};
-//    int tag {0};
-//  };
-//
-//  public:
-//
-//  void push(T* node) {
-//    HeadPtr c_head = _head.load(std::memory_order_relaxed);
-//    HeadPtr n_head = { node };
-//    do {
-//      n_head.tag = c_head.tag + 1;
-//      node->_freelist_next = c_head.ptr;
-//    } while(!_head.compare_exchange_weak(c_head, n_head, 
-//                                         std::memory_order_release, 
-//                                         std::memory_order_relaxed));
-//  }
-//
-//  T* pop() {
-//    HeadPtr c_head = _head.load(std::memory_order_acquire);
-//    HeadPtr n_head;  // new head
-//    while (c_head.ptr != nullptr) {
-//      n_head.ptr = c_head.ptr->_freelist_next;
-//      n_head.tag = c_head.tag + 1;
-//      if(_head.compare_exchange_weak(c_head, n_head, 
-//                                     std::memory_order_release, 
-//                                     std::memory_order_acquire)) {
-//        break;
-//      }
-//    }
-//    return c_head.ptr;
-//  }
-//  
-//  T* steal() {
-//    HeadPtr c_head = _head.load(std::memory_order_acquire);
-//    HeadPtr n_head;  // new head
-//
-//    if(c_head.ptr != nullptr) {
-//      // TODO: bug - here c_head.ptr may die already so accessing its freelist next
-//      // will cause memory segmentation fault
-//      n_head.ptr = c_head.ptr->_freelist_next;
-//      n_head.tag = c_head.tag + 1;
-//      if(_head.compare_exchange_weak(c_head, n_head, 
-//                                     std::memory_order_release, 
-//                                     std::memory_order_relaxed) == false) {
-//        return nullptr;
-//      }
-//    }
-//    return c_head.ptr;
-//  }
-//
-//  bool empty() const {
-//    return _head.load(std::memory_order_relaxed).ptr == nullptr;
-//  }
-//
-//  private:
-//
-//  //static_assert(std::atomic<HeadPtr>::is_always_lock_free);
-//
-//  std::atomic<HeadPtr> _head;
-//};
-//
-///**
-//@private
-//*/
-//template <typename T>
-//class Freelists {
-//
-//  public:
-//
-//  Freelists(size_t W) : _heads(W) {
-//  }
-//
-//  void push(size_t w, T* node) {
-//    // assert(w < _heads.size());
-//    _heads[w].push(node);  
-//  }
-//
-//  void push(T* node) {
-//    _heads[reinterpret_cast<uintptr_t>(node) % _heads.size()].push(node);
-//  }
-//
-//  T* steal(size_t w) {
-//    // assert(w < _heads.size());
-//    for(size_t i=0; i<_heads.size(); i++, w=(w+1)%_heads.size()) {
-//      if(T* ptr = _heads[w].steal(); ptr) {
-//        return ptr;
-//      }
-//    }
-//    return nullptr;
-//  }
-//
-//  bool empty() const {
-//    for(const auto& h : _heads) {
-//      if(!h.empty()) {
-//        return false;
-//      }
-//    }
-//    return true;
-//  }
-//
-//  private:
-//
-//  std::vector<Freelist<T>> _heads;
-//
-//};
-
-
 }  // end of namespace tf. ----------------------------------------------------
 
 
diff --git a/extern/taskflow/core/nonblocking_notifier.hpp b/extern/taskflow/core/nonblocking_notifier.hpp
index a806b43..ddf8700 100644
--- a/extern/taskflow/core/nonblocking_notifier.hpp
+++ b/extern/taskflow/core/nonblocking_notifier.hpp
@@ -542,7 +542,5 @@ class NonblockingNotifierV2 {
 };
 
 
-
-
 }  // namespace tf ------------------------------------------------------------
 
diff --git a/extern/taskflow/core/tsq.hpp b/extern/taskflow/core/tsq.hpp
index e39a713..2ed7991 100644
--- a/extern/taskflow/core/tsq.hpp
+++ b/extern/taskflow/core/tsq.hpp
@@ -189,7 +189,9 @@ void UnboundedTaskQueue<T>::push(T o) {
 
   a->push(b, o);
   std::atomic_thread_fence(std::memory_order_release);
-  _bottom.store(b + 1, std::memory_order_relaxed);
+
+  // original paper uses relaxed here but tsa complains
+  _bottom.store(b + 1, std::memory_order_release);
 }
 
 // Function: pop
@@ -401,7 +403,9 @@ bool BoundedTaskQueue<T, LogSize>::try_push(O&& o) {
   _buffer[b & BufferMask].store(std::forward<O>(o), std::memory_order_relaxed);
 
   std::atomic_thread_fence(std::memory_order_release);
-  _bottom.store(b + 1, std::memory_order_relaxed);
+  
+  // original paper uses relaxed here but tsa complains
+  _bottom.store(b + 1, std::memory_order_release);
 
   return true;
 }
@@ -423,7 +427,9 @@ void BoundedTaskQueue<T, LogSize>::push(O&& o, C&& on_full) {
   _buffer[b & BufferMask].store(std::forward<O>(o), std::memory_order_relaxed);
 
   std::atomic_thread_fence(std::memory_order_release);
-  _bottom.store(b + 1, std::memory_order_relaxed);
+  
+  // original paper uses relaxed here but tsa complains
+  _bottom.store(b + 1, std::memory_order_release);
 }
 
 // Function: pop
diff --git a/extern/taskflow/core/worker.hpp b/extern/taskflow/core/worker.hpp
index 1c8ed35..639ad5c 100644
--- a/extern/taskflow/core/worker.hpp
+++ b/extern/taskflow/core/worker.hpp
@@ -21,7 +21,7 @@ namespace tf {
 @private
 */
 #ifdef TF_ENABLE_ATOMIC_NOTIFIER
-  using DefaultNotifier = AtomicNotifierV2;
+  using DefaultNotifier = AtomicNotifier;
 #else
   using DefaultNotifier = NonblockingNotifierV2;
 #endif
diff --git a/extern/taskflow/taskflow.hpp b/extern/taskflow/taskflow.hpp
index 08b1e25..7b8d203 100644
--- a/extern/taskflow/taskflow.hpp
+++ b/extern/taskflow/taskflow.hpp
@@ -61,7 +61,7 @@ Unbounded task queue is used by the executor.
 /**
 @def TF_VERSION 
 
-@brief version of the %Taskflow (currently 3.8.0)
+@brief version of the %Taskflow (currently 3.9.0)
 
 The version system is made of a major version number, a minor version number,
 and a patch number:
@@ -69,7 +69,7 @@ and a patch number:
   + TF_VERSION / 100 % 1000 is the minor version
   + TF_VERSION / 100000 is the major version
 */
-#define TF_VERSION 300800
+#define TF_VERSION 300900
 
 /**
 @def TF_MAJOR_VERSION
@@ -111,7 +111,7 @@ namespace detail { }
 Release notes are available here: https://taskflow.github.io/taskflow/Releases.html
 */
 constexpr const char* version() {
-  return "3.8.0";
+  return "3.9.0";
 }
 
 
diff --git a/include/oineus/cell_with_value.h b/include/oineus/cell_with_value.h
index 1d1fe24..cc34a1f 100644
--- a/include/oineus/cell_with_value.h
+++ b/include/oineus/cell_with_value.h
@@ -15,7 +15,7 @@ struct CellWithValue {
 
     using Int = typename Cell::Int;
     using Uid = typename Cell::Uid;
-    using UidHasher = typename Cell::UidHasher;
+    using UidSet = typename Cell::UidSet;
     using Boundary = typename Cell::Boundary;
 
     static constexpr Int k_invalid_id = Int(-1);
@@ -35,7 +35,7 @@ struct CellWithValue {
 //            : cell_(std::forward(args)...), value_(value) { }
 
     CellWithValue(const Cell& cell, Real value) : cell_(cell), value_(value) {}
-    CellWithValue(Cell&& cell, Real value) : cell_(cell), value_(value) {}
+    CellWithValue(Cell&& cell, Real value) : cell_(std::move(cell)), value_(value) {}
 
     dim_type dim() const { return cell_.dim(); }
 
@@ -43,13 +43,13 @@ struct CellWithValue {
     void set_value(Real new_value) { value_ = new_value; }
 
     Int get_id() const { return cell_.get_id(); }
-    void set_id(Int new_id) { return cell_.set_id(new_id); }
+    void set_id(Int new_id) { cell_.set_id(new_id); }
 
     Int get_sorted_id() const { return sorted_id_; }
     void set_sorted_id(Int sorted_id) { sorted_id_ = sorted_id; }
 
     Uid get_uid() const { return cell_.get_uid(); }
-    void set_uid(const Uid& new_uid) { return cell_.set_uid(new_uid); }
+    void set_uid() { cell_.set_uid(); }
 
     const Cell& get_cell() const { return cell_; }
 
diff --git a/include/oineus/common_defs.h b/include/oineus/common_defs.h
index 6c43359..33370d6 100644
--- a/include/oineus/common_defs.h
+++ b/include/oineus/common_defs.h
@@ -8,15 +8,8 @@
 #include <sstream>
 #include <limits>
 
-#ifdef OINEUS_USE_CALIPER
-#include <caliper/cali.h>
-#else
-#define CALI_CXX_MARK_FUNCTION
-#define CALI_MARK_BEGIN(x) x;
-#define CALI_MARK_END(x) x;
-#endif
-
 #include "log_wrapper.h"
+#include "profile.h"
 
 namespace oineus {
 
diff --git a/include/oineus/decomposition.h b/include/oineus/decomposition.h
index 67595a7..75100a3 100644
--- a/include/oineus/decomposition.h
+++ b/include/oineus/decomposition.h
@@ -16,7 +16,6 @@
 #include <cstdlib>
 #include <stdexcept>
 
-
 #include "common_defs.h"
 #include "timer.h"
 #include "diagram.h"
@@ -31,10 +30,66 @@ namespace oineus {
     using Idx = int;
     using AtomicIdxVector = std::vector<std::atomic<Idx>>;
 
+    template<class MatrixTraits, class MemoryReclaimC>
+    void update_column(bool& needs_update,
+            MemoryReclaimC* mm,
+            typename MatrixTraits::CachedColumn& cached_reduced_col,
+            typename MatrixTraits::PColumn orig_col,
+            typename MatrixTraits::AMatrix& rv,
+            Idx current_column_idx)
+    {
+        if (needs_update) {
+            // orig_col and new_col can be nullptr for zero columns, other functions can handle that
+            typename MatrixTraits::PColumn new_col = MatrixTraits::load_from_cache(cached_reduced_col);
+
+            assert(MatrixTraits::check_col_duplicates(new_col).empty());
+
+            rv[current_column_idx].store(new_col, std::memory_order_seq_cst);
+
+            mm->retire(orig_col);
+        }
+        needs_update = false;
+    }
+
+    template<class Int, class PLogger>
+    void get_next_chunk(const bool clearing, std::atomic<int>& next_free_chunk, const int chunk_size, const int n_cols,
+                        int&my_chunk, int& chunk_begin, int& chunk_end, bool& done, PLogger& logger,
+                        // these are only used if clearing is true
+                        std::vector<std::atomic<int>>& next_free_chunks, const std::vector<Int>& dim_first, const std::vector<Int>& dim_last, Int& current_dim)
+    {
+        static_assert(std::is_signed_v<Int>, "Int must be signed: current_dim can go below 0 to signal done");
+        done = false;
+        if (clearing) {
+            while(true) {
+                if (current_dim < 0) {
+                    done = true;
+                    return;
+                }
+                my_chunk = next_free_chunks[current_dim].fetch_add(1, std::memory_order_seq_cst);
+                chunk_begin = dim_first.at(current_dim) + my_chunk * chunk_size;
+                if (chunk_begin > dim_last.at(current_dim)) {
+                    current_dim--;
+                    continue;
+                }
+                chunk_end = std::min(dim_last.at(current_dim) + 1, dim_first.at(current_dim) + (my_chunk + 1) * chunk_size);
+                logger->debug("exiting get_next_chunk, clearing = {}, done = {}, my_chunk = {}, chunk_begin = {}, chunk_end = {}, current_dim ={}", clearing, done, my_chunk, chunk_begin, chunk_end, current_dim);
+                break;
+            }
+        } else {
+            my_chunk = next_free_chunk.fetch_add(1, std::memory_order_seq_cst);
+            chunk_begin = my_chunk * chunk_size;
+            chunk_end = std::min(n_cols, (my_chunk + 1) * chunk_size);
+            done = (chunk_begin >= n_cols || chunk_end <= 0);
+            logger->debug("exiting get_next_chunk, clearing = {}, done = {}, my_chunk = {}, chunk_begin = {}, chunk_end = {}", clearing, done, my_chunk, chunk_begin, chunk_end);
+        }
+    }
+
     template<class MatrixTraits, class Int, class MemoryReclaimC>
-    void parallel_reduction(typename MatrixTraits::AMatrix& rv, AtomicIdxVector& pivots, std::atomic<Int>& next_free_chunk,
-            const Params params, int thread_idx, MemoryReclaimC* mm, ThreadStats& stats, bool go_down)
+    void parallel_reduction(typename MatrixTraits::AMatrix& rv, AtomicIdxVector& pivots, std::atomic<int>& next_free_chunk,
+            const Params params, int thread_idx, MemoryReclaimC* mm, ThreadStats& stats,
+            std::vector<std::atomic<int>>& next_free_chunks, std::vector<Int>& dim_first, std::vector<Int>& dim_last)
     {
+        // set logger
         bool log_each_thread_to_file = true;
         std::shared_ptr<spd::logger> logger;
         std::string logger_name = "oineus_log_thread_" + std::to_string(thread_idx);
@@ -51,32 +106,30 @@ namespace oineus {
         using Column = typename MatrixTraits::Column;
         using PColumn = typename MatrixTraits::PColumn;
 
-        std::memory_order acq = params.acq_rel ? std::memory_order_acquire : std::memory_order_seq_cst;
-        std::memory_order rel = params.acq_rel ? std::memory_order_release : std::memory_order_seq_cst;
-        std::memory_order relax = params.acq_rel ? std::memory_order_relaxed : std::memory_order_seq_cst;
+        std::memory_order acq = std::memory_order_seq_cst;
+        std::memory_order rel =  std::memory_order_seq_cst;
+        std::memory_order relax = std::memory_order_seq_cst;
 
-        logger->debug("thread {} started, mm = {}", thread_idx, (void*) (mm));
+        logger->debug("thread {} started, mm = {}, next_free_chunk = {}", thread_idx, (void*) (mm), (void*)(&next_free_chunk));
 
         const int n_cols = rv.size();
 
-        do {
-            int my_chunk;
+        int my_chunk, chunk_begin, chunk_end;
+        Int current_dim = params.clearing_opt ? dim_first.size() - 1 : 0;
+        bool done;
 
-            if (go_down) {
-                my_chunk = next_free_chunk--;
-            } else {
-                my_chunk = next_free_chunk++;
-            }
-
-            int chunk_begin = std::max(0, my_chunk * params.chunk_size);
-            int chunk_end = std::min(n_cols, (my_chunk + 1) * params.chunk_size);
+        do {
+            get_next_chunk(params.clearing_opt, next_free_chunk, params.chunk_size, n_cols,
+                        my_chunk, chunk_begin, chunk_end, done, logger,
+                        // the following are only used if clearing is true
+                        next_free_chunks, dim_first, dim_last, current_dim);
 
-            if (chunk_begin >= n_cols || chunk_end <= 0) {
+            if (done) {
                 logger->debug("Thread {} finished", thread_idx);
                 break;
             }
 
-            logger->debug("thread {}, processing chunk {}, from {} to {}, n_cols = {}", thread_idx, my_chunk, chunk_begin, chunk_end, n_cols);
+            logger->debug("thread {}, processing chunk {}, from {} to {}, n_cols = {}, current_dim = {}", thread_idx, my_chunk, chunk_begin, chunk_end, n_cols, current_dim);
 
             Idx current_column_idx = chunk_begin;
             int next_column = current_column_idx + 1;
@@ -90,6 +143,12 @@ namespace oineus {
                 logger->debug("thread {}, started reducing column = {}", thread_idx, current_column_idx);
 
                 PColumn orig_col = rv[current_column_idx].load(acq);
+
+                if (orig_col == nullptr)
+                    continue;
+
+                assert(MatrixTraits::check_col_duplicates(orig_col).empty());
+
                 auto cached_reduced_col = MatrixTraits::load_to_cache(orig_col);
 
 #ifndef NDEBUG
@@ -104,13 +163,11 @@ namespace oineus {
                             int c_current_low = MatrixTraits::low(cached_reduced_col);
                             Idx c_current_column_idx = current_column_idx;
 
-                            pivots[c_current_low].compare_exchange_weak(c_current_column_idx, -1, rel, relax);
+                            pivots[c_current_low].compare_exchange_strong(c_current_column_idx, -1, rel, relax);
                             // if CAS fails here, it's totally fine, just means
                             // that this column is not set as pivot
 
-                            // zero current column
-                            auto zero_col = new Column();
-                            rv[current_column_idx].store(zero_col, rel);
+                            rv[current_column_idx].store(nullptr, rel);
                             mm->retire(orig_col);
 
                             stats.n_cleared++;
@@ -128,7 +185,7 @@ namespace oineus {
                     }
                 }
 
-                bool update_column = false;
+                bool needs_update = false;
                 bool start_over = false;
 
                 Idx pivot_idx;
@@ -145,11 +202,14 @@ namespace oineus {
                         if (pivot_idx >= 0) {
                             pivot_col = rv[pivot_idx].load(acq);
                         }
-                    }
-                    while(pivot_idx >= 0 && MatrixTraits::low(pivot_col) != current_low);
+                    } while(pivot_idx >= 0 && pivot_col != nullptr && MatrixTraits::low(pivot_col) != current_low);
 
                     logger->debug("thread {}, column = {}, loaded pivot column, pivot_idx = {}", thread_idx, current_column_idx, pivot_idx);
+
                     if (pivot_idx == -1) {
+
+                        update_column<MatrixTraits, MemoryReclaimC>(needs_update, mm, cached_reduced_col, orig_col, rv, current_column_idx);
+
                         if (!pivots[current_low].compare_exchange_weak(pivot_idx, current_column_idx, rel, relax)) {
                             start_over = true;
                         }
@@ -163,24 +223,16 @@ namespace oineus {
 #endif
                         // pivot to the left: kill lowest one in current column
                         MatrixTraits::add_to_cached(pivot_col, cached_reduced_col);
-
-                        logger->debug("thread {}, column = {}, added pivot to the left OK, size = {}", thread_idx, current_column_idx, MatrixTraits::r_column_size(cached_reduced_col));
-                        update_column = true;
+                        logger->debug("thread {}, column = {}, added pivot to the left OK", thread_idx, current_column_idx);
+                        needs_update = true;
                     } else if (pivot_idx > current_column_idx) {
-
-                        stats.n_right_pivots++;
-
                         // pivot to the right: switch to reducing r[pivot_idx]
-                        if (update_column) {
-                            // create copy of reduced column and write in into matrix
-                            PColumn new_col = MatrixTraits::load_from_cache(cached_reduced_col);
-                            rv[current_column_idx].store(new_col, rel);
-                            // original column can be deleted
-                            mm->retire(orig_col);
-                            update_column = false;
-                        }
 
                         logger->debug("Pivot to the right, current_column_idx = {}, next_column = {}, pivot_idx = {}", current_column_idx, next_column, pivot_idx);
+                        stats.n_right_pivots++;
+
+                        // store current column, if needed
+                        update_column<MatrixTraits, MemoryReclaimC>(needs_update, mm, cached_reduced_col, orig_col, rv, current_column_idx);
 
                         // set current column as new pivot, start reducing column r[pivot_idx]
                         if (pivots[current_low].compare_exchange_weak(pivot_idx, current_column_idx, rel, relax)) {
@@ -196,16 +248,10 @@ namespace oineus {
                     }
                 } // reduction loop
 
-                logger->debug("Exited reduction loop, update_column = {}, start_over = {}", update_column, start_over);
-
-                if (update_column and not start_over) {
-                    // write copy of reduced column to matrix
-                    PColumn col = MatrixTraits::load_from_cache(cached_reduced_col);
-                    rv[current_column_idx].store(col, rel);
-                    mm->retire(orig_col);
-                }
+                logger->debug("Exited reduction loop, needs_update = {}, start_over = {}", needs_update, start_over);
 
                 if (not start_over) {
+                    update_column<MatrixTraits, MemoryReclaimC>(needs_update, mm, cached_reduced_col, orig_col, rv, current_column_idx);
                     current_column_idx = next_column;
                     next_column = current_column_idx + 1;
                     logger->debug("not starting over, advanced to next column, current_column_idx = {}, next_column = {}, chunk_end = {}", current_column_idx, next_column, chunk_end);
@@ -213,7 +259,7 @@ namespace oineus {
                         logger->debug("exiting loop over chunk columns");
                         break;
                     }
-                } // else we start with the same current_column_idx re-reading
+                } // else we re-start with the same current_column_idx re-reading
                 // the column, because one of CAS operations failed
 
             } //loop over columns
@@ -225,9 +271,16 @@ namespace oineus {
                 throw std::runtime_error("some columns in chunk not processed");
             }
 #endif
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT_WITH_GIL
+            if (my_chunk % 100 == 0) {
+                logger->debug("checking for Ctrl-C in Python...");
+                OINEUS_CHECK_FOR_PYTHON_INTERRUPT_WITH_GIL;
+                logger->debug("checking for Ctrl-C in Python... done");
+            }
+#endif
             mm->quiescent();
-        }
-        while(true); // loop over chunks
+        } while(true); // loop over chunks
         logger->debug("thread {}, EXIT reduction, stats = {}", thread_idx, stats);
     }
 
@@ -259,9 +312,9 @@ namespace oineus {
         VRUDecomposition& operator=(const VRUDecomposition&) = default;
 
         template<class C, class R>
-        VRUDecomposition(const Filtration<C, R>& fil, bool _dualize)
+        VRUDecomposition(const Filtration<C, R>& fil, bool _dualize, int n_threads=8)
                 :
-                d_data(!_dualize ? fil.boundary_matrix_full() : fil.coboundary_matrix()),
+                d_data(_dualize ? fil.coboundary_matrix(n_threads) : fil.boundary_matrix(n_threads)),
                 r_data(d_data),
                 dualize_(_dualize),
                 dim_first(fil.dim_first()),
@@ -332,8 +385,6 @@ namespace oineus {
                 const typename Cell::UidSet& relative,
                 bool include_all, bool include_inf_points, bool only_zero_persistence) const;
 
-
-
         template<typename Cell, typename Real>
         Diagrams<Real> diagram(const Filtration<Cell, Real>& fil, bool include_inf_points) const;
 
@@ -505,6 +556,7 @@ namespace oineus {
     template<class Int>
     void VRUDecomposition<Int>::reduce(Params& params)
     {
+        CALI_CXX_MARK_FUNCTION;
         if (d_data.empty()) {
             is_reduced = true;
             return;
@@ -592,6 +644,12 @@ namespace oineus {
                         if (params.compute_u)
                             u_data_t[pivot].push_back(i);
                     }
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                    if (i % 1000 == 0) {
+                        OINEUS_CHECK_FOR_PYTHON_INTERRUPT_WITH_GIL;
+                    }
+#endif
                 } // reduction loop
                 MatrixTraits::load_from_cache(cached_r_col, r_data[i]);
                 if (params.compute_v)
@@ -626,23 +684,33 @@ namespace oineus {
         std::atomic<typename MemoryReclaimC::EpochCounter> counter;
         counter = 0;
 
-        std::atomic<Int> next_free_chunk;
+        std::atomic<int> next_free_chunk = 0;
+        std::vector<std::atomic<int>> next_free_chunks(dim_first.size());
+        for(auto& nfc : next_free_chunks) {
+            nfc.store(0, std::memory_order_relaxed);
+        }
+
+        const int n_threads = std::min(params.n_threads, std::max(1, static_cast<int>(n_cols / params.chunk_size)));
 
         AMatrix ar_matrix(n_cols);
+        AtomicIdxVector pivots(n_cols);
 
-        // move data to ar_matrix
-        for(size_t i = 0; i < n_cols; ++i) {
-            ar_matrix[i] = new Column(std::move(r_data[i]));
-        }
-        spd::debug("Matrix moved");
+        tf::Executor executor(n_threads);
 
-        AtomicIdxVector pivots(n_cols);
-        for(auto& p: pivots) {
-            p.store(-1, std::memory_order_relaxed);
+        // move data to ar_matrix and set pivots in parallel
+        {
+            tf::Taskflow taskflow_prepare;
+            taskflow_prepare.for_each_index((size_t)0, n_cols, (size_t)1,
+                    [this, &ar_matrix, &pivots](size_t col_idx) {
+                        pivots[col_idx].store(-1, std::memory_order_relaxed);
+                        ar_matrix[col_idx] = new Column(std::move(r_data[col_idx]));
+                        assert(MatrixTraits::check_col_duplicates(ar_matrix[col_idx]).empty());
+                    });
+            executor.run(taskflow_prepare).wait();
         }
+
         spd::debug("Pivots initialized");
 
-        int n_threads = std::min(params.n_threads, std::max(1, static_cast<int>(n_cols / params.chunk_size)));
 
         std::vector<std::thread> ts;
         std::vector<std::unique_ptr<MemoryReclaimC>> mms;
@@ -651,14 +719,6 @@ namespace oineus {
         mms.reserve(n_threads);
         stats.reserve(n_threads);
 
-        bool go_down = params.clearing_opt;
-
-        if (go_down) {
-            next_free_chunk = n_cols / params.chunk_size;
-        } else {
-            next_free_chunk = 0;
-        }
-
         Timer timer;
 
         for(int thread_idx = 0; thread_idx < n_threads; ++thread_idx) {
@@ -668,7 +728,8 @@ namespace oineus {
 
             ts.emplace_back(parallel_reduction<MatrixTraits, Int, MemoryReclaimC>,
                     std::ref(ar_matrix), std::ref(pivots), std::ref(next_free_chunk),
-                    params, thread_idx, mms[thread_idx].get(), std::ref(stats[thread_idx]), go_down);
+                    params, thread_idx, mms[thread_idx].get(), std::ref(stats[thread_idx]),
+                    std::ref(next_free_chunks), std::ref(dim_first), std::ref(dim_last));
 
 #ifdef __linux__
             cpu_set_t cpuset;
@@ -699,12 +760,19 @@ namespace oineus {
 #ifdef OINEUS_GATHER_ADD_STATS
         write_add_stats_file(stats);
 #endif
-
-        // write reduced matrix back, collect V matrix, mark as reduced
-        for(size_t i = 0; i < n_cols; ++i) {
-            auto p = ar_matrix[i].load(std::memory_order_relaxed);
-            r_data[i] = std::move(*p);
-            delete p;
+        {
+            tf::Taskflow taskflow_finish;
+            taskflow_finish.for_each_index((size_t)0, n_cols, (size_t)1,
+                    [this, &ar_matrix, &pivots](size_t col_idx) {
+                        auto p = ar_matrix[col_idx].load(std::memory_order_relaxed);
+                        if (p) {
+                            r_data[col_idx] = std::move(*p);
+                            delete p;
+                        } else {
+                            r_data[col_idx].clear();
+                        }
+                    });
+            executor.run(taskflow_finish).wait();
         }
 
         is_reduced = true;
@@ -713,14 +781,16 @@ namespace oineus {
     template<class Int>
     void VRUDecomposition<Int>::reduce_parallel_rv(Params& params)
     {
+        CALI_CXX_MARK_FUNCTION;
         using namespace std::placeholders;
 
-        int c = 0;
         size_t n_cols = size();
 
         if (n_cols == 0)
             return;
 
+        int n_threads = std::min(params.n_threads, std::max(1, static_cast<int>(n_cols / params.chunk_size)));
+
         v_data = std::vector<IntSparseColumn>(n_cols);
 
         using MatrixTraits = SimpleRVMatrixTraits<Int, 2>;
@@ -731,25 +801,30 @@ namespace oineus {
 
         RVMatrix r_v_matrix(n_cols);
 
-        // move data to r_v_matrix
-        for(size_t i = 0; i < n_cols; ++i) {
-            IntSparseColumn v_column = {static_cast<Int>(i)};
-            r_v_matrix[i] = new RVColumn(r_data[i], v_column);
-        }
-        spd::debug("Matrix moved");
-
         std::atomic<typename MemoryReclaimC::EpochCounter> counter;
         counter = 0;
 
-        std::atomic<Int> next_free_chunk;
+        std::atomic<int> next_free_chunk = 0;
+        std::vector<std::atomic<int>> next_free_chunks(dim_first.size());
+        for(auto& nfc : next_free_chunks) {
+            nfc.store(0, std::memory_order_seq_cst);
+        }
 
         AtomicIdxVector pivots(n_rows);
-        for(auto& p: pivots) {
-            p.store(-1, std::memory_order_relaxed);
-        }
-        spd::debug("Pivots initialized");
 
-        int n_threads = std::min(params.n_threads, std::max(1, static_cast<int>(n_cols / params.chunk_size)));
+        tf::Executor executor(n_threads);
+
+        // move data to ar_matrix and set pivots in parallel
+        {
+            tf::Taskflow taskflow_prepare;
+            taskflow_prepare.for_each_index((size_t)0, n_cols, (size_t)1,
+                    [this, &r_v_matrix, &pivots](size_t col_idx) {
+                        pivots[col_idx].store(-1, std::memory_order_relaxed);
+                        IntSparseColumn v_column = {static_cast<Int>(col_idx)};
+                        r_v_matrix[col_idx] = new RVColumn(r_data[col_idx], v_column);
+                    });
+            executor.run(taskflow_prepare).wait();
+        }
 
         std::vector<std::thread> ts;
         std::vector<std::unique_ptr<MemoryReclaimC>> mms;
@@ -758,13 +833,7 @@ namespace oineus {
         mms.reserve(n_threads);
         stats.reserve(n_threads);
 
-        bool go_down = params.clearing_opt;
-
-        if (go_down) {
-            next_free_chunk = n_cols / params.chunk_size;
-        } else {
-            next_free_chunk = 0;
-        }
+        next_free_chunk = 0;
 
         Timer timer;
 
@@ -775,7 +844,8 @@ namespace oineus {
 
             ts.emplace_back(parallel_reduction<MatrixTraits, Int, MemoryReclaimC>,
                     std::ref(r_v_matrix), std::ref(pivots), std::ref(next_free_chunk),
-                    params, thread_idx, mms[thread_idx].get(), std::ref(stats[thread_idx]), go_down);
+                    params, thread_idx, mms[thread_idx].get(), std::ref(stats[thread_idx]),
+                    std::ref(next_free_chunks), std::ref(dim_first), std::ref(dim_last));
 
 #ifdef __linux__
             cpu_set_t cpuset;
@@ -806,13 +876,22 @@ namespace oineus {
 #ifdef OINEUS_GATHER_ADD_STATS
         write_add_stats_file(stats);
 #endif
-
-        // write reduced matrix back, collect V matrix, mark as reduced
-        for(size_t i = 0; i < n_cols; ++i) {
-            auto p = r_v_matrix[i].load(std::memory_order_relaxed);
-            r_data[i] = std::move(p->r_column);
-            v_data[i] = std::move(p->v_column);
-            delete p;
+        {
+            tf::Taskflow taskflow_finish;
+            taskflow_finish.for_each_index((size_t)0, n_cols, (size_t)1,
+                    [this, &r_v_matrix, &pivots](size_t col_idx) {
+                        auto p = r_v_matrix[col_idx].load(std::memory_order_relaxed);
+                        if (p) {
+                            r_data[col_idx] = std::move(p->r_column);
+                            v_data[col_idx] = std::move(p->v_column);
+                            delete p;
+                        } else {
+                            r_data[col_idx].clear();
+                            // TODO: Bauer's trick with filling V
+                            v_data[col_idx].clear();
+                        }
+                    });
+            executor.run(taskflow_finish).wait();
         }
 
         is_reduced = true;
@@ -837,10 +916,18 @@ namespace oineus {
         }
 
         if (include_inf_points)
-            for(size_t i = 0; i < r_data.size(); ++i)
+            for(size_t i = 0; i < r_data.size(); ++i) {
                 if (!is_zero(&r_data[i]))
                     rows_with_lowest_one.insert(low(&r_data[i]));
 
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                if (i % 100 == 0) {
+                    OINEUS_CHECK_FOR_PYTHON_INTERRUPT;
+                }
+#endif
+
+            }
+
         for(size_t col_idx = 0; col_idx < r_data.size(); ++col_idx) {
             auto col = &r_data[col_idx];
 
@@ -875,6 +962,13 @@ namespace oineus {
                     result.add_point(dim, birth, death, birth_idx, death_idx, birth_idx_us, death_idx_us);
                 }
             }
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+            if (col_idx % 100 == 0) {
+                OINEUS_CHECK_FOR_PYTHON_INTERRUPT;
+            }
+#endif
+
         }
 
         return result;
@@ -911,6 +1005,13 @@ namespace oineus {
 
                 if (!is_zero(&r_data[i]))
                     rows_with_lowest_one.insert(low(&r_data[i]));
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                if (i % 100 == 0) {
+                    OINEUS_CHECK_FOR_PYTHON_INTERRUPT;
+                }
+#endif
+
             }
 
         for(size_t col_idx = 0; col_idx < r_data.size(); ++col_idx) {
@@ -957,6 +1058,12 @@ namespace oineus {
                     result.add_point(dim, birth, death, birth_idx, death_idx, birth_idx_us, death_idx_us);
                 }
             }
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+            if (col_idx % 100 == 0) {
+                OINEUS_CHECK_FOR_PYTHON_INTERRUPT;
+            }
+#endif
         }
 
         return result;
diff --git a/include/oineus/filtration.h b/include/oineus/filtration.h
index 247ac7a..e9fdd80 100644
--- a/include/oineus/filtration.h
+++ b/include/oineus/filtration.h
@@ -9,7 +9,9 @@
 #include <algorithm>
 
 #include <taskflow/taskflow.hpp>
+#include <taskflow/core/flow_builder.hpp>
 #include <taskflow/algorithm/sort.hpp>
+#include <taskflow/algorithm/for_each.hpp>
 
 #include "timer.h"
 #include "simplex.h"
@@ -29,19 +31,17 @@ namespace oineus {
         std::vector<L> critical_value_locations;
     };
 
-
     template<class UnderCell_, class Real_>
     class Filtration {
     public:
         using Real = Real_;
         using Cell = CellWithValue<UnderCell_, Real>;
+        using UidHasher = typename UnderCell_::UidHasher;
         using Int = typename Cell::Int;
         using CellVector = std::vector<Cell>;
         using BoundaryMatrix = typename VRUDecomposition<Int>::MatrixData;
 
         using CellUid = typename Cell::Uid;
-        using UidHasher = typename Cell::UidHasher;
-
 
         Filtration() = default;
         Filtration(const Filtration&) = default;
@@ -50,26 +50,25 @@ namespace oineus {
         Filtration& operator=(Filtration&&) = default;
 
         // use move constructor to move cells
-        Filtration(CellVector&& _simplices, bool _negate, int n_threads = 1, bool _sort_only_by_dim=false, bool _set_ids=true)
+        Filtration(CellVector&& _simplices, bool _negate, int n_threads = 1, bool _sort_only_by_dim=false, bool _set_uids=true)
                 :
                 negate_(_negate),
-                cells_(_simplices)
+                cells_(std::move(_simplices)),
+                id_to_sorted_id_(cells_.size()),
+                sorted_id_to_id_(cells_.size())
         {
-            init(n_threads, _sort_only_by_dim, _set_ids);
+            CALI_CXX_MARK_FUNCTION;
+            init(n_threads, _sort_only_by_dim, _set_uids);
         }
 
-        void init(int n_threads, bool sort_only_by_dim, bool _set_ids)
+        void init(int n_threads, bool sort_only_by_dim, bool _set_uids)
         {
             CALI_CXX_MARK_FUNCTION;
 
-            if (_set_ids) {
-                set_ids();
-            }
-
             if (sort_only_by_dim) {
                 sort_dim_only();
             } else {
-                sort(n_threads);
+                sort_and_set(n_threads, _set_uids);
             }
 
             set_dim_info();
@@ -80,6 +79,7 @@ namespace oineus {
 
         void reset_ids_to_sorted_ids()
         {
+            CALI_CXX_MARK_FUNCTION;
             for(auto& sigma : cells_) {
                 sigma.set_id(sigma.get_sorted_id());
             }
@@ -93,7 +93,9 @@ namespace oineus {
         Filtration(const CellVector& cells, bool _negate, int n_threads = 1, bool _sort_only_by_dim=false, bool _set_ids=true)
                 :
                 negate_(_negate),
-                cells_(cells)
+                cells_(cells),
+                id_to_sorted_id_(cells.size()),
+                sorted_id_to_id_(cells.size())
         {
             init(n_threads, _sort_only_by_dim, _set_ids);
         }
@@ -122,53 +124,98 @@ namespace oineus {
 
         dim_type max_dim() const { return dim_last_.size() - 1; }
 
-        BoundaryMatrix boundary_matrix_full() const
+        BoundaryMatrix boundary_matrix(int n_threads=1) const
         {
             CALI_CXX_MARK_FUNCTION;
+            BoundaryMatrix result(size());
 
-            BoundaryMatrix result;
-            result.reserve(size());
-
-            for(dim_type d = 0; d <= max_dim(); ++d) {
-                auto m = boundary_matrix_in_dimension(d);
-                result.insert(result.end(), std::make_move_iterator(m.begin()), std::make_move_iterator(m.end()));
-            }
+            bool missing_ok = is_subfiltration();
 
+            // boundary of vertex is empty, need to do something in positive dimension only
+            tf::Executor executor(n_threads);
+            tf::Taskflow taskflow;
+            taskflow.for_each_index((size_t)0, size(), (size_t)1,
+                    [this, missing_ok, &result](size_t col_idx) {
+                        // skip 0-dim simplices
+                        if (col_idx <= dim_last(0))
+                            return;
+                        const auto& sigma = cells_[col_idx];
+                        auto& col = result[col_idx];
+                        col.reserve(sigma.dim() + 1);
+
+                        for(const auto& tau_vertices: sigma.boundary()) {
+                            if (missing_ok) {
+                                auto iter = uid_to_sorted_id.find(tau_vertices);
+                                if (iter != uid_to_sorted_id.end()) {
+                                    col.push_back(iter->second);
+                                }
+                            } else {
+                                col.push_back(uid_to_sorted_id.at(tau_vertices));
+                            }
+                        }
+                        std::sort(col.begin(), col.end());
+                    });
+            executor.run(taskflow).wait();
             return result;
         }
 
-        BoundaryMatrix boundary_matrix_in_dimension(dim_type d) const
+        BoundaryMatrix boundary_matrix_in_dimension(dim_type d, int n_threads) const
         {
+            CALI_CXX_MARK_FUNCTION;
             bool missing_ok = is_subfiltration();
 
             BoundaryMatrix result(size_in_dimension(d));
             // fill D with empty vectors
 
             // boundary of vertex is empty, need to do something in positive dimension only
-            if (d > 0)
-                for(size_t col_idx = 0; col_idx < size_in_dimension(d); ++col_idx) {
-                    auto& sigma = cells_[col_idx + dim_first(d)];
-                    auto& col = result[col_idx];
-                    col.reserve(d + 1);
-
-                    for(const auto& tau_vertices: sigma.boundary()) {
-                        if (missing_ok) {
-                            auto iter = uid_to_sorted_id.find(tau_vertices);
-                            if (iter != uid_to_sorted_id.end()) {
-                                col.push_back(iter->second);
+            if (d > 0) {
+                if (n_threads > 1) {
+                    tf::Executor executor(n_threads);
+                    tf::Taskflow taskflow;
+                    tf::Task fill_bdry_matrix = taskflow.for_each_index(0, size_in_dimension(d), (size_t)1,
+                            [this, d, missing_ok, &result](size_t col_idx) {
+                                auto& sigma = cells_[col_idx + dim_first(d)];
+                                auto& col = result[col_idx];
+                                col.reserve(d + 1);
+
+                                for(const auto& tau_vertices: sigma.boundary()) {
+                                    if (missing_ok) {
+                                        auto iter = uid_to_sorted_id.find(tau_vertices);
+                                        if (iter != uid_to_sorted_id.end()) {
+                                            col.push_back(iter->second);
+                                        }
+                                    } else {
+                                        col.push_back(uid_to_sorted_id.at(tau_vertices));
+                                    }
+                                }
+                                std::sort(col.begin(), col.end());
+                            });
+                    executor.run(taskflow).wait();
+                } else {
+                    for(size_t col_idx = 0; col_idx < size_in_dimension(d); ++col_idx) {
+                        auto& sigma = cells_[col_idx + dim_first(d)];
+                        auto& col = result[col_idx];
+                        col.reserve(d + 1);
+
+                        for(const auto& tau_vertices: sigma.boundary()) {
+                            if (missing_ok) {
+                                auto iter = uid_to_sorted_id.find(tau_vertices);
+                                if (iter != uid_to_sorted_id.end()) {
+                                    col.push_back(iter->second);
+                                }
+                            } else {
+                                col.push_back(uid_to_sorted_id.at(tau_vertices));
                             }
-                        } else {
-                            col.push_back(uid_to_sorted_id.at(tau_vertices));
                         }
-                    }
 
-                    std::sort(col.begin(), col.end());
+                        std::sort(col.begin(), col.end());
+                    }
                 }
-
+            }
             return result;
         }
 
-        BoundaryMatrix boundary_matrix_full_rel(const std::unordered_set<typename Cell::Uid, typename Cell::UidHasher>& relative) const
+        BoundaryMatrix boundary_matrix_rel(const typename Cell::UidSet& relative) const
         {
             CALI_CXX_MARK_FUNCTION;
 
@@ -176,15 +223,16 @@ namespace oineus {
             result.reserve(size());
 
             for(dim_type d = 0; d <= max_dim(); ++d) {
-                auto m = boundary_matrix_in_dimension(d, relative);
+                auto m = boundary_matrix_in_dimension_rel(d, relative);
                 result.insert(result.end(), std::make_move_iterator(m.begin()), std::make_move_iterator(m.end()));
             }
 
             return result;
         }
 
-        BoundaryMatrix boundary_matrix_in_dimension(dim_type d, const std::unordered_set<typename Cell::Uid, typename Cell::UidHasher>& relative) const
+        BoundaryMatrix boundary_matrix_in_dimension_rel(dim_type d, const typename Cell::UidSet& relative) const
         {
+            CALI_CXX_MARK_FUNCTION;
             BoundaryMatrix result(size_in_dimension(d));
             // fill D with empty vectors
 
@@ -205,15 +253,30 @@ namespace oineus {
                             col.push_back(uid_to_sorted_id.at(tau_vertices));
                     }
 
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                    if (col_idx % 100 == 0) {
+                      OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                    }
+#endif
+
                     std::sort(col.begin(), col.end());
+
+#ifndef NDEBUG
+                    // verify that there are no duplicates
+                    std::set<Int> col_check {col.begin(), col.end()};
+                    if (col_check.size() != col.size())
+                        throw std::runtime_error("Bad uid computation");
+#endif
+
                 }
 
             return result;
         }
 
-        BoundaryMatrix coboundary_matrix() const
+        BoundaryMatrix coboundary_matrix(int n_threads=1) const
         {
-            return antitranspose(boundary_matrix_full(), size());
+            CALI_CXX_MARK_FUNCTION;
+            return antitranspose(boundary_matrix(n_threads), size());
         }
 
         template<typename I, typename R, size_t D>
@@ -233,17 +296,12 @@ namespace oineus {
 
         Real value_by_sorted_id(Int sorted_id) const
         {
-            return sorted_id_to_value_[sorted_id];
-        }
-
-        Real value_by_vertices(const CellUid& vs) const
-        {
-            return sorted_id_to_value_.at(uid_to_sorted_id.at(vs));
+            return cells_[sorted_id].get_value();
         }
 
-        auto get_sorted_id_by_vertices(const CellUid& vs) const
+        Real value_by_uid(const CellUid& vs) const
         {
-            return uid_to_sorted_id.at(vs);
+            return cells_[uid_to_sorted_id.at(vs)].get_value();
         }
 
         auto get_sorted_id_by_uid(const CellUid& uid) const
@@ -316,7 +374,7 @@ namespace oineus {
         [[nodiscard]] size_t index_in_matrix(size_t cell_idx, bool dualize) const { return dualize ? size() - cell_idx - 1 : cell_idx; }
         [[nodiscard]] size_t index_in_filtration(size_t matrix_idx, bool dualize) const { return dualize ? size() - matrix_idx - 1 : matrix_idx; }
 
-        void set_values(const std::vector<Real>& new_values)
+        void set_values(const std::vector<Real>& new_values, int n_threads=1)
         {
             if (new_values.size() != cells_.size())
                 throw std::runtime_error("new_values.size() != cells_.size()");
@@ -324,15 +382,9 @@ namespace oineus {
             for(size_t i = 0; i < new_values.size(); ++i)
                 cells_[i].value_ = new_values[i];
 
-            sort(1);
-        }
-
-        std::vector<Real> all_values() const
-        {
-            return sorted_id_to_value_;
+            sort_and_set(n_threads, false);
         }
 
-
         // return true, if it's a subfiltration (subset) of a true filtration
         // if so, the
         bool is_subfiltration() const { return is_subfiltration_; }
@@ -354,7 +406,6 @@ namespace oineus {
                     result.uid_to_sorted_id[cell.get_uid()] = sorted_id;
                     result.id_to_sorted_id_[cell.get_id()] = sorted_id;
                     result.sorted_id_to_id_.push_back(cell.get_id());
-                    result.sorted_id_to_value_.push_back(cell.get_value());
                     result.cells_.back().sorted_id_ = sorted_id;
                     sorted_id++;
                 }
@@ -363,7 +414,7 @@ namespace oineus {
             if (result.size() == 0)
                 return Filtration();
 
-            dim_type max_dim = cells_.back().dim();
+            dim_type max_dim = result.cells_.back().dim();
 
             result.set_dim_info(max_dim, dims);
 
@@ -380,22 +431,12 @@ namespace oineus {
         bool is_subfiltration_ {false};
 
         std::unordered_map<CellUid, Int, UidHasher> uid_to_sorted_id;
-        std::unordered_map<Int, Int> id_to_sorted_id_;
+        std::vector<Int> id_to_sorted_id_;
         std::vector<Int> sorted_id_to_id_;
 
-        std::vector<Real> sorted_id_to_value_;
-
         std::vector<Int> dim_first_;
         std::vector<Int> dim_last_;
 
-        // private methods
-        void set_ids()
-        {
-            for(size_t id = 0; id < cells_.size(); ++id) {
-                cells_[id].set_id(static_cast<Int>(id));
-            }
-        }
-
         void set_dim_info()
         {
             Int curr_dim = 0;
@@ -458,9 +499,8 @@ namespace oineus {
                 dim_to_cells[cell.dim()].push_back(cell);
             }
 
-            sorted_id_to_id_ = std::vector<Int>(size(), Int(-1));
             uid_to_sorted_id.clear();
-            sorted_id_to_value_ = std::vector<Real>(size(), std::numeric_limits<Real>::max());
+            uid_to_sorted_id.reserve(cells_.size());
 
             size_t sorted_id = 0;
 
@@ -472,7 +512,6 @@ namespace oineus {
                     sorted_id_to_id_.at(sorted_id) = cell.get_id();
                     cell.sorted_id_ = sorted_id;
                     uid_to_sorted_id[cell.get_uid()] = sorted_id;
-                    sorted_id_to_value_.at(sorted_id) = cell.get_value();
 
                     cells_.push_back(cell);
 
@@ -482,41 +521,51 @@ namespace oineus {
         }
 
         // sort cells and assign sorted_ids
-        void sort([[maybe_unused]] int n_threads = 1)
+        void sort_and_set(int n_threads, bool set_uid)
         {
             CALI_CXX_MARK_FUNCTION;
 
-            sorted_id_to_id_ = std::vector<Int>(size(), Int(-1));
             uid_to_sorted_id.clear();
-            sorted_id_to_value_ = std::vector<Real>(size(), std::numeric_limits<Real>::max());
-
-            // sort by dimension first, then by value, then by id
-            auto cmp = [this](const Cell& sigma, const Cell& tau) {
-              auto v_sigma = this->negate_ ? -sigma.get_value() : sigma.get_value();
-              auto v_tau = this->negate_ ? -tau.get_value() : tau.get_value();
-              auto d_sigma = sigma.dim(), d_tau = tau.dim();
-              auto sigma_id = sigma.get_id(), tau_id = tau.get_id();
-              return std::tie(d_sigma, v_sigma, sigma_id) < std::tie(d_tau, v_tau, tau_id);
-            };
-
-            if (n_threads > 1) {
-                tf::Executor executor(n_threads);
-                tf::Taskflow taskflow;
-                tf::Task sort = taskflow.sort(cells_.begin(), cells_.end(), cmp);
-                executor.run(taskflow).wait();
-            } else {
-                std::sort(cells_.begin(), cells_.end(), cmp);
-            }
-
+            uid_to_sorted_id.reserve(cells_.size());
+
+            auto cmp = [this](const Cell& sigma, const Cell& tau)
+                {
+                      Real v_sigma = negate() ? -sigma.get_value() : sigma.get_value();
+                      Real v_tau = negate() ? -tau.get_value() : tau.get_value();
+                      dim_type d_sigma = sigma.dim(), d_tau = tau.dim();
+                      auto sigma_id = sigma.get_id(), tau_id = tau.get_id();
+                      return std::tie(d_sigma, v_sigma, sigma_id) < std::tie(d_tau, v_tau, tau_id);
+                };
+
+            CALI_MARK_BEGIN("TF_SORT_AND_SET");
+            tf::Executor executor(n_threads);
+            tf::Taskflow taskflow;
+            tf::Task set_id_and_uid = taskflow.for_each_index(static_cast<Int>(0), static_cast<Int>(size()), static_cast<Int>(1),
+                    [this, set_uid](Int sorted_id){
+                    auto& sigma = this->cells_[sorted_id];
+                    if (set_uid) sigma.set_uid();
+                    sigma.set_id(sorted_id);
+                    });
+            tf::Task sort_task = taskflow.sort(cells_.begin(), cells_.end(), cmp);
+            tf::Task post_sort = taskflow.for_each_index(static_cast<Int>(0), static_cast<Int>(size()), static_cast<Int>(1),
+                    [this](Int sorted_id){
+                    auto& sigma = this->cells_[sorted_id];
+                    this->id_to_sorted_id_[sigma.get_id()] = sorted_id;
+                    this->sorted_id_to_id_[sorted_id] = sigma.get_id();
+                    sigma.sorted_id_ = sorted_id;
+                    });
+            sort_task.succeed(set_id_and_uid);
+            post_sort.succeed(sort_task);
+            executor.run(taskflow).wait();
+            CALI_MARK_END("TF_SORT_AND_SET");
+
+            // unordered_set is not thread-safe, cannot do in parallel
+            CALI_MARK_BEGIN("filtration_init_housekeeping-uid");
             for(size_t sorted_id = 0; sorted_id < size(); ++sorted_id) {
                 auto& sigma = cells_[sorted_id];
-
-                id_to_sorted_id_[sigma.get_id()] = sorted_id;
-                sorted_id_to_id_[sorted_id] = sigma.get_id();
-                sigma.sorted_id_ = sorted_id;
                 uid_to_sorted_id[sigma.get_uid()] = sorted_id;
-                sorted_id_to_value_[sorted_id] = sigma.get_value();
             }
+            CALI_MARK_END("filtration_init_housekeeping-uid");
         }
 
         template<typename C, typename R>
@@ -584,9 +633,6 @@ namespace oineus {
 
         bool negate = fil_1.negate();
 
-        std::vector<Real> values_1 = fil_1.all_values();
-        std::vector<Real> values_2 = fil_2.all_values();
-
         const std::vector<Cell>& cells = fil_1.cells();
 
         std::vector<std::tuple<Real, dim_type, size_t, size_t>> to_sort;
@@ -596,8 +642,8 @@ namespace oineus {
         for(size_t idx_1 = 0; idx_1 < fil_1.size(); ++idx_1) {
             size_t idx_2 = fil_2.get_sorted_id_by_uid(cells[idx_1].get_uid());
 
-            Real value_1 = values_1[idx_1];
-            Real value_2 = values_2[idx_2];
+            Real value_1 = fil_1.get_cell_value(idx_1);
+            Real value_2 = fil_2.get_cell_value(idx_2);
 
             Real min_value = negate ? std::min(-value_1, -value_2) : std::min(value_1, value_2);
             to_sort.emplace_back(min_value, cells[idx_1].dim(), idx_1, idx_2);
diff --git a/include/oineus/inclusion_filtration.h b/include/oineus/inclusion_filtration.h
index b857998..8e97660 100644
--- a/include/oineus/inclusion_filtration.h
+++ b/include/oineus/inclusion_filtration.h
@@ -101,7 +101,7 @@ class InclusionFiltration {
                 col.reserve(d + 1);
 
                 for(const auto& tau_vertices: sigma.boundary()) {
-                    col.push_back(fil_domain_.get_sorted_id_by_vertices(tau_vertices));
+                    col.push_back(fil_domain_.get_sorted_id_by_uid(tau_vertices));
                 }
 
                 std::sort(col.begin(), col.end());
diff --git a/include/oineus/kernel.h b/include/oineus/kernel.h
index 7386ee0..81ebba1 100644
--- a/include/oineus/kernel.h
+++ b/include/oineus/kernel.h
@@ -4,18 +4,49 @@
 #include <unordered_map>
 #include <cmath>
 #include <numeric>
+#include <ostream>
 
 #include "simplex.h"
 #include "sparse_matrix.h"
 #include "decomposition.h"
 #include "filtration.h"
 #include "diagram.h"
+#include "profile.h"
 
 // suppress pragma message from boost
 #define BOOST_BIND_GLOBAL_PLACEHOLDERS
 
 namespace oineus {
 
+struct KICRParams {
+    bool kernel {true};
+    bool image  {true};
+    bool cokernel {true};
+    bool include_zero_persistence {false};
+    bool verbose {false};
+    Params params_f;
+    Params params_g;
+    Params params_ker;
+    Params params_im;
+    Params params_cok;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const KICRParams& p)
+{
+    out << "KICRParams(compute_kernel = " << p.kernel;
+    out << ", compute_image = " << p.image;
+    out << ", compute_cokernel = " << p.cokernel;
+    out << ", include_zero_persistence = " << p.include_zero_persistence;
+    out << ", verbose = " << p.verbose;
+    out << ", params_f = " << p.params_f;
+    out << ", params_g = " << p.params_g;
+    if (p.kernel) out << ", params_ker = " << p.params_ker;
+    if (p.image)  out << ", params_im = " << p.params_im;
+    if (p.cokernel)  out << ", params_cok = " << p.params_cok;
+    out << ")";
+    return out;
+}
+
 template<typename Cell, typename Real_, int P = 2>
 struct KerImCokReduced {
     using Int = typename Cell::Int;
@@ -49,8 +80,7 @@ struct KerImCokReduced {
     std::vector<size_t> new_order_to_old_;     //given sorted_id i of cell in K, new_order_to_old[i] is its index in the ordering 'first L, then K-L', used in D_im
     std::vector<size_t> old_order_to_new_;     // the inverse of the above
     std::vector<size_t> K_to_ker_column_index_;
-    Params params_;
-    bool include_zero_persistence_ {false};   // whether we want to have points on the diagonal in the diagram
+    KICRParams params_;
 
     // entries (rows) in col are indexed w.r.t. K filtration order
     // return col reindexed by the new order: L first, K-L second
@@ -69,9 +99,6 @@ struct KerImCokReduced {
 
     Matrix compute_d_im() const
     {
-        if (not dcmp_F_.is_reduced)
-            throw std::runtime_error("reduce D_f first");
-
         const Matrix& m = dcmp_F_.get_D();
 
         Matrix result;
@@ -87,7 +114,7 @@ struct KerImCokReduced {
 
     // take columns from v that correspond to cycles (i.e., the corresponding column in r is zero)
     // apply old_to_new_order to them (reorder rows)
-    // return the matrix comprised from the resulting columns
+    // return the matrix comprised of the resulting columns
     Matrix compute_d_ker()
     {
         const Matrix& v = dcmp_im_.get_V();
@@ -141,10 +168,10 @@ struct KerImCokReduced {
             for(auto& x: new_col)
                 x = sorted_L_to_sorted_K_[x];
 
-            // NB: sorting not needed: new_col was sorted before
-
+            // sorting not needed: new_col was sorted before
             d_cok[i] = std::move(new_col);
         }
+
         return d_cok;
     }
 
@@ -161,7 +188,8 @@ struct KerImCokReduced {
     }
 
     // parameters: complex K, a subcomplex L, reduction params
-    KerImCokReduced(const Fil& K, const Fil& L, Params& params, bool include_zero_persistence=false)
+    KerImCokReduced(const Fil& K, const Fil& L,
+                    KICRParams& params)
             :
             fil_K_(K),
             fil_L_(L),
@@ -174,15 +202,8 @@ struct KerImCokReduced {
             ker_diagrams_(K.max_dim() + 1),
             im_diagrams_(K.max_dim() + 1),
             cok_diagrams_(K.max_dim() + 1),
-            params_(params),
-            include_zero_persistence_(include_zero_persistence)
+            params_(params)
     {
-        if (params.compute_u) { std::cerr << "WARNING: compute_u will be ignored, do not need it for Ker/Im/Cok algorithm" << std::endl; }
-        if (!params.compute_v) { std::cerr << "WARNING: compute_v is false, but V will be computed, we need it for Ker/Im/Cok algorithm" << std::endl; }
-
-        params.compute_v = true;
-        params.compute_u = false;
-
         if (params_.verbose) { std::cerr << "Performing kernel, image, cokernel reduction, reduction parameters: " << params << std::endl; }
 
         // all cells in L must be present in K
@@ -192,30 +213,47 @@ struct KerImCokReduced {
         if (fil_K_.size() < fil_L_.size())
             throw std::runtime_error("second argument L must be a subcomplex of the first argument K");
 
+        CALI_MARK_BEGIN("sorted_L_to_sorted_K");
         for(size_t fil_L_idx = 0 ; fil_L_idx < fil_L_.size() ; fil_L_idx++) {//getting sorted L to sorted K is relatively easy
             sorted_L_to_sorted_K_[fil_L_idx] = fil_K_.get_sorted_id_by_uid(fil_L_.get_cell(fil_L_idx).get_uid());
         }
+        CALI_MARK_END("sorted_L_to_sorted_K");
 
+        CALI_MARK_BEGIN("sorted_K_to_sorted_L");
         for(size_t i = 0 ; i < fil_L_.size() ; i++) {
             //for cells in K which are also in L, set the sorted id, which we can get from sorted L to sorted K
             sorted_K_to_sorted_L_[sorted_L_to_sorted_K_[i]] = i;
         }
+        CALI_MARK_END("sorted_K_to_sorted_L");
 
         if (params_.verbose) { std::cerr << "K_to_L and L_to_K computed" << std::endl; }
 
         //set up the reduction for F  on K
-        dcmp_F_ = VRUDecomp(fil_K_.boundary_matrix_full());
-        dcmp_F_.reduce(params);
-        if (params_.verbose) { std::cerr << "dcmp_F_ reduced" << std::endl; }
+        CALI_MARK_BEGIN("dcmp_F.reduce");
+        dcmp_F_ = VRUDecomp(fil_K_.boundary_matrix());
+        dcmp_F_.reduce(params_.params_f);
+        if (params_.verbose) { std::cerr << "dcmp_F_ reduced with params = " << params_.params_f << std::endl; }
+        CALI_MARK_END("dcmp_F.reduce");
 
         //set up reduction for G on L
-        dcmp_G_ = VRUDecomp(fil_L_.boundary_matrix_full());
-        dcmp_G_.reduce(params);
-        if (params_.verbose) { std::cerr << "dcmp_G_ reduced" << std::endl; }
-
+        CALI_MARK_BEGIN("dcmp_G.reduce");
+        params_.params_g.compute_v = params_.params_g.compute_v or params_.cokernel;
+        dcmp_G_ = VRUDecomp(fil_L_.boundary_matrix());
+        dcmp_G_.reduce(params_.params_g);
+        if (params_.verbose) { std::cerr << "dcmp_G_ reduced with params = " << params_.params_g << std::endl; }
+        CALI_MARK_END("dcmp_G.reduce");
+
+        CALI_MARK_BEGIN("dcmp_F.diagram");
         cod_diagrams_ = dcmp_F_.diagram(fil_K_, true);
+        if (params_.verbose) { std::cerr << "cod_diagrams computed" << std::endl; }
+        CALI_MARK_END("dcmp_F.diagram");
+
+        CALI_MARK_BEGIN("dcmp_G.diagram");
         dom_diagrams_ = dcmp_G_.diagram(fil_L_, true);
+        if (params_.verbose) { std::cerr << "dom_diagrams computed" << std::endl; }
+        CALI_MARK_END("dcmp_G.diagram");
 
+        CALI_MARK_BEGIN("new_order_to_old");
         std::iota(new_order_to_old_.begin(), new_order_to_old_.end(), 0);
 
         if (params_.verbose) std::cerr << "Sorting so that cells in L come before cells in K." << std::endl;
@@ -229,6 +267,7 @@ struct KerImCokReduced {
                   // if both i and j are in L or both are not, use the existing order
                   return i < j;
                 });
+        CALI_MARK_END("new_order_to_old");
 
         // map from old order to new order so that we know which cells correspond to which rows.
         // This could be done by just shuffling the row indices, but as we create a new reduction isntance, we need to create a new matrix anyway.
@@ -237,38 +276,50 @@ struct KerImCokReduced {
             old_order_to_new_[new_order_to_old_[i]] = i;
         }
 
-        params.clearing_opt = false;
-
         // step 2 of the algorithm
+        CALI_MARK_BEGIN("dcmp_im");
+        params_.params_im.clearing_opt = false;
+        // if user wants to compute v, keep it, but if we need ker, we must compute it anyway
+        params_.params_im.compute_v = params_.params_im.compute_v or params_.kernel;
         auto d_im = compute_d_im();
         dcmp_im_ = VRUDecomp(d_im);
-        dcmp_im_.reduce(params);
-        if (params_.verbose) { std::cerr << "dcmp_im_ reduced" << std::endl; }
+        dcmp_im_.reduce(params_.params_im);
+        if (params_.verbose) { std::cerr << "dcmp_im_ reduced, size=" << dcmp_im_.size() << ", params = " << params_.params_im << std::endl; }
+        CALI_MARK_END("dcmp_im");
 
         // step 3 of the algorithm
 
-        //params.compute_v = false;
-
-        Matrix d_ker = compute_d_ker();
-        // NB: d_ker is not a square matrix, has fewer columns that rows. We must give the number of rows (#cells in K) to VRUDecomp ctor.
-        dcmp_ker_ = VRUDecomp(d_ker, fil_K_.size());
-        dcmp_ker_.reduce(params);
-        if (params_.verbose) { std::cerr << "dcmp_ker reduced" << std::endl; }
+        if (params_.kernel) {
+            CALI_MARK_BEGIN("dcmp_ker");
+            params_.params_ker.clearing_opt = false;
+            Matrix d_ker = compute_d_ker();
+            // NB: d_ker is not a square matrix, has fewer columns that rows. We must give the number of rows (#cells in K) to VRUDecomp ctor.
+            dcmp_ker_ = VRUDecomp(d_ker, fil_K_.size());
+            dcmp_ker_.reduce(params_.params_ker);
+            if (params_.verbose) { std::cerr << "dcmp_ker reduced, size=" << dcmp_ker_.size() << ", params = " << params_.params_ker << std::endl; }
+            CALI_MARK_END("dcmp_ker");
+        }
 
-        // step 4 of the algorithm
-        Matrix d_cok = compute_d_cok();
-        // NB: d_cok is not a square matrix, has fewer columns that rows. We must give the number of rows to VRUDecomp ctor.
-        dcmp_cok_ = VRUDecomp(d_cok, fil_K_.size());
-        dcmp_cok_.reduce_parallel_rv(params);
-        if (params_.verbose) { std::cerr << "dcmp_cok reduced" << std::endl; }
+        if (params_.cokernel) {
+            // step 4 of the algorithm
+            CALI_MARK_BEGIN("dcmp_cok");
+            params_.params_cok.clearing_opt = false;
+            Matrix d_cok = compute_d_cok();
+            // NB: d_cok is not a square matrix, has fewer columns that rows. We must give the number of rows to VRUDecomp ctor.
+            dcmp_cok_ = VRUDecomp(d_cok, fil_K_.size());
+            dcmp_cok_.reduce(params_.params_cok);
+            if (params_.verbose) { std::cerr << "dcmp_cok reduced, size=" << dcmp_cok_.size() << std::endl; }
+            CALI_MARK_END("dcmp_cok");
+        }
 
-        if (params.kernel) generate_ker_diagrams();
-        if (params.cokernel) generate_cok_diagrams();
-        if (params.image) generate_im_diagrams();
+        if (params_.kernel) generate_ker_diagrams();
+        if (params_.cokernel) generate_cok_diagrams();
+        if (params_.image) generate_im_diagrams();
     }
 
     void generate_ker_diagrams(bool inf_points = true)
     {
+        CALI_CXX_MARK_FUNCTION;
         if (params_.verbose) std::cerr << "generating kernel diagrams" << std::endl;
         // if we need points at infinity,
         // we have to keep track of the matched positive cells;
@@ -308,7 +359,7 @@ struct KerImCokReduced {
 
             dim_type dim = fil_K_.get_cell(death_idx).dim() - 1;
 
-            if (birth != death or include_zero_persistence_)
+            if (birth != death or params_.include_zero_persistence)
                 ker_diagrams_.add_point(dim, birth, death, birth_idx, death_idx, fil_K_.get_id_by_sorted_id(birth_idx), fil_K_.get_id_by_sorted_id(death_idx));
 
             if (inf_points) {
@@ -365,6 +416,7 @@ struct KerImCokReduced {
 
     void generate_cok_diagrams(bool inf_points = true)
     {
+        CALI_CXX_MARK_FUNCTION;
         if (params_.verbose) std::cerr << "generating cokernel diagrams" << std::endl;
         // if we need points at infinity,
         // we have to keep track of the matched positive cells;
@@ -374,9 +426,9 @@ struct KerImCokReduced {
         // simplex τ gives death in Cok(g -> f) iff τ is
         // negative in R_f and the lowest one in its column in R_im
         // corresponds to a simplex in K − L
-        for(size_t death_idx = 0 ; death_idx < dcmp_F_.get_R().size() ; ++death_idx) {
+        for(size_t death_idx = 0 ; death_idx < dcmp_im_.get_R().size() ; ++death_idx) {
             // tau is positive -> skip it
-            if (dcmp_F_.is_positive(death_idx)) {
+            if (dcmp_im_.is_positive(death_idx)) {
                 continue;
             }
 
@@ -408,7 +460,7 @@ struct KerImCokReduced {
 
             dim_type dim = fil_K_.get_cell(birth_idx).dim();
 
-            if (birth != death or include_zero_persistence_)
+            if (birth != death or params_.include_zero_persistence)
                 cok_diagrams_.add_point(dim, birth, death, birth_idx, death_idx, fil_K_.get_id_by_sorted_id(birth_idx), fil_K_.get_id_by_sorted_id(death_idx));
 
             if (inf_points) {
@@ -425,14 +477,14 @@ struct KerImCokReduced {
         if (inf_points) {
             // A simplex σ gives birth in Cok(g -> f) iff σ is positive
             // in R_f and it is either in K − L or negative in R_g .
-            for(size_t birth_idx = 0 ; birth_idx < dcmp_F_.get_R().size() ; ++birth_idx) {
+            for(size_t birth_idx = 0 ; birth_idx < dcmp_im_.get_R().size() ; ++birth_idx) {
                 // sigma is paired, skip it
                 if (matched_positive_cells.count(birth_idx)) {
                     continue;
                 }
 
                 // sigma is negative in R_f, skip it
-                if (dcmp_F_.is_negative(birth_idx)) {
+                if (dcmp_im_.is_negative(birth_idx)) {
                     continue;
                 }
 
@@ -454,6 +506,7 @@ struct KerImCokReduced {
     // TODO: merge with cokernel diagrams? the logic is close, instead of continue we can just have an if-statement
     void generate_im_diagrams(bool inf_points = true)
     {
+        CALI_CXX_MARK_FUNCTION;
         std::unordered_set<size_t> matched_positive_cells;
 
         // Death. A simplex τ gives death in Im(g →f ) iff τ is negative in Rf
@@ -484,7 +537,7 @@ struct KerImCokReduced {
             Real death = fil_K_.value_by_sorted_id(death_idx);
             dim_type dim = fil_K_.get_cell(birth_idx).dim();
 
-            if (birth != death or include_zero_persistence_)
+            if (birth != death or params_.include_zero_persistence)
                 im_diagrams_.add_point(dim, birth, death, birth_idx, death_idx, fil_K_.get_id_by_sorted_id(birth_idx), fil_K_.get_id_by_sorted_id(death_idx));
 
             if (inf_points) {
diff --git a/include/oineus/lower_star.h b/include/oineus/lower_star.h
index 43bb439..9334123 100644
--- a/include/oineus/lower_star.h
+++ b/include/oineus/lower_star.h
@@ -265,7 +265,7 @@ class Grid {
         vertices.reserve(total_size);
 
         for(dim_type d = 0 ; d <= top_d ; ++d) {
-            add_freudenthal_simplices(d, negate, simplices, vertices);
+            add_freudenthal_simplices(d, negate, simplices, vertices, true);
         }
 
         auto fil = GridFiltration(simplices, negate, n_threads);
@@ -281,7 +281,26 @@ class Grid {
 
     GridFiltration freudenthal_filtration(size_t top_d, bool negate, int n_threads = 1) const
     {
-        return freudenthal_filtration_and_critical_vertices(top_d, negate, n_threads).first;
+        if (top_d > dim)
+            throw std::runtime_error("bad dimension, top_d = " + std::to_string(top_d) + ", dim = " + std::to_string(dim));
+
+        SimplexVec simplices;
+        CriticalVertices dummy_vertices;
+
+        // calculate total number of cells to allocate memory once
+        size_t total_size = 0;
+        for(dim_type d = 0 ; d <= top_d ; ++d) {
+            total_size += get_fr_displacements(d).size() * size();
+        }
+
+        simplices.reserve(total_size);
+
+        for(dim_type d = 0 ; d <= top_d ; ++d) {
+            add_freudenthal_simplices(d, negate, simplices, dummy_vertices, false);
+        }
+
+        auto fil = GridFiltration(simplices, negate, n_threads);
+        return fil;
     }
 
     Real value_at_vertex(Int vertex) const
@@ -317,7 +336,13 @@ class Grid {
     bool wrap_ {false};
     Real* data_ {nullptr};
 
-    void add_freudenthal_simplices_from_vertex(const GridPoint& v, size_t d, bool negate, const GridPointVecVec& disps, SimplexVec& simplices, CriticalVertices& critical_vertices) const
+    void add_freudenthal_simplices_from_vertex(const GridPoint& v,
+            size_t d,
+            bool negate,
+            const GridPointVecVec& disps,
+            SimplexVec& simplices,
+            CriticalVertices& critical_vertices,
+            bool return_critical_vertices) const
     {
         IdxVector v_ids(d + 1, 0);
 
@@ -340,21 +365,33 @@ class Grid {
             if (is_valid_simplex) {
                 ValueVertex vv = simplex_value_and_vertex(v_ids, negate);
                 simplices.emplace_back(v_ids, vv.value);
-                critical_vertices.emplace_back(vv.vertex);
+                if (return_critical_vertices) {
+                    critical_vertices.emplace_back(vv.vertex);
+                }
             }
         }
     }
 
-    void add_freudenthal_simplices(dim_type d, bool negate, SimplexVec& simplices, CriticalVertices& critical_vertices) const
+    void add_freudenthal_simplices(dim_type d,
+                                   bool negate,
+                                   SimplexVec& simplices,
+                                   CriticalVertices& critical_vertices,
+                                   bool return_critical_vertices) const
     {
         auto disps = get_fr_displacements(d);
 
         for(Int i = 0 ; i < size() ; ++i) {
             GridPoint v = id_to_point(i);
-            add_freudenthal_simplices_from_vertex(v, d, negate, disps, simplices, critical_vertices);
+            add_freudenthal_simplices_from_vertex(v, d, negate, disps, simplices, critical_vertices, return_critical_vertices);
+
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+            if (i % 100 == 0) {
+                OINEUS_CHECK_FOR_PYTHON_INTERRUPT;
+            }
+#endif
+
         }
     }
-
 }; // class Grid
 
 template<typename Int, typename Real, size_t D>
diff --git a/include/oineus/mem_reclamation.h b/include/oineus/mem_reclamation.h
index cf7fe4f..393e335 100644
--- a/include/oineus/mem_reclamation.h
+++ b/include/oineus/mem_reclamation.h
@@ -55,7 +55,7 @@ namespace oineus {
 
         void retire(T*& ptr)
         {
-            to_retire_.push_back(ptr);
+            if (ptr) to_retire_.push_back(ptr);
             ptr = nullptr;
         }
 
diff --git a/include/oineus/params.h b/include/oineus/params.h
index 0837a5e..2ca9559 100644
--- a/include/oineus/params.h
+++ b/include/oineus/params.h
@@ -11,17 +11,14 @@ namespace oineus {
         int n_threads{1};
         int chunk_size{128};
         bool write_dgms{false};
-        bool sort_dgms{true};
+        bool sort_dgms{false};
         bool clearing_opt{true};
         bool acq_rel{false};
         bool print_time{false};
-        bool compute_v{true};
+        bool compute_v{false};
         bool compute_u{false};
         bool do_sanity_check{false};
         double elapsed{0.0};
-        bool kernel{false};
-        bool image{false};
-        bool cokernel{false};
         bool verbose{false};
     };
 
@@ -38,9 +35,6 @@ namespace oineus {
         out << ", compute_u = " << p.compute_u;
         out << ", do_sanity_check = " << p.do_sanity_check;
         out << ", elapsed = " << p.elapsed;
-        out << ", kernel = " << p.kernel;
-        out << ", image = " << p.image;
-        out << ", cokernel = " << p.cokernel;
         out << ", verbose = " << p.verbose;
         out << ")";
         return out;
diff --git a/include/oineus/product_cell.h b/include/oineus/product_cell.h
index 3059add..08c11df 100644
--- a/include/oineus/product_cell.h
+++ b/include/oineus/product_cell.h
@@ -41,6 +41,7 @@ struct ProductCell {
 
     Int get_id() const { return id_; }
     void set_id(Int new_id) { id_ = new_id; }
+    void set_uid() { cell_1_.set_uid(); cell_2_.set_uid(); }
 
     dim_type dim() const { return cell_1_.dim() + cell_2_.dim(); }
 
diff --git a/include/oineus/simpl_map_filtration.h b/include/oineus/simpl_map_filtration.h
index 6a38e59..3826711 100644
--- a/include/oineus/simpl_map_filtration.h
+++ b/include/oineus/simpl_map_filtration.h
@@ -8,8 +8,6 @@
 
 #include <algorithm>
 
-#include <icecream/icecream.hpp>
-
 #include "timer.h"
 #include "simplex.h"
 #include "filtration.h"
diff --git a/include/oineus/simplex.h b/include/oineus/simplex.h
index 45d3c3c..437dd95 100644
--- a/include/oineus/simplex.h
+++ b/include/oineus/simplex.h
@@ -13,9 +13,43 @@
 
 namespace oineus {
 
+template<typename IntIn, typename IntOut>
+IntOut comb(IntIn n, IntIn k)
+{
+    if (n < k || n == 0) {
+        return static_cast<IntOut>(0);
+    }
+
+    if (k == 0 || k == n)
+        return static_cast<IntOut>(1);
+
+    IntOut result = 1;
+    for(IntOut i = 1; i <= k; i++) {
+        result *= (n - i + 1);
+        assert(result % i == 0);
+        result /= i;
+    }
+    return result;
+}
+
+// combinatorial simplex numbering, as in Ripser (see Bauer's paper)
+// encode dimension info in the 4 most significant bits of result
+template<typename IntIn, typename IntOut>
+IntOut simplex_uid(const std::vector<IntIn>& vertices)
+{
+    IntOut dim_info = (vertices.size() + 1) << (8 * sizeof(IntOut) - 4);
+    IntOut uid = 0;
+    for(IntIn i = 0; i < vertices.size(); i++) {
+        uid += comb<IntIn, IntOut>(vertices[i], i + 1);
+    }
+    return uid | dim_info;
+}
+
+template<typename Int_>
 struct VREdge {
-    size_t x;
-    size_t y;
+    using Int = Int_;
+    Int x;
+    Int y;
     bool operator==(const VREdge& other) const { return x == other.x and y == other.y; }
     bool operator!=(const VREdge& other) const { return !(*this == other); }
     bool operator<(const VREdge& other) const { return x < other.x or (x == other.x and y < other.y); };
@@ -24,7 +58,8 @@ struct VREdge {
     bool operator>=(const VREdge& other) const { return *this > other or *this == other; }
 };
 
-inline std::ostream& operator<<(std::ostream& out, const VREdge& e)
+template<class Int>
+inline std::ostream& operator<<(std::ostream& out, const VREdge<Int>& e)
 {
     out << "edge(x=" << e.x << ", y=" << e.y << ")";
     return out;
@@ -35,13 +70,14 @@ struct Simplex {
     using Int = Int_;
     using IdxVector = std::vector<Int>;
 
-    using Uid = IdxVector;
+    using Uid = long long;
     // for Z2 only for now
     using Boundary = std::vector<Uid>;
 
     static constexpr Int k_invalid_id = Int(-1);
 
     Int id_ {k_invalid_id};
+    Uid uid_ {k_invalid_id};
     IdxVector vertices_;
 
     Simplex() = default;
@@ -50,7 +86,7 @@ struct Simplex {
     Simplex& operator=(const Simplex&) = default;
     Simplex& operator=(Simplex&&) = default;
 
-    Simplex(const IdxVector& _vertices)
+    Simplex(const IdxVector& _vertices, bool set_uid_immediately = true)
             :vertices_(_vertices)
     {
         if (vertices_.empty())
@@ -60,32 +96,39 @@ struct Simplex {
             id_ = vertices_[0];
         else
             std::sort(vertices_.begin(), vertices_.end());
+
+        if (set_uid_immediately) set_uid();
     }
 
+    // uids are set in parallel
+    void set_uid() { uid_ = simplex_uid<Int, Uid>(vertices_); }
+
     dim_type dim() const { return static_cast<Int>(vertices_.size()) - 1; }
 
     Int get_id() const { return id_; }
     void set_id(Int new_id) { id_ = new_id; }
 
-    Simplex(const int _id, const IdxVector& _vertices)
+    Simplex(const Int _id, const IdxVector& _vertices, bool set_uid_immediately = true)
             :vertices_(_vertices), id_(_id)
     {
         if (vertices_.empty())
             throw std::runtime_error("Empty simplex not allowed");
 
         if (vertices_.size() > 1)
-            std::sort(vertices_.begin(), vertices_.end());
+            std::sort(vertices_.begin(), vertices_.end(), std::greater<Int>());
+
+        if (set_uid_immediately) set_uid();
     }
 
     Boundary boundary() const
     {
-        std::vector<IdxVector> bdry;
+        Boundary boundary;
 
         if (dim() == 0)
-            return bdry;
-
-        bdry.reserve(vertices_.size());
+            return boundary;
 
+        boundary.reserve(vertices_.size());
+        // TODO: do not materialize tau and just skip in when computing uid?
         for(size_t i = 0 ; i < vertices_.size() ; ++i) {
             IdxVector tau;
             tau.reserve(vertices_.size() - 1);
@@ -96,10 +139,10 @@ struct Simplex {
 
             // vertices_ is sorted -> tau is sorted automatically
 
-            bdry.push_back(tau);
+            boundary.push_back(simplex_uid<Int, Uid>(tau));
         }
 
-        return bdry;
+        return boundary;
     }
 
     // create a new simplex by joining with vertex and assign value to it
@@ -132,39 +175,18 @@ struct Simplex {
     template<typename I>
     friend std::ostream& operator<<(std::ostream&, const Simplex<I>&);
 
-    const Uid& get_uid() const { return vertices_; }
-    void set_uid(const Uid& new_vs)
-    {
-        vertices_ = new_vs;
-        std::sort(vertices_.begin(), vertices_.end());
-    }
-
-    static std::string uid_to_string(const Uid& uid)
-    {
-        std::stringstream ss;
-        ss << "[";
-        for(auto v : uid) {
-            ss << v << ",";
-        }
-        ss << "]";
-        return ss.str();
-    }
+    Uid get_uid() const { return uid_; }
 
-    std::string uid_as_string() const
-    {
-        return uid_to_string(get_uid());
-    }
-
-    struct UidHasher {
-        std::size_t operator()(const Uid& vs) const
-        {
-            // TODO: replace with better hash function
-            std::size_t seed = 0;
-            for(auto v: vs)
-                oineus::hash_combine(seed, v);
-            return seed;
-        }
-    };
+    // struct UidHasher {
+    //     std::size_t operator()(const Uid& vs) const
+    //     {
+    //         // TODO: replace with better hash function
+    //         std::size_t seed = 0;
+    //         for(auto v: vs)
+    //             oineus::hash_combine(seed, v);
+    //         return seed;
+    //     }
+    // };
 
     std::string repr() const
     {
@@ -179,6 +201,7 @@ struct Simplex {
         return out.str();
     }
 
+    using UidHasher = std::hash<Uid>;
     using UidSet = std::unordered_set<Uid, UidHasher>;
 };
 
@@ -197,9 +220,9 @@ std::ostream& operator<<(std::ostream& out, const Simplex<I>& s)
 }
 
 namespace std {
-template<>
-struct hash<oineus::VREdge> {
-    std::size_t operator()(const oineus::VREdge& p) const
+template<class Int>
+struct hash<oineus::VREdge<Int>> {
+    std::size_t operator()(const oineus::VREdge<Int>& p) const
     {
         std::size_t seed = 0;
         oineus::hash_combine(seed, p.x);
diff --git a/include/oineus/sparse_matrix.h b/include/oineus/sparse_matrix.h
index 99aeeaa..676fbcf 100644
--- a/include/oineus/sparse_matrix.h
+++ b/include/oineus/sparse_matrix.h
@@ -66,6 +66,8 @@ struct SimpleSparseMatrixTraits {
 
     static void add_column(const Column* col_a, const Column* col_b, Column* sum);
 
+    static Column& r_data(Column* col) { return *col; }
+
     // get identity matrix
     static Matrix eye(size_t n)
     {
@@ -111,7 +113,10 @@ struct SimpleSparseMatrixTraits<Int_, 2> {
 
     static CachedColumn load_to_cache(Column* col)
     {
-        return load_to_cache(*col);
+        if (col == nullptr)
+            return CachedColumn();
+        else
+            return load_to_cache(*col);
     }
 
     static void add_to_cached(const Column& pivot, CachedColumn& reduced)
@@ -127,37 +132,15 @@ struct SimpleSparseMatrixTraits<Int_, 2> {
 
     static void add_to_cached(const Column* pivot, CachedColumn& reduced)
     {
-        //{
-        //    std::stringstream ss;
-        //    ss << "ENTER add_to_cached, pivots = [";
-        //    for(auto x : *pivot)
-        //        ss << x << ", ";
-        //    ss << "], reduced = [";
-        //    for(auto x : reduced)
-        //        ss << x << ", ";
-        //    ss << "]";
-        //    std::cerr << ss.str() << std::endl;
-        //}
-
         for(auto e : *pivot) {
             auto iter_exists = reduced.insert(e);
             if (!iter_exists.second)
                 reduced.erase(iter_exists.first);
         }
-
-        //{
-        //    std::stringstream ss;
-        //    ss << "EXIT add_to_cached, pivots = [";
-        //    for(auto x : *pivot)
-        //        ss << x << ", ";
-        //    ss << "], reduced = [";
-        //    for(auto x : reduced)
-        //        ss << x << ", ";
-        //    ss << "]";
-        //    std::cerr << ss.str() << std::endl;
-        //}
     }
 
+    static Column& r_data(Column* col) { return *col; }
+
     static bool is_zero(const CachedColumn& col)
     {
         return col.empty();
@@ -170,7 +153,10 @@ struct SimpleSparseMatrixTraits<Int_, 2> {
 
     static PColumn load_from_cache(const CachedColumn& col)
     {
-        return new Column(col.begin(), col.end());
+        if (col.empty())
+            return nullptr;
+        else
+            return new Column(col.begin(), col.end());
     }
 
     static void load_from_cache(const CachedColumn& cached_col, Column& col)
@@ -188,6 +174,25 @@ struct SimpleSparseMatrixTraits<Int_, 2> {
         return cols;
     }
 
+    // return empty string, if there are no duplicates
+    // otherwise return error message with column content
+    static std::string check_col_duplicates(PColumn col)
+    {
+        if (col == nullptr) return "";
+        std::set<Entry> seen;
+        for(const Entry& e : *col) {
+            if (seen.find(e) != seen.end()) {
+                std::stringstream ss;
+                ss << " ERROR DUPLICATES in " << (void*) col << ", [";
+                for(auto x : *col)
+                    ss << x << ", ";
+                ss << "] ";
+                return ss.str();
+            }
+            seen.insert(e);
+        }
+        return "";
+    }
 };
 
 template<typename Int_, int P>
@@ -373,14 +378,29 @@ struct SimpleRVMatrixTraits<Int_, 2> {
 
     static CachedColumn load_to_cache(Column* col)
     {
-        return load_to_cache(*col);
+        if (col == nullptr)
+            return {{}, {}};
+        else
+            return load_to_cache(*col);
     }
 
     static PColumn load_from_cache(const CachedColumn& col)
     {
-        return new Column({col.first.begin(), col.first.end()}, {col.second.begin(), col.second.end()});
+        if (is_zero(col))
+            return nullptr;
+        else
+            return new Column({col.first.begin(), col.first.end()}, {col.second.begin(), col.second.end()});
     }
 
+    static auto& r_data(Column* col) { return col->r_column; }
+
+    static std::string check_col_duplicates(PColumn col)
+    {
+        if (col == nullptr) return "";
+        std::string s_r = SimpleSparseMatrixTraits<Int, 2>::check_col_duplicates(&(col->r_column));
+        std::string s_v = SimpleSparseMatrixTraits<Int, 2>::check_col_duplicates(&(col->v_column));
+        return s_r + s_v;
+    }
 //    // get identity matrix
 //    static Matrix eye(size_t n)
 //    {
@@ -640,6 +660,7 @@ SparseMatrix<Int> mat_multiply_2(const SparseMatrix<Int>& a, const SparseMatrix<
 template<typename Int>
 SparseMatrix<Int> antitranspose(const SparseMatrix<Int>& a)
 {
+    CALI_CXX_MARK_FUNCTION;
     SparseMatrix<Int> result(a.n_cols(), a.n_rows());
 
     for(size_t row_idx = 0 ; row_idx < a.size() ; ++row_idx) {
diff --git a/include/oineus/top_optimizer.h b/include/oineus/top_optimizer.h
index bf9f4c4..1594cb3 100644
--- a/include/oineus/top_optimizer.h
+++ b/include/oineus/top_optimizer.h
@@ -134,8 +134,10 @@ class TopologyOptimizer {
             fil_(fil),
             negate_(fil.negate())
     {
-        params_hom_.clearing_opt = false;
-        params_coh_.clearing_opt = false;
+        params_hom_.compute_v = true;
+        params_coh_.compute_v = true;
+        params_hom_.compute_u = true;
+        params_coh_.compute_u = true;
     }
 
     TopologyOptimizer(const Fil& fil, const ComputeFlags& hints)
@@ -147,11 +149,6 @@ class TopologyOptimizer {
     {
         params_hom_.compute_u = hints.compute_homology_u;
         params_coh_.compute_u = hints.compute_cohomology_u;
-
-        if (hints.compute_homology_u)
-            params_hom_.clearing_opt = false;
-        if (hints.compute_cohomology_u)
-            params_coh_.clearing_opt = false;
     }
 
     bool cmp(Real a, Real b) const
@@ -273,7 +270,6 @@ class TopologyOptimizer {
             throw std::runtime_error("indices and values must have the same size");
 
         auto flags = get_flags(indices, values);
-        //IC(flags);
         params_coh_.compute_u = flags.compute_cohomology_u;
         params_hom_.compute_u = flags.compute_homology_u;
 
@@ -309,7 +305,7 @@ class TopologyOptimizer {
 
     Real get_cell_value(size_t simplex_idx) const
     {
-        return fil_.value_by_sorted_id(simplex_idx);
+        return fil_.get_cell_value(simplex_idx);
     }
 
 //    Target dgm_target_to_target(const DgmTarget& dgm_target) const
@@ -609,11 +605,13 @@ class TopologyOptimizer {
 
     Indices increase_death(size_t negative_simplex_idx) const
     {
+        CALI_CXX_MARK_FUNCTION;
         return increase_death(negative_simplex_idx, fil_.infinity());
     }
 
     Indices decrease_death(size_t negative_simplex_idx, Real target_death) const
     {
+        CALI_CXX_MARK_FUNCTION;
         Indices result;
 
         auto& r_cols = decmp_hom_.r_data;
@@ -645,6 +643,7 @@ class TopologyOptimizer {
 
     Indices decrease_death(size_t negative_simplex_idx) const
     {
+        CALI_CXX_MARK_FUNCTION;
         return decrease_death(negative_simplex_idx, -fil_.infinity());
     }
 
diff --git a/include/oineus/vietoris_rips.h b/include/oineus/vietoris_rips.h
index fbb1143..2354b0e 100644
--- a/include/oineus/vietoris_rips.h
+++ b/include/oineus/vietoris_rips.h
@@ -36,20 +36,19 @@ namespace oineus {
     }
 
     template<class Int, class Real>
-    std::pair<CellWithValue<Simplex<Int>, Real>, VREdge> vr_simplex_with_edge(const DistMatrix<Real>& dist_matrix, const std::vector<size_t>& vertices_)
+    std::pair<CellWithValue<Simplex<Int>, Real>, VREdge<Int>> vr_simplex_with_edge(const DistMatrix<Real>& dist_matrix, const typename Simplex<Int>::IdxVector& vertices)
     {
-        using IdxVector = typename Simplex<Int>::IdxVector;
-        using Simplex = CellWithValue<Simplex<Int>, Real>;
+        using SimplexWithValue = CellWithValue<Simplex<Int>, Real>;
 
-        assert(not vertices_.empty());
+        assert(not vertices.empty());
 
         Real crit_value = 0;
-        VREdge crit_edge {vertices_[0], vertices_[0]};
+        VREdge<Int> crit_edge {vertices[0], vertices[0]};
 
-        for(size_t u_idx = 0; u_idx < vertices_.size(); ++u_idx) {
-            for(size_t v_idx = u_idx + 1; v_idx < vertices_.size(); ++v_idx) {
-                size_t u = vertices_[u_idx];
-                size_t v = vertices_[v_idx];
+        for(size_t u_idx = 0; u_idx < vertices.size(); ++u_idx) {
+            for(size_t v_idx = u_idx + 1; v_idx < vertices.size(); ++v_idx) {
+                auto u = vertices[u_idx];
+                auto v = vertices[v_idx];
                 Real uv_dist = dist_matrix.get_distance(u, v);
                 if (uv_dist > crit_value) {
                     crit_value = uv_dist;
@@ -58,14 +57,36 @@ namespace oineus {
             }
         }
 
-        // convert size_t to Int, if necessary
-        IdxVector vertices {vertices_.begin(), vertices_.end()};
+        // false: do not compute uid immediately, set it in filtration ctor in parallel
+        return {SimplexWithValue(Simplex(vertices, false), crit_value), crit_edge};
+    }
+
+    template<class Int, class Real>
+    CellWithValue<Simplex<Int>, Real> vr_simplex(const DistMatrix<Real>& dist_matrix, const typename Simplex<Int>::IdxVector& vertices)
+    {
+        using Simp = CellWithValue<Simplex<Int>, Real>;
+
+        assert(not vertices.empty());
+
+        Real crit_value = 0;
+
+        for(size_t u_idx = 0; u_idx < vertices.size(); ++u_idx) {
+            for(size_t v_idx = u_idx + 1; v_idx < vertices.size(); ++v_idx) {
+                auto u = vertices[u_idx];
+                auto v = vertices[v_idx];
+                Real uv_dist = dist_matrix.get_distance(u, v);
+                if (uv_dist > crit_value) {
+                    crit_value = uv_dist;
+                }
+            }
+        }
 
-        return {Simplex(vertices, crit_value), crit_edge};
+        // false in Simplex<Int> ctor: do not set uid immediately
+        return Simp({vertices, false}, crit_value);
     }
 
     template<class Int, class Real, std::size_t D>
-    std::pair<CellWithValue<Simplex<Int>, Real>, VREdge> vr_simplex_with_edge(const std::vector<Point<Real, D>>& points, const std::vector<size_t>& vertices_)
+    std::pair<CellWithValue<Simplex<Int>, Real>, VREdge<Int>> vr_simplex_with_edge(const std::vector<Point<Real, D>>& points, const typename Simplex<Int>::IdxVector& vertices_)
     {
         using IdxVector = typename Simplex<Int>::IdxVector;
         using Simplex = CellWithValue<Simplex<Int>, Real>;
@@ -73,12 +94,12 @@ namespace oineus {
         assert(not vertices_.empty());
 
         Real crit_value = 0;
-        VREdge crit_edge {vertices_[0], vertices_[0]};
+        VREdge<Int> crit_edge {vertices_[0], vertices_[0]};
 
         for(size_t u_idx = 0; u_idx < vertices_.size(); ++u_idx) {
             for(size_t v_idx = u_idx + 1; v_idx < vertices_.size(); ++v_idx) {
-                size_t u = vertices_[u_idx];
-                size_t v = vertices_[v_idx];
+                auto u = vertices_[u_idx];
+                auto v = vertices_[v_idx];
                 if (sq_dist(points[u], points[v]) > crit_value) {
                     crit_value = sq_dist(points[u], points[v]);
                     crit_edge = {u, v};
@@ -88,12 +109,34 @@ namespace oineus {
 
         crit_value = sqrt(crit_value);
 
-        // convert size_t to Int, if necessary
-        IdxVector vertices {vertices_.begin(), vertices_.end()};
+        return {Simplex({vertices_}, crit_value), crit_edge};
+    }
+
+    template<class Int, class Real, std::size_t D>
+    CellWithValue<Simplex<Int>, Real> vr_simplex(const std::vector<Point<Real, D>>& points, const typename Simplex<Int>::IdxVector& vertices)
+    {
+        using Simplex = CellWithValue<Simplex<Int>, Real>;
 
-        return {Simplex({vertices}, crit_value), crit_edge};
+        assert(not vertices.empty());
+
+        Real crit_value = 0;
+
+        for(size_t u_idx = 0; u_idx < vertices.size(); ++u_idx) {
+            for(size_t v_idx = u_idx + 1; v_idx < vertices.size(); ++v_idx) {
+                auto u = vertices[u_idx];
+                auto v = vertices[v_idx];
+                if (sq_dist(points[u], points[v]) > crit_value) {
+                    crit_value = sq_dist(points[u], points[v]);
+                }
+            }
+        }
+
+        crit_value = sqrt(crit_value);
+
+        return Simplex({vertices}, crit_value);
     }
 
+
     template<class Functor, class NeighborTest, class VertexContainer>
     void bron_kerbosch(VertexContainer& current,
             const VertexContainer& candidates,
@@ -103,29 +146,58 @@ namespace oineus {
             const Functor& functor,
             bool check_initial)
     {
+        CALI_CXX_MARK_FUNCTION;
         if (check_initial and not current.empty())
             functor(current);
 
         if (current.size() == max_dim + 1)
             return;
 
+        size_t cur_idx = 0;
         for(auto cur = excluded_end; cur != candidates.end(); ++cur) {
             current.push_back(*cur);
 
             VertexContainer new_candidates;
 
-            for(auto ccur = candidates.begin(); ccur != cur; ++ccur)
+            size_t i = 0;
+
+            for(auto ccur = candidates.begin(); ccur != cur; ++ccur) {
                 if (neighbor(*ccur, *cur))
                     new_candidates.push_back(*ccur);
 
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                if (i % 500 == 0) {
+                    OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                }
+                i++;
+#endif
+            }
+
             size_t ex = new_candidates.size();
 
-            for(auto ccur = std::next(cur); ccur != candidates.end(); ++ccur)
+            i = 0;
+            for(auto ccur = std::next(cur); ccur != candidates.end(); ++ccur) {
                 if (neighbor(*ccur, *cur))
                     new_candidates.push_back(*ccur);
 
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                if (i % 500 == 0) {
+                    OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+                }
+                i++;
+#endif
+
+            }
+
             excluded_end = new_candidates.begin() + ex;
 
+#ifdef OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+            if (cur_idx % 500 == 0) {
+                OINEUS_CHECK_FOR_PYTHON_INTERRUPT
+            }
+            cur_idx++;
+#endif
+
             bron_kerbosch(current, new_candidates, excluded_end, max_dim, neighbor, functor, true);
 
             current.pop_back();
@@ -135,21 +207,22 @@ namespace oineus {
 
     // Bron-Kerbosch, from Dionysus
     template<class Int, class Real, std::size_t D>
-    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge>> get_vr_filtration_and_critical_edges(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge<Int>>> get_vr_filtration_and_critical_edges(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
+        CALI_CXX_MARK_FUNCTION;
         using VRFiltration = Filtration<Simplex<Int>, Real>;
         using Simplex = typename VRFiltration::Cell;
-        using VertexContainer = std::vector<size_t>;
-        using Edges = std::vector<VREdge>;
+        using VertexContainer = typename Simplex::Cell::IdxVector;
+        using Edges = std::vector<VREdge<Int>>;
 
-        auto neighbor = [&](size_t u, size_t v) { return sq_dist(points[u], points[v]) <= max_radius * max_radius; };
+        auto neighbor = [&](size_t u, size_t v) { return sq_dist(points[u], points[v]) <= max_diameter * max_diameter; };
 
         std::vector<Simplex> simplices;
         Edges edges;
         bool negate {false};
 
         // vertices are added manually to preserve order (id == index)
-        for(size_t v = 0; v < points.size(); ++v) {
+        for(Int v = 0; v < static_cast<Int>(points.size()); ++v) {
             auto [simplex, edge] = vr_simplex_with_edge<Int, Real>(points, {v});
             simplices.emplace_back(simplex);
             edges.emplace_back(edge);
@@ -175,6 +248,7 @@ namespace oineus {
 
         // use sorted info from fil to rearrange edges
         Edges sorted_edges;
+
         sorted_edges.reserve(edges.size());
 
         for(size_t sorted_edge_idx = 0; sorted_edge_idx < edges.size(); ++sorted_edge_idx) {
@@ -185,27 +259,28 @@ namespace oineus {
     }
 
     template<class Int, class Real>
-    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge>> get_vr_filtration_and_critical_edges(const DistMatrix<Real>& dist_matrix, dim_type max_dim, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge<Int>>> get_vr_filtration_and_critical_edges(const DistMatrix<Real>& dist_matrix, dim_type max_dim, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
         using Filtration = Filtration<Simplex<Int>, Real>;
+        using VertexContainer = typename Simplex<Int>::IdxVector;
         using Simplex = CellWithValue<Simplex<Int>, Real>;
-        using VertexContainer = std::vector<size_t>;
 
-        auto neighbor = [&](size_t u, size_t v) { return dist_matrix.get_distance(u, v) <= max_radius * max_radius; };
+        auto neighbor = [&](size_t u, size_t v) { return dist_matrix.get_distance(u, v) <= max_diameter * max_diameter; };
 
         std::vector<Simplex> simplices;
-        std::vector<VREdge> edges;
+        std::vector<VREdge<Int>> edges;
         bool negate {false};
 
-        size_t n_points = dist_matrix.n_points;
+        Int n_points = dist_matrix.n_points;
 
         // vertices are added manually to preserve order (id == index)
-        for(size_t v = 0; v < n_points; ++v) {
+        for(Int v = 0; v < n_points; ++v) {
             auto [simplex, edge] = vr_simplex_with_edge<Int, Real>(dist_matrix, {v});
             simplices.emplace_back(simplex);
             edges.emplace_back(edge);
         }
 
+
         auto functor = [&](const VertexContainer& vs)
                 { if (vs.size() > 1) {
                     auto [simplex, edge] = vr_simplex_with_edge<Int, Real>(dist_matrix, vs);
@@ -225,7 +300,7 @@ namespace oineus {
         auto fil = Filtration(std::move(simplices), negate, n_threads);
 
         // use sorted info from fil to rearrange edges
-        std::vector<VREdge> sorted_edges;
+        std::vector<VREdge<Int>> sorted_edges;
         sorted_edges.reserve(edges.size());
 
         for(size_t sorted_edge_idx = 0; sorted_edge_idx < edges.size(); ++sorted_edge_idx) {
@@ -236,27 +311,85 @@ namespace oineus {
     }
 
     template<class Int, class Real, std::size_t D>
-    auto get_vr_filtration(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    auto get_vr_filtration(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
-        return get_vr_filtration_and_critical_edges<Int, Real, D>(points, max_dim, max_radius, n_threads).first;
+        CALI_CXX_MARK_FUNCTION;
+        using VertexContainer = typename Simplex<Int>::IdxVector;
+
+        auto neighbor = [&](size_t u, size_t v) { return sq_dist(points[u], points[v]) <= max_diameter * max_diameter; };
+
+        std::vector<CellWithValue<Simplex<Int>, Real>> simplices;
+        bool negate {false};
+
+        // vertices are added manually to preserve order (id == index)
+        for(Int v = 0; v < static_cast<Int>(points.size()); ++v) {
+            simplices.emplace_back(vr_simplex<Int, Real, D>(points, {v}));
+        }
+
+        auto functor = [&](const VertexContainer& vs)
+                { if (vs.size() > 1) {
+                    simplices.emplace_back(vr_simplex<Int, Real, D>(points, vs));
+                }
+                };
+
+        VertexContainer current;
+        VertexContainer candidates(points.size());
+        std::iota(candidates.begin(), candidates.end(), 0);
+        auto excluded_end {candidates.cbegin()};
+        bool check_initial {false};
+
+        bron_kerbosch(current, candidates, excluded_end, max_dim, neighbor, functor, check_initial);
+        // Filtration constructor will sort simplices and assign sorted ids
+        return Filtration<Simplex<Int>, Real>(std::move(simplices), negate, n_threads);
     }
 
     template<class Int, class Real>
-    auto get_vr_filtration(const DistMatrix<Real>& dist_matrix, dim_type max_dim, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    auto get_vr_filtration(const DistMatrix<Real>& dist_matrix, dim_type max_dim, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
-        return get_vr_filtration_and_critical_edges<Int, Real>(dist_matrix, max_dim, max_radius, n_threads).first;
+        using Filtration = Filtration<Simplex<Int>, Real>;
+        using VertexContainer = typename Simplex<Int>::IdxVector;
+        using Simplex = CellWithValue<Simplex<Int>, Real>;
+
+        auto neighbor = [&](size_t u, size_t v) { return dist_matrix.get_distance(u, v) <= max_diameter * max_diameter; };
+
+        std::vector<Simplex> simplices;
+        bool negate {false};
+
+        Int n_points = dist_matrix.n_points;
+
+        // vertices are added manually to preserve order (id == index)
+        for(Int v = 0; v < n_points; ++v) {
+            auto simplex = vr_simplex<Int, Real>(dist_matrix, {v});
+            simplices.emplace_back(simplex);
+        }
+
+        auto functor = [&](const VertexContainer& vs)
+                { if (vs.size() > 1) {
+                    simplices.emplace_back(vr_simplex<Int, Real>(dist_matrix, vs));
+                }
+                };
+
+        VertexContainer current;
+        VertexContainer candidates(n_points);
+        std::iota(candidates.begin(), candidates.end(), 0);
+        auto excluded_end {candidates.cbegin()};
+        bool check_initial {false};
+
+        bron_kerbosch(current, candidates, excluded_end, max_dim, neighbor, functor, check_initial);
+        // Filtration constructor will sort simplices and assign sorted ids
+        return Filtration(std::move(simplices), negate, n_threads);
     }
 
 
     // stupid brute-force
     template<class Int, class Real, std::size_t D>
-    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge>> get_vr_filtration_and_critical_edges_naive(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    std::pair<Filtration<Simplex<Int>, Real>, std::vector<VREdge<Int>>> get_vr_filtration_and_critical_edges_naive(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
         using VRFiltration = Filtration<Simplex<Int>, Real>;
         using VRSimplex = typename VRFiltration::Cell;
 
         std::vector<VRSimplex> simplices;
-        std::vector<VREdge> edges;
+        std::vector<VREdge<Int>> edges;
 
         for(size_t v_idx = 0; v_idx < points.size(); ++v_idx) {
             std::vector<size_t> vertices {v_idx};
@@ -269,7 +402,7 @@ namespace oineus {
             for(size_t u_idx = 0; u_idx < points.size(); ++u_idx)
                 for(size_t v_idx = u_idx + 1; v_idx < points.size(); ++v_idx) {
                     auto [s, e] = vr_simplex_with_edge<Int, Real, D>(points, {u_idx, v_idx});
-                    if (s.get_value() <= max_radius) {
+                    if (s.get_value() <= max_diameter) {
                         simplices.emplace_back(s);
                         edges.emplace_back(e);
                     }
@@ -280,7 +413,7 @@ namespace oineus {
                 for(size_t v_idx = u_idx + 1; v_idx < points.size(); ++v_idx)
                     for(size_t w_idx = v_idx + 1; w_idx < points.size(); ++w_idx) {
                         auto [s, e] = vr_simplex_with_edge<Int, Real, D>(points, {u_idx, v_idx, w_idx});
-                        if (s.get_value() <= max_radius) {
+                        if (s.get_value() <= max_diameter) {
                             simplices.emplace_back(s);
                             edges.emplace_back(e);
                         }
@@ -292,7 +425,7 @@ namespace oineus {
                     for(size_t w_idx = v_idx + 1; w_idx < points.size(); ++w_idx)
                         for(size_t t_idx = w_idx + 1; t_idx < points.size(); ++t_idx) {
                             auto [s, e] = vr_simplex_with_edge<Int, Real, D>(points, {u_idx, v_idx, w_idx, t_idx});
-                            if (s.get_value() <= max_radius) {
+                            if (s.get_value() <= max_diameter) {
                                 simplices.emplace_back(s);
                                 edges.emplace_back(e);
                             }
@@ -301,13 +434,13 @@ namespace oineus {
         if (max_dim >= 4)
             throw std::runtime_error("not implemented");
 
-        return std::make_pair<VRFiltration, std::vector<VREdge>>(VRFiltration(std::move(simplices), false, n_threads), std::move(edges));
+        return std::make_pair<VRFiltration, std::vector<VREdge<Int>>>(VRFiltration(std::move(simplices), false, n_threads), std::move(edges));
     }
 
     template<class Int, class Real, std::size_t D>
-    auto get_vr_filtration_naive(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_radius = std::numeric_limits<Real>::max(), int n_threads = 1)
+    auto get_vr_filtration_naive(const std::vector<Point<Real, D>>& points, dim_type max_dim = D, Real max_diameter = std::numeric_limits<Real>::max(), int n_threads = 1)
     {
-        return get_vr_filtration_and_critical_edges_naive<Int, Real, D>(points, max_dim, max_radius, n_threads).first;
+        return get_vr_filtration_and_critical_edges_naive<Int, Real, D>(points, max_dim, max_diameter, n_threads).first;
     }
 
 };
diff --git a/version.txt b/version.txt
index b4c1b73..236d3b0 100644
--- a/version.txt
+++ b/version.txt
@@ -1,3 +1,3 @@
 MAJOR 0
 MINOR 9
-PATCH 7
+PATCH 8