diff --git a/contrib/evaluation.py b/contrib/evaluation.py
index f78e33951d..9c762b6081 100644
--- a/contrib/evaluation.py
+++ b/contrib/evaluation.py
@@ -226,7 +226,7 @@ def compute_PR_for(q):
 # Functions that compare search results with a reference result.
 # They are intended for use in tests
 
-def test_ref_knn_with_draws(Dref, Iref, Dnew, Inew):
+def check_ref_knn_with_draws(Dref, Iref, Dnew, Inew):
     """ test that knn search results are identical, raise if not """
     np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
     # here we have to be careful because of draws
@@ -243,14 +243,14 @@ def test_ref_knn_with_draws(Dref, Iref, Dnew, Inew):
             testcase.assertEqual(set(Iref[i, mask]), set(Inew[i, mask]))
 
 
-def test_ref_range_results(lims_ref, Dref, Iref,
-                           lims_new, Dnew, Inew):
+def check_ref_range_results(Lref, Dref, Iref,
+                            Lnew, Dnew, Inew):
     """ compare range search results wrt. a reference result,
     throw if it fails """
-    np.testing.assert_array_equal(lims_ref, lims_new)
-    nq = len(lims_ref) - 1
+    np.testing.assert_array_equal(Lref, Lnew)
+    nq = len(Lref) - 1
     for i in range(nq):
-        l0, l1 = lims_ref[i], lims_ref[i + 1]
+        l0, l1 = Lref[i], Lref[i + 1]
         Ii_ref = Iref[l0:l1]
         Ii_new = Inew[l0:l1]
         Di_ref = Dref[l0:l1]
diff --git a/contrib/exhaustive_search.py b/contrib/exhaustive_search.py
index fa7f1b11c4..19b452a25d 100644
--- a/contrib/exhaustive_search.py
+++ b/contrib/exhaustive_search.py
@@ -11,7 +11,6 @@
 
 LOG = logging.getLogger(__name__)
 
-
 def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2):
     """Computes the exact KNN search results for a dataset that possibly
     does not fit in RAM but for which we have an iterator that
@@ -51,47 +50,82 @@ def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2):
 
 
-def range_search_gpu(xq, r2, index_gpu, index_cpu):
+def range_search_gpu(xq, r2, index_gpu, index_cpu, gpu_k=1024):
     """GPU does not support range search, so we emulate it with
     knn search + fallback to CPU index.
 
-    The index_cpu can either be a CPU index or a numpy table that will
-    be used to construct a Flat index if needed.
+    The index_cpu can either be:
+    - a CPU index that supports range search
+    - a numpy table that will be used to construct a Flat index if needed.
+    - None. In that case, at most gpu_k results will be returned
     """
     nq, d = xq.shape
-    LOG.debug("GPU search %d queries" % nq)
-    k = min(index_gpu.ntotal, 1024)
+    k = min(index_gpu.ntotal, gpu_k)
+    keep_max = faiss.is_similarity_metric(index_gpu.metric_type)
+    LOG.debug(f"GPU search {nq} queries with {k=:}")
+    t0 = time.time()
     D, I = index_gpu.search(xq, k)
-    if index_gpu.metric_type == faiss.METRIC_L2:
-        mask = D[:, k - 1] < r2
-    else:
-        mask = D[:, k - 1] > r2
-    if mask.sum() > 0:
-        LOG.debug("CPU search remain %d" % mask.sum())
-        if isinstance(index_cpu, np.ndarray):
-            # then it in fact an array that we have to make flat
-            xb = index_cpu
-            index_cpu = faiss.IndexFlat(d, index_gpu.metric_type)
-            index_cpu.add(xb)
-        lim_remain, D_remain, I_remain = index_cpu.range_search(xq[mask], r2)
+    t1 = time.time() - t0
+    t2 = 0
+    lim_remain = None
+    if index_cpu is not None:
+        if not keep_max:
+            mask = D[:, k - 1] < r2
+        else:
+            mask = D[:, k - 1] > r2
+        if mask.sum() > 0:
+            LOG.debug("CPU search remain %d" % mask.sum())
+            t0 = time.time()
+            if isinstance(index_cpu, np.ndarray):
+                # then it is in fact an array that we have to make flat
+                xb = index_cpu
+                index_cpu = faiss.IndexFlat(d, index_gpu.metric_type)
+                index_cpu.add(xb)
+            lim_remain, D_remain, I_remain = index_cpu.range_search(xq[mask], r2)
+            t2 = time.time() - t0
     LOG.debug("combine")
-    D_res, I_res = [], []
-    nr = 0
-    for i in range(nq):
-        if not mask[i]:
-            if index_gpu.metric_type == faiss.METRIC_L2:
-                nv = (D[i, :] < r2).sum()
+    t0 = time.time()
+
+    combiner = faiss.CombinerRangeKNN(nq, k, float(r2), keep_max)
+    if True:
+        sp = faiss.swig_ptr
+        combiner.I = sp(I)
+        combiner.D = sp(D)
+        # combiner.set_knn_result(sp(I), sp(D))
+        if lim_remain is not None:
+            combiner.mask = sp(mask)
+            combiner.D_remain = sp(D_remain)
+            combiner.lim_remain = sp(lim_remain.view("int64"))
+            combiner.I_remain = sp(I_remain)
+            # combiner.set_range_result(sp(mask), sp(lim_remain.view("int64")), sp(D_remain), sp(I_remain))
+        L_res = np.empty(nq + 1, dtype='int64')
+        combiner.compute_sizes(sp(L_res))
+        nres = L_res[-1]
+        D_res = np.empty(nres, dtype='float32')
+        I_res = np.empty(nres, dtype='int64')
+        combiner.write_result(sp(D_res), sp(I_res))
+    else:
+        D_res, I_res = [], []
+        nr = 0
+        for i in range(nq):
+            if not mask[i]:
+                if index_gpu.metric_type == faiss.METRIC_L2:
+                    nv = (D[i, :] < r2).sum()
+                else:
+                    nv = (D[i, :] > r2).sum()
+                D_res.append(D[i, :nv])
+                I_res.append(I[i, :nv])
             else:
-                nv = (D[i, :] > r2).sum()
-            D_res.append(D[i, :nv])
-            I_res.append(I[i, :nv])
-        else:
-            l0, l1 = lim_remain[nr], lim_remain[nr + 1]
-            D_res.append(D_remain[l0:l1])
-            I_res.append(I_remain[l0:l1])
-            nr += 1
-    lims = np.cumsum([0] + [len(di) for di in D_res])
-    return lims, np.hstack(D_res), np.hstack(I_res)
+                l0, l1 = lim_remain[nr], lim_remain[nr + 1]
+                D_res.append(D_remain[l0:l1])
+                I_res.append(I_remain[l0:l1])
+                nr += 1
+        L_res = np.cumsum([0] + [len(di) for di in D_res])
+        D_res = np.hstack(D_res)
+        I_res = np.hstack(I_res)
+    t3 = time.time() - t0
+    LOG.debug(f"times {t1:.3f}s {t2:.3f}s {t3:.3f}s")
+    return L_res, D_res, I_res
 
 
 def range_ground_truth(xq, db_iterator, threshold, metric_type=faiss.METRIC_L2,
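For reference, a minimal sketch of how the reworked range_search_gpu is meant
to be called. The dataset, index type and threshold below are illustrative
assumptions, not part of the patch; passing the raw database array as
index_cpu builds the Flat fallback on the fly, and passing None disables the
CPU fallback entirely, capping each query at gpu_k results:

    import numpy as np
    import faiss
    from faiss.contrib.exhaustive_search import range_search_gpu

    d = 32
    rs = np.random.RandomState(0)
    xb = rs.rand(10000, d).astype('float32')  # database (illustrative)
    xq = rs.rand(100, d).astype('float32')    # queries (illustrative)

    index_gpu = faiss.index_cpu_to_all_gpus(faiss.IndexFlat(d))
    index_gpu.add(xb)

    # pick a radius that returns a handful of results per query
    D, _ = index_gpu.search(xq, 10)
    r2 = float(np.median(D[:, 5]))

    # GPU kNN pass capped at gpu_k results; queries whose gpu_k-th neighbor
    # is still within the radius fall back to a CPU range search over xb
    lims, D_res, I_res = range_search_gpu(xq, r2, index_gpu, xb, gpu_k=64)
    print(I_res[lims[0]:lims[1]])  # neighbors of query 0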
diff --git a/contrib/ivf_tools.py b/contrib/ivf_tools.py
index 3ea8827315..26ada886a1 100644
--- a/contrib/ivf_tools.py
+++ b/contrib/ivf_tools.py
@@ -77,7 +77,7 @@ def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None):
 
     res = faiss.RangeSearchResult(n)
     sp = faiss.swig_ptr
-    index_ivf.range_search_preassigned(
+    index_ivf.range_search_preassigned_c(
         n, sp(x),
         radius,
         sp(list_nos), sp(coarse_dis),
         res
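For context, the contrib helper above is driven like this: the coarse
quantizer assignment is computed once and then reused across searches. A
sketch, with an assumed synthetic dataset and a radius derived from a kNN run:

    import numpy as np
    import faiss
    from faiss.contrib import ivf_tools
    from faiss.contrib.datasets import SyntheticDataset

    ds = SyntheticDataset(32, 1000, 2000, 10)
    index = faiss.index_factory(ds.d, "IVF32,Flat")
    index.train(ds.get_train())
    index.add(ds.get_database())
    index.nprobe = 4

    D, _ = index.search(ds.get_queries(), 10)
    radius = float(np.median(D[:, 5]))

    # coarse assignment computed once, reusable for several searches
    Dq, Iq = index.quantizer.search(ds.get_queries(), index.nprobe)
    lims, D_res, I_res = ivf_tools.range_search_preassigned(
        index, ds.get_queries(), radius, Iq, Dq)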
diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index 80e1fa20cb..9109981a73 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -309,6 +309,7 @@ Index* ToGpuClonerMultiple::clone_Index_to_shards(const Index* index) {
 
     std::vector<faiss::Index*> shards(n);
 
+#pragma omp parallel for
     for (idx_t i = 0; i < n; i++) {
         // make a shallow copy
         if (reserveVecs) {
diff --git a/faiss/gpu/test/test_contrib_gpu.py b/faiss/gpu/test/test_contrib_gpu.py
index 061df7da5b..faeeb5d0d8 100644
--- a/faiss/gpu/test/test_contrib_gpu.py
+++ b/faiss/gpu/test/test_contrib_gpu.py
@@ -12,7 +12,7 @@
 from faiss.contrib import datasets, evaluation, big_batch_search
 from faiss.contrib.exhaustive_search import knn_ground_truth, \
-    range_ground_truth
+    range_ground_truth, range_search_gpu
 
 
 class TestComputeGT(unittest.TestCase):
@@ -51,7 +51,7 @@ def do_test_range(self, metric):
             xq, ds.database_iterator(bs=100), threshold,
             metric_type=metric)
-        evaluation.test_ref_range_results(
+        evaluation.check_ref_range_results(
             ref_lims, ref_D, ref_I,
             new_lims, new_D, new_I
         )
@@ -131,3 +131,49 @@ def knn_function(xq, xb, k, metric=faiss.METRIC_L2, thread_id=None):
 
     def test_Flat(self):
         self.do_test("IVF64,Flat")
+
+
+class TestRangeSearchGpu(unittest.TestCase):
+
+    def do_test(self, factory_string):
+        ds = datasets.SyntheticDataset(32, 2000, 4000, 1000)
+        k = 10
+        index_gpu = faiss.index_cpu_to_all_gpus(
+            faiss.index_factory(ds.d, factory_string)
+        )
+        index_gpu.train(ds.get_train())
+        index_gpu.add(ds.get_database())
+        # just to find a reasonable threshold
+        D, _ = index_gpu.search(ds.get_queries(), k)
+        threshold = np.median(D[:, 5])
+
+        # ref run
+        index_cpu = faiss.index_gpu_to_cpu(index_gpu)
+        Lref, Dref, Iref = index_cpu.range_search(ds.get_queries(), threshold)
+        nres_per_query = Lref[1:] - Lref[:-1]
+        # make sure some entries were computed by CPU and some by GPU
+        assert np.any(nres_per_query > 4) and not np.all(nres_per_query > 4)
+
+        # mixed GPU / CPU run
+        Lnew, Dnew, Inew = range_search_gpu(
+            ds.get_queries(), threshold, index_gpu, index_cpu, gpu_k=4)
+        evaluation.check_ref_range_results(
+            Lref, Dref, Iref,
+            Lnew, Dnew, Inew
+        )
+
+        # also test the version without CPU search
+        Lnew2, Dnew2, Inew2 = range_search_gpu(
+            ds.get_queries(), threshold, index_gpu, None, gpu_k=4)
+        for q in range(ds.nq):
+            ref = Iref[Lref[q]:Lref[q+1]]
+            new = Inew2[Lnew2[q]:Lnew2[q+1]]
+            if nres_per_query[q] <= 4:
+                self.assertEqual(set(ref), set(new))
+            else:
+                ref = set(ref)
+                for v in new:
+                    self.assertIn(v, ref)
+
+    def test_ivf(self):
+        self.do_test("IVF64,Flat")
diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py
index 49343043bd..c0f0f9456d 100644
--- a/faiss/python/__init__.py
+++ b/faiss/python/__init__.py
@@ -22,7 +22,7 @@
 from faiss.extra_wrappers import kmin, kmax, pairwise_distances, rand, randint, \
     lrand, randn, rand_smooth_vectors, eval_intersection, normalize_L2, \
     ResultHeap, knn, Kmeans, checksum, matrix_bucket_sort_inplace, bucket_sort, \
-    merge_knn_results
+    merge_knn_results, MapInt64ToInt64
 
 
 __version__ = "%d.%d.%d" % (FAISS_VERSION_MAJOR,
diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py
index 54539797f5..7d410d4afb 100644
--- a/faiss/python/class_wrappers.py
+++ b/faiss/python/class_wrappers.py
@@ -544,6 +544,7 @@ def replacement_range_search(self, x, thresh, *, params=None):
         n, d = x.shape
         assert d == self.d
         x = np.ascontiguousarray(x, dtype='float32')
+        thresh = float(thresh)
 
         res = RangeSearchResult(n)
self.range_search_c(n, swig_ptr(x), thresh, res, params) @@ -618,6 +619,64 @@ def replacement_search_preassigned(self, x, k, Iq, Dq, *, params=None, D=None, I ) return D, I + def replacement_range_search_preassigned(self, x, thresh, Iq, Dq, *, params=None): + """Search vectors that are within a distance of the query vectors. + + Parameters + ---------- + x : array_like + Query vectors, shape (n, d) where d is appropriate for the index. + `dtype` must be float32. + thresh : float + Threshold to select neighbors. All elements within this radius are returned, + except for maximum inner product indexes, where the elements above the + threshold are returned + Iq : array_like, optional + Nearest centroids, size (n, nprobe) + Dq : array_like, optional + Distance array to the centroids, size (n, nprobe) + params : SearchParameters + Search parameters of the current search (overrides the class-level params) + + + Returns + ------- + lims: array_like + Starting index of the results for each query vector, size n+1. + D : array_like + Distances of the nearest neighbors, shape `lims[n]`. The distances for + query i are in `D[lims[i]:lims[i+1]]`. + I : array_like + Labels of nearest neighbors, shape `lims[n]`. The labels for query i + are in `I[lims[i]:lims[i+1]]`. + + """ + n, d = x.shape + assert d == self.d + x = np.ascontiguousarray(x, dtype='float32') + + Iq = np.ascontiguousarray(Iq, dtype='int64') + assert params is None, "params not supported" + assert Iq.shape == (n, self.nprobe) + + if Dq is not None: + Dq = np.ascontiguousarray(Dq, dtype='float32') + assert Dq.shape == Iq.shape + + thresh = float(thresh) + res = RangeSearchResult(n) + self.range_search_preassigned_c( + n, swig_ptr(x), thresh, + swig_ptr(Iq), swig_ptr(Dq), + res + ) + # get pointers and copy them + lims = rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = rev_swig_ptr(res.distances, nd).copy() + I = rev_swig_ptr(res.labels, nd).copy() + return lims, D, I + def replacement_sa_encode(self, x, codes=None): n, d = x.shape assert d == self.d @@ -675,8 +734,12 @@ def replacement_permute_entries(self, perm): ignore_missing=True) replace_method(the_class, 'search_and_reconstruct', replacement_search_and_reconstruct, ignore_missing=True) + + # these ones are IVF-specific replace_method(the_class, 'search_preassigned', replacement_search_preassigned, ignore_missing=True) + replace_method(the_class, 'range_search_preassigned', + replacement_range_search_preassigned, ignore_missing=True) replace_method(the_class, 'sa_encode', replacement_sa_encode) replace_method(the_class, 'sa_decode', replacement_sa_decode) replace_method(the_class, 'add_sa_codes', replacement_add_sa_codes, @@ -776,6 +839,36 @@ def replacement_range_search(self, x, thresh): I = rev_swig_ptr(res.labels, nd).copy() return lims, D, I + def replacement_range_search_preassigned(self, x, thresh, Iq, Dq, *, params=None): + n, d = x.shape + x = _check_dtype_uint8(x) + assert d * 8 == self.d + + Iq = np.ascontiguousarray(Iq, dtype='int64') + assert params is None, "params not supported" + assert Iq.shape == (n, self.nprobe) + + if Dq is not None: + Dq = np.ascontiguousarray(Dq, dtype='int32') + assert Dq.shape == Iq.shape + + thresh = int(thresh) + res = RangeSearchResult(n) + self.range_search_preassigned_c( + n, swig_ptr(x), thresh, + swig_ptr(Iq), swig_ptr(Dq), + res + ) + # get pointers and copy them + lims = rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = rev_swig_ptr(res.distances, nd).copy() + I = rev_swig_ptr(res.labels, nd).copy() + return 
lims, D, I + + + + def replacement_remove_ids(self, x): if isinstance(x, IDSelector): sel = x @@ -794,6 +887,8 @@ def replacement_remove_ids(self, x): replace_method(the_class, 'remove_ids', replacement_remove_ids) replace_method(the_class, 'search_preassigned', replacement_search_preassigned, ignore_missing=True) + replace_method(the_class, 'range_search_preassigned', + replacement_range_search_preassigned, ignore_missing=True) def handle_VectorTransform(the_class): @@ -937,7 +1032,7 @@ def handle_MapLong2Long(the_class): def replacement_map_add(self, keys, vals): n, = keys.shape - assert (n,) == keys.shape + assert (n,) == vals.shape self.add_c(n, swig_ptr(keys), swig_ptr(vals)) def replacement_map_search_multiple(self, keys): diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py index deff3c8995..d264767724 100644 --- a/faiss/python/extra_wrappers.py +++ b/faiss/python/extra_wrappers.py @@ -297,6 +297,34 @@ def merge_knn_results(Dall, Iall, keep_max=False): ) return Dnew, Inew +###################################################### +# Efficient ID to ID map +###################################################### + +class MapInt64ToInt64: + + def __init__(self, capacity): + self.log2_capacity = int(np.log2(capacity)) + assert capacity == 2 ** self.log2_capacity, "need power of 2 capacity" + self.capacity = capacity + self.tab = np.empty((capacity, 2), dtype='int64') + faiss.hashtable_int64_to_int64_init(self.log2_capacity, swig_ptr(self.tab)) + + def add(self, keys, vals): + n, = keys.shape + assert vals.shape == (n,) + faiss.hashtable_int64_to_int64_add( + self.log2_capacity, swig_ptr(self.tab), + n, swig_ptr(keys), swig_ptr(vals)) + + def lookup(self, keys): + n, = keys.shape + vals = np.empty((n,), dtype='int64') + faiss.hashtable_int64_to_int64_lookup( + self.log2_capacity, swig_ptr(self.tab), + n, swig_ptr(keys), swig_ptr(vals)) + return vals + ###################################################### # KNN function ###################################################### diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index a57aa93536..b49dcd80e2 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -794,17 +794,141 @@ faiss::Quantizer * downcast_Quantizer (faiss::Quantizer *aq) #endif +/******************************************************************* + * How should the template objects appear in the scripting language? 
+ *******************************************************************/
+
+// answer: the same as the C++ typedefs, but we still have to redefine them
+
+%template() faiss::CMin<float, int64_t>;
+%template() faiss::CMin<int, int64_t>;
+%template() faiss::CMax<float, int64_t>;
+%template() faiss::CMax<int, int64_t>;
+
+%template(float_minheap_array_t) faiss::HeapArray<faiss::CMin<float, int64_t> >;
+%template(int_minheap_array_t) faiss::HeapArray<faiss::CMin<int, int64_t> >;
+%template(float_maxheap_array_t) faiss::HeapArray<faiss::CMax<float, int64_t> >;
+%template(int_maxheap_array_t) faiss::HeapArray<faiss::CMax<int, int64_t> >;
+
+%template(CMin_float_partition_fuzzy)
+    faiss::partition_fuzzy<faiss::CMin<float, int64_t> >;
+%template(CMax_float_partition_fuzzy)
+    faiss::partition_fuzzy<faiss::CMax<float, int64_t> >;
+
+%template(AlignedTableUint8) faiss::AlignedTable<uint8_t>;
+%template(AlignedTableUint16) faiss::AlignedTable<uint16_t>;
+%template(AlignedTableFloat32) faiss::AlignedTable<float>;
+
+
+// SWIG seems to have some trouble resolving function template types here, so
+// declare explicitly
+
+%define INSTANTIATE_uint16_partition_fuzzy(C, id_t)
+
+%inline %{
+
+uint16_t C ## _uint16_partition_fuzzy(
+    uint16_t *vals, id_t *ids, size_t n,
+    size_t q_min, size_t q_max, size_t * q_out)
+{
+    return faiss::partition_fuzzy<faiss::C<uint16_t, id_t> >(
+        vals, ids, n, q_min, q_max, q_out);
+}
+
+%}
+
+%enddef
+
+INSTANTIATE_uint16_partition_fuzzy(CMin, int64_t)
+INSTANTIATE_uint16_partition_fuzzy(CMax, int64_t)
+INSTANTIATE_uint16_partition_fuzzy(CMin, int)
+INSTANTIATE_uint16_partition_fuzzy(CMax, int)
+
+// Same for merge_knn_results
+
+// same define as explicit instantiation in Heap.cpp
+%define INSTANTIATE_merge_knn_results(C, distance_t)
+
+%inline %{
+void merge_knn_results_ ## C(
+    size_t n, size_t k, int nshard,
+    const distance_t *all_distances, const faiss::idx_t *all_labels,
+    distance_t *distances, faiss::idx_t *labels)
+{
+    faiss::merge_knn_results<faiss::idx_t, faiss::C<distance_t, int64_t>>(
+            n, k, nshard, all_distances, all_labels, distances, labels);
+}
+%}
+
+%enddef
+
+INSTANTIATE_merge_knn_results(CMin, float);
+INSTANTIATE_merge_knn_results(CMax, float);
+INSTANTIATE_merge_knn_results(CMin, int32_t);
+INSTANTIATE_merge_knn_results(CMax, int32_t);
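The instantiations above back the Python-side faiss.merge_knn_results wrapper
defined in faiss/python/extra_wrappers.py. The expected call pattern, for
reference (shapes follow the wrapper's convention of one sorted kNN result
table per shard; the data here is made up):

    import numpy as np
    import faiss

    nshard, n, k = 3, 5, 4
    rs = np.random.RandomState(0)
    # per-shard distances must be sorted along the last axis
    Dall = np.sort(rs.rand(nshard, n, k).astype('float32'), axis=-1)
    Iall = rs.randint(1000, size=(nshard, n, k)).astype('int64')
    Dnew, Inew = faiss.merge_knn_results(Dall, Iall)  # merged (n, k) results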
+
+
+%inline %{
+
+/*******************************************************************
+ * numpy misses a hash table implementation, hence this class. It
+ * represents not found values as -1 like in the Index implementation
+ *******************************************************************/
+
+
+struct MapLong2Long {
+    std::unordered_map<int64_t, int64_t> map;
+
+    void add(size_t n, const int64_t *keys, const int64_t *vals) {
+        map.reserve(map.size() + n);
+        for (size_t i = 0; i < n; i++) {
+            map[keys[i]] = vals[i];
+        }
+    }
+
+    int64_t search(int64_t key) const {
+        auto ptr = map.find(key);
+        if (ptr == map.end()) {
+            return -1;
+        } else {
+            return ptr->second;
+        }
+    }
+
+    void search_multiple(size_t n, int64_t *keys, int64_t * vals) {
+        for (size_t i = 0; i < n; i++) {
+            vals[i] = search(keys[i]);
+        }
+    }
+};
+
+%}
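The struct above is exposed to Python as faiss.MapLong2Long, with array-valued
wrappers installed by handle_MapLong2Long in class_wrappers.py (whose add-time
shape check is fixed in this same diff). Typical use, for reference:

    import numpy as np
    import faiss

    m = faiss.MapLong2Long()
    m.add(np.array([10, 20, 30], dtype='int64'),
          np.array([1, 2, 3], dtype='int64'))
    print(m.search(20))                                          # 2
    print(m.search_multiple(np.array([20, 40], dtype='int64')))  # [2, -1]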
+
+
+/*******************************************************************
+ * Expose a few basic functions
+ *******************************************************************/
+
+
+void omp_set_num_threads (int num_threads);
+int omp_get_max_threads ();
+void *memcpy(void *dest, const void *src, size_t n);
+
+
+
+
+/*******************************************************************
+ * Python-specific: do not release GIL any more, as functions below
+ * use the Python/C API
+ *******************************************************************/
 
-// Python-specific: do not release GIL any more, as functions below
-// use the Python/C API
 #ifdef SWIGPYTHON
 %exception;
 #endif
 
-
-
 /*******************************************************************
  * Python specific: numpy array <-> C++ pointer interface
  *******************************************************************/
@@ -859,6 +983,9 @@ PyObject *swig_ptr (PyObject *a)
     if(PyArray_TYPE(ao) == NPY_INT32) {
         return SWIG_NewPointerObj(data, SWIGTYPE_p_int, 0);
     }
+    if(PyArray_TYPE(ao) == NPY_BOOL) {
+        return SWIG_NewPointerObj(data, SWIGTYPE_p_bool, 0);
+    }
     if(PyArray_TYPE(ao) == NPY_UINT64) {
 #ifdef SWIGWORDSIZE64
         return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0);
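The NPY_BOOL case added above is what lets range_search_gpu hand the boolean
validity mask straight to CombinerRangeKNN. A sketch of the effect (array
contents are illustrative; before this patch there was no bool case, so such
arrays were not accepted by swig_ptr):

    import numpy as np
    import faiss

    mask = np.zeros(10, dtype=bool)  # one validity flag per query
    ptr = faiss.swig_ptr(mask)       # now maps to a C++ bool* (SWIGTYPE_p_bool)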
@@ -935,90 +1062,6 @@ REV_SWIG_PTR(uint64_t, NPY_UINT64);
 
 
 
-/*******************************************************************
- * How should the template objects appear in the scripting language?
- *******************************************************************/
-
-// answer: the same as the C++ typedefs, but we still have to redefine them
-
-%template() faiss::CMin<float, int64_t>;
-%template() faiss::CMin<int, int64_t>;
-%template() faiss::CMax<float, int64_t>;
-%template() faiss::CMax<int, int64_t>;
-
-%template(float_minheap_array_t) faiss::HeapArray<faiss::CMin<float, int64_t> >;
-%template(int_minheap_array_t) faiss::HeapArray<faiss::CMin<int, int64_t> >;
-%template(float_maxheap_array_t) faiss::HeapArray<faiss::CMax<float, int64_t> >;
-%template(int_maxheap_array_t) faiss::HeapArray<faiss::CMax<int, int64_t> >;
-
-%template(CMin_float_partition_fuzzy)
-    faiss::partition_fuzzy<faiss::CMin<float, int64_t> >;
-%template(CMax_float_partition_fuzzy)
-    faiss::partition_fuzzy<faiss::CMax<float, int64_t> >;
-
-%template(AlignedTableUint8) faiss::AlignedTable<uint8_t>;
-%template(AlignedTableUint16) faiss::AlignedTable<uint16_t>;
-%template(AlignedTableFloat32) faiss::AlignedTable<float>;
-
-
-// SWIG seems to have some trouble resolving function template types here, so
-// declare explicitly
-
-%define INSTANTIATE_uint16_partition_fuzzy(C, id_t)
-
-%inline %{
-
-uint16_t C ## _uint16_partition_fuzzy(
-    uint16_t *vals, id_t *ids, size_t n,
-    size_t q_min, size_t q_max, size_t * q_out)
-{
-    return faiss::partition_fuzzy<faiss::C<uint16_t, id_t> >(
-        vals, ids, n, q_min, q_max, q_out);
-}
-
-%}
-
-%enddef
-
-INSTANTIATE_uint16_partition_fuzzy(CMin, int64_t)
-INSTANTIATE_uint16_partition_fuzzy(CMax, int64_t)
-INSTANTIATE_uint16_partition_fuzzy(CMin, int)
-INSTANTIATE_uint16_partition_fuzzy(CMax, int)
-
-// Same for merge_knn_results
-
-// same define as explicit instanciation in Heap.cpp
-%define INSTANTIATE_merge_knn_results(C, distance_t)
-
-%inline %{
-void merge_knn_results_ ## C(
-    size_t n, size_t k, int nshard,
-    const distance_t *all_distances, const faiss::idx_t *all_labels,
-    distance_t *distances, faiss::idx_t *labels)
-{
-    faiss::merge_knn_results<faiss::idx_t, faiss::C<distance_t, int64_t>>(
-            n, k, nshard, all_distances, all_labels, distances, labels);
-}
-%}
-
-%enddef
-
-INSTANTIATE_merge_knn_results(CMin, float);
-INSTANTIATE_merge_knn_results(CMax, float);
-INSTANTIATE_merge_knn_results(CMin, int32_t);
-INSTANTIATE_merge_knn_results(CMax, int32_t);
-
-
-/*******************************************************************
- * Expose a few basic functions
- *******************************************************************/
-
-
-void omp_set_num_threads (int num_threads);
-int omp_get_max_threads ();
-void *memcpy(void *dest, const void *src, size_t n);
-
-
 /*******************************************************************
  * For Faiss/Pytorch interop via pointers encoded as longs
  *******************************************************************/
@@ -1047,42 +1090,6 @@ void * cast_integer_to_void_ptr (int64_t x) {
 %}
 
 
-/*******************************************************************
- * Range search interface
- *******************************************************************/
-
-
-%inline %{
-
-// numpy misses a hash table implementation, hence this class. It
-// represents not found values as -1 like in the Index implementation
-
-struct MapLong2Long {
-    std::unordered_map<int64_t, int64_t> map;
-
-    void add(size_t n, const int64_t *keys, const int64_t *vals) {
-        map.reserve(map.size() + n);
-        for (size_t i = 0; i < n; i++) {
-            map[keys[i]] = vals[i];
-        }
-    }
-
-    int64_t search(int64_t key) {
-        if (map.count(key) == 0) {
-            return -1;
-        } else {
-            return map[key];
-        }
-    }
-
-    void search_multiple(size_t n, int64_t *keys, int64_t * vals) {
-        for (size_t i = 0; i < n; i++) {
-            vals[i] = search(keys[i]);
-        }
-    }
-};
-
-%}
diff --git a/faiss/utils/partitioning.h b/faiss/utils/partitioning.h
index 90157d2312..cd63592401 100644
--- a/faiss/utils/partitioning.h
+++ b/faiss/utils/partitioning.h
@@ -71,4 +71,5 @@ struct PartitionStats {
 // global var that collects them all
 FAISS_API extern PartitionStats partition_stats;
 
+
 } // namespace faiss
diff --git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp
index 18c3f963a5..67dd51bf73 100644
--- a/faiss/utils/sorting.cpp
+++ b/faiss/utils/sorting.cpp
@@ -689,4 +689,144 @@ void matrix_bucket_sort_inplace(
     }
 }
 
+/** Hashtable implementation for int64 -> int64 with external storage
+ * implemented for speed and parallel processing.
+ */
+
+namespace {
+
+int log2_capacity_to_log2_nbucket(int log2_capacity) {
+    return log2_capacity < 12 ? 0
+            : log2_capacity < 20 ? log2_capacity - 12
+                                 : 10;
+}
+
+// https://bigprimes.org/
+int64_t bigprime = 8955327411143;
+
+inline int64_t hash_function(int64_t x) {
+    return (x * 1000003) % bigprime;
+}
+
+} // anonymous namespace
+
+void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) {
+    size_t capacity = (size_t)1 << log2_capacity;
+#pragma omp parallel for
+    for (int64_t i = 0; i < capacity; i++) {
+        tab[2 * i] = -1;
+        tab[2 * i + 1] = -1;
+    }
+}
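To make the bucketing concrete: with log2_capacity = 16,
log2_capacity_to_log2_nbucket gives 16 - 12 = 4, so the 65536 slots are split
into 16 buckets of 4096 slots, and a key only ever probes inside the bucket
selected by the top 4 bits of its hash. This is what makes the add phase below
parallelizable: after the bucket_sort, each bucket is owned by a single thread
that writes only slots in [k0, k1). A quick check of the arithmetic in Python:

    log2_capacity = 16
    log2_nbucket = log2_capacity - 12 if log2_capacity < 20 else 10  # = 4
    nbucket = 1 << log2_nbucket                              # 16 buckets
    slots_per_bucket = 1 << (log2_capacity - log2_nbucket)   # 4096 slots each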
+
+void hashtable_int64_to_int64_add(
+        int log2_capacity,
+        int64_t* tab,
+        size_t n,
+        const int64_t* keys,
+        const int64_t* vals) {
+    size_t capacity = (size_t)1 << log2_capacity;
+    std::vector<int64_t> hk(n);
+    std::vector<uint64_t> bucket_no(n);
+    int64_t mask = capacity - 1;
+    int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
+    size_t nbucket = (size_t)1 << log2_nbucket;
+
+#pragma omp parallel for
+    for (int64_t i = 0; i < n; i++) {
+        hk[i] = hash_function(keys[i]) & mask;
+        bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket);
+    }
+
+    std::vector<int64_t> lims(nbucket + 1);
+    std::vector<int64_t> perm(n);
+    bucket_sort(
+            n,
+            bucket_no.data(),
+            nbucket,
+            lims.data(),
+            perm.data(),
+            omp_get_max_threads());
+
+    int num_errors = 0;
+#pragma omp parallel for reduction(+ : num_errors)
+    for (int64_t bucket = 0; bucket < nbucket; bucket++) {
+        size_t k0 = bucket << (log2_capacity - log2_nbucket);
+        size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
+
+        for (size_t i = lims[bucket]; i < lims[bucket + 1]; i++) {
+            int64_t j = perm[i];
+            assert(bucket_no[j] == bucket);
+            assert(hk[j] >= k0 && hk[j] < k1);
+            size_t slot = hk[j];
+            for (;;) {
+                if (tab[slot * 2] == -1) { // found!
+                    tab[slot * 2] = keys[j];
+                    tab[slot * 2 + 1] = vals[j];
+                    break;
+                } else if (tab[slot * 2] == keys[j]) { // overwrite!
+                    tab[slot * 2 + 1] = vals[j];
+                    break;
+                }
+                slot++;
+                if (slot == k1) {
+                    slot = k0;
+                }
+                if (slot == hk[j]) { // no free slot left in bucket
+                    num_errors++;
+                    break;
+                }
+            }
+            if (num_errors > 0) {
+                break;
+            }
+        }
+    }
+    FAISS_THROW_IF_NOT_MSG(num_errors == 0, "hashtable capacity exhausted");
+}
+
+void hashtable_int64_to_int64_lookup(
+        int log2_capacity,
+        const int64_t* tab,
+        size_t n,
+        const int64_t* keys,
+        int64_t* vals) {
+    size_t capacity = (size_t)1 << log2_capacity;
+    std::vector<int64_t> hk(n), bucket_no(n);
+    int64_t mask = capacity - 1;
+    int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity);
+    size_t nbucket = (size_t)1 << log2_nbucket;
+
+#pragma omp parallel for
+    for (int64_t i = 0; i < n; i++) {
+        int64_t k = keys[i];
+        int64_t hk = hash_function(k) & mask;
+        size_t slot = hk;
+
+        if (tab[2 * slot] == -1) { // not in table
+            vals[i] = -1;
+        } else if (tab[2 * slot] == k) { // found!
+            vals[i] = tab[2 * slot + 1];
+        } else { // need to search in [k0, k1)
+            size_t bucket = hk >> (log2_capacity - log2_nbucket);
+            size_t k0 = bucket << (log2_capacity - log2_nbucket);
+            size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
+            for (;;) {
+                if (tab[slot * 2] == k) { // found!
+                    vals[i] = tab[2 * slot + 1];
+                    break;
+                }
+                slot++;
+                if (slot == k1) {
+                    slot = k0;
+                }
+                if (slot == hk) { // bucket is full and not found
+                    vals[i] = -1;
+                    break;
+                }
+            }
+        }
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/utils/sorting.h b/faiss/utils/sorting.h
index a7dd783b1e..593d952ae0 100644
--- a/faiss/utils/sorting.h
+++ b/faiss/utils/sorting.h
@@ -68,4 +68,31 @@ void matrix_bucket_sort_inplace(
         int64_t* lims,
         int nt = 0);
 
+/** Hashtable implementation for int64 -> int64 with external storage
+ * implemented for fast batch add and lookup.
+ *
+ * tab is of size 2 * (1 << log2_capacity)
+ * n is the number of elements to add or search
+ *
+ * Adding several values with the same key in one batch: an arbitrary
+ * one gets added. Adding them in different batches: the newer batch
+ * overwrites. Raises an exception if the capacity is exhausted.
+ */
+
+void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab);
+
+void hashtable_int64_to_int64_add(
+        int log2_capacity,
+        int64_t* tab,
+        size_t n,
+        const int64_t* keys,
+        const int64_t* vals);
+
+void hashtable_int64_to_int64_lookup(
+        int log2_capacity,
+        const int64_t* tab,
+        size_t n,
+        const int64_t* keys,
+        int64_t* vals);
+
 } // namespace faiss
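These declarations are the C++ face of the new MapInt64ToInt64 class in
faiss/python/extra_wrappers.py: the table is a user-owned numpy array of shape
(capacity, 2) holding key/value pairs, with -1 marking empty slots and
not-found results. A short usage sketch (the capacity must be a power of 2
and, since the table is open-addressed, should comfortably exceed the number
of keys; the key values are illustrative):

    import numpy as np
    import faiss

    rs = np.random.RandomState(123)
    keys = rs.choice(2 ** 20, size=1000, replace=False).astype('int64')
    vals = rs.randint(2 ** 20, size=1000).astype('int64')

    tab = faiss.MapInt64ToInt64(4096)  # 2^12 slots for 1000 keys
    tab.add(keys, vals)
    assert np.all(tab.lookup(keys) == vals)
    # missing keys come back as -1
    assert tab.lookup(np.array([2 ** 40], dtype='int64'))[0] == -1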
diff --git a/faiss/utils/utils.cpp b/faiss/utils/utils.cpp
index 894653b1cd..54a3700672 100644
--- a/faiss/utils/utils.cpp
+++ b/faiss/utils/utils.cpp
@@ -528,4 +528,66 @@ bool check_openmp() {
     return true;
 }
 
+namespace {
+
+int64_t count_lt(int64_t n, const float* row, float threshold) {
+    for (int64_t i = 0; i < n; i++) {
+        if (!(row[i] < threshold)) {
+            return i;
+        }
+    }
+    return n;
+}
+
+int64_t count_gt(int64_t n, const float* row, float threshold) {
+    for (int64_t i = 0; i < n; i++) {
+        if (!(row[i] > threshold)) {
+            return i;
+        }
+    }
+    return n;
+}
+
+} // namespace
+
+void CombinerRangeKNN::compute_sizes(int64_t* L_res) {
+    this->L_res = L_res;
+    L_res[0] = 0;
+    int64_t j = 0;
+    for (int64_t i = 0; i < nq; i++) {
+        int64_t n_in;
+        if (!mask || !mask[i]) {
+            const float* row = D + i * k;
+            n_in = keep_max ? count_gt(k, row, r2) : count_lt(k, row, r2);
+        } else {
+            n_in = lim_remain[j + 1] - lim_remain[j];
+            j++;
+        }
+        L_res[i + 1] = n_in; // sizes first, cumulative sum below
+    }
+    // cumsum
+    for (int64_t i = 0; i < nq; i++) {
+        L_res[i + 1] += L_res[i];
+    }
+}
+
+void CombinerRangeKNN::write_result(float* D_res, int64_t* I_res) {
+    FAISS_THROW_IF_NOT(L_res);
+    int64_t j = 0;
+    for (int64_t i = 0; i < nq; i++) {
+        int64_t n_in = L_res[i + 1] - L_res[i];
+        float* D_row = D_res + L_res[i];
+        int64_t* I_row = I_res + L_res[i];
+        if (!mask || !mask[i]) {
+            memcpy(D_row, D + i * k, n_in * sizeof(*D_row));
+            memcpy(I_row, I + i * k, n_in * sizeof(*I_row));
+        } else {
+            memcpy(D_row, D_remain + lim_remain[j], n_in * sizeof(*D_row));
+            memcpy(I_row, I_remain + lim_remain[j], n_in * sizeof(*I_row));
+            j++;
+        }
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/utils/utils.h b/faiss/utils/utils.h
index b1080a929b..af7c677e39 100644
--- a/faiss/utils/utils.h
+++ b/faiss/utils/utils.h
@@ -163,6 +163,39 @@ uint64_t hash_bytes(const uint8_t* bytes, int64_t n);
 /** Whether OpenMP annotations were respected. */
 bool check_openmp();
 
+/** This class is used to combine range and knn search results
+ * in contrib.exhaustive_search.range_search_gpu */
+
+struct CombinerRangeKNN {
+    int64_t nq;    /// nb of queries
+    size_t k;      /// number of neighbors for the knn search part
+    float r2;      /// range search radius
+    bool keep_max; /// whether to keep max values instead of min.
+
+    CombinerRangeKNN(int64_t nq, size_t k, float r2, bool keep_max)
+            : nq(nq), k(k), r2(r2), keep_max(keep_max) {}
+
+    /// Knn search results
+    const int64_t* I = nullptr; /// size nq * k
+    const float* D = nullptr;   /// size nq * k
+
+    /// optional: range search results (ignored if mask is NULL)
+    const bool* mask = nullptr; /// mask for where knn results are valid, size nq
+    // range search results for remaining entries nrange = sum(mask)
+    const int64_t* lim_remain = nullptr; /// size nrange + 1
+    const float* D_remain = nullptr;     /// size lim_remain[nrange]
+    const int64_t* I_remain = nullptr;   /// size lim_remain[nrange]
+
+    const int64_t* L_res = nullptr; /// size nq + 1
+    // Phase 1: compute sizes into limits array (of size nq + 1)
+    void compute_sizes(int64_t* L_res);
+
+    /// Phase 2: caller allocates D_res and I_res (size L_res[nq])
+    /// Phase 3: fill in D_res and I_res
+    void write_result(float* D_res, int64_t* I_res);
+};
+
 } // namespace faiss
 
 #endif /* FAISS_utils_h */
diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py
index 342d56443a..189c785998 100644
--- a/tests/test_build_blocks.py
+++ b/tests/test_build_blocks.py
@@ -693,3 +693,35 @@ def test_max_int(self):
 
     def test_max_float(self):
         self.do_test(ismax=True, dtype='float32')
+
+
+class TestMapInt64ToInt64(unittest.TestCase):
+
+    def do_test(self, capacity, n):
+        """ test that we are able to lookup """
+        rs = np.random.RandomState(123)
+        # make sure we have unique values
+        keys = np.unique(rs.choice(2 ** 29, size=n))
+        rs.shuffle(keys)
+        n = keys.size
+        vals = rs.choice(2 ** 30, size=n).astype('int64')
+        tab = faiss.MapInt64ToInt64(capacity)
+        tab.add(keys, vals)
+
+        # lookup and check
+        vals2 = tab.lookup(keys)
+        np.testing.assert_array_equal(vals, vals2)
+
+        # make a few keys that we know are not there
+        mask = rs.rand(n) < 0.3
+        keys[mask] = rs.choice(2 ** 29, size=n)[mask] + 2 ** 29
+        vals2 = tab.lookup(keys)
+        np.testing.assert_array_equal(-1, vals2[mask])
+        np.testing.assert_array_equal(vals[~mask], vals2[~mask])
+
+    def test_small(self):
+        self.do_test(16384, 10000)
+
+    def 
xx_test_large(self): + # don't run by default because it's slow + self.do_test(2 ** 21, 10 ** 6) diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 10ac3224c7..ad85a3ddd0 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -130,7 +130,7 @@ def do_test_range(self, metric): xq, ds.database_iterator(bs=100), threshold, ngpu=0, metric_type=metric) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -161,7 +161,7 @@ def matrix_iterator(xb, bs): _, new_lims, new_D, new_I = range_search_max_results( index, matrix_iterator(xq, 100), threshold, max_results=1e10) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -175,7 +175,7 @@ def matrix_iterator(xb, bs): ref_lims, ref_D, ref_I = index.range_search(xq, new_threshold) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -435,7 +435,7 @@ def do_test(self, metric_type): min_results=Dref.size, clip_to_min=True ) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( lims_ref, Dref, Iref, lims_new, Dnew, Inew ) diff --git a/tests/test_index.py b/tests/test_index.py index bc9392d7ea..0e828e08c1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1112,9 +1112,6 @@ def test_IVFSQ(self): def test_IVFPQ(self): self.do_test("IVF5,PQ4x4np") -if __name__ == '__main__': - unittest.main() - class TestValidIndexParams(unittest.TestCase): diff --git a/tests/test_partition.py b/tests/test_partition.py index 08087fb7c3..02de7e8c2c 100644 --- a/tests/test_partition.py +++ b/tests/test_partition.py @@ -40,6 +40,8 @@ def test_partition_fuzzy_2(self): self.do_partition(160, (70, 80)) +def pointer_to_minus1(): + return np.array([-1], dtype='int64').view("uint64") class TestPartitioningFloat(unittest.TestCase, PartitionTests): @@ -67,7 +69,7 @@ def do_partition(self, n, q, maxval=None, seed=None): ) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMax_float_partition_fuzzy( sp(vals), sp(ids), n, q_min, q_max, sp(q) @@ -117,7 +119,7 @@ def do_partition(self, n, q, maxval=None, seed=None): ) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMin_float_partition_fuzzy( sp(vals), sp(ids), n, q_min, q_max, sp(q) @@ -164,7 +166,7 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a.get(), sp(ids), n, q, q, None) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMax_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q_min, q_max, sp(q) @@ -213,7 +215,7 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a.get(), sp(ids), n, q, q, None) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() thresh2 = faiss.CMin_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q_min, q_max, sp(q) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 2c76f151dc..7b83bbfcc5 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -9,7 +9,7 @@ import unittest from faiss.contrib import datasets -from faiss.contrib.evaluation import sort_range_res_2 +from faiss.contrib.evaluation import sort_range_res_2, check_ref_range_results faiss.omp_set_num_threads(4) @@ -416,3 +416,28 @@ def test_12_92(self): len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) print(j01) assert j01[0] >= j01[1] + +class 
TestPrecomputed(unittest.TestCase): + + def test_knn_and_range(self): + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), 10) + + Dq, Iq = index.quantizer.search(ds.get_queries(), index.nprobe) + Dnew, Inew = index.search_preassigned(ds.get_queries(), 10, Iq, Dq) + np.testing.assert_equal(Iref, Inew) + np.testing.assert_equal(Dref, Dnew) + + r2 = float(np.median(Dref[:, 5])) + Lref, Dref, Iref = index.range_search(ds.get_queries(), r2) + assert Lref.size > 10 # make sure there is something to test... + + Lnew, Dnew, Inew = index.range_search_preassigned(ds.get_queries(), r2, Iq, Dq) + check_ref_range_results( + Lref, Dref, Iref, + Lnew, Dnew, Inew + )
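The binary-index counterpart added in class_wrappers.py follows the same
pattern. A sketch with an assumed IndexBinaryIVF (the database, nlist and
radius are illustrative; note that for binary indexes the quantizer distances
are int32 and the radius is a Hamming threshold):

    import numpy as np
    import faiss

    d = 64
    rs = np.random.RandomState(0)
    xb = rs.randint(256, size=(2000, d // 8)).astype('uint8')
    xq = xb[:10]  # queries taken from the database, so matches are guaranteed

    quantizer = faiss.IndexBinaryFlat(d)
    index = faiss.IndexBinaryIVF(quantizer, d, 16)
    index.train(xb)
    index.add(xb)
    index.nprobe = 4

    Dq, Iq = index.quantizer.search(xq, index.nprobe)
    # radius 10 in Hamming distance, reusing the precomputed assignment
    lims, D, I = index.range_search_preassigned(xq, 10, Iq, Dq)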