From 1123789165e72fa520acf4c068eb04ae9d1d41a3 Mon Sep 17 00:00:00 2001
From: Sphere L <sph6r6.l1u@gmail.com>
Date: Sun, 12 Nov 2023 23:20:28 -0600
Subject: [PATCH 01/19] Multi-input mv-lookup. (#49)

* Add mv_lookup.rs

* mv_lookup::prover, mv_lookup::verifier

* Replace lookup with mv_lookup

* replace halo2 with mv lookup

Co-authored-by: ying tong <therealyingtong@users.noreply.github.com>

* cleanups

Co-authored-by: ying tong <therealyingtong@users.noreply.github.com>

* ConstraintSystem: setup lookup_tracker

Co-authored-by: Andrija <akinovak@gmail.com>

* mv_lookup::hybrid_prover

Co-authored-by: Andrija <akinovak@gmail.com>

* WIP

* mv_multi_lookup: enable lookup caching

Co-authored-by: therealyingtong <yingtong.lai@gmail.com>

* Rename hybrid_lookup -> lookup

* Chunk lookups using user-provided minimum degree

Co-authored-by: Andrija <akinovak@gmail.com>

* mv_lookup bench

Co-authored-by: Andrija <akinovak@gmail.com>

* Introduce counter feature for FFTs and MSMs

Co-authored-by: Andrija <akinovak@gmail.com>

* Fix off-by-one errors in chunk_lookup

Co-authored-by: Andrija <akinovak@gmail.com>

* bench wip

* time evaluate_h

* KZG

* more efficient batch inversion

* extended lookup example

* Finalize mv lookup

Author: therealyingtong <yingtong.lai@gmail.com>

* Remove main/

* Fix according to the comments

* replace scan with parallel grand sum computation

* Revert Cargo.lock

* mv lookup Argument name

* parallel batch invert

---------

Co-authored-by: Andrija <akinovak@gmail.com>
Co-authored-by: ying tong <therealyingtong@users.noreply.github.com>
Co-authored-by: therealyingtong <yingtong.lai@gmail.com>
---
 Cargo.lock                                    |   3 +-
 halo2_proofs/Cargo.toml                       |   7 +
 halo2_proofs/benches/lookups.rs               | 239 ++++++++++
 halo2_proofs/src/arithmetic.rs                |  22 +
 halo2_proofs/src/dev.rs                       | 172 +++----
 halo2_proofs/src/dev/failure.rs               | 149 +++---
 halo2_proofs/src/lib.rs                       |  19 +
 halo2_proofs/src/plonk.rs                     |   2 +
 halo2_proofs/src/plonk/circuit.rs             | 172 +++++--
 .../src/plonk/circuit/compress_selectors.rs   |   2 +-
 halo2_proofs/src/plonk/evaluation.rs          | 243 +++++++---
 halo2_proofs/src/plonk/keygen.rs              |   2 +
 halo2_proofs/src/plonk/mv_lookup.rs           |  99 ++++
 .../src/plonk/mv_lookup/exec_info.json        |  46 ++
 halo2_proofs/src/plonk/mv_lookup/prover.rs    | 440 ++++++++++++++++++
 halo2_proofs/src/plonk/mv_lookup/verifier.rs  | 195 ++++++++
 halo2_proofs/src/plonk/prover.rs              |  42 +-
 halo2_proofs/src/plonk/verifier.rs            |  50 +-
 halo2_proofs/src/poly.rs                      |   2 +-
 halo2_proofs/tests/plonk_api.rs               |  25 +-
 20 files changed, 1645 insertions(+), 286 deletions(-)
 create mode 100644 halo2_proofs/benches/lookups.rs
 create mode 100644 halo2_proofs/src/plonk/mv_lookup.rs
 create mode 100644 halo2_proofs/src/plonk/mv_lookup/exec_info.json
 create mode 100644 halo2_proofs/src/plonk/mv_lookup/prover.rs
 create mode 100644 halo2_proofs/src/plonk/mv_lookup/verifier.rs

diff --git a/Cargo.lock b/Cargo.lock
index ec56f19b08..fe9a1ec7f3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -887,6 +887,7 @@ dependencies = [
  "group",
  "gumdrop",
  "halo2curves 0.3.1 (git+https://github.com/scroll-tech/halo2curves.git?branch=0.3.1-derive-serde)",
+ "lazy_static",
  "log",
  "num-bigint",
  "num-integer",
@@ -2234,4 +2235,4 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30b31594f29d27036c383b53b59ed3476874d518f0efb151b27a4c275141390e"
 dependencies = [
  "tap",
-]
+]
\ No newline at end of file
diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
index 148088899b..1aab9fdd02 100644
--- a/halo2_proofs/Cargo.toml
+++ b/halo2_proofs/Cargo.toml
@@ -39,6 +39,10 @@ harness = false
 name = "dev_lookup"
 harness = false
 
+[[bench]]
+name = "lookups"
+harness = false
+
 [[bench]]
 name = "fft"
 harness = false
@@ -63,12 +67,14 @@ crossbeam = "0.8.0"
 # Developer tooling dependencies
 plotters = { version = "0.3.0", optional = true }
 tabbycat = { version = "0.1", features = ["attributes"], optional = true }
+lazy_static = { version = "1", optional = true }
 log = "0.4.17"
 
 # timer
 ark-std = { version = "0.3.0" }
 env_logger = "0.8.0"
 
+
 [dev-dependencies]
 assert_matches = "1.5"
 criterion = "0.3"
@@ -90,6 +96,7 @@ gwc = []
 parallel_syn = []
 phase-check = []
 profile = ["ark-std/print-trace"]
+counter = ["lazy_static"]
 mock-batch-inv = []
 
 [lib]
diff --git a/halo2_proofs/benches/lookups.rs b/halo2_proofs/benches/lookups.rs
new file mode 100644
index 0000000000..e9fc4eb4ef
--- /dev/null
+++ b/halo2_proofs/benches/lookups.rs
@@ -0,0 +1,239 @@
+#[macro_use]
+extern crate criterion;
+
+use halo2_proofs::arithmetic::FieldExt;
+use halo2_proofs::circuit::{Layouter, SimpleFloorPlanner, Value};
+use halo2_proofs::plonk::*;
+use halo2_proofs::poly::kzg::multiopen::VerifierGWC;
+use halo2_proofs::poly::{commitment::ParamsProver, Rotation};
+use halo2_proofs::transcript::{Blake2bRead, Blake2bWrite, Challenge255};
+use halo2curves::bn256::{Bn256, G1Affine};
+use halo2curves::pairing::Engine;
+use rand_core::OsRng;
+
+use halo2_proofs::{
+    poly::{
+        kzg::{
+            commitment::{KZGCommitmentScheme, ParamsKZG},
+            multiopen::ProverGWC,
+            strategy::SingleStrategy,
+        },
+    },
+    transcript::{TranscriptReadBuffer, TranscriptWriterBuffer},
+};
+
+use std::marker::PhantomData;
+
+use criterion::{BenchmarkId, Criterion};
+
+fn criterion_benchmark(c: &mut Criterion) {
+    #[derive(Clone, Default)]
+    struct MyCircuit<F: FieldExt> {
+        _marker: PhantomData<F>,
+    }
+
+    #[derive(Clone)]
+    struct MyConfig {
+        selector: Selector,
+        table: TableColumn,
+        advice: Column<Advice>,
+        other_advice: Column<Advice>,
+    }
+
+    impl<F: FieldExt> Circuit<F> for MyCircuit<F> {
+        type Config = MyConfig;
+        type FloorPlanner = SimpleFloorPlanner;
+
+        fn without_witnesses(&self) -> Self {
+            Self::default()
+        }
+
+        fn configure(meta: &mut ConstraintSystem<F>) -> MyConfig {
+            let config = MyConfig {
+                selector: meta.complex_selector(),
+                table: meta.lookup_table_column(),
+                advice: meta.advice_column(),
+                other_advice: meta.advice_column(),
+            };
+
+            let dummy_selector = meta.complex_selector();
+
+            meta.create_gate("degree 6 gate", |meta| {
+                let dummy_selector = meta.query_selector(dummy_selector);
+                let constraints = vec![dummy_selector.clone(); 4].iter().fold(dummy_selector.clone(), |acc, val| acc * val.clone());
+                Constraints::with_selector(dummy_selector, Some(constraints))
+            });
+
+            meta.lookup("lookup", |meta| {
+                let advice = meta.query_advice(config.advice, Rotation::cur());
+                vec![(advice, config.table)]
+            });
+
+            meta.lookup("lookup", |meta| {
+                let advice = meta.query_advice(config.advice, Rotation::cur());
+                vec![(advice, config.table)]
+            });
+
+            meta.lookup("lookup", |meta| {
+                let advice = meta.query_advice(config.advice, Rotation::cur());
+                vec![(advice, config.table)]
+            });
+
+            meta.lookup("lookup", |meta| {
+                let advice = meta.query_advice(config.advice, Rotation::cur());
+                vec![(advice, config.table)]
+            });
+
+            meta.lookup("lookup", |meta| {
+                let advice = meta.query_advice(config.advice, Rotation::cur());
+                vec![(advice, config.table)]
+            });
+
+            /*
+                - We need a gate of degree at least 6: 6 - 1 = 5, which forces an extended domain of size 8n.
+                - The goal is a max degree of 9: 9 - 1 = 8, which still fits into that extended domain.
+
+                - base degree = table_deg + 2
+                - with input_expression_degree = 1:
+                  => degree = base + 1 = 3 + 1 = 4
+                - we can therefore batch 5 more lookups together with this one
+            */
+
+            config
+        }
+
+        fn synthesize(
+            &self,
+            config: MyConfig,
+            mut layouter: impl Layouter<F>,
+        ) -> Result<(), Error> {
+            layouter.assign_table(
+                || "8-bit table",
+                |mut table| {
+                    for row in 0u64..(1 << 8) {
+                        table.assign_cell(
+                            || format!("row {}", row),
+                            config.table,
+                            row as usize,
+                            || Value::known(F::from(row)),
+                        )?;
+                    }
+
+                    Ok(())
+                },
+            )?;
+
+            layouter.assign_region(
+                || "assign values",
+                |mut region| {
+                    for offset in 0u64..(1 << 10) {
+                        config.selector.enable(&mut region, offset as usize)?;
+                        region.assign_advice(
+                            || format!("offset {}", offset),
+                            config.advice,
+                            offset as usize,
+                            || Value::known(F::from((offset % 256))),
+                        )?;
+                    }
+                    for offset in 1u64..(1 << 10) {
+                        config.selector.enable(&mut region, offset as usize)?;
+                        region.assign_advice(
+                            || format!("offset {}", offset),
+                            config.other_advice,
+                            offset as usize - 1,
+                            || Value::known(F::from((offset % 256))),
+                        )?;
+                    }
+                    Ok(())
+                },
+            )
+        }
+    }
+
+    fn keygen(k: u32) -> (ParamsKZG<Bn256>, ProvingKey<G1Affine>) {
+        let params: ParamsKZG<Bn256> = ParamsKZG::new(k);
+        let empty_circuit: MyCircuit<<Bn256 as Engine>::Scalar> = MyCircuit {
+            _marker: PhantomData,
+        };
+        let vk = keygen_vk(&params, &empty_circuit).expect("keygen_vk should not fail");
+        let pk = keygen_pk(&params, vk, &empty_circuit).expect("keygen_pk should not fail");
+        (params, pk)
+    }
+
+    fn prover(k: u32, params: &ParamsKZG<Bn256>, pk: &ProvingKey<G1Affine>) -> Vec<u8> {
+        let rng = OsRng;
+
+        let circuit: MyCircuit<<Bn256 as Engine>::Scalar> = MyCircuit {
+            _marker: PhantomData,
+        };
+
+        let mut transcript = Blake2bWrite::<_, _, Challenge255<G1Affine>>::init(vec![]);
+        create_proof::<KZGCommitmentScheme<Bn256>, ProverGWC<'_, Bn256>, _, _, _, _>(
+            params,
+            pk,
+            &[circuit],
+            &[&[]],
+            rng,
+            &mut transcript,
+        )
+        .expect("proof generation should not fail");
+        transcript.finalize()
+    }
+
+    fn verifier(params: &ParamsKZG<Bn256>, vk: &VerifyingKey<G1Affine>, proof: &[u8]) {
+        let strategy = SingleStrategy::new(params);
+        let mut transcript = Blake2bRead::<_, _, Challenge255<G1Affine>>::init(proof);
+        assert!(verify_proof::<
+            KZGCommitmentScheme<Bn256>,
+            VerifierGWC<'_, Bn256>,
+            Challenge255<G1Affine>,
+            Blake2bRead<&[u8], G1Affine, Challenge255<G1Affine>>,
+            SingleStrategy<'_, Bn256>,
+        >(params, vk, strategy, &[&[]], &mut transcript)
+        .is_ok());
+    }
+
+    let k_range = 16..=16;
+
+    let mut keygen_group = c.benchmark_group("plonk-keygen");
+    keygen_group.sample_size(10);
+    for k in k_range.clone() {
+        keygen_group.bench_with_input(BenchmarkId::from_parameter(k), &k, |b, &k| {
+            b.iter(|| keygen(k));
+        });
+    }
+    keygen_group.finish();
+
+    let mut prover_group = c.benchmark_group("plonk-prover");
+    prover_group.sample_size(10);
+    for k in k_range.clone() {
+        let (params, pk) = keygen(k);
+
+        prover_group.bench_with_input(
+            BenchmarkId::from_parameter(k),
+            &(k, &params, &pk),
+            |b, &(k, params, pk)| {
+                b.iter(|| prover(k, params, pk));
+            },
+        );
+    }
+    prover_group.finish();
+
+    let mut verifier_group = c.benchmark_group("plonk-verifier");
+    for k in k_range {
+        let (params, pk) = keygen(k);
+        let proof = prover(k, &params, &pk);
+
+        verifier_group.bench_with_input(
+            BenchmarkId::from_parameter(k),
+            &(&params, pk.get_vk(), &proof[..]),
+            |b, &(params, vk, proof)| {
+                b.iter(|| verifier(params, vk, proof));
+            },
+        );
+    }
+    verifier_group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index c06575b549..8cddc98ffb 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -132,6 +132,17 @@ pub fn small_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::C
 ///
 /// This will use multithreading if beneficial.
 pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Curve {
+    #[cfg(feature = "counter")]
+    {
+        use crate::MSM_COUNTER;
+        *MSM_COUNTER
+            .lock()
+            .unwrap()
+            .entry(coeffs.len())
+            .and_modify(|cnt| *cnt += 1)
+            .or_insert(1);
+    }
+
     assert_eq!(coeffs.len(), bases.len());
 
     let num_threads = multicore::current_num_threads();
@@ -171,6 +182,17 @@ pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Cu
 ///
 /// This will use multithreading if beneficial.
 pub fn best_fft<G: Group>(a: &mut [G], omega: G::Scalar, log_n: u32) {
+    #[cfg(feature = "counter")]
+    {
+        use crate::FFT_COUNTER;
+        *FFT_COUNTER
+            .lock()
+            .unwrap()
+            .entry(a.len())
+            .and_modify(|cnt| *cnt += 1)
+            .or_insert(1);
+    }
+
     let threads = multicore::current_num_threads();
     let log_split = log2_floor(threads) as usize;
     let n = a.len() as usize;
diff --git a/halo2_proofs/src/dev.rs b/halo2_proofs/src/dev.rs
index 94d7da90a1..b6fac5266e 100644
--- a/halo2_proofs/src/dev.rs
+++ b/halo2_proofs/src/dev.rs
@@ -892,6 +892,7 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
 
         let mut cs = ConstraintSystem::default();
         let config = ConcreteCircuit::configure(&mut cs);
+        let cs = cs.chunk_lookups();
         let cs = cs;
 
         if n < cs.minimum_rows() {
@@ -1289,7 +1290,9 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
                         )
                     };
 
-                    assert!(lookup.table_expressions.len() == lookup.input_expressions.len());
+                    for input_expressions in lookup.inputs_expressions.iter() {
+                        assert!(lookup.table_expressions.len() == input_expressions.len());
+                    }
                     assert!(self.usable_rows.end > 0);
 
                     // We optimize on the basis that the table might have been filled so that the last
@@ -1335,49 +1338,53 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
                     }
                     let table = &cached_table;
 
-                    let mut inputs: Vec<(Vec<_>, usize)> = lookup_input_row_ids
-                        .clone()
-                        .into_iter()
-                        .filter_map(|input_row| {
-                            let t = lookup
-                                .input_expressions
-                                .iter()
-                                .map(move |c| load(c, input_row))
-                                .collect();
-
-                            if t != fill_row {
-                                // Also keep track of the original input row, since we're going to sort.
-                                Some((t, input_row))
-                            } else {
-                                None
-                            }
-                        })
-                        .collect();
-                    inputs.sort_unstable();
-
-                    let mut i = 0;
-                    inputs
+                    lookup
+                        .inputs_expressions
                         .iter()
-                        .filter_map(move |(input, input_row)| {
-                            while i < table.len() && &table[i] < input {
-                                i += 1;
-                            }
-                            if i == table.len() || &table[i] > input {
-                                assert!(table.binary_search(input).is_err());
+                        .map(|input_expressions| {
+                            let mut inputs: Vec<(Vec<_>, usize)> = lookup_input_row_ids
+                                .clone()
+                                .filter_map(|input_row| {
+                                    let t = input_expressions
+                                        .iter()
+                                        .map(move |c| load(c, input_row))
+                                        .collect();
+
+                                    if t != fill_row {
+                                        // Also keep track of the original input row, since we're going to sort.
+                                        Some((t, input_row))
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            inputs.sort_unstable();
 
-                                Some(VerifyFailure::Lookup {
-                                    name: lookup.name,
-                                    lookup_index,
-                                    location: FailureLocation::find_expressions(
-                                        &self.cs,
-                                        &self.regions,
-                                        *input_row,
-                                        lookup.input_expressions.iter(),
-                                    ),
+                            let mut i = 0;
+                            inputs
+                                .iter()
+                                .filter_map(move |(input, input_row)| {
+                                    while i < table.len() && &table[i] < input {
+                                        i += 1;
+                                    }
+                                    if i == table.len() || &table[i] > input {
+                                        assert!(table.binary_search(input).is_err());
+
+                                        Some(VerifyFailure::Lookup {
+                                            name: lookup.name,
+                                            lookup_index,
+                                            location: FailureLocation::find_expressions(
+                                                &self.cs,
+                                                &self.regions,
+                                                *input_row,
+                                                input_expressions.iter(),
+                                            ),
+                                        })
+                                    } else {
+                                        None
+                                    }
                                 })
-                            } else {
-                                None
-                            }
+                                .collect::<Vec<_>>()
                         })
                         .collect::<Vec<_>>()
                 });
@@ -1432,7 +1439,7 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
         let mut errors: Vec<_> = iter::empty()
             .chain(selector_errors)
             .chain(gate_errors)
-            .chain(lookup_errors)
+            .chain(lookup_errors.flatten())
             .chain(perm_errors)
             .collect();
         if errors.is_empty() {
@@ -1668,7 +1675,9 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
                         )
                     };
 
-                    assert!(lookup.table_expressions.len() == lookup.input_expressions.len());
+                    for input_expressions in lookup.inputs_expressions.iter() {
+                        assert!(lookup.table_expressions.len() == input_expressions.len());
+                    }
                     assert!(self.usable_rows.end > 0);
 
                     // We optimize on the basis that the table might have been filled so that the last
@@ -1715,43 +1724,48 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
                     }
                     let table = &cached_table;
 
-                    let mut inputs: Vec<(Vec<_>, usize)> = lookup_input_row_ids
-                        .clone()
-                        .into_par_iter()
-                        .filter_map(|input_row| {
-                            let t = lookup
-                                .input_expressions
-                                .iter()
-                                .map(move |c| load(c, input_row))
+                    lookup
+                        .inputs_expressions
+                        .iter()
+                        .map(|input_expressions| {
+                            let mut inputs: Vec<(Vec<_>, usize)> = lookup_input_row_ids
+                                .clone()
+                                .into_par_iter()
+                                .filter_map(|input_row| {
+                                    let t = input_expressions
+                                        .iter()
+                                        .map(move |c| load(c, input_row))
+                                        .collect();
+
+                                    if t != fill_row {
+                                        // Also keep track of the original input row, since we're going to sort.
+                                        Some((t, input_row))
+                                    } else {
+                                        None
+                                    }
+                                })
                                 .collect();
-
-                            if t != fill_row {
-                                // Also keep track of the original input row, since we're going to sort.
-                                Some((t, input_row))
-                            } else {
-                                None
-                            }
-                        })
-                        .collect();
-                    inputs.par_sort_unstable();
-
-                    inputs
-                        .par_iter()
-                        .filter_map(move |(input, input_row)| {
-                            if table.binary_search(input).is_err() {
-                                Some(VerifyFailure::Lookup {
-                                    name: lookup.name,
-                                    lookup_index,
-                                    location: FailureLocation::find_expressions(
-                                        &self.cs,
-                                        &self.regions,
-                                        *input_row,
-                                        lookup.input_expressions.iter(),
-                                    ),
+                            inputs.par_sort_unstable();
+
+                            inputs
+                                .par_iter()
+                                .filter_map(move |(input, input_row)| {
+                                    if table.binary_search(input).is_err() {
+                                        Some(VerifyFailure::Lookup {
+                                            name: lookup.name,
+                                            lookup_index,
+                                            location: FailureLocation::find_expressions(
+                                                &self.cs,
+                                                &self.regions,
+                                                *input_row,
+                                                input_expressions.iter(),
+                                            ),
+                                        })
+                                    } else {
+                                        None
+                                    }
                                 })
-                            } else {
-                                None
-                            }
+                                .collect::<Vec<_>>()
                         })
                         .collect::<Vec<_>>()
                 });
@@ -1810,7 +1824,7 @@ impl<'a, F: FieldExt> MockProver<'a, F> {
         let mut errors: Vec<_> = iter::empty()
             .chain(selector_errors)
             .chain(gate_errors)
-            .chain(lookup_errors)
+            .chain(lookup_errors.flatten())
             .chain(perm_errors)
             .collect();
         if errors.is_empty() {
diff --git a/halo2_proofs/src/dev/failure.rs b/halo2_proofs/src/dev/failure.rs
index 229cb171f4..68f0e32cf8 100644
--- a/halo2_proofs/src/dev/failure.rs
+++ b/halo2_proofs/src/dev/failure.rs
@@ -159,7 +159,6 @@ pub enum VerifyFailure {
     },
     /// A lookup input did not exist in its corresponding table.
     Lookup {
-        /// The name of the lookup that is not satisfied.
         name: &'static str,
         /// The index of the lookup that is not satisfied. These indices are assigned in
         /// the order in which `ConstraintSystem::lookup` is called during
@@ -534,8 +533,10 @@ fn render_lookup<F: FieldExt>(
 
     eprintln!("error: lookup input does not exist in table");
     eprint!("  (");
-    for i in 0..lookup.input_expressions.len() {
-        eprint!("{}L{}", if i == 0 { "" } else { ", " }, i);
+    for input_expressions in lookup.inputs_expressions.iter() {
+        for i in 0..input_expressions.len() {
+            eprint!("{}L{}", if i == 0 { "" } else { ", " }, i);
+        }
     }
 
     eprint!(") ∉ (");
@@ -545,79 +546,81 @@ fn render_lookup<F: FieldExt>(
     eprintln!(")");
 
     eprintln!();
-    eprintln!("  Lookup '{}' inputs:", name);
-    for (i, input) in lookup.input_expressions.iter().enumerate() {
-        // Fetch the cell values (since we don't store them in VerifyFailure::Lookup).
-        let cell_values = input.evaluate(
-            &|_| BTreeMap::default(),
-            &|_| panic!("virtual selectors are removed during optimization"),
-            &cell_value(&util::load_slice(
-                n,
-                row,
-                &cs.fixed_queries,
-                prover.fixed.as_slice(),
-            )),
-            &cell_value(&util::load_slice(
-                n,
-                row,
-                &cs.advice_queries,
-                &prover.advice,
-            )),
-            &cell_value(&util::load_instance(
-                n,
-                row,
-                &cs.instance_queries,
-                &prover.instance,
-            )),
-            &|_| BTreeMap::default(),
-            &|a| a,
-            &|mut a, mut b| {
-                a.append(&mut b);
-                a
-            },
-            &|mut a, mut b| {
-                a.append(&mut b);
-                a
-            },
-            &|a, _| a,
-        );
-
-        // Collect the necessary rendering information:
-        // - The columns involved in this constraint.
-        // - How many cells are in each column.
-        // - The grid of cell values, indexed by rotation.
-        let mut columns = BTreeMap::<metadata::Column, usize>::default();
-        let mut layout = BTreeMap::<i32, BTreeMap<metadata::Column, _>>::default();
-        for (i, (cell, _)) in cell_values.iter().enumerate() {
-            *columns.entry(cell.column).or_default() += 1;
-            layout
-                .entry(cell.rotation)
-                .or_default()
-                .entry(cell.column)
-                .or_insert(format!("x{}", i));
-        }
+    eprintln!("  Lookup inputs:");
+    for input_expressions in lookup.inputs_expressions.iter() {
+        for (i, input) in input_expressions.iter().enumerate() {
+            // Fetch the cell values (since we don't store them in VerifyFailure::Lookup).
+            let cell_values = input.evaluate(
+                &|_| BTreeMap::default(),
+                &|_| panic!("virtual selectors are removed during optimization"),
+                &cell_value(&util::load_slice(
+                    n,
+                    row,
+                    &cs.fixed_queries,
+                    prover.fixed.as_slice(),
+                )),
+                &cell_value(&util::load_slice(
+                    n,
+                    row,
+                    &cs.advice_queries,
+                    &prover.advice,
+                )),
+                &cell_value(&util::load_instance(
+                    n,
+                    row,
+                    &cs.instance_queries,
+                    &prover.instance,
+                )),
+                &|_| BTreeMap::default(),
+                &|a| a,
+                &|mut a, mut b| {
+                    a.append(&mut b);
+                    a
+                },
+                &|mut a, mut b| {
+                    a.append(&mut b);
+                    a
+                },
+                &|a, _| a,
+            );
+
+            // Collect the necessary rendering information:
+            // - The columns involved in this constraint.
+            // - How many cells are in each column.
+            // - The grid of cell values, indexed by rotation.
+            let mut columns = BTreeMap::<metadata::Column, usize>::default();
+            let mut layout = BTreeMap::<i32, BTreeMap<metadata::Column, _>>::default();
+            for (i, (cell, _)) in cell_values.iter().enumerate() {
+                *columns.entry(cell.column).or_default() += 1;
+                layout
+                    .entry(cell.rotation)
+                    .or_default()
+                    .entry(cell.column)
+                    .or_insert(format!("x{}", i));
+            }
 
-        if i != 0 {
-            eprintln!();
-        }
-        eprintln!(
-            "    L{} = {}",
-            i,
-            emitter::expression_to_string(input, &layout)
-        );
-        eprintln!("    ^");
-
-        emitter::render_cell_layout("    | ", location, &columns, &layout, |_, rotation| {
-            if rotation == 0 {
-                eprint!(" <--{{ Lookup '{}' inputs queried here", name);
+            if i != 0 {
+                eprintln!();
             }
-        });
+            eprintln!(
+                "    L{} = {}",
+                i,
+                emitter::expression_to_string(input, &layout)
+            );
+            eprintln!("    ^");
+
+            emitter::render_cell_layout("    | ", location, &columns, &layout, |_, rotation| {
+                if rotation == 0 {
+                    eprint!(" <--{{ Lookup inputs queried here");
+                }
+            });
 
-        // Print the map from local variables to assigned values.
-        eprintln!("    |");
-        eprintln!("    | Assigned cell values:");
-        for (i, (_, value)) in cell_values.iter().enumerate() {
-            eprintln!("    |   x{} = {}", i, value);
+            // Print the map from local variables to assigned values.
+            eprintln!("    |");
+            eprintln!("    | Assigned cell values:");
+            for (i, (_, value)) in cell_values.iter().enumerate() {
+                eprintln!("    |   x{} = {}", i, value);
+            }
         }
     }
 }
diff --git a/halo2_proofs/src/lib.rs b/halo2_proofs/src/lib.rs
index 0ca61f64e4..3dd8dcd128 100644
--- a/halo2_proofs/src/lib.rs
+++ b/halo2_proofs/src/lib.rs
@@ -24,6 +24,25 @@
 #![allow(unused_imports)]
 #![allow(clippy::derive_partial_eq_without_eq)]
 
+#[cfg(feature = "counter")]
+#[macro_use]
+extern crate lazy_static;
+
+#[cfg(feature = "counter")]
+use lazy_static::lazy_static;
+
+#[cfg(feature = "counter")]
+use std::sync::Mutex;
+
+#[cfg(feature = "counter")]
+use std::collections::BTreeMap;
+
+#[cfg(feature = "counter")]
+lazy_static! {
+    static ref FFT_COUNTER: Mutex<BTreeMap<usize, usize>> = Mutex::new(BTreeMap::new());
+    static ref MSM_COUNTER: Mutex<BTreeMap<usize, usize>> = Mutex::new(BTreeMap::new());
+}
+
 pub mod arithmetic;
 pub mod circuit;
 pub use halo2curves;
diff --git a/halo2_proofs/src/plonk.rs b/halo2_proofs/src/plonk.rs
index d1aecca38e..8e97729a7b 100644
--- a/halo2_proofs/src/plonk.rs
+++ b/halo2_proofs/src/plonk.rs
@@ -27,7 +27,9 @@ mod circuit;
 mod error;
 mod evaluation;
 mod keygen;
+#[allow(dead_code)]
 mod lookup;
+mod mv_lookup;
 pub(crate) mod permutation;
 mod vanishing;
 
diff --git a/halo2_proofs/src/plonk/circuit.rs b/halo2_proofs/src/plonk/circuit.rs
index 8f1ec3da8c..734a325ee5 100644
--- a/halo2_proofs/src/plonk/circuit.rs
+++ b/halo2_proofs/src/plonk/circuit.rs
@@ -1,14 +1,17 @@
 use core::cmp::max;
 use core::ops::{Add, Mul};
 use ff::Field;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
+use std::fmt::Debug;
+use std::hash::Hasher;
+use std::marker::PhantomData;
 use std::ops::Range;
 use std::{
     convert::TryFrom,
     ops::{Neg, Sub},
 };
 
-use super::{lookup, permutation, Assigned, Error};
+use super::{mv_lookup, permutation, Assigned, Error};
 use crate::dev::metadata;
 use crate::{
     circuit::{Layouter, Region, Value},
@@ -390,7 +393,7 @@ impl Selector {
 }
 
 /// Query of fixed column at a certain relative location
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub struct FixedQuery {
     /// Query index
     pub(crate) index: usize,
@@ -417,7 +420,7 @@ impl FixedQuery {
 }
 
 /// Query of advice column at a certain relative location
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub struct AdviceQuery {
     /// Query index
     pub(crate) index: usize,
@@ -451,7 +454,7 @@ impl AdviceQuery {
 }
 
 /// Query of instance column at a certain relative location
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub struct InstanceQuery {
     /// Query index
     pub(crate) index: usize,
@@ -1370,9 +1373,16 @@ impl<F: Field> Gate<F> {
     }
 }
 
+/// A lookup table's expressions together with every set of input expressions looked up in it.
+#[derive(Debug, Clone)]
+pub struct LookupTracker<F: Field> {
+    pub(crate) table: Vec<Expression<F>>,
+    pub(crate) inputs: Vec<Vec<Expression<F>>>,
+}
+
 /// This is a description of the circuit environment, such as the gate, column and
 /// permutation arrangements.
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone)]
 pub struct ConstraintSystem<F: Field> {
     pub num_fixed_columns: usize,
     pub num_advice_columns: usize,
@@ -1402,9 +1412,12 @@ pub struct ConstraintSystem<F: Field> {
     // Permutation argument for performing equality constraints
     pub permutation: permutation::Argument,
 
+    /// Map from a table's concatenated expression identifiers to its table and input expressions
+    pub lookups_map: BTreeMap<String, LookupTracker<F>>,
+
     // Vector of lookup arguments, where each corresponds to a sequence of
     // input expressions and a sequence of table expressions involved in the lookup.
-    pub lookups: Vec<lookup::Argument<F>>,
+    pub lookups: Vec<mv_lookup::Argument<F>>,
 
     // List of indexes of Fixed columns which are associated to a circuit-general Column tied to their annotation.
     pub(crate) general_column_annotations: BTreeMap<metadata::Column, String>,
@@ -1431,7 +1444,7 @@ pub struct PinnedConstraintSystem<'a, F: Field> {
     instance_queries: &'a Vec<(Column<Instance>, Rotation)>,
     fixed_queries: &'a Vec<(Column<Fixed>, Rotation)>,
     permutation: &'a permutation::Argument,
-    lookups: &'a Vec<lookup::Argument<F>>,
+    lookups_map: &'a BTreeMap<String, LookupTracker<F>>,
     constants: &'a Vec<Column<Fixed>>,
     minimum_degree: &'a Option<usize>,
 }
@@ -1457,7 +1470,7 @@ impl<'a, F: Field> std::fmt::Debug for PinnedConstraintSystem<'a, F> {
             .field("instance_queries", self.instance_queries)
             .field("fixed_queries", self.fixed_queries)
             .field("permutation", self.permutation)
-            .field("lookups", self.lookups)
+            .field("lookups_map", self.lookups_map)
             .field("constants", self.constants)
             .field("minimum_degree", self.minimum_degree);
         debug_struct.finish()
@@ -1492,6 +1505,7 @@ impl<F: Field> Default for ConstraintSystem<F> {
             num_advice_queries: Vec::new(),
             instance_queries: Vec::new(),
             permutation: permutation::Argument::new(),
+            lookups_map: BTreeMap::default(),
             lookups: Vec::new(),
             general_column_annotations: BTreeMap::new(),
             constants: vec![],
@@ -1518,7 +1532,7 @@ impl<F: Field> ConstraintSystem<F> {
             advice_queries: &self.advice_queries,
             instance_queries: &self.instance_queries,
             permutation: &self.permutation,
-            lookups: &self.lookups,
+            lookups_map: &self.lookups_map,
             constants: &self.constants,
             minimum_degree: &self.minimum_degree,
         }
@@ -1549,11 +1563,12 @@ impl<F: Field> ConstraintSystem<F> {
     /// they need to match.
     pub fn lookup(
         &mut self,
+        // FIXME use name in debug messages
         name: &'static str,
         table_map: impl FnOnce(&mut VirtualCells<'_, F>) -> Vec<(Expression<F>, TableColumn)>,
-    ) -> usize {
+    ) {
         let mut cells = VirtualCells::new(self);
-        let table_map = table_map(&mut cells)
+        let table_map: Vec<_> = table_map(&mut cells)
             .into_iter()
             .map(|(input, table)| {
                 if input.contains_simple_selector() {
@@ -1566,11 +1581,90 @@ impl<F: Field> ConstraintSystem<F> {
             })
             .collect();
 
-        let index = self.lookups.len();
+        let (input_expressions, table_expressions): (Vec<_>, Vec<_>) =
+            table_map.into_iter().unzip();
+        let table_expressions_identifier = table_expressions
+            .iter()
+            .fold(String::new(), |string, expr| string + &expr.identifier());
+
+        self.lookups_map
+            .entry(table_expressions_identifier)
+            .and_modify(|table_tracker| table_tracker.inputs.push(input_expressions.clone()))
+            .or_insert(LookupTracker {
+                table: table_expressions,
+                inputs: vec![input_expressions],
+            });
+    }
 
-        self.lookups.push(lookup::Argument::new(name, table_map));
+    /// Chunk lookup arguments into pieces below a given degree bound
+    pub fn chunk_lookups(mut self) -> Self {
+        if self.lookups_map.is_empty() {
+            return self;
+        }
 
-        index
+        let max_gate_degree = self.max_gate_degree();
+        let max_single_lookup_degree: usize = self
+            .lookups_map
+            .values()
+            .map(|v| {
+                let table_degree = v.table.iter().map(|expr| expr.degree()).max().unwrap();
+                let base_lookup_degree = super::mv_lookup::base_degree(table_degree);
+
+                let max_inputs_degree: usize = v
+                    .inputs
+                    .iter()
+                    .map(|input| input.iter().map(|expr| expr.degree()).max().unwrap())
+                    .max()
+                    .unwrap();
+
+                mv_lookup::degree_with_input(base_lookup_degree, max_inputs_degree)
+            })
+            .max()
+            .unwrap();
+
+        let required_degree = std::cmp::max(max_gate_degree, max_single_lookup_degree);
+        let required_degree = (required_degree as u64 - 1).next_power_of_two() as usize;
+
+        self.set_minimum_degree(required_degree + 1);
+
+        // safe to unwrap: `set_minimum_degree` above always stores `Some(_)`
+        let minimum_degree = self.minimum_degree.unwrap();
+
+        let mut lookups: Vec<_> = vec![];
+        for v in self.lookups_map.values() {
+            let LookupTracker { table, inputs } = v;
+            let mut args = vec![super::mv_lookup::Argument::new(
+                "mv_lookup",
+                table,
+                &[inputs[0].clone()],
+            )];
+
+            for input in inputs.iter().skip(1) {
+                let cur_input_degree = input.iter().map(|expr| expr.degree()).max().unwrap();
+                let mut indicator = false;
+                for i in 0..args.len() {
+                    // try to fit input in one of the args
+                    let cur_argument_degree = args[i].required_degree();
+                    let new_potential_degree = cur_argument_degree + cur_input_degree;
+                    if new_potential_degree <= minimum_degree {
+                        args[i].inputs_expressions.push(input.clone());
+                        indicator = true;
+                        break;
+                    }
+                }
+
+                if !indicator {
+                    args.push(super::mv_lookup::Argument::new(
+                        "dummy",
+                        table,
+                        &[input.clone()],
+                    ))
+                }
+            }
+            lookups.append(&mut args);
+        }
+        self.lookups = lookups;
+        self
     }
 
     /// Add a lookup argument for some input expressions and table expressions.
@@ -1579,17 +1673,26 @@ impl<F: Field> ConstraintSystem<F> {
     /// they need to match.
     pub fn lookup_any(
         &mut self,
+        // FIXME use name in debug messages
         name: &'static str,
         table_map: impl FnOnce(&mut VirtualCells<'_, F>) -> Vec<(Expression<F>, Expression<F>)>,
-    ) -> usize {
+    ) {
         let mut cells = VirtualCells::new(self);
         let table_map = table_map(&mut cells);
 
-        let index = self.lookups.len();
-
-        self.lookups.push(lookup::Argument::new(name, table_map));
-
-        index
+        let (input_expressions, table_expressions): (Vec<_>, Vec<_>) =
+            table_map.into_iter().unzip();
+        let table_expressions_identifier = table_expressions
+            .iter()
+            .fold(String::new(), |string, expr| string + &expr.identifier());
+
+        self.lookups_map
+            .entry(table_expressions_identifier)
+            .and_modify(|table_tracker| table_tracker.inputs.push(input_expressions.clone()))
+            .or_insert(LookupTracker {
+                table: table_expressions,
+                inputs: vec![input_expressions],
+            });
     }
 
     fn query_fixed_index(&mut self, column: Column<Fixed>, at: Rotation) -> usize {
@@ -1698,7 +1801,9 @@ impl<F: Field> ConstraintSystem<F> {
     /// larger amount than actually needed. This can be used, for example, to
     /// force the permutation argument to involve more columns in the same set.
     pub fn set_minimum_degree(&mut self, degree: usize) {
-        self.minimum_degree = Some(degree);
+        self.minimum_degree = self
+            .minimum_degree
+            .map_or(Some(degree), |min_degree| Some(max(min_degree, degree)));
     }
 
     /// Creates a new gate.
@@ -1843,8 +1948,9 @@ impl<F: Field> ConstraintSystem<F> {
         // lookup expressions
         for expr in self.lookups.iter_mut().flat_map(|lookup| {
             lookup
-                .input_expressions
+                .inputs_expressions
                 .iter_mut()
+                .flatten()
                 .chain(lookup.table_expressions.iter_mut())
         }) {
             replace_selectors(expr, &selector_replacements, true);
@@ -2003,6 +2109,15 @@ impl<F: Field> ConstraintSystem<F> {
         (0..=max_phase).map(sealed::Phase)
     }
 
+    /// Compute the maximum degree of gates in the constraint system
+    pub fn max_gate_degree(&self) -> usize {
+        self.gates
+            .iter()
+            .flat_map(|gate| gate.polynomials().iter().map(|poly| poly.degree()))
+            .max()
+            .unwrap_or(0)
+    }
+
     /// Compute the degree of the constraint system (the maximum degree of all
     /// constraints).
     pub fn degree(&self) -> usize {
@@ -2023,13 +2138,16 @@ impl<F: Field> ConstraintSystem<F> {
 
         // Account for each gate to ensure our quotient polynomial is the
         // correct degree and that our extended domain is the right size.
+        degree = std::cmp::max(degree, self.max_gate_degree());
+
+        // Lookup degree
         degree = std::cmp::max(
             degree,
-            self.gates
+            self.lookups
                 .iter()
-                .flat_map(|gate| gate.polynomials().iter().map(|poly| poly.degree()))
+                .map(|hl| hl.required_degree())
                 .max()
-                .unwrap_or(0),
+                .unwrap_or(1),
         );
 
         std::cmp::max(degree, self.minimum_degree.unwrap_or(1))
@@ -2135,7 +2253,7 @@ impl<F: Field> ConstraintSystem<F> {
     }
 
     /// Returns lookup arguments
-    pub fn lookups(&self) -> &Vec<lookup::Argument<F>> {
+    pub fn lookups(&self) -> &Vec<mv_lookup::Argument<F>> {
         &self.lookups
     }
 
diff --git a/halo2_proofs/src/plonk/circuit/compress_selectors.rs b/halo2_proofs/src/plonk/circuit/compress_selectors.rs
index 0e8db9ffc8..39696acc01 100644
--- a/halo2_proofs/src/plonk/circuit/compress_selectors.rs
+++ b/halo2_proofs/src/plonk/circuit/compress_selectors.rs
@@ -20,7 +20,7 @@ pub struct SelectorDescription {
 /// This describes the assigned combination of a particular selector as well as
 /// the expression it should be substituted with.
 #[derive(Debug, Clone)]
-pub struct SelectorAssignment<F> {
+pub struct SelectorAssignment<F: Field> {
     /// The selector that this structure references, by index.
     pub selector: usize,
 
diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index c9c9a5cbf1..887f0a290b 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -1,7 +1,9 @@
 use crate::multicore;
 use crate::plonk::lookup::prover::Committed;
 use crate::plonk::permutation::Argument;
-use crate::plonk::{lookup, permutation, AdviceQuery, Any, FixedQuery, InstanceQuery, ProvingKey};
+use crate::plonk::{
+    mv_lookup, permutation, AdviceQuery, Any, FixedQuery, InstanceQuery, ProvingKey,
+};
 use crate::poly::Basis;
 use crate::{
     arithmetic::{eval_polynomial, parallelize, CurveAffine, FieldExt},
@@ -185,7 +187,7 @@ pub struct Evaluator<C: CurveAffine> {
     ///  Custom gates evalution
     pub custom_gates: GraphEvaluator<C>,
     ///  Lookups evalution
-    pub lookups: Vec<GraphEvaluator<C>>,
+    pub lookups: Vec<(Vec<GraphEvaluator<C>>, GraphEvaluator<C>)>,
 }
 
 /// GraphEvaluator
@@ -241,9 +243,12 @@ impl<C: CurveAffine> Evaluator<C> {
 
         // Lookups
         for lookup in cs.lookups.iter() {
-            let mut graph = GraphEvaluator::default();
+            let mut graph_table = GraphEvaluator::default();
+            let mut graph_inputs: Vec<_> = (0..lookup.inputs_expressions.len())
+                .map(|_| GraphEvaluator::default())
+                .collect();
 
-            let mut evaluate_lc = |expressions: &Vec<Expression<_>>| {
+            let evaluate_lc = |graph: &mut GraphEvaluator<C>, expressions: &Vec<Expression<_>>| {
                 let parts = expressions
                     .iter()
                     .map(|expr| graph.add_expression(expr))
@@ -255,22 +260,33 @@ impl<C: CurveAffine> Evaluator<C> {
                 ))
             };
 
-            // Input coset
-            let compressed_input_coset = evaluate_lc(&lookup.input_expressions);
+            // Inputs cosets
+            for (input_expressions, graph_input) in lookup
+                .inputs_expressions
+                .iter()
+                .zip(graph_inputs.iter_mut())
+            {
+                let compressed_input_coset = evaluate_lc(graph_input, input_expressions);
+
+                graph_input.add_calculation(Calculation::Add(
+                    compressed_input_coset,
+                    ValueSource::Beta(),
+                ));
+            }
+
             // table coset
-            let compressed_table_coset = evaluate_lc(&lookup.table_expressions);
-            // z(\omega X) (a'(X) + \beta) (s'(X) + \gamma)
-            let right_gamma = graph.add_calculation(Calculation::Add(
+            let compressed_table_coset = evaluate_lc(&mut graph_table, &lookup.table_expressions);
+
+            graph_table.add_calculation(Calculation::Add(
                 compressed_table_coset,
-                ValueSource::Gamma(),
-            ));
-            let lc = graph.add_calculation(Calculation::Add(
-                compressed_input_coset,
                 ValueSource::Beta(),
             ));
-            graph.add_calculation(Calculation::Mul(lc, right_gamma));
 
-            ev.lookups.push(graph);
+            /*
+                a) f_i + beta
+                b) t + beta
+            */
+            ev.lookups.push((graph_inputs.to_vec(), graph_table));
         }
 
         ev
@@ -287,7 +303,7 @@ impl<C: CurveAffine> Evaluator<C> {
         beta: C::ScalarExt,
         gamma: C::ScalarExt,
         theta: C::ScalarExt,
-        lookups: &[Vec<lookup::prover::Committed<C>>],
+        lookups: &[Vec<mv_lookup::prover::Committed<C>>],
         permutations: &[permutation::prover::Committed<C>],
     ) -> Polynomial<C::ScalarExt, ExtendedLagrangeCoeff> {
         let domain = &pk.vk.domain;
@@ -492,33 +508,146 @@ impl<C: CurveAffine> Evaluator<C> {
                         });
                     }
 
+                    // For lookups, compute inputs_inv_sum = ∑ 1 / (f_i(X) + beta)
+                    // The outer vector has capacity self.lookups.len()
+                    // The middle vector has capacity domain.extended_len()
+                    // The inner vector has capacity self.lookups[n].0.len()
+                    let inputs_inv_sum: Vec<Vec<Vec<_>>> = lookups
+                        .iter()
+                        .enumerate()
+                        .map(|(n, _)| {
+                            let (inputs_lookup_evaluator, _) = &self.lookups[n];
+                            let mut inputs_eval_data: Vec<_> = inputs_lookup_evaluator
+                                .iter()
+                                .map(|input_lookup_evaluator| input_lookup_evaluator.instance())
+                                .collect();
+
+                            let mut inputs_values_for_extended_domain: Vec<C::Scalar> =
+                                Vec::with_capacity(self.lookups[n].0.len() << domain.k());
+                            for idx in 0..size {
+                                // For each compressed input column, evaluate at ω^i and add beta
+                                // This is a vector of length self.lookups[n].0.len()
+                                let inputs_values: Vec<C::ScalarExt> = inputs_lookup_evaluator
+                                    .iter()
+                                    .zip(inputs_eval_data.iter_mut())
+                                    .map(|(input_lookup_evaluator, input_eval_data)| {
+                                        input_lookup_evaluator.evaluate(
+                                            input_eval_data,
+                                            fixed,
+                                            advice,
+                                            instance,
+                                            challenges,
+                                            &beta,
+                                            &gamma,
+                                            &theta,
+                                            &y,
+                                            &C::ScalarExt::zero(),
+                                            idx,
+                                            rot_scale,
+                                            isize,
+                                        )
+                                    })
+                                    .collect();
+
+                                inputs_values_for_extended_domain.extend_from_slice(&inputs_values);
+                            }
+
+                            let num_threads = rayon::current_num_threads();
+                            let chunk_size =
+                                (inputs_values_for_extended_domain.len() + num_threads - 1)
+                                    / num_threads;
+                            rayon::scope(|scope| {
+                                for chunk in
+                                    inputs_values_for_extended_domain.chunks_mut(chunk_size)
+                                {
+                                    scope.spawn(|_| {
+                                        chunk.batch_invert();
+                                    })
+                                }
+                            });
+
+                            // The outer vector has capacity domain.extended_len()
+                            // The inner vector has capacity self.lookups[n].0.len()
+                            let inputs_inv_sums: Vec<Vec<_>> = inputs_values_for_extended_domain
+                                .chunks_exact(self.lookups[n].0.len())
+                                .map(|c| c.to_vec())
+                                .collect();
+                            inputs_inv_sums
+                        })
+                        .collect();
+
                     // Lookups
                     for (n, lookup) in lookups.iter().enumerate() {
                         // Polynomials required for this lookup.
                         // Calculated here so these only have to be kept in memory for the short time
                         // they are actually needed.
-                        let product_coset = pk.vk.domain.coeff_to_extended_part(
-                            lookup.product_poly.clone(),
-                            current_extended_omega,
-                        );
-                        let permuted_input_coset = pk.vk.domain.coeff_to_extended_part(
-                            lookup.permuted_input_poly.clone(),
-                            current_extended_omega,
-                        );
-                        let permuted_table_coset = pk.vk.domain.coeff_to_extended_part(
-                            lookup.permuted_table_poly.clone(),
+                        let phi_coset = pk.vk.domain.coeff_to_extended_part(
+                            lookup.phi_poly.clone(),
                             current_extended_omega,
                         );
+                        let m_coset = pk
+                            .vk
+                            .domain
+                            .coeff_to_extended_part(lookup.m_poly.clone(), current_extended_omega);
 
                         // Lookup constraints
+                        /*
+                            φ_i(X) = f_i(X) + beta
+                            τ(X) = t(X) + beta
+                            LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+                            RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+                                = (τ(X) * Π(φ_i(X)) * ∑ 1/(φ_i(X))) - Π(φ_i(X)) * m(X)
+                                = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))
+                        */
                         parallelize(&mut values, |values, start| {
-                            let lookup_evaluator = &self.lookups[n];
-                            let mut eval_data = lookup_evaluator.instance();
+                            let (inputs_lookup_evaluator, table_lookup_evaluator) =
+                                &self.lookups[n];
+                            let mut inputs_eval_data: Vec<_> = inputs_lookup_evaluator
+                                .iter()
+                                .map(|input_lookup_evaluator| input_lookup_evaluator.instance())
+                                .collect();
+                            let mut table_eval_data = table_lookup_evaluator.instance();
+
                             for (i, value) in values.iter_mut().enumerate() {
                                 let idx = start + i;
 
-                                let table_value = lookup_evaluator.evaluate(
-                                    &mut eval_data,
+                                // f_i(X) + beta for i in expressions
+                                let inputs_value: Vec<C::ScalarExt> = inputs_lookup_evaluator
+                                    .iter()
+                                    .zip(inputs_eval_data.iter_mut())
+                                    .map(|(input_lookup_evaluator, input_eval_data)| {
+                                        input_lookup_evaluator.evaluate(
+                                            input_eval_data,
+                                            fixed,
+                                            advice,
+                                            instance,
+                                            challenges,
+                                            &beta,
+                                            &gamma,
+                                            &theta,
+                                            &y,
+                                            &C::ScalarExt::zero(),
+                                            idx,
+                                            rot_scale,
+                                            isize,
+                                        )
+                                    })
+                                    .collect();
+
+                                // Π(φ_i(X))
+                                let inputs_prod: C::Scalar = inputs_value
+                                    .iter()
+                                    .fold(C::Scalar::one(), |acc, input| acc * input);
+
+                                // ∑ 1 / (f_i(X) + beta) at ω^idx
+                                let fi_inverses = &inputs_inv_sum[n][idx];
+                                let inputs_inv_sum = fi_inverses
+                                    .iter()
+                                    .fold(C::Scalar::zero(), |acc, input| acc + input);
+
+                                // t(X) + beta
+                                let table_value = table_lookup_evaluator.evaluate(
+                                    &mut table_eval_data,
                                     fixed,
                                     advice,
                                     instance,
@@ -534,41 +663,27 @@ impl<C: CurveAffine> Evaluator<C> {
                                 );
 
                                 let r_next = get_rotation_idx(idx, 1, rot_scale, isize);
-                                let r_prev = get_rotation_idx(idx, -1, rot_scale, isize);
 
-                                let a_minus_s =
-                                    permuted_input_coset[idx] - permuted_table_coset[idx];
-                                // l_0(X) * (1 - z(X)) = 0
-                                *value = *value * y + ((one - product_coset[idx]) * l0[idx]);
-                                // l_last(X) * (z(X)^2 - z(X)) = 0
-                                *value = *value * y
-                                    + ((product_coset[idx] * product_coset[idx]
-                                        - product_coset[idx])
-                                        * l_last[idx]);
-                                // (1 - (l_last(X) + l_blind(X))) * (
-                                //   z(\omega X) (a'(X) + \beta) (s'(X) + \gamma)
-                                //   - z(X) (\theta^{m-1} a_0(X) + ... + a_{m-1}(X) + \beta)
-                                //          (\theta^{m-1} s_0(X) + ... + s_{m-1}(X) + \gamma)
-                                // ) = 0
-                                *value = *value * y
-                                    + ((product_coset[r_next]
-                                        * (permuted_input_coset[idx] + beta)
-                                        * (permuted_table_coset[idx] + gamma)
-                                        - product_coset[idx] * table_value)
-                                        * l_active_row[idx]);
-                                // Check that the first values in the permuted input expression and permuted
-                                // fixed expression are the same.
-                                // l_0(X) * (a'(X) - s'(X)) = 0
-                                *value = *value * y + (a_minus_s * l0[idx]);
-                                // Check that each value in the permuted lookup input expression is either
-                                // equal to the value above it, or the value at the same index in the
-                                // permuted table expression.
-                                // (1 - (l_last + l_blind)) * (a′(X) − s′(X))⋅(a′(X) − a′(\omega^{-1} X)) = 0
-                                *value = *value * y
-                                    + (a_minus_s
-                                        * (permuted_input_coset[idx]
-                                            - permuted_input_coset[r_prev])
-                                        * l_active_row[idx]);
+                                let lhs = {
+                                    // τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+                                    table_value * inputs_prod * (phi_coset[r_next] - phi_coset[idx])
+                                };
+
+                                let rhs = {
+                                    //   τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+                                    // = (τ(X) * Π(φ_i(X)) * ∑ 1/(φ_i(X))) - Π(φ_i(X)) * m(X)
+                                    // = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))
+                                    inputs_prod * (table_value * inputs_inv_sum - m_coset[idx])
+                                };
+
+                                // phi[0] = 0
+                                *value = *value * y + l0[idx] * phi_coset[idx];
+
+                                // phi[u] = 0
+                                *value = *value * y + l_last[idx] * phi_coset[idx];
+
+                                // q(X) = (1 - (l_last(X) + l_blind(X))) * (LHS - RHS)
+                                *value = *value * y + (lhs - rhs) * l_active_row[idx];
                             }
                         });
                     }
diff --git a/halo2_proofs/src/plonk/keygen.rs b/halo2_proofs/src/plonk/keygen.rs
index 2f461491f6..d4365b40cf 100644
--- a/halo2_proofs/src/plonk/keygen.rs
+++ b/halo2_proofs/src/plonk/keygen.rs
@@ -41,6 +41,8 @@ where
     let mut cs = ConstraintSystem::default();
     let config = ConcreteCircuit::configure(&mut cs);
 
+    let cs = cs.chunk_lookups();
+
     let degree = cs.degree();
 
     let domain = EvaluationDomain::new(degree as u32, k);
diff --git a/halo2_proofs/src/plonk/mv_lookup.rs b/halo2_proofs/src/plonk/mv_lookup.rs
new file mode 100644
index 0000000000..86022912ad
--- /dev/null
+++ b/halo2_proofs/src/plonk/mv_lookup.rs
@@ -0,0 +1,99 @@
+use super::circuit::Expression;
+use ff::Field;
+use std::fmt::{self, Debug};
+
+pub(crate) mod prover;
+pub(crate) mod verifier;
+
+/// Degree of lookup without inputs
+pub fn base_degree(table_degree: usize) -> usize {
+    // let lhs_degree = table_degree + inputs_expressions_degree + 1
+    // let degree = lhs_degree + 1
+    std::cmp::max(3, table_degree + 2)
+}
+
+pub fn degree_with_input(base_degree: usize, input_expression_degree: usize) -> usize {
+    base_degree + input_expression_degree
+}
+
+#[derive(Clone)]
+pub struct Argument<F: Field> {
+    pub name: &'static str,
+    pub(crate) table_expressions: Vec<Expression<F>>,
+    pub(crate) inputs_expressions: Vec<Vec<Expression<F>>>,
+}
+
+impl<F: Field> Debug for Argument<F> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Argument")
+            .field("table_expressions", &self.table_expressions)
+            .field("inputs_expressions", &self.inputs_expressions)
+            .finish()
+    }
+}
+
+impl<F: Field> Argument<F> {
+    /// Constructs a new lookup argument.
+    pub fn new(name: &'static str, table: &[Expression<F>], input: &[Vec<Expression<F>>]) -> Self {
+        Self {
+            name,
+            table_expressions: table.to_owned(),
+            inputs_expressions: input.to_owned(),
+        }
+    }
+
+    pub(crate) fn required_degree(&self) -> usize {
+        assert!(self
+            .inputs_expressions
+            .iter()
+            .all(|input| input.len() == self.table_expressions.len()));
+
+        let expr_degree = |input_expressions: &Vec<Expression<F>>| {
+            let mut input_degree = 0;
+            for expr in input_expressions.iter() {
+                input_degree = std::cmp::max(input_degree, expr.degree());
+            }
+
+            input_degree
+        };
+
+        let inputs_expressions_degree: usize = self
+            .inputs_expressions
+            .iter()
+            .map(|input_expressions| expr_degree(input_expressions))
+            .sum();
+
+        let mut table_degree = 0;
+        for expr in self.table_expressions.iter() {
+            table_degree = std::cmp::max(table_degree, expr.degree());
+        }
+
+        /*
+            φ_i(X) = f_i(X) + β
+            τ(X) = t(X) + β
+            LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+            deg(LHS) = table_degree + sum(input_degree) + 1
+            RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+
+            deg(q(X)) = deg((1 - (l_last + l_blind)) * (LHS - RHS))
+                      = 1 + deg(LHS)
+        */
+
+        let lhs_degree = table_degree + inputs_expressions_degree + 1;
+        let degree = lhs_degree + 1;
+
+        // 3 = phi + q_blind + table (where table is = 1)
+        // + 1 for each of inputs expressions
+        std::cmp::max(3 + self.inputs_expressions.len(), degree)
+    }
+
+    /// Returns input of this argument
+    pub fn input_expressions(&self) -> &Vec<Vec<Expression<F>>> {
+        &self.inputs_expressions
+    }
+
+    /// Returns table of this argument
+    pub fn table_expressions(&self) -> &Vec<Expression<F>> {
+        &self.table_expressions
+    }
+}
diff --git a/halo2_proofs/src/plonk/mv_lookup/exec_info.json b/halo2_proofs/src/plonk/mv_lookup/exec_info.json
new file mode 100644
index 0000000000..f7a7042c0f
--- /dev/null
+++ b/halo2_proofs/src/plonk/mv_lookup/exec_info.json
@@ -0,0 +1,46 @@
+{
+    "unit": "ms",
+    "non_batched": {
+        "k": 14,
+        "halo2": {
+            "protocol": "halo2",
+            "methods": {
+                "commit_permuted": {
+                    "compress_expressions": 1,
+                    "permute_expressions": 4.5, 
+                    "commit_permuted_input": 5,
+                    "commit_permuted_table": 5
+                },
+                "grand_product": {
+                    "lookup_product_denom": 2, 
+                    "lookup_product": 0.2, 
+                    "grand_prod_evals": 0.5,
+                    "grand_prod_commit": 7.5
+                }, 
+                "h_evaluation": {
+        
+                }
+            }
+        },
+        "mv": {
+            "protocol": "mv",
+            "methods": {
+                "compute_multiplicity": {
+                    "compress_expressions": 1,
+                    "compute_multiplicities": 2,
+                    "commit_m": 1
+                },
+                "grand_sum": {
+                    "inputs_log_derivatives": 2, 
+                    "table_log_derivatives": 1.8,
+                    "log_derivatives_diff": 0.2,
+                    "grand_sum_evals": 0.2, 
+                    "grand_sum_commit": 33
+                }, 
+                "h_evaluation": {
+        
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
new file mode 100644
index 0000000000..7da0802549
--- /dev/null
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -0,0 +1,440 @@
+use super::super::{
+    circuit::Expression, ChallengeBeta, ChallengeGamma, ChallengeTheta, ChallengeX, Error,
+    ProvingKey,
+};
+use super::Argument;
+use crate::plonk::evaluation::evaluate;
+use crate::{
+    arithmetic::{eval_polynomial, parallelize, CurveAffine, FieldExt},
+    poly::{
+        commitment::{Blind, Params},
+        Coeff, EvaluationDomain, ExtendedLagrangeCoeff, LagrangeCoeff, Polynomial, ProverQuery,
+        Rotation,
+    },
+    transcript::{EncodedChallenge, TranscriptWrite},
+};
+use blake2b_simd::Hash;
+use ff::{BitViewSized, PrimeField, PrimeFieldBits};
+use group::{
+    ff::{BatchInvert, Field},
+    Curve,
+};
+use rand_core::RngCore;
+use rayon::current_num_threads;
+use std::collections::{BTreeSet, HashSet};
+use std::{any::TypeId, convert::TryInto, num::ParseIntError, ops::Index};
+use std::{
+    collections::BTreeMap,
+    iter,
+    ops::{Mul, MulAssign},
+};
+
+use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+
+#[derive(Debug)]
+pub(in crate::plonk) struct Prepared<C: CurveAffine> {
+    compressed_inputs_expressions: Vec<Polynomial<C::Scalar, LagrangeCoeff>>,
+    compressed_table_expression: Polynomial<C::Scalar, LagrangeCoeff>,
+    m_values: Polynomial<C::Scalar, LagrangeCoeff>,
+}
+
+#[derive(Debug)]
+pub(in crate::plonk) struct Committed<C: CurveAffine> {
+    pub(in crate::plonk) m_poly: Polynomial<C::Scalar, Coeff>,
+    pub(in crate::plonk) phi_poly: Polynomial<C::Scalar, Coeff>,
+}
+
+pub(in crate::plonk) struct Evaluated<C: CurveAffine> {
+    constructed: Committed<C>,
+}
+
+impl<F: FieldExt> Argument<F> {
+    pub(in crate::plonk) fn prepare<
+        'a,
+        'params: 'a,
+        C,
+        P: Params<'params, C>,
+        E: EncodedChallenge<C>,
+        R: RngCore,
+        T: TranscriptWrite<C, E>,
+    >(
+        &self,
+        pk: &ProvingKey<C>,
+        params: &P,
+        domain: &EvaluationDomain<C::Scalar>,
+        theta: ChallengeTheta<C>,
+        advice_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
+        fixed_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
+        instance_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
+        challenges: &'a [C::Scalar],
+        mut rng: R, // in case we want to blind (do we actually need zk?)
+        transcript: &mut T,
+    ) -> Result<Prepared<C>, Error>
+    where
+        C: CurveAffine<ScalarExt = F>,
+        C::Curve: Mul<F, Output = C::Curve> + MulAssign<F>,
+    {
+        // Closure to get values of expressions and compress them
+        let compress_expressions = |expressions: &[Expression<C::Scalar>]| {
+            let compressed_expression = expressions
+                .iter()
+                .map(|expression| {
+                    pk.vk.domain.lagrange_from_vec(evaluate(
+                        expression,
+                        params.n() as usize,
+                        1,
+                        fixed_values,
+                        advice_values,
+                        instance_values,
+                        challenges,
+                    ))
+                })
+                .fold(domain.empty_lagrange(), |acc, expression| {
+                    acc * *theta + &expression
+                });
+            compressed_expression
+        };
+
+        // Get values of input expressions involved in the lookup and compress them
+        let compressed_inputs_expressions: Vec<_> = self
+            .inputs_expressions
+            .iter()
+            .map(|input_expressions| compress_expressions(input_expressions))
+            .collect();
+
+        // Get values of table expressions involved in the lookup and compress them
+        let compressed_table_expression = compress_expressions(&self.table_expressions);
+
+        let blinding_factors = pk.vk.cs.blinding_factors();
+
+        // compute m(X)
+        let table_index_value_mapping: BTreeMap<C::Scalar, usize> = compressed_table_expression
+            .iter()
+            .take(params.n() as usize - blinding_factors - 1)
+            .enumerate()
+            .map(|(i, &x)| (x, i))
+            .collect();
+
+        let m_values: Vec<F> = {
+            use std::sync::atomic::{AtomicU64, Ordering};
+            use std::sync::RwLock;
+            let m_values: Vec<AtomicU64> = (0..params.n()).map(|_| AtomicU64::new(0)).collect();
+
+            for compressed_input_expression in compressed_inputs_expressions.iter() {
+                let _ = compressed_input_expression
+                    .par_iter()
+                    .take(params.n() as usize - blinding_factors - 1)
+                    .try_for_each(|fi| -> Result<(), Error> {
+                        let index = table_index_value_mapping
+                            .get(fi)
+                            .ok_or(Error::ConstraintSystemFailure)?;
+                        m_values[*index].fetch_add(1, Ordering::Relaxed);
+                        Ok(())
+                    });
+            }
+
+            m_values
+                .par_iter()
+                .map(|mi| F::from(mi.load(Ordering::Relaxed) as u64))
+                .collect()
+        };
+        let m_values = pk.vk.domain.lagrange_from_vec(m_values);
+
+        #[cfg(feature = "sanity-checks")]
+        {
+            // check that m is zero on the trailing `blinding_factors` rows
+            let invalid_ms = m_values
+                .iter()
+                .skip(params.n() as usize - blinding_factors)
+                .collect::<Vec<_>>();
+            assert_eq!(invalid_ms.len(), blinding_factors);
+            for mi in invalid_ms {
+                assert_eq!(*mi, C::Scalar::zero());
+            }
+
+            // check sums
+            let alpha = C::Scalar::random(&mut rng);
+            let cs_input_sum =
+                |compressed_input_expression: &Polynomial<C::Scalar, LagrangeCoeff>| {
+                    let mut lhs_sum = C::Scalar::zero();
+                    for &fi in compressed_input_expression
+                        .iter()
+                        .take(params.n() as usize - blinding_factors - 1)
+                    {
+                        lhs_sum += (fi + alpha).invert().unwrap();
+                    }
+
+                    lhs_sum
+                };
+
+            let mut lhs_sum = C::Scalar::zero();
+
+            for compressed_input_expression in compressed_inputs_expressions.iter() {
+                lhs_sum += cs_input_sum(compressed_input_expression);
+            }
+
+            let mut rhs_sum = C::Scalar::zero();
+            for (&ti, &mi) in compressed_table_expression.iter().zip(m_values.iter()) {
+                rhs_sum += mi * (ti + alpha).invert().unwrap();
+            }
+
+            assert_eq!(lhs_sum, rhs_sum);
+        }
+
+        // commit to m(X)
+        let blind = Blind(C::Scalar::zero());
+        let m_commitment = params.commit_lagrange(&m_values, blind).to_affine();
+
+        // write commitment of m(X) to transcript
+        transcript.write_point(m_commitment)?;
+
+        Ok(Prepared {
+            compressed_inputs_expressions,
+            compressed_table_expression,
+            m_values,
+        })
+    }
+}
+
+impl<C: CurveAffine> Prepared<C> {
+    pub(in crate::plonk) fn commit_grand_sum<
+        'params,
+        P: Params<'params, C>,
+        E: EncodedChallenge<C>,
+        R: RngCore,
+        T: TranscriptWrite<C, E>,
+    >(
+        self,
+        pk: &ProvingKey<C>,
+        params: &P,
+        beta: ChallengeBeta<C>,
+        mut rng: R,
+        transcript: &mut T,
+    ) -> Result<Committed<C>, Error> {
+        /*
+            φ_i(X) = f_i(X) + β
+            τ(X) = t(X) + β
+            LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+            RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+        */
+
+        // ∑ 1/(φ_i(X))
+        let mut inputs_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
+        for compressed_input_expression in self.compressed_inputs_expressions.iter() {
+            let mut input_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
+
+            parallelize(
+                &mut input_log_derivatives,
+                |input_log_derivatives, start| {
+                    for (input_log_derivative, fi) in input_log_derivatives
+                        .iter_mut()
+                        .zip(compressed_input_expression[start..].iter())
+                    {
+                        *input_log_derivative = *beta + fi;
+                    }
+                },
+            );
+            input_log_derivatives.iter_mut().batch_invert();
+
+            // TODO: remove last blinders from this
+            for i in 0..params.n() as usize {
+                inputs_log_derivatives[i] += input_log_derivatives[i];
+            }
+        }
+
+        // 1 / τ(X)
+        let mut table_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
+        parallelize(
+            &mut table_log_derivatives,
+            |table_log_derivatives, start| {
+                for (table_log_derivative, ti) in table_log_derivatives
+                    .iter_mut()
+                    .zip(self.compressed_table_expression[start..].iter())
+                {
+                    *table_log_derivative = *beta + ti;
+                }
+            },
+        );
+
+        table_log_derivatives.iter_mut().batch_invert();
+
+        // (Σ 1/(φ_i(X)) - m(X) / τ(X))
+        let mut log_derivatives_diff = vec![C::Scalar::zero(); params.n() as usize];
+        parallelize(&mut log_derivatives_diff, |log_derivatives_diff, start| {
+            for (((log_derivative_diff, fi), ti), mi) in log_derivatives_diff
+                .iter_mut()
+                .zip(inputs_log_derivatives[start..].iter())
+                .zip(table_log_derivatives[start..].iter())
+                .zip(self.m_values[start..].iter())
+            {
+                // (Σ 1/(φ_i(X)) - m(X) / τ(X))
+                *log_derivative_diff = *fi - *mi * *ti;
+            }
+        });
+
+        // Compute the evaluations of the lookup grand sum polynomial
+        // over our domain, starting with phi[0] = 0
+        let blinding_factors = pk.vk.cs.blinding_factors();
+        let phi = {
+            let active_size = params.n() as usize - blinding_factors;
+            let chunk = {
+                let num_threads = crate::multicore::current_num_threads();
+                let mut chunk = (active_size as usize) / num_threads;
+                if chunk < num_threads {
+                    chunk = 1;
+                }
+                chunk
+            };
+            let num_chunks = (active_size as usize + chunk - 1) / chunk;
+            let mut segment_sum = vec![C::Scalar::zero(); num_chunks];
+            let mut grand_sum = iter::once(C::Scalar::zero())
+                .chain(log_derivatives_diff)
+                .take(active_size)
+                .collect::<Vec<_>>();
+            parallelize(&mut grand_sum, |segment_grand_sum, _| {
+                for i in 1..segment_grand_sum.len() {
+                    segment_grand_sum[i] += segment_grand_sum[i - 1];
+                }
+            });
+            for i in 1..segment_sum.len() {
+                segment_sum[i] = segment_sum[i - 1] + grand_sum[i * chunk - 1];
+            }
+            parallelize(&mut grand_sum, |grand_sum, start| {
+                let prefix_sum = segment_sum[start / chunk];
+                for v in grand_sum.iter_mut() {
+                    *v += prefix_sum;
+                }
+            });
+            grand_sum
+                .into_iter()
+                .chain((0..blinding_factors).map(|_| C::Scalar::random(&mut rng)))
+                .collect::<Vec<_>>()
+        };
+        assert_eq!(phi.len(), params.n() as usize);
+        let phi = pk.vk.domain.lagrange_from_vec(phi);
+
+        #[cfg(feature = "sanity-checks")]
+        // This test works only with intermediate representations in this method.
+        // It can be used for debugging purposes.
+        {
+            // While in Lagrange basis, check that the grand sum is correctly constructed
+            let u = (params.n() as usize) - (blinding_factors + 1);
+
+            /*
+                φ_i(X) = f_i(X) + β
+                τ(X) = t(X) + β
+                LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+                RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+            */
+
+            // q(X) = LHS - RHS mod zH(X)
+            for i in 0..u {
+                // Π(φ_i(X))
+                let fi_prod = || {
+                    let mut prod = C::Scalar::one();
+                    for compressed_input_expression in self.compressed_inputs_expressions.iter() {
+                        prod *= *beta + compressed_input_expression[i];
+                    }
+
+                    prod
+                };
+
+                let fi_log_derivative = || {
+                    let mut sum = C::Scalar::zero();
+                    for compressed_input_expression in self.compressed_inputs_expressions.iter() {
+                        sum += (*beta + compressed_input_expression[i]).invert().unwrap();
+                    }
+
+                    sum
+                };
+
+                // LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+                let lhs = {
+                    (*beta + self.compressed_table_expression[i])
+                        * fi_prod()
+                        * (phi[i + 1] - phi[i])
+                };
+
+                // RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+                let rhs = {
+                    (*beta + self.compressed_table_expression[i])
+                        * fi_prod()
+                        * (fi_log_derivative()
+                            - self.m_values[i]
+                                * (*beta + self.compressed_table_expression[i])
+                                    .invert()
+                                    .unwrap())
+                };
+
+                assert_eq!(lhs - rhs, C::Scalar::zero());
+            }
+
+            assert_eq!(phi[u], C::Scalar::zero());
+        }
+
+        let grand_sum_blind = Blind(C::Scalar::zero());
+        let phi_commitment = params.commit_lagrange(&phi, grand_sum_blind).to_affine();
+
+        // Hash grand sum commitment
+        transcript.write_point(phi_commitment)?;
+
+        Ok(Committed {
+            m_poly: pk.vk.domain.lagrange_to_coeff(self.m_values),
+            phi_poly: pk.vk.domain.lagrange_to_coeff(phi),
+        })
+    }
+}
+
+impl<C: CurveAffine> Committed<C> {
+    pub(in crate::plonk) fn evaluate<E: EncodedChallenge<C>, T: TranscriptWrite<C, E>>(
+        self,
+        pk: &ProvingKey<C>,
+        x: ChallengeX<C>,
+        transcript: &mut T,
+    ) -> Result<Evaluated<C>, Error> {
+        let domain = &pk.vk.domain;
+        let x_next = domain.rotate_omega(*x, Rotation::next());
+
+        let phi_eval = eval_polynomial(&self.phi_poly, *x);
+        let phi_next_eval = eval_polynomial(&self.phi_poly, x_next);
+        let m_eval = eval_polynomial(&self.m_poly, *x);
+
+        // Hash each lookup evaluation: ϕ(x), ϕ(gx), m(x)
+        for eval in iter::empty()
+            .chain(Some(phi_eval))
+            .chain(Some(phi_next_eval))
+            .chain(Some(m_eval))
+        {
+            transcript.write_scalar(eval)?;
+        }
+
+        Ok(Evaluated { constructed: self })
+    }
+}
+
+impl<C: CurveAffine> Evaluated<C> {
+    pub(in crate::plonk) fn open<'a>(
+        &'a self,
+        pk: &'a ProvingKey<C>,
+        x: ChallengeX<C>,
+    ) -> impl Iterator<Item = ProverQuery<'a, C>> + Clone {
+        let x_next = pk.vk.domain.rotate_omega(*x, Rotation::next());
+
+        iter::empty()
+            .chain(Some(ProverQuery {
+                point: *x,
+                poly: &self.constructed.phi_poly,
+                blind: Blind(C::Scalar::zero()),
+            }))
+            .chain(Some(ProverQuery {
+                point: x_next,
+                poly: &self.constructed.phi_poly,
+                blind: Blind(C::Scalar::zero()),
+            }))
+            .chain(Some(ProverQuery {
+                point: *x,
+                poly: &self.constructed.m_poly,
+                blind: Blind(C::Scalar::zero()),
+            }))
+    }
+}
diff --git a/halo2_proofs/src/plonk/mv_lookup/verifier.rs b/halo2_proofs/src/plonk/mv_lookup/verifier.rs
new file mode 100644
index 0000000000..0a8085fc33
--- /dev/null
+++ b/halo2_proofs/src/plonk/mv_lookup/verifier.rs
@@ -0,0 +1,195 @@
+use std::iter;
+
+use super::super::{
+    circuit::Expression, ChallengeBeta, ChallengeGamma, ChallengeTheta, ChallengeX,
+};
+use super::Argument;
+use crate::{
+    arithmetic::{CurveAffine, FieldExt},
+    plonk::{Error, VerifyingKey},
+    poly::{commitment::MSM, Rotation, VerifierQuery},
+    transcript::{EncodedChallenge, TranscriptRead},
+};
+use ff::{BatchInvert, Field};
+
+pub struct PreparedCommitments<C: CurveAffine> {
+    m_commitment: C,
+}
+
+pub struct Committed<C: CurveAffine> {
+    prepared: PreparedCommitments<C>,
+    phi_commitment: C,
+}
+
+pub struct Evaluated<C: CurveAffine> {
+    committed: Committed<C>,
+    phi_eval: C::Scalar,
+    phi_next_eval: C::Scalar,
+    m_eval: C::Scalar,
+}
+
+impl<F: FieldExt> Argument<F> {
+    pub(in crate::plonk) fn read_prepared_commitments<
+        C: CurveAffine,
+        E: EncodedChallenge<C>,
+        T: TranscriptRead<C, E>,
+    >(
+        &self,
+        transcript: &mut T,
+    ) -> Result<PreparedCommitments<C>, Error> {
+        let m_commitment = transcript.read_point()?;
+
+        Ok(PreparedCommitments { m_commitment })
+    }
+}
+
+impl<C: CurveAffine> PreparedCommitments<C> {
+    pub(in crate::plonk) fn read_grand_sum_commitment<
+        E: EncodedChallenge<C>,
+        T: TranscriptRead<C, E>,
+    >(
+        self,
+        transcript: &mut T,
+    ) -> Result<Committed<C>, Error> {
+        let phi_commitment = transcript.read_point()?;
+
+        Ok(Committed {
+            prepared: self,
+            phi_commitment,
+        })
+    }
+}
+
+impl<C: CurveAffine> Committed<C> {
+    pub(crate) fn evaluate<E: EncodedChallenge<C>, T: TranscriptRead<C, E>>(
+        self,
+        transcript: &mut T,
+    ) -> Result<Evaluated<C>, Error> {
+        let phi_eval = transcript.read_scalar()?;
+        let phi_next_eval = transcript.read_scalar()?;
+        let m_eval = transcript.read_scalar()?;
+
+        Ok(Evaluated {
+            committed: self,
+            phi_eval,
+            phi_next_eval,
+            m_eval,
+        })
+    }
+}
+
+impl<C: CurveAffine> Evaluated<C> {
+    /// Yields this lookup's three constraint evaluations, built from the supplied evaluations:
+    /// `l_0 * ϕ(x)`, `l_last * ϕ(x)` and `(1 - l_last - l_blind) * (LHS - RHS)`, in that order.
+    /// `RHS` is used in expanded form so `τ` is never inverted and `τ = 0` cannot cause a panic.
+    pub(in crate::plonk) fn expressions<'a>(
+        &'a self,
+        l_0: C::Scalar,
+        l_last: C::Scalar,
+        l_blind: C::Scalar,
+        argument: &'a Argument<C::Scalar>,
+        theta: ChallengeTheta<C>,
+        beta: ChallengeBeta<C>,
+        advice_evals: &[C::Scalar],
+        fixed_evals: &[C::Scalar],
+        instance_evals: &[C::Scalar],
+        challenges: &[C::Scalar],
+    ) -> impl Iterator<Item = C::Scalar> + 'a {
+        let active_rows = C::Scalar::one() - (l_last + l_blind);
+
+        // φ_i(X) = f_i(X) + beta
+        // τ(X) = t(X) + beta
+        // LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+        // RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+
+        let grand_sum_expression = || {
+            let compress_expressions = |expressions: &[Expression<C::Scalar>]| {
+                expressions
+                    .iter()
+                    .map(|expression| {
+                        expression.evaluate(
+                            &|scalar| scalar,
+                            &|_| panic!("virtual selectors are removed during optimization"),
+                            &|query| fixed_evals[query.index],
+                            &|query| advice_evals[query.index],
+                            &|query| instance_evals[query.index],
+                            &|challenge| challenges[challenge.index()],
+                            &|a| -a,
+                            &|a, b| a + &b,
+                            &|a, b| a * &b,
+                            &|a, scalar| a * &scalar,
+                        )
+                    })
+                    .fold(C::Scalar::zero(), |acc, eval| acc * &*theta + &eval)
+            };
+
+            // φ_i(X) = f_i(X) + beta
+            let mut f_evals: Vec<_> = argument
+                .inputs_expressions
+                .iter()
+                .map(|input_expressions| compress_expressions(input_expressions) + *beta)
+                .collect();
+
+            let t_eval = compress_expressions(&argument.table_expressions);
+            let tau = t_eval + *beta;
+            // Π(φ_i(X))
+            let prod_fi = f_evals
+                .iter()
+                .fold(C::Scalar::one(), |acc, eval| acc * eval);
+            // ∑ 1/(φ_i(X)), inverting every φ_i in a single batch
+            f_evals.batch_invert();
+            let sum_inv_fi = f_evals
+                .iter()
+                .fold(C::Scalar::zero(), |acc, eval| acc + eval);
+
+            // LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
+            let lhs = tau * prod_fi * (self.phi_next_eval - self.phi_eval);
+
+            // RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+            //     = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))
+            // Use the expanded form: inverting τ would panic on `unwrap` whenever τ = 0.
+            let rhs = prod_fi * (tau * sum_inv_fi - self.m_eval);
+
+            (lhs - rhs) * active_rows
+        };
+
+        std::iter::empty()
+            .chain(
+                // phi[0] = 0
+                Some(l_0 * self.phi_eval),
+            )
+            .chain(
+                // phi[u] = 0
+                Some(l_last * self.phi_eval),
+            )
+            .chain(
+                // (1 - l_last - l_blind) * (lhs - rhs) = 0
+                Some(grand_sum_expression()),
+            )
+    }
+
+    pub(in crate::plonk) fn queries<'r, M: MSM<C> + 'r>(
+        &'r self,
+        vk: &'r VerifyingKey<C>,
+        x: ChallengeX<C>,
+    ) -> impl Iterator<Item = VerifierQuery<'r, C, M>> + Clone {
+        let x_next = vk.domain.rotate_omega(*x, Rotation::next());
+
+        iter::empty()
+            .chain(Some(VerifierQuery::new_commitment(
+                &self.committed.phi_commitment,
+                *x,
+                self.phi_eval,
+            )))
+            .chain(Some(VerifierQuery::new_commitment(
+                &self.committed.phi_commitment,
+                x_next,
+                self.phi_next_eval,
+            )))
+            .chain(Some(VerifierQuery::new_commitment(
+                &self.committed.prepared.m_commitment,
+                *x,
+                self.m_eval,
+            )))
+    }
+}
diff --git a/halo2_proofs/src/plonk/prover.rs b/halo2_proofs/src/plonk/prover.rs
index 7c176e576a..38fddd440e 100644
--- a/halo2_proofs/src/plonk/prover.rs
+++ b/halo2_proofs/src/plonk/prover.rs
@@ -1,4 +1,4 @@
-use ff::Field;
+use ff::{Field, PrimeField};
 use group::Curve;
 use halo2curves::CurveExt;
 use rand_core::RngCore;
@@ -16,7 +16,7 @@ use super::{
         Advice, Any, Assignment, Challenge, Circuit, Column, ConstraintSystem, FirstPhase, Fixed,
         FloorPlanner, Instance, Selector,
     },
-    lookup, permutation, vanishing, ChallengeBeta, ChallengeGamma, ChallengeTheta, ChallengeX,
+    mv_lookup, permutation, vanishing, ChallengeBeta, ChallengeGamma, ChallengeTheta, ChallengeX,
     ChallengeY, Error, Expression, ProvingKey,
 };
 use crate::{
@@ -55,7 +55,20 @@ pub fn create_proof<
     instances: &[&[&[Scheme::Scalar]]],
     mut rng: R,
     transcript: &mut T,
-) -> Result<(), Error> {
+) -> Result<(), Error>
+where
+    Scheme::Scalar: PrimeField,
+{
+    #[cfg(feature = "counter")]
+    {
+        use crate::{FFT_COUNTER, MSM_COUNTER};
+        use std::collections::BTreeMap;
+
+        // reset counters at the beginning of proving
+        *MSM_COUNTER.lock().unwrap() = BTreeMap::new();
+        *FFT_COUNTER.lock().unwrap() = BTreeMap::new();
+    }
+
     for instance in instances.iter() {
         if instance.len() != pk.vk.cs.num_instance_columns {
             return Err(Error::InvalidInstances);
@@ -516,7 +529,7 @@ pub fn create_proof<
     // Sample theta challenge for keeping lookup columns linearly independent
     let theta: ChallengeTheta<_> = transcript.squeeze_challenge_scalar();
 
-    let lookups: Vec<Vec<lookup::prover::Permuted<Scheme::Curve>>> = instance
+    let lookups: Vec<Vec<mv_lookup::prover::Prepared<Scheme::Curve>>> = instance
         .iter()
         .zip(advice.iter())
         .map(|(instance, advice)| -> Result<Vec<_>, Error> {
@@ -526,7 +539,7 @@ pub fn create_proof<
                 .lookups
                 .iter()
                 .map(|lookup| {
-                    lookup.commit_permuted(
+                    lookup.prepare(
                         pk,
                         params,
                         domain,
@@ -569,13 +582,13 @@ pub fn create_proof<
         })
         .collect::<Result<Vec<_>, _>>()?;
 
-    let lookups: Vec<Vec<lookup::prover::Committed<Scheme::Curve>>> = lookups
+    let lookups: Vec<Vec<mv_lookup::prover::Committed<Scheme::Curve>>> = lookups
         .into_iter()
         .map(|lookups| -> Result<Vec<_>, _> {
             // Construct and commit to products for each lookup
             lookups
                 .into_iter()
-                .map(|lookup| lookup.commit_product(pk, params, beta, gamma, &mut rng, transcript))
+                .map(|lookup| lookup.commit_grand_sum(pk, params, beta, &mut rng, transcript))
                 .collect::<Result<Vec<_>, _>>()
         })
         .collect::<Result<Vec<_>, _>>()?;
@@ -698,8 +711,7 @@ pub fn create_proof<
         .map(|permutation| -> Result<_, _> { permutation.construct().evaluate(pk, x, transcript) })
         .collect::<Result<Vec<_>, _>>()?;
 
-    // Evaluate the lookups, if any, at omega^i x.
-    let lookups: Vec<Vec<lookup::prover::Evaluated<Scheme::Curve>>> = lookups
+    let lookups: Vec<Vec<mv_lookup::prover::Evaluated<Scheme::Curve>>> = lookups
         .into_iter()
         .map(|lookups| -> Result<Vec<_>, _> {
             lookups
@@ -757,6 +769,18 @@ pub fn create_proof<
         // We query the h(X) polynomial at x
         .chain(vanishing.open(x));
 
+    #[cfg(feature = "counter")]
+    {
+        use crate::{FFT_COUNTER, MSM_COUNTER};
+        use std::collections::BTreeMap;
+        println!("MSM_COUNTER: {:?}", MSM_COUNTER.lock().unwrap());
+        println!("FFT_COUNTER: {:?}", *FFT_COUNTER.lock().unwrap());
+
+        // reset counters at the end of proving
+        *MSM_COUNTER.lock().unwrap() = BTreeMap::new();
+        *FFT_COUNTER.lock().unwrap() = BTreeMap::new();
+    }
+
     let prover = P::new(params);
     prover
         .create_proof(rng, transcript, instances)
diff --git a/halo2_proofs/src/plonk/verifier.rs b/halo2_proofs/src/plonk/verifier.rs
index e6a2327e7b..cf66b54fca 100644
--- a/halo2_proofs/src/plonk/verifier.rs
+++ b/halo2_proofs/src/plonk/verifier.rs
@@ -128,7 +128,7 @@ pub fn verify_proof<
             vk.cs
                 .lookups
                 .iter()
-                .map(|argument| argument.read_permuted_commitments(transcript))
+                .map(|argument| argument.read_prepared_commitments(transcript))
                 .collect::<Result<Vec<_>, _>>()
         })
         .collect::<Result<Vec<_>, _>>()?;
@@ -149,10 +149,10 @@ pub fn verify_proof<
     let lookups_committed = lookups_permuted
         .into_iter()
         .map(|lookups| {
-            // Hash each lookup product commitment
+            // Hash each lookup sum commitment
             lookups
                 .into_iter()
-                .map(|lookup| lookup.read_product_commitment(transcript))
+                .map(|lookup| lookup.read_grand_sum_commitment(transcript))
                 .collect::<Result<Vec<_>, _>>()
         })
         .collect::<Result<Vec<_>, _>>()?;
@@ -297,27 +297,22 @@ pub fn verify_proof<
                         gamma,
                         x,
                     ))
-                    .chain(
-                        lookups
-                            .iter()
-                            .zip(vk.cs.lookups.iter())
-                            .flat_map(move |(p, argument)| {
-                                p.expressions(
-                                    l_0,
-                                    l_last,
-                                    l_blind,
-                                    argument,
-                                    theta,
-                                    beta,
-                                    gamma,
-                                    advice_evals,
-                                    fixed_evals,
-                                    instance_evals,
-                                    challenges,
-                                )
-                            })
-                            .into_iter(),
-                    )
+                    .chain(lookups.iter().zip(vk.cs.lookups.iter()).flat_map(
+                        move |(p, argument)| {
+                            p.expressions(
+                                l_0,
+                                l_last,
+                                l_blind,
+                                argument,
+                                theta,
+                                beta,
+                                advice_evals,
+                                fixed_evals,
+                                instance_evals,
+                                challenges,
+                            )
+                        },
+                    ))
             });
 
         vanishing.verify(params, expressions, y, xn)
@@ -363,12 +358,7 @@ pub fn verify_proof<
                         },
                     ))
                     .chain(permutation.queries(vk, x))
-                    .chain(
-                        lookups
-                            .iter()
-                            .flat_map(move |p| p.queries(vk, x))
-                            .into_iter(),
-                    )
+                    .chain(lookups.iter().flat_map(move |p| p.queries(vk, x)))
             },
         )
         .chain(
diff --git a/halo2_proofs/src/poly.rs b/halo2_proofs/src/poly.rs
index 84089482aa..c6c3c38a30 100644
--- a/halo2_proofs/src/poly.rs
+++ b/halo2_proofs/src/poly.rs
@@ -307,7 +307,7 @@ impl<'a, F: Field, B: Basis> Sub<F> for &'a Polynomial<F, B> {
 /// Describes the relative rotation of a vector. Negative numbers represent
 /// reverse (leftmost) rotations and positive numbers represent forward (rightmost)
 /// rotations. Zero represents no rotation.
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub struct Rotation(pub i32);
 
 impl Rotation {
diff --git a/halo2_proofs/tests/plonk_api.rs b/halo2_proofs/tests/plonk_api.rs
index 778cf840fb..27394c29d9 100644
--- a/halo2_proofs/tests/plonk_api.rs
+++ b/halo2_proofs/tests/plonk_api.rs
@@ -1,7 +1,8 @@
 #![allow(clippy::many_single_char_names)]
 #![allow(clippy::op_ref)]
 
-// use assert_matches::assert_matches;
+use assert_matches::assert_matches;
+use ff::PrimeField;
 use halo2_proofs::arithmetic::{Field, FieldExt};
 #[cfg(feature = "parallel_syn")]
 use halo2_proofs::circuit::Region;
@@ -70,6 +71,13 @@ impl PlonkConfig {
         let sp = meta.fixed_column();
         let sl = meta.lookup_table_column();
 
+        // Add to test mvlookup
+        let dummy = meta.complex_selector();
+        let dummy_2 = meta.complex_selector();
+        let dummy_3 = meta.complex_selector();
+
+        let dummy_table = meta.lookup_table_column();
+
         /*
          *   A         B      ...  sl
          * [
@@ -91,6 +99,21 @@ impl PlonkConfig {
             vec![(a_, sl)]
         });
 
+        // Add to test mvlookup
+        meta.lookup("lookup_same", |meta| {
+            let a_ = meta.query_any(a, Rotation::cur());
+            vec![(a_, sl)]
+        });
+
+        meta.lookup("lookup_same", |meta| {
+            let b_ = meta.query_any(b, Rotation::cur());
+            let dummy = meta.query_selector(dummy);
+            let dummy_2 = meta.query_selector(dummy_2);
+            let dummy_3 = meta.query_selector(dummy_3);
+
+            vec![(dummy * dummy_2 * dummy_3 * b_, dummy_table)]
+        });
+
         meta.create_gate("Combined add-mult", |meta| {
             let d = meta.query_advice(d, Rotation::next());
             let a = meta.query_advice(a, Rotation::cur());

From ef79cc2cc10f430726b653ee2714573e77dec5e0 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Mon, 13 Nov 2023 13:55:44 +0800
Subject: [PATCH 02/19] fmt

---
 halo2_proofs/benches/lookups.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/halo2_proofs/benches/lookups.rs b/halo2_proofs/benches/lookups.rs
index e9fc4eb4ef..41c11285c8 100644
--- a/halo2_proofs/benches/lookups.rs
+++ b/halo2_proofs/benches/lookups.rs
@@ -12,12 +12,10 @@ use halo2curves::pairing::Engine;
 use rand_core::OsRng;
 
 use halo2_proofs::{
-    poly::{
-        kzg::{
-            commitment::{KZGCommitmentScheme, ParamsKZG},
-            multiopen::ProverGWC,
-            strategy::SingleStrategy,
-        },
+    poly::kzg::{
+        commitment::{KZGCommitmentScheme, ParamsKZG},
+        multiopen::ProverGWC,
+        strategy::SingleStrategy,
     },
     transcript::{TranscriptReadBuffer, TranscriptWriterBuffer},
 };
@@ -60,7 +58,9 @@ fn criterion_benchmark(c: &mut Criterion) {
 
             meta.create_gate("degree 6 gate", |meta| {
                 let dummy_selector = meta.query_selector(dummy_selector);
-                let constraints = vec![dummy_selector.clone(); 4].iter().fold(dummy_selector.clone(), |acc, val| acc * val.clone());
+                let constraints = vec![dummy_selector.clone(); 4]
+                    .iter()
+                    .fold(dummy_selector.clone(), |acc, val| acc * val.clone());
                 Constraints::with_selector(dummy_selector, Some(constraints))
             });
 
@@ -92,8 +92,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             /*
                 - We need degree at least 6 because 6 - 1 = 5 and we need to go to extended domain of 8n
                 - Our goal is to get to max degree of 9 because now 9 - 1 = 8 and that will fit into domain
-            
-                - base degree = table_deg + 2 
+
+                - base degree = table_deg + 2
                 - if we put input_expression_degree = 1
                 => degree = base + 1 = 3 + 1 = 4
                 - we can batch one more with 5 more lookups

From 96ec27404660f8b547720392fae5e8ad9e147856 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Mon, 13 Nov 2023 14:18:11 +0800
Subject: [PATCH 03/19] fix unit test

---
 halo2_proofs/src/plonk/circuit.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/halo2_proofs/src/plonk/circuit.rs b/halo2_proofs/src/plonk/circuit.rs
index 734a325ee5..b2ff3ab077 100644
--- a/halo2_proofs/src/plonk/circuit.rs
+++ b/halo2_proofs/src/plonk/circuit.rs
@@ -1634,7 +1634,7 @@ impl<F: Field> ConstraintSystem<F> {
         for v in self.lookups_map.values() {
             let LookupTracker { table, inputs } = v;
             let mut args = vec![super::mv_lookup::Argument::new(
-                "mv_lookup",
+                "lookup",
                 table,
                 &[inputs[0].clone()],
             )];

From d7b4a5b076aeadc42a59f4323f9e456e737c96a6 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Mon, 13 Nov 2023 15:17:15 +0800
Subject: [PATCH 04/19] fix clippy errors

---
 halo2_proofs/benches/lookups.rs            |  6 +++---
 halo2_proofs/src/arithmetic.rs             |  4 ++--
 halo2_proofs/src/dev/failure.rs            |  2 +-
 halo2_proofs/src/plonk/circuit.rs          | 16 +++++++---------
 halo2_proofs/src/plonk/evaluation.rs       |  7 ++++---
 halo2_proofs/src/plonk/mv_lookup/prover.rs |  4 ++--
 halo2_proofs/src/plonk/prover.rs           |  6 +++---
 halo2_proofs/tests/plonk_api.rs            |  2 --
 8 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/halo2_proofs/benches/lookups.rs b/halo2_proofs/benches/lookups.rs
index 41c11285c8..b3171fdd9e 100644
--- a/halo2_proofs/benches/lookups.rs
+++ b/halo2_proofs/benches/lookups.rs
@@ -132,7 +132,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             || format!("offset {}", offset),
                             config.advice,
                             offset as usize,
-                            || Value::known(F::from((offset % 256))),
+                            || Value::known(F::from(offset % 256)),
                         )?;
                     }
                     for offset in 1u64..(1 << 10) {
@@ -141,7 +141,7 @@ fn criterion_benchmark(c: &mut Criterion) {
                             || format!("offset {}", offset),
                             config.other_advice,
                             offset as usize - 1,
-                            || Value::known(F::from((offset % 256))),
+                            || Value::known(F::from(offset % 256)),
                         )?;
                     }
                     Ok(())
@@ -160,7 +160,7 @@ fn criterion_benchmark(c: &mut Criterion) {
         (params, pk)
     }
 
-    fn prover(k: u32, params: &ParamsKZG<Bn256>, pk: &ProvingKey<G1Affine>) -> Vec<u8> {
+    fn prover(_k: u32, params: &ParamsKZG<Bn256>, pk: &ProvingKey<G1Affine>) -> Vec<u8> {
         let rng = OsRng;
 
         let circuit: MyCircuit<<Bn256 as Engine>::Scalar> = MyCircuit {
diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index 8cddc98ffb..e89bc5c697 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -135,7 +135,7 @@ pub fn best_multiexp<C: CurveAffine>(coeffs: &[C::Scalar], bases: &[C]) -> C::Cu
     #[cfg(feature = "counter")]
     {
         use crate::MSM_COUNTER;
-        *MSM_COUNTER
+        MSM_COUNTER
             .lock()
             .unwrap()
             .entry(coeffs.len())
@@ -185,7 +185,7 @@ pub fn best_fft<G: Group>(a: &mut [G], omega: G::Scalar, log_n: u32) {
     #[cfg(feature = "counter")]
     {
         use crate::FFT_COUNTER;
-        *FFT_COUNTER
+        FFT_COUNTER
             .lock()
             .unwrap()
             .entry(a.len())
diff --git a/halo2_proofs/src/dev/failure.rs b/halo2_proofs/src/dev/failure.rs
index 68f0e32cf8..3309380073 100644
--- a/halo2_proofs/src/dev/failure.rs
+++ b/halo2_proofs/src/dev/failure.rs
@@ -445,7 +445,7 @@ fn render_constraint_not_satisfied<F: Field>(
 /// ```
 fn render_lookup<F: FieldExt>(
     prover: &MockProver<F>,
-    name: &str,
+    _name: &str,
     lookup_index: usize,
     location: &FailureLocation,
 ) {
diff --git a/halo2_proofs/src/plonk/circuit.rs b/halo2_proofs/src/plonk/circuit.rs
index b2ff3ab077..5a83a2810b 100644
--- a/halo2_proofs/src/plonk/circuit.rs
+++ b/halo2_proofs/src/plonk/circuit.rs
@@ -1564,11 +1564,11 @@ impl<F: Field> ConstraintSystem<F> {
     pub fn lookup(
         &mut self,
         // FIXME use name in debug messages
-        name: &'static str,
+        _name: &'static str,
         table_map: impl FnOnce(&mut VirtualCells<'_, F>) -> Vec<(Expression<F>, TableColumn)>,
     ) {
         let mut cells = VirtualCells::new(self);
-        let table_map: Vec<_> = table_map(&mut cells)
+        let (input_expressions, table_expressions): (Vec<_>, Vec<_>) = table_map(&mut cells)
             .into_iter()
             .map(|(input, table)| {
                 if input.contains_simple_selector() {
@@ -1579,10 +1579,8 @@ impl<F: Field> ConstraintSystem<F> {
 
                 (input, table)
             })
-            .collect();
+            .unzip();
 
-        let (input_expressions, table_expressions): (Vec<_>, Vec<_>) =
-            table_map.into_iter().unzip();
         let table_expressions_identifier = table_expressions
             .iter()
             .fold(String::new(), |string, expr| string + &expr.identifier());
@@ -1642,12 +1640,12 @@ impl<F: Field> ConstraintSystem<F> {
             for input in inputs.iter().skip(1) {
                 let cur_input_degree = input.iter().map(|expr| expr.degree()).max().unwrap();
                 let mut indicator = false;
-                for i in 0..args.len() {
+                for arg in args.iter_mut() {
                     // try to fit input in one of the args
-                    let cur_argument_degree = args[i].required_degree();
+                    let cur_argument_degree = arg.required_degree();
                     let new_potential_degree = cur_argument_degree + cur_input_degree;
                     if new_potential_degree <= minimum_degree {
-                        args[i].inputs_expressions.push(input.clone());
+                        arg.inputs_expressions.push(input.clone());
                         indicator = true;
                         break;
                     }
@@ -1674,7 +1672,7 @@ impl<F: Field> ConstraintSystem<F> {
     pub fn lookup_any(
         &mut self,
         // FIXME use name in debug messages
-        name: &'static str,
+        _name: &'static str,
         table_map: impl FnOnce(&mut VirtualCells<'_, F>) -> Vec<(Expression<F>, Expression<F>)>,
     ) {
         let mut cells = VirtualCells::new(self);
diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index 887f0a290b..677b375270 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -53,7 +53,8 @@ pub enum ValueSource {
     /// beta
     Beta(),
     /// gamma
-    Gamma(),
+    // only used by the old halo2 lookup scheme
+    // Gamma(),
     /// theta
     Theta(),
     /// y
@@ -80,7 +81,7 @@ impl ValueSource {
         instance_values: &[Polynomial<F, B>],
         challenges: &[F],
         beta: &F,
-        gamma: &F,
+        _gamma: &F,
         theta: &F,
         y: &F,
         previous_value: &F,
@@ -99,7 +100,7 @@ impl ValueSource {
             }
             ValueSource::Challenge(index) => challenges[*index],
             ValueSource::Beta() => *beta,
-            ValueSource::Gamma() => *gamma,
+            // ValueSource::Gamma() => *gamma,
             ValueSource::Theta() => *theta,
             ValueSource::Y() => *y,
             ValueSource::PreviousValue() => *previous_value,
diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index 7da0802549..18a57c3058 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -67,7 +67,7 @@ impl<F: FieldExt> Argument<F> {
         fixed_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
         instance_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
         challenges: &'a [C::Scalar],
-        mut rng: R, // in case we want to blind (do we actually need zk?)
+        mut _rng: R, // in case we want to blind (do we actually need zk?)
         transcript: &mut T,
     ) -> Result<Prepared<C>, Error>
     where
@@ -153,7 +153,7 @@ impl<F: FieldExt> Argument<F> {
             }
 
             // check sums
-            let alpha = C::Scalar::random(&mut rng);
+            let alpha = C::Scalar::random(&mut _rng);
             let cs_input_sum =
                 |compressed_input_expression: &Polynomial<C::Scalar, LagrangeCoeff>| {
                     let mut lhs_sum = C::Scalar::zero();
diff --git a/halo2_proofs/src/plonk/prover.rs b/halo2_proofs/src/plonk/prover.rs
index 38fddd440e..2766406943 100644
--- a/halo2_proofs/src/plonk/prover.rs
+++ b/halo2_proofs/src/plonk/prover.rs
@@ -399,7 +399,7 @@ where
                 })
                 .collect::<BTreeSet<_>>();
 
-            for (circuit_idx, ((circuit, advice), instances)) in circuits
+            for (_circuit_idx, ((circuit, advice), instances)) in circuits
                 .iter()
                 .zip(advice.iter_mut())
                 .zip(instances)
@@ -438,7 +438,7 @@ where
                 {
                     for (idx, advice_col) in witness.advice_vec.iter().enumerate() {
                         if pk.vk.cs.advice_column_phase[idx].0 < current_phase.0
-                            && advice_assignments[circuit_idx][idx].values != advice_col.values
+                            && advice_assignments[_circuit_idx][idx].values != advice_col.values
                         {
                             log::error!(
                                 "advice column {}(at {:?}) changed when {:?}",
@@ -459,7 +459,7 @@ where
                             if column_indices.contains(&column_index) {
                                 #[cfg(feature = "phase-check")]
                                 {
-                                    advice_assignments[circuit_idx][column_index] = advice.clone();
+                                    advice_assignments[_circuit_idx][column_index] = advice.clone();
                                 }
                                 Some(advice)
                             } else {
diff --git a/halo2_proofs/tests/plonk_api.rs b/halo2_proofs/tests/plonk_api.rs
index 27394c29d9..ceb196d46b 100644
--- a/halo2_proofs/tests/plonk_api.rs
+++ b/halo2_proofs/tests/plonk_api.rs
@@ -1,8 +1,6 @@
 #![allow(clippy::many_single_char_names)]
 #![allow(clippy::op_ref)]
 
-use assert_matches::assert_matches;
-use ff::PrimeField;
 use halo2_proofs::arithmetic::{Field, FieldExt};
 #[cfg(feature = "parallel_syn")]
 use halo2_proofs::circuit::Region;

From 0cc345f08034ab38af5336b52cf8405d68402726 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Tue, 14 Nov 2023 16:52:25 +0800
Subject: [PATCH 05/19] add todo in mv_lookup's prover

---
 halo2_proofs/src/arithmetic.rs             | 13 ++++-
 halo2_proofs/src/plonk/mv_lookup.rs        |  5 +-
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 62 +++++++++++++++++++---
 halo2_proofs/tests/plonk_api.rs            |  4 +-
 4 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index e89bc5c697..39eacc2991 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -534,7 +534,7 @@ where
 
 /// This simple utility function will parallelize an operation that is to be
 /// performed over a mutable slice.
-pub fn parallelize<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(v: &mut [T], f: F) {
+pub(crate) fn parallelize_internal<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(v: &mut [T], f: F) -> Vec<usize> {
     let n = v.len();
     let num_threads = multicore::current_num_threads();
     let mut chunk = (n as usize) / num_threads;
@@ -543,14 +543,23 @@ pub fn parallelize<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(v: &mu
     }
 
     multicore::scope(|scope| {
+        let mut chunk_starts = vec![];
         for (chunk_num, v) in v.chunks_mut(chunk).enumerate() {
             let f = f.clone();
             scope.spawn(move |_| {
                 let start = chunk_num * chunk;
                 f(v, start);
             });
+            let start = chunk_num * chunk;
+            chunk_starts.push(start);
         }
-    });
+
+        chunk_starts
+    })
+}
+
+pub fn parallelize<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(v: &mut [T], f: F) {
+    parallelize_internal(v, f);
 }
 
 fn log2_floor(num: usize) -> u32 {
diff --git a/halo2_proofs/src/plonk/mv_lookup.rs b/halo2_proofs/src/plonk/mv_lookup.rs
index 86022912ad..b4fd2d825f 100644
--- a/halo2_proofs/src/plonk/mv_lookup.rs
+++ b/halo2_proofs/src/plonk/mv_lookup.rs
@@ -63,10 +63,7 @@ impl<F: Field> Argument<F> {
             .map(|input_expressions| expr_degree(input_expressions))
             .sum();
 
-        let mut table_degree = 0;
-        for expr in self.table_expressions.iter() {
-            table_degree = std::cmp::max(table_degree, expr.degree());
-        }
+        let table_degree = expr_degree(&self.table_expressions);
 
         /*
             φ_i(X) = f_i(X) + α
diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index 18a57c3058..864eba4d54 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -30,6 +30,7 @@ use std::{
 };
 
 use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+use crate::arithmetic::parallelize_internal;
 
 #[derive(Debug)]
 pub(in crate::plonk) struct Prepared<C: CurveAffine> {
@@ -67,7 +68,7 @@ impl<F: FieldExt> Argument<F> {
         fixed_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
         instance_values: &'a [Polynomial<C::Scalar, LagrangeCoeff>],
         challenges: &'a [C::Scalar],
-        mut _rng: R, // in case we want to blind (do we actually need zk?)
+        mut rng: R, // in case we want to blind (do we actually need zk?)
         transcript: &mut T,
     ) -> Result<Prepared<C>, Error>
     where
@@ -107,6 +108,7 @@ impl<F: FieldExt> Argument<F> {
 
         let blinding_factors = pk.vk.cs.blinding_factors();
 
+        // TODO: construction of BTreeMap for a large vector
         // compute m(X)
         let table_index_value_mapping: BTreeMap<C::Scalar, usize> = compressed_table_expression
             .iter()
@@ -153,7 +155,7 @@ impl<F: FieldExt> Argument<F> {
             }
 
             // check sums
-            let alpha = C::Scalar::random(&mut _rng);
+            let alpha = C::Scalar::random(&mut rng);
             let cs_input_sum =
                 |compressed_input_expression: &Polynomial<C::Scalar, LagrangeCoeff>| {
                     let mut lhs_sum = C::Scalar::zero();
@@ -182,7 +184,8 @@ impl<F: FieldExt> Argument<F> {
         }
 
         // commit to m(X)
-        let blind = Blind(C::Scalar::zero());
+        // TODO: should we use zero instead?
+        let blind = Blind(C::Scalar::random(rng));
         let m_commitment = params.commit_lagrange(&m_values, blind).to_affine();
 
         // write commitment of m(X) to transcript
@@ -212,8 +215,8 @@ impl<C: CurveAffine> Prepared<C> {
         transcript: &mut T,
     ) -> Result<Committed<C>, Error> {
         /*
-            φ_i(X) = f_i(X) + α
-            τ(X) = t(X) + α
+            φ_i(X) = f_i(X) + beta
+            τ(X) = t(X) + beta
             LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
             RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))))
         */
@@ -234,6 +237,7 @@ impl<C: CurveAffine> Prepared<C> {
                     }
                 },
             );
+            // TODO: use parallelized batch invert
             input_log_derivatives.iter_mut().batch_invert();
 
             // TODO: remove last blinders from this
@@ -256,6 +260,7 @@ impl<C: CurveAffine> Prepared<C> {
             },
         );
 
+        // TODO: use parallelized batch invert
         table_log_derivatives.iter_mut().batch_invert();
 
         // (Σ 1/(φ_i(X)) - m(X) / τ(X))
@@ -276,6 +281,7 @@ impl<C: CurveAffine> Prepared<C> {
         // over our domain, starting with phi[0] = 0
         let blinding_factors = pk.vk.cs.blinding_factors();
         let phi = {
+            // parallelized version of log_derivatives_diff.scan()
             let active_size = params.n() as usize - blinding_factors;
             let chunk = {
                 let num_threads = crate::multicore::current_num_threads();
@@ -291,6 +297,8 @@ impl<C: CurveAffine> Prepared<C> {
                 .chain(log_derivatives_diff)
                 .take(active_size)
                 .collect::<Vec<_>>();
+            // TODO: remove the implicit assumption that parallelize() split the grand_sum
+            //      into segments that each has `chunk` elements except the last.
             parallelize(&mut grand_sum, |segment_grand_sum, _| {
                 for i in 1..segment_grand_sum.len() {
                     segment_grand_sum[i] += segment_grand_sum[i - 1];
@@ -372,7 +380,7 @@ impl<C: CurveAffine> Prepared<C> {
             assert_eq!(phi[u], C::Scalar::zero());
         }
 
-        let grand_sum_blind = Blind(C::Scalar::zero());
+        let grand_sum_blind = Blind(C::Scalar::random(rng));
         let phi_commitment = params.commit_lagrange(&phi, grand_sum_blind).to_affine();
 
         // Hash grand sum commitment
@@ -438,3 +446,45 @@ impl<C: CurveAffine> Evaluated<C> {
             }))
     }
 }
+
+mod benches {
+    use std::collections::BTreeMap;
+    use std::time::Instant;
+    use ark_std::rand::thread_rng;
+    use env_logger::init;
+    use ff::Field;
+    use halo2curves::bn256::Fr;
+
+    // bench the time to construct a BTreeMap out of a large table
+    // tivm is short for table_index_value_mapping
+    #[ignore]
+    #[test]
+    fn bench_tivm_btree_map() {
+        init();
+        let mut rng = thread_rng();
+
+        for log_n in 20..26 {
+            let n = 1 << log_n;
+            let dur = Instant::now();
+            let table: BTreeMap<Fr, usize> = (0..n)
+                .into_iter()
+                .map(|_| Fr::random(&mut rng))
+                .enumerate()
+                .map(|(i, x)| (x, i))
+                .collect();
+            log::info!("construct btreemap from random vec (len = {}) took {:?}", n, dur.elapsed());
+        }
+
+        for log_n in 20..26 {
+            let n = 1 << log_n;
+            let dur = Instant::now();
+            let table: BTreeMap<Fr, usize> = (0..n)
+                .into_iter()
+                .map(|i| Fr::from(i))
+                .enumerate()
+                .map(|(i, x)| (x, i))
+                .collect();
+            log::info!("construct btreemap from increasing vec (len = {}) took {:?}", n, dur.elapsed());
+        }
+    }
+}
\ No newline at end of file
diff --git a/halo2_proofs/tests/plonk_api.rs b/halo2_proofs/tests/plonk_api.rs
index ceb196d46b..dbe7fcf8e2 100644
--- a/halo2_proofs/tests/plonk_api.rs
+++ b/halo2_proofs/tests/plonk_api.rs
@@ -672,7 +672,7 @@ fn plonk_api() {
         >(verifier_params, pk.get_vk(), &proof[..]);
     }
 
-    env_logger::init();
+    let _logger_err = env_logger::try_init();
     test_plonk_api_ipa();
     test_plonk_api_gwc();
     test_plonk_api_shplonk();
@@ -854,7 +854,7 @@ fn plonk_api_with_many_subregions() {
     type Scheme = KZGCommitmentScheme<Bn256>;
     // bad_keys!(Scheme);
 
-    env_logger::try_init().unwrap();
+    let _logger_err = env_logger::try_init();
     let (a, instance, lookup_table) = common!(Scheme);
 
     let circuit: MyCircuit<<Scheme as CommitmentScheme>::Scalar> = MyCircuit {

From 4d96a2591ebb5cc82bc6dfd06d8ad5463ca54d09 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Tue, 14 Nov 2023 16:56:31 +0800
Subject: [PATCH 06/19] fmt and clippy

---
 halo2_proofs/src/arithmetic.rs             |  5 ++++-
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 26 ++++++++++++++--------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index 39eacc2991..e820e80f06 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -534,7 +534,10 @@ where
 
 /// This simple utility function will parallelize an operation that is to be
 /// performed over a mutable slice.
-pub(crate) fn parallelize_internal<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(v: &mut [T], f: F) -> Vec<usize> {
+pub(crate) fn parallelize_internal<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(
+    v: &mut [T],
+    f: F,
+) -> Vec<usize> {
     let n = v.len();
     let num_threads = multicore::current_num_threads();
     let mut chunk = (n as usize) / num_threads;
diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index 864eba4d54..e6d893eb19 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -29,8 +29,8 @@ use std::{
     ops::{Mul, MulAssign},
 };
 
-use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 use crate::arithmetic::parallelize_internal;
+use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 
 #[derive(Debug)]
 pub(in crate::plonk) struct Prepared<C: CurveAffine> {
@@ -448,12 +448,12 @@ impl<C: CurveAffine> Evaluated<C> {
 }
 
 mod benches {
-    use std::collections::BTreeMap;
-    use std::time::Instant;
     use ark_std::rand::thread_rng;
     use env_logger::init;
     use ff::Field;
     use halo2curves::bn256::Fr;
+    use std::collections::BTreeMap;
+    use std::time::Instant;
 
     // bench the time to construct a BTreeMap out of a large table
     // tivm is short for table_index_value_mapping
@@ -466,25 +466,33 @@ mod benches {
         for log_n in 20..26 {
             let n = 1 << log_n;
             let dur = Instant::now();
-            let table: BTreeMap<Fr, usize> = (0..n)
+            let _table: BTreeMap<Fr, usize> = (0..n)
                 .into_iter()
                 .map(|_| Fr::random(&mut rng))
                 .enumerate()
                 .map(|(i, x)| (x, i))
                 .collect();
-            log::info!("construct btreemap from random vec (len = {}) took {:?}", n, dur.elapsed());
+            log::info!(
+                "construct btreemap from random vec (len = {}) took {:?}",
+                n,
+                dur.elapsed()
+            );
         }
 
         for log_n in 20..26 {
             let n = 1 << log_n;
             let dur = Instant::now();
-            let table: BTreeMap<Fr, usize> = (0..n)
+            let _table: BTreeMap<Fr, usize> = (0..n)
                 .into_iter()
-                .map(|i| Fr::from(i))
+                .map(Fr::from)
                 .enumerate()
                 .map(|(i, x)| (x, i))
                 .collect();
-            log::info!("construct btreemap from increasing vec (len = {}) took {:?}", n, dur.elapsed());
+            log::info!(
+                "construct btreemap from increasing vec (len = {}) took {:?}",
+                n,
+                dur.elapsed()
+            );
         }
     }
-}
\ No newline at end of file
+}

From 63e8b9d75e285f579429f090d10c17c20dc823cd Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Tue, 14 Nov 2023 17:23:35 +0800
Subject: [PATCH 07/19] fix clippy

---
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index e6d893eb19..33877029b9 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -449,7 +449,6 @@ impl<C: CurveAffine> Evaluated<C> {
 
 mod benches {
     use ark_std::rand::thread_rng;
-    use env_logger::init;
     use ff::Field;
     use halo2curves::bn256::Fr;
     use std::collections::BTreeMap;
@@ -460,7 +459,7 @@ mod benches {
     #[ignore]
     #[test]
     fn bench_tivm_btree_map() {
-        init();
+        env_logger::init();
         let mut rng = thread_rng();
 
         for log_n in 20..26 {

From 6f0eb4f8a6ac1aa011cb235469305342b29edf13 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Tue, 14 Nov 2023 18:07:16 +0800
Subject: [PATCH 08/19] add detailed running time of steps in logup's prover

---
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index 33877029b9..b8084226c9 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -28,6 +28,8 @@ use std::{
     iter,
     ops::{Mul, MulAssign},
 };
+use std::time::Instant;
+use ark_std::{end_timer, start_timer};
 
 use crate::arithmetic::parallelize_internal;
 use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
@@ -110,13 +112,16 @@ impl<F: FieldExt> Argument<F> {
 
         // TODO: construction of BTreeMap for a large vector
         // compute m(X)
+        let tivm_time = start_timer!(|| "table index value mapping");
         let table_index_value_mapping: BTreeMap<C::Scalar, usize> = compressed_table_expression
             .iter()
             .take(params.n() as usize - blinding_factors - 1)
             .enumerate()
             .map(|(i, &x)| (x, i))
             .collect();
+        end_timer!(tivm_time);
 
+        let m_time = start_timer!(|| "m(X) values");
         let m_values: Vec<F> = {
             use std::sync::atomic::{AtomicU64, Ordering};
             use std::sync::RwLock;
@@ -140,6 +145,7 @@ impl<F: FieldExt> Argument<F> {
                 .map(|mi| F::from(mi.load(Ordering::Relaxed) as u64))
                 .collect()
         };
+        end_timer!(m_time);
         let m_values = pk.vk.domain.lagrange_from_vec(m_values);
 
         #[cfg(feature = "sanity-checks")]
@@ -222,6 +228,7 @@ impl<C: CurveAffine> Prepared<C> {
         */
 
         // ∑ 1/(φ_i(X))
+        let inputs_log_drv_time = start_timer!(|| "inputs_log_derivative");
         let mut inputs_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
         for compressed_input_expression in self.compressed_inputs_expressions.iter() {
             let mut input_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
@@ -245,8 +252,10 @@ impl<C: CurveAffine> Prepared<C> {
                 inputs_log_derivatives[i] += input_log_derivatives[i];
             }
         }
+        end_timer!(inputs_log_drv_time);
 
         // 1 / τ(X)
+        let table_log_drv_time = start_timer!(|| "table log derivative");
         let mut table_log_derivatives = vec![C::Scalar::zero(); params.n() as usize];
         parallelize(
             &mut table_log_derivatives,
@@ -262,7 +271,9 @@ impl<C: CurveAffine> Prepared<C> {
 
         // TODO: use parallelized batch invert
         table_log_derivatives.iter_mut().batch_invert();
+        end_timer!(table_log_drv_time);
 
+        let log_drv_diff_time = start_timer!(|| "log derivatives diff");
         // (Σ 1/(φ_i(X)) - m(X) / τ(X))
         let mut log_derivatives_diff = vec![C::Scalar::zero(); params.n() as usize];
         parallelize(&mut log_derivatives_diff, |log_derivatives_diff, start| {
@@ -276,10 +287,12 @@ impl<C: CurveAffine> Prepared<C> {
                 *log_derivative_diff = *fi - *mi * *ti;
             }
         });
+        end_timer!(log_drv_diff_time);
 
         // Compute the evaluations of the lookup grand sum polynomial
         // over our domain, starting with phi[0] = 0
         let blinding_factors = pk.vk.cs.blinding_factors();
+        let phi_time = start_timer!(|| "par_scan(log_derivatives_diff)");
         let phi = {
             // parallelized version of log_derivatives_diff.scan()
             let active_size = params.n() as usize - blinding_factors;
@@ -318,6 +331,7 @@ impl<C: CurveAffine> Prepared<C> {
                 .chain((0..blinding_factors).map(|_| C::Scalar::random(&mut rng)))
                 .collect::<Vec<_>>()
         };
+        end_timer!(phi_time);
         assert_eq!(phi.len(), params.n() as usize);
         let phi = pk.vk.domain.lagrange_from_vec(phi);
 

From 71c818610d363236add7218fc895095840c7d1f3 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Tue, 14 Nov 2023 18:08:51 +0800
Subject: [PATCH 09/19] fmt

---
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index b8084226c9..e9c9f115aa 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -13,6 +13,7 @@ use crate::{
     },
     transcript::{EncodedChallenge, TranscriptWrite},
 };
+use ark_std::{end_timer, start_timer};
 use blake2b_simd::Hash;
 use ff::{BitViewSized, PrimeField, PrimeFieldBits};
 use group::{
@@ -22,14 +23,13 @@ use group::{
 use rand_core::RngCore;
 use rayon::current_num_threads;
 use std::collections::{BTreeSet, HashSet};
+use std::time::Instant;
 use std::{any::TypeId, convert::TryInto, num::ParseIntError, ops::Index};
 use std::{
     collections::BTreeMap,
     iter,
     ops::{Mul, MulAssign},
 };
-use std::time::Instant;
-use ark_std::{end_timer, start_timer};
 
 use crate::arithmetic::parallelize_internal;
 use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};

From 97a7f98c6ae27f11e7ec3bc8ad3aae023a400acd Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Wed, 15 Nov 2023 18:26:39 +0800
Subject: [PATCH 10/19] add more log hooks

---
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index e9c9f115aa..59e11878a3 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -77,6 +77,14 @@ impl<F: FieldExt> Argument<F> {
         C: CurveAffine<ScalarExt = F>,
         C::Curve: Mul<F, Output = C::Curve> + MulAssign<F>,
     {
+        let prepare_time = start_timer!(|| format!(
+            "prepare m(X) (inputs={:?}, table={})",
+            self.inputs_expressions
+                .iter()
+                .map(|e| e.len())
+                .collect::<Vec<usize>>(),
+            self.table_expressions.len()
+        ));
         // Closure to get values of expressions and compress them
         let compress_expressions = |expressions: &[Expression<C::Scalar>]| {
             let compressed_expression = expressions
@@ -197,6 +205,8 @@ impl<F: FieldExt> Argument<F> {
         // write commitment of m(X) to transcript
         transcript.write_point(m_commitment)?;
 
+        end_timer!(prepare_time);
+
         Ok(Prepared {
             compressed_inputs_expressions,
             compressed_table_expression,
@@ -226,6 +236,7 @@ impl<C: CurveAffine> Prepared<C> {
             LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
             RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))))
         */
+        let lookup_commit_time = start_timer!(|| "commit_grand_sum");
 
         // ∑ 1/(φ_i(X))
         let inputs_log_drv_time = start_timer!(|| "inputs_log_derivative");
@@ -244,8 +255,10 @@ impl<C: CurveAffine> Prepared<C> {
                     }
                 },
             );
+            let inputs_inv_time = start_timer!(|| "batch invert");
             // TODO: use parallelized batch invert
             input_log_derivatives.iter_mut().batch_invert();
+            end_timer!(inputs_inv_time);
 
             // TODO: remove last blinders from this
             for i in 0..params.n() as usize {
@@ -269,8 +282,10 @@ impl<C: CurveAffine> Prepared<C> {
             },
         );
 
+        let table_inv_time = start_timer!(|| "table batch invert");
         // TODO: use parallelized batch invert
         table_log_derivatives.iter_mut().batch_invert();
+        end_timer!(table_inv_time);
         end_timer!(table_log_drv_time);
 
         let log_drv_diff_time = start_timer!(|| "log derivatives diff");
@@ -400,6 +415,7 @@ impl<C: CurveAffine> Prepared<C> {
         // Hash grand sum commitment
         transcript.write_point(phi_commitment)?;
 
+        end_timer!(lookup_commit_time);
         Ok(Committed {
             m_poly: pk.vk.domain.lagrange_to_coeff(self.m_values),
             phi_poly: pk.vk.domain.lagrange_to_coeff(phi),

From e837827b7c01a37daea5ec89ba6b80234ca80ba7 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Wed, 15 Nov 2023 18:51:34 +0800
Subject: [PATCH 11/19] more running time logs

---
 halo2_proofs/src/plonk/prover.rs | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/halo2_proofs/src/plonk/prover.rs b/halo2_proofs/src/plonk/prover.rs
index 2766406943..7a66afac72 100644
--- a/halo2_proofs/src/plonk/prover.rs
+++ b/halo2_proofs/src/plonk/prover.rs
@@ -34,6 +34,7 @@ use crate::{
     poly::batch_invert_assigned,
     transcript::{EncodedChallenge, TranscriptWrite},
 };
+use ark_std::{end_timer, start_timer};
 use group::prime::PrimeCurveAffine;
 
 /// This creates a proof for the provided `circuit` when given the public
@@ -533,8 +534,11 @@ where
         .iter()
         .zip(advice.iter())
         .map(|(instance, advice)| -> Result<Vec<_>, Error> {
+            let lookup_get_mx_time =
+                start_timer!(|| format!("get m(X) in {} lookups", pk.vk.cs.lookups.len()));
             // Construct and commit to permuted values for each lookup
-            pk.vk
+            let mx = pk
+                .vk
                 .cs
                 .lookups
                 .iter()
@@ -552,7 +556,10 @@ where
                         transcript,
                     )
                 })
-                .collect()
+                .collect();
+            end_timer!(lookup_get_mx_time);
+
+            mx
         })
         .collect::<Result<Vec<_>, _>>()?;
 
@@ -582,6 +589,7 @@ where
         })
         .collect::<Result<Vec<_>, _>>()?;
 
+    let lookup_commit_time = start_timer!(|| "lookup commit grand sum");
     let lookups: Vec<Vec<mv_lookup::prover::Committed<Scheme::Curve>>> = lookups
         .into_iter()
         .map(|lookups| -> Result<Vec<_>, _> {
@@ -592,6 +600,7 @@ where
                 .collect::<Result<Vec<_>, _>>()
         })
         .collect::<Result<Vec<_>, _>>()?;
+    end_timer!(lookup_commit_time);
 
     // Commit to the vanishing argument's random polynomial for blinding h(x_3)
     let vanishing = vanishing::Argument::commit(params, domain, &mut rng, transcript)?;

From ee957d8fe67e1b803481e7cc84813f5c9c949eec Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Wed, 15 Nov 2023 19:21:48 +0800
Subject: [PATCH 12/19] use par invert

---
 halo2_proofs/src/arithmetic.rs             | 6 ++++++
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 8 +++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/halo2_proofs/src/arithmetic.rs b/halo2_proofs/src/arithmetic.rs
index e820e80f06..fa8c801aee 100644
--- a/halo2_proofs/src/arithmetic.rs
+++ b/halo2_proofs/src/arithmetic.rs
@@ -532,6 +532,12 @@ where
     q
 }
 
+pub fn par_invert<F: Field>(values: &mut [F]) { // inverts `values` in place, in parallel
+    parallelize(values, |values, _start| { // each sub-slice is inverted independently
+        values.batch_invert(); // NOTE(review): zeros presumably untouched — confirm
+    });
+}
+
 /// This simple utility function will parallelize an operation that is to be
 /// performed over a mutable slice.
 pub(crate) fn parallelize_internal<T: Send, F: Fn(&mut [T], usize) + Send + Sync + Clone>(
diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index 59e11878a3..defe88b192 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -31,7 +31,7 @@ use std::{
     ops::{Mul, MulAssign},
 };
 
-use crate::arithmetic::parallelize_internal;
+use crate::arithmetic::{par_invert, parallelize_internal};
 use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 
 #[derive(Debug)]
@@ -256,8 +256,7 @@ impl<C: CurveAffine> Prepared<C> {
                 },
             );
             let inputs_inv_time = start_timer!(|| "batch invert");
-            // TODO: use parallelized batch invert
-            input_log_derivatives.iter_mut().batch_invert();
+            par_invert(input_log_derivatives.as_mut_slice());
             end_timer!(inputs_inv_time);
 
             // TODO: remove last blinders from this
@@ -283,8 +282,7 @@ impl<C: CurveAffine> Prepared<C> {
         );
 
         let table_inv_time = start_timer!(|| "table batch invert");
-        // TODO: use parallelized batch invert
-        table_log_derivatives.iter_mut().batch_invert();
+        par_invert(table_log_derivatives.as_mut_slice());
         end_timer!(table_inv_time);
         end_timer!(table_log_drv_time);
 

From d901014b088601b994a427a22ef280f217214b5a Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Wed, 15 Nov 2023 20:23:06 +0800
Subject: [PATCH 13/19] use sorted-vector to store how many times a table
 element occurs in input

---
 halo2_proofs/src/plonk/mv_lookup/prover.rs | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/halo2_proofs/src/plonk/mv_lookup/prover.rs b/halo2_proofs/src/plonk/mv_lookup/prover.rs
index defe88b192..6a90f8ade2 100644
--- a/halo2_proofs/src/plonk/mv_lookup/prover.rs
+++ b/halo2_proofs/src/plonk/mv_lookup/prover.rs
@@ -32,7 +32,9 @@ use std::{
 };
 
 use crate::arithmetic::{par_invert, parallelize_internal};
-use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
+use rayon::prelude::{
+    IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator, ParallelSliceMut,
+};
 
 #[derive(Debug)]
 pub(in crate::plonk) struct Prepared<C: CurveAffine> {
@@ -118,15 +120,15 @@ impl<F: FieldExt> Argument<F> {
 
         let blinding_factors = pk.vk.cs.blinding_factors();
 
-        // TODO: construction of BTreeMap for a large vector
         // compute m(X)
         let tivm_time = start_timer!(|| "table index value mapping");
-        let table_index_value_mapping: BTreeMap<C::Scalar, usize> = compressed_table_expression
+        let mut sorted_table_with_indices = compressed_table_expression
             .iter()
             .take(params.n() as usize - blinding_factors - 1)
             .enumerate()
-            .map(|(i, &x)| (x, i))
-            .collect();
+            .map(|(i, t)| (t, i))
+            .collect::<Vec<_>>();
+        sorted_table_with_indices.par_sort_by_key(|(&t, _)| t);
         end_timer!(tivm_time);
 
         let m_time = start_timer!(|| "m(X) values");
@@ -140,10 +142,12 @@ impl<F: FieldExt> Argument<F> {
                     .par_iter()
                     .take(params.n() as usize - blinding_factors - 1)
                     .try_for_each(|fi| -> Result<(), Error> {
-                        let index = table_index_value_mapping
-                            .get(fi)
-                            .ok_or(Error::ConstraintSystemFailure)?;
-                        m_values[*index].fetch_add(1, Ordering::Relaxed);
+                        let index = sorted_table_with_indices
+                            .binary_search_by_key(&fi, |&(t, _)| t)
+                            .map_err(|_| Error::ConstraintSystemFailure)?;
+                        let index = sorted_table_with_indices[index].1;
+
+                        m_values[index].fetch_add(1, Ordering::Relaxed);
                         Ok(())
                     });
             }

From a52c09adf033a4bbb8b6a795482a2f08272851f6 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 1 Dec 2023 16:08:21 +0800
Subject: [PATCH 14/19] par the process to get inputs_inv_sum

---
 halo2_proofs/src/plonk/evaluation.rs | 98 ++++++++++++----------------
 1 file changed, 43 insertions(+), 55 deletions(-)

diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index 677b375270..732354dfdb 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -18,6 +18,8 @@ use group::{
     ff::{BatchInvert, Field},
     Curve,
 };
+use rayon::prelude::IntoParallelIterator;
+use rayon::prelude::ParallelIterator;
 use std::any::TypeId;
 use std::convert::TryInto;
 use std::num::ParseIntError;
@@ -511,69 +513,56 @@ impl<C: CurveAffine> Evaluator<C> {
 
                     // For lookups, compute inputs_inv_sum = ∑ 1 / (f_i(X) + beta)
                     // The outer vector has capacity self.lookups.len()
-                    // The middle vector has capacity domain.extended_len()
-                    // The inner vector has capacity
-                    let inputs_inv_sum: Vec<Vec<Vec<_>>> = lookups
+                    let inputs_inv_sum: Vec<Vec<_>> = self
+                        .lookups
                         .iter()
-                        .enumerate()
-                        .map(|(n, _)| {
-                            let (inputs_lookup_evaluator, _) = &self.lookups[n];
+                        .map(|lookup| {
+                            let (inputs_lookup_evaluator, _) = lookup;
                             let mut inputs_eval_data: Vec<_> = inputs_lookup_evaluator
                                 .iter()
                                 .map(|input_lookup_evaluator| input_lookup_evaluator.instance())
                                 .collect();
 
                             let mut inputs_values_for_extended_domain: Vec<C::Scalar> =
-                                Vec::with_capacity(self.lookups[n].0.len() << domain.k());
-                            for idx in 0..size {
-                                // For each compressed input column, evaluate at ω^i and add beta
-                                // This is a vector of length self.lookups[n].0.len()
-                                let inputs_values: Vec<C::ScalarExt> = inputs_lookup_evaluator
+                                inputs_lookup_evaluator
                                     .iter()
                                     .zip(inputs_eval_data.iter_mut())
-                                    .map(|(input_lookup_evaluator, input_eval_data)| {
-                                        input_lookup_evaluator.evaluate(
-                                            input_eval_data,
-                                            fixed,
-                                            advice,
-                                            instance,
-                                            challenges,
-                                            &beta,
-                                            &gamma,
-                                            &theta,
-                                            &y,
-                                            &C::ScalarExt::zero(),
-                                            idx,
-                                            rot_scale,
-                                            isize,
-                                        )
+                                    .flat_map(|(input_lookup_evaluator, input_eval_data)| {
+                                        (0..size).into_iter().map(|idx| {
+                                            input_lookup_evaluator.evaluate(
+                                                input_eval_data,
+                                                fixed,
+                                                advice,
+                                                instance,
+                                                challenges,
+                                                &beta,
+                                                &gamma,
+                                                &theta,
+                                                &y,
+                                                &C::ScalarExt::zero(),
+                                                idx,
+                                                rot_scale,
+                                                isize,
+                                            )
+                                        })
                                     })
                                     .collect();
 
-                                inputs_values_for_extended_domain.extend_from_slice(&inputs_values);
-                            }
-
-                            let num_threads = rayon::current_num_threads();
-                            let chunk_size =
-                                (inputs_values_for_extended_domain.len() + num_threads - 1)
-                                    / num_threads;
-                            rayon::scope(|scope| {
-                                for chunk in
-                                    inputs_values_for_extended_domain.chunks_mut(chunk_size)
-                                {
-                                    scope.spawn(|_| {
-                                        chunk.batch_invert();
-                                    })
-                                }
+                            parallelize(&mut inputs_values_for_extended_domain, |values, _| {
+                                values.batch_invert();
                             });
 
-                            // The outer vector has capacity domain.extended_len()
-                            // The inner vector has capacity self.lookups[n].0.len()
-                            let inputs_inv_sums: Vec<Vec<_>> = inputs_values_for_extended_domain
-                                .chunks_exact(self.lookups[n].0.len())
-                                .map(|c| c.to_vec())
-                                .collect();
-                            inputs_inv_sums
+                            let inputs_len = inputs_lookup_evaluator.len();
+
+                            (0..size)
+                                .into_par_iter()
+                                .map(|i| {
+                                    (0..inputs_len)
+                                        .into_iter()
+                                        .map(|j| inputs_values_for_extended_domain[j * size + i])
+                                        .fold(C::Scalar::zero(), |acc, x| acc + x)
+                                })
+                                .collect::<Vec<_>>()
                         })
                         .collect();
 
@@ -596,9 +585,11 @@ impl<C: CurveAffine> Evaluator<C> {
                             φ_i(X) = f_i(X) + beta
                             τ(X) = t(X) + beta
                             LHS = τ(X) * Π(φ_i(X)) * (ϕ(gX) - ϕ(X))
-                            RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))))
+                            RHS = τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))        (1)
                                 = (τ(X) * Π(φ_i(X)) * ∑ 1/(φ_i(X))) - Π(φ_i(X)) * m(X)
                                 = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))
+
+                                = ∑_i τ(X) * Π_{j != i} φ_j(X) - m(X) * Π(φ_i(X))        (2)
                         */
                         parallelize(&mut values, |values, start| {
                             let (inputs_lookup_evaluator, table_lookup_evaluator) =
@@ -641,10 +632,7 @@ impl<C: CurveAffine> Evaluator<C> {
                                     .fold(C::Scalar::one(), |acc, input| acc * input);
 
                                 // f_i(X) + beta at ω^idx
-                                let fi_inverses = &inputs_inv_sum[n][idx];
-                                let inputs_inv_sum = fi_inverses
-                                    .iter()
-                                    .fold(C::Scalar::zero(), |acc, input| acc + input);
+                                let inv_sum: C::Scalar = inputs_inv_sum[n][idx];
 
                                 // t(X) + beta
                                 let table_value = table_lookup_evaluator.evaluate(
@@ -674,7 +662,7 @@ impl<C: CurveAffine> Evaluator<C> {
                                     //   τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))))
                                     // = (τ(X) * Π(φ_i(X)) * ∑ 1/(φ_i(X))) - Π(φ_i(X)) * m(X)
                                     // = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))
-                                    inputs_prod * (table_value * inputs_inv_sum - m_coset[idx])
+                                    inputs_prod * (table_value * inv_sum - m_coset[idx])
                                 };
 
                                 // phi[0] = 0

From 7a0b3fca0649f55125a78feea2553af734b2bbf7 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 1 Dec 2023 17:11:25 +0800
Subject: [PATCH 15/19] use par

---
 halo2_proofs/src/plonk/evaluation.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index 732354dfdb..db573afe58 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -528,7 +528,7 @@ impl<C: CurveAffine> Evaluator<C> {
                                     .iter()
                                     .zip(inputs_eval_data.iter_mut())
                                     .flat_map(|(input_lookup_evaluator, input_eval_data)| {
-                                        (0..size).into_iter().map(|idx| {
+                                        (0..size).into_par_iter().map(|idx| {
                                             input_lookup_evaluator.evaluate(
                                                 input_eval_data,
                                                 fixed,

From 35e5b8853ddb9d3e112b687737244d0670122161 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 1 Dec 2023 17:32:44 +0800
Subject: [PATCH 16/19] fix par

---
 halo2_proofs/src/plonk/evaluation.rs | 38 ++++++++++++++++++----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index db573afe58..3aedfa0f68 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -518,17 +518,21 @@ impl<C: CurveAffine> Evaluator<C> {
                         .iter()
                         .map(|lookup| {
                             let (inputs_lookup_evaluator, _) = lookup;
-                            let mut inputs_eval_data: Vec<_> = inputs_lookup_evaluator
-                                .iter()
-                                .map(|input_lookup_evaluator| input_lookup_evaluator.instance())
-                                .collect();
 
-                            let mut inputs_values_for_extended_domain: Vec<C::Scalar> =
-                                inputs_lookup_evaluator
-                                    .iter()
-                                    .zip(inputs_eval_data.iter_mut())
-                                    .flat_map(|(input_lookup_evaluator, input_eval_data)| {
-                                        (0..size).into_par_iter().map(|idx| {
+                            let inputs_values_for_extended_domain: Vec<Vec<C::Scalar>> = (0..size)
+                                .into_par_iter()
+                                .map(|idx| {
+                                    let mut inputs_eval_data: Vec<_> = inputs_lookup_evaluator
+                                        .iter()
+                                        .map(|input_lookup_evaluator| {
+                                            input_lookup_evaluator.instance()
+                                        })
+                                        .collect();
+
+                                    inputs_lookup_evaluator
+                                        .iter()
+                                        .zip(inputs_eval_data.iter_mut())
+                                        .map(|(input_lookup_evaluator, input_eval_data)| {
                                             input_lookup_evaluator.evaluate(
                                                 input_eval_data,
                                                 fixed,
@@ -545,7 +549,13 @@ impl<C: CurveAffine> Evaluator<C> {
                                                 isize,
                                             )
                                         })
-                                    })
+                                        .collect()
+                                })
+                                .collect();
+                            let mut inputs_values_for_extended_domain: Vec<C::Scalar> =
+                                inputs_values_for_extended_domain
+                                    .into_iter()
+                                    .flatten()
                                     .collect();
 
                             parallelize(&mut inputs_values_for_extended_domain, |values, _| {
@@ -557,9 +567,9 @@ impl<C: CurveAffine> Evaluator<C> {
                             (0..size)
                                 .into_par_iter()
                                 .map(|i| {
-                                    (0..inputs_len)
-                                        .into_iter()
-                                        .map(|j| inputs_values_for_extended_domain[j * size + i])
+                                    inputs_values_for_extended_domain
+                                        [i * inputs_len..(i + 1) * inputs_len]
+                                        .iter()
                                         .fold(C::Scalar::zero(), |acc, x| acc + x)
                                 })
                                 .collect::<Vec<_>>()

From fab29cc8c48d431d0369ddffe84e05cce4faf1e4 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 1 Dec 2023 18:58:16 +0800
Subject: [PATCH 17/19] add feature to skip inv sums

---
 halo2_proofs/src/plonk/evaluation.rs | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/halo2_proofs/src/plonk/evaluation.rs b/halo2_proofs/src/plonk/evaluation.rs
index 3aedfa0f68..0dbf62e56c 100644
--- a/halo2_proofs/src/plonk/evaluation.rs
+++ b/halo2_proofs/src/plonk/evaluation.rs
@@ -23,7 +23,9 @@ use rayon::prelude::ParallelIterator;
 use std::any::TypeId;
 use std::convert::TryInto;
 use std::num::ParseIntError;
+use std::process::exit;
 use std::slice;
+use std::sync::atomic::fence;
 use std::{
     collections::BTreeMap,
     iter,
@@ -513,6 +515,7 @@ impl<C: CurveAffine> Evaluator<C> {
 
                     // For lookups, compute inputs_inv_sum = ∑ 1 / (f_i(X) + beta)
                     // The outer vector has capacity self.lookups.len()
+                    #[cfg(not(feature = "logup_skip_inv"))]
                     let inputs_inv_sum: Vec<Vec<_>> = self
                         .lookups
                         .iter()
@@ -641,9 +644,6 @@ impl<C: CurveAffine> Evaluator<C> {
                                     .iter()
                                     .fold(C::Scalar::one(), |acc, input| acc * input);
 
-                                // f_i(X) + beta at ω^idx
-                                let inv_sum: C::Scalar = inputs_inv_sum[n][idx];
-
                                 // t(X) + beta
                                 let table_value = table_lookup_evaluator.evaluate(
                                     &mut table_eval_data,
@@ -668,7 +668,25 @@ impl<C: CurveAffine> Evaluator<C> {
                                     table_value * inputs_prod * (phi_coset[r_next] - phi_coset[idx])
                                 };
 
+                                #[cfg(feature = "logup_skip_inv")]
+                                let rhs = {
+                                    //   τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))
+                                    // = ∑_i τ(X) * Π_{j != i} φ_j(X) - m(X) * Π(φ_i(X))
+                                    let inputs = (0..inputs_value.len())
+                                        .map(|i| {
+                                            inputs_value
+                                                .iter()
+                                                .enumerate()
+                                                .filter(|(j, _)| *j != i)
+                                                .fold(C::Scalar::one(), |acc, (_, x)| acc * *x)
+                                        })
+                                        .fold(C::Scalar::zero(), |acc, x| acc + x);
+                                    inputs * table_value - inputs_prod * m_coset[idx]
+                                };
+                                #[cfg(not(feature = "logup_skip_inv"))]
                                 let rhs = {
+                                    // ∑ 1 / (f_i(X) + beta) at ω^idx
+                                    let inv_sum: C::Scalar = inputs_inv_sum[n][idx];
                                     //   τ(X) * Π(φ_i(X)) * (∑ 1/(φ_i(X)) - m(X) / τ(X))))
                                     // = (τ(X) * Π(φ_i(X)) * ∑ 1/(φ_i(X))) - Π(φ_i(X)) * m(X)
                                     // = Π(φ_i(X)) * (τ(X) * ∑ 1/(φ_i(X)) - m(X))

From 4e6a0e6d7e5e64954acbcdc95a108f809d0faa13 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 1 Dec 2023 19:47:31 +0800
Subject: [PATCH 18/19] add new feature flag

---
 halo2_proofs/Cargo.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/halo2_proofs/Cargo.toml b/halo2_proofs/Cargo.toml
index 1aab9fdd02..2603f6f6b9 100644
--- a/halo2_proofs/Cargo.toml
+++ b/halo2_proofs/Cargo.toml
@@ -86,7 +86,7 @@ rand_core = { version = "0.6", default-features = false, features = ["getrandom"
 getrandom = { version = "0.2", features = ["js"] }
 
 [features]
-default = ["batch", "gwc"]
+default = ["batch", "gwc", "logup_skip_inv"]
 dev-graph = ["plotters", "tabbycat"]
 gadget-traces = ["backtrace"]
 sanity-checks = []
@@ -98,6 +98,7 @@ phase-check = []
 profile = ["ark-std/print-trace"]
 counter = ["lazy_static"]
 mock-batch-inv = []
+logup_skip_inv = []
 
 [lib]
 bench = false

From 6a3e41ea5eb0b779553a11a460f8746cd58e9869 Mon Sep 17 00:00:00 2001
From: kunxian xia <xiakunxian130@gmail.com>
Date: Fri, 8 Dec 2023 17:15:25 +0800
Subject: [PATCH 19/19] fix clippy error

---
 halo2_proofs/tests/plonk_api.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/halo2_proofs/tests/plonk_api.rs b/halo2_proofs/tests/plonk_api.rs
index b6a166dcff..1f1f0a2aa7 100644
--- a/halo2_proofs/tests/plonk_api.rs
+++ b/halo2_proofs/tests/plonk_api.rs
@@ -769,6 +769,7 @@ fn plonk_api() {
         >(verifier_params, pk.get_vk(), &proof[..]);
     }
 
+    #[allow(unused)]
     fn test_plonk_api_ipa() {
         use halo2_proofs::poly::ipa::commitment::{IPACommitmentScheme, ParamsIPA};
         use halo2_proofs::poly::ipa::multiopen::{ProverIPA, VerifierIPA};