diff --git a/Cargo.toml b/Cargo.toml
index 1418cb9a..52c646cc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,7 @@ resolver = "2"
 
 [profile.dev]
 opt-level = 3
-debug = 1              # change to 0 or 2 for more or less debug info
+debug = 2              # change to 0 or 1 for less debug info
 overflow-checks = true
 incremental = true
 
diff --git a/halo2-base/src/gates/circuit/builder.rs b/halo2-base/src/gates/circuit/builder.rs
index 15c92c84..df37faa1 100644
--- a/halo2-base/src/gates/circuit/builder.rs
+++ b/halo2-base/src/gates/circuit/builder.rs
@@ -222,6 +222,7 @@ impl<F: ScalarField> BaseCircuitBuilder<F> {
         for lm in &mut self.lookup_manager {
             lm.clear();
         }
+        self.assigned_instances.iter_mut().for_each(|c| c.clear());
     }
 
     /// Returns a mutable reference to the [Context] of a gate thread. Spawns a new thread for the given phase, if none exists.
diff --git a/halo2-base/src/lib.rs b/halo2-base/src/lib.rs
index f7a6ccfc..adfbaf36 100644
--- a/halo2-base/src/lib.rs
+++ b/halo2-base/src/lib.rs
@@ -12,6 +12,7 @@
 use std::any::TypeId;
 
 use getset::CopyGetters;
+use itertools::Itertools;
 // Different memory allocator options:
 #[cfg(feature = "jemallocator")]
 use jemallocator::Jemalloc;
@@ -445,6 +446,12 @@ impl<F: ScalarField> Context<F> {
         self.last().unwrap()
     }
 
+    /// Assigns each value in `c` as a constant, in order, and returns the corresponding assigned cells.
+    /// * `c`: the list of constant values to be assigned
+    pub fn load_constants(&mut self, c: &[F]) -> Vec<AssignedValue<F>> {
+        c.iter().map(|v| self.load_constant(*v)).collect_vec()
+    }
+
     /// Assigns the 0 value to a new cell or returns a previously assigned zero cell from `zero_cell`.
     pub fn load_zero(&mut self) -> AssignedValue<F> {
         if let Some(zcell) = &self.zero_cell {
diff --git a/halo2-base/src/poseidon/hasher/mod.rs b/halo2-base/src/poseidon/hasher/mod.rs
index 73478568..2608cc36 100644
--- a/halo2-base/src/poseidon/hasher/mod.rs
+++ b/halo2-base/src/poseidon/hasher/mod.rs
@@ -107,6 +107,11 @@ impl<F: ScalarField, const T: usize, const RATE: usize> PoseidonHasher<F, T, RAT
         self.consts.get_or_init(|| PoseidonHasherConsts::<F, T, RATE>::new(ctx, gate, &self.spec));
     }
 
+    /// Clear all consts by taking the cached value out of `self.consts`, leaving it empty.
+    pub fn clear(&mut self) {
+        self.consts.take();
+    }
+
     fn empty_hash(&self) -> &AssignedValue<F> {
         self.consts.get().unwrap().empty_hash()
     }
@@ -187,21 +192,21 @@ impl<F: ScalarField, const T: usize, const RATE: usize> PoseidonHasher<F, T, RAT
     pub fn hash_fix_len_array(
         &self,
         ctx: &mut Context<F>,
-        range: &impl RangeInstructions<F>,
+        gate: &impl GateInstructions<F>,
         inputs: &[AssignedValue<F>],
     ) -> AssignedValue<F>
     where
         F: BigPrimeField,
     {
         let mut state = self.init_state().clone();
-        fix_len_array_squeeze(ctx, range.gate(), inputs, &mut state, &self.spec)
+        fix_len_array_squeeze(ctx, gate, inputs, &mut state, &self.spec)
     }
 
     /// Constrains and returns hashes of inputs in a compact format. Length of `compact_inputs` should be determined at compile time.
     pub fn hash_compact_input(
         &self,
         ctx: &mut Context<F>,
-        range: &impl RangeInstructions<F>,
+        gate: &impl GateInstructions<F>,
         compact_inputs: &[PoseidonCompactInput<F, RATE>],
     ) -> Vec<PoseidonCompactOutput<F>>
     where
@@ -212,18 +217,18 @@ impl<F: ScalarField, const T: usize, const RATE: usize> PoseidonHasher<F, T, RAT
         for input in compact_inputs {
             // Assume this is the last row of a logical input:
             // Depending on if len == RATE.
-            let is_full = range.gate().is_equal(ctx, input.len, Constant(F::from(RATE as u64)));
+            let is_full = gate.is_equal(ctx, input.len, Constant(F::from(RATE as u64)));
             // Case 1: if len != RATE.
-            state.permutation(ctx, range.gate(), &input.inputs, Some(input.len), &self.spec);
+            state.permutation(ctx, gate, &input.inputs, Some(input.len), &self.spec);
             // Case 2: if len == RATE, an extra permuation is needed for squeeze.
             let mut state_2 = state.clone();
-            state_2.permutation(ctx, range.gate(), &[], None, &self.spec);
+            state_2.permutation(ctx, gate, &[], None, &self.spec);
             // Select the result of case 1/2 depending on if len == RATE.
-            let hash = range.gate().select(ctx, state_2.s[1], state.s[1], is_full);
+            let hash = gate.select(ctx, state_2.s[1], state.s[1], is_full);
             outputs.push(PoseidonCompactOutput { hash, is_final: input.is_final });
             // Reset state to init_state if this is the end of a logical input.
             // TODO: skip this if this is the last row.
-            state.select(ctx, range.gate(), input.is_final, self.init_state());
+            state.select(ctx, gate, input.is_final, self.init_state());
         }
         outputs
     }
diff --git a/halo2-base/src/poseidon/hasher/tests/hasher.rs b/halo2-base/src/poseidon/hasher/tests/hasher.rs
index 4066c2f5..2023c4ec 100644
--- a/halo2-base/src/poseidon/hasher/tests/hasher.rs
+++ b/halo2-base/src/poseidon/hasher/tests/hasher.rs
@@ -97,7 +97,7 @@ fn hasher_compact_inputs_compatiblity_verification<
         let native_result = native_sponge.squeeze();
         native_results.push(native_result);
     }
-    let compact_outputs = hasher.hash_compact_input(ctx, range, &compact_inputs);
+    let compact_outputs = hasher.hash_compact_input(ctx, range.gate(), &compact_inputs);
     let mut output_offset = 0;
     for (compact_output, compact_input) in compact_outputs.iter().zip(compact_inputs) {
         // into() doesn't work if ! is in the beginning in the bool expression...
diff --git a/halo2-base/src/poseidon/mod.rs b/halo2-base/src/poseidon/mod.rs
index 3e3398d8..896b863c 100644
--- a/halo2-base/src/poseidon/mod.rs
+++ b/halo2-base/src/poseidon/mod.rs
@@ -107,7 +107,7 @@ impl<'a, F: ScalarField, const T: usize, const RATE: usize> PoseidonInstructions
     {
         self.hasher.hash_fix_len_array(
             ctx,
-            self.range_chip,
+            self.range_chip.gate(),
             inputs.bytes().map(|sb| *sb.as_ref()).as_ref(),
         )
     }
diff --git a/halo2-base/src/virtual_region/copy_constraints.rs b/halo2-base/src/virtual_region/copy_constraints.rs
index da991fb9..3a405f1e 100644
--- a/halo2-base/src/virtual_region/copy_constraints.rs
+++ b/halo2-base/src/virtual_region/copy_constraints.rs
@@ -77,9 +77,21 @@ impl<F: Field + Ord> CopyConstraintManager<F> {
     /// Adds external raw Halo2 cell to `self.assigned_advices` and returns a new virtual cell that can be
     /// used as a tag (but will not be re-assigned). The returned [ContextCell] will have `type_id` the `TypeId::of::<Cell>()`.
     pub fn load_external_cell(&mut self, cell: Cell) -> ContextCell {
+        self.load_external_cell_impl(Some(cell))
+    }
+
+    /// Mock to load an external cell with value `v` for base circuit simulation; the returned cell is not added to `self.assigned_advices`. If any mock external cell is loaded, calling [assign_raw] will panic.
+    pub fn mock_external_assigned(&mut self, v: F) -> AssignedValue<F> {
+        let context_cell = self.load_external_cell_impl(None);
+        AssignedValue { value: Assigned::Trivial(v), cell: Some(context_cell) }
+    }
+
+    fn load_external_cell_impl(&mut self, cell: Option<Cell>) -> ContextCell {
         let context_cell = ContextCell::new(TypeId::of::<Cell>(), 0, self.external_cell_count);
         self.external_cell_count += 1;
-        self.assigned_advices.insert(context_cell, cell);
+        if let Some(cell) = cell {
+            self.assigned_advices.insert(context_cell, cell);
+        }
         context_cell
     }
 
diff --git a/hashes/zkevm/Cargo.toml b/hashes/zkevm/Cargo.toml
index 4814145a..213d4c2b 100644
--- a/hashes/zkevm/Cargo.toml
+++ b/hashes/zkevm/Cargo.toml
@@ -12,9 +12,13 @@ itertools = "0.11"
 lazy_static = "1.4"
 log = "0.4"
 num-bigint = { version = "0.4" }
-halo2-base = { path = "../../halo2-base", default-features = false }
+halo2-base = { path = "../../halo2-base", default-features = false, features = [
+    "test-utils",
+] }
 rayon = "1.7"
 sha3 = "0.10.8"
+pse-poseidon = { git = "https://github.com/axiom-crypto/pse-poseidon.git" }
+getset = "0.1.2"
 
 [dev-dependencies]
 criterion = "0.3"
diff --git a/hashes/zkevm/src/keccak/coprocessor/circuit/leaf.rs b/hashes/zkevm/src/keccak/coprocessor/circuit/leaf.rs
new file mode 100644
index 00000000..6d4169e4
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/circuit/leaf.rs
@@ -0,0 +1,501 @@
+use std::cell::RefCell;
+
+use crate::{
+    keccak::{
+        coprocessor::{
+            encode::{
+                get_words_to_witness_multipliers, num_poseidon_absorb_per_keccak_f,
+                num_word_per_witness,
+            },
+            output::{dummy_circuit_output, KeccakCircuitOutput},
+            param::*,
+        },
+        vanilla::{
+            keccak_packed_multi::get_num_keccak_f, param::*, witness::multi_keccak,
+            KeccakAssignedRow, KeccakCircuitConfig, KeccakConfigParams,
+        },
+    },
+    util::eth_types::Field,
+};
+use getset::{CopyGetters, Getters};
+use halo2_base::{
+    gates::{
+        circuit::{builder::BaseCircuitBuilder, BaseCircuitParams, BaseConfig},
+        flex_gate::MultiPhaseThreadBreakPoints,
+        GateInstructions, RangeInstructions,
+    },
+    halo2_proofs::{
+        circuit::{Layouter, SimpleFloorPlanner},
+        plonk::{Circuit, ConstraintSystem, Error},
+    },
+    poseidon::hasher::{
+        spec::OptimizedPoseidonSpec, PoseidonCompactInput, PoseidonCompactOutput, PoseidonHasher,
+    },
+    safe_types::{SafeBool, SafeTypeChip},
+    AssignedValue, Context,
+    QuantumCell::Constant,
+};
+use itertools::Itertools;
+
+/// Keccak Coprocessor Leaf Circuit: a Keccak circuit over `inputs` plus a base circuit that publishes the outputs as instances.
+#[derive(Getters)]
+pub struct KeccakCoprocessorLeafCircuit<F: Field> {
+    inputs: Vec<Vec<u8>>,
+
+    /// Parameters of this circuit. The same parameters always construct the same circuit.
+    #[getset(get = "pub")]
+    params: KeccakCoprocessorLeafCircuitParams,
+
+    base_circuit_builder: RefCell<BaseCircuitBuilder<F>>,
+    hasher: RefCell<PoseidonHasher<F, POSEIDON_T, POSEIDON_RATE>>,
+}
+
+/// Parameters of KeccakCoprocessorLeafCircuit.
+#[derive(Default, Clone, CopyGetters)]
+pub struct KeccakCoprocessorLeafCircuitParams {
+    /// This circuit has 2^k rows.
+    #[getset(get_copy = "pub")]
+    k: usize,
+    /// Number of unusable rows withheld by Halo2.
+    #[getset(get_copy = "pub")]
+    num_unusable_row: usize,
+    /// The bits of lookup table for RangeChip.
+    #[getset(get_copy = "pub")]
+    lookup_bits: usize,
+    /// Max number of keccak_f this circuit can accept. The circuit can at most process `capacity` inputs
+    /// with < NUM_BYTES_TO_ABSORB bytes or an input with `capacity` * NUM_BYTES_TO_ABSORB - 1 bytes.
+    #[getset(get_copy = "pub")]
+    capacity: usize,
+    /// If true, publish raw outputs. Otherwise, publish Poseidon commitment of raw outputs.
+    #[getset(get_copy = "pub")]
+    publish_raw_outputs: bool,
+
+    // Derived parameters of sub-circuits.
+    pub keccak_circuit_params: KeccakConfigParams,
+    pub base_circuit_params: BaseCircuitParams,
+}
+
+impl KeccakCoprocessorLeafCircuitParams {
+    /// Create a new KeccakCoprocessorLeafCircuitParams and derive the sub-circuit parameters. Panics if `num_unusable_row >= 2^k` or if the derived `rows_per_round` is 0.
+    pub fn new(
+        k: usize,
+        num_unusable_row: usize,
+        lookup_bits: usize,
+        capacity: usize,
+        publish_raw_outputs: bool,
+    ) -> Self {
+        assert!(1 << k > num_unusable_row, "Number of unusable rows must be less than 2^k");
+        let max_rows = (1 << k) - num_unusable_row;
+        // Derived from [crate::keccak::vanilla::keccak_packed_multi::get_keccak_capacity].
+        let rows_per_round = max_rows / (capacity * (NUM_ROUNDS + 1) + 1 + NUM_WORDS_TO_ABSORB);
+        assert!(rows_per_round > 0, "Not enough rows for the specified capacity");
+        let keccak_circuit_params = KeccakConfigParams { k: k as u32, rows_per_round };
+        let base_circuit_params = BaseCircuitParams {
+            k,
+            lookup_bits: Some(lookup_bits),
+            num_instance_columns: if publish_raw_outputs {
+                OUTPUT_NUM_COL_RAW
+            } else {
+                OUTPUT_NUM_COL_COMMIT
+            },
+            ..Default::default()
+        };
+        Self {
+            k,
+            num_unusable_row,
+            lookup_bits,
+            capacity,
+            publish_raw_outputs,
+            keccak_circuit_params,
+            base_circuit_params,
+        }
+    }
+}
+
+/// Circuit::Config for Keccak Coprocessor Leaf Circuit: the configs of its base and Keccak sub-circuits.
+#[derive(Clone)]
+pub struct KeccakCoprocessorLeafConfig<F: Field> {
+    pub base_circuit_config: BaseConfig<F>,
+    pub keccak_circuit_config: KeccakCircuitConfig<F>,
+}
+
+impl<F: Field> Circuit<F> for KeccakCoprocessorLeafCircuit<F> {
+    type Config = KeccakCoprocessorLeafConfig<F>;
+    type FloorPlanner = SimpleFloorPlanner;
+    type Params = KeccakCoprocessorLeafCircuitParams;
+
+    fn params(&self) -> Self::Params {
+        self.params.clone()
+    }
+
+    /// Not supported by this circuit: calling this panics via `unimplemented!()`.
+    fn without_witnesses(&self) -> Self {
+        unimplemented!()
+    }
+
+    /// Configures the base circuit and the Keccak circuit from the sub-circuit parameters carried by `params`.
+    fn configure_with_params(meta: &mut ConstraintSystem<F>, params: Self::Params) -> Self::Config {
+        let base_circuit_params = params.base_circuit_params;
+        let base_circuit_config =
+            BaseCircuitBuilder::configure_with_params(meta, base_circuit_params);
+        let keccak_circuit_config = KeccakCircuitConfig::new(meta, params.keccak_circuit_params);
+        Self::Config { base_circuit_config, keccak_circuit_config }
+    }
+
+    fn configure(_: &mut ConstraintSystem<F>) -> Self::Config {
+        unreachable!("You must use configure_with_params");
+    }
+
+    fn synthesize(
+        &self,
+        config: Self::Config,
+        mut layouter: impl Layouter<F>,
+    ) -> Result<(), Error> {
+        let k = self.params.k;
+        config.keccak_circuit_config.load_aux_tables(&mut layouter, k as u32)?;
+        let mut keccak_assigned_rows: Vec<KeccakAssignedRow<'_, F>> = Vec::default();
+        layouter.assign_region(
+            || "keccak circuit",
+            |mut region| {
+                let (keccak_rows, _) = multi_keccak::<F>(
+                    &self.inputs,
+                    Some(self.params.capacity),
+                    self.params.keccak_circuit_params,
+                );
+                keccak_assigned_rows =
+                    config.keccak_circuit_config.assign(&mut region, &keccak_rows);
+                Ok(())
+            },
+        )?;
+
+        // Base circuit witness generation from the cells assigned by the Keccak circuit above.
+        let loaded_keccak_fs = self.load_keccak_assigned_rows(keccak_assigned_rows);
+        self.generate_base_circuit_witnesses(&loaded_keccak_fs);
+
+        self.base_circuit_builder.borrow().synthesize(config.base_circuit_config, layouter)?;
+
+        // Reset the circuit to the initial state so synthesize could be called multiple times.
+        self.base_circuit_builder.borrow_mut().clear();
+        self.hasher.borrow_mut().clear();
+        Ok(())
+    }
+}
+
+/// Witnesses of a keccak_f which are necessary to be loaded into halo2-lib.
+#[derive(Clone, Copy, Debug, CopyGetters, Getters)]
+pub struct LoadedKeccakF<F: Field> {
+    /// bytes_left of the first row of the first round of this keccak_f. This could be used to determine the length of the input.
+    #[getset(get_copy = "pub")]
+    pub(crate) bytes_left: AssignedValue<F>,
+    /// Input words (u64) of this keccak_f.
+    #[getset(get = "pub")]
+    pub(crate) word_values: [AssignedValue<F>; NUM_WORDS_TO_ABSORB],
+    /// Whether this keccak_f is the last of a logical input. is_final/hash_lo/hash_hi come from the first row of the last round(NUM_ROUNDS).
+    #[getset(get_copy = "pub")]
+    pub(crate) is_final: SafeBool<F>,
+    /// The lower 16 bytes (in big-endian, 16..) of the output of this keccak_f. NOTE(review): originally "16 bits"; presumably bytes — confirm.
+    #[getset(get_copy = "pub")]
+    pub(crate) hash_lo: AssignedValue<F>,
+    /// The high 16 bytes (in big-endian, ..16) of the output of this keccak_f. NOTE(review): originally "16 bits"; presumably bytes — confirm.
+    #[getset(get_copy = "pub")]
+    pub(crate) hash_hi: AssignedValue<F>,
+}
+
+impl<F: Field> LoadedKeccakF<F> {
+    pub fn new(
+        bytes_left: AssignedValue<F>,
+        word_values: [AssignedValue<F>; NUM_WORDS_TO_ABSORB],
+        is_final: SafeBool<F>,
+        hash_lo: AssignedValue<F>,
+        hash_hi: AssignedValue<F>,
+    ) -> Self {
+        Self { bytes_left, word_values, is_final, hash_lo, hash_hi }
+    }
+}
+
+impl<F: Field> KeccakCoprocessorLeafCircuit<F> {
+    /// Create a new KeccakCoprocessorLeafCircuit. Panics if `inputs` need more than `params.capacity` keccak_f in total.
+    pub fn new(
+        inputs: Vec<Vec<u8>>,
+        params: KeccakCoprocessorLeafCircuitParams,
+        witness_gen_only: bool,
+    ) -> Self {
+        let input_size = inputs.iter().map(|input| get_num_keccak_f(input.len())).sum::<usize>();
+        assert!(input_size <= params.capacity, "Input size exceeds capacity");
+        let mut base_circuit_builder = BaseCircuitBuilder::new(witness_gen_only);
+        base_circuit_builder.set_params(params.base_circuit_params.clone());
+        Self {
+            inputs,
+            params,
+            base_circuit_builder: RefCell::new(base_circuit_builder),
+            hasher: RefCell::new(create_hasher()),
+        }
+    }
+
+    /// Get break points of the inner BaseCircuitBuilder.
+    pub fn base_circuit_break_points(&self) -> MultiPhaseThreadBreakPoints {
+        self.base_circuit_builder.borrow().break_points()
+    }
+
+    /// Set break points of the inner BaseCircuitBuilder. Takes `&self`: the builder is mutated through its `RefCell`.
+    pub fn set_base_circuit_break_points(&self, break_points: MultiPhaseThreadBreakPoints) {
+        self.base_circuit_builder.borrow_mut().set_break_points(break_points);
+    }
+
+    pub fn update_base_circuit_params(&mut self, params: &BaseCircuitParams) {
+        self.params.base_circuit_params = params.clone();
+        self.base_circuit_builder.borrow_mut().set_params(params.clone());
+    }
+
+    /// Simulate witness generation of the base circuit to determine BaseCircuitParams because the number of columns
+    /// of the base circuit can only be known after witness generation.
+    pub fn calculate_base_circuit_params(
+        params: &KeccakCoprocessorLeafCircuitParams,
+    ) -> BaseCircuitParams {
+        // Create a simulation circuit with no inputs and mocked Keccak cells to calculate base circuit parameters.
+        let simulation_circuit = Self::new(vec![], params.clone(), false);
+        let loaded_keccak_fs = simulation_circuit.mock_load_keccak_assigned_rows();
+        simulation_circuit.generate_base_circuit_witnesses(&loaded_keccak_fs);
+
+        let base_circuit_params = simulation_circuit
+            .base_circuit_builder
+            .borrow_mut()
+            .calculate_params(Some(params.num_unusable_row));
+        // Clear the builder before it is dropped to prevent drop warnings.
+        simulation_circuit.base_circuit_builder.borrow_mut().clear();
+
+        base_circuit_params
+    }
+
+    /// Mock loading Keccak assigned rows from Keccak circuit: returns `capacity` entries made of zero-valued mock external cells. This function doesn't create any witnesses/constraints.
+    fn mock_load_keccak_assigned_rows(&self) -> Vec<LoadedKeccakF<F>> {
+        let base_circuit_builder = self.base_circuit_builder.borrow();
+        let mut copy_manager = base_circuit_builder.core().copy_manager.lock().unwrap();
+        (0..self.params.capacity)
+            .map(|_| LoadedKeccakF {
+                bytes_left: copy_manager.mock_external_assigned(F::ZERO),
+                word_values: core::array::from_fn(|_| copy_manager.mock_external_assigned(F::ZERO)),
+                is_final: SafeTypeChip::unsafe_to_bool(
+                    copy_manager.mock_external_assigned(F::ZERO),
+                ),
+                hash_lo: copy_manager.mock_external_assigned(F::ZERO),
+                hash_hi: copy_manager.mock_external_assigned(F::ZERO),
+            })
+            .collect_vec()
+    }
+
+    /// Load needed witnesses into halo2-lib from keccak assigned rows, one `LoadedKeccakF` per group of NUM_ROUNDS + 1 rounds. This function doesn't create any witnesses/constraints.
+    fn load_keccak_assigned_rows(
+        &self,
+        assigned_rows: Vec<KeccakAssignedRow<'_, F>>,
+    ) -> Vec<LoadedKeccakF<F>> {
+        let rows_per_round = self.params.keccak_circuit_params.rows_per_round;
+        let base_circuit_builder = self.base_circuit_builder.borrow();
+        let mut copy_manager = base_circuit_builder.core().copy_manager.lock().unwrap();
+        assigned_rows
+            .into_iter()
+            .step_by(rows_per_round)
+            // Only the first row of each round is kept (step_by above). Skip the first round which is dummy.
+            .skip(1)
+            .chunks(NUM_ROUNDS + 1)
+            .into_iter()
+            .map(|rounds| {
+                let mut rounds = rounds.collect_vec();
+                assert_eq!(rounds.len(), NUM_ROUNDS + 1);
+                let bytes_left = copy_manager.load_external_assigned(rounds[0].bytes_left.clone());
+                let output_row = rounds.pop().unwrap();
+                let word_values = core::array::from_fn(|i| {
+                    let assigned_row = &rounds[i];
+                    copy_manager.load_external_assigned(assigned_row.word_value.clone())
+                });
+                let is_final = SafeTypeChip::unsafe_to_bool(
+                    copy_manager.load_external_assigned(output_row.is_final),
+                );
+                let hash_lo = copy_manager.load_external_assigned(output_row.hash_lo);
+                let hash_hi = copy_manager.load_external_assigned(output_row.hash_hi);
+                LoadedKeccakF { bytes_left, word_values, is_final, hash_lo, hash_hi }
+            })
+            .collect()
+    }
+
+    /// Generate witnesses of the base circuit: encode the loaded keccak_fs into lookup keys, combine them with the Keccak results and publish the outputs.
+    fn generate_base_circuit_witnesses(&self, loaded_keccak_fs: &[LoadedKeccakF<F>]) {
+        let range = self.base_circuit_builder.borrow().range_chip();
+        let gate = range.gate();
+        let circuit_final_outputs = {
+            let mut base_circuit_builder_mut = self.base_circuit_builder.borrow_mut();
+            let ctx = base_circuit_builder_mut.main(0);
+            let mut hasher = self.hasher.borrow_mut();
+            hasher.initialize_consts(ctx, gate);
+
+            let lookup_key_per_keccak_f =
+                encode_inputs_from_keccak_fs(ctx, gate, &hasher, loaded_keccak_fs);
+            Self::generate_circuit_final_outputs(
+                ctx,
+                gate,
+                &lookup_key_per_keccak_f,
+                loaded_keccak_fs,
+            )
+        };
+        self.publish_outputs(&circuit_final_outputs);
+    }
+
+    /// Combine lookup keys and Keccak results to generate final outputs of the circuit, one output per keccak_f.
+    fn generate_circuit_final_outputs(
+        ctx: &mut Context<F>,
+        gate: &impl GateInstructions<F>,
+        lookup_key_per_keccak_f: &[PoseidonCompactOutput<F>],
+        loaded_keccak_fs: &[LoadedKeccakF<F>],
+    ) -> Vec<KeccakCircuitOutput<AssignedValue<F>>> {
+        let KeccakCircuitOutput {
+            key: dummy_key_val,
+            hash_lo: dummy_keccak_val_lo,
+            hash_hi: dummy_keccak_val_hi,
+        } = dummy_circuit_output::<F>();
+
+        // Dummy row selected for keccak_fs with is_final = false. The corresponding logical input is empty.
+        let dummy_key_witness = ctx.load_constant(dummy_key_val);
+        let dummy_keccak_lo_witness = ctx.load_constant(dummy_keccak_val_lo);
+        let dummy_keccak_hi_witness = ctx.load_constant(dummy_keccak_val_hi);
+
+        let mut circuit_final_outputs = Vec::with_capacity(loaded_keccak_fs.len());
+        for (compact_output, loaded_keccak_f) in
+            lookup_key_per_keccak_f.iter().zip(loaded_keccak_fs)
+        {
+            let is_final = AssignedValue::from(loaded_keccak_f.is_final);
+            let key = gate.select(ctx, *compact_output.hash(), dummy_key_witness, is_final);
+            let hash_lo =
+                gate.select(ctx, loaded_keccak_f.hash_lo, dummy_keccak_lo_witness, is_final);
+            let hash_hi =
+                gate.select(ctx, loaded_keccak_f.hash_hi, dummy_keccak_hi_witness, is_final);
+            circuit_final_outputs.push(KeccakCircuitOutput { key, hash_lo, hash_hi });
+        }
+        circuit_final_outputs
+    }
+
+    /// Publish outputs of the circuit as public instances: raw outputs if `publish_raw_outputs`, otherwise one Poseidon commitment of them.
+    fn publish_outputs(&self, outputs: &[KeccakCircuitOutput<AssignedValue<F>>]) {
+        // The length of outputs should always be equal to params.capacity.
+        assert_eq!(outputs.len(), self.params.capacity);
+        if !self.params.publish_raw_outputs {
+            let range_chip = self.base_circuit_builder.borrow().range_chip();
+            let gate = range_chip.gate();
+            let mut base_circuit_builder_mut = self.base_circuit_builder.borrow_mut();
+            let ctx = base_circuit_builder_mut.main(0);
+
+            // TODO: wrap this into a function which should be shared with App circuits.
+            let output_commitment = self.hasher.borrow().hash_fix_len_array(
+                ctx,
+                gate,
+                &outputs
+                    .iter()
+                    .flat_map(|output| [output.key, output.hash_lo, output.hash_hi])
+                    .collect_vec(),
+            );
+
+            let assigned_instances = &mut base_circuit_builder_mut.assigned_instances;
+            // The commitment should be in the first row.
+            assert!(assigned_instances[OUTPUT_COL_IDX_COMMIT].is_empty());
+            assigned_instances[OUTPUT_COL_IDX_COMMIT].push(output_commitment);
+        } else {
+            let assigned_instances = &mut self.base_circuit_builder.borrow_mut().assigned_instances;
+
+            // Outputs should be at the top of the instance columns.
+            assert!(assigned_instances[OUTPUT_COL_IDX_KEY].is_empty());
+            assert!(assigned_instances[OUTPUT_COL_IDX_HASH_LO].is_empty());
+            assert!(assigned_instances[OUTPUT_COL_IDX_HASH_HI].is_empty());
+            for output in outputs {
+                assigned_instances[OUTPUT_COL_IDX_KEY].push(output.key);
+                assigned_instances[OUTPUT_COL_IDX_HASH_LO].push(output.hash_lo);
+                assigned_instances[OUTPUT_COL_IDX_HASH_HI].push(output.hash_hi);
+            }
+        }
+    }
+}
+
+fn create_hasher<F: Field>() -> PoseidonHasher<F, POSEIDON_T, POSEIDON_RATE> {
+    // Construct in-circuit Poseidon hasher from a spec built with the POSEIDON_* constants.
+    let spec = OptimizedPoseidonSpec::<F, POSEIDON_T, POSEIDON_RATE>::new::<
+        POSEIDON_R_F,
+        POSEIDON_R_P,
+        POSEIDON_SECURE_MDS,
+    >();
+    PoseidonHasher::<F, POSEIDON_T, POSEIDON_RATE>::new(spec)
+}
+
+/// Encode raw inputs from Keccak circuit witnesses into lookup keys.
+///
+/// Each element in the return value corresponds to a Keccak chunk. If is_final = true, this element is the lookup key of the corresponding logical input.
+pub fn encode_inputs_from_keccak_fs<F: Field>(
+    ctx: &mut Context<F>,
+    gate: &impl GateInstructions<F>,
+    initialized_hasher: &PoseidonHasher<F, POSEIDON_T, POSEIDON_RATE>,
+    loaded_keccak_fs: &[LoadedKeccakF<F>],
+) -> Vec<PoseidonCompactOutput<F>> {
+    // Circuit parameters
+    let num_poseidon_absorb_per_keccak_f = num_poseidon_absorb_per_keccak_f::<F>();
+    let num_word_per_witness = num_word_per_witness::<F>();
+    let num_witness_per_keccak_f = POSEIDON_RATE * num_poseidon_absorb_per_keccak_f;
+
+    // Constant witnesses
+    let rate_const = ctx.load_constant(F::from(POSEIDON_RATE as u64));
+    let one_const = ctx.load_constant(F::ONE);
+    let zero_const = ctx.load_zero();
+    let multipliers_val = get_words_to_witness_multipliers::<F>()
+        .into_iter()
+        .map(|multiplier| Constant(multiplier))
+        .collect_vec();
+
+    let compact_input_len = loaded_keccak_fs.len() * num_poseidon_absorb_per_keccak_f;
+    let mut compact_inputs = Vec::with_capacity(compact_input_len);
+    let mut last_is_final = one_const;
+    for loaded_keccak_f in loaded_keccak_fs {
+        // If this keccak_f is the last of a logical input.
+        let is_final = loaded_keccak_f.is_final;
+        let mut poseidon_absorb_data = Vec::with_capacity(num_witness_per_keccak_f);
+
+        // First witness of a keccak_f: [<length_placeholder>, word_values[0], word_values[1], ...]
+        // <length_placeholder> is the length of the input if this is the first keccak_f of a logical input. Otherwise 0.
+        let mut words = Vec::with_capacity(num_word_per_witness);
+        let input_bytes_len = gate.mul(ctx, loaded_keccak_f.bytes_left, last_is_final);
+        words.push(input_bytes_len);
+        words.extend_from_slice(&loaded_keccak_f.word_values[0..(num_word_per_witness - 1)]);
+        let first_witness = gate.inner_product(ctx, words, multipliers_val.clone());
+        poseidon_absorb_data.push(first_witness);
+
+        // Turn every num_word_per_witness of the remaining words into a witness.
+        for words in &loaded_keccak_f
+            .word_values
+            .into_iter()
+            .skip(num_word_per_witness - 1)
+            .chunks(num_word_per_witness)
+        {
+            let mut words = words.collect_vec();
+            words.resize(num_word_per_witness, zero_const);
+            let witness = gate.inner_product(ctx, words, multipliers_val.clone());
+            poseidon_absorb_data.push(witness);
+        }
+        // Pad 0s to make sure poseidon_absorb_data.len() % RATE == 0.
+        poseidon_absorb_data.resize(num_witness_per_keccak_f, zero_const);
+        for (i, poseidon_absorb) in poseidon_absorb_data.chunks(POSEIDON_RATE).enumerate() {
+            compact_inputs.push(PoseidonCompactInput::new(
+                poseidon_absorb.try_into().unwrap(),
+                if i + 1 == num_poseidon_absorb_per_keccak_f {
+                    is_final
+                } else {
+                    SafeTypeChip::unsafe_to_bool(zero_const)
+                },
+                rate_const,
+            ));
+        }
+        last_is_final = is_final.into();
+    }
+
+    let compact_outputs = initialized_hasher.hash_compact_input(ctx, gate, &compact_inputs);
+
+    compact_outputs
+        .into_iter()
+        .skip(num_poseidon_absorb_per_keccak_f - 1)
+        .step_by(num_poseidon_absorb_per_keccak_f)
+        .collect_vec()
+}
diff --git a/hashes/zkevm/src/keccak/coprocessor/circuit/mod.rs b/hashes/zkevm/src/keccak/coprocessor/circuit/mod.rs
new file mode 100644
index 00000000..6a66fc13
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/circuit/mod.rs
@@ -0,0 +1,3 @@
+pub mod leaf;
+#[cfg(test)]
+mod tests;
diff --git a/hashes/zkevm/src/keccak/coprocessor/circuit/tests/leaf.rs b/hashes/zkevm/src/keccak/coprocessor/circuit/tests/leaf.rs
new file mode 100644
index 00000000..57d1378f
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/circuit/tests/leaf.rs
@@ -0,0 +1,217 @@
+use crate::{
+    halo2_proofs::{
+        dev::MockProver,
+        halo2curves::bn256::Bn256,
+        halo2curves::bn256::Fr,
+        plonk::{keygen_pk, keygen_vk},
+    },
+    keccak::coprocessor::{
+        circuit::leaf::{KeccakCoprocessorLeafCircuit, KeccakCoprocessorLeafCircuitParams},
+        output::{calculate_circuit_outputs_commit, multi_inputs_to_circuit_outputs},
+    },
+};
+
+use halo2_base::{
+    halo2_proofs::poly::kzg::commitment::ParamsKZG,
+    utils::testing::{check_proof_with_instances, gen_proof_with_instances},
+};
+use itertools::Itertools;
+use rand_core::OsRng;
+
+#[test]
+fn test_mock_leaf_circuit_raw_outputs() {
+    let k: usize = 18;
+    let num_unusable_row: usize = 109;
+    let lookup_bits: usize = 4;
+    let capacity: usize = 10;
+    let publish_raw_outputs: bool = true; // instances below are the raw key / hash_lo / hash_hi columns
+
+    let inputs = vec![
+        (0u8..200).collect::<Vec<_>>(), // 200 bytes
+        vec![],                         // empty input
+        (0u8..1).collect::<Vec<_>>(),   // single byte
+        (0u8..135).collect::<Vec<_>>(), // 135 bytes: one output (see coprocessor/tests/output.rs)
+        (0u8..136).collect::<Vec<_>>(), // 136 bytes: two outputs (see coprocessor/tests/output.rs)
+        (0u8..200).collect::<Vec<_>>(), // same 200 bytes again
+    ];
+
+    let mut params = KeccakCoprocessorLeafCircuitParams::new(
+        k,
+        num_unusable_row,
+        lookup_bits,
+        capacity,
+        publish_raw_outputs,
+    );
+    let base_circuit_params =
+        KeccakCoprocessorLeafCircuit::<Fr>::calculate_base_circuit_params(&params);
+    params.base_circuit_params = base_circuit_params;
+    let circuit = KeccakCoprocessorLeafCircuit::<Fr>::new(inputs.clone(), params.clone(), false);
+    let circuit_outputs = multi_inputs_to_circuit_outputs::<Fr>(&inputs, params.capacity());
+
+    let instances = vec![
+        circuit_outputs.iter().map(|o| o.key).collect_vec(), // instance column 0: lookup keys
+        circuit_outputs.iter().map(|o| o.hash_lo).collect_vec(), // instance column 1: hash_lo
+        circuit_outputs.iter().map(|o| o.hash_hi).collect_vec(), // instance column 2: hash_hi
+    ];
+
+    let prover = MockProver::<Fr>::run(k as u32, &circuit, instances).unwrap();
+    prover.assert_satisfied();
+}
+
+#[test]
+fn test_prove_leaf_circuit_raw_outputs() {
+    let _ = env_logger::builder().is_test(true).try_init();
+
+    let k: usize = 18;
+    let num_unusable_row: usize = 109;
+    let lookup_bits: usize = 4;
+    let capacity: usize = 10;
+    let publish_raw_outputs: bool = true; // instances below are the raw key / hash_lo / hash_hi columns
+
+    let inputs = vec![]; // the keygen circuit is built from no inputs
+    let mut circuit_params = KeccakCoprocessorLeafCircuitParams::new(
+        k,
+        num_unusable_row,
+        lookup_bits,
+        capacity,
+        publish_raw_outputs,
+    );
+    let base_circuit_params =
+        KeccakCoprocessorLeafCircuit::<Fr>::calculate_base_circuit_params(&circuit_params);
+    circuit_params.base_circuit_params = base_circuit_params;
+    let circuit = KeccakCoprocessorLeafCircuit::<Fr>::new(inputs, circuit_params.clone(), false);
+
+    let params = ParamsKZG::<Bn256>::setup(k as u32, OsRng);
+
+    let vk = keygen_vk(&params, &circuit).unwrap();
+    let pk = keygen_pk(&params, vk, &circuit).unwrap();
+
+    let inputs = vec![
+        (0u8..200).collect::<Vec<_>>(), // 200 bytes
+        vec![],                         // empty input
+        (0u8..1).collect::<Vec<_>>(),   // single byte
+        (0u8..135).collect::<Vec<_>>(), // 135 bytes: one output (see coprocessor/tests/output.rs)
+        (0u8..136).collect::<Vec<_>>(), // 136 bytes: two outputs (see coprocessor/tests/output.rs)
+        (0u8..200).collect::<Vec<_>>(), // same 200 bytes again
+    ];
+    let circuit_outputs = multi_inputs_to_circuit_outputs::<Fr>(&inputs, circuit_params.capacity());
+    let instances: Vec<Vec<Fr>> = vec![
+        circuit_outputs.iter().map(|o| o.key).collect_vec(), // instance column 0: lookup keys
+        circuit_outputs.iter().map(|o| o.hash_lo).collect_vec(), // instance column 1: hash_lo
+        circuit_outputs.iter().map(|o| o.hash_hi).collect_vec(), // instance column 2: hash_hi
+    ];
+
+    let break_points = circuit.base_circuit_break_points(); // taken from the keygen circuit ...
+    let circuit = KeccakCoprocessorLeafCircuit::<Fr>::new(inputs, circuit_params, true);
+    circuit.set_base_circuit_break_points(break_points); // ... and installed on the proving circuit
+
+    let proof = gen_proof_with_instances(
+        &params,
+        &pk,
+        circuit,
+        instances.iter().map(|f| f.as_slice()).collect_vec().as_slice(),
+    );
+    check_proof_with_instances(
+        &params,
+        pk.get_vk(),
+        &proof,
+        instances.iter().map(|f| f.as_slice()).collect_vec().as_slice(),
+        true,
+    );
+}
+
+#[test]
+fn test_mock_leaf_circuit_commit() {
+    let k: usize = 18;
+    let num_unusable_row: usize = 109;
+    let lookup_bits: usize = 4;
+    let capacity: usize = 10;
+    let publish_raw_outputs: bool = false; // the only instance below is the commitment of all outputs
+
+    let inputs = vec![
+        (0u8..200).collect::<Vec<_>>(), // 200 bytes
+        vec![],                         // empty input
+        (0u8..1).collect::<Vec<_>>(),   // single byte
+        (0u8..135).collect::<Vec<_>>(), // 135 bytes: one output (see coprocessor/tests/output.rs)
+        (0u8..136).collect::<Vec<_>>(), // 136 bytes: two outputs (see coprocessor/tests/output.rs)
+        (0u8..200).collect::<Vec<_>>(), // same 200 bytes again
+    ];
+
+    let mut params = KeccakCoprocessorLeafCircuitParams::new(
+        k,
+        num_unusable_row,
+        lookup_bits,
+        capacity,
+        publish_raw_outputs,
+    );
+    let base_circuit_params =
+        KeccakCoprocessorLeafCircuit::<Fr>::calculate_base_circuit_params(&params);
+    params.base_circuit_params = base_circuit_params;
+    let circuit = KeccakCoprocessorLeafCircuit::<Fr>::new(inputs.clone(), params.clone(), false);
+    let circuit_outputs = multi_inputs_to_circuit_outputs::<Fr>(&inputs, params.capacity());
+
+    let instances = vec![vec![calculate_circuit_outputs_commit(&circuit_outputs)]];
+
+    let prover = MockProver::<Fr>::run(k as u32, &circuit, instances).unwrap();
+    prover.assert_satisfied();
+}
+
+#[test]
+fn test_prove_leaf_circuit_commit() {
+    let _ = env_logger::builder().is_test(true).try_init();
+
+    let k: usize = 18;
+    let num_unusable_row: usize = 109;
+    let lookup_bits: usize = 4;
+    let capacity: usize = 10;
+    let publish_raw_outputs: bool = false; // the only instance below is the commitment of all outputs
+
+    let inputs = vec![]; // the keygen circuit is built from no inputs
+    let mut circuit_params = KeccakCoprocessorLeafCircuitParams::new(
+        k,
+        num_unusable_row,
+        lookup_bits,
+        capacity,
+        publish_raw_outputs,
+    );
+    let base_circuit_params =
+        KeccakCoprocessorLeafCircuit::<Fr>::calculate_base_circuit_params(&circuit_params);
+    circuit_params.base_circuit_params = base_circuit_params;
+    let circuit = KeccakCoprocessorLeafCircuit::<Fr>::new(inputs, circuit_params.clone(), false);
+
+    let params = ParamsKZG::<Bn256>::setup(k as u32, OsRng);
+
+    let vk = keygen_vk(&params, &circuit).unwrap();
+    let pk = keygen_pk(&params, vk, &circuit).unwrap();
+
+    let inputs = vec![
+        (0u8..200).collect::<Vec<_>>(), // 200 bytes
+        vec![],                         // empty input
+        (0u8..1).collect::<Vec<_>>(),   // single byte
+        (0u8..135).collect::<Vec<_>>(), // 135 bytes: one output (see coprocessor/tests/output.rs)
+        (0u8..136).collect::<Vec<_>>(), // 136 bytes: two outputs (see coprocessor/tests/output.rs)
+        (0u8..200).collect::<Vec<_>>(), // same 200 bytes again
+    ];
+
+    let break_points = circuit.base_circuit_break_points(); // taken from the keygen circuit ...
+    let circuit =
+        KeccakCoprocessorLeafCircuit::<Fr>::new(inputs.clone(), circuit_params.clone(), true);
+    circuit.set_base_circuit_break_points(break_points); // ... and installed on the proving circuit
+
+    let circuit_outputs = multi_inputs_to_circuit_outputs::<Fr>(&inputs, circuit_params.capacity());
+    let instances = vec![vec![calculate_circuit_outputs_commit(&circuit_outputs)]];
+
+    let proof = gen_proof_with_instances(
+        &params,
+        &pk,
+        circuit,
+        instances.iter().map(|f| f.as_slice()).collect_vec().as_slice(),
+    );
+    check_proof_with_instances(
+        &params,
+        pk.get_vk(),
+        &proof,
+        instances.iter().map(|f| f.as_slice()).collect_vec().as_slice(),
+        true,
+    );
+}
diff --git a/hashes/zkevm/src/keccak/coprocessor/circuit/tests/mod.rs b/hashes/zkevm/src/keccak/coprocessor/circuit/tests/mod.rs
new file mode 100644
index 00000000..4d6a7f45
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/circuit/tests/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(test)]
+pub mod leaf;
diff --git a/hashes/zkevm/src/keccak/coprocessor/encode.rs b/hashes/zkevm/src/keccak/coprocessor/encode.rs
new file mode 100644
index 00000000..4922b817
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/encode.rs
@@ -0,0 +1,116 @@
+use itertools::Itertools;
+
+use crate::{keccak::vanilla::param::*, util::eth_types::Field};
+
+use super::param::*;
+
+// TODO: Abstract this module into a trait for all coprocessor circuits.
+
+// Module to encode raw inputs into lookup keys for looking up keccak results. The encoding is
+// designed to be efficient in coprocessor circuits.
+
+/// Encodes native input `bytes` into the corresponding lookup key (a Poseidon hash of the packed input). This function can be considered as the spec of the encoding.
+pub fn encode_native_input<F: Field>(bytes: &[u8]) -> F {
+    assert!(NUM_BITS_PER_WORD <= u128::BITS as usize);
+    let multipliers: Vec<F> = get_words_to_witness_multipliers::<F>();
+    let num_word_per_witness = num_word_per_witness::<F>();
+    let len = bytes.len();
+
+    // Divide the input bytes into Keccak words (each word has NUM_BYTES_PER_WORD bytes, little-endian).
+    let mut words = bytes
+        .chunks(NUM_BYTES_PER_WORD)
+        .map(|chunk| {
+            let mut padded_chunk = [0; u128::BITS as usize / NUM_BITS_PER_BYTE];
+            padded_chunk[..chunk.len()].copy_from_slice(chunk);
+            u128::from_le_bytes(padded_chunk)
+        })
+        .collect_vec();
+    // An extra keccak_f is performed if len % NUM_BYTES_TO_ABSORB == 0 (this includes the empty input).
+    if len % NUM_BYTES_TO_ABSORB == 0 {
+        words.extend([0; NUM_WORDS_TO_ABSORB]);
+    }
+    // 1. Split Keccak words into keccak_fs (each keccak_f has NUM_WORDS_TO_ABSORB words).
+    // 2. Prepend an extra word to each keccak_f. In the first keccak_f, this word is the byte length of the input. Otherwise 0.
+    let words_per_chunk = words
+        .chunks(NUM_WORDS_TO_ABSORB)
+        .enumerate()
+        .map(|(i, chunk)| {
+            let mut padded_chunk = [0; NUM_WORDS_TO_ABSORB + 1];
+            padded_chunk[0] = if i == 0 { len as u128 } else { 0 };
+            padded_chunk[1..(chunk.len() + 1)].copy_from_slice(chunk);
+            padded_chunk
+        })
+        .collect_vec();
+    // Compress every num_word_per_witness words into a witness: sum(word[i] * multipliers[i]).
+    let witnesses_per_chunk = words_per_chunk
+        .iter()
+        .map(|chunk| {
+            chunk
+                .chunks(num_word_per_witness)
+                .map(|c| {
+                    c.iter().zip(multipliers.iter()).fold(F::ZERO, |acc, (word, multiplier)| {
+                        acc + F::from_u128(*word) * multiplier
+                    })
+                })
+                .collect_vec()
+        })
+        .collect_vec();
+    // Absorb witnesses keccak_f by keccak_f.
+    let mut native_poseidon_sponge =
+        pse_poseidon::Poseidon::<F, POSEIDON_T, POSEIDON_RATE>::new(POSEIDON_R_F, POSEIDON_R_P);
+    for witnesses in witnesses_per_chunk {
+        for absorbing in witnesses.chunks(POSEIDON_RATE) {
+            // To avoid absorbing witnesses of different keccak_fs together, pad 0s to make sure absorb.len() == RATE.
+            let mut padded_absorb = [F::ZERO; POSEIDON_RATE];
+            padded_absorb[..absorbing.len()].copy_from_slice(absorbing);
+            native_poseidon_sponge.update(&padded_absorb);
+        }
+    }
+    native_poseidon_sponge.squeeze()
+}
+
+// TODO: Add a function to encode a VarLenBytes into a lookup key. The function should be used by App Circuits.
+
+// For reference, when F is bn254::Fr:
+// num_word_per_witness = 3
+// num_witness_per_keccak_f = 6
+// num_poseidon_absorb_per_keccak_f = 3
+
+/// Number of Keccak words packed into one witness (a single field element) before it is absorbed by Poseidon.
+/// When `F` is `bn254::Fr`, this is 3.
+pub const fn num_word_per_witness<F: Field>() -> usize {
+    (F::CAPACITY as usize) / NUM_BITS_PER_WORD // whole NUM_BITS_PER_WORD-bit words that fit in F::CAPACITY bits
+}
+
+/// Number of witnesses needed to represent the inputs of one keccak_f.
+///
+/// Assumes the byte length of the raw input fits in a single Keccak word.
+///
+/// When `F` is `bn254::Fr`, this is 6.
+pub const fn num_witness_per_keccak_f<F: Field>() -> usize {
+    // Including the length-of-raw-input word, a keccak_f could have NUM_WORDS_TO_ABSORB + 1 words.
+    // ceil((NUM_WORDS_TO_ABSORB + 1) / num_word_per_witness) == NUM_WORDS_TO_ABSORB / num_word_per_witness + 1 (integer division)
+    NUM_WORDS_TO_ABSORB / num_word_per_witness::<F>() + 1
+}
+
+/// Number of Poseidon absorb rounds needed for the witnesses of one keccak_f.
+///
+/// When `F` is `bn254::Fr`, with our fixed `POSEIDON_RATE = 2`, this is 3.
+pub const fn num_poseidon_absorb_per_keccak_f<F: Field>() -> usize {
+    // Each absorb round consumes POSEIDON_RATE witnesses.
+    // ceil(num_witness_per_keccak_f / RATE), written as (n - 1) / RATE + 1 with integer division
+    (num_witness_per_keccak_f::<F>() - 1) / POSEIDON_RATE + 1
+}
+
+pub(crate) fn get_words_to_witness_multipliers<F: Field>() -> Vec<F> {
+    let num_word_per_witness = num_word_per_witness::<F>();
+    let mut multiplier_f = F::ONE;
+    let mut multipliers = Vec::with_capacity(num_word_per_witness);
+    multipliers.push(multiplier_f); // multipliers[0] = 1
+    let base_f = F::from_u128(1u128 << NUM_BITS_PER_WORD); // 2^NUM_BITS_PER_WORD
+    for _ in 1..num_word_per_witness {
+        multiplier_f *= base_f;
+        multipliers.push(multiplier_f); // multipliers[i] = base_f^i
+    }
+    multipliers
+}
diff --git a/hashes/zkevm/src/keccak/coprocessor/mod.rs b/hashes/zkevm/src/keccak/coprocessor/mod.rs
new file mode 100644
index 00000000..135a96b4
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/mod.rs
@@ -0,0 +1,10 @@
+/// Module of Keccak coprocessor circuit.
+pub mod circuit;
+/// Module of encoding raw inputs to coprocessor circuit lookup keys.
+pub mod encode;
+/// Module of Keccak coprocessor circuit output.
+pub mod output;
+/// Module of Keccak coprocessor circuit constant parameters.
+pub mod param;
+#[cfg(test)]
+mod tests;
diff --git a/hashes/zkevm/src/keccak/coprocessor/output.rs b/hashes/zkevm/src/keccak/coprocessor/output.rs
new file mode 100644
index 00000000..84d5f985
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/output.rs
@@ -0,0 +1,72 @@
+use super::{encode::encode_native_input, param::*};
+use crate::{keccak::vanilla::keccak_packed_multi::get_num_keccak_f, util::eth_types::Field};
+use itertools::Itertools;
+use sha3::{Digest, Keccak256};
+
+/// Witnesses to be exposed as circuit outputs: one lookup key plus the Keccak hash split into two 128-bit halves.
+#[derive(Clone, Copy, PartialEq, Debug)]
+pub struct KeccakCircuitOutput<E> {
+    /// Key for App circuits to lookup keccak hash.
+    pub key: E,
+    /// Low 128 bits of Keccak hash (in this file: the last 16 bytes of the big-endian digest).
+    pub hash_lo: E,
+    /// High 128 bits of Keccak hash (in this file: the first 16 bytes of the big-endian digest).
+    pub hash_hi: E,
+}
+
+/// Return the circuit outputs of a Keccak coprocessor circuit for the given inputs, padded with dummy outputs up to `capacity`.
+pub fn multi_inputs_to_circuit_outputs<F: Field>(
+    inputs: &[Vec<u8>],
+    capacity: usize,
+) -> Vec<KeccakCircuitOutput<F>> {
+    assert!(u128::BITS <= F::CAPACITY);
+    let mut outputs =
+        inputs.iter().flat_map(|input| input_to_circuit_outputs::<F>(input)).collect_vec();
+    assert!(outputs.len() <= capacity); // panics if the inputs produce more outputs than `capacity`
+    outputs.resize(capacity, dummy_circuit_output()); // fill the remaining slots with dummy outputs
+    outputs
+}
+
+/// Return corresponding circuit outputs of a native input in bytes. A logical input could produce multiple
+/// outputs (one per keccak_f). The last one is the lookup key and hash of the input. Other outputs are paddings
+/// which are the lookup key and hash of an empty input.
+pub fn input_to_circuit_outputs<F: Field>(bytes: &[u8]) -> Vec<KeccakCircuitOutput<F>> {
+    assert!(u128::BITS <= F::CAPACITY);
+    let len = bytes.len();
+    let num_keccak_f = get_num_keccak_f(len);
+
+    let mut output = Vec::with_capacity(num_keccak_f);
+    output.resize(num_keccak_f - 1, dummy_circuit_output()); // every slot except the last is a dummy output
+
+    let key = encode_native_input(bytes);
+    let hash = Keccak256::digest(bytes);
+    let hash_lo = F::from_u128(u128::from_be_bytes(hash[16..].try_into().unwrap())); // last 16 digest bytes
+    let hash_hi = F::from_u128(u128::from_be_bytes(hash[..16].try_into().unwrap())); // first 16 digest bytes
+    output.push(KeccakCircuitOutput { key, hash_lo, hash_hi });
+
+    output
+}
+
+/// Return the dummy circuit output used for padding: the lookup key and Keccak hash of the empty input.
+pub fn dummy_circuit_output<F: Field>() -> KeccakCircuitOutput<F> {
+    assert!(u128::BITS <= F::CAPACITY);
+    let key = encode_native_input(&[]);
+    // Output of Keccak256::digest is big endian.
+    let hash = Keccak256::digest([]);
+    let hash_lo = F::from_u128(u128::from_be_bytes(hash[16..].try_into().unwrap())); // last 16 digest bytes
+    let hash_hi = F::from_u128(u128::from_be_bytes(hash[..16].try_into().unwrap())); // first 16 digest bytes
+    KeccakCircuitOutput { key, hash_lo, hash_hi }
+}
+
+/// Calculate the commitment of circuit outputs: a Poseidon hash over `key, hash_lo, hash_hi` of every output, in order.
+pub fn calculate_circuit_outputs_commit<F: Field>(outputs: &[KeccakCircuitOutput<F>]) -> F {
+    let mut native_poseidon_sponge =
+        pse_poseidon::Poseidon::<F, POSEIDON_T, POSEIDON_RATE>::new(POSEIDON_R_F, POSEIDON_R_P);
+    native_poseidon_sponge.update(
+        &outputs
+            .iter()
+            .flat_map(|output| [output.key, output.hash_lo, output.hash_hi]) // flatten to one field-element stream
+            .collect_vec(),
+    );
+    native_poseidon_sponge.squeeze()
+}
diff --git a/hashes/zkevm/src/keccak/coprocessor/param.rs b/hashes/zkevm/src/keccak/coprocessor/param.rs
new file mode 100644
index 00000000..889d0bd9
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/param.rs
@@ -0,0 +1,12 @@
+pub const OUTPUT_NUM_COL_COMMIT: usize = 1; // presumably the number of instance columns when only the commitment is published — confirm
+pub const OUTPUT_NUM_COL_RAW: usize = 3; // presumably the number of instance columns when raw outputs are published — confirm
+pub const OUTPUT_COL_IDX_COMMIT: usize = 0; // presumably the instance column index of the commitment — confirm
+pub const OUTPUT_COL_IDX_KEY: usize = 0; // presumably the instance column index of the lookup keys — confirm
+pub const OUTPUT_COL_IDX_HASH_LO: usize = 1; // presumably the instance column index of hash_lo — confirm
+pub const OUTPUT_COL_IDX_HASH_HI: usize = 2; // presumably the instance column index of hash_hi — confirm
+
+pub const POSEIDON_T: usize = 3; // first const generic passed to pse_poseidon::Poseidon in this crate
+pub const POSEIDON_RATE: usize = 2; // second const generic passed to pse_poseidon::Poseidon; also the absorb chunk size
+pub const POSEIDON_R_F: usize = 8; // first argument of Poseidon::new (presumably the number of full rounds — confirm)
+pub const POSEIDON_R_P: usize = 57; // second argument of Poseidon::new (presumably the number of partial rounds — confirm)
+pub const POSEIDON_SECURE_MDS: usize = 0; // NOTE(review): consumer not visible in this part of the patch — confirm usage
diff --git a/hashes/zkevm/src/keccak/coprocessor/tests/mod.rs b/hashes/zkevm/src/keccak/coprocessor/tests/mod.rs
new file mode 100644
index 00000000..63c4e272
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/tests/mod.rs
@@ -0,0 +1,2 @@
+#[cfg(test)]
+mod output;
diff --git a/hashes/zkevm/src/keccak/coprocessor/tests/output.rs b/hashes/zkevm/src/keccak/coprocessor/tests/output.rs
new file mode 100644
index 00000000..c72c518c
--- /dev/null
+++ b/hashes/zkevm/src/keccak/coprocessor/tests/output.rs
@@ -0,0 +1,131 @@
+use crate::keccak::coprocessor::output::{
+    dummy_circuit_output, input_to_circuit_outputs, multi_inputs_to_circuit_outputs,
+    KeccakCircuitOutput,
+};
+use halo2_base::halo2_proofs::halo2curves::{bn256::Fr, ff::PrimeField};
+use itertools::Itertools;
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref OUTPUT_EMPTY: KeccakCircuitOutput<Fr> = KeccakCircuitOutput { // expected output for the empty input
+        key: Fr::from_raw([
+            0x54595a1525d3534a,
+            0xf90e160f1b4648ef,
+            0x34d557ddfb89da5d,
+            0x04ffe3d4b8885928,
+        ]),
+        hash_lo: Fr::from_u128(0xe500b653ca82273b7bfad8045d85a470),
+        hash_hi: Fr::from_u128(0xc5d2460186f7233c927e7db2dcc703c0),
+    };
+    static ref OUTPUT_0: KeccakCircuitOutput<Fr> = KeccakCircuitOutput { // expected output for the input [0]
+        key: Fr::from_raw([
+            0xc009f26a12e2f494,
+            0xb4a9d43c17609251,
+            0x68068b5344cba120,
+            0x1531327ea92d38ba,
+        ]),
+        hash_lo: Fr::from_u128(0x6612f7b477d66591ff96a9e064bcc98a),
+        hash_hi: Fr::from_u128(0xbc36789e7a1e281436464229828f817d),
+    };
+    static ref OUTPUT_0_135: KeccakCircuitOutput<Fr> = KeccakCircuitOutput { // expected output for bytes 0..135
+        key: Fr::from_raw([
+            0x9a88287adab4da1c,
+            0xe9ff61b507cfd8c2,
+            0xdbf697a6a3ad66a1,
+            0x1eb1d5cc8cdd1532,
+        ]),
+        hash_lo: Fr::from_u128(0x290b0e1706f6a82e5a595b9ce9faca62),
+        hash_hi: Fr::from_u128(0xcbdfd9dee5faad3818d6b06f95a219fd),
+    };
+    static ref OUTPUT_0_136: KeccakCircuitOutput<Fr> = KeccakCircuitOutput { // expected output for bytes 0..136
+        key: Fr::from_raw([
+            0x39c1a578acb62676,
+            0x0dc19a75e610c062,
+            0x3f158e809150a14a,
+            0x2367059ac8c80538,
+        ]),
+        hash_lo: Fr::from_u128(0xff11fe3e38e17df89cf5d29c7d7f807e),
+        hash_hi: Fr::from_u128(0x7ce759f1ab7f9ce437719970c26b0a66),
+    };
+    static ref OUTPUT_0_200: KeccakCircuitOutput<Fr> = KeccakCircuitOutput { // expected output for bytes 0..200
+        key: Fr::from_raw([
+            0x379bfca638552583,
+            0x1bf7bd603adec30e,
+            0x05efe90ad5dbd814,
+            0x053c729cb8908ccb,
+        ]),
+        hash_lo: Fr::from_u128(0xb4543f3d2703c0923c6901c2af57b890),
+        hash_hi: Fr::from_u128(0xbfb0aa97863e797943cf7c33bb7e880b),
+    };
+}
+
+#[test]
+fn test_dummy_circuit_output() {
+    let KeccakCircuitOutput { key, hash_lo, hash_hi } = dummy_circuit_output::<Fr>();
+    assert_eq!(key, OUTPUT_EMPTY.key); // the dummy output must equal the empty-input fixture field by field
+    assert_eq!(hash_lo, OUTPUT_EMPTY.hash_lo);
+    assert_eq!(hash_hi, OUTPUT_EMPTY.hash_hi);
+}
+
+#[test]
+fn test_input_to_circuit_outputs_empty() {
+    let result = input_to_circuit_outputs::<Fr>(&[]);
+    assert_eq!(result, vec![*OUTPUT_EMPTY]); // the empty input yields exactly one output
+}
+
+#[test]
+fn test_input_to_circuit_outputs_1_keccak_f() {
+    let result = input_to_circuit_outputs::<Fr>(&[0]);
+    assert_eq!(result, vec![*OUTPUT_0]); // a single byte yields exactly one output
+}
+
+#[test]
+fn test_input_to_circuit_outputs_1_keccak_f_full() {
+    let result = input_to_circuit_outputs::<Fr>(&(0..135).collect_vec());
+    assert_eq!(result, vec![*OUTPUT_0_135]); // 135 bytes still yield a single output
+}
+
+#[test]
+fn test_input_to_circuit_outputs_2_keccak_f_2nd_empty() {
+    let result = input_to_circuit_outputs::<Fr>(&(0..136).collect_vec());
+    assert_eq!(result, vec![*OUTPUT_EMPTY, *OUTPUT_0_136]); // 136 bytes: a padding output, then the real one
+}
+
+#[test]
+fn test_input_to_circuit_outputs_2_keccak_f() {
+    let result = input_to_circuit_outputs::<Fr>(&(0..200).collect_vec());
+    assert_eq!(result, vec![*OUTPUT_EMPTY, *OUTPUT_0_200]); // 200 bytes: a padding output, then the real one
+}
+
+#[test]
+fn test_multi_input_to_circuit_outputs() {
+    let results = multi_inputs_to_circuit_outputs::<Fr>(
+        &[(0..135).collect_vec(), (0..200).collect_vec(), vec![], vec![0], (0..136).collect_vec()],
+        10,
+    );
+    assert_eq!(
+        results,
+        vec![
+            *OUTPUT_0_135, // input 0..135
+            *OUTPUT_EMPTY, // input 0..200: padding output
+            *OUTPUT_0_200, // input 0..200: real output
+            *OUTPUT_EMPTY, // empty input
+            *OUTPUT_0,     // input [0]
+            *OUTPUT_EMPTY, // input 0..136: padding output
+            *OUTPUT_0_136, // input 0..136: real output
+            // Padding up to the capacity of 10
+            *OUTPUT_EMPTY,
+            *OUTPUT_EMPTY,
+            *OUTPUT_EMPTY,
+        ]
+    );
+}
+
+#[test]
+#[should_panic]
+fn test_multi_input_to_circuit_outputs_exceed_capacity() {
+    let _ = multi_inputs_to_circuit_outputs::<Fr>(
+        &[(0..135).collect_vec(), (0..200).collect_vec(), vec![], vec![0], (0..136).collect_vec()],
+        2, // these inputs produce 7 outputs (see the test above), which exceeds this capacity
+    );
+}
diff --git a/hashes/zkevm/src/keccak/mod.rs b/hashes/zkevm/src/keccak/mod.rs
index 0dc18d87..58480989 100644
--- a/hashes/zkevm/src/keccak/mod.rs
+++ b/hashes/zkevm/src/keccak/mod.rs
@@ -1,1294 +1,4 @@
-use self::{cell_manager::*, keccak_packed_multi::*, param::*, table::*, util::*};
-use super::util::{
-    constraint_builder::BaseConstraintBuilder,
-    eth_types::{self, Field},
-    expression::{and, not, select, Expr},
-};
-use crate::{
-    halo2_proofs::{
-        circuit::{Layouter, Region, Value},
-        halo2curves::ff::PrimeField,
-        plonk::{Column, ConstraintSystem, Error, Expression, Fixed, TableColumn, VirtualCells},
-        poly::Rotation,
-    },
-    util::{
-        expression::{from_bytes, sum},
-        word::{self, Word, WordExpr},
-    },
-};
-use halo2_base::utils::halo2::{raw_assign_advice, raw_assign_fixed};
-use itertools::Itertools;
-use log::{debug, info};
-use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
-use std::marker::PhantomData;
-
-pub mod cell_manager;
-pub mod keccak_packed_multi;
-pub mod param;
-pub mod table;
-#[cfg(test)]
-mod tests;
-pub mod util;
-
-/// Configuration parameters to define [`KeccakCircuitConfig`]
-#[derive(Copy, Clone, Debug, Default)]
-pub struct KeccakConfigParams {
-    /// The circuit degree, i.e., circuit has 2<sup>k</sup> rows
-    pub k: u32,
-    /// The number of rows to use for each round in the keccak_f permutation
-    pub rows_per_round: usize,
-}
-
-/// KeccakConfig
-#[derive(Clone, Debug)]
-pub struct KeccakCircuitConfig<F> {
-    // Bool. True on 1st row of each round.
-    q_enable: Column<Fixed>,
-    // Bool. True on 1st row.
-    q_first: Column<Fixed>,
-    // Bool. True on 1st row of all rounds except last rounds.
-    q_round: Column<Fixed>,
-    // Bool. True on 1st row of last rounds.
-    q_absorb: Column<Fixed>,
-    // Bool. True on 1st row of last rounds.
-    q_round_last: Column<Fixed>,
-    // Bool. True on 1st row of rounds which might contain inputs.
-    // Note: first NUM_WORDS_TO_ABSORB rounds of each chunk might contain inputs.
-    // It "might" contain inputs because it's possible that a round only have paddings.
-    q_input: Column<Fixed>,
-    // Bool. True on 1st row of all last input round.
-    q_input_last: Column<Fixed>,
-
-    pub keccak_table: KeccakTable,
-
-    cell_manager: CellManager<F>,
-    round_cst: Column<Fixed>,
-    normalize_3: [TableColumn; 2],
-    normalize_4: [TableColumn; 2],
-    normalize_6: [TableColumn; 2],
-    chi_base_table: [TableColumn; 2],
-    pack_table: [TableColumn; 2],
-
-    // config parameters for convenience
-    pub parameters: KeccakConfigParams,
-
-    _marker: PhantomData<F>,
-}
-
-impl<F: Field> KeccakCircuitConfig<F> {
-    /// Return a new KeccakCircuitConfig
-    pub fn new(meta: &mut ConstraintSystem<F>, parameters: KeccakConfigParams) -> Self {
-        let k = parameters.k;
-        let num_rows_per_round = parameters.rows_per_round;
-
-        let q_enable = meta.fixed_column();
-        let q_first = meta.fixed_column();
-        let q_round = meta.fixed_column();
-        let q_absorb = meta.fixed_column();
-        let q_round_last = meta.fixed_column();
-        let q_input = meta.fixed_column();
-        let q_input_last = meta.fixed_column();
-        let round_cst = meta.fixed_column();
-        let keccak_table = KeccakTable::construct(meta);
-
-        let is_final = keccak_table.is_enabled;
-        let hash_word = keccak_table.output;
-
-        let normalize_3 = array_init::array_init(|_| meta.lookup_table_column());
-        let normalize_4 = array_init::array_init(|_| meta.lookup_table_column());
-        let normalize_6 = array_init::array_init(|_| meta.lookup_table_column());
-        let chi_base_table = array_init::array_init(|_| meta.lookup_table_column());
-        let pack_table = array_init::array_init(|_| meta.lookup_table_column());
-
-        let mut cell_manager = CellManager::new(num_rows_per_round);
-        let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-        let mut total_lookup_counter = 0;
-
-        let start_new_hash = |meta: &mut VirtualCells<F>, rot| {
-            // A new hash is started when the previous hash is done or on the first row
-            meta.query_fixed(q_first, rot) + meta.query_advice(is_final, rot)
-        };
-
-        // Round constant
-        let mut round_cst_expr = 0.expr();
-        meta.create_gate("Query round cst", |meta| {
-            round_cst_expr = meta.query_fixed(round_cst, Rotation::cur());
-            vec![0u64.expr()]
-        });
-        // State data
-        let mut s = vec![vec![0u64.expr(); 5]; 5];
-        let mut s_next = vec![vec![0u64.expr(); 5]; 5];
-        for i in 0..5 {
-            for j in 0..5 {
-                let cell = cell_manager.query_cell(meta);
-                s[i][j] = cell.expr();
-                s_next[i][j] = cell.at_offset(meta, num_rows_per_round as i32).expr();
-            }
-        }
-        // Absorb data
-        let absorb_from = cell_manager.query_cell(meta);
-        let absorb_data = cell_manager.query_cell(meta);
-        let absorb_result = cell_manager.query_cell(meta);
-        let mut absorb_from_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
-        let mut absorb_data_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
-        let mut absorb_result_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
-        for i in 0..NUM_WORDS_TO_ABSORB {
-            let rot = ((i + 1) * num_rows_per_round) as i32;
-            absorb_from_next[i] = absorb_from.at_offset(meta, rot).expr();
-            absorb_data_next[i] = absorb_data.at_offset(meta, rot).expr();
-            absorb_result_next[i] = absorb_result.at_offset(meta, rot).expr();
-        }
-
-        // Store the pre-state
-        let pre_s = s.clone();
-
-        // Absorb
-        // The absorption happening at the start of the 24 rounds is done spread out
-        // over those 24 rounds. In a single round (in 17 of the 24 rounds) a
-        // single word is absorbed so the work is spread out. The absorption is
-        // done simply by doing state + data and then normalizing the result to [0,1].
-        // We also need to convert the input data into bytes to calculate the input data
-        // rlc.
-        cell_manager.start_region();
-        let mut lookup_counter = 0;
-        let part_size = get_num_bits_per_absorb_lookup(k);
-        let input = absorb_from.expr() + absorb_data.expr();
-        let absorb_fat =
-            split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None);
-        cell_manager.start_region();
-        let absorb_res = transform::expr(
-            "absorb",
-            meta,
-            &mut cell_manager,
-            &mut lookup_counter,
-            absorb_fat,
-            normalize_3,
-            true,
-        );
-        cb.require_equal("absorb result", decode::expr(absorb_res), absorb_result.expr());
-        info!("- Post absorb:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        // Squeeze
-        // The squeezing happening at the end of the 24 rounds is done spread out
-        // over those 24 rounds. In a single round (in 4 of the 24 rounds) a
-        // single word is converted to bytes.
-        cell_manager.start_region();
-        let mut lookup_counter = 0;
-        // Potential optimization: could do multiple bytes per lookup
-        let packed_parts =
-            split::expr(meta, &mut cell_manager, &mut cb, absorb_data.expr(), 0, 8, false, None);
-        cell_manager.start_region();
-        // input_bytes.len() = packed_parts.len() = 64 / 8 = 8 = NUM_BYTES_PER_WORD
-        let input_bytes = transform::expr(
-            "squeeze unpack",
-            meta,
-            &mut cell_manager,
-            &mut lookup_counter,
-            packed_parts,
-            pack_table.into_iter().rev().collect::<Vec<_>>().try_into().unwrap(),
-            true,
-        );
-        debug_assert_eq!(input_bytes.len(), NUM_BYTES_PER_WORD);
-
-        // Padding data
-        cell_manager.start_region();
-        let is_paddings = input_bytes.iter().map(|_| cell_manager.query_cell(meta)).collect_vec();
-        info!("- Post padding:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        // Theta
-        // Calculate
-        // - `c[i] = s[i][0] + s[i][1] + s[i][2] + s[i][3] + s[i][4]`
-        // - `bc[i] = normalize(c)`.
-        // - `t[i] = bc[(i + 4) % 5] + rot(bc[(i + 1)% 5], 1)`
-        // This is done by splitting the bc values in parts in a way
-        // that allows us to also calculate the rotated value "for free".
-        cell_manager.start_region();
-        let mut lookup_counter = 0;
-        let part_size_c = get_num_bits_per_theta_c_lookup(k);
-        let mut c_parts = Vec::new();
-        for s in s.iter() {
-            // Calculate c and split into parts
-            let c = s[0].clone() + s[1].clone() + s[2].clone() + s[3].clone() + s[4].clone();
-            c_parts.push(split::expr(
-                meta,
-                &mut cell_manager,
-                &mut cb,
-                c,
-                1,
-                part_size_c,
-                false,
-                None,
-            ));
-        }
-        // Now calculate `bc` by normalizing `c`
-        cell_manager.start_region();
-        let mut bc = Vec::new();
-        for c in c_parts {
-            // Normalize c
-            bc.push(transform::expr(
-                "theta c",
-                meta,
-                &mut cell_manager,
-                &mut lookup_counter,
-                c,
-                normalize_6,
-                true,
-            ));
-        }
-        // Now do `bc[(i + 4) % 5] + rot(bc[(i + 1) % 5], 1)` using just expressions.
-        // We don't normalize the result here. We do it as part of the rho/pi step, even
-        // though we would only have to normalize 5 values instead of 25, because of the
-        // way the rho/pi and chi steps can be combined it's more efficient to
-        // do it there (the max value for chi is 4 already so that's the
-        // limiting factor).
-        let mut os = vec![vec![0u64.expr(); 5]; 5];
-        for i in 0..5 {
-            let t = decode::expr(bc[(i + 4) % 5].clone())
-                + decode::expr(rotate(bc[(i + 1) % 5].clone(), 1, part_size_c));
-            for j in 0..5 {
-                os[i][j] = s[i][j].clone() + t.clone();
-            }
-        }
-        s = os.clone();
-        info!("- Post theta:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        // Rho/Pi
-        // For the rotation of rho/pi we split up the words like expected, but in a way
-        // that allows reusing the same parts in an optimal way for the chi step.
-        // We can save quite a few columns by not recombining the parts after rho/pi and
-        // re-splitting the words again before chi. Instead we do chi directly
-        // on the output parts of rho/pi. For rho/pi specically we do
-        // `s[j][2 * i + 3 * j) % 5] = normalize(rot(s[i][j], RHOM[i][j]))`.
-        cell_manager.start_region();
-        let mut lookup_counter = 0;
-        let part_size = get_num_bits_per_base_chi_lookup(k);
-        // To combine the rho/pi/chi steps we have to ensure a specific layout so
-        // query those cells here first.
-        // For chi we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & s[(i+2)%5][j])`. `j`
-        // remains static but `i` is accessed in a wrap around manner. To do this using
-        // multiple rows with lookups in a way that doesn't require any
-        // extra additional cells or selectors we have to put all `s[i]`'s on the same
-        // row. This isn't that strong of a requirement actually because we the
-        // words are split into multipe parts, and so only the parts at the same
-        // position of those words need to be on the same row.
-        let target_word_sizes = target_part_sizes(part_size);
-        let num_word_parts = target_word_sizes.len();
-        let mut rho_pi_chi_cells: [[[Vec<Cell<F>>; 5]; 5]; 3] = array_init::array_init(|_| {
-            array_init::array_init(|_| array_init::array_init(|_| Vec::new()))
-        });
-        let mut num_columns = 0;
-        let mut column_starts = [0usize; 3];
-        for p in 0..3 {
-            column_starts[p] = cell_manager.start_region();
-            let mut row_idx = 0;
-            num_columns = 0;
-            for j in 0..5 {
-                for _ in 0..num_word_parts {
-                    for i in 0..5 {
-                        rho_pi_chi_cells[p][i][j]
-                            .push(cell_manager.query_cell_at_row(meta, row_idx));
-                    }
-                    if row_idx == 0 {
-                        num_columns += 1;
-                    }
-                    row_idx = (((row_idx as usize) + 1) % num_rows_per_round) as i32;
-                }
-            }
-        }
-        // Do the transformation, resulting in the word parts also being normalized.
-        let pi_region_start = cell_manager.start_region();
-        let mut os_parts = vec![vec![Vec::new(); 5]; 5];
-        for (j, os_part) in os_parts.iter_mut().enumerate() {
-            for i in 0..5 {
-                // Split s into parts
-                let s_parts = split_uniform::expr(
-                    meta,
-                    &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5],
-                    &mut cell_manager,
-                    &mut cb,
-                    s[i][j].clone(),
-                    RHO_MATRIX[i][j],
-                    part_size,
-                    true,
-                );
-                // Normalize the data to the target cells
-                let s_parts = transform_to::expr(
-                    "rho/pi",
-                    meta,
-                    &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5],
-                    &mut lookup_counter,
-                    s_parts.clone(),
-                    normalize_4,
-                    true,
-                );
-                os_part[(2 * i + 3 * j) % 5] = s_parts.clone();
-            }
-        }
-        let pi_region_end = cell_manager.start_region();
-        // Pi parts range checks
-        // To make the uniform stuff work we had to combine some parts together
-        // in new cells (see split_uniform). Here we make sure those parts are range
-        // checked. Potential improvement: Could combine multiple smaller parts
-        // in a single lookup but doesn't save that much.
-        for c in pi_region_start..pi_region_end {
-            meta.lookup("pi part range check", |_| {
-                vec![(cell_manager.columns()[c].expr.clone(), normalize_4[0])]
-            });
-            lookup_counter += 1;
-        }
-        info!("- Post rho/pi:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        // Chi
-        // In groups of 5 columns, we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) &
-        // s[(i+2)%5][j])` five times, on each row (no selector needed).
-        // This is calculated by making use of `CHI_BASE_LOOKUP_TABLE`.
-        let mut lookup_counter = 0;
-        let part_size_base = get_num_bits_per_base_chi_lookup(k);
-        for idx in 0..num_columns {
-            // First fetch the cells we wan to use
-            let mut input: [Expression<F>; 5] = array_init::array_init(|_| 0.expr());
-            let mut output: [Expression<F>; 5] = array_init::array_init(|_| 0.expr());
-            for c in 0..5 {
-                input[c] = cell_manager.columns()[column_starts[1] + idx * 5 + c].expr.clone();
-                output[c] = cell_manager.columns()[column_starts[2] + idx * 5 + c].expr.clone();
-            }
-            // Now calculate `a ^ ((~b) & c)` by doing `lookup[3 - 2*a + b - c]`
-            for i in 0..5 {
-                let input = scatter::expr(3, part_size_base) - 2.expr() * input[i].clone()
-                    + input[(i + 1) % 5].clone()
-                    - input[(i + 2) % 5].clone();
-                let output = output[i].clone();
-                meta.lookup("chi base", |_| {
-                    vec![(input.clone(), chi_base_table[0]), (output.clone(), chi_base_table[1])]
-                });
-                lookup_counter += 1;
-            }
-        }
-        // Now just decode the parts after the chi transformation done with the lookups
-        // above.
-        let mut os = vec![vec![0u64.expr(); 5]; 5];
-        for (i, os) in os.iter_mut().enumerate() {
-            for (j, os) in os.iter_mut().enumerate() {
-                let mut parts = Vec::new();
-                for idx in 0..num_word_parts {
-                    parts.push(Part {
-                        num_bits: part_size_base,
-                        cell: rho_pi_chi_cells[2][i][j][idx].clone(),
-                        expr: rho_pi_chi_cells[2][i][j][idx].expr(),
-                    });
-                }
-                *os = decode::expr(parts);
-            }
-        }
-        s = os.clone();
-
-        // iota
-        // Simply do the single xor on state [0][0].
-        cell_manager.start_region();
-        let part_size = get_num_bits_per_absorb_lookup(k);
-        let input = s[0][0].clone() + round_cst_expr.clone();
-        let iota_parts =
-            split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None);
-        cell_manager.start_region();
-        // Could share columns with absorb which may end up using 1 lookup/column
-        // fewer...
-        s[0][0] = decode::expr(transform::expr(
-            "iota",
-            meta,
-            &mut cell_manager,
-            &mut lookup_counter,
-            iota_parts,
-            normalize_3,
-            true,
-        ));
-        // Final results stored in the next row
-        for i in 0..5 {
-            for j in 0..5 {
-                cb.require_equal("next row check", s[i][j].clone(), s_next[i][j].clone());
-            }
-        }
-        info!("- Post chi:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        let mut lookup_counter = 0;
-        cell_manager.start_region();
-
-        // Squeeze data
-        let squeeze_from = cell_manager.query_cell(meta);
-        let mut squeeze_from_prev = vec![0u64.expr(); NUM_WORDS_TO_SQUEEZE];
-        for (idx, squeeze_from_prev) in squeeze_from_prev.iter_mut().enumerate() {
-            let rot = (-(idx as i32) - 1) * num_rows_per_round as i32;
-            *squeeze_from_prev = squeeze_from.at_offset(meta, rot).expr();
-        }
-        // Squeeze
-        // The squeeze happening at the end of the 24 rounds is done spread out
-        // over those 24 rounds. In a single round (in 4 of the 24 rounds) a
-        // single word is converted to bytes.
-        // Potential optimization: could do multiple bytes per lookup
-        cell_manager.start_region();
-        // Unpack a single word into bytes (for the squeeze)
-        // Potential optimization: could do multiple bytes per lookup
-        let squeeze_from_parts =
-            split::expr(meta, &mut cell_manager, &mut cb, squeeze_from.expr(), 0, 8, false, None);
-        cell_manager.start_region();
-        let squeeze_bytes = transform::expr(
-            "squeeze unpack",
-            meta,
-            &mut cell_manager,
-            &mut lookup_counter,
-            squeeze_from_parts,
-            pack_table.into_iter().rev().collect::<Vec<_>>().try_into().unwrap(),
-            true,
-        );
-        info!("- Post squeeze:");
-        info!("Lookups: {}", lookup_counter);
-        info!("Columns: {}", cell_manager.get_width());
-        total_lookup_counter += lookup_counter;
-
-        // The round constraints that we've been building up till now
-        meta.create_gate("round", |meta| cb.gate(meta.query_fixed(q_round, Rotation::cur())));
-
-        // Absorb
-        meta.create_gate("absorb", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            let continue_hash = not::expr(start_new_hash(meta, Rotation::cur()));
-            let absorb_positions = get_absorb_positions();
-            let mut a_slice = 0;
-            for j in 0..5 {
-                for i in 0..5 {
-                    if absorb_positions.contains(&(i, j)) {
-                        cb.condition(continue_hash.clone(), |cb| {
-                            cb.require_equal(
-                                "absorb verify input",
-                                absorb_from_next[a_slice].clone(),
-                                pre_s[i][j].clone(),
-                            );
-                        });
-                        cb.require_equal(
-                            "absorb result copy",
-                            select::expr(
-                                continue_hash.clone(),
-                                absorb_result_next[a_slice].clone(),
-                                absorb_data_next[a_slice].clone(),
-                            ),
-                            s_next[i][j].clone(),
-                        );
-                        a_slice += 1;
-                    } else {
-                        cb.require_equal(
-                            "absorb state copy",
-                            pre_s[i][j].clone() * continue_hash.clone(),
-                            s_next[i][j].clone(),
-                        );
-                    }
-                }
-            }
-            cb.gate(meta.query_fixed(q_absorb, Rotation::cur()))
-        });
-
-        // Collect the bytes that are spread out over previous rows
-        let mut hash_bytes = Vec::new();
-        for i in 0..NUM_WORDS_TO_SQUEEZE {
-            for byte in squeeze_bytes.iter() {
-                let rot = (-(i as i32) - 1) * num_rows_per_round as i32;
-                hash_bytes.push(byte.cell.at_offset(meta, rot).expr());
-            }
-        }
-
-        // Squeeze
-        meta.create_gate("squeeze", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            let start_new_hash = start_new_hash(meta, Rotation::cur());
-            // The words to squeeze
-            let hash_words: Vec<_> =
-                pre_s.into_iter().take(4).map(|a| a[0].clone()).take(4).collect();
-            // Verify if we converted the correct words to bytes on previous rows
-            for (idx, word) in hash_words.iter().enumerate() {
-                cb.condition(start_new_hash.clone(), |cb| {
-                    cb.require_equal(
-                        "squeeze verify packed",
-                        word.clone(),
-                        squeeze_from_prev[idx].clone(),
-                    );
-                });
-            }
-
-            let hash_bytes_le = hash_bytes.into_iter().rev().collect::<Vec<_>>();
-            cb.condition(start_new_hash, |cb| {
-                cb.require_equal_word(
-                    "output check",
-                    word::Word32::new(hash_bytes_le.try_into().expect("32 limbs")).to_word(),
-                    hash_word.map(|col| meta.query_advice(col, Rotation::cur())),
-                );
-            });
-            cb.gate(meta.query_fixed(q_round_last, Rotation::cur()))
-        });
-
-        // Some general input checks
-        meta.create_gate("input checks", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            cb.require_boolean("boolean is_final", meta.query_advice(is_final, Rotation::cur()));
-            cb.gate(meta.query_fixed(q_enable, Rotation::cur()))
-        });
-
-        // Enforce fixed values on the first row
-        meta.create_gate("first row", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            cb.require_zero(
-                "is_final needs to be disabled on the first row",
-                meta.query_advice(is_final, Rotation::cur()),
-            );
-            cb.gate(meta.query_fixed(q_first, Rotation::cur()))
-        });
-
-        // some utility query functions
-        let q = |col: Column<Fixed>, meta: &mut VirtualCells<'_, F>| {
-            meta.query_fixed(col, Rotation::cur())
-        };
-        /*
-        eg：
-            data:
-                get_num_rows_per_round: 18
-                input: "12345678abc"
-            table:
-                Note[1]: be careful: is_paddings is not column here! It is [Cell; 8] and it will be constrained later.
-                Note[2]: only first row of each round has constraints on bytes_left. This example just shows how witnesses are filled.
-        offset word_value bytes_left  is_paddings q_enable q_input_last
-        18     0x87654321    11          0         1        0 // 1st round begin
-        19        0          10          0         0        0
-        20        0          9           0         0        0
-        21        0          8           0         0        0
-        22        0          7           0         0        0
-        23        0          6           0         0        0
-        24        0          5           0         0        0
-        25        0          4           0         0        0
-        26        0          4           NA        0        0
-        ...
-        35        0          4           NA        0        0  // 1st round end
-        36      0xcba        3           0         1        1  // 2nd round begin
-        37        0          2           0         0        0
-        38        0          1           0         0        0
-        39        0          0           1         0        0
-        40        0          0           1         0        0
-        41        0          0           1         0        0
-        42        0          0           1         0        0
-        43        0          0           1         0        0
-        */
-
-        meta.create_gate("word_value", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            let masked_input_bytes = input_bytes
-                .iter()
-                .zip(is_paddings.clone())
-                .map(|(input_byte, is_padding)| {
-                    input_byte.expr.clone() * not::expr(is_padding.expr().clone())
-                })
-                .collect_vec();
-            let input_word = from_bytes::expr(&masked_input_bytes);
-            cb.require_equal(
-                "word value",
-                input_word,
-                meta.query_advice(keccak_table.word_value, Rotation::cur()),
-            );
-            cb.gate(q(q_input, meta))
-        });
-        meta.create_gate("bytes_left", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            let bytes_left_expr = meta.query_advice(keccak_table.bytes_left, Rotation::cur());
-
-            // bytes_left is 0 in the absolute first `rows_per_round` of the entire circuit, i.e., the first dummy round.
-            cb.condition(q(q_first, meta), |cb| {
-                cb.require_zero(
-                    "bytes_left needs to be zero on the absolute first dummy round",
-                    meta.query_advice(keccak_table.bytes_left, Rotation::cur()),
-                );
-            });
-            let is_final_expr = meta.query_advice(is_final, Rotation::cur());
-            // is_final ==> bytes_left == 0.
-            // Note: is_final = true only in the last round, which doesn't have any data to absorb.
-            cb.condition(meta.query_advice(is_final, Rotation::cur()), |cb| {
-                cb.require_zero("bytes_left should be 0 when is_final", bytes_left_expr.clone());
-            });
-            // word_len = q_input? NUM_BYTES_PER_WORD - sum(is_paddings): 0
-            // Only rounds with q_input == true have inputs to absorb.
-            let word_len = select::expr(
-                q(q_input, meta),
-                NUM_BYTES_PER_WORD.expr() - sum::expr(is_paddings.clone()),
-                0.expr(),
-            );
-            // !is_final[i] ==> bytes_left[i + num_rows_per_round] + word_len == bytes_left[i]
-            cb.condition(not::expr(is_final_expr), |cb| {
-                let bytes_left_next_expr =
-                    meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32));
-                cb.require_equal(
-                    "if not final, bytes_left decreaes by the length of the word",
-                    bytes_left_expr,
-                    bytes_left_next_expr.clone() + word_len,
-                );
-            });
-
-            cb.gate(q(q_enable, meta))
-        });
-
-        // Enforce logic for when this block is the last block for a hash
-        let last_is_padding_in_block = is_paddings.last().unwrap().at_offset(
-            meta,
-            -(((NUM_ROUNDS + 1 - NUM_WORDS_TO_ABSORB) * num_rows_per_round) as i32),
-        );
-        meta.create_gate("is final", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            // All absorb rows except the first row
-            cb.condition(
-                meta.query_fixed(q_absorb, Rotation::cur())
-                    - meta.query_fixed(q_first, Rotation::cur()),
-                |cb| {
-                    cb.require_equal(
-                        "is_final needs to be the same as the last is_padding in the block",
-                        meta.query_advice(is_final, Rotation::cur()),
-                        last_is_padding_in_block.expr(),
-                    );
-                },
-            );
-            // For all the rows of a round, only the first row can have `is_final == 1`.
-            cb.condition(
-                (1..num_rows_per_round as i32)
-                    .map(|i| meta.query_fixed(q_enable, Rotation(-i)))
-                    .fold(0.expr(), |acc, elem| acc + elem),
-                |cb| {
-                    cb.require_zero(
-                        "is_final only when q_enable",
-                        meta.query_advice(is_final, Rotation::cur()),
-                    );
-                },
-            );
-            cb.gate(1.expr())
-        });
-
-        // Padding
-        // May be cleaner to do this padding logic in the byte conversion lookup but
-        // currently easier to do it like this.
-        let prev_is_padding =
-            is_paddings.last().unwrap().at_offset(meta, -(num_rows_per_round as i32));
-        meta.create_gate("padding", |meta| {
-            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
-            let q_input = meta.query_fixed(q_input, Rotation::cur());
-            let q_input_last = meta.query_fixed(q_input_last, Rotation::cur());
-
-            // All padding selectors need to be boolean
-            for is_padding in is_paddings.iter() {
-                cb.condition(meta.query_fixed(q_enable, Rotation::cur()), |cb| {
-                    cb.require_boolean("is_padding boolean", is_padding.expr());
-                });
-            }
-            // This last padding selector will be used on the first round row so needs to be
-            // zero
-            cb.condition(meta.query_fixed(q_absorb, Rotation::cur()), |cb| {
-                cb.require_zero(
-                    "last is_padding should be zero on absorb rows",
-                    is_paddings.last().unwrap().expr(),
-                );
-            });
-            // Now for each padding selector
-            for idx in 0..is_paddings.len() {
-                // Previous padding selector can be on the previous row
-                let is_padding_prev =
-                    if idx == 0 { prev_is_padding.expr() } else { is_paddings[idx - 1].expr() };
-                let is_first_padding = is_paddings[idx].expr() - is_padding_prev.clone();
-
-                // Check padding transition 0 -> 1 done only once
-                cb.condition(q_input.expr(), |cb| {
-                    cb.require_boolean("padding step boolean", is_first_padding.clone());
-                });
-
-                // Padding start/intermediate/end byte checks
-                if idx == is_paddings.len() - 1 {
-                    // These can be combined in the future, but currently this would increase the
-                    // degree by one Padding start/intermediate byte, all
-                    // padding rows except the last one
-                    cb.condition(
-                        and::expr([q_input.expr() - q_input_last.expr(), is_paddings[idx].expr()]),
-                        |cb| {
-                            // Input bytes need to be zero, or one if this is the first padding byte
-                            cb.require_equal(
-                                "padding start/intermediate byte last byte",
-                                input_bytes[idx].expr.clone(),
-                                is_first_padding.expr(),
-                            );
-                        },
-                    );
-                    // Padding start/end byte, only on the last padding row
-                    cb.condition(and::expr([q_input_last.expr(), is_paddings[idx].expr()]), |cb| {
-                        // The input byte needs to be 128, unless it's also the first padding
-                        // byte then it's 129
-                        cb.require_equal(
-                            "padding start/end byte",
-                            input_bytes[idx].expr.clone(),
-                            is_first_padding.expr() + 128.expr(),
-                        );
-                    });
-                } else {
-                    // Padding start/intermediate byte
-                    cb.condition(and::expr([q_input.expr(), is_paddings[idx].expr()]), |cb| {
-                        // Input bytes need to be zero, or one if this is the first padding byte
-                        cb.require_equal(
-                            "padding start/intermediate byte",
-                            input_bytes[idx].expr.clone(),
-                            is_first_padding.expr(),
-                        );
-                    });
-                }
-            }
-            cb.gate(1.expr())
-        });
-
-        info!("Degree: {}", meta.degree());
-        info!("Minimum rows: {}", meta.minimum_rows());
-        info!("Total Lookups: {}", total_lookup_counter);
-        #[cfg(feature = "display")]
-        {
-            println!("Total Keccak Columns: {}", cell_manager.get_width());
-            std::env::set_var("KECCAK_ADVICE_COLUMNS", cell_manager.get_width().to_string());
-        }
-        #[cfg(not(feature = "display"))]
-        info!("Total Keccak Columns: {}", cell_manager.get_width());
-        info!("num unused cells: {}", cell_manager.get_num_unused_cells());
-        info!("part_size absorb: {}", get_num_bits_per_absorb_lookup(k));
-        info!("part_size theta: {}", get_num_bits_per_theta_c_lookup(k));
-        info!("part_size theta c: {}", get_num_bits_per_lookup(THETA_C_LOOKUP_RANGE, k));
-        info!("part_size theta t: {}", get_num_bits_per_lookup(4, k));
-        info!("part_size rho/pi: {}", get_num_bits_per_rho_pi_lookup(k));
-        info!("part_size chi base: {}", get_num_bits_per_base_chi_lookup(k));
-        info!("uniform part sizes: {:?}", target_part_sizes(get_num_bits_per_theta_c_lookup(k)));
-
-        KeccakCircuitConfig {
-            q_enable,
-            q_first,
-            q_round,
-            q_absorb,
-            q_round_last,
-            q_input,
-            q_input_last,
-            keccak_table,
-            cell_manager,
-            round_cst,
-            normalize_3,
-            normalize_4,
-            normalize_6,
-            chi_base_table,
-            pack_table,
-            parameters,
-            _marker: PhantomData,
-        }
-    }
-}
-
-#[allow(dead_code)]
-#[derive(Clone)]
-pub struct KeccakAssignedRow<'v, F: Field> {
-    pub(crate) is_final: KeccakAssignedValue<'v, F>,
-    pub(crate) hash_lo: KeccakAssignedValue<'v, F>,
-    pub(crate) hash_hi: KeccakAssignedValue<'v, F>,
-    pub(crate) bytes_left: KeccakAssignedValue<'v, F>,
-    pub(crate) word_value: KeccakAssignedValue<'v, F>,
-}
-
-impl<F: Field> KeccakCircuitConfig<F> {
-    /// Returns vector of `is_final`, `length`, `hash.lo`, `hash.hi` for assigned rows
-    pub fn assign<'v>(
-        &self,
-        region: &mut Region<F>,
-        witness: &[KeccakRow<F>],
-    ) -> Vec<KeccakAssignedRow<'v, F>> {
-        witness
-            .iter()
-            .enumerate()
-            .map(|(offset, keccak_row)| self.set_row(region, offset, keccak_row))
-            .collect()
-    }
-
-    /// Output is `is_final`, `length`, `hash.lo`, `hash.hi` at that row
-    pub fn set_row<'v>(
-        &self,
-        region: &mut Region<F>,
-        offset: usize,
-        row: &KeccakRow<F>,
-    ) -> KeccakAssignedRow<'v, F> {
-        // Fixed selectors
-        for (_, column, value) in &[
-            ("q_enable", self.q_enable, F::from(row.q_enable)),
-            ("q_first", self.q_first, F::from(offset == 0)),
-            ("q_round", self.q_round, F::from(row.q_round)),
-            ("q_round_last", self.q_round_last, F::from(row.q_round_last)),
-            ("q_absorb", self.q_absorb, F::from(row.q_absorb)),
-            ("q_input", self.q_input, F::from(row.q_input)),
-            ("q_input_last", self.q_input_last, F::from(row.q_input_last)),
-        ] {
-            raw_assign_fixed(region, *column, offset, *value);
-        }
-
-        // Keccak data
-        let [is_final, hash_lo, hash_hi, bytes_left, word_value] = [
-            ("is_final", self.keccak_table.is_enabled, Value::known(F::from(row.is_final))),
-            ("hash_lo", self.keccak_table.output.lo(), row.hash.lo()),
-            ("hash_hi", self.keccak_table.output.hi(), row.hash.hi()),
-            ("bytes_left", self.keccak_table.bytes_left, Value::known(row.bytes_left)),
-            ("word_value", self.keccak_table.word_value, Value::known(row.word_value)),
-        ]
-        .map(|(_name, column, value)| raw_assign_advice(region, column, offset, value));
-
-        // Cell values
-        row.cell_values.iter().zip(self.cell_manager.columns()).for_each(|(bit, column)| {
-            raw_assign_advice(region, column.advice, offset, Value::known(*bit));
-        });
-
-        // Round constant
-        raw_assign_fixed(region, self.round_cst, offset, row.round_cst);
-
-        KeccakAssignedRow { is_final, hash_lo, hash_hi, bytes_left, word_value }
-    }
-
-    pub fn load_aux_tables(&self, layouter: &mut impl Layouter<F>, k: u32) -> Result<(), Error> {
-        load_normalize_table(layouter, "normalize_6", &self.normalize_6, 6u64, k)?;
-        load_normalize_table(layouter, "normalize_4", &self.normalize_4, 4u64, k)?;
-        load_normalize_table(layouter, "normalize_3", &self.normalize_3, 3u64, k)?;
-        load_lookup_table(
-            layouter,
-            "chi base",
-            &self.chi_base_table,
-            get_num_bits_per_base_chi_lookup(k),
-            &CHI_BASE_LOOKUP_TABLE,
-        )?;
-        load_pack_table(layouter, &self.pack_table)
-    }
-}
-
-/// Witness generation for keccak hash of little-endian `bytes`.
-fn keccak<F: Field>(
-    rows: &mut Vec<KeccakRow<F>>,
-    squeeze_digests: &mut Vec<[F; NUM_WORDS_TO_SQUEEZE]>,
-    bytes: &[u8],
-    parameters: KeccakConfigParams,
-) {
-    let k = parameters.k;
-    let num_rows_per_round = parameters.rows_per_round;
-
-    let mut bits = into_bits(bytes);
-    let mut s = [[F::ZERO; 5]; 5];
-    let absorb_positions = get_absorb_positions();
-    let num_bytes_in_last_block = bytes.len() % RATE;
-    let two = F::from(2u64);
-
-    // Padding
-    bits.push(1);
-    while (bits.len() + 1) % RATE_IN_BITS != 0 {
-        bits.push(0);
-    }
-    bits.push(1);
-
-    // running length of absorbed input in bytes
-    let mut length = 0;
-    let chunks = bits.chunks(RATE_IN_BITS);
-    let num_chunks = chunks.len();
-
-    let mut cell_managers = Vec::with_capacity(NUM_ROUNDS + 1);
-    let mut regions = Vec::with_capacity(NUM_ROUNDS + 1);
-    // keeps track of running lengths over all rounds in an absorb step
-    let mut round_lengths = Vec::with_capacity(NUM_ROUNDS + 1);
-    let mut hash_words = [F::ZERO; NUM_WORDS_TO_SQUEEZE];
-    let mut hash = Word::default();
-
-    for (idx, chunk) in chunks.enumerate() {
-        let is_final_block = idx == num_chunks - 1;
-
-        let mut absorb_rows = Vec::new();
-        // Absorb
-        for (idx, &(i, j)) in absorb_positions.iter().enumerate() {
-            let absorb = pack(&chunk[idx * 64..(idx + 1) * 64]);
-            let from = s[i][j];
-            s[i][j] = field_xor(s[i][j], absorb);
-            absorb_rows.push(AbsorbData { from, absorb, result: s[i][j] });
-        }
-
-        // better memory management to clear already allocated Vecs
-        cell_managers.clear();
-        regions.clear();
-        round_lengths.clear();
-
-        for round in 0..NUM_ROUNDS + 1 {
-            let mut cell_manager = CellManager::new(num_rows_per_round);
-            let mut region = KeccakRegion::new();
-
-            let mut absorb_row = AbsorbData::default();
-            if round < NUM_WORDS_TO_ABSORB {
-                absorb_row = absorb_rows[round].clone();
-            }
-
-            // State data
-            for s in &s {
-                for s in s {
-                    let cell = cell_manager.query_cell_value();
-                    cell.assign(&mut region, 0, *s);
-                }
-            }
-
-            // Absorb data
-            let absorb_from = cell_manager.query_cell_value();
-            let absorb_data = cell_manager.query_cell_value();
-            let absorb_result = cell_manager.query_cell_value();
-            absorb_from.assign(&mut region, 0, absorb_row.from);
-            absorb_data.assign(&mut region, 0, absorb_row.absorb);
-            absorb_result.assign(&mut region, 0, absorb_row.result);
-
-            // Absorb
-            cell_manager.start_region();
-            let part_size = get_num_bits_per_absorb_lookup(k);
-            let input = absorb_row.from + absorb_row.absorb;
-            let absorb_fat =
-                split::value(&mut cell_manager, &mut region, input, 0, part_size, false, None);
-            cell_manager.start_region();
-            let _absorb_result = transform::value(
-                &mut cell_manager,
-                &mut region,
-                absorb_fat.clone(),
-                true,
-                |v| v & 1,
-                true,
-            );
-
-            // Padding
-            cell_manager.start_region();
-            // Unpack a single word into bytes (for the absorption)
-            // Potential optimization: could do multiple bytes per lookup
-            let packed =
-                split::value(&mut cell_manager, &mut region, absorb_row.absorb, 0, 8, false, None);
-            cell_manager.start_region();
-            let input_bytes =
-                transform::value(&mut cell_manager, &mut region, packed, false, |v| *v, true);
-            cell_manager.start_region();
-            let is_paddings =
-                input_bytes.iter().map(|_| cell_manager.query_cell_value()).collect::<Vec<_>>();
-            debug_assert_eq!(is_paddings.len(), NUM_BYTES_PER_WORD);
-            if round < NUM_WORDS_TO_ABSORB {
-                for (padding_idx, is_padding) in is_paddings.iter().enumerate() {
-                    let byte_idx = round * NUM_BYTES_PER_WORD + padding_idx;
-                    let padding = if is_final_block && byte_idx >= num_bytes_in_last_block {
-                        true
-                    } else {
-                        length += 1;
-                        false
-                    };
-                    is_padding.assign(&mut region, 0, F::from(padding));
-                }
-            }
-            cell_manager.start_region();
-
-            if round != NUM_ROUNDS {
-                // Theta
-                let part_size = get_num_bits_per_theta_c_lookup(k);
-                let mut bcf = Vec::new();
-                for s in &s {
-                    let c = s[0] + s[1] + s[2] + s[3] + s[4];
-                    let bc_fat =
-                        split::value(&mut cell_manager, &mut region, c, 1, part_size, false, None);
-                    bcf.push(bc_fat);
-                }
-                cell_manager.start_region();
-                let mut bc = Vec::new();
-                for bc_fat in bcf {
-                    let bc_norm = transform::value(
-                        &mut cell_manager,
-                        &mut region,
-                        bc_fat.clone(),
-                        true,
-                        |v| v & 1,
-                        true,
-                    );
-                    bc.push(bc_norm);
-                }
-                cell_manager.start_region();
-                let mut os = [[F::ZERO; 5]; 5];
-                for i in 0..5 {
-                    let t = decode::value(bc[(i + 4) % 5].clone())
-                        + decode::value(rotate(bc[(i + 1) % 5].clone(), 1, part_size));
-                    for j in 0..5 {
-                        os[i][j] = s[i][j] + t;
-                    }
-                }
-                s = os;
-                cell_manager.start_region();
-
-                // Rho/Pi
-                let part_size = get_num_bits_per_base_chi_lookup(k);
-                let target_word_sizes = target_part_sizes(part_size);
-                let num_word_parts = target_word_sizes.len();
-                let mut rho_pi_chi_cells: [[[Vec<Cell<F>>; 5]; 5]; 3] =
-                    array_init::array_init(|_| {
-                        array_init::array_init(|_| array_init::array_init(|_| Vec::new()))
-                    });
-                let mut column_starts = [0usize; 3];
-                for p in 0..3 {
-                    column_starts[p] = cell_manager.start_region();
-                    let mut row_idx = 0;
-                    for j in 0..5 {
-                        for _ in 0..num_word_parts {
-                            for i in 0..5 {
-                                rho_pi_chi_cells[p][i][j]
-                                    .push(cell_manager.query_cell_value_at_row(row_idx as i32));
-                            }
-                            row_idx = (row_idx + 1) % num_rows_per_round;
-                        }
-                    }
-                }
-                cell_manager.start_region();
-                let mut os_parts: [[Vec<PartValue<F>>; 5]; 5] =
-                    array_init::array_init(|_| array_init::array_init(|_| Vec::new()));
-                for (j, os_part) in os_parts.iter_mut().enumerate() {
-                    for i in 0..5 {
-                        let s_parts = split_uniform::value(
-                            &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5],
-                            &mut cell_manager,
-                            &mut region,
-                            s[i][j],
-                            RHO_MATRIX[i][j],
-                            part_size,
-                            true,
-                        );
-
-                        let s_parts = transform_to::value(
-                            &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5],
-                            &mut region,
-                            s_parts.clone(),
-                            true,
-                            |v| v & 1,
-                        );
-                        os_part[(2 * i + 3 * j) % 5] = s_parts.clone();
-                    }
-                }
-                cell_manager.start_region();
-
-                // Chi
-                let part_size_base = get_num_bits_per_base_chi_lookup(k);
-                let three_packed = pack::<F>(&vec![3u8; part_size_base]);
-                let mut os = [[F::ZERO; 5]; 5];
-                for j in 0..5 {
-                    for i in 0..5 {
-                        let mut s_parts = Vec::new();
-                        for ((part_a, part_b), part_c) in os_parts[i][j]
-                            .iter()
-                            .zip(os_parts[(i + 1) % 5][j].iter())
-                            .zip(os_parts[(i + 2) % 5][j].iter())
-                        {
-                            let value =
-                                three_packed - two * part_a.value + part_b.value - part_c.value;
-                            s_parts.push(PartValue {
-                                num_bits: part_size_base,
-                                rot: j as i32,
-                                value,
-                            });
-                        }
-                        os[i][j] = decode::value(transform_to::value(
-                            &rho_pi_chi_cells[2][i][j],
-                            &mut region,
-                            s_parts.clone(),
-                            true,
-                            |v| CHI_BASE_LOOKUP_TABLE[*v as usize],
-                        ));
-                    }
-                }
-                s = os;
-                cell_manager.start_region();
-
-                // iota
-                let part_size = get_num_bits_per_absorb_lookup(k);
-                let input = s[0][0] + pack_u64::<F>(ROUND_CST[round]);
-                let iota_parts = split::value::<F>(
-                    &mut cell_manager,
-                    &mut region,
-                    input,
-                    0,
-                    part_size,
-                    false,
-                    None,
-                );
-                cell_manager.start_region();
-                s[0][0] = decode::value(transform::value(
-                    &mut cell_manager,
-                    &mut region,
-                    iota_parts.clone(),
-                    true,
-                    |v| v & 1,
-                    true,
-                ));
-            }
-
-            // Assign the hash result
-            let is_final = is_final_block && round == NUM_ROUNDS;
-            hash = if is_final {
-                let hash_bytes_le = s
-                    .into_iter()
-                    .take(4)
-                    .flat_map(|a| to_bytes::value(&unpack(a[0])))
-                    .rev()
-                    .collect::<Vec<_>>();
-
-                let word: Word<Value<F>> =
-                    Word::from(eth_types::Word::from_little_endian(hash_bytes_le.as_slice()))
-                        .map(Value::known);
-                word
-            } else {
-                Word::default().into_value()
-            };
-
-            // The words to squeeze out: this is the hash digest as words with
-            // NUM_BYTES_PER_WORD (=8) bytes each
-            for (hash_word, a) in hash_words.iter_mut().zip(s.iter()) {
-                *hash_word = a[0];
-            }
-
-            round_lengths.push(length);
-
-            cell_managers.push(cell_manager);
-            regions.push(region);
-        }
-
-        // Now that we know the state at the end of the rounds, set the squeeze data
-        let num_rounds = cell_managers.len();
-        for (idx, word) in hash_words.iter().enumerate() {
-            let cell_manager = &mut cell_managers[num_rounds - 2 - idx];
-            let region = &mut regions[num_rounds - 2 - idx];
-
-            cell_manager.start_region();
-            let squeeze_packed = cell_manager.query_cell_value();
-            squeeze_packed.assign(region, 0, *word);
-
-            cell_manager.start_region();
-            let packed = split::value(cell_manager, region, *word, 0, 8, false, None);
-            cell_manager.start_region();
-            transform::value(cell_manager, region, packed, false, |v| *v, true);
-        }
-        squeeze_digests.push(hash_words);
-
-        for round in 0..NUM_ROUNDS + 1 {
-            let round_cst = pack_u64(ROUND_CST[round]);
-
-            for row_idx in 0..num_rows_per_round {
-                let word_value = if round < NUM_WORDS_TO_ABSORB && row_idx == 0 {
-                    let byte_idx = (idx * NUM_WORDS_TO_ABSORB + round) * NUM_BYTES_PER_WORD;
-                    if byte_idx >= bytes.len() {
-                        0
-                    } else {
-                        let end = std::cmp::min(byte_idx + NUM_BYTES_PER_WORD, bytes.len());
-                        let mut word_bytes = bytes[byte_idx..end].to_vec().clone();
-                        word_bytes.resize(NUM_BYTES_PER_WORD, 0);
-                        u64::from_le_bytes(word_bytes.try_into().unwrap())
-                    }
-                } else {
-                    0
-                };
-                let byte_idx = if round < NUM_WORDS_TO_ABSORB {
-                    round * NUM_BYTES_PER_WORD + std::cmp::min(row_idx, NUM_BYTES_PER_WORD - 1)
-                } else {
-                    NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD
-                } + idx * NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD;
-                let bytes_left = if byte_idx >= bytes.len() { 0 } else { bytes.len() - byte_idx };
-                rows.push(KeccakRow {
-                    q_enable: row_idx == 0,
-                    q_round: row_idx == 0 && round < NUM_ROUNDS,
-                    q_absorb: row_idx == 0 && round == NUM_ROUNDS,
-                    q_round_last: row_idx == 0 && round == NUM_ROUNDS,
-                    q_input: row_idx == 0 && round < NUM_WORDS_TO_ABSORB,
-                    q_input_last: row_idx == 0 && round == NUM_WORDS_TO_ABSORB - 1,
-                    round_cst,
-                    is_final: is_final_block && round == NUM_ROUNDS && row_idx == 0,
-                    cell_values: regions[round].rows.get(row_idx).unwrap_or(&vec![]).clone(),
-                    hash,
-                    bytes_left: F::from_u128(bytes_left as u128),
-                    word_value: F::from_u128(word_value as u128),
-                });
-                #[cfg(debug_assertions)]
-                {
-                    let mut r = rows.last().unwrap().clone();
-                    r.cell_values.clear();
-                    log::trace!("offset {:?} row idx {} row {:?}", rows.len() - 1, row_idx, r);
-                }
-            }
-            log::trace!(" = = = = = = round {} end", round);
-        }
-        log::trace!(" ====================== chunk {} end", idx);
-    }
-
-    #[cfg(debug_assertions)]
-    {
-        let hash_bytes = s
-            .into_iter()
-            .take(4)
-            .map(|a| {
-                pack_with_base::<F>(&unpack(a[0]), 2)
-                    .to_bytes_le()
-                    .into_iter()
-                    .take(8)
-                    .collect::<Vec<_>>()
-                    .to_vec()
-            })
-            .collect::<Vec<_>>();
-        debug!("hash: {:x?}", &(hash_bytes[0..4].concat()));
-        assert_eq!(length, bytes.len());
-    }
-}
-
-/// Witness generation for multiple keccak hashes of little-endian `bytes`.
-pub fn multi_keccak<F: Field>(
-    bytes: &[Vec<u8>],
-    capacity: Option<usize>,
-    parameters: KeccakConfigParams,
-) -> (Vec<KeccakRow<F>>, Vec<[F; NUM_WORDS_TO_SQUEEZE]>) {
-    let num_rows_per_round = parameters.rows_per_round;
-    let mut rows =
-        Vec::with_capacity((1 + capacity.unwrap_or(0) * (NUM_ROUNDS + 1)) * num_rows_per_round);
-    // Dummy first row so that the initial data is absorbed
-    // The initial data doesn't really matter, `is_final` just needs to be disabled.
-    rows.append(&mut KeccakRow::dummy_rows(num_rows_per_round));
-    // Actual keccaks
-    let artifacts = bytes
-        .par_iter()
-        .map(|bytes| {
-            let num_keccak_f = get_num_keccak_f(bytes.len());
-            let mut squeeze_digests = Vec::with_capacity(num_keccak_f);
-            let mut rows = Vec::with_capacity(num_keccak_f * (NUM_ROUNDS + 1) * num_rows_per_round);
-            keccak(&mut rows, &mut squeeze_digests, bytes, parameters);
-            (rows, squeeze_digests)
-        })
-        .collect::<Vec<_>>();
-
-    let mut squeeze_digests = Vec::with_capacity(capacity.unwrap_or(0));
-    for (rows_part, squeezes) in artifacts {
-        rows.extend(rows_part);
-        squeeze_digests.extend(squeezes);
-    }
-
-    if let Some(capacity) = capacity {
-        // Pad with no data hashes to the expected capacity
-        while rows.len() < (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round {
-            keccak(&mut rows, &mut squeeze_digests, &[], parameters);
-        }
-        // Check that we are not over capacity
-        if rows.len() > (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round {
-            panic!("{:?}", Error::BoundsFailure);
-        }
-    }
-    (rows, squeeze_digests)
-}
+/// Module for coprocessor circuits.
+pub mod coprocessor;
+/// Module for Keccak circuits in vanilla halo2.
+pub mod vanilla;
diff --git a/hashes/zkevm/src/keccak/cell_manager.rs b/hashes/zkevm/src/keccak/vanilla/cell_manager.rs
similarity index 100%
rename from hashes/zkevm/src/keccak/cell_manager.rs
rename to hashes/zkevm/src/keccak/vanilla/cell_manager.rs
diff --git a/hashes/zkevm/src/keccak/keccak_packed_multi.rs b/hashes/zkevm/src/keccak/vanilla/keccak_packed_multi.rs
similarity index 96%
rename from hashes/zkevm/src/keccak/keccak_packed_multi.rs
rename to hashes/zkevm/src/keccak/vanilla/keccak_packed_multi.rs
index 1b9b005d..5a76d248 100644
--- a/hashes/zkevm/src/keccak/keccak_packed_multi.rs
+++ b/hashes/zkevm/src/keccak/vanilla/keccak_packed_multi.rs
@@ -148,16 +148,17 @@ pub struct KeccakTable {
 impl KeccakTable {
     /// Construct a new KeccakTable
     pub fn construct<F: Field>(meta: &mut ConstraintSystem<F>) -> Self {
-        let input_len = meta.advice_column();
+        let is_enabled = meta.advice_column();
         let word_value = meta.advice_column();
         let bytes_left = meta.advice_column();
-        meta.enable_equality(input_len);
-        Self {
-            is_enabled: meta.advice_column(),
-            output: Word::new([meta.advice_column(), meta.advice_column()]),
-            word_value,
-            bytes_left,
-        }
+        let hash_lo = meta.advice_column();
+        let hash_hi = meta.advice_column();
+        meta.enable_equality(is_enabled);
+        meta.enable_equality(word_value);
+        meta.enable_equality(bytes_left);
+        meta.enable_equality(hash_lo);
+        meta.enable_equality(hash_hi);
+        Self { is_enabled, output: Word::new([hash_lo, hash_hi]), word_value, bytes_left }
     }
 }
 
@@ -166,7 +167,7 @@ pub(crate) type KeccakAssignedValue<'v, F> = Halo2AssignedCell<'v, F>;
 /// Recombines parts back together
 pub(crate) mod decode {
     use super::{Expr, Part, PartValue, PrimeField};
-    use crate::{halo2_proofs::plonk::Expression, keccak::param::*};
+    use crate::{halo2_proofs::plonk::Expression, keccak::vanilla::param::*};
 
     pub(crate) fn expr<F: PrimeField>(parts: Vec<Part<F>>) -> Expression<F> {
         parts.iter().rev().fold(0.expr(), |acc, part| {
@@ -189,7 +190,7 @@ pub(crate) mod split {
     };
     use crate::{
         halo2_proofs::plonk::{ConstraintSystem, Expression},
-        keccak::util::{pack, pack_part, unpack, WordParts},
+        keccak::vanilla::util::{pack, pack_part, unpack, WordParts},
     };
 
     #[allow(clippy::too_many_arguments)]
@@ -260,7 +261,7 @@ pub(crate) mod split_uniform {
     use super::decode;
     use crate::{
         halo2_proofs::plonk::{ConstraintSystem, Expression},
-        keccak::{
+        keccak::vanilla::{
             param::*,
             target_part_sizes,
             util::{pack, pack_part, rotate, rotate_rev, unpack, WordParts},
@@ -492,9 +493,9 @@ pub(crate) mod transform {
 pub(crate) mod transform_to {
     use crate::{
         halo2_proofs::plonk::{ConstraintSystem, TableColumn},
-        keccak::{
+        keccak::vanilla::{
             util::{pack, to_bytes, unpack},
-            {Cell, Expr, Field, KeccakRegion, Part, PartValue, PrimeField},
+            Cell, Expr, Field, KeccakRegion, Part, PartValue, PrimeField,
         },
     };
 
diff --git a/hashes/zkevm/src/keccak/vanilla/mod.rs b/hashes/zkevm/src/keccak/vanilla/mod.rs
new file mode 100644
index 00000000..90c461a4
--- /dev/null
+++ b/hashes/zkevm/src/keccak/vanilla/mod.rs
@@ -0,0 +1,883 @@
+use self::{cell_manager::*, keccak_packed_multi::*, param::*, table::*, util::*};
+use crate::{
+    halo2_proofs::{
+        circuit::{Layouter, Region, Value},
+        halo2curves::ff::PrimeField,
+        plonk::{Column, ConstraintSystem, Error, Expression, Fixed, TableColumn, VirtualCells},
+        poly::Rotation,
+    },
+    util::{
+        constraint_builder::BaseConstraintBuilder,
+        eth_types::{self, Field},
+        expression::{and, from_bytes, not, select, sum, Expr},
+        word::{self, Word, WordExpr},
+    },
+};
+use halo2_base::utils::halo2::{raw_assign_advice, raw_assign_fixed};
+use itertools::Itertools;
+use log::{debug, info};
+use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
+use std::marker::PhantomData;
+
+pub mod cell_manager;
+pub mod keccak_packed_multi;
+pub mod param;
+pub mod table;
+#[cfg(test)]
+mod tests;
+pub mod util;
+/// Module for witness generation.
+pub mod witness;
+
+/// Configuration parameters to define [`KeccakCircuitConfig`]
+#[derive(Copy, Clone, Debug, Default)]
+pub struct KeccakConfigParams {
+    /// The circuit degree, i.e., circuit has 2<sup>k</sup> rows
+    pub k: u32,
+    /// The number of rows to use for each round in the keccak_f permutation
+    pub rows_per_round: usize,
+}
+
+/// Columns, lookup tables and parameters that make up the Keccak circuit configuration.
+#[derive(Clone, Debug)]
+pub struct KeccakCircuitConfig<F> {
+    // Bool. True on 1st row of each round.
+    q_enable: Column<Fixed>,
+    // Bool. True on 1st row.
+    q_first: Column<Fixed>,
+    // Bool. True on 1st row of all rounds except last rounds.
+    q_round: Column<Fixed>,
+    // Bool. True on 1st row of last rounds.
+    q_absorb: Column<Fixed>,
+    // Bool. True on 1st row of last rounds.
+    q_round_last: Column<Fixed>,
+    // Bool. True on 1st row of rounds which might contain inputs.
+    // Note: first NUM_WORDS_TO_ABSORB rounds of each chunk might contain inputs.
+    // It "might" contain inputs because it's possible that a round contains only padding.
+    q_input: Column<Fixed>,
+    // Bool. True on 1st row of the last input round of each chunk.
+    q_input_last: Column<Fixed>,
+
+    pub keccak_table: KeccakTable,
+
+    cell_manager: CellManager<F>,
+    round_cst: Column<Fixed>,
+    normalize_3: [TableColumn; 2],
+    normalize_4: [TableColumn; 2],
+    normalize_6: [TableColumn; 2],
+    chi_base_table: [TableColumn; 2],
+    pack_table: [TableColumn; 2],
+
+    // config parameters for convenience
+    pub parameters: KeccakConfigParams,
+
+    _marker: PhantomData<F>,
+}
+
+impl<F: Field> KeccakCircuitConfig<F> {
+    /// Return a new KeccakCircuitConfig
+    pub fn new(meta: &mut ConstraintSystem<F>, parameters: KeccakConfigParams) -> Self {
+        let k = parameters.k;
+        let num_rows_per_round = parameters.rows_per_round;
+
+        let q_enable = meta.fixed_column();
+        let q_first = meta.fixed_column();
+        let q_round = meta.fixed_column();
+        let q_absorb = meta.fixed_column();
+        let q_round_last = meta.fixed_column();
+        let q_input = meta.fixed_column();
+        let q_input_last = meta.fixed_column();
+        let round_cst = meta.fixed_column();
+        let keccak_table = KeccakTable::construct(meta);
+
+        let is_final = keccak_table.is_enabled;
+        let hash_word = keccak_table.output;
+
+        let normalize_3 = array_init::array_init(|_| meta.lookup_table_column());
+        let normalize_4 = array_init::array_init(|_| meta.lookup_table_column());
+        let normalize_6 = array_init::array_init(|_| meta.lookup_table_column());
+        let chi_base_table = array_init::array_init(|_| meta.lookup_table_column());
+        let pack_table = array_init::array_init(|_| meta.lookup_table_column());
+
+        let mut cell_manager = CellManager::new(num_rows_per_round);
+        let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+        let mut total_lookup_counter = 0;
+
+        let start_new_hash = |meta: &mut VirtualCells<F>, rot| {
+            // A new hash is started when the previous hash is done or on the first row
+            meta.query_fixed(q_first, rot) + meta.query_advice(is_final, rot)
+        };
+
+        // Round constant
+        let mut round_cst_expr = 0.expr();
+        meta.create_gate("Query round cst", |meta| {
+            round_cst_expr = meta.query_fixed(round_cst, Rotation::cur());
+            vec![0u64.expr()]
+        });
+        // State data
+        let mut s = vec![vec![0u64.expr(); 5]; 5];
+        let mut s_next = vec![vec![0u64.expr(); 5]; 5];
+        for i in 0..5 {
+            for j in 0..5 {
+                let cell = cell_manager.query_cell(meta);
+                s[i][j] = cell.expr();
+                s_next[i][j] = cell.at_offset(meta, num_rows_per_round as i32).expr();
+            }
+        }
+        // Absorb data
+        let absorb_from = cell_manager.query_cell(meta);
+        let absorb_data = cell_manager.query_cell(meta);
+        let absorb_result = cell_manager.query_cell(meta);
+        let mut absorb_from_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
+        let mut absorb_data_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
+        let mut absorb_result_next = vec![0u64.expr(); NUM_WORDS_TO_ABSORB];
+        for i in 0..NUM_WORDS_TO_ABSORB {
+            let rot = ((i + 1) * num_rows_per_round) as i32;
+            absorb_from_next[i] = absorb_from.at_offset(meta, rot).expr();
+            absorb_data_next[i] = absorb_data.at_offset(meta, rot).expr();
+            absorb_result_next[i] = absorb_result.at_offset(meta, rot).expr();
+        }
+
+        // Store the pre-state
+        let pre_s = s.clone();
+
+        // Absorb
+        // The absorption happening at the start of the 24 rounds is done spread out
+        // over those 24 rounds. In a single round (in 17 of the 24 rounds) a
+        // single word is absorbed so the work is spread out. The absorption is
+        // done simply by doing state + data and then normalizing the result to [0,1].
+        // We also need to convert the input data into bytes to calculate the input data
+        // rlc.
+        cell_manager.start_region();
+        let mut lookup_counter = 0;
+        let part_size = get_num_bits_per_absorb_lookup(k);
+        let input = absorb_from.expr() + absorb_data.expr();
+        let absorb_fat =
+            split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None);
+        cell_manager.start_region();
+        let absorb_res = transform::expr(
+            "absorb",
+            meta,
+            &mut cell_manager,
+            &mut lookup_counter,
+            absorb_fat,
+            normalize_3,
+            true,
+        );
+        cb.require_equal("absorb result", decode::expr(absorb_res), absorb_result.expr());
+        info!("- Post absorb:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        // Squeeze
+        // The squeezing happening at the end of the 24 rounds is done spread out
+        // over those 24 rounds. In a single round (in 4 of the 24 rounds) a
+        // single word is converted to bytes.
+        cell_manager.start_region();
+        let mut lookup_counter = 0;
+        // Potential optimization: could do multiple bytes per lookup
+        let packed_parts =
+            split::expr(meta, &mut cell_manager, &mut cb, absorb_data.expr(), 0, 8, false, None);
+        cell_manager.start_region();
+        // input_bytes.len() = packed_parts.len() = 64 / 8 = 8 = NUM_BYTES_PER_WORD
+        let input_bytes = transform::expr(
+            "squeeze unpack",
+            meta,
+            &mut cell_manager,
+            &mut lookup_counter,
+            packed_parts,
+            pack_table.into_iter().rev().collect::<Vec<_>>().try_into().unwrap(),
+            true,
+        );
+        debug_assert_eq!(input_bytes.len(), NUM_BYTES_PER_WORD);
+
+        // Padding data
+        cell_manager.start_region();
+        let is_paddings = input_bytes.iter().map(|_| cell_manager.query_cell(meta)).collect_vec();
+        info!("- Post padding:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        // Theta
+        // Calculate
+        // - `c[i] = s[i][0] + s[i][1] + s[i][2] + s[i][3] + s[i][4]`
+        // - `bc[i] = normalize(c[i])`.
+        // - `t[i] = bc[(i + 4) % 5] + rot(bc[(i + 1) % 5], 1)`
+        // This is done by splitting the bc values in parts in a way
+        // that allows us to also calculate the rotated value "for free".
+        cell_manager.start_region();
+        let mut lookup_counter = 0;
+        let part_size_c = get_num_bits_per_theta_c_lookup(k);
+        let mut c_parts = Vec::new();
+        for s in s.iter() {
+            // Calculate c and split into parts
+            let c = s[0].clone() + s[1].clone() + s[2].clone() + s[3].clone() + s[4].clone();
+            c_parts.push(split::expr(
+                meta,
+                &mut cell_manager,
+                &mut cb,
+                c,
+                1,
+                part_size_c,
+                false,
+                None,
+            ));
+        }
+        // Now calculate `bc` by normalizing `c`
+        cell_manager.start_region();
+        let mut bc = Vec::new();
+        for c in c_parts {
+            // Normalize c
+            bc.push(transform::expr(
+                "theta c",
+                meta,
+                &mut cell_manager,
+                &mut lookup_counter,
+                c,
+                normalize_6,
+                true,
+            ));
+        }
+        // Now do `bc[(i + 4) % 5] + rot(bc[(i + 1) % 5], 1)` using just expressions.
+        // We don't normalize the result here. We do it as part of the rho/pi step, even
+        // though we would only have to normalize 5 values instead of 25, because of the
+        // way the rho/pi and chi steps can be combined it's more efficient to
+        // do it there (the max value for chi is 4 already so that's the
+        // limiting factor).
+        let mut os = vec![vec![0u64.expr(); 5]; 5];
+        for i in 0..5 {
+            let t = decode::expr(bc[(i + 4) % 5].clone())
+                + decode::expr(rotate(bc[(i + 1) % 5].clone(), 1, part_size_c));
+            for j in 0..5 {
+                os[i][j] = s[i][j].clone() + t.clone();
+            }
+        }
+        s = os.clone();
+        info!("- Post theta:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        // Rho/Pi
+        // For the rotation of rho/pi we split up the words like expected, but in a way
+        // that allows reusing the same parts in an optimal way for the chi step.
+        // We can save quite a few columns by not recombining the parts after rho/pi and
+        // re-splitting the words again before chi. Instead we do chi directly
+        // on the output parts of rho/pi. For rho/pi specifically we do
+        // `s[j][(2 * i + 3 * j) % 5] = normalize(rot(s[i][j], RHO_MATRIX[i][j]))`.
+        cell_manager.start_region();
+        let mut lookup_counter = 0;
+        let part_size = get_num_bits_per_base_chi_lookup(k);
+        // To combine the rho/pi/chi steps we have to ensure a specific layout so
+        // query those cells here first.
+        // For chi we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) & s[(i+2)%5][j])`. `j`
+        // remains static but `i` is accessed in a wrap around manner. To do this using
+        // multiple rows with lookups in a way that doesn't require any
+        // extra additional cells or selectors we have to put all `s[i]`'s on the same
+        // row. This isn't that strong of a requirement actually because the
+        // words are split into multiple parts, and so only the parts at the same
+        // position of those words need to be on the same row.
+        let target_word_sizes = target_part_sizes(part_size);
+        let num_word_parts = target_word_sizes.len();
+        let mut rho_pi_chi_cells: [[[Vec<Cell<F>>; 5]; 5]; 3] = array_init::array_init(|_| {
+            array_init::array_init(|_| array_init::array_init(|_| Vec::new()))
+        });
+        let mut num_columns = 0;
+        let mut column_starts = [0usize; 3];
+        for p in 0..3 {
+            column_starts[p] = cell_manager.start_region();
+            let mut row_idx = 0;
+            num_columns = 0;
+            for j in 0..5 {
+                for _ in 0..num_word_parts {
+                    for i in 0..5 {
+                        rho_pi_chi_cells[p][i][j]
+                            .push(cell_manager.query_cell_at_row(meta, row_idx));
+                    }
+                    if row_idx == 0 {
+                        num_columns += 1;
+                    }
+                    row_idx = (((row_idx as usize) + 1) % num_rows_per_round) as i32;
+                }
+            }
+        }
+        // Do the transformation, resulting in the word parts also being normalized.
+        let pi_region_start = cell_manager.start_region();
+        let mut os_parts = vec![vec![Vec::new(); 5]; 5];
+        for (j, os_part) in os_parts.iter_mut().enumerate() {
+            for i in 0..5 {
+                // Split s into parts
+                let s_parts = split_uniform::expr(
+                    meta,
+                    &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5],
+                    &mut cell_manager,
+                    &mut cb,
+                    s[i][j].clone(),
+                    RHO_MATRIX[i][j],
+                    part_size,
+                    true,
+                );
+                // Normalize the data to the target cells
+                let s_parts = transform_to::expr(
+                    "rho/pi",
+                    meta,
+                    &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5],
+                    &mut lookup_counter,
+                    s_parts.clone(),
+                    normalize_4,
+                    true,
+                );
+                os_part[(2 * i + 3 * j) % 5] = s_parts.clone();
+            }
+        }
+        let pi_region_end = cell_manager.start_region();
+        // Pi parts range checks
+        // To make the uniform stuff work we had to combine some parts together
+        // in new cells (see split_uniform). Here we make sure those parts are range
+        // checked. Potential improvement: Could combine multiple smaller parts
+        // in a single lookup but doesn't save that much.
+        for c in pi_region_start..pi_region_end {
+            meta.lookup("pi part range check", |_| {
+                vec![(cell_manager.columns()[c].expr.clone(), normalize_4[0])]
+            });
+            lookup_counter += 1;
+        }
+        info!("- Post rho/pi:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        // Chi
+        // In groups of 5 columns, we have to do `s[i][j] ^ ((~s[(i+1)%5][j]) &
+        // s[(i+2)%5][j])` five times, on each row (no selector needed).
+        // This is calculated by making use of `CHI_BASE_LOOKUP_TABLE`.
+        let mut lookup_counter = 0;
+        let part_size_base = get_num_bits_per_base_chi_lookup(k);
+        for idx in 0..num_columns {
+            // First fetch the cells we want to use
+            let mut input: [Expression<F>; 5] = array_init::array_init(|_| 0.expr());
+            let mut output: [Expression<F>; 5] = array_init::array_init(|_| 0.expr());
+            for c in 0..5 {
+                input[c] = cell_manager.columns()[column_starts[1] + idx * 5 + c].expr.clone();
+                output[c] = cell_manager.columns()[column_starts[2] + idx * 5 + c].expr.clone();
+            }
+            // Now calculate `a ^ ((~b) & c)` by doing `lookup[3 - 2*a + b - c]`
+            for i in 0..5 {
+                let input = scatter::expr(3, part_size_base) - 2.expr() * input[i].clone()
+                    + input[(i + 1) % 5].clone()
+                    - input[(i + 2) % 5].clone();
+                let output = output[i].clone();
+                meta.lookup("chi base", |_| {
+                    vec![(input.clone(), chi_base_table[0]), (output.clone(), chi_base_table[1])]
+                });
+                lookup_counter += 1;
+            }
+        }
+        // Now just decode the parts after the chi transformation done with the lookups
+        // above.
+        let mut os = vec![vec![0u64.expr(); 5]; 5];
+        for (i, os) in os.iter_mut().enumerate() {
+            for (j, os) in os.iter_mut().enumerate() {
+                let mut parts = Vec::new();
+                for idx in 0..num_word_parts {
+                    parts.push(Part {
+                        num_bits: part_size_base,
+                        cell: rho_pi_chi_cells[2][i][j][idx].clone(),
+                        expr: rho_pi_chi_cells[2][i][j][idx].expr(),
+                    });
+                }
+                *os = decode::expr(parts);
+            }
+        }
+        s = os.clone();
+
+        // iota
+        // Simply do the single xor on state [0][0].
+        cell_manager.start_region();
+        let part_size = get_num_bits_per_absorb_lookup(k);
+        let input = s[0][0].clone() + round_cst_expr.clone();
+        let iota_parts =
+            split::expr(meta, &mut cell_manager, &mut cb, input, 0, part_size, false, None);
+        cell_manager.start_region();
+        // Could share columns with absorb which may end up using 1 lookup/column
+        // fewer...
+        s[0][0] = decode::expr(transform::expr(
+            "iota",
+            meta,
+            &mut cell_manager,
+            &mut lookup_counter,
+            iota_parts,
+            normalize_3,
+            true,
+        ));
+        // Final results stored in the next row
+        for i in 0..5 {
+            for j in 0..5 {
+                cb.require_equal("next row check", s[i][j].clone(), s_next[i][j].clone());
+            }
+        }
+        info!("- Post chi:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        let mut lookup_counter = 0;
+        cell_manager.start_region();
+
+        // Squeeze data
+        let squeeze_from = cell_manager.query_cell(meta);
+        let mut squeeze_from_prev = vec![0u64.expr(); NUM_WORDS_TO_SQUEEZE];
+        for (idx, squeeze_from_prev) in squeeze_from_prev.iter_mut().enumerate() {
+            let rot = (-(idx as i32) - 1) * num_rows_per_round as i32;
+            *squeeze_from_prev = squeeze_from.at_offset(meta, rot).expr();
+        }
+        // Squeeze
+        // The squeeze happening at the end of the 24 rounds is done spread out
+        // over those 24 rounds. In a single round (in 4 of the 24 rounds) a
+        // single word is converted to bytes.
+        // Potential optimization: could do multiple bytes per lookup
+        cell_manager.start_region();
+        // Unpack a single word into bytes (for the squeeze)
+        // Potential optimization: could do multiple bytes per lookup
+        let squeeze_from_parts =
+            split::expr(meta, &mut cell_manager, &mut cb, squeeze_from.expr(), 0, 8, false, None);
+        cell_manager.start_region();
+        let squeeze_bytes = transform::expr(
+            "squeeze unpack",
+            meta,
+            &mut cell_manager,
+            &mut lookup_counter,
+            squeeze_from_parts,
+            pack_table.into_iter().rev().collect::<Vec<_>>().try_into().unwrap(),
+            true,
+        );
+        info!("- Post squeeze:");
+        info!("Lookups: {}", lookup_counter);
+        info!("Columns: {}", cell_manager.get_width());
+        total_lookup_counter += lookup_counter;
+
+        // The round constraints that we've been building up till now
+        meta.create_gate("round", |meta| cb.gate(meta.query_fixed(q_round, Rotation::cur())));
+
+        // Absorb
+        meta.create_gate("absorb", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            let continue_hash = not::expr(start_new_hash(meta, Rotation::cur()));
+            let absorb_positions = get_absorb_positions();
+            let mut a_slice = 0;
+            for j in 0..5 {
+                for i in 0..5 {
+                    if absorb_positions.contains(&(i, j)) {
+                        cb.condition(continue_hash.clone(), |cb| {
+                            cb.require_equal(
+                                "absorb verify input",
+                                absorb_from_next[a_slice].clone(),
+                                pre_s[i][j].clone(),
+                            );
+                        });
+                        cb.require_equal(
+                            "absorb result copy",
+                            select::expr(
+                                continue_hash.clone(),
+                                absorb_result_next[a_slice].clone(),
+                                absorb_data_next[a_slice].clone(),
+                            ),
+                            s_next[i][j].clone(),
+                        );
+                        a_slice += 1;
+                    } else {
+                        cb.require_equal(
+                            "absorb state copy",
+                            pre_s[i][j].clone() * continue_hash.clone(),
+                            s_next[i][j].clone(),
+                        );
+                    }
+                }
+            }
+            cb.gate(meta.query_fixed(q_absorb, Rotation::cur()))
+        });
+
+        // Collect the bytes that are spread out over previous rows
+        let mut hash_bytes = Vec::new();
+        for i in 0..NUM_WORDS_TO_SQUEEZE {
+            for byte in squeeze_bytes.iter() {
+                let rot = (-(i as i32) - 1) * num_rows_per_round as i32;
+                hash_bytes.push(byte.cell.at_offset(meta, rot).expr());
+            }
+        }
+
+        // Squeeze
+        meta.create_gate("squeeze", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            let start_new_hash = start_new_hash(meta, Rotation::cur());
+            // The words to squeeze
+            let hash_words: Vec<_> =
+                pre_s.into_iter().take(4).map(|a| a[0].clone()).take(4).collect();
+            // Verify if we converted the correct words to bytes on previous rows
+            for (idx, word) in hash_words.iter().enumerate() {
+                cb.condition(start_new_hash.clone(), |cb| {
+                    cb.require_equal(
+                        "squeeze verify packed",
+                        word.clone(),
+                        squeeze_from_prev[idx].clone(),
+                    );
+                });
+            }
+
+            let hash_bytes_le = hash_bytes.into_iter().rev().collect::<Vec<_>>();
+            cb.condition(start_new_hash, |cb| {
+                cb.require_equal_word(
+                    "output check",
+                    word::Word32::new(hash_bytes_le.try_into().expect("32 limbs")).to_word(),
+                    hash_word.map(|col| meta.query_advice(col, Rotation::cur())),
+                );
+            });
+            cb.gate(meta.query_fixed(q_round_last, Rotation::cur()))
+        });
+
+        // Some general input checks
+        meta.create_gate("input checks", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            cb.require_boolean("boolean is_final", meta.query_advice(is_final, Rotation::cur()));
+            cb.gate(meta.query_fixed(q_enable, Rotation::cur()))
+        });
+
+        // Enforce fixed values on the first row
+        meta.create_gate("first row", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            cb.require_zero(
+                "is_final needs to be disabled on the first row",
+                meta.query_advice(is_final, Rotation::cur()),
+            );
+            cb.gate(meta.query_fixed(q_first, Rotation::cur()))
+        });
+
+        // some utility query functions
+        let q = |col: Column<Fixed>, meta: &mut VirtualCells<'_, F>| {
+            meta.query_fixed(col, Rotation::cur())
+        };
+        /*
+        e.g.:
+            data:
+                get_num_rows_per_round: 18
+                input: "12345678abc"
+            table:
+                Note[1]: be careful: is_paddings is not column here! It is [Cell; 8] and it will be constrained later.
+                Note[2]: only first row of each round has constraints on bytes_left. This example just shows how witnesses are filled.
+        offset word_value bytes_left  is_paddings q_enable q_input_last
+        18     0x87654321    11          0         1        0 // 1st round begin
+        19        0          10          0         0        0
+        20        0          9           0         0        0
+        21        0          8           0         0        0
+        22        0          7           0         0        0
+        23        0          6           0         0        0
+        24        0          5           0         0        0
+        25        0          4           0         0        0
+        26        0          4           NA        0        0
+        ...
+        35        0          4           NA        0        0  // 1st round end
+        36      0xcba        3           0         1        1  // 2nd round begin
+        37        0          2           0         0        0
+        38        0          1           0         0        0
+        39        0          0           1         0        0
+        40        0          0           1         0        0
+        41        0          0           1         0        0
+        42        0          0           1         0        0
+        43        0          0           1         0        0
+        */
+
+        meta.create_gate("word_value", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            let masked_input_bytes = input_bytes
+                .iter()
+                .zip(is_paddings.clone())
+                .map(|(input_byte, is_padding)| {
+                    input_byte.expr.clone() * not::expr(is_padding.expr().clone())
+                })
+                .collect_vec();
+            let input_word = from_bytes::expr(&masked_input_bytes);
+            cb.require_equal(
+                "word value",
+                input_word,
+                meta.query_advice(keccak_table.word_value, Rotation::cur()),
+            );
+            cb.gate(q(q_input, meta))
+        });
+        meta.create_gate("bytes_left", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            let bytes_left_expr = meta.query_advice(keccak_table.bytes_left, Rotation::cur());
+
+            // bytes_left is 0 in the absolute first `rows_per_round` of the entire circuit, i.e., the first dummy round.
+            cb.condition(q(q_first, meta), |cb| {
+                cb.require_zero(
+                    "bytes_left needs to be zero on the absolute first dummy round",
+                    meta.query_advice(keccak_table.bytes_left, Rotation::cur()),
+                );
+            });
+            // is_final ==> bytes_left == 0.
+            // Note: is_final = true only in the last round, which doesn't have any data to absorb.
+            cb.condition(meta.query_advice(is_final, Rotation::cur()), |cb| {
+                cb.require_zero("bytes_left should be 0 when is_final", bytes_left_expr.clone());
+            });
+            //q_input[cur] ==> bytes_left[cur + num_rows_per_round] + word_len == bytes_left[cur]
+            cb.condition(q(q_input, meta), |cb| {
+                // word_len = NUM_BYTES_PER_WORD - sum(is_paddings)
+                let word_len = NUM_BYTES_PER_WORD.expr() - sum::expr(is_paddings.clone());
+                let bytes_left_next_expr =
+                    meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32));
+                cb.require_equal(
+                    "if there is a word in this round, bytes_left[curr + num_rows_per_round] + word_len == bytes_left[curr]",
+                    bytes_left_expr.clone(),
+                    bytes_left_next_expr + word_len,
+                );
+            });
+            // Logically here we want !q_input[cur] && !start_new_hash(cur) ==> bytes_left[cur + num_rows_per_round] == bytes_left[cur]
+            // In practice, in order to save a degree we use !(q_input[cur] ^ start_new_hash(cur)) ==> bytes_left[cur + num_rows_per_round] == bytes_left[cur]
+            // Because when both q_input[cur] and is_final in start_new_hash(cur) are true, is_final ==> bytes_left == 0 and this round must not be a final
+            // round because q_input[cur] == 1. Therefore bytes_left_next must be 0.
+            // Note: is_final could be true in rounds after the input rounds and before the last round, as long as the keccak_f is final.
+            cb.condition(not::expr(q(q_input, meta) + start_new_hash(meta, Rotation::cur())), |cb| {
+                let bytes_left_next_expr =
+                    meta.query_advice(keccak_table.bytes_left, Rotation(num_rows_per_round as i32));
+                cb.require_equal(
+                    "if no input and not starting new hash, bytes_left should keep the same",
+                    bytes_left_expr,
+                    bytes_left_next_expr,
+                );
+            });
+
+            cb.gate(q(q_enable, meta))
+        });
+
+        // Enforce logic for when this block is the last block for a hash
+        let last_is_padding_in_block = is_paddings.last().unwrap().at_offset(
+            meta,
+            -(((NUM_ROUNDS + 1 - NUM_WORDS_TO_ABSORB) * num_rows_per_round) as i32),
+        );
+        meta.create_gate("is final", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            // All absorb rows except the first row
+            cb.condition(
+                meta.query_fixed(q_absorb, Rotation::cur())
+                    - meta.query_fixed(q_first, Rotation::cur()),
+                |cb| {
+                    cb.require_equal(
+                        "is_final needs to be the same as the last is_padding in the block",
+                        meta.query_advice(is_final, Rotation::cur()),
+                        last_is_padding_in_block.expr(),
+                    );
+                },
+            );
+            // For all the rows of a round, only the first row can have `is_final == 1`.
+            cb.condition(
+                (1..num_rows_per_round as i32)
+                    .map(|i| meta.query_fixed(q_enable, Rotation(-i)))
+                    .fold(0.expr(), |acc, elem| acc + elem),
+                |cb| {
+                    cb.require_zero(
+                        "is_final only when q_enable",
+                        meta.query_advice(is_final, Rotation::cur()),
+                    );
+                },
+            );
+            cb.gate(1.expr())
+        });
+
+        // Padding
+        // May be cleaner to do this padding logic in the byte conversion lookup but
+        // currently easier to do it like this.
+        let prev_is_padding =
+            is_paddings.last().unwrap().at_offset(meta, -(num_rows_per_round as i32));
+        meta.create_gate("padding", |meta| {
+            let mut cb = BaseConstraintBuilder::new(MAX_DEGREE);
+            let q_input = meta.query_fixed(q_input, Rotation::cur());
+            let q_input_last = meta.query_fixed(q_input_last, Rotation::cur());
+
+            // All padding selectors need to be boolean
+            for is_padding in is_paddings.iter() {
+                cb.condition(meta.query_fixed(q_enable, Rotation::cur()), |cb| {
+                    cb.require_boolean("is_padding boolean", is_padding.expr());
+                });
+            }
+            // This last padding selector will be used on the first round row so needs to be
+            // zero
+            cb.condition(meta.query_fixed(q_absorb, Rotation::cur()), |cb| {
+                cb.require_zero(
+                    "last is_padding should be zero on absorb rows",
+                    is_paddings.last().unwrap().expr(),
+                );
+            });
+            // Now for each padding selector
+            for idx in 0..is_paddings.len() {
+                // Previous padding selector can be on the previous row
+                let is_padding_prev =
+                    if idx == 0 { prev_is_padding.expr() } else { is_paddings[idx - 1].expr() };
+                let is_first_padding = is_paddings[idx].expr() - is_padding_prev.clone();
+
+                // Check padding transition 0 -> 1 done only once
+                cb.condition(q_input.expr(), |cb| {
+                    cb.require_boolean("padding step boolean", is_first_padding.clone());
+                });
+
+                // Padding start/intermediate/end byte checks
+                if idx == is_paddings.len() - 1 {
+                    // These can be combined in the future, but currently this would increase the
+                    // degree by one.
+                    // Padding start/intermediate byte, all padding rows except the last one
+                    cb.condition(
+                        and::expr([q_input.expr() - q_input_last.expr(), is_paddings[idx].expr()]),
+                        |cb| {
+                            // Input bytes need to be zero, or one if this is the first padding byte
+                            cb.require_equal(
+                                "padding start/intermediate byte last byte",
+                                input_bytes[idx].expr.clone(),
+                                is_first_padding.expr(),
+                            );
+                        },
+                    );
+                    // Padding start/end byte, only on the last padding row
+                    cb.condition(and::expr([q_input_last.expr(), is_paddings[idx].expr()]), |cb| {
+                        // The input byte needs to be 128, unless it's also the first padding
+                        // byte then it's 129
+                        cb.require_equal(
+                            "padding start/end byte",
+                            input_bytes[idx].expr.clone(),
+                            is_first_padding.expr() + 128.expr(),
+                        );
+                    });
+                } else {
+                    // Padding start/intermediate byte
+                    cb.condition(and::expr([q_input.expr(), is_paddings[idx].expr()]), |cb| {
+                        // Input bytes need to be zero, or one if this is the first padding byte
+                        cb.require_equal(
+                            "padding start/intermediate byte",
+                            input_bytes[idx].expr.clone(),
+                            is_first_padding.expr(),
+                        );
+                    });
+                }
+            }
+            cb.gate(1.expr())
+        });
+
+        info!("Degree: {}", meta.degree());
+        info!("Minimum rows: {}", meta.minimum_rows());
+        info!("Total Lookups: {}", total_lookup_counter);
+        #[cfg(feature = "display")]
+        {
+            println!("Total Keccak Columns: {}", cell_manager.get_width());
+            std::env::set_var("KECCAK_ADVICE_COLUMNS", cell_manager.get_width().to_string());
+        }
+        #[cfg(not(feature = "display"))]
+        info!("Total Keccak Columns: {}", cell_manager.get_width());
+        info!("num unused cells: {}", cell_manager.get_num_unused_cells());
+        info!("part_size absorb: {}", get_num_bits_per_absorb_lookup(k));
+        info!("part_size theta: {}", get_num_bits_per_theta_c_lookup(k));
+        info!("part_size theta c: {}", get_num_bits_per_lookup(THETA_C_LOOKUP_RANGE, k));
+        info!("part_size theta t: {}", get_num_bits_per_lookup(4, k));
+        info!("part_size rho/pi: {}", get_num_bits_per_rho_pi_lookup(k));
+        info!("part_size chi base: {}", get_num_bits_per_base_chi_lookup(k));
+        info!("uniform part sizes: {:?}", target_part_sizes(get_num_bits_per_theta_c_lookup(k)));
+
+        KeccakCircuitConfig {
+            q_enable,
+            q_first,
+            q_round,
+            q_absorb,
+            q_round_last,
+            q_input,
+            q_input_last,
+            keccak_table,
+            cell_manager,
+            round_cst,
+            normalize_3,
+            normalize_4,
+            normalize_6,
+            chi_base_table,
+            pack_table,
+            parameters,
+            _marker: PhantomData,
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct KeccakAssignedRow<'v, F: Field> { // assigned cells of one row, as built by `set_row`
+    pub is_final: KeccakAssignedValue<'v, F>, // cell in the `keccak_table.is_enabled` column
+    pub hash_lo: KeccakAssignedValue<'v, F>, // cell in the `keccak_table.output.lo()` column
+    pub hash_hi: KeccakAssignedValue<'v, F>, // cell in the `keccak_table.output.hi()` column
+    pub bytes_left: KeccakAssignedValue<'v, F>, // cell in the `keccak_table.bytes_left` column
+    pub word_value: KeccakAssignedValue<'v, F>, // cell in the `keccak_table.word_value` column
+}
+
+impl<F: Field> KeccakCircuitConfig<F> {
+    /// Runs `set_row` on every `witness` row, using the row's index as its offset; returns one `KeccakAssignedRow` per row
+    pub fn assign<'v>(
+        &self,
+        region: &mut Region<F>,
+        witness: &[KeccakRow<F>],
+    ) -> Vec<KeccakAssignedRow<'v, F>> {
+        witness
+            .iter()
+            .enumerate()
+            .map(|(offset, keccak_row)| self.set_row(region, offset, keccak_row))
+            .collect()
+    }
+
+    /// Assigns `row` at `offset`; output holds the `is_final`, `hash_lo`, `hash_hi`, `bytes_left`, `word_value` cells
+    pub fn set_row<'v>(
+        &self,
+        region: &mut Region<F>,
+        offset: usize,
+        row: &KeccakRow<F>,
+    ) -> KeccakAssignedRow<'v, F> {
+        // Fixed selectors: `q_first` is `offset == 0`, all others are copied from `row`
+        for (_, column, value) in &[
+            ("q_enable", self.q_enable, F::from(row.q_enable)),
+            ("q_first", self.q_first, F::from(offset == 0)),
+            ("q_round", self.q_round, F::from(row.q_round)),
+            ("q_round_last", self.q_round_last, F::from(row.q_round_last)),
+            ("q_absorb", self.q_absorb, F::from(row.q_absorb)),
+            ("q_input", self.q_input, F::from(row.q_input)),
+            ("q_input_last", self.q_input_last, F::from(row.q_input_last)),
+        ] {
+            raw_assign_fixed(region, *column, offset, *value);
+        }
+
+        // Keccak data; note that `is_final` is written to the table's `is_enabled` column
+        let [is_final, hash_lo, hash_hi, bytes_left, word_value] = [
+            ("is_final", self.keccak_table.is_enabled, Value::known(F::from(row.is_final))),
+            ("hash_lo", self.keccak_table.output.lo(), row.hash.lo()),
+            ("hash_hi", self.keccak_table.output.hi(), row.hash.hi()),
+            ("bytes_left", self.keccak_table.bytes_left, Value::known(row.bytes_left)),
+            ("word_value", self.keccak_table.word_value, Value::known(row.word_value)),
+        ]
+        .map(|(_name, column, value)| raw_assign_advice(region, column, offset, value));
+
+        // Cell values, zipped positionally with the cell manager's columns
+        row.cell_values.iter().zip(self.cell_manager.columns()).for_each(|(bit, column)| {
+            raw_assign_advice(region, column.advice, offset, Value::known(*bit));
+        });
+
+        // Round constant, written to the fixed `round_cst` column
+        raw_assign_fixed(region, self.round_cst, offset, row.round_cst);
+
+        KeccakAssignedRow { is_final, hash_lo, hash_hi, bytes_left, word_value }
+    }
+
+    pub fn load_aux_tables(&self, layouter: &mut impl Layouter<F>, k: u32) -> Result<(), Error> { // loads the normalize, chi base and pack tables
+        load_normalize_table(layouter, "normalize_6", &self.normalize_6, 6u64, k)?; // `?` returns the first loader error
+        load_normalize_table(layouter, "normalize_4", &self.normalize_4, 4u64, k)?;
+        load_normalize_table(layouter, "normalize_3", &self.normalize_3, 3u64, k)?;
+        load_lookup_table(
+            layouter,
+            "chi base",
+            &self.chi_base_table,
+            get_num_bits_per_base_chi_lookup(k),
+            &CHI_BASE_LOOKUP_TABLE,
+        )?;
+        load_pack_table(layouter, &self.pack_table) // the last loader's result is the return value
+    }
+}
diff --git a/hashes/zkevm/src/keccak/param.rs b/hashes/zkevm/src/keccak/vanilla/param.rs
similarity index 98%
rename from hashes/zkevm/src/keccak/param.rs
rename to hashes/zkevm/src/keccak/vanilla/param.rs
index 159b7e52..abecd264 100644
--- a/hashes/zkevm/src/keccak/param.rs
+++ b/hashes/zkevm/src/keccak/vanilla/param.rs
@@ -1,5 +1,5 @@
 #![allow(dead_code)]
-pub(crate) const MAX_DEGREE: usize = 4;
+pub(crate) const MAX_DEGREE: usize = 3;
 pub(crate) const ABSORB_LOOKUP_RANGE: usize = 3;
 pub(crate) const THETA_C_LOOKUP_RANGE: usize = 6;
 pub(crate) const RHO_PI_LOOKUP_RANGE: usize = 4;
diff --git a/hashes/zkevm/src/keccak/table.rs b/hashes/zkevm/src/keccak/vanilla/table.rs
similarity index 100%
rename from hashes/zkevm/src/keccak/table.rs
rename to hashes/zkevm/src/keccak/vanilla/table.rs
diff --git a/hashes/zkevm/src/keccak/tests.rs b/hashes/zkevm/src/keccak/vanilla/tests.rs
similarity index 93%
rename from hashes/zkevm/src/keccak/tests.rs
rename to hashes/zkevm/src/keccak/vanilla/tests.rs
index 211d91c1..7d0089d1 100644
--- a/hashes/zkevm/src/keccak/tests.rs
+++ b/hashes/zkevm/src/keccak/vanilla/tests.rs
@@ -1,4 +1,4 @@
-use super::*;
+use super::{witness::*, *};
 use crate::halo2_proofs::{
     circuit::SimpleFloorPlanner,
     dev::MockProver,
@@ -212,15 +212,28 @@ fn extract_u128<F: Field>(assigned_value: KeccakAssignedValue<F>) -> u128 {
 #[test_case(12, 5; "k: 12, rows_per_round: 5")]
 fn packed_multi_keccak_simple(k: u32, rows_per_round: usize) {
     let _ = env_logger::builder().is_test(true).try_init();
-
-    let inputs = vec![
-        vec![],
-        (0u8..1).collect::<Vec<_>>(),
-        (0u8..135).collect::<Vec<_>>(),
-        (0u8..136).collect::<Vec<_>>(),
-        (0u8..200).collect::<Vec<_>>(),
-    ];
-    verify::<Fr>(KeccakConfigParams { k, rows_per_round }, inputs, true);
+    {
+        // First input is empty.
+        let inputs = vec![
+            vec![],
+            (0u8..1).collect::<Vec<_>>(),
+            (0u8..135).collect::<Vec<_>>(),
+            (0u8..136).collect::<Vec<_>>(),
+            (0u8..200).collect::<Vec<_>>(),
+        ];
+        verify::<Fr>(KeccakConfigParams { k, rows_per_round }, inputs, true);
+    }
+    {
+        // First input is not empty.
+        let inputs = vec![
+            (0u8..200).collect::<Vec<_>>(),
+            vec![],
+            (0u8..1).collect::<Vec<_>>(),
+            (0u8..135).collect::<Vec<_>>(),
+            (0u8..136).collect::<Vec<_>>(),
+        ];
+        verify::<Fr>(KeccakConfigParams { k, rows_per_round }, inputs, true);
+    }
 }
 
 #[test_case(14, 25 ; "k: 14, rows_per_round: 25")]
@@ -231,11 +244,11 @@ fn packed_multi_keccak_prover(k: u32, rows_per_round: usize) {
     let params = ParamsKZG::<Bn256>::setup(k, OsRng);
 
     let inputs = vec![
+        (0u8..200).collect::<Vec<_>>(),
         vec![],
         (0u8..1).collect::<Vec<_>>(),
         (0u8..135).collect::<Vec<_>>(),
         (0u8..136).collect::<Vec<_>>(),
-        (0u8..200).collect::<Vec<_>>(),
     ];
     let circuit = KeccakCircuit::new(
         KeccakConfigParams { k, rows_per_round },
diff --git a/hashes/zkevm/src/keccak/util.rs b/hashes/zkevm/src/keccak/vanilla/util.rs
similarity index 100%
rename from hashes/zkevm/src/keccak/util.rs
rename to hashes/zkevm/src/keccak/vanilla/util.rs
diff --git a/hashes/zkevm/src/keccak/vanilla/witness.rs b/hashes/zkevm/src/keccak/vanilla/witness.rs
new file mode 100644
index 00000000..d97d487d
--- /dev/null
+++ b/hashes/zkevm/src/keccak/vanilla/witness.rs
@@ -0,0 +1,418 @@
+// The contents of this file were moved out of mod.rs.
+use super::*;
+
+/// Witness rows and squeeze digests for the keccak hashes of the little-endian inputs in `bytes`.
+pub fn multi_keccak<F: Field>(
+    bytes: &[Vec<u8>],
+    capacity: Option<usize>,
+    parameters: KeccakConfigParams,
+) -> (Vec<KeccakRow<F>>, Vec<[F; NUM_WORDS_TO_SQUEEZE]>) {
+    let num_rows_per_round = parameters.rows_per_round;
+    let mut rows =
+        Vec::with_capacity((1 + capacity.unwrap_or(0) * (NUM_ROUNDS + 1)) * num_rows_per_round);
+    // One dummy round of rows comes first so that the initial data is absorbed.
+    // Their contents don't really matter; `is_final` just needs to be disabled.
+    rows.append(&mut KeccakRow::dummy_rows(num_rows_per_round));
+    // Actual keccaks: each input is witnessed on its own via `par_iter`, then concatenated below
+    let artifacts = bytes
+        .par_iter()
+        .map(|bytes| {
+            let num_keccak_f = get_num_keccak_f(bytes.len());
+            let mut squeeze_digests = Vec::with_capacity(num_keccak_f);
+            let mut rows = Vec::with_capacity(num_keccak_f * (NUM_ROUNDS + 1) * num_rows_per_round);
+            keccak(&mut rows, &mut squeeze_digests, bytes, parameters);
+            (rows, squeeze_digests)
+        })
+        .collect::<Vec<_>>();
+
+    let mut squeeze_digests = Vec::with_capacity(capacity.unwrap_or(0));
+    for (rows_part, squeezes) in artifacts {
+        rows.extend(rows_part);
+        squeeze_digests.extend(squeezes);
+    }
+
+    if let Some(capacity) = capacity {
+        // Pad with empty-input hashes up to `capacity` blocks of NUM_ROUNDS + 1 rounds (plus dummy)
+        while rows.len() < (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round {
+            keccak(&mut rows, &mut squeeze_digests, &[], parameters);
+        }
+        // Over capacity means the inputs produced more rows than `capacity` allows: abort
+        if rows.len() > (1 + capacity * (NUM_ROUNDS + 1)) * num_rows_per_round {
+            panic!("{:?}", Error::BoundsFailure);
+        }
+    }
+    (rows, squeeze_digests)
+}
+/// Witness generation for the keccak hash of little-endian `bytes`, appended to `rows` and `squeeze_digests`.
+fn keccak<F: Field>(
+    rows: &mut Vec<KeccakRow<F>>,
+    squeeze_digests: &mut Vec<[F; NUM_WORDS_TO_SQUEEZE]>,
+    bytes: &[u8],
+    parameters: KeccakConfigParams,
+) {
+    let k = parameters.k;
+    let num_rows_per_round = parameters.rows_per_round;
+
+    let mut bits = into_bits(bytes);
+    let mut s = [[F::ZERO; 5]; 5];
+    let absorb_positions = get_absorb_positions();
+    let num_bytes_in_last_block = bytes.len() % RATE;
+    let two = F::from(2u64);
+
+    // Padding: a 1 bit, zeros, then a final 1 bit so the total is a multiple of RATE_IN_BITS
+    bits.push(1);
+    while (bits.len() + 1) % RATE_IN_BITS != 0 {
+        bits.push(0);
+    }
+    bits.push(1);
+
+    // Running length of absorbed input in bytes (checked against `bytes.len()` in debug builds)
+    let mut length = 0;
+    let chunks = bits.chunks(RATE_IN_BITS);
+    let num_chunks = chunks.len();
+
+    let mut cell_managers = Vec::with_capacity(NUM_ROUNDS + 1);
+    let mut regions = Vec::with_capacity(NUM_ROUNDS + 1);
+    // keeps track of running lengths over all rounds in an absorb step
+    let mut round_lengths = Vec::with_capacity(NUM_ROUNDS + 1);
+    let mut hash_words = [F::ZERO; NUM_WORDS_TO_SQUEEZE];
+    let mut hash = Word::default();
+
+    for (idx, chunk) in chunks.enumerate() {
+        let is_final_block = idx == num_chunks - 1;
+
+        let mut absorb_rows = Vec::new();
+        // Absorb: fold each 64-bit word of the chunk into the state via `field_xor`
+        for (idx, &(i, j)) in absorb_positions.iter().enumerate() {
+            let absorb = pack(&chunk[idx * 64..(idx + 1) * 64]);
+            let from = s[i][j];
+            s[i][j] = field_xor(s[i][j], absorb);
+            absorb_rows.push(AbsorbData { from, absorb, result: s[i][j] });
+        }
+
+        // Reuse the Vecs allocated for the previous chunk instead of reallocating
+        cell_managers.clear();
+        regions.clear();
+        round_lengths.clear();
+
+        for round in 0..NUM_ROUNDS + 1 {
+            let mut cell_manager = CellManager::new(num_rows_per_round);
+            let mut region = KeccakRegion::new();
+
+            let mut absorb_row = AbsorbData::default();
+            if round < NUM_WORDS_TO_ABSORB {
+                absorb_row = absorb_rows[round].clone();
+            }
+
+            // State data
+            for s in &s {
+                for s in s {
+                    let cell = cell_manager.query_cell_value();
+                    cell.assign(&mut region, 0, *s);
+                }
+            }
+
+            // Absorb data
+            let absorb_from = cell_manager.query_cell_value();
+            let absorb_data = cell_manager.query_cell_value();
+            let absorb_result = cell_manager.query_cell_value();
+            absorb_from.assign(&mut region, 0, absorb_row.from);
+            absorb_data.assign(&mut region, 0, absorb_row.absorb);
+            absorb_result.assign(&mut region, 0, absorb_row.result);
+
+            // Absorb
+            cell_manager.start_region();
+            let part_size = get_num_bits_per_absorb_lookup(k);
+            let input = absorb_row.from + absorb_row.absorb;
+            let absorb_fat =
+                split::value(&mut cell_manager, &mut region, input, 0, part_size, false, None);
+            cell_manager.start_region();
+            let _absorb_result = transform::value(
+                &mut cell_manager,
+                &mut region,
+                absorb_fat.clone(),
+                true,
+                |v| v & 1,
+                true,
+            );
+
+            // Padding
+            cell_manager.start_region();
+            // Unpack a single word into bytes (for the absorption)
+            // Potential optimization: could do multiple bytes per lookup
+            let packed =
+                split::value(&mut cell_manager, &mut region, absorb_row.absorb, 0, 8, false, None);
+            cell_manager.start_region();
+            let input_bytes =
+                transform::value(&mut cell_manager, &mut region, packed, false, |v| *v, true);
+            cell_manager.start_region();
+            let is_paddings =
+                input_bytes.iter().map(|_| cell_manager.query_cell_value()).collect::<Vec<_>>();
+            debug_assert_eq!(is_paddings.len(), NUM_BYTES_PER_WORD);
+            if round < NUM_WORDS_TO_ABSORB {
+                for (padding_idx, is_padding) in is_paddings.iter().enumerate() {
+                    let byte_idx = round * NUM_BYTES_PER_WORD + padding_idx;
+                    let padding = if is_final_block && byte_idx >= num_bytes_in_last_block {
+                        true
+                    } else {
+                        length += 1;
+                        false
+                    };
+                    is_padding.assign(&mut region, 0, F::from(padding));
+                }
+            }
+            cell_manager.start_region();
+
+            if round != NUM_ROUNDS {
+                // Theta
+                let part_size = get_num_bits_per_theta_c_lookup(k);
+                let mut bcf = Vec::new();
+                for s in &s {
+                    let c = s[0] + s[1] + s[2] + s[3] + s[4];
+                    let bc_fat =
+                        split::value(&mut cell_manager, &mut region, c, 1, part_size, false, None);
+                    bcf.push(bc_fat);
+                }
+                cell_manager.start_region();
+                let mut bc = Vec::new();
+                for bc_fat in bcf {
+                    let bc_norm = transform::value(
+                        &mut cell_manager,
+                        &mut region,
+                        bc_fat,
+                        true,
+                        |v| v & 1,
+                        true,
+                    );
+                    bc.push(bc_norm);
+                }
+                cell_manager.start_region();
+                let mut os = [[F::ZERO; 5]; 5];
+                for i in 0..5 {
+                    let t = decode::value(bc[(i + 4) % 5].clone())
+                        + decode::value(rotate(bc[(i + 1) % 5].clone(), 1, part_size));
+                    for j in 0..5 {
+                        os[i][j] = s[i][j] + t;
+                    }
+                }
+                s = os;
+                cell_manager.start_region();
+
+                // Rho/Pi
+                let part_size = get_num_bits_per_base_chi_lookup(k);
+                let target_word_sizes = target_part_sizes(part_size);
+                let num_word_parts = target_word_sizes.len();
+                let mut rho_pi_chi_cells: [[[Vec<Cell<F>>; 5]; 5]; 3] =
+                    array_init::array_init(|_| {
+                        array_init::array_init(|_| array_init::array_init(|_| Vec::new()))
+                    });
+                let mut column_starts = [0usize; 3];
+                for p in 0..3 {
+                    column_starts[p] = cell_manager.start_region();
+                    let mut row_idx = 0;
+                    for j in 0..5 {
+                        for _ in 0..num_word_parts {
+                            for i in 0..5 {
+                                rho_pi_chi_cells[p][i][j]
+                                    .push(cell_manager.query_cell_value_at_row(row_idx as i32));
+                            }
+                            row_idx = (row_idx + 1) % num_rows_per_round;
+                        }
+                    }
+                }
+                cell_manager.start_region();
+                let mut os_parts: [[Vec<PartValue<F>>; 5]; 5] =
+                    array_init::array_init(|_| array_init::array_init(|_| Vec::new()));
+                for (j, os_part) in os_parts.iter_mut().enumerate() {
+                    for i in 0..5 {
+                        let s_parts = split_uniform::value(
+                            &rho_pi_chi_cells[0][j][(2 * i + 3 * j) % 5],
+                            &mut cell_manager,
+                            &mut region,
+                            s[i][j],
+                            RHO_MATRIX[i][j],
+                            part_size,
+                            true,
+                        );
+
+                        let s_parts = transform_to::value(
+                            &rho_pi_chi_cells[1][j][(2 * i + 3 * j) % 5],
+                            &mut region,
+                            s_parts,
+                            true,
+                            |v| v & 1,
+                        );
+                        os_part[(2 * i + 3 * j) % 5] = s_parts;
+                    }
+                }
+                cell_manager.start_region();
+
+                // Chi
+                let part_size_base = get_num_bits_per_base_chi_lookup(k);
+                let three_packed = pack::<F>(&vec![3u8; part_size_base]);
+                let mut os = [[F::ZERO; 5]; 5];
+                for j in 0..5 {
+                    for i in 0..5 {
+                        let mut s_parts = Vec::new();
+                        for ((part_a, part_b), part_c) in os_parts[i][j]
+                            .iter()
+                            .zip(os_parts[(i + 1) % 5][j].iter())
+                            .zip(os_parts[(i + 2) % 5][j].iter())
+                        {
+                            let value =
+                                three_packed - two * part_a.value + part_b.value - part_c.value;
+                            s_parts.push(PartValue {
+                                num_bits: part_size_base,
+                                rot: j as i32,
+                                value,
+                            });
+                        }
+                        os[i][j] = decode::value(transform_to::value(
+                            &rho_pi_chi_cells[2][i][j],
+                            &mut region,
+                            s_parts,
+                            true,
+                            |v| CHI_BASE_LOOKUP_TABLE[*v as usize],
+                        ));
+                    }
+                }
+                s = os;
+                cell_manager.start_region();
+
+                // iota
+                let part_size = get_num_bits_per_absorb_lookup(k);
+                let input = s[0][0] + pack_u64::<F>(ROUND_CST[round]);
+                let iota_parts = split::value::<F>(
+                    &mut cell_manager,
+                    &mut region,
+                    input,
+                    0,
+                    part_size,
+                    false,
+                    None,
+                );
+                cell_manager.start_region();
+                s[0][0] = decode::value(transform::value(
+                    &mut cell_manager,
+                    &mut region,
+                    iota_parts,
+                    true,
+                    |v| v & 1,
+                    true,
+                ));
+            }
+
+            // Assign the hash result
+            let is_final = is_final_block && round == NUM_ROUNDS;
+            hash = if is_final {
+                let hash_bytes_le = s
+                    .into_iter()
+                    .take(4)
+                    .flat_map(|a| to_bytes::value(&unpack(a[0])))
+                    .rev()
+                    .collect::<Vec<_>>();
+
+                let word: Word<Value<F>> =
+                    Word::from(eth_types::Word::from_little_endian(hash_bytes_le.as_slice()))
+                        .map(Value::known);
+                word
+            } else {
+                Word::default().into_value()
+            };
+
+            // The words to squeeze out: this is the hash digest as words with
+            // NUM_BYTES_PER_WORD (=8) bytes each
+            for (hash_word, a) in hash_words.iter_mut().zip(s.iter()) {
+                *hash_word = a[0];
+            }
+
+            round_lengths.push(length);
+
+            cell_managers.push(cell_manager);
+            regions.push(region);
+        }
+
+        // Now that we know the state at the end of the rounds, set the squeeze data
+        let num_rounds = cell_managers.len();
+        for (idx, word) in hash_words.iter().enumerate() {
+            let cell_manager = &mut cell_managers[num_rounds - 2 - idx];
+            let region = &mut regions[num_rounds - 2 - idx];
+
+            cell_manager.start_region();
+            let squeeze_packed = cell_manager.query_cell_value();
+            squeeze_packed.assign(region, 0, *word);
+
+            cell_manager.start_region();
+            let packed = split::value(cell_manager, region, *word, 0, 8, false, None);
+            cell_manager.start_region();
+            transform::value(cell_manager, region, packed, false, |v| *v, true);
+        }
+        squeeze_digests.push(hash_words);
+
+        for round in 0..NUM_ROUNDS + 1 {
+            let round_cst = pack_u64(ROUND_CST[round]);
+
+            for row_idx in 0..num_rows_per_round {
+                let word_value = if round < NUM_WORDS_TO_ABSORB && row_idx == 0 {
+                    let byte_idx = (idx * NUM_WORDS_TO_ABSORB + round) * NUM_BYTES_PER_WORD;
+                    if byte_idx >= bytes.len() {
+                        0
+                    } else {
+                        let end = std::cmp::min(byte_idx + NUM_BYTES_PER_WORD, bytes.len());
+                        let mut word_bytes = bytes[byte_idx..end].to_vec();
+                        word_bytes.resize(NUM_BYTES_PER_WORD, 0);
+                        u64::from_le_bytes(word_bytes.try_into().unwrap())
+                    }
+                } else {
+                    0
+                };
+                let byte_idx = if round < NUM_WORDS_TO_ABSORB {
+                    round * NUM_BYTES_PER_WORD + std::cmp::min(row_idx, NUM_BYTES_PER_WORD - 1)
+                } else {
+                    NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD
+                } + idx * NUM_WORDS_TO_ABSORB * NUM_BYTES_PER_WORD;
+                let bytes_left = if byte_idx >= bytes.len() { 0 } else { bytes.len() - byte_idx };
+                rows.push(KeccakRow {
+                    q_enable: row_idx == 0,
+                    q_round: row_idx == 0 && round < NUM_ROUNDS,
+                    q_absorb: row_idx == 0 && round == NUM_ROUNDS,
+                    q_round_last: row_idx == 0 && round == NUM_ROUNDS,
+                    q_input: row_idx == 0 && round < NUM_WORDS_TO_ABSORB,
+                    q_input_last: row_idx == 0 && round == NUM_WORDS_TO_ABSORB - 1,
+                    round_cst,
+                    is_final: is_final_block && round == NUM_ROUNDS && row_idx == 0,
+                    cell_values: regions[round].rows.get(row_idx).unwrap_or(&vec![]).clone(),
+                    hash,
+                    bytes_left: F::from_u128(bytes_left as u128),
+                    word_value: F::from_u128(word_value as u128),
+                });
+                #[cfg(debug_assertions)]
+                {
+                    let mut r = rows.last().unwrap().clone();
+                    r.cell_values.clear();
+                    log::trace!("offset {:?} row idx {} row {:?}", rows.len() - 1, row_idx, r);
+                }
+            }
+            log::trace!(" = = = = = = round {} end", round);
+        }
+        log::trace!(" ====================== chunk {} end", idx);
+    }
+
+    #[cfg(debug_assertions)]
+    {
+        let hash_bytes = s
+            .into_iter()
+            .take(4)
+            .map(|a| {
+                pack_with_base::<F>(&unpack(a[0]), 2)
+                    .to_bytes_le()
+                    .into_iter()
+                    .take(8)
+                    .collect::<Vec<_>>()
+                    .to_vec()
+            })
+            .collect::<Vec<_>>();
+        debug!("hash: {:x?}", &(hash_bytes[0..4].concat()));
+        assert_eq!(length, bytes.len());
+    }
+}
diff --git a/hashes/zkevm/src/lib.rs b/hashes/zkevm/src/lib.rs
index c1ed5026..272e4bf8 100644
--- a/hashes/zkevm/src/lib.rs
+++ b/hashes/zkevm/src/lib.rs
@@ -7,5 +7,3 @@ use halo2_base::halo2_proofs;
 pub mod keccak;
 /// Util
 pub mod util;
-
-pub use keccak::KeccakCircuitConfig as KeccakConfig;