diff --git a/benchmarks/benches/regex_execute.rs b/benchmarks/benches/regex_execute.rs
index b6ae4dcd8d..0ffd9ec2da 100644
--- a/benchmarks/benches/regex_execute.rs
+++ b/benchmarks/benches/regex_execute.rs
@@ -1,4 +1,4 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use openvm_benchmarks::utils::build_bench_program;
 use openvm_circuit::arch::{instructions::exe::VmExe, VmExecutor};
 use openvm_keccak256_circuit::Keccak256Rv32Config;
@@ -33,7 +33,7 @@ fn benchmark_function(c: &mut Criterion) {
     group.bench_function("execute", |b| {
         b.iter(|| {
             executor
-                .execute(exe.clone(), StdIn::from_bytes(&fe_bytes))
+                .execute(exe.clone(), black_box(StdIn::from_bytes(&fe_bytes)))
                 .unwrap();
         })
     });
diff --git a/benchmarks/examples/regex_execute.rs b/benchmarks/examples/regex_execute.rs
index 802dc800d5..ea59f0c7a1 100644
--- a/benchmarks/examples/regex_execute.rs
+++ b/benchmarks/examples/regex_execute.rs
@@ -27,7 +27,9 @@ fn main() {
 
     let data = include_str!("../programs/regex/regex_email.txt");
 
+    let timer = std::time::Instant::now();
     executor
         .execute(exe.clone(), StdIn::from_bytes(data.as_bytes()))
         .unwrap();
+    println!("execute_time: {:?}", timer.elapsed());
 }
diff --git a/crates/prof/src/aggregate.rs b/crates/prof/src/aggregate.rs
index 886c69bdcd..f0234a1c40 100644
--- a/crates/prof/src/aggregate.rs
+++ b/crates/prof/src/aggregate.rs
@@ -232,12 +232,12 @@ impl AggregateMetrics {
             }
             if !group_name.contains("keygen") {
                 // Proving time in keygen group is dummy and not part of total.
-                total_proof_time.val += stats.sum.val;
-                *total_proof_time.diff.as_mut().unwrap() += stats.sum.diff.unwrap_or(0.0);
-                total_par_proof_time.val += stats.max.val;
-                *total_par_proof_time.diff.as_mut().unwrap() += stats.max.diff.unwrap_or(0.0);
+                total_proof_time.val += sum.val;
+                *total_proof_time.diff.as_mut().unwrap() += sum.diff.unwrap_or(0.0);
+                total_par_proof_time.val += max.val;
+                *total_par_proof_time.diff.as_mut().unwrap() += max.diff.unwrap_or(0.0);
             }
-            rows.push((group_name, stats.sum, stats.max));
+            rows.push((group_name, sum, max));
         }
         writeln!(
             writer,
diff --git a/crates/vm/src/metrics/mod.rs b/crates/vm/src/metrics/mod.rs
index fd9b4542e8..c2d32e5011 100644
--- a/crates/vm/src/metrics/mod.rs
+++ b/crates/vm/src/metrics/mod.rs
@@ -14,6 +14,7 @@ pub mod cycle_tracker;
 
 #[derive(Clone, Debug, Default)]
 pub struct VmMetrics {
+    pub cycle_count: usize,
     pub chip_heights: Vec<(String, usize)>,
     /// Maps (dsl_ir, opcode) to number of times opcode was executed
     pub counts: BTreeMap<(Option<String>, String), usize>,
@@ -42,7 +43,7 @@ where
         opcode: VmOpcode,
         dsl_instr: Option<String>,
     ) {
-        counter!("total_cycles").increment(1u64);
+        self.metrics.cycle_count += 1;
 
         if self.system_config().profiling {
             let executor = self.chip_complex.inventory.get_executor(opcode).unwrap();
@@ -60,6 +61,7 @@ where
     }
 
     pub fn finalize_metrics(&mut self) {
+        counter!("total_cycles").absolute(self.metrics.cycle_count as u64);
         counter!("main_cells_used")
             .absolute(self.current_trace_cells().into_iter().sum::<usize>() as u64);