diff --git a/CHANGELOG.md b/CHANGELOG.md
index d378b2068..6b26526d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 - Introducing an example for proving knowledge of exponent
 - Add api to get SRS size.
+- Adding micro benchmarks for MSM, FFT and Poly Evaluation.
 
 ### Improvements
 
diff --git a/README.md b/README.md
index 24880fc7a..e15fff082 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ The additional flags allow using assembly implementation of `square_in_place` an
 For benchmark, run:
 
 ```
-RAYON_NUM_THREADS=N cargo bench
+RAYON_NUM_THREADS=N cargo bench --features bench
 ```
 
 where N is the number of threads you want to use (N = 1 for single-thread).
diff --git a/plonk/Cargo.toml b/plonk/Cargo.toml
index 018ee5267..f0ae37267 100644
--- a/plonk/Cargo.toml
+++ b/plonk/Cargo.toml
@@ -51,6 +51,8 @@ path = "benches/bench.rs"
 harness = false
 
 [features]
-std = []
+std = [ ]
 # exposing apis for testing purpose
 test_apis = []
+# enabling microbenchmarks
+bench = []
\ No newline at end of file
diff --git a/plonk/benches/bench.rs b/plonk/benches/bench.rs
index 19004b3fe..ecee43343 100644
--- a/plonk/benches/bench.rs
+++ b/plonk/benches/bench.rs
@@ -5,7 +5,7 @@
 // along with the Jellyfish library. If not, see <https://mit-license.org/>.
 
 // For benchmark, run:
-//     RAYON_NUM_THREADS=N cargo bench
+//     RAYON_NUM_THREADS=N cargo bench --features bench
 // where N is the number of threads you want to use (N = 1 for single-thread).
 
 use ark_bls12_377::{Bls12_377, Fr as Fr377};
@@ -13,7 +13,9 @@ use ark_bls12_381::{Bls12_381, Fr as Fr381};
 use ark_bn254::{Bn254, Fr as Fr254};
 use ark_bw6_761::{Fr as Fr761, BW6_761};
 use ark_ff::PrimeField;
+use ark_std::{fs::File, io::Write};
 use jf_plonk::{
+    bencher::{init_timers, total_fft_time, total_msm_time, total_poly_eval_time},
     circuit::{Circuit, PlonkCircuit},
     errors::PlonkError,
     proof_system::{PlonkKzgSnark, Snark},
@@ -54,6 +56,7 @@ macro_rules! plonk_prove_bench {
 
         let (pk, _) = PlonkKzgSnark::<$bench_curve>::preprocess(&srs, &cs).unwrap();
 
+        init_timers();
         let start = ark_std::time::Instant::now();
 
         for _ in 0..NUM_REPETITIONS {
@@ -62,13 +65,98 @@ macro_rules! plonk_prove_bench {
             )
             .unwrap();
         }
+        println!("=====================================");
+        println!(
+            "proving time for {}, {} with dim {}: {} ns/gate",
+            stringify!($bench_curve),
+            stringify!($bench_plonk_type),
+            $num_gates,
+            start.elapsed().as_nanos() / NUM_REPETITIONS as u128 / $num_gates as u128
+        );
+        println!(
+            "total proving time: {:.2} ms",
+            start.elapsed().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64
+        );
+        println!(
+            "time spend on FFT:  {:.2} ms, or {:.2}%",
+            total_fft_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_fft_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on MSM:  {:.2} ms, or {:.2}%",
+            total_msm_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_msm_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on poly evaluation: {:.2} ms, or {:.2}%",
+            total_poly_eval_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_poly_eval_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!("=====================================");
+    };
+}
 
+macro_rules! plonk_prove_mt_bench {
+    ($bench_curve:ty, $bench_field:ty, $bench_plonk_type:expr, $num_gates:expr, $file:expr) => {
+        let rng = &mut ark_std::test_rng();
+        let cs = gen_circuit_for_bench::<$bench_field>($num_gates, $bench_plonk_type).unwrap();
+
+        let max_degree = $num_gates + 2;
+        let srs = PlonkKzgSnark::<$bench_curve>::universal_setup(max_degree, rng).unwrap();
+
+        let (pk, _) = PlonkKzgSnark::<$bench_curve>::preprocess(&srs, &cs).unwrap();
+
+        init_timers();
+        let start = ark_std::time::Instant::now();
+
+        for _ in 0..NUM_REPETITIONS {
+            let _ = PlonkKzgSnark::<$bench_curve>::prove::<_, _, StandardTranscript>(
+                rng, &cs, &pk, None,
+            )
+            .unwrap();
+        }
+        println!("=====================================");
         println!(
-            "proving time for {}, {}: {} ns/gate",
+            "proving time for {}, {} with dim {}: {} ns/gate",
             stringify!($bench_curve),
             stringify!($bench_plonk_type),
+            $num_gates,
             start.elapsed().as_nanos() / NUM_REPETITIONS as u128 / $num_gates as u128
         );
+        println!(
+            "total proving time: {:.2} ms",
+            start.elapsed().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64
+        );
+        println!(
+            "time spend on FFT:  {:.2} ms, or {:.2}%",
+            total_fft_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_fft_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on MSM:  {:.2} ms, or {:.2}%",
+            total_msm_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_msm_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on poly evaluation: {:.2} ms, or {:.2}%",
+            total_poly_eval_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_poly_eval_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!("=====================================");
+        $file
+            .write_all(
+                format!(
+                    "{}     {:.2}   {:.2}   {:.2}   {:.2}   {:.2}\n",
+                    $num_gates,
+                    start.elapsed().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+                    total_fft_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+                    100f64 * total_fft_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64,
+                    total_msm_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+                    100f64 * total_msm_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64,
+                )
+                .as_ref(),
+            )
+            .expect("Unable to write data");
     };
 }
 
@@ -97,6 +185,7 @@ macro_rules! plonk_verify_bench {
             PlonkKzgSnark::<$bench_curve>::prove::<_, _, StandardTranscript>(rng, &cs, &pk, None)
                 .unwrap();
 
+        init_timers();
         let start = ark_std::time::Instant::now();
 
         for _ in 0..NUM_REPETITIONS {
@@ -104,13 +193,30 @@ macro_rules! plonk_verify_bench {
                 PlonkKzgSnark::<$bench_curve>::verify::<StandardTranscript>(&vk, &[], &proof, None)
                     .unwrap();
         }
-
+        println!("=====================================");
         println!(
-            "verifying time for {}, {}: {} ns",
+            "verifying time for {}, {} with dim {}: {} ns",
             stringify!($bench_curve),
             stringify!($bench_plonk_type),
+            $num_gates,
             start.elapsed().as_nanos() / NUM_REPETITIONS as u128
         );
+        println!(
+            "total verify time: {:.2} ms",
+            start.elapsed().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64
+        );
+        println!(
+            "time spend on FFT:  {:.2} ms, or {:.2}%",
+            total_fft_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_fft_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on MSM:  {:.2} ms, or {:.2}%",
+            total_msm_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_msm_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+
+        println!("=====================================");
     };
 }
 
@@ -144,6 +250,7 @@ macro_rules! plonk_batch_verify_bench {
         let public_inputs_ref = vec![&pub_input[..]; $num_proofs];
         let proofs_ref = vec![&proof; $num_proofs];
 
+        init_timers();
         let start = ark_std::time::Instant::now();
 
         for _ in 0..NUM_REPETITIONS {
@@ -163,6 +270,21 @@ macro_rules! plonk_batch_verify_bench {
             stringify!($num_proofs),
             start.elapsed().as_nanos() / NUM_REPETITIONS as u128 / $num_proofs as u128
         );
+
+        println!(
+            "total batch verify time: {:.2} ms",
+            start.elapsed().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64
+        );
+        println!(
+            "time spend on FFT:  {:.2} ms, or {:.2}%",
+            total_fft_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_fft_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
+        println!(
+            "time spend on MSM:  {:.2} ms, or {:.2}%",
+            total_msm_time().as_nanos() as f64 / NUM_REPETITIONS as f64 / 1_000_000f64,
+            100f64 * total_msm_time().as_nanos() as f64 / start.elapsed().as_nanos() as f64
+        );
     };
 }
 
@@ -177,7 +299,28 @@ fn bench_batch_verify() {
     plonk_batch_verify_bench!(BW6_761, Fr761, PlonkType::UltraPlonk, 1000);
 }
 
+fn bench_intense() {
+    let mut f = File::create(format!(
+        "../target/{}-threads.txt",
+        rayon::current_num_threads()
+    ))
+    .expect("Unable to create file");
+
+    for i in 10..=30 {
+        let dim = 1 << i;
+        println!("bench with log(dim) =  {}", i);
+        plonk_prove_mt_bench!(Bls12_377, Fr377, PlonkType::TurboPlonk, dim, f);
+    }
+
+    for i in 10..=30 {
+        let dim = 1 << i;
+        println!("bench with log(dim) =  {}", i);
+        plonk_verify_bench!(Bls12_377, Fr377, PlonkType::TurboPlonk, dim);
+    }
+}
+
 fn main() {
+    bench_intense();
     bench_prove();
     bench_verify();
     bench_batch_verify();
diff --git a/plonk/src/bencher.rs b/plonk/src/bencher.rs
new file mode 100644
index 000000000..46db6c165
--- /dev/null
+++ b/plonk/src/bencher.rs
@@ -0,0 +1,184 @@
+//! Helper functions for micro-benchmarks
+
+use ark_std::{thread_local, time::Instant};
+use core::{cell::RefCell, time::Duration};
+
+thread_local!(static FFT_START_TIME: RefCell<Instant> = RefCell::new(Instant::now()));
+thread_local!(static FFT_TIMER_LOCK: RefCell<bool> = RefCell::new(false));
+thread_local!(static FFT_TOTAL_TIME: RefCell<Duration> = RefCell::new(Duration::ZERO));
+
+thread_local!(static MSM_START_TIME: RefCell<Instant> = RefCell::new(Instant::now()));
+thread_local!(static MSM_TIMER_LOCK: RefCell<bool> = RefCell::new(false));
+thread_local!(static MSM_TOTAL_TIME: RefCell<Duration> = RefCell::new(Duration::ZERO));
+
+thread_local!(static POLY_EVAL_START_TIME: RefCell<Instant> = RefCell::new(Instant::now()));
+thread_local!(static POLY_EVAL_TIMER_LOCK: RefCell<bool> = RefCell::new(false));
+thread_local!(static POLY_EVAL_TOTAL_TIME: RefCell<Duration> = RefCell::new(Duration::ZERO));
+
+/// Zero this thread's FFT/MSM/poly-eval totals and clear their locks (no-op without `bench`).
+#[inline]
+pub fn init_timers() {
+    #[cfg(feature = "bench")]
+    {
+        FFT_TOTAL_TIME.with(|timer| {
+            *timer.borrow_mut() = Duration::ZERO;
+        });
+        FFT_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        });
+        MSM_TOTAL_TIME.with(|timer| {
+            *timer.borrow_mut() = Duration::ZERO;
+        });
+        MSM_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        });
+        POLY_EVAL_TOTAL_TIME.with(|timer| {
+            *timer.borrow_mut() = Duration::ZERO;
+        });
+        POLY_EVAL_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        });
+    }
+}
+
+/// Get the total time spent on FFT-related computations (always zero without `bench`)
+#[inline]
+pub fn total_fft_time() -> Duration {
+    #[cfg(feature = "bench")]
+    {
+        FFT_TOTAL_TIME.with(|duration| *duration.borrow())
+    }
+    #[cfg(not(feature = "bench"))]
+    Duration::ZERO
+}
+
+/// Get the total time spent on MSM-related computations (always zero without `bench`)
+#[inline]
+pub fn total_msm_time() -> Duration {
+    #[cfg(feature = "bench")]
+    {
+        MSM_TOTAL_TIME.with(|duration| *duration.borrow())
+    }
+    #[cfg(not(feature = "bench"))]
+    Duration::ZERO
+}
+
+/// Get the total time spent on polynomial evaluations (always zero without `bench`)
+#[inline]
+pub fn total_poly_eval_time() -> Duration {
+    #[cfg(feature = "bench")]
+    {
+        POLY_EVAL_TOTAL_TIME.with(|duration| *duration.borrow())
+    }
+    #[cfg(not(feature = "bench"))]
+    Duration::ZERO
+}
+
+#[inline]
+pub(crate) fn fft_start() {
+    #[cfg(feature = "bench")]
+    {
+        if FFT_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("another FFT timer has already started somewhere else");
+        }
+
+        FFT_START_TIME.with(|timer| {
+            *timer.borrow_mut() = Instant::now();
+        });
+
+        FFT_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = true;
+        })
+    }
+}
+
+#[inline]
+pub(crate) fn fft_end() {
+    #[cfg(feature = "bench")]
+    {
+        if !FFT_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("FFT timer has not started yet");
+        }
+
+        let start_time = FFT_START_TIME.with(|timer| *timer.borrow());
+        let end_time = Instant::now();
+        FFT_TOTAL_TIME.with(|duration| {
+            *duration.borrow_mut() += end_time - start_time;
+        });
+        FFT_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        })
+    }
+}
+
+#[inline]
+pub(crate) fn msm_start() {
+    #[cfg(feature = "bench")]
+    {
+        if MSM_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("another MSM timer has already started somewhere else");
+        }
+
+        MSM_START_TIME.with(|timer| {
+            *timer.borrow_mut() = Instant::now();
+        });
+
+        MSM_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = true;
+        })
+    }
+}
+
+#[inline]
+pub(crate) fn msm_end() {
+    #[cfg(feature = "bench")]
+    {
+        if !MSM_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("MSM timer has not started yet");
+        }
+        let start_time = MSM_START_TIME.with(|timer| *timer.borrow());
+        let end_time = Instant::now();
+        MSM_TOTAL_TIME.with(|duration| {
+            *duration.borrow_mut() += end_time - start_time;
+        });
+        MSM_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        })
+    }
+}
+
+#[inline]
+pub(crate) fn poly_eval_start() {
+    #[cfg(feature = "bench")]
+    {
+        if POLY_EVAL_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("another poly eval timer has already started somewhere else");
+        }
+
+        POLY_EVAL_START_TIME.with(|timer| {
+            *timer.borrow_mut() = Instant::now();
+        });
+
+        POLY_EVAL_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = true;
+        })
+    }
+}
+
+#[inline]
+pub(crate) fn poly_eval_end() {
+    #[cfg(feature = "bench")]
+    {
+        if !POLY_EVAL_TIMER_LOCK.with(|lock| *lock.borrow()) {
+            panic!("poly eval timer has not started yet");
+        }
+        let start_time = POLY_EVAL_START_TIME.with(|timer| *timer.borrow());
+        let end_time = Instant::now();
+        POLY_EVAL_TOTAL_TIME.with(|duration| {
+            *duration.borrow_mut() += end_time - start_time;
+        });
+        POLY_EVAL_TIMER_LOCK.with(|lock| {
+            *lock.borrow_mut() = false;
+        })
+    }
+}
diff --git a/plonk/src/circuit/basic.rs b/plonk/src/circuit/basic.rs
index 8c8cc3c16..1d8100893 100644
--- a/plonk/src/circuit/basic.rs
+++ b/plonk/src/circuit/basic.rs
@@ -7,6 +7,7 @@
 //! Basic instantiations of Plonk-based constraint systems
 use super::{Arithmetization, Circuit, GateId, Variable, WireId};
 use crate::{
+    bencher::{fft_end, fft_start},
     circuit::{gates::*, SortedLookupVecAndPolys},
     constants::{compute_coset_representatives, GATE_WIDTH, N_MUL_SELECTORS},
     errors::{CircuitError::*, PlonkError},
@@ -1059,6 +1060,8 @@ where
     }
 
     fn compute_selector_polynomials(&self) -> Result<Vec<DensePolynomial<F>>, PlonkError> {
+        fft_start();
+
         self.check_finalize_flag(true)?;
         let domain = &self.eval_domain;
         if domain.size() < self.num_gates() {
@@ -1074,12 +1077,16 @@ where
             .map(|selector| DensePolynomial::from_coefficients_vec(domain.ifft(selector)))
             .collect();
 
+        fft_end();
+
         Ok(selector_polys)
     }
 
     fn compute_extended_permutation_polynomials(
         &self,
     ) -> Result<Vec<DensePolynomial<F>>, PlonkError> {
+        fft_start();
+
         self.check_finalize_flag(true)?;
         let domain = &self.eval_domain;
         let n = domain.size();
@@ -1092,6 +1099,8 @@ where
                 )
             })
             .collect();
+        fft_end();
+
         Ok(extended_perm_polys)
     }
 
@@ -1100,6 +1109,8 @@ where
         beta: &F,
         gamma: &F,
     ) -> Result<DensePolynomial<F>, PlonkError> {
+        fft_start();
+
         self.check_finalize_flag(true)?;
         let mut product_vec = vec![F::one()];
         let domain = &self.eval_domain;
@@ -1119,10 +1130,14 @@ where
             product_vec.push(prev_prod * a / b);
         }
         domain.ifft_in_place(&mut product_vec);
-        Ok(DensePolynomial::from_coefficients_vec(product_vec))
+
+        let res = DensePolynomial::from_coefficients_vec(product_vec);
+        fft_end();
+        Ok(res)
     }
 
     fn compute_wire_polynomials(&self) -> Result<Vec<DensePolynomial<F>>, PlonkError> {
+        fft_start();
         self.check_finalize_flag(true)?;
         let domain = &self.eval_domain;
         if domain.size() < self.num_gates() {
@@ -1145,10 +1160,13 @@ where
             })
             .collect();
         assert_eq!(wire_polys.len(), self.num_wire_types());
+        fft_end();
         Ok(wire_polys)
     }
 
     fn compute_pub_input_polynomial(&self) -> Result<DensePolynomial<F>, PlonkError> {
+        fft_start();
+
         self.check_finalize_flag(true)?;
         let domain = &self.eval_domain;
         let mut pub_input_vec = vec![F::zero(); domain.size()];
@@ -1157,25 +1175,31 @@ where
             pub_input_vec[io_gate_id] = self.witness[var];
         });
         domain.ifft_in_place(&mut pub_input_vec);
-        Ok(DensePolynomial::from_coefficients_vec(pub_input_vec))
+        let res = DensePolynomial::from_coefficients_vec(pub_input_vec);
+        fft_end();
+        Ok(res)
     }
 
     // Plookup-related methods
     //
     fn compute_range_table_polynomial(&self) -> Result<DensePolynomial<F>, PlonkError> {
+        fft_start();
         let range_table = self.compute_range_table()?;
         let domain = &self.eval_domain;
-        Ok(DensePolynomial::from_coefficients_vec(
-            domain.ifft(&range_table),
-        ))
+
+        let res = DensePolynomial::from_coefficients_vec(domain.ifft(&range_table));
+        fft_end();
+        Ok(res)
     }
 
     fn compute_key_table_polynomial(&self) -> Result<DensePolynomial<F>, PlonkError> {
+        fft_start();
         let key_table = self.compute_key_table()?;
         let domain = &self.eval_domain;
-        Ok(DensePolynomial::from_coefficients_vec(
-            domain.ifft(&key_table),
-        ))
+
+        let res = DensePolynomial::from_coefficients_vec(domain.ifft(&key_table));
+        fft_end();
+        Ok(res)
     }
 
     fn compute_merged_lookup_table(&self, tau: F) -> Result<Vec<F>, PlonkError> {
@@ -1252,8 +1276,12 @@ where
             product_vec.push(prev_prod * a / b);
         }
         product_vec.push(F::one());
+
+        fft_start();
         domain.ifft_in_place(&mut product_vec);
-        Ok(DensePolynomial::from_coefficients_vec(product_vec))
+        let res = DensePolynomial::from_coefficients_vec(product_vec);
+        fft_end();
+        Ok(res)
     }
 
     fn compute_lookup_sorted_vec_polynomials(
@@ -1301,8 +1329,11 @@ where
         if sorted_vec.len() != 2 * n - 1 {
             return Err(ParameterError("The sorted vector has wrong length, some lookup variables might be outside the table".to_string()).into());
         }
+
+        fft_start();
         let h1_poly = DensePolynomial::from_coefficients_vec(domain.ifft(&sorted_vec[..n]));
         let h2_poly = DensePolynomial::from_coefficients_vec(domain.ifft(&sorted_vec[n - 1..]));
+        fft_end();
         Ok((sorted_vec, h1_poly, h2_poly))
     }
 }
diff --git a/plonk/src/lib.rs b/plonk/src/lib.rs
index 4020ea392..25b6139ac 100644
--- a/plonk/src/lib.rs
+++ b/plonk/src/lib.rs
@@ -18,6 +18,7 @@ extern crate downcast_rs;
 #[macro_use]
 extern crate derivative;
 
+pub mod bencher;
 pub mod circuit;
 pub mod constants;
 pub mod errors;
diff --git a/plonk/src/proof_system/prover.rs b/plonk/src/proof_system/prover.rs
index fbc0ea40d..5bbe2592d 100644
--- a/plonk/src/proof_system/prover.rs
+++ b/plonk/src/proof_system/prover.rs
@@ -11,6 +11,7 @@ use super::structs::{
     PlookupOracles, ProofEvaluations, ProvingKey,
 };
 use crate::{
+    bencher::{fft_end, fft_start, msm_end, msm_start, poly_eval_end, poly_eval_start},
     circuit::Arithmetization,
     constants::{domain_size_ratio, GATE_WIDTH},
     errors::{PlonkError, SnarkError::*},
@@ -80,7 +81,9 @@ impl<E: PairingEngine> Prover<E> {
             .into_iter()
             .map(|poly| self.mask_polynomial(prng, poly, 1))
             .collect();
+        msm_start();
         let wires_poly_comms = Self::commit_polynomials(ck, &wire_polys)?;
+        msm_end();
         let pub_input_poly = cs.compute_pub_input_polynomial()?;
         Ok(((wires_poly_comms, wire_polys), pub_input_poly))
     }
@@ -104,7 +107,9 @@ impl<E: PairingEngine> Prover<E> {
         let h_1_poly = self.mask_polynomial(prng, h_1_poly, 2);
         let h_2_poly = self.mask_polynomial(prng, h_2_poly, 2);
         let h_polys = vec![h_1_poly, h_2_poly];
+        msm_start();
         let h_poly_comms = Self::commit_polynomials(ck, &h_polys)?;
+        msm_end();
         Ok(((h_poly_comms, h_polys), sorted_vec, merged_lookup_table))
     }
 
@@ -122,7 +127,9 @@ impl<E: PairingEngine> Prover<E> {
             cs.compute_prod_permutation_polynomial(&challenges.beta, &challenges.gamma)?,
             2,
         );
+        msm_start();
         let prod_perm_comm = Self::commit_polynomial(ck, &prod_perm_poly)?;
+        msm_end();
         Ok((prod_perm_comm, prod_perm_poly))
     }
 
@@ -155,7 +162,9 @@ impl<E: PairingEngine> Prover<E> {
             )?,
             2,
         );
+        msm_start();
         let prod_lookup_comm = Self::commit_polynomial(ck, &prod_lookup_poly)?;
+        msm_end();
         Ok((prod_lookup_comm, prod_lookup_poly))
     }
 
@@ -173,8 +182,9 @@ impl<E: PairingEngine> Prover<E> {
         let quot_poly =
             self.compute_quotient_polynomial(challenges, pks, online_oracles, num_wire_types)?;
         let split_quot_polys = self.split_quotient_polynomial(&quot_poly, num_wire_types)?;
+        msm_start();
         let split_quot_poly_comms = Self::commit_polynomials(ck, &split_quot_polys)?;
-
+        msm_end();
         Ok((split_quot_poly_comms, split_quot_polys))
     }
 
@@ -190,6 +200,9 @@ impl<E: PairingEngine> Prover<E> {
         online_oracles: &Oracles<E::Fr>,
         num_wire_types: usize,
     ) -> ProofEvaluations<E::Fr> {
+        // TODO: a potential optimization -- dense polynomial evaluations re-computed
+        // powers-of-zetas consider pre-compute them and pass them in
+        poly_eval_start();
         let wires_evals: Vec<E::Fr> = online_oracles
             .wire_polys
             .par_iter()
@@ -205,6 +218,7 @@ impl<E: PairingEngine> Prover<E> {
             .prod_perm_poly
             .evaluate(&(challenges.zeta * self.domain.group_gen));
 
+        poly_eval_end();
         ProofEvaluations {
             wires_evals,
             wire_sigma_evals,
@@ -220,6 +234,8 @@ impl<E: PairingEngine> Prover<E> {
         challenges: &Challenges<E::Fr>,
         online_oracles: &Oracles<E::Fr>,
     ) -> Result<PlookupEvaluations<E::Fr>, PlonkError> {
+        poly_eval_start();
+
         if pk.plookup_pk.is_none() {
             return Err(ParameterError(
                 "Evaluate Plookup polynomials without supporting lookup".to_string(),
@@ -241,6 +257,8 @@ impl<E: PairingEngine> Prover<E> {
         let h_1_eval = online_oracles.plookup_oracles.h_polys[0].evaluate(&challenges.zeta);
         let q_lookup_eval = pk.q_lookup_poly()?.evaluate(&challenges.zeta);
 
+        // TODO: a potential optimization -- dense polynomial evaluations re-computed
+        // powers-of-gs consider pre-compute them and pass them in
         let zeta_mul_g = challenges.zeta * self.domain.group_gen;
         let prod_next_eval = online_oracles
             .plookup_oracles
@@ -254,6 +272,7 @@ impl<E: PairingEngine> Prover<E> {
         let w_3_next_eval = online_oracles.wire_polys[3].evaluate(&zeta_mul_g);
         let w_4_next_eval = online_oracles.wire_polys[4].evaluate(&zeta_mul_g);
 
+        poly_eval_end();
         Ok(PlookupEvaluations {
             range_table_eval,
             key_table_eval,
@@ -483,8 +502,10 @@ impl<E: PairingEngine> Prover<E> {
             *eval_point,
             &empty_rand,
         )?;
-
-        Self::commit_polynomial(ck, &witness_poly)
+        msm_start();
+        let res = Self::commit_polynomial(ck, &witness_poly);
+        msm_end();
+        res
     }
 
     /// Compute the quotient polynomial via (i)FFTs.
@@ -521,10 +542,12 @@ impl<E: PairingEngine> Prover<E> {
         let alpha_3 = challenges.alpha.square() * challenges.alpha;
         let alpha_7 = alpha_3.square() * challenges.alpha;
         // enumerate proving instances
+        fft_start();
         for (oracles, pk) in online_oracles.iter().zip(pks.iter()) {
             // lookup_flag = 1 if support Plookup argument.
             let lookup_flag = pk.plookup_pk.is_some();
 
+            // fft_start();
             // Compute coset evaluations.
             let selectors_coset_fft: Vec<Vec<E::Fr>> = pk
                 .selectors
@@ -581,6 +604,8 @@ impl<E: PairingEngine> Prover<E> {
                 (None, None, None, None)
             };
 
+            // fft_end();
+
             // Compute coset evaluations of the quotient polynomial.
             let quot_poly_coset_evals: Vec<E::Fr> = (0..m)
                 .into_par_iter()
@@ -646,9 +671,12 @@ impl<E: PairingEngine> Prover<E> {
             }
         }
         // Compute the coefficient form of the quotient polynomial
-        Ok(DensePolynomial::from_coefficients_vec(
+        // fft_start();
+        let res = DensePolynomial::from_coefficients_vec(
             self.quot_domain.coset_ifft(&quot_poly_coset_evals_sum),
-        ))
+        );
+        fft_end();
+        Ok(res)
     }
 
     // Compute the i-th coset evaluation of the circuit part of the quotient
diff --git a/plonk/src/proof_system/structs.rs b/plonk/src/proof_system/structs.rs
index 9a6f2d33a..1ac67f8f4 100644
--- a/plonk/src/proof_system/structs.rs
+++ b/plonk/src/proof_system/structs.rs
@@ -6,6 +6,7 @@
 
 //! Data structures used in Plonk proof systems
 use crate::{
+    bencher::{msm_end, msm_start},
     circuit::{
         customized::{
             ecc::{Point, SWToTEConParam},
@@ -866,13 +867,16 @@ impl<E: PairingEngine> ScalarsAndBases<E> {
     }
     /// Compute the multi-scalar multiplication.
     pub(crate) fn multi_scalar_mul(&self) -> E::G1Projective {
+        msm_start();
         let mut bases = vec![];
         let mut scalars = vec![];
         for (&base, scalar) in &self.base_scalar_map {
             bases.push(base);
             scalars.push(scalar.into_repr());
         }
-        VariableBaseMSM::multi_scalar_mul(&bases, &scalars)
+        let res = VariableBaseMSM::multi_scalar_mul(&bases, &scalars);
+        msm_end();
+        res
     }
 }
 
diff --git a/scripts/run_mt_bench.sh b/scripts/run_mt_bench.sh
new file mode 100755
index 000000000..f6bdcaa64
--- /dev/null
+++ b/scripts/run_mt_bench.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Run `cargo bench` with 64/32/16/8/4 rayon threads, logging each run to target/<N>core.log.
+rm -f target/*.txt
+rm -f target/*.log
+RAYON_NUM_THREADS=64 cargo bench --features=bench > target/64core.log
+RAYON_NUM_THREADS=32 cargo bench --features=bench > target/32core.log
+RAYON_NUM_THREADS=16 cargo bench --features=bench > target/16core.log
+RAYON_NUM_THREADS=8 cargo bench --features=bench > target/8core.log
+RAYON_NUM_THREADS=4 cargo bench --features=bench > target/4core.log
+
+
+