diff --git a/.ackrc b/.ackrc
new file mode 100644
index 0000000..30903c0
--- /dev/null
+++ b/.ackrc
@@ -0,0 +1,3 @@
+--ignore-dir=build
+--ignore-dir=3rdparty
+--ignore-dir=bin
diff --git a/scripts/exprlib/env.py b/scripts/exprlib/env.py
index 56a8b21..07515be 100644
--- a/scripts/exprlib/env.py
+++ b/scripts/exprlib/env.py
@@ -218,26 +218,31 @@ def get_nodes_with_nics():
         return sorted(nodes_with_nics)
 
     @staticmethod
-    def get_cpu_mask_with_nics(only_phys_cores=True):
+    def get_cpu_mask_with_nics(num_max_cores_per_node, only_phys_cores=True):
         '''
         Return a CPU core index bitmask of all cores in the NUMA nodes that
         have NICs.
         '''
         core_bits = 0
         node_cpus = ExperimentEnv.get_core_topology()
         nodes_with_nics = ExperimentEnv.get_nodes_with_nics()
+        ht_div = ExperimentEnv.get_hyperthreading_degree() if only_phys_cores else 1
         for node_id in nodes_with_nics:
-            ht_div = ExperimentEnv.get_hyperthreading_degree() if only_phys_cores else 1
             phys_cnt = len(node_cpus[node_id]) // ht_div
-            for core_id in node_cpus[node_id][:phys_cnt]:
+            max_cnt = min(num_max_cores_per_node, phys_cnt)
+            for core_id in node_cpus[node_id][:max_cnt]:
                 core_bits |= (1 << core_id)
+            # Force add device-handling cores
+            core_bits |= (1 << node_cpus[node_id][phys_cnt - 1])
         return core_bits
 
     @staticmethod
     def mangle_main_args(config_name, click_name,
+                         num_max_cores_per_node,
                          emulate_opts=None,
                          extra_args=None, extra_dpdk_args=None):
+        core_mask = ExperimentEnv.get_cpu_mask_with_nics(num_max_cores_per_node)
         args = [
-            '-c', hex(ExperimentEnv.get_cpu_mask_with_nics()),
+            '-c', '{:x}'.format(core_mask),
             '-n', os.environ.get('NBA_MEM_CHANNELS', '4'),
         ]
         # TODO: translate emulate_opts to void_pmd options
@@ -269,6 +274,7 @@ def chdir_to_root():
 
     @asyncio.coroutine
     def execute_main(self, config_name, click_name, running_time=30.0,
+                     num_max_cores_per_node=64,
                      emulate_opts=None,
                      extra_args=None, extra_dpdk_args=None,
                      custom_stdout_coro=None):
@@ -278,7 +284,7 @@ def execute_main(self, config_name, click_name,
         self.chdir_to_root()
         config_path = os.path.normpath(os.path.join('configs', config_name))
         click_path = os.path.normpath(os.path.join('configs', click_name))
-        args = self.mangle_main_args(config_path, click_path,
+        args = self.mangle_main_args(config_path, click_path, num_max_cores_per_node,
                                      emulate_opts, extra_args, extra_dpdk_args)
         # Reset/initialize events.
diff --git a/scripts/run_all_apps.sh b/scripts/run_all_apps.sh
index 2e5d534..6d2624c 100755
--- a/scripts/run_all_apps.sh
+++ b/scripts/run_all_apps.sh
@@ -6,9 +6,9 @@ dropbox.py stop
 ./run_app_perf.py --prefix latency -b bin-backup/main -p 72 -l default.py ipv6-router-cpuonly.click
 ./run_app_perf.py --prefix latency -b bin-backup/main -p 72 -l default.py ipv6-router-gpuonly.click
 ./run_app_perf.py --prefix latency -b bin-backup/main.noreuse -p 72 -l default.py ipv6-router-gpuonly.click
-./run_app_perf.py --prefix latency -b bin-backup/main -p 64 -l default.py ipsec-encryption-cpuonly.click
-./run_app_perf.py --prefix latency -b bin-backup/main -p 64 -l default.py ipsec-encryption-gpuonly.click
-./run_app_perf.py --prefix latency -b bin-backup/main.noreuse -p 64 -l default.py ipsec-encryption-gpuonly.click
+./run_app_perf.py --prefix latency -b bin-backup/lmain -p 64 -l default.py ipsec-encryption-cpuonly.click
+./run_app_perf.py --prefix latency -b bin-backup/lmain -p 64 -l default.py ipsec-encryption-gpuonly.click
+./run_app_perf.py --prefix latency -b bin-backup/lmain.noreuse -p 64 -l default.py ipsec-encryption-gpuonly.click
 ./run_app_perf.py --prefix thruput -b bin-backup/main -p 64,128,256,512,1024,1500 default.py ipv4-router.click --combine-cpu-gpu
 ./run_app_perf.py --prefix thruput -b bin-backup/main -p 64,128,256,512,1024,1500 default.py ipv6-router.click --combine-cpu-gpu
 ./run_app_perf.py --prefix thruput -b bin-backup/main -p 64,128,256,512,1024,1500 default.py ipsec-encryption.click --combine-cpu-gpu
diff --git a/scripts/run_app_perf.py b/scripts/run_app_perf.py
index 5ac6e04..893907c 100755
--- a/scripts/run_app_perf.py
+++ b/scripts/run_app_perf.py
@@ -34,7 +34,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     result = ExperimentResult()
-    conf_name, io_batchsz, comp_batchsz, coproc_ppdepth, pktsz = conds
+    conf_name, io_batchsz, comp_batchsz, coproc_ppdepth, num_cores, pktsz = conds
 
     env.envvars['NBA_IO_BATCH_SIZE'] = str(io_batchsz)
     env.envvars['NBA_COMP_BATCH_SIZE'] = str(comp_batchsz)
@@ -50,8 +50,8 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     elif 'ipsec' in args.element_config_to_use:
         # ipv4 pkts with fixed 1K flows
         pktgen.args = ['-i', 'all', '-f', '1024', '-r', '0', '-v', '4', '-p', str(pktsz)]
-        extra_nba_args.append('--preserve-latency')
         if args.latency:
+            extra_nba_args.append('--preserve-latency')
             pktgen.args += ['-g', '3', '-l', '--latency-histogram']
     else:
         # All random ipv4 pkts
@@ -84,6 +84,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     else:
         retcode = await env.execute_main(args.sys_config_to_use,
                                          conf_name + '.click',
+                                         num_max_cores_per_node=num_cores,
                                          extra_args=extra_nba_args,
                                          running_time=32.0)
@@ -108,7 +109,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     for n in range(env.get_num_nodes()):
         if per_node_cnt[n] > 0:
             avg_thruput_records.append((
-                (conf_name, io_batchsz, comp_batchsz, coproc_ppdepth, n, pktsz),
+                (conf_name, io_batchsz, comp_batchsz, coproc_ppdepth, num_cores, n, pktsz),
                 (per_node_mpps_sum[n] / per_node_cnt[n],
                  per_node_gbps_sum[n] / per_node_cnt[n])))
     result.thruput_records = avg_thruput_records
@@ -146,6 +147,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     parser.add_argument('--io-batch-sizes', type=comma_sep_numbers(1, 256), metavar='NUM[,NUM...]', default=[32])
     parser.add_argument('--comp-batch-sizes', type=comma_sep_numbers(1, 256), metavar='NUM[,NUM...]', default=[64])
     parser.add_argument('--coproc-ppdepths', type=comma_sep_numbers(1, 256), metavar='NUM[,NUM...]', default=[32])
+    parser.add_argument('--num-cores', type=comma_sep_numbers(1, 64), metavar='NUM[,NUM...]', default=[64])
     parser.add_argument('-t', '--transparent', action='store_true', default=False, help='Pass-through the standard output instead of parsing it. No default timeout is applied.')
     parser.add_argument('--timeout', type=int, default=None, help='Set a forced timeout for transparent mode.')
     parser.add_argument('--no-record', action='store_true', default=False, help='Do NOT record the results.')
@@ -178,6 +180,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
         args.io_batch_sizes,
         args.comp_batch_sizes,
         args.coproc_ppdepths,
+        args.num_cores,
         tuple(range(env.get_num_nodes())),
         args.pkt_sizes
     ))
@@ -186,6 +189,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
         args.io_batch_sizes,
         args.comp_batch_sizes,
         args.coproc_ppdepths,
+        args.num_cores,
         args.pkt_sizes
     ))
     mi = pd.MultiIndex.from_tuples(combinations, names=[
@@ -193,6 +197,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
         'io_batchsz',
         'comp_batchsz',
         'coproc_ppdepth',
+        'num_cores',
         'node_id',
         'pktsz',
     ])
@@ -230,7 +235,7 @@ async def do_experiment(loop, env, args, conds, thruput_reader):
     pd.set_option('display.expand_frame_repr', False)
     pd.set_option('display.float_format', lambda f: '{:.2f}'.format(f))
     system_tput = all_tput_recs.sum(level=['conf', 'io_batchsz', 'comp_batchsz',
-                                           'coproc_ppdepth', 'pktsz'])
+                                           'coproc_ppdepth', 'num_cores', 'pktsz'])
     print('Throughput per NUMA node')
     print('========================')
     print(all_tput_recs)
diff --git a/scripts/run_compbatching.sh b/scripts/run_compbatching.sh
new file mode 100755
index 0000000..74a1565
--- /dev/null
+++ b/scripts/run_compbatching.sh
@@ -0,0 +1,6 @@
+#! /bin/sh
+dropbox.py stop
+./run_app_perf.py --prefix compbatching -b bin-backup/main --comp-batch-sizes 1,4,8,16,32,64 -p 64,256,1500 default.py ipv4-router.click --combine-cpu-gpu
+./run_app_perf.py --prefix compbatching -b bin-backup/main --comp-batch-sizes 1,4,8,16,32,64 -p 64,256,1500 default.py ipv6-router.click --combine-cpu-gpu
+./run_app_perf.py --prefix compbatching -b bin-backup/main --comp-batch-sizes 1,4,8,16,32,64 -p 64,256,1500 default.py ipsec-encryption.click --combine-cpu-gpu
+dropbox.py start
diff --git a/scripts/run_scalability.sh b/scripts/run_scalability.sh
new file mode 100755
index 0000000..8fde44b
--- /dev/null
+++ b/scripts/run_scalability.sh
@@ -0,0 +1,6 @@
+#! /bin/sh
+dropbox.py stop
+./run_app_perf.py --prefix scalability -b bin-backup/main --num-cores 1,2,4,7 -p 64 default.py ipv4-router.click --combine-cpu-gpu
+./run_app_perf.py --prefix scalability -b bin-backup/main --num-cores 1,2,4,7 -p 64 default.py ipv6-router.click --combine-cpu-gpu
+./run_app_perf.py --prefix scalability -b bin-backup/main --num-cores 1,2,4,7 -p 64 default.py ipsec-encryption.click --combine-cpu-gpu
+dropbox.py start
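
Illustrative sketch (not part of the patch above): the snippet below mimics the core-mask logic that the modified get_cpu_mask_with_nics() applies, using a made-up two-node topology. Each NIC-attached NUMA node contributes at most num_max_cores_per_node of its physical cores, and its last physical core is always force-added as the device-handling core; mangle_main_args() then formats the resulting bitmask as the hex value of the '-c' core-mask argument. The helper name sketch_cpu_mask and the topology values are hypothetical, standing in for ExperimentEnv's runtime detection (get_core_topology, get_hyperthreading_degree).

    # Minimal sketch, assuming a fixed topology instead of runtime detection.
    def sketch_cpu_mask(node_cpus, nodes_with_nics, num_max_cores_per_node, ht_degree=2):
        core_bits = 0
        for node_id in nodes_with_nics:
            phys_cnt = len(node_cpus[node_id]) // ht_degree   # physical cores only
            max_cnt = min(num_max_cores_per_node, phys_cnt)   # per-node cap
            for core_id in node_cpus[node_id][:max_cnt]:
                core_bits |= (1 << core_id)
            # Force-add the node's last physical core (device-handling core).
            core_bits |= (1 << node_cpus[node_id][phys_cnt - 1])
        return core_bits

    # Hypothetical topology: two NUMA nodes, 8 hardware threads each (4 physical cores).
    node_cpus = {0: list(range(0, 8)), 1: list(range(8, 16))}
    mask = sketch_cpu_mask(node_cpus, nodes_with_nics=[0, 1], num_max_cores_per_node=2)
    print('{:x}'.format(mask))   # prints 'b0b': cores 0-1,3 on node 0 and 8-9,11 on node 1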