From 42398184c7a109c8c3a8cf9c3d2df669a9778575 Mon Sep 17 00:00:00 2001
From: Josh Bundt <joshua.bundt@gmail.com>
Date: Fri, 28 Jan 2022 13:03:33 -0500
Subject: [PATCH] adds support for new compilers

- modifies regex in `tiknib/utils.py` to match any version of gcc/clang
- adds `config/path_variables.py` to enable ease of use.
  note: `path_variables.py` is written so that the same file can be both
  sourced from Bash and imported from Python.
- fixes a problem with the ase18 dataset and the coreutils debug
  information which caused all v6.5 functions to be discarded.
- adds `tabulate` to print out a formatted ROC table.
- adds `-P+` to enable compressing the IDA Pro databases. This saves a lot of
  storage space for this massive dataset! For example, the objdump database
  shrinks from 48M to 6.2M.

```
6.8M Dec  2  2018 /tmp/notpacked/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf
 48M Jan 28 10:43 /tmp/notpacked/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf.i64
6.8M Dec  2  2018 /tmp/packed/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf
6.2M Jan 28 10:41 /tmp/packed/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf.i64
```
---
 README.md                           | 11 ++++---
 config/config_list_openssl.txt      | 20 +++++------
 example/example.sh                  | 33 ++++++++++++++++---
 helper/do_idascript.py              |  5 +--
 helper/extract_lineno.py            |  8 +++--
 helper/filter_functions.py          |  2 +-
 helper/get_roc_table.py             | 51 +++++++++++++++++++----------
 helper/run_ase.sh                   | 31 ++++++++++++------
 helper/run_ase_roc.sh               | 18 +++++-----
 helper/run_extra_roc.sh             |  6 ++--
 helper/run_gnu.sh                   | 24 +++++++-------
 helper/run_gnu_roc.sh               | 15 +++++----
 helper/run_gnu_roc_type.sh          | 16 +++++----
 helper/run_openssl_roc.sh           |  4 ++-
 helper/run_openssl_roc_topk.sh      |  4 ++-
 helper/run_openssl_roc_topk_type.sh |  4 ++-
 helper/run_openssl_roc_type.sh      |  4 ++-
 requirements.txt                    |  1 +
 tiknib/idascript.py                 |  5 +--
 tiknib/utils.py                     | 17 +++++-----
 20 files changed, 178 insertions(+), 101 deletions(-)

diff --git a/README.md b/README.md
index 060a401..a9b83e6 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,10 @@ confirm their equivalence. Based on these criteria we conducted several steps to
 build ground truth and clean the datasets. For more details, please check [our
 paper](https://arxiv.org/abs/2011.10749).
 
-### 1. Run IDA Pro to extract preliminary data for each functions.
+### 1. Configure path variables for IDA Pro and this repository (`config/path_variables.py`).
+
+
+### 2. Run IDA Pro to extract preliminary data for each function.
 
 **This step takes the most time.**
 
@@ -145,7 +148,7 @@ Additionally, **you can use this script to run any idascript for numerous
 binaries in parallel.**
 
 
-### 2. Extract source file names and line numbers to build ground truth.
+### 3. Extract source file names and line numbers to build ground truth.
 This extracts source file name and line number by parsing the debugging
 information in a given binary. The binary must have been compiled with
 the `-g` option.
@@ -156,7 +159,7 @@ $ python helper/extract_lineno.py \
     --threshold 1
 ```
 
-### 3. Filter functions.
+### 4. Filter functions.
 This filters functions by checking the source file name and line number.
 This removes compiler intrinsic functions and duplicate functions spread
 over multiple binaries within the same package.
@@ -167,7 +170,7 @@ $ python helper/filter_functions.py \
     --threshold 1
 ```
 
-### (Optional) 4. Counting the number of functions.
+### (Optional) 5. Counting the number of functions.
 This counts the number of functions and generates a graph of that function
 on the same path of `input_list`. This also prints the numbers separated
 by `','`. In the below example, a pdf file containing the graph will be
diff --git a/config/config_list_openssl.txt b/config/config_list_openssl.txt
index adf3bb2..a37747b 100644
--- a/config/config_list_openssl.txt
+++ b/config/config_list_openssl.txt
@@ -1,10 +1,10 @@
-/home/dongkwan/tiknib/config/openssl/config_openssl_all.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_arm_arm.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_arm_mips.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_arm_x86.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_mips_arm.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_mips_mips.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_mips_x86.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_x86_arm.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_x86_mips.yml
-/home/dongkwan/tiknib/config/openssl/config_openssl_x86_x86.yml
+config/openssl/config_openssl_all.yml
+config/openssl/config_openssl_arm_arm.yml
+config/openssl/config_openssl_arm_mips.yml
+config/openssl/config_openssl_arm_x86.yml
+config/openssl/config_openssl_mips_arm.yml
+config/openssl/config_openssl_mips_mips.yml
+config/openssl/config_openssl_mips_x86.yml
+config/openssl/config_openssl_x86_arm.yml
+config/openssl/config_openssl_x86_mips.yml
+config/openssl/config_openssl_x86_x86.yml
diff --git a/example/example.sh b/example/example.sh
index 0ebcae8..64c3e56 100755
--- a/example/example.sh
+++ b/example/example.sh
@@ -1,24 +1,47 @@
 #!/bin/bash
+
+source config/path_variables.py
+
+SECONDS=0
 echo "Processing IDA analysis ..."
 python3 helper/do_idascript.py \
-    --idapath "/home/dongkwan/.tools/ida-6.95" \
-    --idc "tiknib/ida/fetch_funcdata.py" \
+    --idapath "${IDA_PATH}" \
+    --idc "${IDA_FETCH_FUNCDATA}" \
     --input_list "example/input_list_find.txt" \
     --log
 
-echo "Extracting function types ..."
+
+echo "Extract source file names and line numbers... ${SECONDS}s"
+python3 helper/extract_lineno.py \
+    --input_list "example/input_list_find.txt" \
+    --threshold 1
+
+
+echo "Filtering functions... ${SECONDS}s"
+python3 helper/filter_functions.py \
+    --input_list "example/input_list_find.txt" \
+    --threshold 1
+
+
+echo "Counting functions..."
+python3 helper/count_functions.py \
+    --input_list "example/input_list_find.txt" \
+    --threshold 1
+
+
+echo "Extracting function types ... ${SECONDS}s"
 python3 helper/extract_functype.py \
     --source_list "example/source_list.txt" \
     --input_list "example/input_list_find.txt" \
     --ctags_dir "data/ctags" \
     --threshold 1
 
-echo "Extracting features ..."
+echo "Extracting features ... ${SECONDS}s"
 python3 helper/extract_features.py \
     --input_list "example/input_list_find.txt" \
     --threshold 1
 
-echo "Testing features ..."
+echo "Testing features ... ${SECONDS}s"
 python3 helper/test_roc.py \
     --input_list "example/input_list_find.txt" \
     --config "config/gnu/config_gnu_normal_all.yml"
diff --git a/helper/do_idascript.py b/helper/do_idascript.py
index b884c8a..703386b 100644
--- a/helper/do_idascript.py
+++ b/helper/do_idascript.py
@@ -5,6 +5,7 @@
 sys.path.insert(0, os.path.join(sys.path[0], ".."))
 from tiknib.idascript import IDAScript
 from tiknib.utils import do_multiprocess
+from config.path_variables import IDA_PATH, IDA_FETCH_FUNCDATA
 
 if __name__ == "__main__":
     op = OptionParser()
@@ -16,7 +17,7 @@
         action="store",
         type=str,
         dest="idapath",
-        default="/home/dongkwan/.tools/ida-6.95",
+        default=IDA_PATH,
         help="IDA directory path",
     )
     op.add_option(
@@ -24,7 +25,7 @@
         action="store",
         type=str,
         dest="idc",
-        default="tiknib/ida/fetch_funcdata.py",
+        default=IDA_FETCH_FUNCDATA,
         help="IDA script file",
     )
     op.add_option(
diff --git a/helper/extract_lineno.py b/helper/extract_lineno.py
index 134a0fb..bbe4024 100644
--- a/helper/extract_lineno.py
+++ b/helper/extract_lineno.py
@@ -8,6 +8,7 @@
 from tiknib.utils import do_multiprocess
 from tiknib.utils import load_func_data, store_func_data
 from tiknib.utils import parse_source_path
+from config.path_variables import *
 
 import logging
 import coloredlogs
@@ -31,6 +32,9 @@ def extract_func_lineno(bin_name):
         func["src_path"] = line_map[func_addr][0]
         func["src_file"] = parse_source_path(func["src_path"])
         func["src_line"] = line_map[func_addr][1]
+        # Fix ase18 source paths coreutils-6.7-6.5 / coreutils-6.7-6.7
+        if 'coreutils-6.7-6.5' in func['src_path']:
+            func['src_path'] = func['src_path'].replace('6.7-6.5', '6.5')
     store_func_data(bin_name, func_data_list)
     return
 
@@ -84,8 +88,8 @@ def extract_func_lineno(bin_name):
 
         from tiknib.idascript import IDAScript
         idascript = IDAScript(
-            idapath="/home/dongkwan/.tools/ida-6.95",
-            idc="tiknib/ida/fetch_funcdata.py",
+            idapath=IDA_PATH,
+            idc=IDA_FETCH_FUNCDATA,
             force=True,
             log=True,
         )
diff --git a/helper/filter_functions.py b/helper/filter_functions.py
index 9dad62a..2f866af 100644
--- a/helper/filter_functions.py
+++ b/helper/filter_functions.py
@@ -39,7 +39,7 @@ def filter_funcs(bin_path):
     #        print(func['name'], func['src_file'], func['src_line'])
 
     # filter functions by package name (remove functions inserted by compilers)
-    funcs = list(filter(lambda x: x['package'] in x['src_path'], funcs))
+    funcs = list(filter(lambda x: pack_name in x['src_path'], funcs))
     num_pack_funcs = len(funcs)
 
     if num_pack_funcs == 0:
diff --git a/helper/get_roc_table.py b/helper/get_roc_table.py
index f0d8465..2675c36 100644
--- a/helper/get_roc_table.py
+++ b/helper/get_roc_table.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from optparse import OptionParser
+from tabulate import tabulate
 
 sys.path.insert(0, os.path.join(sys.path[0], ".."))
 from tiknib.utils import load_cache
@@ -15,6 +16,12 @@
 rootLogger = logging.getLogger()
 rootLogger.setLevel(logging.INFO)
 
+def config_rename(config_fname):
+    # TODO: clean up the key name (config_fname to something neat).
+    config_key = os.path.basename(config_fname)
+    config_key = re.search("config_(.+).yml", config_key).groups()[0]
+    return config_key
+
 def calc_tptn_gap(tps, tns):
     return np.mean(np.abs(tps - tns), axis=0)
 
@@ -43,9 +50,7 @@ def load_results(opts):
         # select the latest one
         cache_dir = sorted(glob.glob("{}/*".format(outdir)))[-1]
 
-        # TODO: clean up the key name (config_fname to something neat).
-        config_key = os.path.basename(config_fname)
-        config_key = re.search("config_(.+).yml", config_key).groups()[0]
+        config_key = config_rename(config_fname)
         all_data[config_key] = []
         features_inter = set()
         for idx in range(10):
@@ -65,9 +70,7 @@ def load_results(opts):
 
     # Now fetch real data
     for config_idx, config_fname in enumerate(config_fnames):
-        # TODO: clean up the key name (config_fname to something neat).
-        config_key = os.path.basename(config_fname)
-        config_key = re.search("config_(.+).yml", config_key).groups()[0]
+        config_key = config_rename(config_fname)
 
         rocs = []
         aps = []
@@ -154,18 +157,22 @@ def get_results(opts):
     config_fnames, total_data, features, features_union = load_results(opts)
 
     # first rows
-    print(','.join(map(lambda x:
+    row1 = ["# Train pairs (10^6)"]
+    row1.extend(list(map(lambda x:
                        '%.2f' % (x[0] / 1000000.0)
                        if x[0] > 100000
                        else '%.2fF' % (x[0] / 10000), total_data[0])))
-    print(','.join(map(lambda x:
+    row2 = ["# Test pairs (10^6)"]
+    row2.extend(list(map(lambda x:
                        '%.2f' % (x[1] / 1000000.0)
                        if x[1] > 100000
                        else '%.2fF' % (x[1] / 10000), total_data[0])))
 
     # second rows
-    print(','.join(map(lambda x: '%.1f' % (x[0]), total_data[1])))
-    print(','.join(map(lambda x: '%.1f' % (x[1]), total_data[1])))
+    row3 = ["Train time"] + ['%.1f' % x[0] for x in total_data[1]]
+    row4 = ["Test time"] + ['%.1f' % x[1] for x in total_data[1]]
+
+    table = [row1, row2, row3, row4]
 
     # third rows
     for idx in features_union:
@@ -177,22 +184,30 @@ def get_results(opts):
                 s.append('%.2f-' % (data[feature][0]))
             else:
                 s.append('%.2f' % (data[feature][0]))
-        print(','.join(s))
+        table.append(s)
 
     # fourth row
-    print(','.join(map(lambda x: '%.1f' % (x), total_data[3])))
+    row = ["Avg # features"] + ['%.1f' % x for x in total_data[3]]
+    table.append(row)
 
     # fifth rows
-    print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[4])))
-    print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[4])))
+    row = ["Mean tptn_gap"] + ['%.2f' % x[0] for x in total_data[4]]
+    table.append(row)
+    row = ["Std tptn_gap"] + ['%.2f' % x[1] for x in total_data[4]]
+    table.append(row)
 
     # sixth rows
-    print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[5])))
-    print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[5])))
+    row = ["ROC AUC"] + ['%.2f' % x[0] for x in total_data[5]]
+    table.append(row)
+    row = ["Std. of  ROC"] + ['%.2f' % x[1] for x in total_data[5]]
+    table.append(row)
 
     # seventh rows
-    print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[6])))
-    print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[6])))
+    row = ["Avg Prec (AP)"] + ['%.2f' % x[0] for x in total_data[6]]
+    table.append(row)
+    row = ["Std of AP"] + ['%.2f' % x[1] for x in total_data[6]]
+    table.append(row)
+    print(tabulate(table, floatfmt=".2f"))
 
 
 if __name__ == "__main__":
diff --git a/helper/run_ase.sh b/helper/run_ase.sh
index 9c89557..08fd68a 100755
--- a/helper/run_ase.sh
+++ b/helper/run_ase.sh
@@ -1,29 +1,34 @@
-#!/bin/bash
+#!/bin/bash -ue
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   # This one is for processing all functions.
-  "/home/dongkwan/binkit-dataset/ase_debug.txt"
+  "${BINKIT_DATASET}/ase_debug.txt"
   # Then, for experiment and counting, we utilize them separately.
-#  "/home/dongkwan/binkit-dataset/ase1_debug.txt"
-#  "/home/dongkwan/binkit-dataset/ase2_debug.txt"
-#  "/home/dongkwan/binkit-dataset/ase3_debug.txt"
-#  "/home/dongkwan/binkit-dataset/ase4_debug.txt"
+#  "${BINKIT_DATASET}/ase1_debug.txt"
+#  "${BINKIT_DATASET}/ase2_debug.txt"
+#  "${BINKIT_DATASET}/ase3_debug.txt"
+#  "${BINKIT_DATASET}/ase4_debug.txt"
 )
 
-source_list="/home/dongkwan/binkit-dataset/ase_source_list.txt"
-ctags_dir="/home/dongkwan/binkit-dataset/ase_ctags_data"
+source_list="${BINKIT_DATASET}/ase_source_list.txt"
+ctags_dir="${BINKIT_DATASET}/ase_ctags_data"
 
+SECONDS=0
+echo "Processing IDA analysis ..."
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/do_idascript.py \
-    --idapath "/home/dongkwan/.tools/ida-6.95" \
-    --idc "/home/dongkwan/tiknib/tiknib/ida/fetch_funcdata_v6.95.py" \
+    --idapath "${IDA_PATH}" \
+    --idc "${IDA_FETCH_FUNCDATA}" \
     --input_list "${f}" \
     --log
 done
 
+echo "Extract source file names and line numbers... ${SECONDS}s"
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
@@ -32,6 +37,7 @@ do
     --threshold 1
 done
 
+echo "Filtering functions... ${SECONDS}s"
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
@@ -40,6 +46,7 @@ do
     --threshold 1
 done
 
+echo "Counting functions... ${SECONDS}s"
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
@@ -48,6 +55,7 @@ do
     --threshold 1
 done
 
+echo "Extracting function types ... ${SECONDS}s"
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
@@ -58,6 +66,7 @@ do
     --threshold 1
 done
 
+echo "Extracting features ... ${SECONDS}s"
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
@@ -65,3 +74,5 @@ do
     --input_list "${f}" \
     --threshold 1
 done
+
+echo "DONE in ${SECONDS}s"
diff --git a/helper/run_ase_roc.sh b/helper/run_ase_roc.sh
index db2bbaf..8ad01e2 100755
--- a/helper/run_ase_roc.sh
+++ b/helper/run_ase_roc.sh
@@ -1,39 +1,41 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 # You can run below commands in parallel.
 echo "Testing Presemantic features ..."
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase1_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase1_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_ase1.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase2_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase2_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_ase2.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase3_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase3_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_ase3.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase4_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase4_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_ase4.yml"
 
 echo "Testing Type features ..."
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase1_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase1_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_type_ase1.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase2_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase2_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_type_ase2.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase3_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase3_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_type_ase3.yml"
 python helper/test_roc.py \
-  --input_list "/home/dongkwan/binkit-dataset/ase4_debug.txt" \
+  --input_list "${BINKIT_DATASET}/ase4_debug.txt" \
   --train_funcs_limit 200000 \
   --config "config/ase18/config_type_ase4.yml"
diff --git a/helper/run_extra_roc.sh b/helper/run_extra_roc.sh
index 7940e32..50587ba 100755
--- a/helper/run_extra_roc.sh
+++ b/helper/run_extra_roc.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/gnu_lto/config_gnu_normal_others_lto_clang4.yml"
   "config/gnu_lto/config_gnu_normal_others_lto_clang5.yml"
@@ -16,7 +18,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \
+    --input_list "${BINKIT_DATASET}/test_lto.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -30,7 +32,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \
+    --input_list "${BINKIT_DATASET}/test_noinline.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
diff --git a/helper/run_gnu.sh b/helper/run_gnu.sh
index 0d3d231..1cd9268 100755
--- a/helper/run_gnu.sh
+++ b/helper/run_gnu.sh
@@ -1,23 +1,25 @@
-#!/bin/bash
+#!/bin/bash -eu
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
-  "/home/dongkwan/binkit-dataset/gnu_debug.txt"
-  "/home/dongkwan/binkit-dataset/gnu_debug_sizeopt.txt"
-  "/home/dongkwan/binkit-dataset/gnu_debug_pie.txt"
-  "/home/dongkwan/binkit-dataset/gnu_debug_noinline.txt"
-  "/home/dongkwan/binkit-dataset/gnu_debug_lto.txt"
-  "/home/dongkwan/binkit-dataset/gnu_debug_obfus.txt"
+  "${BINKIT_DATASET}/gnu_debug.txt"
+  "${BINKIT_DATASET}/gnu_debug_sizeopt.txt"
+  "${BINKIT_DATASET}/gnu_debug_pie.txt"
+  "${BINKIT_DATASET}/gnu_debug_noinline.txt"
+  "${BINKIT_DATASET}/gnu_debug_lto.txt"
+# "${BINKIT_DATASET}/gnu_debug_obfus.txt"
 )
-source_list="/home/dongkwan/binkit-dataset/gnu_source_list.txt"
-ctags_dir="/home/dongkwan/binkit-dataset/gnu_ctags_data"
+source_list="${BINKIT_DATASET}/gnu_source_list.txt"
+ctags_dir="${BINKIT_DATASET}/gnu_ctags_data"
 
 for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/do_idascript.py \
-    --idapath "/home/dongkwan/.tools/ida-6.95" \
-    --idc "/home/dongkwan/tiknib/tiknib/ida/fetch_funcdata.py" \
+    --idapath "${IDA_PATH}" \
+    --idc "${IDA_FETCH_FUNCDATA}" \
     --input_list "${f}" \
     --log
 done
diff --git a/helper/run_gnu_roc.sh b/helper/run_gnu_roc.sh
index 041b53b..772bead 100755
--- a/helper/run_gnu_roc.sh
+++ b/helper/run_gnu_roc.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/gnu/config_gnu_normal_all.yml"
 
@@ -26,11 +28,12 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \
+    --input_list "${BINKIT_DATASET}/gnu_debug.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
 
+exit 0
 
 
 declare -a input_list=(
@@ -42,7 +45,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_size.txt" \
+    --input_list "${BINKIT_DATASET}/test_size.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -54,7 +57,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \
+    --input_list "${BINKIT_DATASET}/test_lto.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -67,7 +70,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \
+    --input_list "${BINKIT_DATASET}/test_noinline.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -79,7 +82,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_pie.txt" \
+    --input_list "${BINKIT_DATASET}/test_pie.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -96,7 +99,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_obfus.txt" \
+    --input_list "${BINKIT_DATASET}/test_obfus.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
diff --git a/helper/run_gnu_roc_type.sh b/helper/run_gnu_roc_type.sh
index aef6aa0..427d2a4 100755
--- a/helper/run_gnu_roc_type.sh
+++ b/helper/run_gnu_roc_type.sh
@@ -1,6 +1,8 @@
-#!/bin/bash
+#!/bin/bash -eu
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/gnu/config_gnu_normal_all_type.yml"
 
@@ -26,7 +28,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \
+    --input_list "${BINKIT_DATASET}/gnu_debug.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -42,7 +44,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_size.txt" \
+    --input_list "${BINKIT_DATASET}/test_size.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -54,7 +56,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \
+    --input_list "${BINKIT_DATASET}/test_lto.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -67,7 +69,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \
+    --input_list "${BINKIT_DATASET}/test_noinline.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -79,7 +81,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_pie.txt" \
+    --input_list "${BINKIT_DATASET}/test_pie.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
@@ -96,7 +98,7 @@ for f in "${input_list[@]}"
 do
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/test_obfus.txt" \
+    --input_list "${BINKIT_DATASET}/test_obfus.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
diff --git a/helper/run_openssl_roc.sh b/helper/run_openssl_roc.sh
index fa23d7a..bbcbc58 100755
--- a/helper/run_openssl_roc.sh
+++ b/helper/run_openssl_roc.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/openssl/config_openssl_all.yml"
   "config/openssl/config_openssl_arm_mips.yml"
@@ -15,7 +17,7 @@ do
   # this is feature selecting for openssl, so we use gnu normal dataset
   echo "Processing ${f} ..."
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \
+    --input_list "${BINKIT_DATASET}/gnu_debug.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
diff --git a/helper/run_openssl_roc_topk.sh b/helper/run_openssl_roc_topk.sh
index b58ab1b..3b91902 100755
--- a/helper/run_openssl_roc_topk.sh
+++ b/helper/run_openssl_roc_topk.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/openssl_topk/config_topk_openssl.yml"
 )
@@ -9,7 +11,7 @@ do
   # this is feature selecting for openssl, so we use gnu normal dataset
   echo "Processing ${f} ..."
   python helper/test_topk.py \
-    --input_list "/home/dongkwan/binkit-dataset/ase4_debug_openssl.txt" \
+    --input_list "${BINKIT_DATASET}/ase4_debug_openssl.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 
diff --git a/helper/run_openssl_roc_topk_type.sh b/helper/run_openssl_roc_topk_type.sh
index d2c7c36..8cedf37 100755
--- a/helper/run_openssl_roc_topk_type.sh
+++ b/helper/run_openssl_roc_topk_type.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/openssl_topk/config_topk_openssl_type.yml"
 )
@@ -9,7 +11,7 @@ do
   # this is feature selecting for openssl, so we use gnu normal dataset
   echo "Processing ${f} ..."
   python helper/test_topk.py \
-    --input_list "/home/dongkwan/binkit-dataset/ase4_debug_openssl.txt" \
+    --input_list "${BINKIT_DATASET}/ase4_debug_openssl.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 
diff --git a/helper/run_openssl_roc_type.sh b/helper/run_openssl_roc_type.sh
index ae347dd..25813eb 100755
--- a/helper/run_openssl_roc_type.sh
+++ b/helper/run_openssl_roc_type.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 set -x
 
+source config/path_variables.py
+
 declare -a input_list=(
   "config/openssl/config_openssl_all_type.yml"
   "config/openssl/config_openssl_arm_mips_type.yml"
@@ -15,7 +17,7 @@ do
   echo "Processing ${f} ..."
   # this is feature selecting for openssl, so we use gnu normal dataset
   python helper/test_roc.py \
-    --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \
+    --input_list "${BINKIT_DATASET}/gnu_debug.txt" \
     --train_funcs_limit 200000 \
     --config "${f}"
 done
diff --git a/requirements.txt b/requirements.txt
index 15ddc06..2c96c12 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ coloredlogs==6.0
 pyelftools==0.27
 PyYAML==5.4.1
 scikit_learn==0.24.2
+tabulate==0.8.9
diff --git a/tiknib/idascript.py b/tiknib/idascript.py
index 00669bb..db01fd1 100644
--- a/tiknib/idascript.py
+++ b/tiknib/idascript.py
@@ -6,6 +6,7 @@
 from subprocess import run, PIPE
 
 from tiknib.utils import system, get_file_type, do_multiprocess
+from config.path_variables import IDA_PATH
 
 import logging
 import coloredlogs
@@ -17,7 +18,7 @@
 class IDAScript:
     def __init__(
         self,
-        idapath="/home/dongkwan/.tools/ida-6.95",
+        idapath=IDA_PATH,
         idc=None,
         idcargs="",
         chunk_size=1,
@@ -111,7 +112,7 @@ def run_helper(self, input_fname):
             ida = ida.replace("idal", "idat")
 
         # Setup command line arguments
-        path = [ida, "-A", '-S"{}"'.format(idc_args)]
+        path = [ida, '-A', '-P+', '-S"{}"'.format(idc_args)]
         if self.log or self.stdout:
             fd, tmp_fname = tempfile.mkstemp()
             os.close(fd)
diff --git a/tiknib/utils.py b/tiknib/utils.py
index 29b8b0b..fc58abf 100644
--- a/tiknib/utils.py
+++ b/tiknib/utils.py
@@ -7,7 +7,6 @@
 import itertools
 from hashlib import sha1
 from subprocess import Popen, PIPE
-from statistics import mean as stat_mean
 
 import multiprocessing
 from multiprocessing import Pool, cpu_count
@@ -25,27 +24,26 @@
 
 logger = logging.getLogger(__name__)
 
-RESTR = (
+RE_PATTERN = (
     "(.*)_"
-    + "(gcc-4.9.4|gcc-5.5.0|gcc-6.4.0|gcc-7.3.0|gcc-8.2.0|"
-    + "clang-4.0|clang-5.0|clang-6.0|clang-7.0|"
-    + "clang-obfus-fla|clang-obfus-sub|clang-obfus-bcf|"
-    + "clang-obfus-all|clang-obfus-all-2|"
+    + "(gcc-[.0-9]+|clang-[.0-9]+|"
+    + "clang-obfus-[-a-z2]+|"
     + "gcc|clang)_"
-    + "(x86_32|x86_64|arm_32|arm_64|mips_32|mips_64|mipseb_32|mipseb_64)_"
+    + "((?:x86|arm|mips|mipseb|ppc)_(?:32|64))_"
     + "(O0|O1|O2|O3|Os)_"
     + "(.*)"
 )
+RESTR = re.compile(RE_PATTERN)
 
 # matches => package, compiler, arch, opti, bin_name
 def parse_fname(bin_path):
     base_name = os.path.basename(bin_path)
-    matches = re.search(RESTR, base_name).groups()
+    matches = RESTR.search(base_name).groups()
     return matches
 
 
 def parse_source_path(src_path):
-    matches = re.search(RESTR, src_path)
+    matches = RESTR.search(src_path)
     if not matches:
         return ""
     src_file = matches.groups()[-1]
@@ -55,6 +53,7 @@ def parse_source_path(src_path):
 
 # statistics mean function cannot handle the empty list
 def mean(l):
+    from statistics import mean as stat_mean
     return stat_mean(l or [0])