From 42398184c7a109c8c3a8cf9c3d2df669a9778575 Mon Sep 17 00:00:00 2001 From: Josh Bundt Date: Fri, 28 Jan 2022 13:03:33 -0500 Subject: [PATCH] adds support for new compilers - modifies regex in `tiknib/utils.py` to match any version of gcc/clang - adds `config/path_variables.py` to enable ease of use. note: `path_variables.py` was written to work in Bash and Python universally. - fixes a problem with the ase18 dataset and the coreutils debug information which caused all v6.5 functions to be discarded. - adds tabulate to print out a formatted ROC table. - adds `-P+` to enable compressing the IDA Pro databases. Saves a lot of storage space for this massive dataset! For example objdump reduces from 48M to 6M. ``` 6.8M Dec 2 2018 /tmp/notpacked/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf 48M Jan 28 10:43 /tmp/notpacked/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf.i64 6.8M Dec 2 2018 /tmp/packed/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf 6.2M Jan 28 10:41 /tmp/packed/binutils-2.30_clang-7.0_arm_64_O0_objdump.elf.i64 ``` --- README.md | 11 ++++--- config/config_list_openssl.txt | 20 +++++------ example/example.sh | 33 ++++++++++++++++--- helper/do_idascript.py | 5 +-- helper/extract_lineno.py | 8 +++-- helper/filter_functions.py | 2 +- helper/get_roc_table.py | 51 +++++++++++++++++++---------- helper/run_ase.sh | 31 ++++++++++++------ helper/run_ase_roc.sh | 18 +++++----- helper/run_extra_roc.sh | 6 ++-- helper/run_gnu.sh | 24 +++++++------- helper/run_gnu_roc.sh | 15 +++++---- helper/run_gnu_roc_type.sh | 16 +++++---- helper/run_openssl_roc.sh | 4 ++- helper/run_openssl_roc_topk.sh | 4 ++- helper/run_openssl_roc_topk_type.sh | 4 ++- helper/run_openssl_roc_type.sh | 4 ++- requirements.txt | 1 + tiknib/idascript.py | 5 +-- tiknib/utils.py | 17 +++++----- 20 files changed, 178 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index 060a401..a9b83e6 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,10 @@ confirm their equivalence. 
Based on these criteria we conducted several steps to build ground truth and clean the datasets. For more details, please check [our paper](https://arxiv.org/abs/2011.10749). -### 1. Run IDA Pro to extract preliminary data for each functions. +### 1. Configure path variables for IDA Pro and this repository (`config/path_variables.py`). + + +### 2. Run IDA Pro to extract preliminary data for each function. **This step takes the most time.** @@ -145,7 +148,7 @@ Additionally, **you can use this script to run any idascript for numerous binaries in parallel.** -### 2. Extract source file names and line numbers to build ground truth. +### 3. Extract source file names and line numbers to build ground truth. This extracts source file name and line number by parsing the debugging information in a given binary. The binary must have been compiled with the `-g` option. @@ -156,7 +159,7 @@ $ python helper/extract_lineno.py \ --threshold 1 ``` -### 3. Filter functions. +### 4. Filter functions. This filters functions by checking the source file name and line number. This removes compiler intrinsic functions and duplicate functions spread over multiple binaries within the same package. @@ -167,7 +170,7 @@ $ python helper/filter_functions.py \ --threshold 1 ``` -### (Optional) 4. Counting the number of functions. +### (Optional) 5. Counting the number of functions. This counts the number of functions and generates a graph of that function on the same path of `input_list`. This also prints the numbers separated by `','`. 
In the below example, a pdf file containing the graph will be diff --git a/config/config_list_openssl.txt b/config/config_list_openssl.txt index adf3bb2..a37747b 100644 --- a/config/config_list_openssl.txt +++ b/config/config_list_openssl.txt @@ -1,10 +1,10 @@ -/home/dongkwan/tiknib/config/openssl/config_openssl_all.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_arm_arm.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_arm_mips.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_arm_x86.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_mips_arm.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_mips_mips.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_mips_x86.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_x86_arm.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_x86_mips.yml -/home/dongkwan/tiknib/config/openssl/config_openssl_x86_x86.yml +config/openssl/config_openssl_all.yml +config/openssl/config_openssl_arm_arm.yml +config/openssl/config_openssl_arm_mips.yml +config/openssl/config_openssl_arm_x86.yml +config/openssl/config_openssl_mips_arm.yml +config/openssl/config_openssl_mips_mips.yml +config/openssl/config_openssl_mips_x86.yml +config/openssl/config_openssl_x86_arm.yml +config/openssl/config_openssl_x86_mips.yml +config/openssl/config_openssl_x86_x86.yml diff --git a/example/example.sh b/example/example.sh index 0ebcae8..64c3e56 100755 --- a/example/example.sh +++ b/example/example.sh @@ -1,24 +1,47 @@ #!/bin/bash + +source config/path_variables.py + +SECONDS=0 echo "Processing IDA analysis ..." python3 helper/do_idascript.py \ - --idapath "/home/dongkwan/.tools/ida-6.95" \ - --idc "tiknib/ida/fetch_funcdata.py" \ + --idapath "${IDA_PATH}" \ + --idc "${IDA_FETCH_FUNCDATA}" \ --input_list "example/input_list_find.txt" \ --log -echo "Extracting function types ..." + +echo "Extract source file names and line numbers... 
${SECONDS}s" +python3 helper/extract_lineno.py \ + --input_list "example/input_list_find.txt" \ + --threshold 1 + + +echo "Filtering functions... ${SECONDS}s" +python3 helper/filter_functions.py \ + --input_list "example/input_list_find.txt" \ + --threshold 1 + + +echo "Counting functions..." +python3 helper/count_functions.py \ + --input_list "example/input_list_find.txt" \ + --threshold 1 + + +echo "Extracting function types ... ${SECONDS}s" python3 helper/extract_functype.py \ --source_list "example/source_list.txt" \ --input_list "example/input_list_find.txt" \ --ctags_dir "data/ctags" \ --threshold 1 -echo "Extracting features ..." +echo "Extracting features ... ${SECONDS}s" python3 helper/extract_features.py \ --input_list "example/input_list_find.txt" \ --threshold 1 -echo "Testing features ..." +echo "Testing features ... ${SECONDS}s" python3 helper/test_roc.py \ --input_list "example/input_list_find.txt" \ --config "config/gnu/config_gnu_normal_all.yml" diff --git a/helper/do_idascript.py b/helper/do_idascript.py index b884c8a..703386b 100644 --- a/helper/do_idascript.py +++ b/helper/do_idascript.py @@ -5,6 +5,7 @@ sys.path.insert(0, os.path.join(sys.path[0], "..")) from tiknib.idascript import IDAScript from tiknib.utils import do_multiprocess +from config.path_variables import IDA_PATH, IDA_FETCH_FUNCDATA if __name__ == "__main__": op = OptionParser() @@ -16,7 +17,7 @@ action="store", type=str, dest="idapath", - default="/home/dongkwan/.tools/ida-6.95", + default=IDA_PATH, help="IDA directory path", ) op.add_option( @@ -24,7 +25,7 @@ action="store", type=str, dest="idc", - default="tiknib/ida/fetch_funcdata.py", + default=IDA_FETCH_FUNCDATA, help="IDA script file", ) op.add_option( diff --git a/helper/extract_lineno.py b/helper/extract_lineno.py index 134a0fb..bbe4024 100644 --- a/helper/extract_lineno.py +++ b/helper/extract_lineno.py @@ -8,6 +8,7 @@ from tiknib.utils import do_multiprocess from tiknib.utils import load_func_data, store_func_data from 
tiknib.utils import parse_source_path +from config.path_variables import * import logging import coloredlogs @@ -31,6 +32,9 @@ def extract_func_lineno(bin_name): func["src_path"] = line_map[func_addr][0] func["src_file"] = parse_source_path(func["src_path"]) func["src_line"] = line_map[func_addr][1] + # Fix ase18 source paths coreutils-6.7-6.5 / coreutils-6.7-6.7 + if 'coreutils-6.7-6.5' in func['src_path']: + func['src_path'] = func['src_path'].replace('6.7-6.5', '6.5') store_func_data(bin_name, func_data_list) return @@ -84,8 +88,8 @@ def extract_func_lineno(bin_name): from tiknib.idascript import IDAScript idascript = IDAScript( - idapath="/home/dongkwan/.tools/ida-6.95", - idc="tiknib/ida/fetch_funcdata.py", + idapath=IDA_PATH, + idc=IDA_FETCH_FUNCDATA, force=True, log=True, ) diff --git a/helper/filter_functions.py b/helper/filter_functions.py index 9dad62a..2f866af 100644 --- a/helper/filter_functions.py +++ b/helper/filter_functions.py @@ -39,7 +39,7 @@ def filter_funcs(bin_path): # print(func['name'], func['src_file'], func['src_line']) # filter functions by package name (remove functions inserted by compilers) - funcs = list(filter(lambda x: x['package'] in x['src_path'], funcs)) + funcs = list(filter(lambda x: pack_name in x['src_path'], funcs)) num_pack_funcs = len(funcs) if num_pack_funcs == 0: diff --git a/helper/get_roc_table.py b/helper/get_roc_table.py index f0d8465..2675c36 100644 --- a/helper/get_roc_table.py +++ b/helper/get_roc_table.py @@ -6,6 +6,7 @@ import numpy as np from optparse import OptionParser +from tabulate import tabulate sys.path.insert(0, os.path.join(sys.path[0], "..")) from tiknib.utils import load_cache @@ -15,6 +16,12 @@ rootLogger = logging.getLogger() rootLogger.setLevel(logging.INFO) +def config_rename(config_fname): + # TODO: clean up the key name (config_fname to something neat). 
+ config_key = os.path.basename(config_fname) + config_key = re.search("config_(.+).yml", config_key).groups()[0] + return config_key + def calc_tptn_gap(tps, tns): return np.mean(np.abs(tps - tns), axis=0) @@ -43,9 +50,7 @@ def load_results(opts): # select the latest one cache_dir = sorted(glob.glob("{}/*".format(outdir)))[-1] - # TODO: clean up the key name (config_fname to something neat). - config_key = os.path.basename(config_fname) - config_key = re.search("config_(.+).yml", config_key).groups()[0] + config_key = config_rename(config_fname) all_data[config_key] = [] features_inter = set() for idx in range(10): @@ -65,9 +70,7 @@ def load_results(opts): # Now fetch real data for config_idx, config_fname in enumerate(config_fnames): - # TODO: clean up the key name (config_fname to something neat). - config_key = os.path.basename(config_fname) - config_key = re.search("config_(.+).yml", config_key).groups()[0] + config_key = config_rename(config_fname) rocs = [] aps = [] @@ -154,18 +157,22 @@ def get_results(opts): config_fnames, total_data, features, features_union = load_results(opts) # first rows - print(','.join(map(lambda x: + row1 = ["# Train pairs (10^6)"] + row1.extend(list(map(lambda x: '%.2f' % (x[0] / 1000000.0) if x[0] > 100000 else '%.2fF' % (x[0] / 10000), total_data[0]))) - print(','.join(map(lambda x: + row2 = ["# Test pairs (10^6)"] + row2.extend(list(map(lambda x: '%.2f' % (x[1] / 1000000.0) if x[1] > 100000 else '%.2fF' % (x[1] / 10000), total_data[0]))) # second rows - print(','.join(map(lambda x: '%.1f' % (x[0]), total_data[1]))) - print(','.join(map(lambda x: '%.1f' % (x[1]), total_data[1]))) + row3 = ["Train time"] + ['%.1f' % x[0] for x in total_data[1]] + row4 = ["Test time"] + ['%.1f' % x[1] for x in total_data[1]] + + table = [row1, row2, row3, row4] # third rows for idx in features_union: @@ -177,22 +184,30 @@ def get_results(opts): s.append('%.2f-' % (data[feature][0])) else: s.append('%.2f' % (data[feature][0])) - print(','.join(s)) 
+ table.append(s) # fourth row - print(','.join(map(lambda x: '%.1f' % (x), total_data[3]))) + row = ["Avg # features"] + ['%.1f' % x for x in total_data[3]] + table.append(row) # fifth rows - print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[4]))) - print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[4]))) + row = ["Mean tptn_gap"] + ['%.2f' % x[0] for x in total_data[4]] + table.append(row) + row = ["Std tptn_gap"] + ['%.2f' % x[1] for x in total_data[4]] + table.append(row) # sixth rows - print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[5]))) - print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[5]))) + row = ["ROC AUC"] + ['%.2f' % x[0] for x in total_data[5]] + table.append(row) + row = ["Std. of ROC"] + ['%.2f' % x[1] for x in total_data[5]] + table.append(row) # seventh rows - print(','.join(map(lambda x: '%.2f' % (x[0]), total_data[6]))) - print(','.join(map(lambda x: '%.2f' % (x[1]), total_data[6]))) + row = ["Avg Prec (AP)"] + ['%.2f' % x[0] for x in total_data[6]] + table.append(row) + row = ["Std of AP"] + ['%.2f' % x[1] for x in total_data[6]] + table.append(row) + print(tabulate(table, floatfmt=".2f")) if __name__ == "__main__": diff --git a/helper/run_ase.sh b/helper/run_ase.sh index 9c89557..08fd68a 100755 --- a/helper/run_ase.sh +++ b/helper/run_ase.sh @@ -1,29 +1,34 @@ -#!/bin/bash +#!/bin/bash -ue set -x +source config/path_variables.py + declare -a input_list=( # This one is for processing all functions. - "/home/dongkwan/binkit-dataset/ase_debug.txt" + "${BINKIT_DATASET}/ase_debug.txt" # Then, for experiment and counting, we utilize them separately. 
-# "/home/dongkwan/binkit-dataset/ase1_debug.txt" -# "/home/dongkwan/binkit-dataset/ase2_debug.txt" -# "/home/dongkwan/binkit-dataset/ase3_debug.txt" -# "/home/dongkwan/binkit-dataset/ase4_debug.txt" +# "${BINKIT_DATASET}/ase1_debug.txt" +# "${BINKIT_DATASET}/ase2_debug.txt" +# "${BINKIT_DATASET}/ase3_debug.txt" +# "${BINKIT_DATASET}/ase4_debug.txt" ) -source_list="/home/dongkwan/binkit-dataset/ase_source_list.txt" -ctags_dir="/home/dongkwan/binkit-dataset/ase_ctags_data" +source_list="${BINKIT_DATASET}/ase_source_list.txt" +ctags_dir="${BINKIT_DATASET}/ase_ctags_data" +SECONDS=0 +echo "Processing IDA analysis ..." for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/do_idascript.py \ - --idapath "/home/dongkwan/.tools/ida-6.95" \ - --idc "/home/dongkwan/tiknib/tiknib/ida/fetch_funcdata_v6.95.py" \ + --idapath "${IDA_PATH}" \ + --idc "${IDA_FETCH_FUNCDATA}" \ --input_list "${f}" \ --log done +echo "Extract source file names and line numbers... ${SECONDS}s" for f in "${input_list[@]}" do echo "Processing ${f} ..." @@ -32,6 +37,7 @@ do --threshold 1 done +echo "Filtering functions... ${SECONDS}s" for f in "${input_list[@]}" do echo "Processing ${f} ..." @@ -40,6 +46,7 @@ do --threshold 1 done +echo "Counting functions... ${SECONDS}s" for f in "${input_list[@]}" do echo "Processing ${f} ..." @@ -48,6 +55,7 @@ do --threshold 1 done +echo "Extracting function types ... ${SECONDS}s" for f in "${input_list[@]}" do echo "Processing ${f} ..." @@ -58,6 +66,7 @@ do --threshold 1 done +echo "Extracting features ... ${SECONDS}s" for f in "${input_list[@]}" do echo "Processing ${f} ..." @@ -65,3 +74,5 @@ do --input_list "${f}" \ --threshold 1 done + +echo "DONE in ${SECONDS}s" diff --git a/helper/run_ase_roc.sh b/helper/run_ase_roc.sh index db2bbaf..8ad01e2 100755 --- a/helper/run_ase_roc.sh +++ b/helper/run_ase_roc.sh @@ -1,39 +1,41 @@ #!/bin/bash set -x +source config/path_variables.py + # You can run below commands in parallel. 
echo "Testing Presemantic features ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase1_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase1_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_ase1.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase2_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase2_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_ase2.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase3_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase3_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_ase3.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase4_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase4_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_ase4.yml" echo "Testing Type features ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase1_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase1_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_type_ase1.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase2_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase2_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_type_ase2.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase3_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase3_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_type_ase3.yml" python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/ase4_debug.txt" \ + --input_list "${BINKIT_DATASET}/ase4_debug.txt" \ --train_funcs_limit 200000 \ --config "config/ase18/config_type_ase4.yml" diff --git a/helper/run_extra_roc.sh b/helper/run_extra_roc.sh index 7940e32..50587ba 100755 --- a/helper/run_extra_roc.sh +++ b/helper/run_extra_roc.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source 
config/path_variables.py + declare -a input_list=( "config/gnu_lto/config_gnu_normal_others_lto_clang4.yml" "config/gnu_lto/config_gnu_normal_others_lto_clang5.yml" @@ -16,7 +18,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \ + --input_list "${BINKIT_DATASET}/test_lto.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -30,7 +32,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \ + --input_list "${BINKIT_DATASET}/test_noinline.txt" \ --train_funcs_limit 200000 \ --config "${f}" done diff --git a/helper/run_gnu.sh b/helper/run_gnu.sh index 0d3d231..1cd9268 100755 --- a/helper/run_gnu.sh +++ b/helper/run_gnu.sh @@ -1,23 +1,25 @@ -#!/bin/bash +#!/bin/bash -eu set -x +source config/path_variables.py + declare -a input_list=( - "/home/dongkwan/binkit-dataset/gnu_debug.txt" - "/home/dongkwan/binkit-dataset/gnu_debug_sizeopt.txt" - "/home/dongkwan/binkit-dataset/gnu_debug_pie.txt" - "/home/dongkwan/binkit-dataset/gnu_debug_noinline.txt" - "/home/dongkwan/binkit-dataset/gnu_debug_lto.txt" - "/home/dongkwan/binkit-dataset/gnu_debug_obfus.txt" + "${BINKIT_DATASET}/gnu_debug.txt" + "${BINKIT_DATASET}/gnu_debug_sizeopt.txt" + "${BINKIT_DATASET}/gnu_debug_pie.txt" + "${BINKIT_DATASET}/gnu_debug_noinline.txt" + "${BINKIT_DATASET}/gnu_debug_lto.txt" +# "${BINKIT_DATASET}/gnu_debug_obfus.txt" ) -source_list="/home/dongkwan/binkit-dataset/gnu_source_list.txt" -ctags_dir="/home/dongkwan/binkit-dataset/gnu_ctags_data" +source_list="${BINKIT_DATASET}/gnu_source_list.txt" +ctags_dir="${BINKIT_DATASET}/gnu_ctags_data" for f in "${input_list[@]}" do echo "Processing ${f} ..." 
python helper/do_idascript.py \ - --idapath "/home/dongkwan/.tools/ida-6.95" \ - --idc "/home/dongkwan/tiknib/tiknib/ida/fetch_funcdata.py" \ + --idapath "${IDA_PATH}" \ + --idc "${IDA_FETCH_FUNCDATA}" \ --input_list "${f}" \ --log done diff --git a/helper/run_gnu_roc.sh b/helper/run_gnu_roc.sh index 041b53b..772bead 100755 --- a/helper/run_gnu_roc.sh +++ b/helper/run_gnu_roc.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source config/path_variables.py + declare -a input_list=( "config/gnu/config_gnu_normal_all.yml" @@ -26,11 +28,12 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \ + --input_list "${BINKIT_DATASET}/gnu_debug.txt" \ --train_funcs_limit 200000 \ --config "${f}" done +exit 0 declare -a input_list=( @@ -42,7 +45,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_size.txt" \ + --input_list "${BINKIT_DATASET}/test_size.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -54,7 +57,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \ + --input_list "${BINKIT_DATASET}/test_lto.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -67,7 +70,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \ + --input_list "${BINKIT_DATASET}/test_noinline.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -79,7 +82,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_pie.txt" \ + --input_list "${BINKIT_DATASET}/test_pie.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -96,7 +99,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." 
python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_obfus.txt" \ + --input_list "${BINKIT_DATASET}/test_obfus.txt" \ --train_funcs_limit 200000 \ --config "${f}" done diff --git a/helper/run_gnu_roc_type.sh b/helper/run_gnu_roc_type.sh index aef6aa0..427d2a4 100755 --- a/helper/run_gnu_roc_type.sh +++ b/helper/run_gnu_roc_type.sh @@ -1,6 +1,8 @@ -#!/bin/bash +#!/bin/bash -eu set -x +source config/path_variables.py + declare -a input_list=( "config/gnu/config_gnu_normal_all_type.yml" @@ -26,7 +28,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \ + --input_list "${BINKIT_DATASET}/gnu_debug.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -42,7 +44,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_size.txt" \ + --input_list "${BINKIT_DATASET}/test_size.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -54,7 +56,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_lto.txt" \ + --input_list "${BINKIT_DATASET}/test_lto.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -67,7 +69,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_noinline.txt" \ + --input_list "${BINKIT_DATASET}/test_noinline.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -79,7 +81,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_pie.txt" \ + --input_list "${BINKIT_DATASET}/test_pie.txt" \ --train_funcs_limit 200000 \ --config "${f}" done @@ -96,7 +98,7 @@ for f in "${input_list[@]}" do echo "Processing ${f} ..." 
python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/test_obfus.txt" \ + --input_list "${BINKIT_DATASET}/test_obfus.txt" \ --train_funcs_limit 200000 \ --config "${f}" done diff --git a/helper/run_openssl_roc.sh b/helper/run_openssl_roc.sh index fa23d7a..bbcbc58 100755 --- a/helper/run_openssl_roc.sh +++ b/helper/run_openssl_roc.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source config/path_variables.py + declare -a input_list=( "config/openssl/config_openssl_all.yml" "config/openssl/config_openssl_arm_mips.yml" @@ -15,7 +17,7 @@ do # this is feature selecting for openssl, so we use gnu normal dataset echo "Processing ${f} ..." python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \ + --input_list "${BINKIT_DATASET}/gnu_debug.txt" \ --train_funcs_limit 200000 \ --config "${f}" done diff --git a/helper/run_openssl_roc_topk.sh b/helper/run_openssl_roc_topk.sh index b58ab1b..3b91902 100755 --- a/helper/run_openssl_roc_topk.sh +++ b/helper/run_openssl_roc_topk.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source config/path_variables.py + declare -a input_list=( "config/openssl_topk/config_topk_openssl.yml" ) @@ -9,7 +11,7 @@ do # this is feature selecting for openssl, so we use gnu normal dataset echo "Processing ${f} ..." python helper/test_topk.py \ - --input_list "/home/dongkwan/binkit-dataset/ase4_debug_openssl.txt" \ + --input_list "${BINKIT_DATASET}/ase4_debug_openssl.txt" \ --train_funcs_limit 200000 \ --config "${f}" diff --git a/helper/run_openssl_roc_topk_type.sh b/helper/run_openssl_roc_topk_type.sh index d2c7c36..8cedf37 100755 --- a/helper/run_openssl_roc_topk_type.sh +++ b/helper/run_openssl_roc_topk_type.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source config/path_variables.py + declare -a input_list=( "config/openssl_topk/config_topk_openssl_type.yml" ) @@ -9,7 +11,7 @@ do # this is feature selecting for openssl, so we use gnu normal dataset echo "Processing ${f} ..." 
python helper/test_topk.py \ - --input_list "/home/dongkwan/binkit-dataset/ase4_debug_openssl.txt" \ + --input_list "${BINKIT_DATASET}/ase4_debug_openssl.txt" \ --train_funcs_limit 200000 \ --config "${f}" diff --git a/helper/run_openssl_roc_type.sh b/helper/run_openssl_roc_type.sh index ae347dd..25813eb 100755 --- a/helper/run_openssl_roc_type.sh +++ b/helper/run_openssl_roc_type.sh @@ -1,6 +1,8 @@ #!/bin/bash set -x +source config/path_variables.py + declare -a input_list=( "config/openssl/config_openssl_all_type.yml" "config/openssl/config_openssl_arm_mips_type.yml" @@ -15,7 +17,7 @@ do echo "Processing ${f} ..." # this is feature selecting for openssl, so we use gnu normal dataset python helper/test_roc.py \ - --input_list "/home/dongkwan/binkit-dataset/gnu_debug.txt" \ + --input_list "${BINKIT_DATASET}/gnu_debug.txt" \ --train_funcs_limit 200000 \ --config "${f}" done diff --git a/requirements.txt b/requirements.txt index 15ddc06..2c96c12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ coloredlogs==6.0 pyelftools==0.27 PyYAML==5.4.1 scikit_learn==0.24.2 +tabulate==0.8.9 diff --git a/tiknib/idascript.py b/tiknib/idascript.py index 00669bb..db01fd1 100644 --- a/tiknib/idascript.py +++ b/tiknib/idascript.py @@ -6,6 +6,7 @@ from subprocess import run, PIPE from tiknib.utils import system, get_file_type, do_multiprocess +from config.path_variables import IDA_PATH import logging import coloredlogs @@ -17,7 +18,7 @@ class IDAScript: def __init__( self, - idapath="/home/dongkwan/.tools/ida-6.95", + idapath=IDA_PATH, idc=None, idcargs="", chunk_size=1, @@ -111,7 +112,7 @@ def run_helper(self, input_fname): ida = ida.replace("idal", "idat") # Setup command line arguments - path = [ida, "-A", '-S"{}"'.format(idc_args)] + path = [ida, '-A', '-P+', '-S"{}"'.format(idc_args)] if self.log or self.stdout: fd, tmp_fname = tempfile.mkstemp() os.close(fd) diff --git a/tiknib/utils.py b/tiknib/utils.py index 29b8b0b..fc58abf 100644 --- a/tiknib/utils.py +++ 
b/tiknib/utils.py @@ -7,7 +7,6 @@ import itertools from hashlib import sha1 from subprocess import Popen, PIPE -from statistics import mean as stat_mean import multiprocessing from multiprocessing import Pool, cpu_count @@ -25,27 +24,26 @@ logger = logging.getLogger(__name__) -RESTR = ( +RE_PATTERN = ( "(.*)_" - + "(gcc-4.9.4|gcc-5.5.0|gcc-6.4.0|gcc-7.3.0|gcc-8.2.0|" - + "clang-4.0|clang-5.0|clang-6.0|clang-7.0|" - + "clang-obfus-fla|clang-obfus-sub|clang-obfus-bcf|" - + "clang-obfus-all|clang-obfus-all-2|" + + "(gcc-[.0-9]+|clang-[.0-9]+|" + + "clang-obfus-[-a-z2]+|" + "gcc|clang)_" - + "(x86_32|x86_64|arm_32|arm_64|mips_32|mips_64|mipseb_32|mipseb_64)_" + + "((?:x86|arm|mips|mipseb|ppc)_(?:32|64))_" + "(O0|O1|O2|O3|Os)_" + "(.*)" ) +RESTR = re.compile(RE_PATTERN) # matches => package, compiler, arch, opti, bin_name def parse_fname(bin_path): base_name = os.path.basename(bin_path) - matches = re.search(RESTR, base_name).groups() + matches = RESTR.search(base_name).groups() return matches def parse_source_path(src_path): - matches = re.search(RESTR, src_path) + matches = RESTR.search(src_path) if not matches: return "" src_file = matches.groups()[-1] @@ -55,6 +53,7 @@ def parse_source_path(src_path): # statistics mean function cannot handle the empty list def mean(l): + from statistics import mean as stat_mean return stat_mean(l or [0])