diff --git a/README.md b/README.md index e642ccd..990221d 100644 --- a/README.md +++ b/README.md @@ -85,8 +85,15 @@ For building the cross-compiling environment and dataset, please check ### 1. Run IDA Pro to extract preliminary data for each functions. -This step takes the most time. Please configure the `chunk_size` for parallel -processing. +**This step takes the most time.** + +This step fetches preliminary data for the functions in each binary and stores +the data in a `pickle` format. For a given binary, it generates a pickle file on +the same path with a suffix of `.pickle`. Please configure the `chunk_size` for +parallel processing. + +For IDA Pro v6.95 (the version used in the paper), use +`tiknib/ida/fetch_funcdata.py`. ```bash $ python helper/do_idascript.py \ @@ -96,10 +103,25 @@ $ python helper/do_idascript.py \ --log ``` -Additionally, you can use this script to run any idascript in parallel. +For IDA Pro v7.5, use `tiknib/ida/fetch_funcdata_v7.5.py`. + +```bash +$ python helper/do_idascript.py \ + --idapath "/home/dongkwan/.tools/ida-v7.5" \ + --idc "tiknib/ida/fetch_funcdata_v7.5.py" \ + --input_list "example/input_list_find.txt" \ + --log +``` + +Additionally, **you can use this script to run any idascript for numerous +binaries in parallel.** ### 2. Extract function type information for type features. +This step uses `ctags` to extract type information. It adds +`abstract_args_type` and `abstract_ret_type` to the previously created +`.pickle` file. + ```bash $ python helper/extract_functype.py \ --source_list "example/source_list.txt" \ @@ -108,14 +130,121 @@ $ python helper/extract_functype.py \ --threshold 1 ``` +For example, for a function type of `mode_change *__usercall@(const char +*ref_file@)` extracted from IDA Pro, it follows the ctags output and +recognizes that `mode_change` represents a custom `struct`. Consequently, it adds +the new data shown below. 
+ +``` python + 'abstract_args_type': ['char *'], + 'abstract_ret_type': 'struct *', +``` + ### 3. Extract numeric presemantic features and type features. +This extracts numeric presemantic features as stated above. + ```bash $ python helper/extract_features.py \ --input_list "example/input_list_find.txt" \ --threshold 1 ``` +The extracted features will be stored in each `.pickle` file. Below is an +example showing a part of extracted features for the `mode_create_from_ref` +function in the `find` binary in `findutils`. + +```python +{ + 'package': 'findutils-4.6.0', + 'bin_name': 'find.elf', + 'name': 'mode_create_from_ref', + 'arch': 'x86_64', + 'opti': 'O3', + 'compiler': 'gcc-8.2.0', + 'others': 'normal', + 'func_type': 'mode_change *__usercall@(const char *ref_file@)', + 'abstract_args_type': ['char *'], + 'ret_type': 'mode_change *', + 'abstract_ret_type': 'struct *', + 'cfg': [(0, 1), (0, 2), (1, 2)], + 'cfg_size': 3, + 'feature': { + 'cfg_avg_degree': 2, + 'cfg_avg_indegree': 1, + 'cfg_avg_loopintersize': 0, + 'cfg_avg_loopsize': 0, + 'cfg_avg_outdegree': 1, + 'cfg_avg_sccsize': 1, + 'cfg_max_depth': 2, + 'cfg_max_width': 2, + 'cfg_num_backedges': 0, + 'cfg_num_bfs_edges': 2, + 'cfg_num_degree': 6, + 'cfg_num_indegree': 3, + 'cfg_num_loops': 0, + 'cfg_num_loops_inter': 0, + 'cfg_num_outdegree': 3, + 'cfg_num_scc': 3, + 'cfg_size': 3, + 'cfg_sum_loopintersize': 0, + 'cfg_sum_loopsize': 0, + 'cfg_sum_sccsize': 3, + 'cg_num_callees': 2, + 'cg_num_callers': 0, + 'cg_num_imported_callees': 1, + 'cg_num_imported_calls': 1, + 'cg_num_incalls': 0, + 'cg_num_outcalls': 2, + 'data_avg_abs_strings': 0, + 'data_avg_arg_type': 2, + 'data_avg_consts': 144, + 'data_avg_strlen': 0, + 'data_mul_arg_type': 2, + 'data_num_args': 1, + 'data_num_consts': 1, + 'data_num_strings': 0, + 'data_ret_type': 2, + 'data_sum_abs_strings': 0, + 'data_sum_abs_strings_seq': 0, + 'data_sum_arg_type': 2, + 'data_sum_arg_type_seq': 2, + 'data_sum_consts_seq': 144, + 'data_sum_strlen': 0, + 
'data_sum_strlen_seq': 0, + 'inst_avg_abs_arith': 0.6666666666666666, + 'inst_avg_abs_ctransfer': 1.3333333333333333, + 'inst_avg_abs_dtransfer': 4.666666666666667, + 'inst_avg_arith': 0.6666666666666666, + 'inst_avg_bitflag': 0.3333333333333333, + 'inst_avg_cmp': 0.3333333333333333, + 'inst_avg_cndctransfer': 0.3333333333333333, + 'inst_avg_ctransfer': 1.0, + 'inst_avg_dtransfer': 4.666666666666667, + 'inst_avg_grp_call': 0.6666666666666666, + 'inst_avg_grp_jump': 0.3333333333333333, + 'inst_avg_grp_ret': 0.3333333333333333, + 'inst_avg_logic': 0.3333333333333333, + 'inst_avg_total': 7.333333333333333, + 'inst_num_abs_arith': 2.0, + 'inst_num_abs_ctransfer': 4.0, + 'inst_num_abs_dtransfer': 14.0, + 'inst_num_arith': 2.0, + 'inst_num_bitflag': 1.0, + 'inst_num_cmp': 1.0, + 'inst_num_cndctransfer': 1.0, + 'inst_num_ctransfer': 3.0, + 'inst_num_dtransfer': 14.0, + 'inst_num_grp_call': 2.0, + 'inst_num_grp_jump': 1.0, + 'inst_num_grp_ret': 1.0, + 'inst_num_logic': 1.0, + 'inst_num_total': 22 + }, + ... +} +``` + ### 4. Evaluate target configuration ```bash @@ -125,22 +254,8 @@ $ python helper/test_roc.py \ ``` For more details, please check `example/`. All configuration files for our -experiments are in `config/`. - -# Issues - -### Tested environment -We ran all our experiments on a server equipped with four Intel Xeon E7-8867v4 -2.40 GHz CPUs (total 144 cores), 896 GB DDR4 RAM, and 4 TB SSD. We setup Ubuntu -16.04 with IDA Pro v6.95 on the server. - -We will make it run on IDA Pro v7.5 soon. - -### Tested python version -- Python 3.8.0 - -### Running example -The time spent for running `example/example.sh` took as below. +experiments are in `config/`. The time spent for running `example/example.sh` +took as below. - Processing IDA analysis: 1384 s - Extracting function types: 102 s @@ -148,8 +263,7 @@ The time spent for running `example/example.sh` took as below. 
- Training: 31 s - Testing: 0.8 s -You can obtain below information after running `test_roc.py` in the example. -Note that below is just one example. +You can obtain below information after running `test_roc.py`. ``` Features: @@ -172,6 +286,15 @@ Avg. # of Train Pairs: 155437 Avg. # of Test Pairs: 17270 ``` +# Issues + +### Tested environment +We ran all our experiments on a server equipped with four Intel Xeon E7-8867v4 +2.40 GHz CPUs (total 144 cores), 896 GB DDR4 RAM, and 4 TB SSD. We setup Ubuntu +16.04 with IDA Pro v6.95 on the server. + +Currently, it works on IDA Pro v7.5 and Python 3.8.0 on the system. + # Authors This project has been conducted by the below authors at KAIST. * [Dongkwan Kim](https://0xdkay.me/) diff --git a/tiknib/ida/fetch_funcdata_v7.5.py b/tiknib/ida/fetch_funcdata_v7.5.py new file mode 100644 index 0000000..8432df0 --- /dev/null +++ b/tiknib/ida/fetch_funcdata_v7.5.py @@ -0,0 +1,267 @@ +import os +import sys +import string + +from hashlib import sha1 +from collections import defaultdict + +import time +import pprint as pp + +import idautils +import idc +import idaapi +import ida_pro +import ida_nalt +import ida_bytes + +sys.path.insert(0, ".") +from tiknib.utils import demangle, get_arch, init_idc, parse_fname, store_func_data + +printset = set(string.printable) +isprintable = lambda x: set(x).issubset(printset) + +# find consts +def get_consts(start_addr, end_addr): + consts = [] + for h in idautils.Heads(start_addr, end_addr): + insn = DecodeInstruction(h) + if insn: + for op in insn.ops: + if op.type == idaapi.o_imm: + # get operand value + imm_value = op.value + # check if addres is loaded in idb + if not ida_bytes.is_loaded(imm_value): + consts.append(imm_value) + return consts + + +# find strings +def get_strings(start_addr, end_addr): + strings = [] + for h in idautils.Heads(start_addr, end_addr): + refs = idautils.DataRefsFrom(h) + for ref in refs: + t = idc.get_str_type(ref) + if isinstance(t, int) and t >= 0: + s = 
idc.get_strlit_contents(ref) + if s and isprintable(s): + strings.append([h, s, t, ref]) + return strings + + +# This function returns a caller map, and callee map for each function. +def get_call_graph(): + callee_map = defaultdict(list) + caller_map = defaultdict(list) + for callee_ea in idautils.Functions(): + callee = idaapi.get_func(callee_ea) + # TODO: Sometimes, IDA returns false result. so we need to check this + if not callee: + continue + + callee_name = idc.get_func_name(callee_ea) + # TODO: check flow boolean 1 + for caller_ea in CodeRefsTo(callee_ea, 1): + caller = idaapi.get_func(caller_ea) + # TODO: Sometimes, IDA returns false result. so we need to check + if not caller: + continue + + caller_name = idc.get_func_name(caller_ea) + # TODO: check the correction - caller_ea -> callee_ea + callee_map[caller_name].append([callee_name, callee_ea]) + caller_map[callee_name].append([caller_name, caller_ea]) + + return caller_map, callee_map + + +# This function returns edges, and updates caller_map, and callee_map +def get_bb_graph(caller_map, callee_map): + edge_map = {} + bb_callee_map = {} + for func_ea in idautils.Functions(): + func = idaapi.get_func(func_ea) + if not func or func.start_ea == idaapi.BADADDR or func.end_ea == idaapi.BADADDR: + continue + + # TODO: study how to use flags + graph = idaapi.FlowChart(func, flags=idaapi.FC_PREDS) + func_name = idc.get_func_name(func.start_ea) + edge_map[func_name] = [] + bb_callee_map[func_name] = [] + for bb in graph: + if bb.start_ea == idaapi.BADADDR or bb.end_ea == idaapi.BADADDR: + continue + + for succbb in bb.succs(): + edge_map[func_name].append((bb.id, succbb.id)) + + for callee_name, callee_ea in callee_map[func_name]: + # Get address where current function calls a callee. 
+ if bb.start_ea <= callee_ea < bb.end_ea: + bb_callee_map[func_name].append((bb.id, callee_name, callee_ea)) + + return edge_map, bb_callee_map + + +def get_type(addr): + tif = idaapi.tinfo_t() + ida_nalt.get_tinfo(tif, addr) + funcdata = idaapi.func_type_data_t() + tif.get_func_details(funcdata) + func_type = idaapi.print_tinfo("", 0, 0, PRTYPE_1LINE, tif, "", "") + ret_type = idaapi.print_tinfo("", 0, 0, PRTYPE_1LINE, funcdata.rettype, "", "") + args = [] + for i in range(funcdata.size()): + arg_type = idaapi.print_tinfo("", 0, 0, PRTYPE_1LINE, funcdata[i].type, "", "") + args.append([i, funcdata[i].name, arg_type, funcdata[i].argloc.atype()]) + return [func_type, ret_type, args] + + +def main(): + # Get IDA default information + bin_path = ida_nalt.get_input_file_path() + with open(bin_path, "rb") as f: + bin_hash = sha1(f.read()).hexdigest() + img_base = idaapi.get_imagebase() + info = idaapi.get_inf_structure() + if info.is_64bit(): + bits = 64 + elif info.is_32bit(): + bits = 32 + else: + bits = 16 + + endian = "little" + if info.is_be(): + endian = "big" + arch = "_".join([info.procName, str(bits), endian]) + arch = get_arch(arch) + + # Parse option information + package, compiler, arch, opti, bin_name = parse_fname(bin_path) + if "_noinline" in bin_path: + other_option = "noinline" + elif "_pie" in bin_path: + other_option = "pie" + elif "_lto" in bin_path: + other_option = "lto" + else: + other_option = "normal" + + # Prepare default information for processing + caller_map, callee_map = get_call_graph() + edge_map, bb_callee_map = get_bb_graph(caller_map, callee_map) + + # Now extract function information + func_data = [] + for idx, addr in enumerate(list(idautils.Functions())): + function = idaapi.get_func(addr) + if ( + not function + or function.start_ea == idaapi.BADADDR + or function.end_ea == idaapi.BADADDR + ): + continue + + # IDA's default function information + func_name = get_func_name(addr).strip() + demangled_name, demangled_full_name = 
demangle(func_name) + graph = idaapi.FlowChart(function, flags=idaapi.FC_PREDS) + data = idc.get_bytes(addr, function.size()) or "" + data_hash = sha1(data).hexdigest() + stack_size = get_frame_size(addr) + + # Get imported callees. Note that the segment name is used because + # idaapi.get_import_module_name() sometimes returns bad results ... + imported_callees = [] + if func_name in callee_map: + imported_callees = list( + filter(lambda x: get_segm_name(x[1]) != get_segm_name(addr), callee_map[func_name]) + ) + + # Get type information from IDA + func_type, ret_type, args = get_type(addr) + + # Prepare basic block information for feature extraction + func_strings = [] + func_consts = [] + bb_data = [] + for bb in graph: + if bb.start_ea == idaapi.BADADDR or bb.end_ea == idaapi.BADADDR: + continue + + bb_size = bb.end_ea - bb.start_ea + block_data = idc.get_bytes(bb.start_ea, bb_size) or b"" + block_data_hash = sha1(block_data).hexdigest() + bb_strings = get_strings(bb.start_ea, bb.end_ea) + bb_consts = get_consts(bb.start_ea, bb.end_ea) + bb_callees = list(filter(lambda x: x[0] == bb.id, bb_callee_map[func_name])) + bb_data.append( + { + "size": bb_size, + "block_id": bb.id, + "startEA": bb.start_ea, + "endEA": bb.end_ea, + "type": bb.type, + "is_ret": idaapi.is_ret_block(bb.type), + "hash": block_data_hash, + "callees": bb_callees, + "strings": bb_strings, + "consts": bb_consts, + } + ) + func_strings.extend(bb_strings) + func_consts.extend(bb_consts) + func_data.append( + { + "ida_idx": idx, + "seg_name": get_segm_name(addr), + "name": func_name, + "demangled_name": demangled_name, + "demangled_full_name": demangled_full_name, + "hash": data_hash, + "size": function.size(), + "startEA": function.start_ea, + "endEA": function.end_ea, + "cfg_size": graph.size, + "img_base": img_base, + "bin_path": bin_path, + "bin_hash": bin_hash, + "bin_offset": addr - img_base, + "stack_size": stack_size, + "package": package, + "compiler": compiler, + "arch": arch, + "opti": 
opti, + "others": other_option, + "bin_name": bin_name, + "func_type": func_type, + "ret_type": ret_type, + "args": args, + "callers": caller_map[func_name], + "callees": callee_map[func_name], + "imported_callees": imported_callees, + "cfg": edge_map[func_name], + "strings": func_strings, + "consts": func_consts, + "bb_data": bb_data, + } + ) + return func_data + + +init_idc() +try: + func_data = main() +except: + import traceback + + traceback.print_exc() + ida_pro.qexit(1) +else: + bin_path = ida_nalt.get_input_file_path() + store_func_data(bin_path, func_data) + ida_pro.qexit(0) diff --git a/tiknib/idascript.py b/tiknib/idascript.py index 3ac1f15..0911910 100644 --- a/tiknib/idascript.py +++ b/tiknib/idascript.py @@ -97,6 +97,10 @@ def run_helper(self, input_fname): else: ida = self.idapath + "/idal64" + # >= IDA Pro v7.4 use "idat" instead of "idal" + if not os.path.exists(ida): + ida = ida.replace('idal', 'idat') + # Setup command line arguments path = [ida, "-A", "-S{}".format(idc_args)] if self.log or self.stdout: