From d0e693e8d044b886c0edafb91fcf7dd589972a45 Mon Sep 17 00:00:00 2001 From: merces Date: Mon, 20 Apr 2020 10:52:55 -0300 Subject: [PATCH] First public release --- .gitignore | 23 ++ LICENSE | 202 +++++++++++++++ MANIFEST.in | 7 + NOTICE | 5 + README.md | 138 ++++++++++ requirements.txt | 3 + setup.py | 67 +++++ telfhash/VERSION | 1 + telfhash/__init__.py | 8 + telfhash/__main__.py | 28 ++ telfhash/grouping.py | 225 ++++++++++++++++ telfhash/telfhash.py | 599 +++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 1306 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 NOTICE create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 telfhash/VERSION create mode 100644 telfhash/__init__.py create mode 100644 telfhash/__main__.py create mode 100644 telfhash/grouping.py create mode 100755 telfhash/telfhash.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..03c70c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +.DS_Store +.python-version +.vscode +.idea +__pycache__ +samples/ +.eggs + +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info + +# Python setup.py build artifacts +.eggs/ +build/ + +# our temp folder +tmp/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..3d7da38 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,7 @@ +include MANIFEST.in +include README.md +include LICENSE +include NOTICE +include telfhash/VERSION +global-exclude __pycache__ +global-exclude *.py[co] diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..7c5463c --- /dev/null +++ b/NOTICE @@ -0,0 +1,5 @@ +telfhash +Copyright 2020 Trend Micro + +This product includes software developed at +Trend Micro (https://www.trendmicro.com/). diff --git a/README.md b/README.md new file mode 100644 index 0000000..080f152 --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# Trend Micro ELF Hash (telfhash) + +telfhash is symbol hash for ELF files, just like imphash is imports hash for PE files. + +## Installation + +### Requirements + +telfhash uses TLSH in generating the hash. TLSH must be installed in your system in order for telfhash to work. 
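+
+Once the TLSH library and its Python binding are installed (see the link below), you can quickly confirm that the binding is importable with a one-liner like the following; it should print a TLSH digest rather than raise an ImportError. This is an illustrative check only and is not part of telfhash:
+
+    $ python -c "import tlsh; print(tlsh.forcehash(b'a,few,comma,separated,symbols'))"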
+ +You can install TLSH from here: + +* [https://github.com/trendmicro/tlsh/](https://github.com/trendmicro/tlsh/) + +The TLSH git repo has detailed instructions on how to compile and install the TLSH binaries and libraries. Don't forget to also install the TLSH Python library. telfhash uses the TLSH Python library to generate the actual hash. + +### Installing + +Clone the telfhash repository from here: + +* [https://github.com/trendmicro/telfhash](https://github.com/trendmicro/telfhash) + +Use the `setup.py` to install the telfhash library: + + python setup.py install + +## Usage + +If the TLSH Python library is not installed, telfhash will generate an exception error. + +### Command line + +If telfhash was installed via the `python setup.py install` command, the `telfhash` executable will by made available. + + $ telfhash -h + usage: telfhash.py [-h] [-g] [-t THRESHOLD] [-r] [-d] files [files ...] + + positional arguments: + files Target ELF file(s). Accepts wildcards + + optional arguments: + -h, --help show this help message and exit + -g, --group Group the files according to how close their telfhashes + are + -t THRESHOLD, --threshold THRESHOLD + Minimum distance betweeen telfhashes to be considered + as related. Only works with -g/--group. Defaults to 50 + -r, --recursive Deep dive into all the subfolders. Input should be a + folder + -d, --debug Print debug messages + + $ telfhash /bin/trace* + /bin/tracepath 09d097025c0b40af18cb0c08ac3f2f5df100d850483bc1404f108809113290a2d6ae4f + /bin/traceroute 65e02002d9b9552f56f35e709caf6fa57115f841e83b87148f04b592c023542ed0549f + /bin/traceroute6 65e02002d9b9552f56f35e709caf6fa57115f841e83b87148f04b592c023542ed0549f + + $ telfhash -g /sbin/ip* + /sbin/ip 33c15268ac66484d58be0e68ed2d7e68c25b5b97edf02b10dff4c412d2c3586725f01b + /sbin/ip6tables 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ip6tables-legacy 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ip6tables-legacy-restore 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ip6tables-legacy-save 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ip6tables-restore 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ip6tables-save 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/ipmaddr 7dc08c0a6622ad4b2af66e781c3322864248e073b06ccb56aaaf854088062091c6011c + /sbin/ipset e4a0029085e66bce4ed2146959136540409454e38028d780613002a6d70154d5023d6a + /sbin/iptables 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptables-apply - + /sbin/iptables-legacy 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptables-legacy-restore 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptables-legacy-save 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptables-restore 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptables-save 083169fc5722ee8734bfb9357cf23b41a5092db0b9a8d0a95d08d743464636ca143c66 + /sbin/iptstate 1ef02223f4318ca385920c9910f975a131268721a1dbb80dff038e758bad21e65718cf + /sbin/iptunnel d5c08c4aa612ad5b3ae72e781c3330868248e0b2b05c8b52aa2a854089062090c60518 + + Group 1: + /sbin/ipmaddr + /sbin/iptunnel + Group 2: + /sbin/ip6tables + /sbin/ip6tables-legacy + /sbin/ip6tables-legacy-restore + /sbin/ip6tables-legacy-save + /sbin/ip6tables-restore + /sbin/ip6tables-save + 
/sbin/iptables + /sbin/iptables-legacy + /sbin/iptables-legacy-restore + /sbin/iptables-legacy-save + /sbin/iptables-restore + /sbin/iptables-save + Cannot be grouped: + /sbin/iptstate + /sbin/ipset + /sbin/ip + +### Python module + + >>> import telfhash + >>> import pprint + >>> telfhash.telfhash("/bin/ping") + {'file': '/bin/ping', 'telfhash': '6901d303587a847f9aa30ce44c3f3f5c6101e9525eb2d354cf1297948022b40aa4a99f', 'msg': ''} + >>> + >>> results = telfhash.telfhash("telfhash/tests/samples/hdumps/*") + >>> groups = telfhash.group(results) + >>> pprint.pprint(groups) + {'grouped': (('telfhash/tests/samples/hdumps/hdump_32_so_stat_stripped', + 'telfhash/tests/samples/hdumps/hdump_32_stat_stripped'), + ('telfhash/tests/samples/hdumps/hdump_64_so_stat_stripped', + 'telfhash/tests/samples/hdumps/hdump_64_stat_stripped'), + ('telfhash/tests/samples/hdumps/hdump_32_so_stat', + 'telfhash/tests/samples/hdumps/hdump_32_stat', + 'telfhash/tests/samples/hdumps/hdump_64_so_stat', + 'telfhash/tests/samples/hdumps/hdump_64_stat', + 'telfhash/tests/samples/hdumps/hdump_static'), + ('telfhash/tests/samples/hdumps/hdump', + 'telfhash/tests/samples/hdumps/hdump32', + 'telfhash/tests/samples/hdumps/hdump_32_dyn', + 'telfhash/tests/samples/hdumps/hdump_32_dyn_stripped', + 'telfhash/tests/samples/hdumps/hdump_32_so_dyn', + 'telfhash/tests/samples/hdumps/hdump_32_so_dyn_stripped', + 'telfhash/tests/samples/hdumps/hdump_64_dyn', + 'telfhash/tests/samples/hdumps/hdump_64_dyn_stripped', + 'telfhash/tests/samples/hdumps/hdump_64_so_dyn', + 'telfhash/tests/samples/hdumps/hdump_64_so_dyn_stripped', + 'telfhash/tests/samples/hdumps/hdump_dynamic', + 'telfhash/tests/samples/hdumps/hdump_stripped')), + 'nogroup': []} + >>> + >>> telfhash.telfhash("samples/LinuxMoose/LinuxMoose.arm7.2015.0.bin") + {'file': 'samples/LinuxMoose/LinuxMoose.arm7.2015.0.bin', 'telfhash': None, 'msg': 'No symbols found'} + >>> telfhash.telfhash("/bin/ls", "/bin/lsattr") + [{'file': '/bin/ls', 'telfhash': '1ff0994248230af71762c8b15c0533da9a208b2656e5bf302f1985d04e2a5be779284f', 'msg': ''}, {'file': '/bin/lsattr', 'telfhash': '69c08017dd0fe4f35dd90d589c07380ae7dee06057b9d7400d3c46c1755058c5d5555d', 'msg': ''}] + +## Publications + +[Grouping Linux IoT Malware Samples With Trend Micro ELF Hash aka telfash](https://blog.trendmicro.com/trendlabs-security-intelligence/) - Trend Micro Blog, 2020 April 20th. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d8d5a5c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +capstone>=4.0.1 +nose>=1.3.7 +pyelftools>=0.25 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d9ce792 --- /dev/null +++ b/setup.py @@ -0,0 +1,67 @@ +''' +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+''' + +import os.path +from setuptools import setup + +# The directory containing this file +HERE = os.path.abspath(os.path.dirname(__file__)) + +with open(os.path.join(HERE, 'telfhash', 'VERSION')) as version_file: + VERSION = version_file.read().strip() + +def readme(): + with open(os.path.join(HERE, "README.md")) as f: + return f.read() + +def requires(): + requirements = [] + + with open(os.path.join(HERE, "requirements.txt")) as f: + for line in f: + if len(line) > 0: + requirements.append(line.strip()) + + return requirements + +setup( + name="telfhash", + version=VERSION, + description="Generates hash for ELF files", + long_description=readme(), + url="https://github.com/trendmicro/telfhash", + author="Fernando Merces, Joey Costoya", + license="Apache", + classifiers=[ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 2.7" + ], + keywords="telfhash elf linux hash symbols", + packages=["telfhash"], + include_package_data=True, + install_requires=requires(), + entry_points={ + "console_scripts": ["telfhash=telfhash.__main__:main"] + }, + test_suite="nose.collector", + tests_require=["nose"], + zip_safe=False +) diff --git a/telfhash/VERSION b/telfhash/VERSION new file mode 100644 index 0000000..85b7c69 --- /dev/null +++ b/telfhash/VERSION @@ -0,0 +1 @@ +0.9.6 diff --git a/telfhash/__init__.py b/telfhash/__init__.py new file mode 100644 index 0000000..803c083 --- /dev/null +++ b/telfhash/__init__.py @@ -0,0 +1,8 @@ +# __init__.py + +from .telfhash import telfhash +from .telfhash import group +from .telfhash import expand_filepath +from .telfhash import get_max_len +from .telfhash import print_hashes +from .telfhash import VERSION as __version__ \ No newline at end of file diff --git a/telfhash/__main__.py b/telfhash/__main__.py new file mode 100644 index 0000000..167b6af --- /dev/null +++ b/telfhash/__main__.py @@ -0,0 +1,28 @@ +''' +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +''' + +import sys + +from .telfhash import _main + +def main(): + _main() + +if __name__ == "__main__": + sys.exit(main()) diff --git a/telfhash/grouping.py b/telfhash/grouping.py new file mode 100644 index 0000000..2dbc568 --- /dev/null +++ b/telfhash/grouping.py @@ -0,0 +1,225 @@ +''' +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +''' + +# make it Python 2 compatible +from __future__ import print_function + +import sys +import itertools +import functools +import operator + +# https://github.com/trendmicro/tlsh +import tlsh + + +def get_combination(telfhash_data): + + # + # TLSH hash is 70 characters long. if the telfhash is not 70 + # characters in length, exclude from the list + # + files_list = [x for x in list(telfhash_data.keys()) if telfhash_data[x]["telfhash"] is not None and len(telfhash_data[x]["telfhash"]) == 70] + + # + # get the combination of all the possible pairs of filenames + # we use itertools.combinations_with_replacement. this function + # returns the combinations, but will treat the combination + # (A,B) and (B,A) as equivalent + # + # the following list comprehension is to weed out the + # combination pair of the same element, like (A,A) + # + ll = list(itertools.combinations_with_replacement(files_list, 2)) + files_combi = [x for x in ll if x[0] != x[1]] + + return files_combi + + +def get_distances(telfhash_data, files_combination): + """Get the distance between each telfhash TLSH values + + Input: + telfhash_data - dictionary of telfhash data with the keys being the + filename + files_combination - a list of list. each component list contains + two files + """ + + distances = [] + + for element in files_combination: + file1 = element[0] + file2 = element[1] + distance = tlsh.diff(telfhash_data[file1]["telfhash"], telfhash_data[file2]["telfhash"]) + + distances.append((file1, file2, distance)) + + return distances + + +def condense(groups): + """Condense the output list. some groups that appear are subset of + another group. for example, the grouping output can be: + + ( ('A', 'B'), + ('A', 'B', 'C'), + ('A', 'B', 'C', 'D', 'E'), + ('C', 'D', 'E', 'F'), + ('G', 'H', 'I')) + + the condense() function will condense the above output to the + following: + + ( ('G', 'H', 'I'), + ('C', 'D', 'E', 'F'), + ('A', 'B', 'C', 'D', 'E')) + """ + group_set = [set(x) for x in groups] + group_sorted = sorted(group_set, key=lambda x: len(x)) + + condensed = [] + + for i in range(len(group_sorted)): + + item = group_sorted[i] + rest = group_sorted[i+1:] + + if len(rest) == 0: + rest = group_sorted[:i] + + subset_check = [item.issubset(x) for x in rest] + if not functools.reduce(operator.or_, subset_check, False): + condensed.append(tuple(sorted(list(item)))) + + return tuple(condensed) + + +def group_distances(distances, threshold=50): + """ + Group files that are similar to each other according to their + TLSH distance + + this works on the principle that when + A is related to B; and + B is related to C; therefore + A is related to C + + Inputs: + distances: list of tuples. each tuple is composed of + (fileA, fileB, TLSH_diff) + + threshold: maximum TLSH distance for two telfhashes to be + considered as related. 
defaults to 50 + """ + + groups = set() + + for round1 in distances: + A, B, dist1 = round1 + + if dist1 <= threshold: + group = set() + group.update((A, B)) + + for round2 in distances: + if round1 == round2: + continue + + C, D, dist2 = round2 + if dist2 > threshold: + continue + + if (A in round2) or (B in round2): + group.update((C, D)) + + if len(group) > 0: + groups.add(tuple(sorted(tuple(group)))) + + # + # condense the output list. some groups that appear are subset of + # another group. for example, the grouping output can be + # + # ( ('A', 'B'), + # ('A', 'B', 'C'), + # ('A', 'B', 'C', 'D', 'E'), + # ('C', 'D', 'E', 'F'), + # ('G', 'H', 'I')) + # + # the condense() function will condense the above output to the + # following: + # + # ( ('G', 'H', 'I'), + # ('C', 'D', 'E', 'F'), + # ('A', 'B', 'C', 'D', 'E')) + # + condensed = condense(groups) + + # get the list of files that do not belong to any group + files_list = [x[0] for x in distances] + files_list += distances[-1:][0][:2] + grouped_list = set([x for y in condensed for x in y]) + nogroup = [x for x in set(files_list) if x not in grouped_list] + + results = {} + results["grouped"] = condensed + results["nogroup"] = nogroup + + return results + + +def group(results, threshold=50): + """Group the files according to how close their telfhashes are + + Input: + results: a list of dicts containing the filename and their telfhash + """ + + # + # `results` is a list of dicts. we now make a dictionary + # with the key being the filename. + # + telfhash_data = {x["file"]:x for x in results} + + # + # get all the possible file combinations, using the these conditions: + # - removing duplicates + # - these file combinations are equal: (A, B), (B, A) + # + files_combi = get_combination(telfhash_data) + + # + # get the distances between all the file combinations + # + distances = get_distances(telfhash_data, files_combi) + + # + # group according to distance. default distance threshold to be considered + # in the same group is 50 + # + groups = group_distances(distances, threshold) + + return groups + + +def main(): + return 0 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/telfhash/telfhash.py b/telfhash/telfhash.py new file mode 100755 index 0000000..65cfcb4 --- /dev/null +++ b/telfhash/telfhash.py @@ -0,0 +1,599 @@ +#!/usr/bin/env python + +''' +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+''' + +# make it Python 2 compatible +from __future__ import print_function + +import argparse +import os +import sys +import hashlib +import re +import functools +import operator +import glob +import json +import tlsh # https://github.com/trendmicro/tlsh +import elftools +from elftools.elf.elffile import ELFFile +from capstone import * + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +import grouping + +# The directory containing this file +HERE = os.path.abspath(os.path.dirname(__file__)) + +with open(os.path.join(HERE, 'VERSION')) as version_file: + VERSION = version_file.read().strip() + +EXCLUSIONS_REGEX = [ + r"^[_\.].*$", # Function names starting with . or _ + "^.*64$", # x64-64 specific functions + "^str.*$", # gcc significantly changes string functions depending on the target architecture, so we ignore them + "^mem.*$" # gcc significantly changes string functions depending on the target architecture, so we ignore them +] + +EXCLUSIONS_STRINGS = [ + "__libc_start_main", # main function + "main", # main function + "abort", # ARM default + "cachectl", # MIPS default + "cacheflush", # MIPS default + "puts", # Compiler optimization (function replacement) + "atol", # Compiler optimization (function replacement) + "malloc_trim" # GNU extensions +] + + +# the op code and the mnemonic used for call functions to an absolute address +# across various architectures +CALL_LIST = { + "x86": { + "cs_arch": CS_ARCH_X86, + "cs_mode": CS_MODE_32 + }, + "x64": { + "cs_arch": CS_ARCH_X86, + "cs_mode": CS_MODE_64 + }, + "ARM": { + "cs_arch": CS_ARCH_ARM, + "cs_mode": CS_MODE_ARM + }, + "MIPS": { + "cs_arch": CS_ARCH_MIPS, + "cs_mode": CS_MODE_MIPS32 | CS_MODE_BIG_ENDIAN + } +} + + +def perror(msg): + print(msg, file=sys.stderr) + + +def build_exclude_list(): + EXCLUDE_LIST = {} + EXCLUDE_LIST['simple'] = [] + EXCLUDE_LIST['regex'] = [] + + excludes = {} + excludes['simple'] = [] + excludes['regex'] = [] + + for exclude_string in EXCLUSIONS_STRINGS: + EXCLUDE_LIST['simple'].append(exclude_string) + + for exclude_re in EXCLUSIONS_REGEX: + try: + EXCLUDE_LIST['regex'].append(re.compile(exclude_re)) + except Exception as e: + perror("Skipping '{}': {}".format(exclude_re, e.msg)) + + return EXCLUDE_LIST + + +def can_exclude(symbol, exclude_list): + + if symbol in exclude_list['simple']: + return True + + # use a list comprehension to generate an array of booleans if they match the list of supplied regexes + # we then use functools.reduce() and operator.or_() to determine if at least one of the regex matched the + # symbol we're searching + re_matches = [True if x.search(symbol) else False for x in exclude_list['regex']] + if functools.reduce(operator.or_, re_matches, False): + return True + + return False + + +def get_hash(symbols_list): + + symbol_string = ",".join(symbols_list) + encoded_symbol_string = symbol_string.encode('ascii') + + return tlsh.forcehash(encoded_symbol_string).lower() + + +def elf_get_imagebase(elf): + i=0 + while elf.iter_segments(): + if (elf._get_segment_header(i)['p_type'] == 'PT_LOAD'): + return elf._get_segment_header(i)['p_vaddr'] + i+=1 + + return 0 + + +def elf_is_static_stripped(elf): + # If either PT_INTERP segment or .interp section is present, the executable is dynamic + for s in elf.iter_segments(): + if (s['p_type'] == 'PT_INTERP'): + return False + + # If .symtab is present, symbols were NOT stripped + for s in elf.iter_sections(): + if (s['sh_type'] == 'SHT_SYMTAB'): + return False + + return True + + +def get_ep_section_or_segment(elf): + """Get the 
code section/segment where the entry point is located + """ + + # get the entry point + ep = elf.header.e_entry + + # enumerate all the sections. the code section is where the entry point + # falls in between the start and end address of the section + for section in elf.iter_sections(): + start_offset = section.header.sh_addr + end_offset = start_offset + section.header.sh_size - 1 + + if (ep >= start_offset) and (ep <= end_offset): + return section + + # if we reached this point, then we failed to get the code section using + # the above method. we use the default '.text' section + code_section_or_segment = elf.get_section_by_name('.text') + + if code_section_or_segment: + return code_section_or_segment + + for segment in elf.iter_segments(): + if segment['p_type'] == "PT_LOAD" and segment['p_flags'] == 5: # r-x segment + return segment + + return code_section_or_segment + + +def extract_call_destinations(elf): + symbols_list = [] + + # get the code section or segment (if there's no section) + code_section_or_segment = get_ep_section_or_segment(elf) + + # if we only got the segment, start extracting calls from the EP + if type(code_section_or_segment) == elftools.elf.segments.Segment: + ofs = elf.header.e_entry + code_data = code_section_or_segment.data()[ofs - code_section_or_segment['p_vaddr']:] + # otherwise we use the code section + else: + ofs = elf_get_imagebase(elf) + code_section_or_segment['sh_offset'] + code_data = code_section_or_segment.data() + + # get the architecture of our ELF file. + # the disassembly and the call opcode and mnemonic will be based on the + # determined architecture, as defined by the CALL_LIST dict above + arch = elf.get_machine_arch() + + # in case we have not specified the opcode, mnemonic, and the + # capstone arch and mode, skip + if arch not in CALL_LIST: + return [] + + # TODO: automatically identify the architecture the binary was compiled to + md = Cs(CALL_LIST[arch]["cs_arch"], CALL_LIST[arch]["cs_mode"]) + + if code_section_or_segment is not None: + # TODO: handle UPX-packed binaries as they have no sections so we should go straight to segment offset + for i in md.disasm(code_data, ofs): + if arch in ("x86", "x64") and i.mnemonic == "call": + # Consider only call to absolute addresses + if i.op_str.startswith('0x'): + address = i.op_str[2:] # cut off '0x' prefix + if not address in symbols_list: + symbols_list.append(address) + + elif arch == "ARM" and i.mnemonic.startswith("bl"): + if i.op_str.startswith('#0x'): + address = i.op_str[3:] + if not address in symbols_list: + symbols_list.append(address) + + elif arch == "MIPS" and i.mnemonic == "lw": + if i.op_str.startswith("$t9, "): + address = i.op_str[8:-5] + if not address in symbols_list: + symbols_list.append(address) + + return symbols_list + + +def extract_symbols(filepath, **kwargs): + """Returns a list of symbols read from the ELF file, excluding those + symbols found in our exclusion list + """ + + debug = False + + if "debug" in kwargs and kwargs["debug"] is True: + debug = True + + if "exclude_list" not in kwargs: + exclude_list = build_exclude_list() + else: + exclude_list = kwargs["exclude_list"] + + fh = open(filepath, 'rb') + + try: + elf = ELFFile(fh) + except: + if not fh.closed: + fh.close() + raise + + # Types: 'SHT_SYMTAB', 'SHT_DYNSYM', 'SHT_SUNW_LDYNSYM' + + if debug: + print(elf['e_ident']['EI_CLASS']) + + symtab='' + for s in elf.iter_sections(): + if (s['sh_size'] <= 0): + continue + + if (s['sh_type'] == 'SHT_DYNSYM'): + symtab = s + break # dynamic symbol table has 
higher priority + + elif (s['sh_type'] == 'SHT_SYMTAB'): + symtab = s + break + + if (not symtab): + call_destinations = extract_call_destinations(elf) + fh.close() + + if debug: + print("Statically compiled") + print("{} call addresses considered".format(len(call_destinations))) + + return call_destinations + + if debug: + print('{} symbols found'.format(symtab.num_symbols())) + + symbols_list = [] + i=0 + for sym in symtab.iter_symbols(): + sym_type = sym.entry['st_info']['type'] + sym_bind = sym.entry['st_info']['bind'] + sym_visibility = sym.entry['st_other']['visibility'] + + if (sym_type != 'STT_FUNC' or + sym_bind != 'STB_GLOBAL' or + sym_visibility != 'STV_DEFAULT' or + len(sym.name) <= 0): + continue + + # Function name exceptions + if can_exclude(sym.name, exclude_list): + continue + + i += 1 + symbols_list.append(sym.name.lower()) # lowercase + + # sort the symbol list + symbols_list.sort() + + # creates the symbol string + syms = ",".join(symbols_list) + + if debug: + print("{} symbols considered:\n{}".format(i, syms)) + + fh.close() + return symbols_list + + +def fopen(fname): + try: + fh = open(fname, 'rb') + except: + perror('{}: could not open file for reading'.format(fname)) + return fh + + +def expand_filepath(input_filepath, recursive=False): + """get the list of files, expanding on wildcards if necessary + (using glob.glob)""" + + files_list = [] + + if recursive is True: + for i in os.walk(input_filepath): + for j in i[2]: + files_list.append("{}".format(os.path.join(i[0], j))) + + else: + for filepath in glob.glob(input_filepath): + if os.path.isfile(filepath): + files_list.append(filepath) + + return files_list + + +def get_max_len(files_list): + """Get the length of the file with the longest filename""" + ret = 0 + + if len(files_list) > 0: + filename_lengths = [len(x) for x in files_list] + max_len = max(filename_lengths) + ret = max_len + + return ret + + +def get_args(): + parser = argparse.ArgumentParser(prog="telfhash") + parser.add_argument('-g', '--group', help='Group the files according to how close their telfhashes are', action='store_true') + parser.add_argument('-t', '--threshold', default="50", help='Minimum distance between telfhashes to be considered as related. Only works with -g/--group. Defaults to 50') + parser.add_argument('-r', '--recursive', default=False, help='Deep dive into all the subfolders. Input should be a folder', action='store_true') + parser.add_argument('-o', '--output', default=None, help='Output file') + parser.add_argument('-f', '--format', default=None, help='Log output format. Accepts tsv or json. If -o/--output is not specified, formatted output is printed on stdout') + parser.add_argument('-d', '--debug', help='Print debug messages', action='store_true') + parser.add_argument('-v', '--version', help='Print version', action='version', version="%(prog)s {}".format(VERSION)) + parser.add_argument('files', help='Target ELF file(s). Accepts wildcards', default=[], nargs='+') + args = parser.parse_args() + + # after parsing, args.files is a list + args.files_list = [] + for f in args.files: + args.files_list += expand_filepath(f, args.recursive) + + # get the length of the longest filename. this is helpful later when + # printing the telfhashes in STDOUT, where we'll use the `max_len` to + # vertically align the telfhash column + args.max_len = get_max_len(args.files_list) + + if args.threshold.isdigit(): + args.threshold = int(args.threshold) + else: + perror("'{}' is an invalid value for threshold. 
defaulting the threshold to 50.\n".format(args.threshold)) + args.threshold = 50 + + # convert our args into a dictionary + params = args.__dict__ + + return params + + +def telfhash_single(filepath, **kwargs): + result = {} + result["file"] = filepath + result["telfhash"] = '-' + result["msg"] = "" + + debug = False + + if "debug" in kwargs and kwargs["debug"] is True: + debug = True + + if "exclude_list" not in kwargs: + exclude_list = build_exclude_list() + else: + exclude_list = kwargs["exclude_list"] + + try: + symbols_list = extract_symbols(filepath, debug=debug, exclude_list=exclude_list) + except FileNotFoundError as e: + symbols_list = None + result["msg"] = e.strerror + except elftools.common.exceptions.ELFError: + symbols_list = None + result["msg"] = "Could not parse file as ELF" + except: + symbols_list = None + result["msg"] = "Unknown error" + + if symbols_list is not None: + if len(symbols_list) > 0: + h = get_hash(symbols_list) + + # if the hash of our symbols generated a blank string + if len(h) == 0: + h = '-' + + result["telfhash"] = h + else: + result["msg"] = "No symbols found" + else: + if len(result["msg"]) == 0: + result["msg"] = "No symbols found" + + return result + + +def telfhash(*paths, **kwargs): + """Get the telfhash of specified files. Accepts wildcards + + Args: + paths One or more file paths to get telfhashes on. Accepts + wildcards. Module uses glob.glob for file expansion + + recursive [Optional] Boolean. Recursively find files to get the + telfhash. Defaults to False + + debug [Optional] Boolean. Display debug messages. Defaults to + False + + Returns: + A list of dicts, each dict contains the telfhash data of each file. + """ + + # default values + results = [] + recursive = False + debug = False + files_list = [] + + if len(paths) == 0: + return results + + if "recursive" in kwargs and kwargs["recursive"] is True: + recursive = True + + if "debug" in kwargs and kwargs["debug"] is True: + debug = True + + exclude_list = build_exclude_list() + + for path in paths: + files_list += expand_filepath(path, recursive=recursive) + + for f in files_list: + result = telfhash_single(f, debug=debug, exclude_list=exclude_list) + results.append(result) + + return results + + +def group(telfhash_results, threshold=50): + """Group the files according to the TLSH distances between the telfhashes + of the files + + Args: + telfhash_results: The output of the telfhash.telfhash function call. 
List + of telfhash data of the files + threshold: [Optional] The minimum TLSH distance between telfhashes + for the files to be considered as related + + Returns: + Tuple of tuples, each member tuple is one group + """ + groups = grouping.group(telfhash_results, threshold=threshold) + + return groups + + +def output_format_tsv(args, results): + if args['output'] is None: + # output to stdout + for result in results: + print("{}\t{}".format(result['file'], result['telfhash'])) + + else: + with open(args['output'], 'w') as fh: + for result in results: + fh.write("{}\t{}\n".format(result['file'], result['telfhash'])) + + +def output_format_json(args, results): + json_output = json.dumps(results) + + if args['output'] is None: + # output to stdout + print("{}".format(json_output)) + + else: + with open(args['output'], 'w') as fh: + fh.write("{}\n".format(json_output)) + + +def print_hashes(args): + + exclude_list = build_exclude_list() + + results = [] + for filepath in args["files_list"]: + result = telfhash_single(filepath, debug=args["debug"], exclude_list=exclude_list) + results.append(result) + + # the fancy formatting is done so that we could properly vertically + # align the telfhashes in the second column. we're using the `max_len` + # value computed before in the get_args() function + # + # data is printed as soon as the data is obtained so that the user sees + # data right away, and it makes the console more active. + # only go this path if args['output']=None and args['format']=None + if args['output'] is None and args['format'] is None: + if result["telfhash"] is not None: + print("{:<{max_len}} {}".format(result["file"], result["telfhash"], max_len=args["max_len"])) + else: + print('{:<{max_len}} {msg}'.format(filepath, max_len=args["max_len"], msg=result["msg"])) + + if args['format'] == 'tsv': + output_format_tsv(args, results) + + elif args['format'] == 'json': + output_format_json(args, results) + + if args['group'] and len(results) > 1: + groups = grouping.group(results, threshold=args['threshold']) + + print() + for i in range(len(groups["grouped"])): + print("Group {}:".format(i+1)) + for f in groups["grouped"][i]: + print(" {}".format(f)) + + if len(groups["nogroup"]) > 0: + print("Ungrouped:") + for f in groups["nogroup"]: + print(" {}".format(f)) + + + print() + + +def _main(): + + args = get_args() + + if len(args["files_list"]) == 0: + perror("No files found") + return 1 + + print_hashes(args) + + +def main(): + return _main() + + +if __name__ == "__main__": + sys.exit(main())