Initial import.

joxeankoret · Dec 30, 2009 · 442e620 · 442e620
commit 442e620
Show file tree

Hide file tree

Showing 8 changed files with 1,606 additions and 0 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1 @@
+Joxean Koret <[email protected]>
diff --git a/COPYING b/COPYING
diff --git a/LICENSE b/LICENSE
diff --git a/PROBLEMS b/PROBLEMS
@@ -0,0 +1,15 @@
+Known Problems
+==============
+
+Right know, the unique problem I found is with the generated signature's output
+and with the block size. That's: For small files you should use a small block
+size and signature's size, for medium size files the defaults are OK and, for 
+"somewhat" big files (more than 5MB) you should change the signature's output 
+size to a bigger value (in example, 64 bits).
+
+It's planned to do, in the future, the calculation automatically but, as it's 
+more flexible as is. Well, I need to think about...
+
+--
+Joxean Koret
+
diff --git a/README b/README
@@ -0,0 +1,15 @@
+What is deeptoad?
+=================
+
+"Deeptoad" is a (python) library and a tool to clusterize similar files using fuzzy
+hashing techniques. This project is inspired by the well known tool ssdeep.
+
+Links
+=====
+
+Project Page:
+http://code.google.com/p/deeptoad
+
+Wiki:
+http://code.google.com/p/deeptoad/w/list
+
diff --git a/deeptoad.py b/deeptoad.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python
+
+"""
+This file is part of DeepToad
+Copyright (C) 2009, Joxean Koret
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+"""
+
+import os
+import sys
+
+from kfuzzy import CKoretFuzzyHashing
+
+MAX_EDIT_DISTANCE = 16
+PROGRAM = "DeepToad"
+VERSION = "1.0"
+
+class CDeepToad:
+    groups = {}
+    ingroups = {}
+    kfd = None
+    extensions = []
+    ignore_extensions = []
+    edit_distance = MAX_EDIT_DISTANCE
+    maximum = 0
+    aggresive = True
+    just_print = False
+    just_compare = False
+
+    def __init__(self):
+        self.kfd = CKoretFuzzyHashing()
+        self.kfd.bsize = 512
+        self.kfd.output_size = 32
+        self.kfd.ignore_range = 2
+
+    def cluster(self, hashes, filename):
+        if self.just_print or self.just_compare:
+            self.groups[filename] = hashes
+            return
+
+        for hash in hashes:
+            hashed = False
+            for key in self.groups:
+                # Check for maximum edit distance
+                if self.kfd.edit_distance(key, hash) <= self.edit_distance:
+                    self.groups[key].append(filename)
+                    hashed = True
+                    break
+
+            if hashed:
+                continue
+
+            if not self.groups.has_key(hash):
+                self.groups[hash] = []
+
+            self.groups[hash].append(filename)
+
+    def hashFile(self, filename):
+        try:
+            s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
+            self.cluster((s1, s2, s3), filename)
+        except KeyboardInterrupt:
+            raise
+        except:
+            sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
+            sys.stderr.flush()
+
+    def clusterDirectory(self, path, output_dir):
+        last_size = 0
+        total = 0
+        for root, dirs, files in os.walk(path):
+            for name in files:
+                if self.maximum != 0 and total >= self.maximum:
+                    break
+                total += 1
+
+                basename, extension = os.path.splitext(name)
+                if extension in self.ignore_extensions:
+                    continue
+                elif len(self.extensions) != 0:
+                    if extension not in self.extensions:
+                        continue
+
+                sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
+                sys.stderr.flush()
+                sys.stderr.write("Processing file %s ..." % os.path.join(root, name))
+                last_size = len("Processing file %s ..." % os.path.join(root, name))
+                sys.stderr.flush()
+                self.hashFile(os.path.join(root, name))
+
+            if self.maximum != 0 and total >= self.maximum:
+                break
+
+        if total > 0:
+            sys.stderr.write("\n")
+            sys.stderr.flush()
+
+    def sortByCount(self):
+        # First, sort by count of elements
+        newGrp = {}
+        for x in self.groups:
+            if x == "":
+                continue
+
+            newGrp[x] = len(self.groups[x])
+
+        # Now sort the dict by values
+        outgrp = {}
+        alist = sorted(newGrp.iteritems(), key=lambda (k,v): (v,k), reverse=True)
+        dones = []
+
+        # Create the new dict with only non empty groups
+        for x in alist:
+            val = x[0]
+            outgrp[val] = []
+            for element in self.groups[val]:
+                if element not in dones:
+                    outgrp[val].append(element)
+                    dones.append(element)
+
+            if len(outgrp[val]) == 0:
+                del outgrp[val]
+
+        return outgrp
+
+    def printHashes(self):
+        print "Signature;Simple Signature;Reverse Signature;Filename"
+        for x in self.groups:
+            hashes = self.groups[x]
+            print "%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], x)
+
+    def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
+        finished = False
+
+        for hx in hashesx:
+            if hx == "":
+                continue
+            elif finished:
+                break
+
+            if hx not in hashesy:
+                continue
+
+            for hy in hashesy:
+                if hy == "":
+                    continue
+
+                dis = self.kfd.edit_distance(hx, hy)
+
+                if dis == 0 or dis < len(hx)/3:
+                    if dis != 0:
+                        percent = len(hx)*100.00/dis
+                    else:
+                        percent = 100
+                    print "File '%s' matches '%s' (%0.2f%%)" % (x, y, percent)
+                    dones[y] = x
+                    finished = True
+                    break
+
+        return dones
+
+    def compareHashes(self):
+        dones = {}
+
+        for x in self.groups:
+            for y in self.groups:
+                if x == y:
+                    continue
+                elif dones.has_key(x):
+                    if dones[x] == y:
+                        #print "Ignored"
+                        continue
+
+                hashesx = self.groups[x]
+                hashesy = self.groups[y]
+
+                dones = self.compareAndReportHashes(x, y, hashesx, hashesy, dones)
+
+    def printReport(self):
+        if self.just_print:
+            self.printHashes()
+            return
+
+        if self.just_compare:
+            self.compareHashes()
+            return
+
+        grp = self.sortByCount()
+        already = []
+        for x in grp:
+            for element in self.groups[x]:
+                if element not in already:
+                    print "%s;%s" % (x, element)
+                    already.append(element)
+
+def main(args):
+    output_dir = None
+    kfdCluster = CDeepToad()
+
+    for arg in args:
+        if os.path.exists(arg):
+            if os.path.isdir(arg):
+                try:
+                    kfdCluster.groups = {}
+                    kfdCluster.clusterDirectory(arg, output_dir)
+                except KeyboardInterrupt:
+                    print "Aborted..."
+
+                kfdCluster.printReport()
+            else:
+                try:
+                    ret = kfdCluster.kfd.hash_file(arg)
+                    print "%s;%s" % (ret, arg)
+                except KeyboardInterrupt:
+                    print "Aborted..."
+        elif arg.startswith("-o"):
+            output_dir = arg[3:]
+        elif arg.startswith("-e="):
+            extensions = arg[3:].split(",")
+            kfdCluster.ignore_extensions = extensions
+        elif arg.startswith("-i="):
+            extensions = arg[3:].split(",")
+            kfdCluster.extensions = extensions
+        elif arg.startswith("-m="):
+            value = int(arg[3:])
+            kfdCluster.maximum = value
+        elif arg.startswith("-d="):
+            value = int(arg[3:])
+            kfdCluster.edit_distance = value
+        elif arg.startswith("-b="):
+            value = int(arg[3:])
+            kfdCluster.kfd.bsize = value
+        elif arg.startswith("-r="):
+            value = int(arg[3:])
+            kfdCluster.kfd.ignore_range = value
+        elif arg.startswith("-s="):
+            value = int(arg[3:])
+            kfdCluster.kfd.output_size = value
+        elif arg == "-c":
+            kfdCluster.just_compare = True
+        elif arg == "-f":
+            kfdCluster.kfd.algorithm = kfdCluster.kfd._fast_hash
+        elif arg == "-x":
+            kfdCluster.kfd.algorithm = kfdCluster.kfd._experimental_hash
+        elif arg == "-na":
+            kfdCluster.aggresive = False
+        elif arg == "-ag":
+            kfdCluster.aggresive = True
+        elif arg == "-nb":
+            kfdCluster.kfd.reduce_errors = True
+        elif arg == "-cb":
+            kfdCluster.kfd.reduce_errors = False
+        elif arg == "-simple":
+            kfdCluster.kfd.algorithm = kfdCluster.kfd.simplified
+        elif arg == "-p":
+            kfdCluster.just_print = True
+        elif arg == "-ida":
+            kfdCluster.ignore_extensions = [".idb", ".id0", ".id1", ".til", ".nam"]
+        elif arg.startswith("-echo"):
+            print arg[6:]
+            print "="*len(arg[6:])
+        else:
+            sys.stderr.write("Unknown argument '%s' ignored\n" % arg)
+
+def usage():
+    print "%s v%s, Copyright (c) 2009, 2010 Joxean Koret <[email protected]>" % (PROGRAM, VERSION)
+    print "Usage:", sys.argv[0], "[parameters] <directory>"
+    print
+    print "Common parameters:"
+    print "   -o=<directory>    Not yet implemented"
+    print "   -e=<extensions>   Exclude extensions (separated by comma)"
+    print "   -i=<extensions>   Clusterize only specified extensions (separated by comma)"
+    print "   -m=<value>        Clusterize a maximum of <value> file(s)"
+    print "   -d=<distance>     Specify the maximum edit distance (by default, 16 or 33%)"
+    print "   -ida              Ignore files created by IDA"
+    print "   -p                Just print the generated hashes"
+    print "   -c                Compare the files"
+    print "   -echo=<msg>       Print a message (usefull to generate reports)"
+    print
+    print "Advanced parameters:"
+    print "   -b=<block size>   Specify the block size (by default, 512)"
+    print "   -r=<ignore range> Specify the range of bytes to be ignored (by default, 2)"
+    print "   -s=<output size>  Specify the signature's size (by default, 32)"
+    print "   -f                Use faster (but weaker) algorithm"
+    print "   -x                Use eXperimental algorithm"
+    print "   -simple           Use the simplified algorithm"
+    print "   -na               Use non aggresive method (only applicable to default algorithm)"
+    print "   -ag               Use aggresive method (default)"
+    print "   -nb               Ignore null blocks (default)"
+    print "   -cb               Consider null blocks"
+    print
+    print "Example:"
+    print
+    print "Analyze a maximum of 25 files excluding zip and rar files:"
+    print "%s -e=.zip,.rar -m=25 /home/luser/samples" % sys.argv[0]
+    print
+
+if __name__ == "__main__":
+    if len(sys.argv) == 1:
+        usage()
+    else:
+        sys.exit(main(sys.argv[1:]))
+