Skip to content
This repository has been archived by the owner on Apr 19, 2022. It is now read-only.

Commit

Permalink
Initial import.
Browse files Browse the repository at this point in the history
  • Loading branch information
joxean committed Dec 30, 2009
0 parents commit 442e620
Show file tree
Hide file tree
Showing 8 changed files with 1,606 additions and 0 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Joxean Koret <[email protected]>
504 changes: 504 additions & 0 deletions COPYING

Large diffs are not rendered by default.

504 changes: 504 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions PROBLEMS
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Known Problems
==============

Right know, the unique problem I found is with the generated signature's output
and with the block size. That's: For small files you should use a small block
size and signature's size, for medium size files the defaults are OK and, for
"somewhat" big files (more than 5MB) you should change the signature's output
size to a bigger value (in example, 64 bits).

It's planned to do, in the future, the calculation automatically but, as it's
more flexible as is. Well, I need to think about...

--
Joxean Koret

15 changes: 15 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
What is deeptoad?
=================

"Deeptoad" is a (python) library and a tool to clusterize similar files using fuzzy
hashing techniques. This project is inspired by the well known tool ssdeep.

Links
=====

Project Page:
http://code.google.com/p/deeptoad

Wiki:
http://code.google.com/p/deeptoad/w/list

316 changes: 316 additions & 0 deletions deeptoad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
#!/usr/bin/env python

"""
This file is part of DeepToad
Copyright (C) 2009, Joxean Koret
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""

import os
import sys

from kfuzzy import CKoretFuzzyHashing

MAX_EDIT_DISTANCE = 16
PROGRAM = "DeepToad"
VERSION = "1.0"

class CDeepToad:
groups = {}
ingroups = {}
kfd = None
extensions = []
ignore_extensions = []
edit_distance = MAX_EDIT_DISTANCE
maximum = 0
aggresive = True
just_print = False
just_compare = False

def __init__(self):
self.kfd = CKoretFuzzyHashing()
self.kfd.bsize = 512
self.kfd.output_size = 32
self.kfd.ignore_range = 2

def cluster(self, hashes, filename):
if self.just_print or self.just_compare:
self.groups[filename] = hashes
return

for hash in hashes:
hashed = False
for key in self.groups:
# Check for maximum edit distance
if self.kfd.edit_distance(key, hash) <= self.edit_distance:
self.groups[key].append(filename)
hashed = True
break

if hashed:
continue

if not self.groups.has_key(hash):
self.groups[hash] = []

self.groups[hash].append(filename)

def hashFile(self, filename):
try:
s1, s2, s3 = self.kfd.hash_file(filename, self.aggresive).split(";")
self.cluster((s1, s2, s3), filename)
except KeyboardInterrupt:
raise
except:
sys.stderr.write(" -> %s\n" % str(sys.exc_info()[1]))
sys.stderr.flush()

def clusterDirectory(self, path, output_dir):
last_size = 0
total = 0
for root, dirs, files in os.walk(path):
for name in files:
if self.maximum != 0 and total >= self.maximum:
break
total += 1

basename, extension = os.path.splitext(name)
if extension in self.ignore_extensions:
continue
elif len(self.extensions) != 0:
if extension not in self.extensions:
continue

sys.stderr.write("\b"*last_size + " "*last_size + "\b"*last_size)
sys.stderr.flush()
sys.stderr.write("Processing file %s ..." % os.path.join(root, name))
last_size = len("Processing file %s ..." % os.path.join(root, name))
sys.stderr.flush()
self.hashFile(os.path.join(root, name))

if self.maximum != 0 and total >= self.maximum:
break

if total > 0:
sys.stderr.write("\n")
sys.stderr.flush()

def sortByCount(self):
# First, sort by count of elements
newGrp = {}
for x in self.groups:
if x == "":
continue

newGrp[x] = len(self.groups[x])

# Now sort the dict by values
outgrp = {}
alist = sorted(newGrp.iteritems(), key=lambda (k,v): (v,k), reverse=True)
dones = []

# Create the new dict with only non empty groups
for x in alist:
val = x[0]
outgrp[val] = []
for element in self.groups[val]:
if element not in dones:
outgrp[val].append(element)
dones.append(element)

if len(outgrp[val]) == 0:
del outgrp[val]

return outgrp

def printHashes(self):
print "Signature;Simple Signature;Reverse Signature;Filename"
for x in self.groups:
hashes = self.groups[x]
print "%s;%s;%s;%s" % (hashes[0], hashes[1], hashes[2], x)

def compareAndReportHashes(self, x, y, hashesx, hashesy, dones):
finished = False

for hx in hashesx:
if hx == "":
continue
elif finished:
break

if hx not in hashesy:
continue

for hy in hashesy:
if hy == "":
continue

dis = self.kfd.edit_distance(hx, hy)

if dis == 0 or dis < len(hx)/3:
if dis != 0:
percent = len(hx)*100.00/dis
else:
percent = 100
print "File '%s' matches '%s' (%0.2f%%)" % (x, y, percent)
dones[y] = x
finished = True
break

return dones

def compareHashes(self):
dones = {}

for x in self.groups:
for y in self.groups:
if x == y:
continue
elif dones.has_key(x):
if dones[x] == y:
#print "Ignored"
continue

hashesx = self.groups[x]
hashesy = self.groups[y]

dones = self.compareAndReportHashes(x, y, hashesx, hashesy, dones)

def printReport(self):
if self.just_print:
self.printHashes()
return

if self.just_compare:
self.compareHashes()
return

grp = self.sortByCount()
already = []
for x in grp:
for element in self.groups[x]:
if element not in already:
print "%s;%s" % (x, element)
already.append(element)

def main(args):
output_dir = None
kfdCluster = CDeepToad()

for arg in args:
if os.path.exists(arg):
if os.path.isdir(arg):
try:
kfdCluster.groups = {}
kfdCluster.clusterDirectory(arg, output_dir)
except KeyboardInterrupt:
print "Aborted..."

kfdCluster.printReport()
else:
try:
ret = kfdCluster.kfd.hash_file(arg)
print "%s;%s" % (ret, arg)
except KeyboardInterrupt:
print "Aborted..."
elif arg.startswith("-o"):
output_dir = arg[3:]
elif arg.startswith("-e="):
extensions = arg[3:].split(",")
kfdCluster.ignore_extensions = extensions
elif arg.startswith("-i="):
extensions = arg[3:].split(",")
kfdCluster.extensions = extensions
elif arg.startswith("-m="):
value = int(arg[3:])
kfdCluster.maximum = value
elif arg.startswith("-d="):
value = int(arg[3:])
kfdCluster.edit_distance = value
elif arg.startswith("-b="):
value = int(arg[3:])
kfdCluster.kfd.bsize = value
elif arg.startswith("-r="):
value = int(arg[3:])
kfdCluster.kfd.ignore_range = value
elif arg.startswith("-s="):
value = int(arg[3:])
kfdCluster.kfd.output_size = value
elif arg == "-c":
kfdCluster.just_compare = True
elif arg == "-f":
kfdCluster.kfd.algorithm = kfdCluster.kfd._fast_hash
elif arg == "-x":
kfdCluster.kfd.algorithm = kfdCluster.kfd._experimental_hash
elif arg == "-na":
kfdCluster.aggresive = False
elif arg == "-ag":
kfdCluster.aggresive = True
elif arg == "-nb":
kfdCluster.kfd.reduce_errors = True
elif arg == "-cb":
kfdCluster.kfd.reduce_errors = False
elif arg == "-simple":
kfdCluster.kfd.algorithm = kfdCluster.kfd.simplified
elif arg == "-p":
kfdCluster.just_print = True
elif arg == "-ida":
kfdCluster.ignore_extensions = [".idb", ".id0", ".id1", ".til", ".nam"]
elif arg.startswith("-echo"):
print arg[6:]
print "="*len(arg[6:])
else:
sys.stderr.write("Unknown argument '%s' ignored\n" % arg)

def usage():
print "%s v%s, Copyright (c) 2009, 2010 Joxean Koret <[email protected]>" % (PROGRAM, VERSION)
print "Usage:", sys.argv[0], "[parameters] <directory>"
print
print "Common parameters:"
print " -o=<directory> Not yet implemented"
print " -e=<extensions> Exclude extensions (separated by comma)"
print " -i=<extensions> Clusterize only specified extensions (separated by comma)"
print " -m=<value> Clusterize a maximum of <value> file(s)"
print " -d=<distance> Specify the maximum edit distance (by default, 16 or 33%)"
print " -ida Ignore files created by IDA"
print " -p Just print the generated hashes"
print " -c Compare the files"
print " -echo=<msg> Print a message (usefull to generate reports)"
print
print "Advanced parameters:"
print " -b=<block size> Specify the block size (by default, 512)"
print " -r=<ignore range> Specify the range of bytes to be ignored (by default, 2)"
print " -s=<output size> Specify the signature's size (by default, 32)"
print " -f Use faster (but weaker) algorithm"
print " -x Use eXperimental algorithm"
print " -simple Use the simplified algorithm"
print " -na Use non aggresive method (only applicable to default algorithm)"
print " -ag Use aggresive method (default)"
print " -nb Ignore null blocks (default)"
print " -cb Consider null blocks"
print
print "Example:"
print
print "Analyze a maximum of 25 files excluding zip and rar files:"
print "%s -e=.zip,.rar -m=25 /home/luser/samples" % sys.argv[0]
print

if __name__ == "__main__":
if len(sys.argv) == 1:
usage()
else:
sys.exit(main(sys.argv[1:]))

Loading

0 comments on commit 442e620

Please sign in to comment.