-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassifierWindows.py
executable file
·143 lines (116 loc) · 5.33 KB
/
classifierWindows.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
__author__ = 'AR'
import os, hashlib, json, sys, time, hashlib, subprocess as sp
from collections import Counter
from datetime import datetime as dt
# Usage Message
# Invocation: <script> -i SAMPLE -t TRAINING_DIR [START_INDEX]
# The command line is validated before any argument is read, so a short
# invocation prints the usage text instead of raising IndexError.
_USAGE = ("Usage: ./classifier -i [path to malware sample] "
          "-t [path to training set directory] [start index]")

if len(sys.argv) < 5:
    print(_USAGE)
    sys.exit()

# Optional sixth argument: test() uses it as the offset into the sorted sample
# listing, and main() prints and increments it per sample. Defaults to 0.
count = int(sys.argv[5]) if len(sys.argv) > 5 else 0

inSample = None
trainingSetPath = None
# Stop one short of the end so a trailing flag without a value cannot index
# past the end of argv.
for i, arg in enumerate(sys.argv[:-1]):
    if arg == '-i':
        inSample = sys.argv[i + 1]
    elif arg == '-t':
        trainingSetPath = sys.argv[i + 1]

# Both flags are required; without them later code would hit a NameError.
if inSample is None or trainingSetPath is None:
    print(_USAGE)
    sys.exit()
# Returns the call name of every traced line in a trace output file
def getUniqSysCalls(straceFile):
    """Return the text before the first '(' of each line that contains one.

    Lines without '(' are skipped. Despite the function's name, duplicates
    are kept and the file order is preserved.

    straceFile -- path of the trace file to read.
    """
    callNames = []
    with open(straceFile) as traceHandle:
        for entry in traceHandle:
            if '(' in entry:
                callNames.append(entry.partition('(')[0])
    return callNames
########## Execute the command as child process ##########
def executeChildProcess(command):
    """Run *command* under ``nttrace`` for about one second, logging stderr.

    The child's stderr is appended to a ``.trace`` file whose name is a
    four-character slice of the command text. The child is terminated after
    a one-second sleep; its stdout is piped but never read.

    command -- command line (string) to be traced.
    """
    # NOTE(review): destPath and sourcePath are not defined anywhere in this
    # file; they must be provided as module globals before this is called.
    modifiedCommand = 'nttrace ' + command
    traceName = str(command)[len(sourcePath) + 2: len(sourcePath) + 6]
    with open(destPath + traceName + '.trace', 'a') as f:
        # The module is imported as `sp`; the bare name `subprocess` is not
        # bound in this file and raised NameError here.
        # NOTE(review): the command is string-built and run with shell=True;
        # only pass trusted paths.
        ps = sp.Popen(modifiedCommand, stderr=f, stdout=sp.PIPE, shell=True)
        time.sleep(1)
        ps.terminate()
# Returns the distinct n-gram windows taken at every third position
def returnNGramFeature(sysCallsList, nGram):
    """Return the distinct length-*nGram* windows of *sysCallsList*.

    Windows start at offsets 0, 3, 6, ... strictly below
    ``len(sysCallsList) - nGram``, so consecutive windows overlap only when
    nGram > 3 and the window that would end exactly at the end of the list
    is not produced. Each distinct window appears once, in first-seen order.

    NOTE(review): the stride and the excluded final window are kept as-is;
    presumably the training hashes were built with the same windowing --
    confirm before changing either.

    sysCallsList -- sequence of hashable items (call-name strings here).
    nGram        -- window length.
    """
    returningSet = []
    # Track seen windows in a set so de-duplication is O(1) per window
    # instead of a linear scan of the result list.
    seen = set()
    for index in range(0, len(sysCallsList) - nGram, 3):
        window = sysCallsList[index: index + nGram]
        key = tuple(window)
        if key not in seen:
            seen.add(key)
            returningSet.append(window)
    return returningSet
def compair(unknownFeatureList, trainingFeatureList):
    """Return the fraction of unknown features whose hash is in the training set.

    Each feature is hashed on its own as sha256 of ``str(feature)`` and the
    hex digest is looked up in *trainingFeatureList*. When at least one
    feature matches, the matched features are appended as a JSON object
    (key 'unknowFeatures') to 'unknownFeature.json' inside the directory
    named by the module-level ``trainingSetPath``.

    NOTE(review): assumes the training lists hold per-feature sha256 hex
    digests of ``str(feature)`` -- confirm against the script that built them.

    unknownFeatureList  -- features (n-gram lists) of the sample under test.
    trainingFeatureList -- iterable of hex digest strings.

    Returns a float in (0, 1] when something matched, otherwise 0.
    """
    # Build the lookup set once; membership tests on the raw list are linear.
    knownHashes = set(trainingFeatureList)

    tempList = []
    for feature in unknownFeatureList:
        # A fresh hash object per feature: reusing one object made every
        # digest cover all previously seen features too, so only the first
        # feature could ever match. The text is ASCII (repr of a list), so
        # encoding it is a no-op on Python 2 and required on Python 3.
        featureHash = hashlib.sha256(str(feature).encode('utf-8')).hexdigest()
        if featureHash in knownHashes:
            tempList.append(feature)

    if not tempList:
        return 0

    tempDict = {'unknowFeatures': tempList}
    # os.path.join so the file lands inside the training directory whether or
    # not trainingSetPath ends with a separator.
    with open(os.path.join(trainingSetPath, 'unknownFeature.json'), 'a') as unknownFeatureFile:
        json.dump(tempDict, unknownFeatureFile)
    return float(len(tempList)) / len(unknownFeatureList)
def main(inSample, traceDir='C:\\Users\\user\\AppData\\Local\\mlmc\\',
         outputFile='Z:\\Desktop\\output7000.txt'):
    """Trace one sample with nttrace and score it against the training set.

    Steps: run ``nttrace <inSample>`` for about one second with stdout
    redirected to '<traceDir>/<sample base name>.trace'; build the 3- to
    7-gram feature lists from that trace; load the hash lists from the
    '*HashList*.json' files in the module-level ``trainingSetPath`` (the
    first character of the file name selects the n-gram size); average the
    five compair() ratios; append the percentage to *outputFile* and print
    it together with the module-level ``count``, which is then incremented.
    Finally any NtTrace.exe process is force-killed via taskkill.

    inSample   -- path of the sample to execute and classify.
    traceDir   -- directory receiving the trace file (default: the
                  previously hard-coded location).
    outputFile -- results file opened in append mode (default: the
                  previously hard-coded location).

    Raises ValueError when the training directory lacks a hash list for one
    of the n-gram sizes 3..7.
    """
    global trainingSetPath, count

    gramSizes = [3, 4, 5, 6, 7]

    # Generate nttrace file for given input sample
    sampleName = os.path.splitext(os.path.basename(str(inSample)))[0]
    tracePath = os.path.join(traceDir, sampleName + '.trace')
    cmd = 'nttrace ' + str(inSample)
    with open(tracePath, 'w') as traceFile:
        # NOTE(review): string-built command with shell=True; stderr is piped
        # but never read. Only trusted sample paths should be passed.
        ps = sp.Popen(cmd, stderr=sp.PIPE, stdout=traceFile, shell=True)
        time.sleep(1)
        ps.terminate()

    # Parse the trace once and derive every n-gram size from the same list
    # (it used to be re-read from disk for each size).
    sysCalls = getUniqSysCalls(tracePath)
    featureLists = [returnNGramFeature(sysCalls, n) for n in gramSizes]  # [[3gramList],[4gramList], [5gramList], ...]

    # Fetching the feature hashes from the training set
    gramByPrefix = dict((str(n), n) for n in gramSizes)
    hashListsByGram = {}
    trainingSetFileList = [filename for filename in os.listdir(trainingSetPath)
                           if filename.endswith('.json') and 'HashList' in filename]
    for filename in sorted(trainingSetFileList):
        with open(os.path.join(trainingSetPath, filename)) as data:
            dictionaryy = json.load(data)
        n = gramByPrefix.get(filename[0])
        if n is not None:
            # Later files (in sorted order) replace earlier ones of the same size.
            hashListsByGram[n] = dictionaryy['featureHashList']

    missing = [n for n in gramSizes if n not in hashListsByGram]
    if missing:
        raise ValueError('no HashList .json file found in {0} for n-gram size(s) {1}'.format(
            trainingSetPath, missing))

    # Compare the sample's features with the training set, smallest n first
    finalResult = [compair(features, hashListsByGram[n])
                   for n, features in zip(gramSizes, featureLists)]
    score = sum(finalResult) / float(len(finalResult)) * 100

    # Combine output for many samples in to a file
    with open(outputFile, 'a') as outFile:
        outFile.write("{0:2.5f} \t {1:s} \n".format(score, inSample))
    # Single-string form of the former multi-item Python 2 print statement;
    # it emits the same characters (no space after the tab, one before '\n').
    print("{0:2.5f} \t {1:s} \t{2} \n".format(score, inSample, count))
    count += 1

    # Kill the dangling NtTrace process if any
    os.system("taskkill /IM NtTrace.exe /F")
def test():
    """Classify the sample(s) named by the module-level ``inSample``.

    If ``inSample`` is a file it is passed to main() directly. If it is a
    directory, its entries are sorted, the first ``count`` entries are
    skipped, and every remaining entry ending in '.malware' is passed to
    main(). Anything else is silently ignored.
    """
    global inSample
    if os.path.isfile(inSample):
        main(inSample)
    elif os.path.isdir(inSample):
        # count offsets into the full sorted listing, i.e. it is applied
        # before the '.malware' filter.
        for sample in sorted(os.listdir(inSample))[count:]:
            if sample.endswith('.malware'):
                # os.path.join keeps the path valid when the directory was
                # given without a trailing separator.
                main(os.path.join(inSample, sample))
if __name__ == '__main__':
    # Script entry point: print a banner, run the classifier, then report
    # the elapsed wall-clock time (first 10 characters of the timedelta text).
    start = dt.now()
    bannerFormat = '\n{:*^90}\n'
    print(bannerFormat.format(" Machine Learning and Malware Classification "))
    test()
    end = dt.now()
    elapsedText = str(end - start)[:10]
    print(bannerFormat.format(' EOP Program took ' + elapsedText + ' to complete '))