#!/usr/bin/env python3
import argparse
import logging
import sqlite3
import sys
from pathlib import Path
from typing import Optional, Union
from pandas import DataFrame, read_sql_query, to_datetime, unique
class DBFilter:
    QUERY = "SELECT * FROM dewolf"  # query to read from samples.sqlite3
ISSUE_SCHEMA = """CREATE TABLE IF NOT EXISTS samples_githubissue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
case_group TEXT,
title TEXT,
description TEXT,
status VARCHAR(100),
number INTEGER,
html_url TEXT,
created_at TIMESTAMP,
updated_at TIMESTAMP
);
"""
ERROR_SCHEMA = """CREATE TABLE IF NOT EXISTS dewolf_errors (
id INTEGER NOT NULL PRIMARY KEY,
function_name TEXT,
function_basic_block_count INTEGER,
function_size INTEGER,
function_arch TEXT,
function_platform TEXT,
sample_hash TEXT,
sample_name TEXT,
sample_total_function_count INTEGER,
sample_decompilable_function_count INTEGER,
dewolf_current_commit TEXT,
binaryninja_version TEXT,
dewolf_max_basic_blocks INTEGER,
dewolf_exception TEXT,
dewolf_traceback TEXT,
dewolf_decompilation_time REAL,
dewolf_undecorated_code TEXT,
is_successful INTEGER,
timestamp TEXT,
error_file_path TEXT,
error_line TEXT,
case_group TEXT,
errors_per_group_count_pre_filter INTEGER,
tag TEXT
)
"""
SUMMARY_SCHEMA = """CREATE TABLE IF NOT EXISTS summary (
id INTEGER NOT NULL PRIMARY KEY,
dewolf_current_commit TEXT,
avg_dewolf_decompilation_time REAL,
total_functions INTEGER,
total_errors INTEGER,
unique_exceptions INTEGER,
unique_tracebacks INTEGER,
processed_samples INTEGER,
tag TEXT
)
"""
INDEX_CREATION = """
CREATE INDEX IF NOT EXISTS idx_dewolf_errors_is_successful ON dewolf_errors(is_successful);
CREATE INDEX IF NOT EXISTS idx_dewolf_errors_dewolf_current_commit ON dewolf_errors(dewolf_current_commit);
CREATE INDEX IF NOT EXISTS idx_dewolf_errors_case_group ON dewolf_errors(case_group);
CREATE INDEX IF NOT EXISTS idx_dewolf_function_basic_block_count ON dewolf_errors(function_basic_block_count);
CREATE INDEX IF NOT EXISTS idx_dewolf_errors_tag ON dewolf_errors(tag);
"""
def __init__(self, df: DataFrame, tag: Optional[str] = None):
self._summary = None
self._filtered = None
self._samples = None
self._df = df
self._tag = tag
@property
def is_empty(self):
"""The is_empty property."""
return len(self._df) == 0
@property
def tag(self):
"""The tag property."""
return self._tag
@tag.setter
def tag(self, value):
self._tag = value
@classmethod
def from_file(cls, file_path):
with sqlite3.connect(file_path) as con:
df = read_sql_query(cls.QUERY, con)
df["timestamp"] = to_datetime(df["timestamp"], format="mixed")
return cls(df)
@classmethod
def init_db(cls, file_path):
with sqlite3.connect(file_path) as conn:
print("Creating tables...")
cursor = conn.cursor()
cursor.execute(cls.ISSUE_SCHEMA)
conn.commit()
cursor.execute(cls.ERROR_SCHEMA)
conn.commit()
cursor.execute(cls.SUMMARY_SCHEMA)
conn.commit()
            # executescript runs the whole multi-statement string at once
            # (splitlines() would also yield blank statements) and commits
            cursor.executescript(cls.INDEX_CREATION)
            cursor.execute("PRAGMA index_list('dewolf_errors')")
indexes = cursor.fetchall()
print("Indexes created for:")
for index in indexes:
print(f"\tTable 'dewolf_errors': Index '{index[1]}'")
@staticmethod
def _write_df_to_sqlite3(df: DataFrame, database_path: Union[str, Path], table_name: str, append: bool = False):
"""
Write DataFrame to SQLite table
"""
dtype_mapping = {
"int64": "INTEGER",
"float64": "REAL",
"bool": "INTEGER", # SQLite uses INTEGER for boolean values
"datetime64[ns]": "TEXT", # SQLite does not have a separate datetime type
"object": "TEXT", # For all other non-numeric types
}
dtype = {name: dtype_mapping.get(str(dt), "TEXT") for name, dt in df.dtypes.items()}
dtype["id"] = "INTEGER NOT NULL PRIMARY KEY"
if append:
with sqlite3.connect(database_path) as con:
df.drop("id", axis=1).to_sql(table_name, con, index=False, if_exists="append", dtype=dtype)
else:
with sqlite3.connect(database_path) as con:
df.to_sql(table_name, con, index=False, if_exists="replace", dtype=dtype)
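    # Note: in append mode the "id" column is dropped so SQLite assigns fresh
    # primary keys; in replace mode the DataFrame's own ids are written as-is.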
def _get_summary(self) -> DataFrame:
"""
Return DataFrame containing summary statistics over all samples
"""
commits = self._df.dewolf_current_commit.unique()
if len(commits) != 1:
logging.error(f"expect exactly one commit in samples.sqlite3. Got: {commits}")
raise ValueError("Non-unique commit data")
failed_runs = self._df[self._df.is_successful == 0]
summary = {
"id": 0,
"dewolf_current_commit": commits[0],
"avg_dewolf_decompilation_time": self._df.dewolf_decompilation_time.dropna().mean(),
"total_functions": len(self._df),
"total_errors": len(failed_runs),
"unique_exceptions": len(failed_runs.dewolf_exception.unique()),
"unique_tracebacks": len(failed_runs.dewolf_traceback.unique()),
"processed_samples": len(self._df.sample_hash.unique()),
"tag": self.tag,
}
return DataFrame(summary, index=[0])
def _get_samples(self) -> DataFrame:
"""
Return DataFrame containing per sample statistics
"""
        unique_lst_to_str = lambda x: ",".join(map(str, unique(x)))
samples = (
self._df.groupby(["sample_hash", "dewolf_current_commit"])
.agg(
platform=("function_platform", unique_lst_to_str),
binaryninja_version=("binaryninja_version", unique_lst_to_str),
count_error=("is_successful", lambda x: sum(x == 0)),
count_success=("is_successful", lambda x: sum(x == 1)),
count_total_processed=("is_successful", "count"),
timestamp=("timestamp", "min"),
duration_seconds=("timestamp", lambda x: (x.max() - x.min()).total_seconds()),
dewolf_max_basic_blocks=("dewolf_max_basic_blocks", lambda x: x.iloc[0]),
sample_total_function_count=("sample_total_function_count", lambda x: x.iloc[0]),
sample_decompilable_function_count=("sample_decompilable_function_count", lambda x: x.iloc[0]),
)
.reset_index()
)
samples["id"] = samples.index
return samples
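    # Illustrative shape of one aggregated row (hypothetical values): one row per
    # (sample_hash, dewolf_current_commit) pair, e.g.
    #   platform="linux-x86_64", count_error=3, count_success=97,
    #   count_total_processed=100, duration_seconds=412.0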
def _get_filtered(self) -> DataFrame:
"""
        Filter dataset to the 10 smallest cases per unique exception AND traceback.
        Generate a case id for semantic grouping of similar errors (pattern: ExceptionName@file.py:42).
"""
failed_runs = self._df[self._df.is_successful == 0].copy()
# enrich dewolf errors data
failed_runs["error_file_path"] = failed_runs["dewolf_traceback"].apply(self._get_last_file_path)
failed_runs["error_line"] = failed_runs["dewolf_traceback"].apply(self._get_last_line_number)
        # case id, e.g., ExceptionName@file.py:42
failed_runs["case_group"] = (
failed_runs["dewolf_exception"].str.split().str[0].str.strip(": ")
+ "@"
+ failed_runs["error_file_path"].str.split("/").str[-1]
+ ":"
+ failed_runs["error_line"]
)
errors_per_group_count = failed_runs["case_group"].value_counts()
failed_runs["errors_per_group_count_pre_filter"] = failed_runs["case_group"].map(errors_per_group_count)
# truncate traceback
failed_runs["dewolf_traceback"] = failed_runs["dewolf_traceback"].apply(self.truncate_middle)
        # keep the 10 smallest functions (by basic block count) per unique (exception, traceback) pair
f = lambda x: x.nsmallest(10, "function_basic_block_count")
filtered_df = failed_runs.groupby(["dewolf_exception", "dewolf_traceback"]).apply(f)
filtered_df["tag"] = self._tag
assert isinstance(filtered_df, DataFrame)
return filtered_df.reset_index(drop=True)
@staticmethod
def truncate_middle(s, n=4000, indicator="... TRUNCATED ..."):
"""
Truncate the middle part of a string if its length exceeds n characters.
"""
if not s or len(s) <= n:
return s
half_n = (n - len(indicator)) // 2
return s[:half_n] + "\n" + indicator + "\n" + s[-half_n:]
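    # Sketch of the truncation above (hypothetical input): with the defaults
    # (n=4000, 17-character indicator), a 10,000-character string keeps its
    # first and last 1991 characters around the indicator:
    #   truncate_middle("x" * 10_000)  # "xxx..." + "\n... TRUNCATED ...\n" + "...xxx"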
@staticmethod
def _get_last_file_path(traceback: str) -> str:
"""Extract the last file path contained in a Traceback"""
file_path, _, *_ = traceback.rpartition("File ")[-1].split(", ")
return file_path.strip('"')
@staticmethod
def _get_last_line_number(traceback: str) -> str:
"""Return the line number (as str) from the last row of a Traceback"""
_, line, *_ = traceback.rpartition("File ")[-1].split(", ")
        return line.removeprefix("line ")
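    # Example for the two helpers above (hypothetical path): given a traceback
    # whose last frame reads
    #   File "/opt/dewolf/lifter.py", line 42, in lift_instruction
    # _get_last_file_path returns "/opt/dewolf/lifter.py" and
    # _get_last_line_number returns "42".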
@property
def summary(self) -> DataFrame:
if self._summary is None:
self._summary = self._get_summary()
return self._summary
    @property
    def samples(self) -> DataFrame:
        if self._samples is None:
            self._samples = self._get_samples()
        return self._samples
@property
def filtered(self) -> DataFrame:
if self._filtered is None:
self._filtered = self._get_filtered()
return self._filtered
def write(self, file_path: Path):
# create tables
with sqlite3.connect(file_path) as con:
cursor = con.cursor()
cursor.execute(self.ISSUE_SCHEMA)
con.commit()
cursor.execute(self.ERROR_SCHEMA)
con.commit()
cursor.execute(self.SUMMARY_SCHEMA)
con.commit()
self._write_df_to_sqlite3(self.summary, file_path, table_name="summary", append=True)
self._write_df_to_sqlite3(self.filtered, file_path, table_name="dewolf_errors", append=True)
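# Minimal usage sketch for DBFilter outside the CLI below (assumes an existing
# samples.sqlite3 with a populated `dewolf` table; the tag value is made up):
#
#   f = DBFilter.from_file("samples.sqlite3")
#   f.tag = "nightly"
#   if not f.is_empty:
#       f.write(Path("filtered.sqlite3"))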
def print_sample_hashes(db_file: Path, commit: str = ""):
"""Print all sample hashes contained in dewolf_errors table,
filter by commit if specified with a pattern that starts with the given commit hash"""
stmt = "SELECT DISTINCT sample_hash FROM dewolf_errors"
params = ()
if commit:
commit = commit[:8]
stmt += " WHERE dewolf_current_commit LIKE ?"
params = (commit + "%",)
with sqlite3.connect(db_file) as con:
cursor = con.cursor()
cursor.execute(stmt, params)
for sample_hash in cursor:
print(sample_hash[0])
cursor.close()
def print_slow_sample_hashes(db_file: Path, duration: int):
"""Given path to samples.sqlite3 and duration in seconds:
print sample hashes of samples that contain a function with decompilation time of more than 'duration' seconds"""
df = DBFilter.from_file(db_file)._df
diff_shift = lambda x: x["timestamp"].diff().shift(-1).dt.total_seconds()
df["duration_seconds"] = df.groupby("sample_hash").apply(diff_shift).droplevel(0).fillna(-1).astype("int")
for sample_hash in df[df.duration_seconds >= duration]["sample_hash"].unique():
print(sample_hash)
def existing_file(path):
"""Check if the provided path is an existing file."""
file_path = Path(path)
if not file_path.is_file():
raise argparse.ArgumentTypeError(f"File '{path}' does not exist.")
return file_path
def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Bug finding tool for dewolf decompiler")
parser.add_argument("-i", "--input", required=True, type=existing_file, help="Path to SQLite file")
parser.add_argument(
"-o",
"--output",
default="filtered.sqlite3",
type=Path,
help="File path of filtered output (SQLite file, default: filtered.sqlite3)",
)
parser.add_argument("-l", "--list", action="store_true", help="List sample hashes contained in SQLite DB")
parser.add_argument("-s", "--slow", type=int, help="List samples with functions that take more than X seconds.")
parser.add_argument("--commit", type=str, help="Filter by commit hash when listing sample hashes", required=False)
parser.add_argument("--tag", type=str, help="Add a tag to filtered rows.", required=False)
parser.add_argument("--init", action="store_true", help="Create tables and index if it does not exist")
return parser.parse_args()
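# Example invocations (illustrative):
#   python filter.py -i samples.sqlite3 -o filtered.sqlite3 --tag nightly
#   python filter.py -i samples.sqlite3 --list --commit 0123abcd
#   python filter.py -i samples.sqlite3 --slow 600
#   python filter.py -i samples.sqlite3 --init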
def main(args: argparse.Namespace) -> int:
if args.slow is not None:
print_slow_sample_hashes(args.input, args.slow)
return 0
if args.list:
print_sample_hashes(args.input, commit=args.commit)
return 0
if args.init:
DBFilter.init_db(args.input)
return 0
logging.info("filtering database")
f = DBFilter.from_file(args.input)
f.tag = args.tag
if f.is_empty:
logging.warning("empty df. is samples.sqlite3 empty?")
return 0
f.write(args.output)
return 0
if __name__ == "__main__":
args = parse_arguments()
sys.exit(main(args))