Move dataloader into separate module and add BGL and Thunderbird dataloader #88

Open · wants to merge 7 commits into base: master
21 changes: 11 additions & 10 deletions benchmarks/HDFS_bechmark.py
@@ -5,27 +5,28 @@
sys.path.append('../')
import pandas as pd
from loglizer.models import *
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

-run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR',
+run_models = ['PCA', 'InvariantsMiner', 'LogClustering', 'IsolationForest', 'LR',
'SVM', 'DecisionTree']
struct_log = '../data/HDFS/HDFS.npz' # The benchmark dataset

if __name__ == '__main__':
-(x_tr, y_train), (x_te, y_test) = dataloader.load_HDFS(struct_log,
-window='session',
-train_ratio=0.5,
-split_type='uniform')
+(x_tr, y_train), (x_te, y_test) = HDFS.loadDataset(struct_log,
+window='session',
+train_ratio=0.5,
+split_type='uniform')
benchmark_results = []
for _model in run_models:
print('Evaluating {} on HDFS:'.format(_model))
if _model == 'PCA':
feature_extractor = preprocessing.FeatureExtractor()
-x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf',
+x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf',
normalization='zero-mean')
model = PCA()
model.fit(x_train)

elif _model == 'InvariantsMiner':
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_tr)
@@ -41,7 +42,7 @@
elif _model == 'IsolationForest':
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_tr)
-model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03,
+model = IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03,
n_jobs=4)
model.fit(x_train)

@@ -62,7 +63,7 @@
x_train = feature_extractor.fit_transform(x_tr, term_weighting='tf-idf')
model = DecisionTree()
model.fit(x_train, y_train)

x_test = feature_extractor.transform(x_te)
print('Train accuracy:')
precision, recall, f1 = model.evaluate(x_train, y_train)
8 changes: 5 additions & 3 deletions demo/DecisionTree_demo.py
@@ -4,15 +4,17 @@
import sys
sys.path.append('../')
from loglizer.models import DecisionTree
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')

9 changes: 5 additions & 4 deletions demo/DeepLog_demo.py
@@ -6,7 +6,8 @@
-from loglizer import dataloader
from loglizer.models import DeepLog
from loglizer.preprocessing import Vectorizer, Iterator

+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

batch_size = 32
hidden_size = 32
@@ -16,14 +17,14 @@
window_size = 10
epoches = 2
num_workers = 2
-device = 0
+device = 0

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
-(x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = dataloader.load_HDFS(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform')
+(x_train, window_y_train, y_train), (x_test, window_y_test, y_test) = HDFS.loadDataset(struct_log, label_file=label_file, window='session', window_size=window_size, train_ratio=train_ratio, split_type='uniform')

feature_extractor = Vectorizer()
train_dataset = feature_extractor.fit_transform(x_train, window_y_train, y_train)
test_dataset = feature_extractor.transform(x_test, window_y_test, y_test)
10 changes: 6 additions & 4 deletions demo/InvariantsMiner_demo.py
@@ -4,16 +4,18 @@
import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file
epsilon = 0.5 # threshold for estimating invariant space

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='sequential')
feature_extractor = preprocessing.FeatureExtractor()
@@ -25,7 +27,7 @@

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

16 changes: 9 additions & 7 deletions demo/InvariantsMiner_demo_without_labels.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
''' This is a demo file for the Invariants Mining model.
API usage:
-dataloader.load_HDFS(): load HDFS dataset
+HDFS.loadDataset(): load HDFS dataset
feature_extractor.fit_transform(): fit and transform features
feature_extractor.transform(): feature transform after fitting
model.fit(): fit the model
@@ -13,16 +13,18 @@
import sys
sys.path.append('../')
from loglizer.models import InvariantsMiner
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file
epsilon = 0.5 # threshold for estimating invariant space

if __name__ == '__main__':
# Load structured log without label info
-(x_train, _), (x_test, _) = dataloader.load_HDFS(struct_log,
-window='session',
+(x_train, _), (x_test, _) = HDFS.loadDataset(struct_log,
+window='session',
train_ratio=0.5,
split_type='sequential')
# Feature extraction
@@ -43,11 +45,11 @@

# If you have labeled data, you can evaluate the accuracy of the model as well.
# Load structured log with label info
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
-split_type='sequential')
+split_type='sequential')
x_test = feature_extractor.transform(x_test)
precision, recall, f1 = model.evaluate(x_test, y_test)

9 changes: 5 additions & 4 deletions demo/IsolationForest_demo.py
@@ -4,16 +4,17 @@
import sys
sys.path.append('../')
from loglizer.models import IsolationForest
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file
anomaly_ratio = 0.03 # Estimate the ratio of anomaly samples in the data

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')
feature_extractor = preprocessing.FeatureExtractor()
@@ -25,7 +26,7 @@

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

8 changes: 5 additions & 3 deletions demo/LR_demo.py
@@ -4,15 +4,17 @@
import sys
sys.path.append('../')
from loglizer.models import LR
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')

10 changes: 6 additions & 4 deletions demo/LogClustering_demo.py
@@ -4,17 +4,19 @@
import sys
sys.path.append('../')
from loglizer.models import LogClustering
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file
max_dist = 0.3 # the threshold to stop the clustering process
anomaly_threshold = 0.3 # the threshold for anomaly detection

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')
feature_extractor = preprocessing.FeatureExtractor()
@@ -26,6 +28,6 @@

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)
12 changes: 7 additions & 5 deletions demo/PCA_demo.py
@@ -4,19 +4,21 @@
import sys
sys.path.append('../')
from loglizer.models import PCA
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')
feature_extractor = preprocessing.FeatureExtractor()
-x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
+x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
normalization='zero-mean')
x_test = feature_extractor.transform(x_test)

@@ -25,6 +27,6 @@

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)
22 changes: 12 additions & 10 deletions demo/PCA_demo_without_labels.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
''' This is a demo file for the PCA model.
API usage:
-dataloader.load_HDFS(): load HDFS dataset
+HDFS.loadDataset(): load HDFS dataset
feature_extractor.fit_transform(): fit and transform features
feature_extractor.transform(): feature transform after fitting
model.fit(): fit the model
@@ -13,36 +13,38 @@
import sys
sys.path.append('../')
from loglizer.models import PCA
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file

if __name__ == '__main__':
## 1. Load structured log file and extract feature vectors
# Save the raw event sequence file by setting save_csv=True
-(x_train, _), (_, _) = dataloader.load_HDFS(struct_log, window='session',
+(x_train, _), (_, _) = HDFS.loadDataset(struct_log, window='session',
split_type='sequential', save_csv=True)
feature_extractor = preprocessing.FeatureExtractor()
-x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
+x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
normalization='zero-mean')

## 2. Train an unsupervised model
print('Train phase:')
# Initialize PCA, or other unsupervised models, LogClustering, InvariantsMiner
-model = PCA()
+model = PCA()
# Model hyper-parameters may be sensitive to log data, here we use the default for demo
model.fit(x_train)
# Make predictions and manually check for correctness. Details may need to go into the raw logs
-y_train = model.predict(x_train)
+y_train = model.predict(x_train)

## 3. Use the trained model for online anomaly detection
print('Test phase:')
# Load another new log file. Here we use struct_log for demo only
-(x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session', split_type='sequential')
+(x_test, _), (_, _) = HDFS.loadDataset(struct_log, window='session', split_type='sequential')
# Go through the same feature extraction process as in training, using transform() instead
-x_test = feature_extractor.transform(x_test)
+x_test = feature_extractor.transform(x_test)
# Finally make predictions and alert on anomaly cases
y_test = model.predict(x_test)



8 changes: 5 additions & 3 deletions demo/SVM_demo.py
@@ -4,15 +4,17 @@
import sys
sys.path.append('../')
from loglizer.models import SVM
-from loglizer import dataloader, preprocessing
+from loglizer import preprocessing
+from loglizer.dataloader import HDFS

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

if __name__ == '__main__':
-(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
+(x_train, y_train), (x_test, y_test) = HDFS.loadDataset(struct_log,
label_file=label_file,
-window='session',
+window='session',
train_ratio=0.5,
split_type='uniform')

92 changes: 92 additions & 0 deletions loglizer/dataloader/BGL.py
@@ -0,0 +1,92 @@
"""
The interface to load BGL log datasets.
Authors:
Vincent-Therrien
Hans Aschenloher
"""

import random

import numpy as np
import pandas as pd

def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30,
                train_ratio=0.8):
    """ Read a BGL log file to obtain training and test data.

    Arguments
    ---------
        log_file: Input file name with the header
            "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for
            unnecessary fields.
        window: Type of window to use. Can be either 'sliding' or 'fixed'.
        time_interval: Time scope of a window in seconds. Used for both fixed
            and sliding windows.
        stepping_size: Step size of sliding windows in seconds. Used only for
            sliding windows.
        train_ratio: Fraction of examples to use for training.

    Returns
    -------
        (x_train, y_train): The training data.
        (x_test, y_test): The testing data.
    """

    # Load the file and sort lines chronologically. The numeric Timestamp
    # field (epoch seconds) is also what the window arithmetic below uses.
    df = pd.read_csv(log_file)
    df = df.sort_values(by="Timestamp")
    df.reset_index(drop=True, inplace=True)
    df['LineId'] = range(0, df.shape[0])

    examples = []  # List of sequences and anomaly labels.

    start_time = df['Timestamp'][0]
    end_time = df['Timestamp'].iloc[-1]

    assert window in ('fixed', 'sliding'), "Unsupported window."
    num_rows = df.shape[0]
    start_idx = 0
    t0 = start_time
    t1 = t0 + time_interval
    while t1 < end_time:
        # Advance to the first event inside the current window. Keeping a
        # separate start pointer lets sliding windows include their full,
        # possibly overlapping, time span.
        while df['Timestamp'][start_idx] < t0:
            start_idx += 1
        sequence = []
        is_anomaly = 0
        # Make a sequence and label it as normal or abnormal.
        i = start_idx
        while i < num_rows and df['Timestamp'][i] < t1:
            sequence.append(df['EventId'][i])
            if df['Label'][i] != '-':
                is_anomaly = 1
            i += 1
        if sequence:
            examples.append([sequence, is_anomaly])
        # Translate the window.
        if window == "fixed":
            t0 = t1
        elif window == "sliding":
            t0 += stepping_size
        t1 = t0 + time_interval

    random.shuffle(examples)
    x = [t[0] for t in examples]
    y = [t[1] for t in examples]

    n_train = int(len(x) * train_ratio)

    x_train = np.array(x[:n_train], dtype=list)
    y_train = np.array(y[:n_train], dtype=int)
    x_test = np.array(x[n_train:], dtype=list)
    y_test = np.array(y[n_train:], dtype=int)

    print('Total: {} instances, {} anomaly, {} normal' \
        .format(len(y), sum(y), len(y) - sum(y)))
    print('Train: {} instances, {} anomaly, {} normal' \
        .format(len(y_train), sum(y_train), len(y_train) - sum(y_train)))
    print('Test: {} instances, {} anomaly, {} normal' \
        .format(len(y_test), sum(y_test), len(y_test) - sum(y_test)))

    return (x_train, y_train), (x_test, y_test)
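For reviewers who want to try the new module, here is a minimal usage sketch in the style of the existing demos. The CSV path is hypothetical, and it assumes the new dataloader package exposes BGL the same way the updated demos import HDFS, and that per-window event sequences feed FeatureExtractor the same way HDFS sessions do:

    import sys
    sys.path.append('../')
    from loglizer import preprocessing
    from loglizer.models import PCA
    from loglizer.dataloader import BGL

    # Hypothetical path to a structured BGL log produced by a log parser.
    struct_log = '../data/BGL/BGL.log_structured.csv'

    # Fixed one-minute windows; 80% of the windows go to training.
    (x_train, y_train), (x_test, y_test) = BGL.loadDataset(
        struct_log, window='fixed', time_interval=60, train_ratio=0.8)

    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf')
    x_test = feature_extractor.transform(x_test)

    model = PCA()
    model.fit(x_train)
    precision, recall, f1 = model.evaluate(x_test, y_test)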
181 changes: 38 additions & 143 deletions loglizer/dataloader.py → loglizer/dataloader/HDFS.py
@@ -1,50 +1,20 @@
"""
-The interface to load log datasets. The datasets currently supported include
-HDFS and BGL.
+The interface to load HDFS log datasets.
Authors:
LogPAI Team
+Hans Aschenloher
"""

import random
import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict

-def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
-    if split_type == 'uniform' and y_data is not None:
-        pos_idx = y_data > 0
-        x_pos = x_data[pos_idx]
-        y_pos = y_data[pos_idx]
-        x_neg = x_data[~pos_idx]
-        y_neg = y_data[~pos_idx]
-        train_pos = int(train_ratio * x_pos.shape[0])
-        train_neg = int(train_ratio * x_neg.shape[0])
-        x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
-        y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
-        x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
-        y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
-    elif split_type == 'sequential':
-        num_train = int(train_ratio * x_data.shape[0])
-        x_train = x_data[0:num_train]
-        x_test = x_data[num_train:]
-        if y_data is None:
-            y_train = None
-            y_test = None
-        else:
-            y_train = y_data[0:num_train]
-            y_test = y_data[num_train:]
-    # Random shuffle
-    indexes = shuffle(np.arange(x_train.shape[0]))
-    x_train = x_train[indexes]
-    if y_train is not None:
-        y_train = y_train[indexes]
-    return (x_train, y_train), (x_test, y_test)

-def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0):
+def loadDataset(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0):
""" Load HDFS structured log into train and test data
Arguments
@@ -87,7 +57,7 @@ def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, spli
data_dict[blk_Id] = []
data_dict[blk_Id].append(row['EventId'])
data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

if label_file:
# Split training and validation set in a class-uniform way
label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
@@ -96,9 +66,9 @@ def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, spli
data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

# Split train and test data
-(x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
+(x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
data_df['Label'].values, train_ratio, split_type)

print(y_train.sum(), y_test.sum())

if save_csv:
@@ -142,6 +112,37 @@ def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, spli

return (x_train, y_train), (x_test, y_test)

+def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
+    if split_type == 'uniform' and y_data is not None:
+        pos_idx = y_data > 0
+        x_pos = x_data[pos_idx]
+        y_pos = y_data[pos_idx]
+        x_neg = x_data[~pos_idx]
+        y_neg = y_data[~pos_idx]
+        train_pos = int(train_ratio * x_pos.shape[0])
+        train_neg = int(train_ratio * x_neg.shape[0])
+        x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
+        y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
+        x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
+        y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
+    elif split_type == 'sequential':
+        num_train = int(train_ratio * x_data.shape[0])
+        x_train = x_data[0:num_train]
+        x_test = x_data[num_train:]
+        if y_data is None:
+            y_train = None
+            y_test = None
+        else:
+            y_train = y_data[0:num_train]
+            y_test = y_data[num_train:]
+    # Random shuffle
+    indexes = shuffle(np.arange(x_train.shape[0]))
+    x_train = x_train[indexes]
+    if y_train is not None:
+        y_train = y_train[indexes]
+    return (x_train, y_train), (x_test, y_test)


def slice_hdfs(x, y, window_size):
results_data = []
print("Slicing {} sessions, with window {}".format(x.shape[0], window_size))
@@ -160,109 +161,3 @@ def slice_hdfs(x, y, window_size):
print("Slicing done, {} windows generated".format(results_df.shape[0]))
return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"]



-def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60,
-             train_ratio=0.8):
-    """ TODO
-    """
-
-
-def bgl_preprocess_data(para, raw_data, event_mapping_data):
-    """ split logs into sliding windows, built an event count matrix and get the corresponding label
-    Args:
-    --------
-    para: the parameters dictionary
-    raw_data: list of (label, time)
-    event_mapping_data: a list of event index, where each row index indicates a corresponding log
-    Returns:
-    --------
-    event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
-    labels: a list of labels, 1 represents anomaly
-    """
-
-    # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
-    if not os.path.exists(para['save_path']):
-        os.mkdir(para['save_path'])
-    log_size = raw_data.shape[0]
-    sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
-
-    #=============divide into sliding windows=========#
-    start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window
-    label_data, time_data = raw_data[:,0], raw_data[:, 1]
-    if not os.path.exists(sliding_file_path):
-        # split into sliding window
-        start_time = time_data[0]
-        start_index = 0
-        end_index = 0
-
-        # get the first start, end index, end time
-        for cur_time in time_data:
-            if cur_time < start_time + para['window_size']*3600:
-                end_index += 1
-                end_time = cur_time
-            else:
-                start_end_pair=tuple((start_index,end_index))
-                start_end_index_list.append(start_end_pair)
-                break
-        # move the start and end index until next sliding window
-        while end_index < log_size:
-            start_time = start_time + para['step_size']*3600
-            end_time = end_time + para['step_size']*3600
-            for i in range(start_index,end_index):
-                if time_data[i] < start_time:
-                    i+=1
-                else:
-                    break
-            for j in range(end_index, log_size):
-                if time_data[j] < end_time:
-                    j+=1
-                else:
-                    break
-            start_index = i
-            end_index = j
-            start_end_pair = tuple((start_index, end_index))
-            start_end_index_list.append(start_end_pair)
-        inst_number = len(start_end_index_list)
-        print('there are %d instances (sliding windows) in this dataset\n'%inst_number)
-        np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d')
-    else:
-        print('Loading start_end_index_list from file')
-        start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
-        inst_number = len(start_end_index_list)
-        print('there are %d instances (sliding windows) in this dataset' % inst_number)
-
-    # get all the log indexes in each time window by ranging from start_index to end_index
-    expanded_indexes_list=[]
-    for t in range(inst_number):
-        index_list = []
-        expanded_indexes_list.append(index_list)
-    for i in range(inst_number):
-        start_index = start_end_index_list[i][0]
-        end_index = start_end_index_list[i][1]
-        for l in range(start_index, end_index):
-            expanded_indexes_list[i].append(l)
-
-    event_mapping_data = [row[0] for row in event_mapping_data]
-    event_num = len(list(set(event_mapping_data)))
-    print('There are %d log events'%event_num)
-
-    #=============get labels and event count of each sliding window =========#
-    labels = []
-    event_count_matrix = np.zeros((inst_number,event_num))
-    for j in range(inst_number):
-        label = 0 #0 represent success, 1 represent failure
-        for k in expanded_indexes_list[j]:
-            event_index = event_mapping_data[k]
-            event_count_matrix[j, event_index] += 1
-            if label_data[k]:
-                label = 1
-                continue
-        labels.append(label)
-    assert inst_number == len(labels)
-    print("Among all instances, %d are anomalies"%sum(labels))
-    assert event_count_matrix.shape[0] == len(labels)
-    return event_count_matrix, labels
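Apart from the rename, the move keeps the loader's signature intact, so downstream code only swaps the import and the function name. A minimal before/after sketch, with struct_log and label_file defined as in the demos:

    # Before this PR (flat module):
    # from loglizer import dataloader
    # (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    #     struct_log, label_file=label_file, window='session',
    #     train_ratio=0.5, split_type='uniform')

    # After this PR (dataloader package):
    from loglizer.dataloader import HDFS
    (x_train, y_train), (x_test, y_test) = HDFS.loadDataset(
        struct_log, label_file=label_file, window='session',
        train_ratio=0.5, split_type='uniform')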
91 changes: 91 additions & 0 deletions loglizer/dataloader/Thunderbird.py
@@ -0,0 +1,91 @@
"""
The interface to load Thunderbird log datasets.
Authors:
Hans Aschenloher
"""

import random

import numpy as np
import pandas as pd

def loadDataset(log_file, window='sliding', time_interval=60, stepping_size=30,
                train_ratio=0.8):
    """ Read a Thunderbird log file to obtain training and test data.

    Arguments
    ---------
        log_file: Input file name with the header
            "LineId,Label,Timestamp,,,,,,,,,EventId,,", where commas stand for
            unnecessary fields.
        window: Type of window to use. Can be either 'sliding' or 'fixed'.
        time_interval: Time scope of a window in seconds. Used for both fixed
            and sliding windows.
        stepping_size: Step size of sliding windows in seconds. Used only for
            sliding windows.
        train_ratio: Fraction of examples to use for training.

    Returns
    -------
        (x_train, y_train): The training data.
        (x_test, y_test): The testing data.
    """

    # Load the file and sort lines chronologically. The numeric Timestamp
    # field (epoch seconds) is also what the window arithmetic below uses.
    df = pd.read_csv(log_file)
    df = df.sort_values(by="Timestamp")
    df.reset_index(drop=True, inplace=True)
    df['LineId'] = range(0, df.shape[0])

    examples = []  # List of sequences and anomaly labels.

    start_time = df['Timestamp'][0]
    end_time = df['Timestamp'].iloc[-1]

    assert window in ('fixed', 'sliding'), "Unsupported window."
    num_rows = df.shape[0]
    start_idx = 0
    t0 = start_time
    t1 = t0 + time_interval
    while t1 < end_time:
        # Advance to the first event inside the current window. Keeping a
        # separate start pointer lets sliding windows include their full,
        # possibly overlapping, time span.
        while df['Timestamp'][start_idx] < t0:
            start_idx += 1
        sequence = []
        is_anomaly = 0
        # Make a sequence and label it as normal or abnormal.
        i = start_idx
        while i < num_rows and df['Timestamp'][i] < t1:
            sequence.append(df['EventId'][i])
            if df['Label'][i] != '-':
                is_anomaly = 1
            i += 1
        if sequence:
            examples.append([sequence, is_anomaly])
        # Translate the window.
        if window == "fixed":
            t0 = t1
        elif window == "sliding":
            t0 += stepping_size
        t1 = t0 + time_interval

    random.shuffle(examples)
    x = [t[0] for t in examples]
    y = [t[1] for t in examples]

    n_train = int(len(x) * train_ratio)

    x_train = np.array(x[:n_train], dtype=list)
    y_train = np.array(y[:n_train], dtype=int)
    x_test = np.array(x[n_train:], dtype=list)
    y_test = np.array(y[n_train:], dtype=int)

    print('Total: {} instances, {} anomaly, {} normal' \
        .format(len(y), sum(y), len(y) - sum(y)))
    print('Train: {} instances, {} anomaly, {} normal' \
        .format(len(y_train), sum(y_train), len(y_train) - sum(y_train)))
    print('Test: {} instances, {} anomaly, {} normal' \
        .format(len(y_test), sum(y_test), len(y_test) - sum(y_test)))

    return (x_train, y_train), (x_test, y_test)
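Since BGL.py and Thunderbird.py share the same windowing logic, the window semantics are worth spelling out: a window always covers time_interval seconds, a 'fixed' window then advances by the full interval with no overlap, while a 'sliding' window advances by stepping_size, so with time_interval=60 and stepping_size=30 consecutive windows overlap by half. A minimal sketch with a hypothetical structured log path:

    from loglizer.dataloader import Thunderbird

    # Hypothetical path to a structured Thunderbird log.
    struct_log = '../data/Thunderbird/Thunderbird.log_structured.csv'

    # Overlapping one-minute windows, advanced every 30 seconds.
    (x_train, y_train), (x_test, y_test) = Thunderbird.loadDataset(
        struct_log, window='sliding', time_interval=60,
        stepping_size=30, train_ratio=0.8)
    print('train windows:', len(x_train), 'anomaly rate:', y_train.mean())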