import datetime
import os
from functools import partial

import gradio as gr
import nltk
import pandas as pd
import plotly.express as px
import urllib3
# from multiprocess import Pool
from ray.util.multiprocessing import Pool

import BASE_LOG_ANALYSE
import preprocessing

urllib3.disable_warnings()
nltk.download('wordnet')
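
# Module-level state shared between the Gradio callbacks: `data` holds the
# parsed log DataFrame, while `entropy` and `vectorizer` are produced during
# training and reused when scoring test data.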
data = dict()
entropy = dict()
vectorizer = None
def training_data(folder, start, end, hostname):
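    """Load the `messages*` logs from `folder`, keep rows whose timestamp
    falls in [start, end), and fit the preprocessing model on that window."""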
    df = load_data_log_entity(folder, 'messages*', hostname)
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df = df.dropna(subset=['timestamp'])
    df.sort_values(by=['timestamp'], inplace=True)
    data['message'] = df
    # Use a distinct name so the training window does not shadow this function.
    train_window = {k: v[(v['timestamp'] >= start) & (v['timestamp'] < end)] for k, v in data.items()}
    global entropy, vectorizer
    entropy, vectorizer = preprocessing.preprocess_training_data(train_window)
    return "Training done!"

def inspect_data(start, end):
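    """Return the raw log rows whose timestamp falls in [start, end)."""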
    df = data['message']
    return df[(df['timestamp'] >= start) & (df['timestamp'] < end)]

def testing_data(start, end, num_core, threshold):
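    """Score the [start, end) window in parallel, flag chunks whose score
    reaches `threshold`, and plot both the score and the hourly log volume."""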
    score_chunks = split_chunks_and_calculate_score(num_core, data, start, end, vectorizer, entropy)
    score = pd.DataFrame(score_chunks)
    p = px.line(score, x='timestamp', y='score', width=800, height=500)
    anomaly = score[score['score'] >= threshold]
    testing = data['message']
    testing = testing[(testing['timestamp'] >= start) & (testing['timestamp'] < end)]
    grouped = testing.groupby(pd.Grouper(key='timestamp', freq='H')).count()
    p2 = px.line(grouped, width=800, height=500)
    return p, anomaly, p2

def load_data_log_entity(folder, entity_name, hostname):
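    """Read every log file in `folder` matching `entity_name` (gzip or plain
    text) and split each line on `hostname` into the timestamp and message
    columns of a DataFrame."""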
    data1 = {'timestamp': [], 'message': []}
    for filename in BASE_LOG_ANALYSE.get_file_list_by_filename_filter(folder, entity_name):
        # Get file content
        if '.gz' in filename:
            content = BASE_LOG_ANALYSE.read_gzip_file(filename)
        else:
            with open(filename, 'r', encoding="latin-1") as f_in:
                content = f_in.read()
        # Parse each line into timestamp and message by splitting on the hostname.
        for line in content.split('\n'):
            if "logfile turned over due" in line:
                continue
            tokens = line.split(hostname)
            if len(tokens) == 2:
                if '<' in tokens[0]:
                    # Lines carrying a '<...>' tag: keep the second token as the timestamp.
                    tokens[0] = tokens[0].split()[1]
                data1['timestamp'].append(tokens[0])
                data1['message'].append(tokens[1])
    return pd.DataFrame(data1)

def speed_up(time_filter, data, vectorizer, entropy):
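    """Score a single (start, end) time chunk; used as the worker function
    for the multiprocessing pool."""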
    data_filter = {
        k: v[(v['timestamp'] >= str(time_filter[0])) & (v['timestamp'] < str(time_filter[1]))]
        for k, v in data.items()
    }
    score = preprocessing.calculate_score(data_filter, vectorizer, entropy)['message']
    return {'timestamp': time_filter[0].to_timestamp(), 'score': score}

def split_chunks_and_calculate_score(num_core, data, start, end, vectorizer, entropy, chunk_size='10s'):
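    """Split [start, end) into `chunk_size` periods and score each chunk in
    parallel across `num_core` worker processes."""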
    period = pd.period_range(start=start, end=end, freq=chunk_size)
    input_args = [[period[i], period[i + 1]] for i in range(len(period) - 1)]
    pool = Pool(processes=int(num_core))
    chunks = pool.map(partial(speed_up, data=data, vectorizer=vectorizer, entropy=entropy), input_args)
    # Release the worker processes once scoring is done.
    pool.close()
    return chunks
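
# Gradio UI: the left column holds the training and testing controls, the
# right column shows the score plot and hourly log volume; the anomaly table
# and raw-data inspector are rendered below.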
demo = gr.Blocks()
with demo:
    gr.Markdown(
        r"<h1>Choose a training period and train the model on that data; the trained model is then reused on the test data. If the current datetime format does not match the '_message' log file, change it accordingly.</h1>"
    )
    with gr.Row().style(equal_height=True):
        with gr.Column(scale=1):
            with gr.Row():
                dir_path = gr.Textbox(
                    value=os.path.abspath(os.getcwd()),
                    interactive=True,
                    label="Log directory"
                )
                hostname = gr.Textbox(
                    value="",
                    interactive=True,
                    label="Hostname in the log file (see the _message file)"
                )
            with gr.Row():
                start_training = gr.Textbox(
                    label="Start training time",
                    interactive=True,
                    value=str(datetime.datetime.now())
                )
                end_training = gr.Textbox(
                    label="End training time",
                    interactive=True,
                    value=str(datetime.datetime.now())
                )
            btn1 = gr.Button(value="Train data").style(full_width=False)
            output = gr.Textbox(label="Training result")
            with gr.Row():
                num_core = gr.Number(
                    label="Number of CPU cores (more cores speed up scoring)",
                    interactive=True,
                    value=4
                )
                threshold = gr.Number(
                    label="Score threshold above which a chunk is flagged as an error",
                    interactive=True,
                    value=5
                )
            with gr.Row():
                start_testing = gr.Textbox(
                    label="Start testing time",
                    interactive=True,
                    value=str(datetime.datetime.now())
                )
                end_testing = gr.Textbox(
                    label="End testing time",
                    interactive=True,
                    value=str(datetime.datetime.now())
                )
            btn2 = gr.Button(value="Check testing data").style(full_width=False)
        with gr.Column(scale=2):
            gr.Markdown("<h1>Score</h1>")
            pl = gr.Plot()
            gr.Markdown("<h1>Total log count per hour</h1>")
            pl2 = gr.Plot()
    btn1.click(fn=training_data, inputs=[dir_path, start_training, end_training, hostname], outputs=output)
    gr.Markdown("<h1>Possible errors</h1>")
    ano = gr.DataFrame()
    btn2.click(fn=testing_data, inputs=[start_testing, end_testing, num_core, threshold], outputs=[pl, ano, pl2])
gr.Markdown("<h1>Inspect data</h1>")
with gr.Row():
start_inspect = gr.Textbox(
label="Start inspect time",
interactive=True,
value=str(datetime.datetime.now())
)
end_inspect = gr.Textbox(
label="End inspect time",
interactive=True,
value=str(datetime.datetime.now())
)
btn3 = gr.Button(value="Inspect data in above interval").style(full_width=False)
raw = gr.DataFrame()
btn3.click(fn=inspect_data, inputs=[start_inspect, end_inspect], outputs=raw)

if __name__ == "__main__":
    demo.launch()