task:4151977 Log analyzer analyze telemetry logs #277

Merged · 9 commits · Nov 18, 2024
17 changes: 0 additions & 17 deletions plugins/ufm_log_analyzer_plugin/README.md
@@ -26,22 +26,6 @@ sudo yum install -y libjpeg-devel zlib-devel
```
Know your UFM sysdump location.

#### Running on a remote server
Since the tool generates graphs, you will need to set up X11 forwarding:

1. Mac - install and run [XQuartz](https://www.xquartz.org/). Windows - install and run [Xming](http://www.straightrunning.com)
2. On your remote server (Ubuntu/RedHat), make sure X11 forwarding is enabled:
```
vim /etc/ssh/sshd_config
# Enable X11
X11Forwarding yes
```
3. Restart the SSH service with `systemctl restart ssh` or `systemctl restart sshd`, depending on the OS.
4. Install `python3-tk` using `sudo yum install python3-tkinter` or `sudo apt-get install python3-tk`, depending on the OS.
5. When you SSH to the server, use the `-X` flag, for example `ssh -X root@my-vm`.

To verify that forwarding works, run `xclock &` once the connection is established; a clock should appear on your local machine.

### How to run
```
./log_analzer.sh [options] -l <path to dump>
@@ -89,7 +73,6 @@ This logic will show links that:
2. Thermal shutdown.
3. If one side went down and the other side was not rebooted.


![Tool flow](img/loganalzer.png)


4 changes: 3 additions & 1 deletion plugins/ufm_log_analyzer_plugin/src/loganalyze/.pylintrc
@@ -5,5 +5,7 @@ disable=missing-function-docstring,
too-few-public-methods,
logging-fstring-interpolation,


[DESIGN]
max-locals=20
max-locals=20
max-args=8
73 changes: 49 additions & 24 deletions plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzer.py
@@ -44,6 +44,8 @@
from loganalyze.log_analyzers.console_log_analyzer import ConsoleLogAnalyzer
from loganalyze.log_analyzers.rest_api_log_analyzer import RestApiAnalyzer
from loganalyze.log_analyzers.link_flapping_analyzer import LinkFlappingAnalyzer
from loganalyze.log_analyzers.ibdiagnet2_port_counters_analyzer \
import Ibdiagnet2PortCountersAnalyzer

from loganalyze.pdf_creator import PDFCreator
from loganalyze.utils.common import delete_files_by_types
@@ -252,7 +254,8 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
in the full report.
Returns the created analyzer
"""
if log_name in full_extracted_logs_list:
# Check the base name, since some entries in the list include a directory prefix
if any(os.path.basename(log) == log_name for log in full_extracted_logs_list):
log_csvs = get_files_in_dest_by_type(parsed_args.destination,
log_name,
parsed_args.extract_level)
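A minimal sketch of what this base-name check fixes, with illustrative names (not the plugin's real data):

```python
import os

# Some extracted entries carry a directory prefix, some do not.
full_extracted_logs_list = ["ufm_logs/event.log", "console.log"]

# A plain membership test misses the prefixed entry...
print("event.log" in full_extracted_logs_list)    # False

# ...while comparing base names matches it as intended.
print(any(os.path.basename(log) == "event.log"
          for log in full_extracted_logs_list))   # True
```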
Expand Down Expand Up @@ -305,7 +308,7 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
log.LOGGER.debug("Starting analyzing the data")
partial_create_analyzer = partial(create_analyzer,
parsed_args=args,
full_extracted_logs_list=full_logs_list,
full_extracted_logs_list=logs_to_work_with,
ufm_top_analyzer_obj=ufm_top_analyzer)

# Creating the analyzer for each log
@@ -328,6 +331,15 @@ def create_analyzer(parsed_args, full_extracted_logs_list,

rest_api_log_analyzer = partial_create_analyzer(log_name="rest_api.log",
analyzer_clc=RestApiAnalyzer)

ibdiagnet_2_ports_primary_analyzer = partial_create_analyzer(
log_name="ufm_logs_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)

ibdiagnet_2_ports_secondary_analyzer = partial_create_analyzer(
log_name="secondary_telemetry_ibdiagnet2_port_counters.log",
analyzer_clc=Ibdiagnet2PortCountersAnalyzer)
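`partial_create_analyzer` comes from `functools.partial`, which pre-binds the arguments shared by every analyzer so each call only names the log and its analyzer class. A self-contained sketch of the same pattern (the factory body here is a stand-in, not the plugin's implementation):

```python
from functools import partial

def create_analyzer(parsed_args, full_extracted_logs_list, log_name, analyzer_clc):
    # Stand-in for the real factory: just report what would be built.
    return f"{analyzer_clc} over {log_name} (args={parsed_args})"

# Bind the shared arguments once...
partial_create_analyzer = partial(create_analyzer,
                                  parsed_args={"hours": 24},
                                  full_extracted_logs_list=["event.log"])

# ...then each analyzer needs only its own two arguments.
print(partial_create_analyzer(log_name="event.log", analyzer_clc="EventAnalyzer"))
```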

second_telemetry_samples = get_files_in_dest_by_type(args.destination,
"secondary_",
1000,
Expand Down Expand Up @@ -358,36 +370,49 @@ def create_analyzer(parsed_args, full_extracted_logs_list,
)

used_ufm_version = console_log_analyzer.ufm_versions
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}"
fabric_info = "fabric info:" + os.linesep + str(ibdiagnet_analyzer.get_fabric_size()) \
if ibdiagnet_analyzer else "No Fabric Info found" # pylint: disable=invalid-name
text_to_show_in_pdf = f"Used ufm version in console log {used_ufm_version}{os.linesep}"

pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
dataframes_for_pdf = []
fabric_info = ibdiagnet_analyzer.get_fabric_size() \
if ibdiagnet_analyzer else "No Fabric Info found"
dataframes_for_pdf.append(("Fabric info", fabric_info))
if links_flapping_analyzer:
link_flapping = links_flapping_analyzer.get_link_flapping_last_week() \
if links_flapping_analyzer else "No link flapping info"
text_to_show_in_pdf += os.linesep + str(fabric_info) + os.linesep + \
"Link Flapping:" + os.linesep + str(link_flapping)

critical_events_burst = event_log_analyzer.get_critical_event_bursts()
critical_events_text = "The minute event_type event count" # pylint: disable=invalid-name
for critical_event in critical_events_burst:
timestamp = critical_event['timestamp']
event_type = critical_event['event_type']
event = critical_event['event']
counter = critical_event['count']
event_text = f"{timestamp} {event_type} {event} {counter}"
critical_events_text = critical_events_text + os.linesep + event_text

text_to_show_in_pdf += os.linesep + os.linesep + "More than 5 events burst over a minute:" \
+ os.linesep + critical_events_text
dataframes_for_pdf.append(("Link Flapping past week",
links_flapping_analyzer.get_link_flapping_last_week()))
lists_to_add = []
critical_events_headers = ["timestamp", "event_type", "event", "count"]
lists_to_add.append((event_log_analyzer.get_critical_event_bursts(),
"More than 5 events burst over a minute",
critical_events_headers))

for cur_telemetry in \
[ibdiagnet_2_ports_primary_analyzer, ibdiagnet_2_ports_secondary_analyzer]:
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry iteration time",
cur_telemetry.get_last_iterations_time_stats()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} "
"Telemetry iteration first and last timestamps",
cur_telemetry.get_first_last_iteration_timestamp()))
dataframes_for_pdf.append((f"{cur_telemetry.telemetry_type} Telemetry fabric size",
cur_telemetry.get_number_of_switches_and_ports()))
lists_to_add.append(([cur_telemetry.get_number_of_core_dumps()],
f"{cur_telemetry.telemetry_type} "
"number of core dumps found in the logs",
["Amount"]))


# The PDF creator gets all the images and text to add to the report
pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
pdf.created_pdf()
pdf.create_pdf(dataframes_for_pdf, lists_to_add)
# Generate a report that can be found in the destination directory
log.LOGGER.info("Analysis is done, please see the following outputs:")
for image, title in images_and_title_to_present:
log.LOGGER.info(f"{title}: {image}")
log.LOGGER.info(f"Summary PDF was created! you can open here at {pdf_path}")

if args.interactive:
import IPython
IPython.embed()

# Clean up some unneeded files created during the run
files_types_to_delete = set()
files_types_to_delete.add("png")  # PNG images created for the PDF report
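The report data now travels in two shapes: `dataframes_for_pdf` holds `(title, DataFrame)` pairs and `lists_to_add` holds `(rows, title, headers)` triples, both handed to `create_pdf`. A hedged sketch of assembling them with toy values (`PDFCreator` internals are not shown in this diff, so the final calls are left commented):

```python
import pandas as pd

# (title, DataFrame) pairs rendered as tables in the report.
dataframes_for_pdf = [
    ("Fabric info", pd.DataFrame([{"switches": 12, "ports": 480}])),
]

# (rows, title, headers) triples rendered as simple lists.
lists_to_add = [
    ([{"timestamp": "2024-11-18 10:00", "event_type": "HW",
       "event": "overheat", "count": 7}],
     "More than 5 events burst over a minute",
     ["timestamp", "event_type", "event", "count"]),
]

# pdf = PDFCreator(pdf_path, pdf_header, png_images, text_to_show_in_pdf)
# pdf.create_pdf(dataframes_for_pdf, lists_to_add)
```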
plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzers/base_analyzer.py
@@ -12,6 +12,7 @@
# pylint: disable=missing-function-docstring
# pylint: disable=missing-module-docstring

import logging
import os
import csv
import shutil
@@ -21,15 +22,18 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates

from loganalyze.log_analyzers.constants import DataConstants
import loganalyze.logger as log
# This makes sure the user does not see warnings from plotting
logging.getLogger('matplotlib').setLevel(logging.ERROR)
matplotlib.use('Agg')  # This allows running the tool on headless servers without a graphics card

pd.set_option("display.max_colwidth", None)
warnings.filterwarnings("ignore")


class BaseImageCreator:
# Setting the graph time interval to 1 hour
# This is outside of the constructor since
@@ -47,7 +51,7 @@ def __init__(self, dest_image_path):
self._funcs_for_analysis = set()

def _save_data_based_on_timestamp(
self, data_to_plot, x_label, y_label, title
self, data_to_plot, x_label, y_label, title, large_sample=False
):
with plt.ion():
log.LOGGER.debug(f"saving {title}")
@@ -61,7 +65,10 @@ def _save_data_based_on_timestamp(
# Set the locator to show ticks every hour and the formatter to
# include both date and time
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.HourLocator())
if large_sample:
ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))
else:
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_minor_locator(
mdates.MinuteLocator(interval=15)
) # Add minor ticks every 15 minutes
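The new `large_sample` flag widens the major ticks from one per hour to one per day so week-long plots stay legible. A standalone sketch of the same locator setup on synthetic data, using the headless `Agg` backend as the diff does:

```python
import matplotlib
matplotlib.use("Agg")  # headless, no display needed
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# A week of hourly samples: one major tick per hour would be unreadable.
times = pd.date_range("2024-11-11", periods=7 * 24, freq="h")
values = list(range(len(times)))

fig, ax = plt.subplots()
ax.plot(times, values)
large_sample = True
if large_sample:
    ax.xaxis.set_major_locator(mdates.HourLocator(interval=24))  # one tick per day
else:
    ax.xaxis.set_major_locator(mdates.HourLocator())             # one tick per hour
ax.xaxis.set_minor_locator(mdates.MinuteLocator(interval=15))
fig.autofmt_xdate()
fig.savefig("iteration_time.png")
```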
@@ -94,7 +101,7 @@ def _save_data_based_on_timestamp(
self._images_created.extend(images_list_with_title)
plt.close()

def _save_pivot_data_in_bars( # pylint: disable=too-many-arguments
def _save_pivot_data_in_bars(
self, pivoted_data, x_label, y_label, title, legend_title
):
if pivoted_data.empty:
179 changes: 179 additions & 0 deletions plugins/ufm_log_analyzer_plugin/src/loganalyze/log_analyzers/ibdiagnet2_port_counters_analyzer.py
@@ -0,0 +1,179 @@
#
# Copyright © 2013-2024 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# This software product is a proprietary product of Nvidia Corporation and its affiliates
# (the "Company") and all right, title, and interest in and to the software
# product, including all associated intellectual property rights, are and
# shall remain exclusively with the Company.
#
# This software product is governed by the End User License Agreement
# provided with the software product.
#

from typing import List
import warnings
import pandas as pd
from loganalyze.log_analyzers.base_analyzer import BaseAnalyzer


class Ibdiagnet2PortCountersAnalyzer(BaseAnalyzer):
def __init__(self,
logs_csvs: List[str],
hours: int,
dest_image_path: str,
sort_timestamp=False):
super().__init__(logs_csvs, hours, dest_image_path, sort_timestamp)
self._iteration_time_data = None
self._iteration_time_stats = None
self.text_to_show_in_pdf = ""
# This will make sure all the extra columns are int
extra_columns = ['extra1', 'extra2', 'extra3', 'extra4', 'extra5']
for col in extra_columns:
self._log_data_sorted[col] = pd.to_numeric(
self._log_data_sorted[col],
errors='coerce'
).astype('Int64')
self._funcs_for_analysis = {self.plot_iteration_time_over_time}
# Based on the log path, decide if this is primary or secondary
if "ufm_logs" in logs_csvs[0]:
self.telemetry_type = "primary"
elif "secondary_telemetry" in logs_csvs[0]:
self.telemetry_type = "secondary"
else:
self.telemetry_type = "Unknown_telemetry_type"

self._first_timestamp_of_logs = None
self._last_timestamp_of_logs = None
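The `errors='coerce'` plus nullable `Int64` combination used in the constructor converts unparsable cells to `<NA>` instead of raising, while keeping the column an integer dtype; a quick sketch:

```python
import pandas as pd

col = pd.Series(["3", "7", "not-a-number", None])
as_int = pd.to_numeric(col, errors="coerce").astype("Int64")
print(as_int.tolist())  # [3, 7, <NA>, <NA>]
```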

def get_collectx_versions(self):
unique_collectx_versions = self._log_data_sorted[\
self._log_data_sorted['type'] == 'collectx_version']['data'].unique()
return unique_collectx_versions

def get_number_of_switches_and_ports(self):
"""
Generate summary statistics for 'total_devices_ports' data.
This function calculates the average, maximum, and minimum
for switches, CAs, routers, and ports.
"""
filtered_data = self._log_data_sorted[\
self._log_data_sorted['type'] == 'total_devices_ports']

ports_numbers_columns = ['extra1', 'extra3', 'extra5']
filtered_data['extra135'] = pd.to_numeric(
filtered_data[ports_numbers_columns].stack(), errors='coerce'
).groupby(level=0).sum(min_count=1)

columns_of_interest = ['data', 'extra2', 'extra4', 'extra135']
column_mapping = {
'data': '# of Switches',
'extra2': 'CAs',
'extra4': 'Routers',
'extra135': 'Ports'
}

summary_stats = []

for col in columns_of_interest:
numeric_col = pd.to_numeric(filtered_data[col], errors='coerce')
non_zero_col = numeric_col[numeric_col != 0]

avg = round(non_zero_col.mean()) if not non_zero_col.empty else 0
max_val = int(non_zero_col.max()) if not non_zero_col.empty else 0
min_val = int(non_zero_col.min()) if not non_zero_col.empty else 0
count = int(non_zero_col.count())

summary_stats.append({
'Category': column_mapping.get(col, col),
'Average': avg,
'Maximum': max_val,
'Minimum': min_val,
'Total Rows (Non-Zero)': count
})

summary_df = pd.DataFrame(summary_stats)

return summary_df
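The `stack()` / `groupby(level=0)` step above sums the three port columns row-wise; `min_count=1` keeps a row with no valid values as missing rather than silently summing to `0`. A toy illustration:

```python
import pandas as pd

df = pd.DataFrame({
    "extra1": ["10", "x", None],
    "extra3": ["20", "5", None],
    "extra5": [None, "1", None],
})

# Stack to long form, coerce to numbers, then sum back per original row.
row_sums = pd.to_numeric(
    df[["extra1", "extra3", "extra5"]].stack(), errors="coerce"
).groupby(level=0).sum(min_count=1)

df["ports"] = row_sums  # index alignment leaves the all-missing row as NaN
print(df["ports"].tolist())  # [30.0, 6.0, nan]
```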

def analyze_iteration_time(self, threshold=0.15):
"""
Analyze rows where 'type' is 'iteration_time'.
Keep only 'type', 'timestamp', and 'data' columns.
Calculate statistics for the 'data' column, including timestamps for max and min.
Also, find gaps of at least 2 minutes with no data and allow filtering by a threshold.

Parameters:
- threshold (float): Minimum value to consider for analysis. Default is 0.15 seconds.
"""
filtered_data = self._log_data_sorted[self._log_data_sorted['type'] == 'iteration_time']
filtered_data = filtered_data[['type', 'timestamp', 'data']]
filtered_data['data'] = pd.to_numeric(filtered_data['data'], errors='coerce')

filtered_data = filtered_data[filtered_data['data'] >= threshold]
filtered_data['timestamp'] = pd.to_datetime(filtered_data['timestamp'], errors='coerce')
filtered_data = filtered_data.dropna(subset=['timestamp'])

filtered_data = filtered_data.sort_values(by='timestamp').reset_index(drop=True)

if not filtered_data['data'].empty:
average = filtered_data['data'].mean()
max_value = filtered_data['data'].max()
min_value = filtered_data['data'].min()

max_timestamp = filtered_data.loc[filtered_data['data'] \
== max_value, 'timestamp'].iloc[0]
min_timestamp = filtered_data.loc[filtered_data['data'] \
== min_value, 'timestamp'].iloc[0]
first_timestamp = filtered_data['timestamp'].iloc[0]
last_timestamp = filtered_data['timestamp'].iloc[-1]

else:
average = max_value = min_value = 0.0
max_timestamp = min_timestamp = None
first_timestamp = last_timestamp = None

stats = {
'Average': average,
'Maximum': max_value,
'Max Timestamp': max_timestamp,
'Minimum': min_value,
'Min Timestamp': min_timestamp,
'Total Rows': filtered_data['data'].count()
}
stats_df = pd.DataFrame([stats])
self._iteration_time_data = filtered_data
self._iteration_time_stats = stats_df
self._first_timestamp_of_logs = first_timestamp
self._last_timestamp_of_logs = last_timestamp
return stats_df
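A toy run of the same statistics logic on made-up iteration times, mirroring the filtering and the max/min timestamp lookups above:

```python
import pandas as pd

data = pd.DataFrame({
    "timestamp": pd.to_datetime(["2024-11-18 10:00",
                                 "2024-11-18 10:05",
                                 "2024-11-18 10:10"]),
    "data": [0.9, 2.4, 1.1],
})
threshold = 0.15
kept = data[data["data"] >= threshold].sort_values("timestamp").reset_index(drop=True)

stats = {
    "Average": kept["data"].mean(),
    "Maximum": kept["data"].max(),
    "Max Timestamp": kept.loc[kept["data"] == kept["data"].max(), "timestamp"].iloc[0],
    "Minimum": kept["data"].min(),
    "Min Timestamp": kept.loc[kept["data"] == kept["data"].min(), "timestamp"].iloc[0],
    "Total Rows": int(kept["data"].count()),
}
print(pd.DataFrame([stats]))
```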

def get_first_last_iteration_timestamp(self):
if not self._first_timestamp_of_logs or not self._last_timestamp_of_logs:
self.analyze_iteration_time()
times = {
'first': str(self._first_timestamp_of_logs),
'last': str(self._last_timestamp_of_logs)
}
return pd.DataFrame([times])

def get_last_iterations_time_stats(self):
return self._iteration_time_stats

def plot_iteration_time_over_time(self):
if self._iteration_time_data is None:
self.analyze_iteration_time()

self._iteration_time_data.set_index('timestamp', inplace=True)

with warnings.catch_warnings():
warnings.filterwarnings("ignore", ".*Locator attempting to generate.*")
self._save_data_based_on_timestamp(
data_to_plot=self._iteration_time_data['data'],
x_label='Timestamp',
y_label='Iteration Time (s)',
title=f'{self.telemetry_type} Iteration Time',
large_sample=True)

def get_number_of_core_dumps(self):
core_dumps = self._log_data_sorted[self._log_data_sorted['type'] == 'timeout_dump_core']
return {"Amount":len(core_dumps)}