Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update calculate_results to calculate statistics excluding trimmed samples #12

Open
wants to merge 3 commits into
base: trunk
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 25 additions & 20 deletions coordinator/scripts/calculate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ def process_lats(lats):
df['latsS'] = df.lats / 10**3
df['pDate'] = df.time.values.astype('datetime64[ns]')
df = df[df.time > 1609459200000] # Filter out (corrupt) times before 2021
# trim first <trim_samples> samples from dataframe
if 'TRIM_SAMPLES' in environ:
trim_samples = int(environ['TRIM_SAMPLES'])
df = df[df.time > df[0][0] + trim_samples*(block_time_ms / 1000)*one_sec]
dat = df.groupby(by=MyBinnerTime(expression=df.pDate, resolution='s', df=df, label='pDate'), agg={'count': 'count', 'lats': vaex.agg.list('lats')})
dat['lats'] = dat['lats'].apply(process_lats)

Expand Down Expand Up @@ -236,7 +240,7 @@ def process_lats(lats):
lat_99999.append(tps_its[2][1][idx][2])
idx += 1
current += datetime.timedelta(seconds=1)

dat = df.groupby(df.latsS, agg='count')
lats = dat.values
lat_max = df.max(df.latsS)
Expand All @@ -262,11 +266,11 @@ def process_lats(lats):
lat_lines.append({"lats":lat_99999, "title":"99.999%", "freq": 1})

periods = []

idx = 0
tps_target_files = [join('outputs',x) for x in listdir('outputs') \
if 'tps_target_' in x and 'hdf5' not in x]
if len(tps_target_files) > 0:
if len(tps_target_files) > 0:
t_index = pandas.date_range(start=begin- datetime.timedelta(seconds=5), end=end, freq='1s')
exports = 0
for f in tps_target_files:
Expand All @@ -285,7 +289,7 @@ def process_lats(lats):
exports = exports + 1
else:
print('{} has no rows', f)

if exports > 0:
df2 = vaex.open('outputs/*-tps_target_*.txt.hdf5')
df2['pDate'] = df2['index']
Expand All @@ -296,11 +300,11 @@ def process_lats(lats):
dat3.drop('tps_target', inplace=True)
dat3.drop('pDate', inplace=True)
dat2.join(dat3, inplace=True)

its = dat2.to_items()
tps_target = make_tps_target_series_line(its, begin, end, (lambda its,idx: its[0][1][idx].astype(datetime.datetime)), (lambda its,idx: its[1][1][idx]))
periods = extract_tps_target_periods(its, begin, end, (lambda its,idx: its[0][1][idx].astype(datetime.datetime)), (lambda its,idx: its[1][1][idx]), (lambda its,idx: its[3][1][idx]))

tps_lines.append({"tps":tps_target, "title":"Loadgen target", "freq": 1, "ma": False})

prev_lat99 = 0
Expand All @@ -326,6 +330,13 @@ def process_lats(lats):
elbow_latmean.append(prev_latmean)
elbow_lat99.append(prev_lat99)
elbow_lat99999.append(prev_lat99999)
elif 'TRIM_SAMPLES' in environ :
## Lob off (configurable) more "warm up" samples
trim_samples = int(environ['TRIM_SAMPLES'])
for i in range(len(tps_lines)):
tps_lines[i]["tps"] = tps_lines[i]["tps"][trim_samples:]
for i in range(len(lat_lines)):
lat_lines[i]["lats"] = lat_lines[i]["lats"][trim_samples:]
Comment on lines +336 to +339
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@maurermi My Python is a bit rusty. I expected this code to trim the first trim_samples samples from the set, but that doesn't appear to be what happens when I test it (the beginning of the test is preserved, and the last trim_samples samples are clearly removed).

It's pretty clear the previous version of the code did exactly that, but I'm not sure why. Perhaps this should be merged (since it correctly applies the trimming to the stats and charts), and we should consider offering manual trim-from-begin and trim-from-end inputs? (That would mirror the auto-trim-zeros split well.)


if archiver_based:
for output_file in output_files:
Expand Down Expand Up @@ -356,13 +367,7 @@ def process_lats(lats):
while len(lat_lines[i]["lats"]) > 0 and int(lat_lines[i]["lats"][-1]) == 0:
lat_lines[i]["lats"].pop()

## Lob off (configurable) more "warm up" samples
if 'TRIM_SAMPLES' in environ:
trim_samples = int(environ['TRIM_SAMPLES'])
for i in range(len(tps_lines)):
tps_lines[i]["tps"] = tps_lines[i]["tps"][trim_samples:]
for i in range(len(lat_lines)):
lat_lines[i]["lats"] = lat_lines[i]["lats"][trim_samples:]



## Create throughput histogram
Expand Down Expand Up @@ -480,7 +485,7 @@ def dev_to_val(dev):
if len(colors) > j:
color = colors[j]
ax.plot(tps_time, tps_ma, label='{} ({}ms MA)'.format(tps_line["title"],tps_ma_ms), color=color)



max = max * 1.02
Expand Down Expand Up @@ -572,7 +577,7 @@ def dev_to_val(dev):
if len(markers) > i:
marker = markers[i]
ax.plot(elbow_tps, yy, label=titles[i], color=color, marker=marker)

max = max * 1.02

ax.set_ylabel('Latency (ms)')
Expand All @@ -584,13 +589,13 @@ def dev_to_val(dev):
# TODO: Find proper way of finding peak TPS range. None of this is working
# accurately
# for yy in y:
# delta_ma_tmp = []
# delta_ma_tmp = []
# pf_x = x
# pf_y = yy
# while math.isnan(pf_y[-1]):
# pf_y = pf_y[:-1]
# pf_x = pf_x[:-1]

# while math.isnan(pf_y[1]):
# pf_y = pf_y[1:]
# pf_x = pf_x[1:]
Expand Down Expand Up @@ -625,16 +630,16 @@ def dev_to_val(dev):
# peak_lb_idx = 0
# if peak_ub_idx < 0:
# peak_ub_idx = 0

# peak_lb = pf_x[peak_lb_idx]
# peak_ub = pf_x[peak_ub_idx]
# peak_found = True
# if delta_ma_above < 8:
# peak_found = False

# if peak_found:
# break

# if peak_ub > 0:
# ax.set_title('Latency/Throughput Elbow\nDetected peak {}-{} TX/s'.format(peak_lb, peak_ub))

Expand Down