-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharima_analysis.py
174 lines (144 loc) · 6.56 KB
/
arima_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# This file is part of MovementAnalysis.
#
# [1] Ferreira, M. D., Campbell, J. N., & Matwin, S. (2022).
# A novel machine learning approach to analyzing geospatial vessel patterns using AIS data.
# GIScience & Remote Sensing, 59(1), 1473-1490.
#
from preprocessing.clean_trajectories import Trajectories
from approach.ar_models import Models
import os
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def boxplot_measure(data, measure, folder, measure2=None, limit=None):
"""
It plots the box plots for all 24 configurations of latitude and longitude arima models.
:param data: the measurements of the ARIMA models.
:param measure: the selected measure for the first plot (latitude).
:param folder: folder to save the images
:param measure2: the selected measure for the second plot (longitude).
:param limit: limits for y-axis in the plot.
"""
x = pd.DataFrame()
if measure2 is not None:
x1 = pd.DataFrame()
for i in data.keys():
x = pd.concat([x, data[i][measure]], axis=1)
if measure2 is not None:
x1 = pd.concat([x1, curr_config[i][f'{measure}.1']], axis=1)
if measure == 'MSE':
x = np.sqrt(x)
if measure2 is not None:
x1 = np.sqrt(x1)
x.columns = data.keys()
x = x.replace([np.inf], np.nan)
x = x.replace([np.nan], x.max().max())
if measure2 is not None:
x1.columns = data.keys()
x1 = x1.replace([np.inf], np.nan)
x1 = x1.replace([np.nan], x1.max().max())
# fig = plt.figure(figsize=(15, 10))
print(f'{measure}:')
print(f'{pd.concat([x.mean(), x.std()], axis=1)}')
if measure2 is not None:
print(f'{pd.concat([x1.mean(), x1.std()], axis=1)}')
# Plot
col_names = [i.replace('_', ',') for i in x.columns]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
bp1 = ax.boxplot(x)
# change the fontsize of the xtick and ytick labels and axes
ax.set_ylim([limit[0], limit[1]])
ax.set_xticklabels(col_names, rotation=45, fontsize=15)
plt.yticks(fontsize=15)
ax.set_ylabel('MSE', fontsize=15)
plt.savefig(f'{folder}/features_arima_latitude_{measure}.png', bbox_inches='tight')
plt.close()
if measure2 is not None:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
bp2 = ax.boxplot(x1)
# change the fontsize of the xtick and ytick labels and axes
ax.set_ylim([limit[0], limit[1]])
ax.set_xticklabels(col_names, rotation=45, fontsize=15)
plt.yticks(fontsize=15)
ax.set_ylabel('MSE', fontsize=15)
plt.savefig(f'{folder}/features_arima_longitude_{measure}.png', bbox_inches='tight')
plt.close()
all_stats = get_box_plot_data(x.columns, bp1, x)
if measure2 is not None:
lon_stats = get_box_plot_data(x.columns, bp2, x1)
all_stats = pd.concat([all_stats, lon_stats], axis=0)
all_stats.to_csv(f'{folder}/{measure}_stats.csv')
decimals = 4
for index in range(24):
row1 = all_stats.iloc[index,:]
row2 = all_stats.iloc[index+24,:]
row3 = all_stats.iloc[index+48,:]
lbl = row1['label']
lbl = lbl.replace('_', ',')
lbl2 = row2['label']
lbl2 = lbl2.replace('_', ',')
lbl3 = row3['label']
lbl3 = lbl3.replace('_', ',')
avg = round(row1['average'], decimals)
st = round(row1['std'], decimals)
avg2 = round(row2['average'], decimals)
st2 = round(row2['std'], decimals)
avg3 = round(row3['average'], decimals)
st3 = round(row3['std'], decimals)
print(f'$({lbl})$ & ${avg} \pm {st}$ & ${avg2} \pm {st2}$ & ${avg3} \pm {st3}$ \\\\')
def get_box_plot_data(labels, bp, data):
"""
It gets the boxplot and data information and save into a csv file.
:param labels: the configuration under analysis
:param bp: the boxplot information
:param data: the results of the configuration under analysis.
:return: a summary of all the boxplot and data information
"""
rows_list = []
for i in range(len(labels)):
dict1 = {}
dict1['label'] = labels[i]
dict1['lower_whisker'] = bp['whiskers'][i*2].get_ydata()[1]
dict1['lower_quartile'] = bp['boxes'][i].get_ydata()[1]
dict1['median'] = bp['medians'][i].get_ydata()[1]
dict1['upper_quartile'] = bp['boxes'][i].get_ydata()[2]
dict1['upper_whisker'] = bp['whiskers'][(i*2)+1].get_ydata()[1]
dict1['average'] = data[labels[i]].mean(axis=0)
dict1['std'] = data[labels[i]].std(axis=0)
dict1['n_outliers_lower'] = (data[labels[i]] < dict1['lower_whisker']).sum()
dict1['n_outliers_lower_prc'] = (data[labels[i]] < dict1['lower_whisker']).sum() / data[labels[i]].shape[0]
dict1['n_outliers_upper'] = (data[labels[i]] > dict1['upper_whisker']).sum()
dict1['n_outliers_upper_prc'] = (data[labels[i]] > dict1['upper_whisker']).sum() / data[labels[i]].shape[0]
dict1['n_between_0_max'] = (data[labels[i]][data[labels[i]] >= 0] <= dict1['upper_whisker']).sum()
dict1['n_between_0_max_prc'] = (data[labels[i]][data[labels[i]] >= 0] <= dict1['upper_whisker']).sum() / data[labels[i]].shape[0]
rows_list.append(dict1)
return pd.DataFrame(rows_list)
print('Starting all Experiments...')
n_samples = 300
## Fishing
vessel_type = [30, 1001, 1002]
## Dates
start_day = datetime(2020, 4, 1)
end_day = datetime(2020, 4, 30)
dataset = Trajectories(dataset='DCAIS', n_samples=n_samples, vessel_type=vessel_type, time_period=(start_day, end_day))
main_folder = f'./results/DCAIS/type_{vessel_type}/period_{start_day.date()}_to_{end_day.date()}/arima_analysis_sog_{n_samples}'
metric = 'arima'
dim_set = ['lat', 'lon']
curr_config = {}
silhouette = pd.DataFrame()
count = 0
for ar_i in [0, 1]:
for ar_p in [1, 2, 3]:
for ar_q in [0, 1, 2, 3]:
print(f'{ar_p}_{ar_i}_{ar_q}')
folder = f'{main_folder}/{metric}_{ar_p}_{ar_i}_{ar_q}/'
features_path = f'{folder}/features_distance.p'
if not os.path.exists(features_path):
dataset_dict = dataset.pandas_to_dict()
features = Models(dataset=dataset_dict, features_opt=metric, dim_set=dim_set,
ar_prm=ar_p, i_prm=ar_i, ma_prm=ar_q, folder=folder, znorm=False)
file_path = f'{main_folder}/{metric}_{ar_p}_{ar_i}_{ar_q}/features_measures.csv'
curr_config[f'{ar_p}_{ar_i}_{ar_q}'] = pd.read_csv(file_path)
folder = f'{main_folder}/'
boxplot_measure(curr_config, 'MSE', measure2='MSE', limit=(-0.01, 0.1), folder=folder)