-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsmoothing_ohio.py
124 lines (86 loc) · 4.18 KB
/
smoothing_ohio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Simple exponential smoothing for univariate data without trend or seasonality
from statsmodels.tsa.api import SimpleExpSmoothing
import pandas as pd
import numpy as np
from sklearn import metrics
from statsmodels.tsa.stattools import adfuller
# NOTE(review): `df` is not created in this file; it is presumably loaded
# beforehand (e.g. in an interactive session) - confirm. The former `df = df`
# self-assignment was a no-op and has been dropped.
X = df['glucose']

# Number of trailing observations held out for testing.
N_TEST = 3000
if len(X) <= N_TEST:
    # Without this guard `train` would silently be empty and the model fit
    # further down would fail with a much less helpful message.
    raise ValueError(
        f"need more than {N_TEST} observations to split, got {len(X)}"
    )
test = X.iloc[-N_TEST:]
train = X.iloc[:-N_TEST]
# SimpleExpSmoothing.fit parameters:
#   smoothing_level (float, optional) – The smoothing_level value of the simple
#       exponential smoothing, if the value is set then this value will be used
#       as the value.
#   optimized (bool) – Should the values that have not been set above be
#       optimized automatically?
def timeseries_evaluation_metrics_func(y_true, y_pred):
    """Print MSE, MAE, RMSE, MAPE and R2 for a forecast against the actuals.

    Args:
        y_true: Observed values.
        y_pred: Forecast values, compared element-wise with ``y_true``.

    Returns:
        None. The metrics are only written to standard output.
    """

    def mean_absolute_percentage_error(y_true, y_pred):
        # Mean of |error / actual|, expressed in percent.
        actual = np.array(y_true)
        predicted = np.array(y_pred)
        relative_error = (actual - predicted) / actual
        return np.mean(np.abs(relative_error)) * 100

    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('Evaluation metric results:-')
    print(f'MSE is : {mse}')
    print(f'MAE is : {mae}')
    print(f'RMSE is : {rmse}')
    print(f'MAPE is : {mape}')
    print(f'R2 is : {r2}', end='\n\n')
# Grid search for the smoothing parameter.
# Author's note: the R^2 reported in the book is wrong, and here it comes out
# negative!
resu = []  # one {'smoothing parameter', 'RMSE'} record per candidate value
for i in [0, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1]:
    print(f'Fitting for smoothing level= {i}')
    fit_v = SimpleExpSmoothing(np.asarray(train)).fit(smoothing_level=i)
    # Forecast exactly as many steps as there are hold-out observations so the
    # comparison with `test` is always element-for-element.
    fcst_pred_v = fit_v.forecast(len(test))
    timeseries_evaluation_metrics_func(test, fcst_pred_v)
    rmse = np.sqrt(metrics.mean_squared_error(test, fcst_pred_v))
    resu.append({'smoothing parameter': i, 'RMSE': rmse})

# DataFrame.append was removed in pandas 2.0, so the frame is built once from
# the collected records instead of being grown row by row.
temp_df = pd.DataFrame(resu, columns=['smoothing parameter', 'RMSE'])
# Show the three best candidates; a bare expression is only displayed in an
# interactive session, so print it explicitly.
# Author's note: this result also looks odd.
print(temp_df.sort_values(by=['RMSE']).head(3))
# Fit the model with the value picked from the grid search.
# NOTE(review): smoothing_level is hard-coded to 0, presumably the best value
# found by the grid search on the author's data - re-check if the data changes.
fitSES = SimpleExpSmoothing(np.asarray(train)).fit(smoothing_level=0, optimized=False)
# The horizon follows the size of the hold-out set instead of repeating the
# literal 3000.
fcst_gs_pred = fitSES.forecast(len(test))
timeseries_evaluation_metrics_func(test, fcst_gs_pred)

# Automated smoothing_level: let statsmodels optimize the parameter itself.
fitSESauto = SimpleExpSmoothing(np.asarray(train)).fit(optimized=True, use_brute=True)
fcst_auto_pred = fitSESauto.forecast(len(test))
timeseries_evaluation_metrics_func(test, fcst_auto_pred)
# Print the summary explicitly; a bare expression shows nothing when the file
# is run as a script.
print(fitSESauto.summary())
# Plot the train/test split together with both forecasts.
import matplotlib.pyplot as plt

# NOTE: `%matplotlib inline` is an IPython magic and a syntax error in a plain
# .py file; run it in a notebook cell beforehand if plots do not render there.

# Label the forecasts with the hold-out period's own index so they are drawn
# on top of `test`. The previous hard-coded range(1229, 1259) supplied only 30
# labels, which cannot be assigned to a forecast of a different length.
forecast_index = test.index.rename("new_index")

# np.asarray keeps the assignment positional even if a forecast ever carries
# its own index.
df_fcst_gs_pred = pd.DataFrame(
    np.asarray(fcst_gs_pred),
    index=forecast_index,
    columns=['Close_grid_Search'],
)
df_fcst_auto_pred = pd.DataFrame(
    np.asarray(fcst_auto_pred),
    index=forecast_index,
    columns=['Close_auto_search'],
)

plt.rcParams["figure.figsize"] = [16, 9]
plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.plot(df_fcst_auto_pred, label='Simple Exponential Smoothing using optimized =True')
plt.plot(df_fcst_gs_pred, label='Simple Exponential Smoothing using custom grid search')
plt.legend(loc='best')
plt.show()
# Double exponential smoothing is used for data with a trend, and triple exponential smoothing for data with both trend and seasonality; I don't think either is present here, so for now they are not implemented.
# Dickey-Fuller test
# Plot the glucose series (X = df['glucose']) before running the test.
X.plot()
def Augmented_Dickey_Fuller_Test_func(series, column_name):
    """Run the augmented Dickey-Fuller test on a series and print the outcome.

    Args:
        series: Values passed to ``adfuller`` (lag order chosen with
            ``autolag='AIC'``).
        column_name: Label used only in the printed header.

    Returns:
        None. The test statistics, critical values and the conclusion at the
        0.05 significance level are written to standard output.
    """
    print(f'Results of Dickey-Fuller Test for column: {column_name}')
    adf_result = adfuller(series, autolag='AIC')

    # The first four entries of the result are the headline statistics; the
    # fifth is a mapping of significance level -> critical value.
    headline_labels = [
        'Test Statistic',
        'p-value',
        'No Lags Used',
        'Number of Observations Used',
    ]
    report = dict(zip(headline_labels, adf_result[0:4]))
    for level, threshold in adf_result[4].items():
        report['Critical Value (%s)' % level] = threshold
    print(pd.Series(report))

    p_value = adf_result[1]
    print("Conclusion:====>")
    if p_value <= 0.05:
        print("Reject the null hypothesis")
        print("Data is stationary")
        return
    print("Fail to reject the null hypothesis")
    print("Data is non-stationary")
# Run the augmented Dickey-Fuller test on the glucose column.
Augmented_Dickey_Fuller_Test_func(df['glucose'],'glucose')
# Author's note: the series is stationary.