
Merge pull request #251 from winedarksea/dev
0.6.16
winedarksea authored Nov 19, 2024
2 parents 6e473e8 + 7e97114 commit 830156b
Showing 70 changed files with 8,026 additions and 981 deletions.
40 changes: 37 additions & 3 deletions README.md
@@ -25,6 +25,7 @@ A combination of metrics and cross-validation options, the ability to apply subs
* [Installation](https://github.com/winedarksea/AutoTS#installation)
* [Basic Use](https://github.com/winedarksea/AutoTS#basic-use)
* [Tips for Speed and Large Data](https://github.com/winedarksea/AutoTS#tips-for-speed-and-large-data)
* [Flowchart](https://github.com/winedarksea/AutoTS#autots-process)
* Extended Tutorial [GitHub](https://github.com/winedarksea/AutoTS/blob/master/extended_tutorial.md) or [Docs](https://winedarksea.github.io/AutoTS/build/html/source/tutorial.html)
* [Production Example](https://github.com/winedarksea/AutoTS/blob/master/production_example.py)

@@ -59,10 +60,10 @@ df = load_daily(long=long)

model = AutoTS(
forecast_length=21,
frequency='infer',
frequency="infer",
prediction_interval=0.9,
ensemble='auto',
model_list="fast", # "superfast", "default", "fast_parallel"
ensemble=None,
model_list="superfast", # "fast", "default", "fast_parallel"
transformer_list="fast", # "superfast",
drop_most_recent=1,
max_generations=4,
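The diff truncates the basic-use snippet here. For orientation, the rest of the README's example fits this model and pulls out the forecast roughly as follows (a sketch of the documented API, not part of this diff):

```python
model = model.fit(
    df,
    date_col="datetime" if long else None,
    value_col="value" if long else None,
    id_col="series_id" if long else None,
)

prediction = model.predict()
forecast = prediction.forecast        # point forecasts: rows are dates, columns are series
upper = prediction.upper_forecast     # bounds at the configured prediction_interval
lower = prediction.lower_forecast
print(model)                          # summary of the best model found
```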
@@ -133,4 +134,37 @@ Also take a look at the [production_example.py](https://github.com/winedarksea/AutoTS/blob/master/production_example.py)
* And, of course, contributing to the codebase directly on GitHub.


## AutoTS Process
```mermaid
flowchart TD
A[Initiate AutoTS Model] --> B[Import Template]
B --> C[Load Data]
C --> D[Split Data Into Initial Train/Test Holdout]
D --> E[Run Initial Template Models]
E --> F[Evaluate Accuracy Metrics on Results]
F --> G[Generate Score from Accuracy Metrics]
G --> H{Max Generations Reached or Timeout?}
H -->|No| I[Evaluate All Previous Templates]
I --> J[Genetic Algorithm Combines Best Results and New Random Parameters into New Template]
J --> K[Run New Template Models and Evaluate]
K --> G
H -->|Yes| L[Select Best Models by Score for Validation Template]
L --> M[Run Validation Template on Additional Holdouts]
M --> N[Evaluate and Score Validation Results]
N --> O{Create Ensembles?}
O -->|Yes| P[Generate Ensembles from Validation Results]
P --> Q[Run Ensembles Through Validation]
Q --> N
O -->|No| R[Export Best Models Template]
R --> S[Select Single Best Model]
S --> T[Generate Future Time Forecast]
T --> U[Visualize Results]
R -->|Import Best Models Template| B
```
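The export/import loop at the bottom of the flowchart is template reuse. A minimal sketch based on the production example (the file name and parameter values here are illustrative):

```python
# after a completed search: save the top validated models for reuse
model.export_template(
    "best_models_template.csv",  # illustrative file name
    models="best",
    n=15,
    max_per_model_class=3,
)

# in a later run: seed the search with the saved template instead of starting cold
model = AutoTS(forecast_length=21, frequency="infer", max_generations=4)
model = model.import_template(
    "best_models_template.csv",
    method="addon",  # "only" restricts the search to just these models
)
model = model.fit(df)
```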

*Also known as Project CATS (Catlin's Automated Time Series) hence the logo.*
39 changes: 28 additions & 11 deletions TODO.md
@@ -13,17 +13,34 @@
* Forecasts are desired for the future immediately following the most recent data.
* trimmed_mean to AverageValueNaive

# 0.6.15 🇺🇦 🇺🇦 🇺🇦
* Constraint transformer added
* historical_growth constraint method added
* fft as multivariate_feature for Cassandra
* None trend_window now searched as part of Cassandra
* "quarterlydayofweek" method added for datepart
* threshold_method arg to AlignLastValue
* general template updated
* slight change to MATSE metric, now only abs values for scaling
* additional args to DatepartRegression
* bug fixes
# 0.6.16 🇺🇦 🇺🇦 🇺🇦
* export_template added focus_models option
* added OneClassSVM and GaussianMixture anomaly model options (see the sketch after this list)
* added plot_unpredictability_score
* added a few more NeuralForecast search options
* bounds_only to Constraint transformer
* updates for deprecated upstream args
* FIRFilter transformer added
* mle and imle downscaled to reduce score imbalance issues with these two in generate_score
* SectionalMotif now more robust to forecast lengths longer than history
* new transformer and metric options for SectionalMotif
* NaN robustness to MATSE
* 'round' option to Constraint
* minor change to mosaic min style ensembles to remove edge case errors
* 'mosaic-profile', 'filtered', 'unpredictability_adjusted' and 'median' style mosaics added
* updated profiler, and improved feature generation for horizontal generalization
* changepoint style trend as an option to GLM and GLS
* added ShiftFirstValue, a minor variation on the PositiveShift transformer
* added BasicLinearModel model
* datepart_method, scale, and fourier encoding to WindowRegression
* trimmed_mean and more date part options to SeasonalityMotif
* some additional options to MultivariateRegression
* added ThetaTransformer
* added TVVAR model (time varying VAR)
* added ChangepointDetrend transformer
* added MeanPercentSplitter transformer
* updated load_daily with more recent history
* added support for passing a custom metric
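For reference on the two new anomaly model options named above, a minimal standalone sketch of how these scikit-learn estimators flag outliers (the underlying idea only, not AutoTS's internal wiring, which this diff does not show):

```python
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
X = rng.normal(0, 1, size=(500, 1))
X[::50] += 8  # inject a few obvious outliers

# OneClassSVM: fit_predict returns -1 for anomalies, 1 for normal points
svm_flags = OneClassSVM(nu=0.05).fit_predict(X)

# GaussianMixture: flag points with low log-likelihood under the fitted mixture
gmm = GaussianMixture(n_components=2, random_state=0).fit(X)
log_likelihood = gmm.score_samples(X)
gmm_flags = np.where(log_likelihood < np.quantile(log_likelihood, 0.05), -1, 1)
```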

### Unstable Upstream Packages (those that are frequently broken by maintainers)
* Pytorch-Forecasting
2 changes: 1 addition & 1 deletion autots/__init__.py
@@ -27,7 +27,7 @@
from autots.models.cassandra import Cassandra


__version__ = '0.6.15'
__version__ = '0.6.16'

TransformTS = GeneralTransformer

96 changes: 85 additions & 11 deletions autots/datasets/_base.py
@@ -12,16 +12,54 @@
def load_daily(long: bool = True):
"""Daily sample data.
```
# most of the wiki data was chosen to show holidays or holiday-like patterns
wiki = [
"Germany", "Thanksgiving", 'all', 'Microsoft',
"Procter_%26_Gamble", "YouTube", "United_States", "Elizabeth_II",
"William_Shakespeare", "Cleopatra", "George_Washington",
"Chinese_New_Year", "Standard_deviation", "Christmas",
"List_of_highest-grossing_films",
"List_of_countries_that_have_gained_independence_from_the_United_Kingdom",
"Periodic_table"
'United_States',
'Germany',
'List_of_highest-grossing_films',
'Jesus',
'Michael_Jackson',
'List_of_United_States_cities_by_population',
'Microsoft_Office',
'Google_Chrome',
'Periodic_table',
'Standard_deviation',
'Easter',
'Christmas',
'Chinese_New_Year',
'Thanksgiving',
'List_of_countries_that_have_gained_independence_from_the_United_Kingdom',
'History_of_the_hamburger',
'Elizabeth_II',
'William_Shakespeare',
'George_Washington',
'Cleopatra',
'all'
]
df2 = load_live_daily(
observation_start="2017-01-01", weather_years=7, trends_list=None,
gov_domain_list=None, wikipedia_pages=wiki,
fred_series=['DGS10', 'T5YIE', 'SP500','DEXUSEU'], sleep_seconds=10,
fred_key = "93873d40f10c20fe6f6e75b1ad0aed4d",
weather_data_types = ["WSF2", "PRCP"],
weather_stations = ["USW00014771"], # looking for intermittent
tickers=None, london_air_stations=None,
weather_event_types=None, earthquake_min_magnitude=None,
)
data_file_name = join("autots", "datasets", 'data', 'holidays.zip')
df2.to_csv(
data_file_name,
index=True,
compression={
'method': 'zip',
'archive_name': 'holidays.csv',
'compresslevel': 9 # Maximum compression level (0-9)
}
)
```
Sources: Wikimedia Foundation
Args:
@@ -224,8 +262,8 @@ def load_live_daily(
tickers: list = ["MSFT"],
trends_list: list = ["forecasting", "cycling", "microsoft"],
trends_geo: str = "US",
weather_data_types: list = ["AWND", "WSF2", "TAVG"],
weather_stations: list = ["USW00094846", "USW00014925"],
weather_data_types: list = ["AWND", "WSF2", "TAVG", "PRCP"],
weather_stations: list = ["USW00094846", "USW00014925", "USW00014771"],
weather_years: int = 5,
london_air_stations: list = ['CT3', 'SK8'],
london_air_species: str = "PM25",
@@ -769,14 +807,42 @@ def load_artificial(long=False, date_start=None, date_end=None):
date_end = date_end.date()
if date_start is None:
if isinstance(date_end, datetime.date):
date_start = date_end - datetime.timedelta(days=720)
date_start = date_end - datetime.timedelta(days=740)
else:
date_start = datetime.datetime.now().date() - datetime.timedelta(days=720)
date_start = datetime.datetime.now().date() - datetime.timedelta(days=740)
if isinstance(date_start, datetime.datetime):
date_start = date_start.date()
dates = pd.date_range(date_start, date_end)
size = dates.size
new_size = int(size / 10)
rng = np.random.default_rng()
holiday = pd.Series(
np.arange(size) * 0.025
+ rng.normal(0, 0.2, size)
+ (np.sin((np.pi / 7) * np.arange(size)) * 0.5),
index=dates,
name='holiday',
)
# January 1st
holiday[(holiday.index.month == 1) & (holiday.index.day == 1)] += 10
# December 25th
holiday[(holiday.index.month == 12) & (holiday.index.day == 25)] += -4
# Second Tuesday of April: a Tuesday whose day-of-month falls in 8-14
second_tuesday_of_april = (
(holiday.index.month == 4)
& (holiday.index.weekday == 1)
& (holiday.index.day >= 8)
& (holiday.index.day <= 14)
)
holiday[second_tuesday_of_april] += 10
# Last Monday of August
last_monday_of_august = (
(holiday.index.month == 8)
& (holiday.index.weekday == 0)
& ((holiday.index + pd.Timedelta(7, unit='D')).month == 9)
)
holiday[last_monday_of_august] += 12

df_wide = pd.DataFrame(
{
@@ -810,6 +876,13 @@ def load_artificial(long=False, date_start=None, date_end=None):
/ 2,
),
"linear": np.arange(size) * 0.025,
"flat": 1,
"new_product": np.concatenate(
[
np.zeros(int(size - new_size)),
np.random.choice(a=[-0.8, 0, 0.8], size=new_size).cumsum(),
]
),
"sine_wave": np.sin(np.arange(size)),
"sine_seasonality_monthweek": (
(np.sin((np.pi / 7) * np.arange(size)) * 0.25 + 0.25)
@@ -902,6 +975,7 @@
},
index=dates,
)
df_wide = df_wide.merge(holiday, left_index=True, right_index=True)

if not long:
return df_wide
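A usage sketch for the loader above, assuming load_artificial is exported from autots.datasets like the other loaders:

```python
from autots.datasets import load_artificial

# wide format: a daily DatetimeIndex (~740 days) with one column per synthetic pattern
df_wide = load_artificial(long=False)
print(df_wide[["linear", "flat", "new_product", "holiday"]].tail())

# long format: one row per (date, series) observation
df_long = load_artificial(long=True)
```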
Binary file modified autots/datasets/data/holidays.zip
39 changes: 31 additions & 8 deletions autots/evaluator/anomaly_detector.py
@@ -147,7 +147,7 @@ def detect(self, df):
self.anomalies[mask_replace] = 1
return self.anomalies, self.scores

def plot(self, series_name=None, title=None, plot_kwargs={}):
def plot(self, series_name=None, title=None, marker_size=None, plot_kwargs={}):
import matplotlib.pyplot as plt

if series_name is None:
@@ -162,7 +162,14 @@ def plot(self, series_name=None, title=None, plot_kwargs={}):
series_anom = self.anomalies[series_name]
i_anom = series_anom[series_anom == -1].index
if len(i_anom) > 0:
ax.scatter(i_anom.tolist(), self.df.loc[i_anom, :][series_name], c="red")
if marker_size is None:
marker_size = max(20, fig.dpi * 0.45)
ax.scatter(
i_anom.tolist(),
self.df.loc[i_anom, :][series_name],
c="red",
s=marker_size,
)

def fit(self, df):
return self.detect(df)
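A usage sketch for the detector as shown in this diff; constructor arguments are left at their defaults since they are not visible here, and the example data is illustrative:

```python
import numpy as np
import pandas as pd
from autots.evaluator.anomaly_detector import AnomalyDetector

# any wide DataFrame: DatetimeIndex rows, one column per series
idx = pd.date_range("2023-01-01", periods=200, freq="D")
df = pd.DataFrame({"sales": np.random.default_rng(1).normal(100, 5, 200)}, index=idx)
df.iloc[50] += 40  # plant an anomaly

detector = AnomalyDetector()                # defaults assumed; options not shown in this diff
anomalies, scores = detector.detect(df)     # per detect() above: -1 flags anomalous points

# marker_size is the new argument; None falls back to max(20, fig.dpi * 0.45)
detector.plot(series_name="sales", marker_size=40)
```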
@@ -230,8 +237,8 @@ def get_new_params(method="random"):

if preforecast or method_choice == "prediction_interval":
forecast_params = random_model(
model_list=['LastValueNaive', 'GLS', 'RRVAR'],
model_prob=[0.8, 0.1, 0.1],
model_list=['LastValueNaive', 'GLS', 'RRVAR', "SeasonalityMotif"],
model_prob=[0.8, 0.1, 0.05, 0.05],
transformer_max_depth=5,
transformer_list="superfast",
keyword_format=True,
@@ -256,8 +263,9 @@ def __init__(
use_wkdeom_holidays=True,
use_lunar_holidays=True,
use_lunar_weekday=False,
use_islamic_holidays=True,
use_hebrew_holidays=True,
use_islamic_holidays=False,
use_hebrew_holidays=False,
use_hindu_holidays=False,
output: str = "multivariate",
n_jobs: int = 1,
):
@@ -292,6 +300,7 @@ def __init__(
self.use_lunar_weekday = use_lunar_weekday
self.use_islamic_holidays = use_islamic_holidays
self.use_hebrew_holidays = use_hebrew_holidays
self.use_hindu_holidays = use_hindu_holidays
self.n_jobs = n_jobs
self.output = output
self.anomaly_model = AnomalyDetector(
@@ -313,6 +322,7 @@ def detect(self, df):
self.lunar_weekday,
self.islamic_holidays,
self.hebrew_holidays,
self.hindu_holidays,
) = anomaly_df_to_holidays(
self.anomaly_model.anomalies,
splash_threshold=self.splash_threshold,
@@ -328,6 +338,7 @@
use_lunar_weekday=self.use_lunar_weekday,
use_islamic_holidays=self.use_islamic_holidays,
use_hebrew_holidays=self.use_hebrew_holidays,
use_hindu_holidays=self.use_hindu_holidays,
)

def plot_anomaly(self, kwargs={}):
@@ -338,6 +349,7 @@ def plot(
series_name=None,
include_anomalies=True,
title=None,
marker_size=None,
plot_kwargs={},
series=None,
):
@@ -355,6 +367,8 @@
)
fig, ax = plt.subplots()
self.df[series_name].plot(ax=ax, title=title, **plot_kwargs)
if marker_size is None:
marker_size = max(20, fig.dpi * 0.45)
if include_anomalies:
# directly copied from above
if self.anomaly_model.output == "univariate":
@@ -366,13 +380,21 @@
i_anom = series_anom[series_anom == -1].index
if len(i_anom) > 0:
ax.scatter(
i_anom.tolist(), self.df.loc[i_anom, :][series_name], c="red"
i_anom.tolist(),
self.df.loc[i_anom, :][series_name],
c="red",
s=marker_size,
)
# now the actual holidays
i_anom = self.dates_to_holidays(self.df.index, style="series_flag")[series_name]
i_anom = i_anom.index[i_anom == 1]
if len(i_anom) > 0:
ax.scatter(i_anom.tolist(), self.df.loc[i_anom, :][series_name], c="green")
ax.scatter(
i_anom.tolist(),
self.df.loc[i_anom, :][series_name],
c="green",
s=marker_size,
)

def dates_to_holidays(self, dates, style="flag", holiday_impacts=False):
"""Populate date information for a given pd.DatetimeIndex.
@@ -400,6 +422,7 @@ def dates_to_holidays(self, dates, style="flag", holiday_impacts=False):
lunar_weekday=self.lunar_weekday,
islamic_holidays=self.islamic_holidays,
hebrew_holidays=self.hebrew_holidays,
hindu_holidays=self.hindu_holidays,
)

def fit(self, df):
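And a usage sketch for HolidayDetector with the new use_hindu_holidays flag (default False, per the __init__ above); other arguments are assumed to keep their defaults:

```python
from autots.evaluator.anomaly_detector import HolidayDetector

hd = HolidayDetector(use_hindu_holidays=True)  # new in this release; defaults to False
hd.detect(df)  # df: wide DataFrame with a DatetimeIndex

# per-series 0/1 flags for detected holiday dates, as used internally by plot()
flags = hd.dates_to_holidays(df.index, style="series_flag")

# green markers show detected holidays, red ones anomalies (marker_size as above)
hd.plot(series_name="sales", include_anomalies=True, marker_size=40)
```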