diff --git a/docs/.buildinfo b/docs/.buildinfo index c05a61b00..bbb1f0912 100644 --- a/docs/.buildinfo +++ b/docs/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 80ad8422a14367dc91c123d767b2edc6 +config: 76e6161839572ede51e212e970f19cd7 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_modules/api/dtype.html b/docs/_modules/api/dtype.html index 0e2f8f029..bf1dc52ce 100644 --- a/docs/_modules/api/dtype.html +++ b/docs/_modules/api/dtype.html @@ -7,7 +7,7 @@ - api.dtype — lightwood 1.6.0 documentation + api.dtype — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ diff --git a/docs/_modules/api/high_level.html b/docs/_modules/api/high_level.html index 9ee0759d7..3ed0dba50 100644 --- a/docs/_modules/api/high_level.html +++ b/docs/_modules/api/high_level.html @@ -7,7 +7,7 @@ - api.high_level — lightwood 1.6.0 documentation + api.high_level — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ diff --git a/docs/_modules/api/json_ai.html b/docs/_modules/api/json_ai.html index 0ab48b724..4616da6b6 100644 --- a/docs/_modules/api/json_ai.html +++ b/docs/_modules/api/json_ai.html @@ -7,7 +7,7 @@ - api.json_ai — lightwood 1.6.0 documentation + api.json_ai — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ diff --git a/docs/_modules/api/predictor.html b/docs/_modules/api/predictor.html index d63e68d21..fd578fcc4 100644 --- a/docs/_modules/api/predictor.html +++ b/docs/_modules/api/predictor.html @@ -7,7 +7,7 @@ - api.predictor — lightwood 1.6.0 documentation + api.predictor — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ diff --git a/docs/_modules/api/types.html b/docs/_modules/api/types.html index 22833c16a..6fe6e3866 100644 --- a/docs/_modules/api/types.html +++ b/docs/_modules/api/types.html @@ -7,7 +7,7 @@ - api.types — lightwood 1.6.0 documentation + api.types — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ @@ -163,7 +169,6 @@

Source code for api.types

 # TODO: type hint the returns
-# TODO: df_std_dev is not clear in behavior; this would imply all std. of each column but that is not true, it should be renamed df_std_target_dev  # noqa
 
 from typing import Dict, List, Optional, Union
 import sys
@@ -323,7 +328,7 @@ 

Source code for api.types

              in the information.
 
     :param nr_rows: Number of rows (samples) in the dataset
-    :param df_std_dev: The standard deviation of the target of the dataset
+    :param df_target_stddev: The standard deviation of the target of the dataset
     :param train_observed_classes:
     :param target_class_distribution:
     :param histograms:
@@ -336,7 +341,7 @@ 

Source code for api.types

     """
 
     nr_rows: int
-    df_std_dev: Optional[float]
+    df_target_stddev: Optional[float]
     train_observed_classes: object  # Union[None, List[str]]
     target_class_distribution: object  # Dict[str, float]
     histograms: object  # Dict[str, Dict[str, List[object]]]
diff --git a/docs/_modules/data/cleaner.html b/docs/_modules/data/cleaner.html
new file mode 100644
index 000000000..169525802
--- /dev/null
+++ b/docs/_modules/data/cleaner.html
@@ -0,0 +1,538 @@
+
+
+
+
+
+  
+  
+  
+  
+  data.cleaner — lightwood 1.6.1 documentation
+  
+
+  
+  
+  
+  
+
+  
+  
+
+  
+  
+
+  
+
+  
+  
+  
+    
+      
+        
+        
+        
+        
+        
+    
+    
+
+    
+    
+     
+
+
+
+
+   
+  
+ + + +
+ + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + +
+ +
    + +
  • »
  • + +
  • Module code »
  • + +
  • data.cleaner
  • + + +
  • + +
  • + +
+ + +
+
+
+
+ +

Source code for data.cleaner

+import re
+from copy import deepcopy
+
+import pandas as pd
+import datetime
+from dateutil.parser import parse as parse_dt
+
+from lightwood.api.dtype import dtype
+from lightwood.helpers import text
+from lightwood.helpers.log import log
+from lightwood.api.types import TimeseriesSettings
+from lightwood.helpers.numeric import is_nan_numeric
+
+from typing import Dict, List, Optional, Tuple, Callable, Union
+
+
+
[docs]def cleaner( + data: pd.DataFrame, + dtype_dict: Dict[str, str], + pct_invalid: float, + identifiers: Dict[str, str], + target: str, + mode: str, + timeseries_settings: TimeseriesSettings, + anomaly_detection: bool, + custom_cleaning_functions: Dict[str, str] = {} +) -> pd.DataFrame: + """ + The cleaner is a function which takes in the raw data, plus additional information about its types and about the problem. Based on this it generates a "clean" representation of the data, where each column has an ideal standardized type and all malformed or otherwise missing or invalid elements are turned into ``None``. + + :param data: The raw data + :param dtype_dict: Type information for each column + :param pct_invalid: How much of each column can be invalid + :param identifiers: A dict containing all identifier typed columns + :param target: The target column + :param mode: Can be "predict" or "train" + :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object + :param anomaly_detection: Are we detecting anomalies with this predictor? + + :returns: The cleaned data + """ # noqa + + data = _remove_columns(data, identifiers, target, mode, timeseries_settings, + anomaly_detection, dtype_dict) + + for col in _get_columns_to_clean(data, dtype_dict, mode, target): + + # Get and apply a cleaning function for each data type + # If you want to customize the cleaner, you will likely want to modify ``get_cleaning_func`` + data[col] = data[col].apply(get_cleaning_func(dtype_dict[col], custom_cleaning_functions)) + + return data
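A minimal usage sketch of the cleaner (hypothetical column names and values; assumes ``TimeseriesSettings(is_timeseries=False)`` is an acceptable default object for non-timeseries data)::

    import pandas as pd
    from lightwood.api.dtype import dtype
    from lightwood.api.types import TimeseriesSettings
    from lightwood.data.cleaner import cleaner

    # Hypothetical raw data: 'age' should be an integer, 'signup' a datetime
    raw = pd.DataFrame({
        'age': ['21', 'thirty', '45'],
        'signup': ['2021-01-05', 'not a date', '2021-03-02'],
    })

    clean = cleaner(
        data=raw,
        dtype_dict={'age': dtype.integer, 'signup': dtype.datetime},
        pct_invalid=100,  # tolerate any share of invalid cells
        identifiers={},   # no identifier columns to drop
        target='age',
        mode='train',
        timeseries_settings=TimeseriesSettings(is_timeseries=False),
        anomaly_detection=False,
    )
    # 'thirty' and 'not a date' become None; valid dates become POSIX timestamps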
+ + +def _check_if_invalid(new_data: pd.Series, pct_invalid: float, col_name: str): + """ + Checks how many invalid data points there are. Invalid data points are flagged as "Nones" from the cleaning process (see data/cleaner.py for default). + If there are too many invalid data points (specified by `pct_invalid`), then an exception is raised. This is used as a safeguard for very messy data. + + :param new_data: data to check for invalid values. + :param pct_invalid: maximum percentage of invalid values. If this threshold is surpassed, an exception is raised. + :param col_name: name of the column to analyze. + + """ # noqa + + chk_invalid = ( + 100 + * (len(new_data) - len([x for x in new_data if x is not None])) + / len(new_data) + ) + + if chk_invalid > pct_invalid: + err = f'Too many ({chk_invalid}%) invalid values in column {col_name}' + log.error(err) + raise Exception(err) + + +
[docs]def get_cleaning_func(data_dtype: dtype, custom_cleaning_functions: Dict[str, str]) -> Callable: + """ + For the provided data type, provide the appropriate cleaning function. Below are the defaults, users can either override this function OR impose a custom block. + + :param data_dtype: The data-type (inferred from a column) as prescribed from ``api.dtype`` + + :returns: The appropriate function that will pre-process (clean) data of specified dtype. + """ # noqa + if data_dtype in custom_cleaning_functions: + clean_func = eval(custom_cleaning_functions[data_dtype]) + + elif data_dtype in (dtype.date, dtype.datetime): + clean_func = _standardize_datetime + + elif data_dtype in (dtype.float, dtype.tsarray): + clean_func = _clean_float + + elif data_dtype in (dtype.integer,): + clean_func = _clean_int + + elif data_dtype in (dtype.array,): + clean_func = _standardize_array + + elif data_dtype in (dtype.tags,): + clean_func = _tags_to_tuples + + elif data_dtype in (dtype.quantity,): + clean_func = _clean_quantity + + elif data_dtype in ( + dtype.short_text, + dtype.rich_text, + dtype.categorical, + dtype.binary, + dtype.audio, + dtype.image, + dtype.video + ): + clean_func = _clean_text + + else: + raise ValueError(f"{data_dtype} is not supported. Check lightwood.api.dtype") + + return clean_func
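Because ``custom_cleaning_functions`` maps a dtype name to a source string that gets ``eval``-ed, a custom rule can be swapped in without touching the defaults. A hedged sketch (the rule below is illustrative, not part of lightwood)::

    from lightwood.api.dtype import dtype
    from lightwood.data.cleaner import get_cleaning_func

    # Hypothetical override: treat negative integers as invalid
    custom = {
        dtype.integer: "lambda x: int(x) if str(x).lstrip('-').isdigit() and int(x) >= 0 else None"
    }

    clean_int = get_cleaning_func(dtype.integer, custom)
    assert clean_int('7') == 7
    assert clean_int('-3') is None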
+ + +# ------------------------- # +# Temporal Cleaning +# ------------------------- # + + +def _standardize_datetime(element: object) -> Optional[float]: + """ + Parses an expected date-time element. Intakes an element that can in theory be anything. + """ + try: + date = parse_dt(str(element)) + except Exception: + try: + date = datetime.datetime.utcfromtimestamp(element) + except Exception: + return None + + return date.timestamp() + + +# ------------------------- # +# Tags/Sequences +# ------------------------- # + +# TODO Make it split on something other than commas +def _tags_to_tuples(tags_str: str) -> Tuple[str]: + """ + Converts comma-separated values into a tuple to preserve a sequence/array. + + Ex: + >> x = 'apples, oranges, bananas' + >> _tags_to_tuples(x) + >> ('apples', 'oranges', 'bananas') + """ + try: + return tuple([x.strip() for x in tags_str.split(",")]) + except Exception: + return tuple() + + +def _standardize_array(element: object) -> Optional[Union[List[float], float]]: + """ + Given an array of numbers in the form ``[1, 2, 3, 4]``, converts into a numerical sequence. + + :param element: An array-like element in a sequence + :returns: standardized array OR scalar number IF edge case + + Ex of edge case: + >> element = [1] + >> _standardize_array(element) + >> 1 + """ + try: + element = str(element) + element = element.rstrip("]").lstrip("[") + element = element.rstrip(" ").lstrip(" ") + element = element.replace(", ", " ").replace(",", " ") + # Handles cases where arrays are numbers + if " " not in element: + element = _clean_float(element) + else: + element = [float(x) for x in element.split(" ")] + except Exception: + pass + + return element + + +# ------------------------- # +# Integers/Floats/Quantities +# ------------------------- # + +def _clean_float(element: object) -> Optional[float]: + """ + Given an element, converts it into float numeric format. If element is NaN, or inf, then returns None. + """ + try: + cleaned_float = text.clean_float(element) + if is_nan_numeric(cleaned_float): + return None + return cleaned_float + except Exception: + return None + + +def _clean_int(element: object) -> Optional[int]: + """ + Given an element, converts it into integer numeric format. If element is NaN, or inf, then returns None. + """ + element = _clean_float(element) + if element is not None: + element = int(element) + return element + + +def _clean_quantity(element: object) -> Optional[float]: + """ + Given a quantity, clean and convert it into float numeric format. If element is NaN, or inf, then returns None. + """ + element = float(re.sub("[^0-9.,]", "", str(element)).replace(",", ".")) + return _clean_float(element) + + +# ------------------------- # +# Text +# ------------------------- # +def _clean_text(element: object) -> str: + return str(element) + + +# ------------------------- # +# Other helpers +# ------------------------- # +def _rm_rows_w_empty_targets(df: pd.DataFrame, target: str) -> pd.DataFrame: + """ + Drop any rows that have targets as unknown. Targets are necessary to train. 
+ + :param df: The input dataframe including the target value + :param target: the column name that is the output target variable + + :returns: Data with rows missing the target value removed + """ + # Compare length before/after + len_before = len(df) + + # Use Pandas ``dropna`` to omit any rows with missing values for targets; these cannot be trained + df = df.dropna(subset=[target]) + + # Compare length after dropping + len_after = len(df) + nr_removed = len_before - len_after + + if nr_removed != 0: + log.warning( + f"Removed {nr_removed} rows because target was missing. Training on these rows is not possible." + ) # noqa + + return df + + +def _remove_columns(data: pd.DataFrame, identifiers: Dict[str, object], target: str, + mode: str, timeseries_settings: TimeseriesSettings, anomaly_detection: bool, + dtype_dict: Dict[str, dtype]) -> pd.DataFrame: + """ + Drop columns we don't want to use in order to train or predict + + :param data: The raw data + :param dtype_dict: Type information for each column + :param identifiers: A dict containing all identifier typed columns + :param target: The target column + :param mode: Can be "predict" or "train" + :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object + :param anomaly_detection: Are we detecting anomalies with this predictor? + + :returns: A (new) dataframe without the dropped columns + """ # noqa + data = deepcopy(data) + to_drop = [*[x for x in identifiers.keys() if x != target], + *[x for x in data.columns if x in dtype_dict and dtype_dict[x] == dtype.invalid]] + exceptions = ["__mdb_make_predictions"] + to_drop = [x for x in to_drop if x in data.columns] + data = data.drop(columns=to_drop) + + if mode == "train": + data = _rm_rows_w_empty_targets(data, target) + if mode == "predict": + if ( + target in data.columns + and not timeseries_settings.use_previous_target + and not anomaly_detection + ): + data = data.drop(columns=[target]) + + # Drop extra columns + for name in list(data.columns): + if name not in dtype_dict and name not in exceptions: + data = data.drop(columns=[name]) + + return data + + +def _get_columns_to_clean(data: pd.DataFrame, dtype_dict: Dict[str, dtype], mode: str, target: str) -> List[str]: + """ + :param data: The raw data + :param dtype_dict: Type information for each column + :param target: The target column + :param mode: Can be "predict" or "train" + + :returns: A list of columns that we want to clean + """ # noqa + + cleanable_columns = [] + for name, _ in dtype_dict.items(): + if mode == "predict": + if name == target: + continue + if name in data.columns: + cleanable_columns.append(name) + return cleanable_columns +
+ +
+ +
+
+ +
+ +
+

+ © Copyright 2017-2021, MindsDB. + +

+
+ + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
+
+
+ +
+ +
+ + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/index.html b/docs/_modules/index.html index c271bfa9e..811540e75 100644 --- a/docs/_modules/index.html +++ b/docs/_modules/index.html @@ -7,7 +7,7 @@ - Overview: module code — lightwood 1.6.0 documentation + Overview: module code — lightwood 1.6.1 documentation @@ -67,7 +67,7 @@
- 1.6.0 + 1.6.1
@@ -95,7 +95,13 @@ @@ -165,6 +171,46 @@

All modules for which code is available

  • api.json_ai
  • api.predictor
  • api.types
  • +
  • data.cleaner
  • +
  • lightwood.analysis.analyze
  • +
  • lightwood.analysis.base
  • +
  • lightwood.analysis.explain
  • +
  • lightwood.analysis.helpers.acc_stats
  • +
  • lightwood.analysis.helpers.feature_importance
  • +
  • lightwood.analysis.nc.calibrate
  • +
  • lightwood.data.cleaner
  • +
  • lightwood.data.encoded_ds
  • +
  • lightwood.data.splitter
  • +
  • lightwood.data.timeseries_analyzer
  • +
  • lightwood.data.timeseries_transform
  • +
  • lightwood.encoder.array.array
  • +
  • lightwood.encoder.base
  • +
  • lightwood.encoder.categorical.autoencoder
  • +
  • lightwood.encoder.categorical.binary
  • +
  • lightwood.encoder.categorical.multihot
  • +
  • lightwood.encoder.categorical.onehot
  • +
  • lightwood.encoder.datetime.datetime
  • +
  • lightwood.encoder.datetime.datetime_sin_normalizer
  • +
  • lightwood.encoder.image.img_2_vec
  • +
  • lightwood.encoder.numeric.numeric
  • +
  • lightwood.encoder.numeric.ts_array_numeric
  • +
  • lightwood.encoder.numeric.ts_numeric
  • +
  • lightwood.encoder.text.pretrained
  • +
  • lightwood.encoder.text.short
  • +
  • lightwood.encoder.text.vocab
  • +
  • lightwood.encoder.time_series.rnn
  • +
  • lightwood.ensemble.base
  • +
  • lightwood.ensemble.best_of
  • +
  • lightwood.ensemble.mean_ensemble
  • +
  • lightwood.ensemble.mode_ensemble
  • +
  • lightwood.ensemble.weighted_mean_ensemble
  • +
  • lightwood.mixer.base
  • +
  • lightwood.mixer.lightgbm
  • +
  • lightwood.mixer.lightgbm_array
  • +
  • lightwood.mixer.neural
  • +
  • lightwood.mixer.regression
  • +
  • lightwood.mixer.sktime
  • +
  • lightwood.mixer.unit
  • diff --git a/docs/_modules/lightwood/analysis/analyze.html b/docs/_modules/lightwood/analysis/analyze.html new file mode 100644 index 000000000..65168f112 --- /dev/null +++ b/docs/_modules/lightwood/analysis/analyze.html @@ -0,0 +1,314 @@ + + + + + + + + + + lightwood.analysis.analyze — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.analyze
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.analyze

    +from typing import Dict, List, Tuple, Optional
    +
    +from lightwood.helpers.log import log
    +from lightwood.api import dtype
    +from lightwood.ensemble import BaseEnsemble
    +from lightwood.analysis.base import BaseAnalysisBlock
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.encoder.text.pretrained import PretrainedLangEncoder
    +from lightwood.api.types import ModelAnalysis, StatisticalAnalysis, TimeseriesSettings, PredictionArguments
    +
    +
    +
[docs]def model_analyzer( + predictor: BaseEnsemble, + data: EncodedDs, + train_data: EncodedDs, + stats_info: StatisticalAnalysis, + target: str, + ts_cfg: TimeseriesSettings, + dtype_dict: Dict[str, str], + accuracy_functions, + analysis_blocks: Optional[List[BaseAnalysisBlock]] = [] +) -> Tuple[ModelAnalysis, Dict[str, object]]: + """ + Analyses the model on a validation subset to evaluate accuracy, estimate feature importance and generate a + calibration model to estimate confidence in future predictions. + + Additionally, any user-specified analysis blocks (see class `BaseAnalysisBlock`) are also called here. + + :return: + runtime_analyzer: This dictionary object gets populated in a sequential fashion with data generated from + any `.analyze()` block call. This dictionary object is stored in the predictor itself, and used when + calling the `.explain()` method of all analysis blocks when generating predictions. + + model_analysis: `ModelAnalysis` object that contains core analysis metrics, not necessarily needed when predicting. + """ + + runtime_analyzer = {} + data_type = dtype_dict[target] + + # retrieve encoded data representations + encoded_train_data = train_data + encoded_val_data = data + data = encoded_val_data.data_frame + input_cols = list([col for col in data.columns if col != target]) + + # predictive task + is_numerical = data_type in (dtype.integer, dtype.float, dtype.array, dtype.tsarray, dtype.quantity) + is_classification = data_type in (dtype.categorical, dtype.binary) + is_multi_ts = ts_cfg.is_timeseries and ts_cfg.nr_predictions > 1 + has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder) + for enc in encoded_train_data.encoders.values()]) + + # raw predictions for validation dataset + args = {} if not is_classification else {"predict_proba": True} + normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args)) + normal_predictions = normal_predictions.set_index(data.index) + + # ------------------------- # + # Run analysis blocks, both core and user-defined + # ------------------------- # + kwargs = { + 'predictor': predictor, + 'target': target, + 'input_cols': input_cols, + 'dtype_dict': dtype_dict, + 'normal_predictions': normal_predictions, + 'data': data, + 'train_data': train_data, + 'encoded_val_data': encoded_val_data, + 'is_classification': is_classification, + 'is_numerical': is_numerical, + 'is_multi_ts': is_multi_ts, + 'stats_info': stats_info, + 'ts_cfg': ts_cfg, + 'accuracy_functions': accuracy_functions, + 'has_pretrained_text_enc': has_pretrained_text_enc + } + + for block in analysis_blocks: + log.info("The block %s is now running its analyze() method", block.__class__.__name__) + runtime_analyzer = block.analyze(runtime_analyzer, **kwargs) + + # ------------------------- # + # Populate ModelAnalysis object + # ------------------------- # + model_analysis = ModelAnalysis( + accuracies=runtime_analyzer.get('score_dict', {}), + accuracy_histogram=runtime_analyzer.get('acc_histogram', {}), + accuracy_samples=runtime_analyzer.get('acc_samples', {}), + train_sample_size=len(encoded_train_data), + test_sample_size=len(encoded_val_data), + confusion_matrix=runtime_analyzer['cm'], + column_importances=runtime_analyzer.get('column_importances', {}), + histograms=stats_info.histograms, + dtypes=dtype_dict + ) + + return model_analysis, runtime_analyzer
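In practice ``model_analyzer`` runs automatically during a predictor's ``learn()`` phase. A hedged sketch of inspecting its outputs afterwards (assumes the generated predictor exposes them as ``model_analysis`` and ``runtime_analyzer`` attributes; the CSV path and target name are placeholders)::

    import pandas as pd
    from lightwood.api.high_level import predictor_from_problem
    from lightwood.api.types import ProblemDefinition

    df = pd.read_csv('my_data.csv')  # hypothetical dataset
    predictor = predictor_from_problem(df, ProblemDefinition.from_dict({'target': 'label'}))
    predictor.learn(df)

    # model_analyzer populated these during learn()
    print(predictor.model_analysis.accuracies)      # e.g. {'r2_score': 0.87}
    print(list(predictor.runtime_analyzer.keys()))  # keys added by each analysis block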
    +
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/analysis/base.html b/docs/_modules/lightwood/analysis/base.html new file mode 100644 index 000000000..df163faf2 --- /dev/null +++ b/docs/_modules/lightwood/analysis/base.html @@ -0,0 +1,262 @@ + + + + + + + + + + lightwood.analysis.base — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.base
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.base

    +from typing import Tuple, Dict, Optional
    +
    +import pandas as pd
    +from lightwood.helpers.log import log
    +
    +
    +
    [docs]class BaseAnalysisBlock: + """Class to be inherited by any analysis/explainer block.""" + def __init__(self, + deps: Optional[Tuple] = () + ): + + self.dependencies = deps # can be parallelized when there are no dependencies @TODO enforce + +
    [docs] def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + """ + This method should be called once during the analysis phase, or not called at all. + It computes any information that the block may either output to the model analysis object, + or use at inference time when `.explain()` is called (in this case, make sure all needed + objects are added to the runtime analyzer so that `.explain()` can access them). + + :param info: Dictionary where any new information or objects are added. The next analysis block will use + the output of the previous block as a starting point. + :param kwargs: Dictionary with named variables from either the core analysis or the rest of the prediction + pipeline. + """ + log.info(f"{self.__class__.__name__}.analyze() has not been implemented, no modifications will be done to the model analysis.") # noqa + return info
    + +
    [docs] def explain(self, + row_insights: pd.DataFrame, + global_insights: Dict[str, object], **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]: + """ + This method should be called once during the explaining phase at inference time, or not called at all. + Additional explanations can be at an instance level (row-wise) or global. + For the former, return a data frame with any new insights. For the latter, a dictionary is required. + + :param row_insights: dataframe with previously computed row-level explanations. + :param global_insights: dict() with any explanations that concern all predicted instances or the model itself. + + :returns: + - row_insights: modified input dataframe with any new row insights added here. + - global_insights: dict() with any explanations that concern all predicted instances or the model itself. + """ + log.info(f"{self.__class__.__name__}.explain() has not been implemented, no modifications will be done to the data insights.") # noqa + return row_insights, global_insights
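A custom block only needs to subclass ``BaseAnalysisBlock`` and override one or both hooks. A minimal sketch (the block and the ``n_val_rows`` key are invented for illustration; ``kwargs['data']`` and ``kwargs['analysis']`` are among the named variables passed in by the analysis and explain phases, respectively)::

    from typing import Dict, Tuple
    import pandas as pd
    from lightwood.analysis.base import BaseAnalysisBlock

    class RowCounter(BaseAnalysisBlock):
        """Toy block: records the validation size, then echoes it at inference time."""

        def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
            info['n_val_rows'] = len(kwargs['data'])  # stash for explain()
            return info

        def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object],
                    **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
            global_insights['n_val_rows'] = kwargs['analysis'].get('n_val_rows')
            return row_insights, global_insights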
    +
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/analysis/explain.html b/docs/_modules/lightwood/analysis/explain.html new file mode 100644 index 000000000..33e74a5b1 --- /dev/null +++ b/docs/_modules/lightwood/analysis/explain.html @@ -0,0 +1,308 @@ + + + + + + + + + + lightwood.analysis.explain — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.explain
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.explain

    +from typing import Optional, List, Dict
    +import torch
    +import pandas as pd
    +
    +from lightwood.helpers.log import log
    +from lightwood.api.types import TimeseriesSettings
    +from lightwood.helpers.ts import get_inferred_timestamps
    +from lightwood.analysis.base import BaseAnalysisBlock
    +
    +
    +
    [docs]def explain(data: pd.DataFrame, + encoded_data: torch.Tensor, + predictions: pd.DataFrame, + timeseries_settings: TimeseriesSettings, + analysis: Dict, + target_name: str, + target_dtype: str, + + positive_domain: bool, # @TODO: pass inside a {} with params for each block to avoid signature overload + fixed_confidence: float, + anomaly_detection: bool, + + # forces specific confidence level in ICP + anomaly_error_rate: float, + + # ignores anomaly detection for N steps after an + # initial anomaly triggers the cooldown period; + # implicitly assumes series are regularly spaced + anomaly_cooldown: int, + + explainer_blocks: Optional[List[BaseAnalysisBlock]] = [], + ts_analysis: Optional[Dict] = {} + ): + """ + This procedure runs at the end of every normal `.predict()` call. Its goal is to generate prediction insights, + potentially using information generated at the model analysis stage (e.g. confidence estimation). + + As in `analysis()`, any user-specified analysis blocks (see class `BaseAnalysisBlock`) are also called here. + + :return: + row_insights: a DataFrame containing predictions and all generated insights at a row-level. + """ + + # ------------------------- # + # Setup base insights + # ------------------------- # + data = data.reset_index(drop=True) + + row_insights = pd.DataFrame() + global_insights = {} + row_insights['prediction'] = predictions['prediction'] + + if target_name in data.columns: + row_insights['truth'] = data[target_name] + else: + row_insights['truth'] = [None] * len(predictions['prediction']) + + if timeseries_settings.is_timeseries: + if timeseries_settings.group_by: + for col in timeseries_settings.group_by: + row_insights[f'group_{col}'] = data[col] + + for col in timeseries_settings.order_by: + row_insights[f'order_{col}'] = data[col] + + for col in timeseries_settings.order_by: + row_insights[f'order_{col}'] = get_inferred_timestamps( + row_insights, col, ts_analysis['deltas'], timeseries_settings) + + kwargs = { + 'data': data, + 'encoded_data': encoded_data, + 'predictions': predictions, + 'analysis': analysis, + 'target_name': target_name, + 'target_dtype': target_dtype, + 'tss': timeseries_settings, + 'positive_domain': positive_domain, + 'fixed_confidence': fixed_confidence, + 'anomaly_detection': anomaly_detection, + 'anomaly_error_rate': anomaly_error_rate, + 'anomaly_cooldown': anomaly_cooldown + } + + # ------------------------- # + # Call explanation blocks + # ------------------------- # + for block in explainer_blocks: + log.info("The block %s is now running its explain() method", block.__class__.__name__) + row_insights, global_insights = block.explain(row_insights, global_insights, **kwargs) + + return row_insights, global_insights
    +
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/analysis/helpers/acc_stats.html b/docs/_modules/lightwood/analysis/helpers/acc_stats.html new file mode 100644 index 000000000..676a39c1b --- /dev/null +++ b/docs/_modules/lightwood/analysis/helpers/acc_stats.html @@ -0,0 +1,402 @@ + + + + + + + + + + lightwood.analysis.helpers.acc_stats — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.helpers.acc_stats
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.helpers.acc_stats

    +import random
    +from types import SimpleNamespace
    +from typing import Dict, Optional
    +
    +import numpy as np
    +from sklearn.metrics import confusion_matrix
    +
    +from lightwood.api.dtype import dtype
    +from lightwood.analysis.base import BaseAnalysisBlock
    +from lightwood.helpers.general import evaluate_accuracy
    +
    +
    +
    [docs]class AccStats(BaseAnalysisBlock): + """ Computes accuracy stats and a confusion matrix for the validation dataset """ + + def __init__(self, deps=('ICP',)): + super().__init__(deps=deps) # @TODO: enforce that this actually prevents early execution somehow + +
    [docs] def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + # @TODO: maybe pass ts_analysis to trigger group-wise MASE instead of R2 mean, though it wouldn't be 0-1 bounded + info['score_dict'] = evaluate_accuracy(ns.data, ns.normal_predictions['prediction'], + ns.target, ns.accuracy_functions) + info['normal_accuracy'] = np.mean(list(info['score_dict'].values())) + + self.fit(ns, info['result_df']) + info['val_overall_acc'], info['acc_histogram'], info['cm'], info['acc_samples'] = self.get_accuracy_stats() + return info
    + + def fit(self, ns: SimpleNamespace, conf=Optional[np.ndarray]): + self.col_stats = ns.dtype_dict + self.target = ns.target + self.input_cols = list(ns.dtype_dict.keys()) + self.buckets = ns.stats_info.buckets if ns.stats_info.buckets else {} + + self.normal_predictions_bucketized = [] + self.real_values_bucketized = [] + self.numerical_samples_arr = [] + + column_indexes = {} + for i, col in enumerate(self.input_cols): + column_indexes[col] = i + + real_present_inputs_arr = [] + for _, row in ns.data.iterrows(): + present_inputs = [1] * len(self.input_cols) + for i, col in enumerate(self.input_cols): + if str(row[col]) in ('None', 'nan', '', 'Nan', 'NAN', 'NaN'): + present_inputs[i] = 0 + real_present_inputs_arr.append(present_inputs) + + for n in range(len(ns.normal_predictions)): + row = ns.data.iloc[n] + real_value = row[self.target] + predicted_value = ns.normal_predictions.iloc[n]['prediction'] + + if isinstance(predicted_value, list): + # T+N time series, for now we compare the T+1 prediction only @TODO: generalize + predicted_value = predicted_value[0] + + predicted_value = predicted_value \ + if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ + else float(predicted_value) + + real_value = real_value \ + if self.col_stats[self.target] not in [dtype.integer, dtype.float, dtype.quantity] \ + else float(real_value) + + if self.buckets: + bucket = self.buckets[self.target] + predicted_value_b = get_value_bucket(predicted_value, bucket, self.col_stats[self.target]) + real_value_b = get_value_bucket(real_value, bucket, self.col_stats[self.target]) + else: + predicted_value_b = predicted_value + real_value_b = real_value + + if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: + predicted_range = conf.iloc[n][['lower', 'upper']].tolist() + else: + predicted_range = (predicted_value_b, predicted_value_b) + + self.real_values_bucketized.append(real_value_b) + self.normal_predictions_bucketized.append(predicted_value_b) + if conf is not None and self.col_stats[self.target] in [dtype.integer, dtype.float, dtype.quantity]: + self.numerical_samples_arr.append((real_value, predicted_range)) + + def get_accuracy_stats(self, is_classification=None, is_numerical=None): + bucket_accuracy = {} + bucket_acc_counts = {} + for i, bucket in enumerate(self.normal_predictions_bucketized): + if bucket not in bucket_acc_counts: + bucket_acc_counts[bucket] = [] + + if len(self.numerical_samples_arr) != 0: + bucket_acc_counts[bucket].append(self.numerical_samples_arr[i][1][0] < + self.numerical_samples_arr[i][0] < self.numerical_samples_arr[i][1][1]) # noqa + else: + bucket_acc_counts[bucket].append(1 if bucket == self.real_values_bucketized[i] else 0) + + for bucket in bucket_acc_counts: + bucket_accuracy[bucket] = sum(bucket_acc_counts[bucket]) / len(bucket_acc_counts[bucket]) + + accuracy_count = [] + for counts in list(bucket_acc_counts.values()): + accuracy_count += counts + + overall_accuracy = sum(accuracy_count) / len(accuracy_count) + + for bucket in range(len(self.buckets)): + if bucket not in bucket_accuracy: + if bucket in self.real_values_bucketized: + # If it was never predicted, but it did exist as a real value, then assume 0% confidence when it does get predicted # noqa + bucket_accuracy[bucket] = 0 + + for bucket in range(len(self.buckets)): + if bucket not in bucket_accuracy: + # If it wasn't seen either in the real values or in the predicted values, assume average confidence (maybe should be 0 instead ?) 
# noqa + bucket_accuracy[bucket] = overall_accuracy + + accuracy_histogram = { + 'buckets': list(bucket_accuracy.keys()), + 'accuracies': list(bucket_accuracy.values()), + 'is_classification': is_classification, + 'is_numerical': is_numerical + } + + labels = list(set([*self.real_values_bucketized, *self.normal_predictions_bucketized])) + matrix = confusion_matrix(self.real_values_bucketized, self.normal_predictions_bucketized, labels=labels) + matrix = [[int(y) if str(y) != 'nan' else 0 for y in x] for x in matrix] + + target_bucket = self.buckets[self.target] + bucket_values = [target_bucket[i] if i < len(target_bucket) else None for i in labels] + + cm = { + 'matrix': matrix, + 'predicted': bucket_values, + 'real': bucket_values + } + + accuracy_samples = None + if len(self.numerical_samples_arr) > 0: + nr_samples = min(400, len(self.numerical_samples_arr)) + sampled_numerical_samples_arr = random.sample(self.numerical_samples_arr, nr_samples) + accuracy_samples = { + 'y': [x[0] for x in sampled_numerical_samples_arr], + 'x': [x[1] for x in sampled_numerical_samples_arr] + } + + return overall_accuracy, accuracy_histogram, cm, accuracy_samples
    + + +def get_value_bucket(value, buckets, target_dtype): + """ + :return: The bucket in the `histogram` in which our `value` falls + """ + if buckets is None: + return None + + if target_dtype in (dtype.binary, dtype.categorical): + if value in buckets: + bucket = buckets.index(value) + else: + bucket = len(buckets) # for null values + + elif target_dtype in (dtype.integer, dtype.float, dtype.quantity): + bucket = closest(buckets, value) + else: + bucket = len(buckets) # for null values + + return bucket + + +def closest(arr, value): + """ + :return: The index of the member of `arr` which is closest to `value` + """ + if value is None: + return -1 + + for i, ele in enumerate(arr): + value = float(str(value).replace(',', '.')) + if ele > value: + return i - 1 + + return len(arr) - 1 +
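For intuition on the two helpers above, a small example with made-up numeric buckets (both functions are module-level in ``lightwood.analysis.helpers.acc_stats``)::

    from lightwood.api.dtype import dtype
    from lightwood.analysis.helpers.acc_stats import closest, get_value_bucket

    buckets = [0, 10, 20, 30]  # hypothetical histogram edges for a numeric target

    # closest() returns the index of the last edge not exceeding the value
    assert closest(buckets, 12.5) == 1   # falls in the [10, 20) bucket
    assert closest(buckets, 95) == 3     # past the last edge -> last bucket
    assert closest(buckets, None) == -1  # sentinel for missing values

    # get_value_bucket() dispatches on dtype; numeric targets use closest()
    assert get_value_bucket(12.5, buckets, dtype.float) == 1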
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/analysis/helpers/feature_importance.html b/docs/_modules/lightwood/analysis/helpers/feature_importance.html new file mode 100644 index 000000000..743e8869c --- /dev/null +++ b/docs/_modules/lightwood/analysis/helpers/feature_importance.html @@ -0,0 +1,287 @@ + + + + + + + + + + lightwood.analysis.helpers.feature_importance — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.helpers.feature_importance
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.helpers.feature_importance

    +from copy import deepcopy
    +from types import SimpleNamespace
    +from typing import Dict
    +
    +import torch
    +import numpy as np
    +
    +from lightwood.analysis.base import BaseAnalysisBlock
    +from lightwood.helpers.general import evaluate_accuracy
    +from lightwood.analysis.nc.util import t_softmax
    +from lightwood.api.types import PredictionArguments
    +
    +
    +
[docs]class GlobalFeatureImportance(BaseAnalysisBlock): + """ + Analysis block that estimates column importance with a variant of the LOCO (leave-one-covariate-out) algorithm. + + Roughly speaking, the procedure: + - iterates over all input columns + - if the input column is optional, makes a prediction with its values set to None + - compares this accuracy with the accuracy obtained using all data + - passes all accuracy differences through a softmax and reports them as estimated column importance scores + + Note that, crucially, this method does not refit the predictor at any point. + + Reference: + https://compstat-lmu.github.io/iml_methods_limitations/pfi.html + """ + def __init__(self, disable_column_importance): + super().__init__() + self.disable_column_importance = disable_column_importance + +
    [docs] def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + if self.disable_column_importance or ns.ts_cfg.is_timeseries or ns.has_pretrained_text_enc: + info['column_importances'] = None + else: + empty_input_accuracy = {} + ignorable_input_cols = [x for x in ns.input_cols if (not ns.ts_cfg.is_timeseries or + (x not in ns.ts_cfg.order_by and + x not in ns.ts_cfg.historical_columns))] + for col in ignorable_input_cols: + partial_data = deepcopy(ns.encoded_val_data) + partial_data.clear_cache() + partial_data.data_frame[col] = [None] * len(partial_data.data_frame[col]) + + args = {'predict_proba': True} if ns.is_classification else {} + empty_input_preds = ns.predictor(partial_data, args=PredictionArguments.from_dict(args)) + + empty_input_accuracy[col] = np.mean(list(evaluate_accuracy( + ns.data, + empty_input_preds['prediction'], + ns.target, + ns.accuracy_functions + ).values())) + + column_importances = {} + acc_increases = [] + for col in ignorable_input_cols: + accuracy_increase = (info['normal_accuracy'] - empty_input_accuracy[col]) + acc_increases.append(accuracy_increase) + + # low 0.2 temperature to accentuate differences + acc_increases = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0] + for col, inc in zip(ignorable_input_cols, acc_increases): + column_importances[col] = inc # scores go from 0 to 1 + + info['column_importances'] = column_importances + + return info
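The final normalisation step above turns raw accuracy drops into 0-1 importance scores. A standalone sketch of just that step, with invented accuracy numbers (only ``t_softmax`` is reused from the imports above)::

    import torch
    from lightwood.analysis.nc.util import t_softmax

    # Hypothetical accuracy increase lost when each column is blanked out
    acc_increases = [0.30, 0.05, 0.01]  # bigger drop -> more important column

    scores = t_softmax(torch.Tensor([acc_increases]), t=0.2).tolist()[0]
    importances = dict(zip(['col_a', 'col_b', 'col_c'], scores))
    # the low temperature (t=0.2) accentuates differences, so col_a dominates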
    +
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/analysis/nc/calibrate.html b/docs/_modules/lightwood/analysis/nc/calibrate.html new file mode 100644 index 000000000..0225f0603 --- /dev/null +++ b/docs/_modules/lightwood/analysis/nc/calibrate.html @@ -0,0 +1,598 @@ + + + + + + + + + + lightwood.analysis.nc.calibrate — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + + + +
    + +
      + +
    • »
    • + +
    • Module code »
    • + +
    • lightwood.analysis.nc.calibrate
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for lightwood.analysis.nc.calibrate

    +from copy import deepcopy
    +from itertools import product
    +from typing import Dict, Tuple
    +from types import SimpleNamespace
    +
    +import numpy as np
    +import pandas as pd
    +from sklearn.preprocessing import OneHotEncoder
    +
    +from lightwood.api.dtype import dtype
    +from lightwood.helpers.ts import add_tn_conf_bounds
    +
    +from lightwood.analysis.base import BaseAnalysisBlock
    +from lightwood.analysis.nc.norm import Normalizer
    +from lightwood.analysis.nc.icp import IcpRegressor, IcpClassifier
    +from lightwood.analysis.nc.base import CachedRegressorAdapter, CachedClassifierAdapter
    +from lightwood.analysis.nc.nc import BoostedAbsErrorErrFunc, RegressorNc, ClassifierNc, MarginErrFunc
    +from lightwood.analysis.nc.util import clean_df, set_conf_range, get_numeric_conf_range, \
    +    get_categorical_conf, get_anomalies
    +
    +
    +
    [docs]class ICP(BaseAnalysisBlock): + """ Confidence estimation block, uses inductive conformal predictors (ICPs) for model agnosticity """ + + def __init__(self, + fixed_significance: float, + positive_domain: bool, + confidence_normalizer: bool + ): + super().__init__() + self.fixed_significance = fixed_significance + self.positive_domain = positive_domain + self.confidence_normalizer = confidence_normalizer + +
    [docs] def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: + ns = SimpleNamespace(**kwargs) + + data_type = ns.dtype_dict[ns.target] + output = {'icp': {'__mdb_active': False}} + + fit_params = {'nr_preds': ns.ts_cfg.nr_predictions or 0, 'columns_to_ignore': []} + fit_params['columns_to_ignore'].extend([f'timestep_{i}' for i in range(1, fit_params['nr_preds'])]) + + if ns.is_classification: + if ns.predictor.supports_proba: + all_cat_cols = [col for col in ns.normal_predictions.columns if '__mdb_proba' in col] + all_classes = np.array([col.replace('__mdb_proba_', '') for col in all_cat_cols]) + else: + class_keys = sorted(ns.encoded_val_data.encoders[ns.target].rev_map.keys()) + all_classes = np.array([ns.encoded_val_data.encoders[ns.target].rev_map[idx] for idx in class_keys]) + + if data_type != dtype.tags: + enc = OneHotEncoder(sparse=False, handle_unknown='ignore') + enc.fit(all_classes.reshape(-1, 1)) + output['label_encoders'] = enc # needed to repr cat labels inside nonconformist + else: + output['label_encoders'] = None + + adapter = CachedClassifierAdapter + nc_function = MarginErrFunc() + nc_class = ClassifierNc + icp_class = IcpClassifier + + else: + adapter = CachedRegressorAdapter + nc_function = BoostedAbsErrorErrFunc() + nc_class = RegressorNc + icp_class = IcpRegressor + + result_df = pd.DataFrame() + + if ns.is_numerical or (ns.is_classification and data_type != dtype.tags): + model = adapter(ns.predictor) + + norm_params = {'target': ns.target, 'dtype_dict': ns.dtype_dict, 'predictor': ns.predictor, + 'encoders': ns.encoded_val_data.encoders, 'is_multi_ts': ns.is_multi_ts, 'stop_after': 1e2} + if self.confidence_normalizer: + normalizer = Normalizer(fit_params=norm_params) + normalizer.fit(ns.train_data) + normalizer.prediction_cache = normalizer(ns.encoded_val_data) + else: + normalizer = None + + # instance the ICP + nc = nc_class(model, nc_function, normalizer=normalizer) + icp = icp_class(nc) + + output['icp']['__default'] = icp + + # setup prediction cache to avoid additional .predict() calls + if ns.is_classification: + if ns.predictor.supports_proba: + icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values + else: + predicted_classes = pd.get_dummies( + ns.normal_predictions['prediction']).values # inflate to one-hot enc + icp.nc_function.model.prediction_cache = predicted_classes + + elif ns.is_multi_ts: + # we fit ICPs for time series confidence bounds only at t+1 forecast + icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']]) + else: + icp.nc_function.model.prediction_cache = np.array(ns.normal_predictions['prediction']) + + if not ns.is_classification: + output['df_target_stddev'] = {'__default': ns.stats_info.df_target_stddev} + + # fit additional ICPs in time series tasks with grouped columns + if ns.ts_cfg.is_timeseries and ns.ts_cfg.group_by: + + # create an ICP for each possible group + group_info = ns.data[ns.ts_cfg.group_by].to_dict('list') + all_group_combinations = list(product(*[set(x) for x in group_info.values()])) + output['icp']['__mdb_groups'] = all_group_combinations + output['icp']['__mdb_group_keys'] = [x for x in group_info.keys()] + + for combination in all_group_combinations: + output['icp'][frozenset(combination)] = deepcopy(icp) + + # calibrate ICP + icp_df = deepcopy(ns.data) + icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None)) + output['icp']['__default'].index = icp_df.columns + 
output['icp']['__default'].calibrate(icp_df.values, y) + + # get confidence estimation for validation dataset + conf, ranges = set_conf_range( + icp_df, icp, ns.dtype_dict[ns.target], + output, positive_domain=self.positive_domain, significance=self.fixed_significance) + if not ns.is_classification: + result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float) + result_df.loc[icp_df.index, 'lower'] = ranges[:, 0] + result_df.loc[icp_df.index, 'upper'] = ranges[:, 1] + else: + result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float) + + result_df.loc[icp_df.index, 'confidence'] = conf + + # calibrate additional grouped ICPs + if ns.ts_cfg.is_timeseries and ns.ts_cfg.group_by: + icps = output['icp'] + group_keys = icps['__mdb_group_keys'] + + # add all predictions to DF + icps_df = deepcopy(ns.data) + if ns.is_multi_ts: + icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']] + else: + icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction'] + + for group in icps['__mdb_groups']: + icp_df = icps_df + if icps[frozenset(group)].nc_function.normalizer is not None: + icp_df[f'__norm_{ns.target}'] = icps[frozenset(group)].nc_function.normalizer.prediction_cache + + # filter irrelevant rows for each group combination + for key, val in zip(group_keys, group): + icp_df = icp_df[icp_df[key] == val] + + # save relevant predictions in the caches, then calibrate the ICP + pred_cache = icp_df.pop(f'__predicted_{ns.target}').values + icps[frozenset(group)].nc_function.model.prediction_cache = pred_cache + icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None)) + if icps[frozenset(group)].nc_function.normalizer is not None: + icps[frozenset(group)].nc_function.normalizer.prediction_cache = icp_df.pop( + f'__norm_{ns.target}').values + + icps[frozenset(group)].index = icp_df.columns # important at inference time + icps[frozenset(group)].calibrate(icp_df.values, y) + + # save training std() for bounds width selection + if not ns.is_classification: + icp_train_df = ns.data + for key, val in zip(group_keys, group): + icp_train_df = icp_train_df[icp_train_df[key] == val] + y_train = icp_train_df[ns.target].values + output['df_target_stddev'][frozenset(group)] = y_train.std() + + # get bounds for relevant rows in validation dataset + conf, group_ranges = set_conf_range( + icp_df, icps[frozenset(group)], + ns.dtype_dict[ns.target], + output, group=frozenset(group), + positive_domain=self.positive_domain, significance=self.fixed_significance) + # save group bounds + if not ns.is_classification: + result_df.loc[icp_df.index, 'lower'] = group_ranges[:, 0] + result_df.loc[icp_df.index, 'upper'] = group_ranges[:, 1] + + result_df.loc[icp_df.index, 'confidence'] = conf + + # consolidate all groups here + output['icp']['__mdb_active'] = True + + output['result_df'] = result_df + + info = {**info, **output} + return info
    + +
    [docs] def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object], + **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]: + ns = SimpleNamespace(**kwargs) + + if ns.analysis['icp']['__mdb_active']: + icp_X = deepcopy(ns.data) + + # replace observed data w/predictions + preds = ns.predictions['prediction'] + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1: + preds = [p[0] for p in preds] + + for col in [f'timestep_{i}' for i in range(1, ns.tss.nr_predictions)]: + if col in icp_X.columns: + icp_X.pop(col) # erase ignorable columns + + icp_X[ns.target_name] = preds + + is_categorical = ns.target_dtype in (dtype.binary, dtype.categorical, dtype.array) + is_numerical = ns.target_dtype in [dtype.integer, dtype.float, + dtype.quantity] or ns.target_dtype in (dtype.array, dtype.tsarray) + is_anomaly_task = is_numerical and ns.tss.is_timeseries and ns.anomaly_detection + + if (is_numerical or is_categorical) and ns.analysis['icp'].get('__mdb_active', False): + + # reorder DF index + index = ns.analysis['icp']['__default'].index.values + index = np.append(index, ns.target_name) if ns.target_name not in index else index + icp_X = icp_X.reindex(columns=index) # important, else bounds can be invalid + + # only one normalizer, even if it's a grouped time series task + normalizer = ns.analysis['icp']['__default'].nc_function.normalizer + if normalizer: + normalizer.prediction_cache = normalizer(ns.encoded_data) + icp_X['__mdb_selfaware_scores'] = normalizer.prediction_cache + + # get ICP predictions + result_cols = ['lower', 'upper', 'significance'] if is_numerical else ['significance'] + result = pd.DataFrame(index=icp_X.index, columns=result_cols) + + # base ICP + X = deepcopy(icp_X) + # Calling `values` multiple times increased runtime of this function; referenced var is faster + icp_values = X.values + + # get all possible ranges + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1 and is_numerical: + + # bounds in time series are only given for the first forecast + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = \ + [p[0] for p in ns.predictions['prediction']] + all_confs = ns.analysis['icp']['__default'].predict(icp_values) + + elif is_numerical: + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = ns.predictions['prediction'] + all_confs = ns.analysis['icp']['__default'].predict(icp_values) + + # categorical + else: + predicted_proba = True if any(['__mdb_proba' in col for col in ns.predictions.columns]) else False + if predicted_proba: + all_cat_cols = [col for col in ns.predictions.columns if '__mdb_proba' in col] + class_dists = ns.predictions[all_cat_cols].values + for icol, cat_col in enumerate(all_cat_cols): + row_insights.loc[X.index, cat_col] = class_dists[:, icol] + else: + class_dists = pd.get_dummies(ns.predictions['prediction']).values + + ns.analysis['icp']['__default'].nc_function.model.prediction_cache = class_dists + + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + all_ranges = np.array( + [ns.analysis['icp']['__default'].predict(icp_values, significance=s / 100) + for s in conf_candidates]) + all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) + + # convert (B, 2, 99) into (B, 2) given width or error rate constraints + if is_numerical: + significances = ns.fixed_confidence + if significances is not None: + confs = all_confs[:, :, int(100 * (1 - significances)) - 1] + else: + error_rate = ns.anomaly_error_rate if is_anomaly_task else None + significances, confs = 
get_numeric_conf_range(all_confs, + df_target_stddev=ns.analysis['df_target_stddev'], + positive_domain=self.positive_domain, + error_rate=error_rate) + result.loc[X.index, 'lower'] = confs[:, 0] + result.loc[X.index, 'upper'] = confs[:, 1] + else: + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + significances = get_categorical_conf(all_confs, conf_candidates) + + result.loc[X.index, 'significance'] = significances + + # grouped time series, we replace bounds in rows that have a trained ICP + if ns.analysis['icp'].get('__mdb_groups', False): + icps = ns.analysis['icp'] + group_keys = icps['__mdb_group_keys'] + + for group in icps['__mdb_groups']: + icp = icps[frozenset(group)] + + # check ICP has calibration scores + if icp.cal_scores[0].shape[0] > 0: + + # filter rows by group + X = deepcopy(icp_X) + for key, val in zip(group_keys, group): + X = X[X[key] == val] + + if X.size > 0: + # set ICP caches + icp.nc_function.model.prediction_cache = X.pop(ns.target_name).values + if icp.nc_function.normalizer: + icp.nc_function.normalizer.prediction_cache = X.pop('__mdb_selfaware_scores').values + + # predict and get confidence level given width or error rate constraints + if is_numerical: + all_confs = icp.predict(X.values) + error_rate = ns.anomaly_error_rate if is_anomaly_task else None + significances, confs = get_numeric_conf_range( + all_confs, + df_target_stddev=ns.analysis['df_target_stddev'], + positive_domain=self.positive_domain, + group=frozenset(group), + error_rate=error_rate + ) + + # only replace where grouped ICP is more informative (i.e. tighter) + if ns.fixed_confidence is None: + default_widths = result.loc[X.index, 'upper'] - result.loc[X.index, 'lower'] + grouped_widths = np.subtract(confs[:, 1], confs[:, 0]) + insert_index = (default_widths > grouped_widths)[lambda x: x.isin([True])].index + conf_index = (default_widths.reset_index(drop=True) > + grouped_widths)[lambda x: x.isin([True])].index + + result.loc[insert_index, 'lower'] = confs[conf_index, 0] + result.loc[insert_index, 'upper'] = confs[conf_index, 1] + result.loc[insert_index, 'significance'] = significances[conf_index] + + else: + conf_candidates = list(range(20)) + list(range(20, 100, 10)) + all_ranges = np.array( + [icp.predict(X.values, significance=s / 100) + for s in conf_candidates]) + all_confs = np.swapaxes(np.swapaxes(all_ranges, 0, 2), 0, 1) + significances = get_categorical_conf(all_confs, conf_candidates) + result.loc[X.index, 'significance'] = significances + + row_insights['confidence'] = result['significance'].astype(float).tolist() + + if is_numerical: + row_insights['lower'] = result['lower'].astype(float) + row_insights['upper'] = result['upper'].astype(float) + + # anomaly detection + if is_anomaly_task: + anomalies = get_anomalies(row_insights, + ns.data[ns.target_name], + cooldown=ns.anomaly_cooldown) + row_insights['anomaly'] = anomalies + + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1 and is_numerical: + row_insights = add_tn_conf_bounds(row_insights, ns.tss) + + # Make sure the target and real values are of an appropriate type + if ns.tss.is_timeseries and ns.tss.nr_predictions > 1: + # Array output that are not of type <array> originally are odd and I'm not sure how to handle them + # Or if they even need handling yet + pass + elif ns.target_dtype in (dtype.integer): + row_insights['prediction'] = row_insights['prediction'].clip(-pow(2, 62), pow(2, 62)).astype(int) + row_insights['upper'] = row_insights['upper'].clip(-pow(2, 62), pow(2, 62)).astype(int) + 
row_insights['lower'] = row_insights['lower'].clip(-pow(2, 62), pow(2, 62)).astype(int) + elif ns.target_dtype in (dtype.float, dtype.quantity): + row_insights['prediction'] = row_insights['prediction'].astype(float) + row_insights['upper'] = row_insights['upper'].astype(float) + row_insights['lower'] = row_insights['lower'].astype(float) + elif ns.target_dtype in (dtype.short_text, dtype.rich_text, dtype.binary, dtype.categorical): + row_insights['prediction'] = row_insights['prediction'].astype(str) + + return row_insights, global_insights
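From a caller's perspective, the net effect of this block is the extra columns it writes into ``row_insights``. A hedged sketch, assuming a trained numerical-target predictor whose ``predict()`` call routes through ``explain()`` as shown above::

    preds = predictor.predict(test_df)  # hypothetical trained predictor and test frame
    # 'confidence' is the chosen significance level; 'lower'/'upper' bound the prediction
    print(preds[['prediction', 'confidence', 'lower', 'upper']].head())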
    +
    + +
    + +
    +
    + +
    + +
    +

    + © Copyright 2017-2021, MindsDB. + +

    +
    + + + + Built with Sphinx using a + + theme + + provided by Read the Docs. + +
    +
    +
    + +
    + +
    + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/lightwood/data/cleaner.html b/docs/_modules/lightwood/data/cleaner.html new file mode 100644 index 000000000..8c507641c --- /dev/null +++ b/docs/_modules/lightwood/data/cleaner.html @@ -0,0 +1,538 @@ + + + + + + + + + + lightwood.data.cleaner — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Source code for lightwood.data.cleaner

    +import re
    +from copy import deepcopy
    +
    +import pandas as pd
    +import datetime
    +from dateutil.parser import parse as parse_dt
    +
    +from lightwood.api.dtype import dtype
    +from lightwood.helpers import text
    +from lightwood.helpers.log import log
    +from lightwood.api.types import TimeseriesSettings
    +from lightwood.helpers.numeric import is_nan_numeric
    +
    +from typing import Dict, List, Optional, Tuple, Callable, Union
    +
    +
    +
[docs]def cleaner(
+    data: pd.DataFrame,
+    dtype_dict: Dict[str, str],
+    pct_invalid: float,
+    identifiers: Dict[str, str],
+    target: str,
+    mode: str,
+    timeseries_settings: TimeseriesSettings,
+    anomaly_detection: bool,
+    custom_cleaning_functions: Dict[str, str] = {}
+) -> pd.DataFrame:
+    """
+    The cleaner is a function which takes in the raw data, plus additional information about its types and about the problem. Based on this it generates a "clean" representation of the data, where each column has an ideal standardized type and all malformed, missing or otherwise invalid elements are turned into ``None``.
+
+    :param data: The raw data
+    :param dtype_dict: Type information for each column
+    :param pct_invalid: How much of each column can be invalid
+    :param identifiers: A dict containing all identifier typed columns
+    :param target: The target column
+    :param mode: Can be "predict" or "train"
+    :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object
+    :param anomaly_detection: Are we detecting anomalies with this predictor?
+
+    :returns: The cleaned data
+    """  # noqa
+
+    data = _remove_columns(data, identifiers, target, mode, timeseries_settings,
+                           anomaly_detection, dtype_dict)
+
+    for col in _get_columns_to_clean(data, dtype_dict, mode, target):
+
+        # Get and apply a cleaning function for each data type
+        # If you want to customize the cleaner, you likely want to modify ``get_cleaning_func``
+        data[col] = data[col].apply(get_cleaning_func(dtype_dict[col], custom_cleaning_functions))
+
+    return data
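A minimal usage sketch of `cleaner` (the column names, toy frame, and the `TimeseriesSettings(is_timeseries=False)` construction are illustrative assumptions, not part of the module):

import pandas as pd
from lightwood.api.dtype import dtype
from lightwood.api.types import TimeseriesSettings
from lightwood.data.cleaner import cleaner

df = pd.DataFrame({'age': ['21', 'not a number', '35'],
                   'label': ['yes', 'no', 'yes']})

clean_df = cleaner(
    data=df,
    dtype_dict={'age': dtype.integer, 'label': dtype.binary},
    pct_invalid=50,  # tolerate up to 50% invalid cells per column
    identifiers={},  # no identifier-typed columns in this toy frame
    target='label',
    mode='train',
    timeseries_settings=TimeseriesSettings(is_timeseries=False),
    anomaly_detection=False,
)
# 'not a number' is turned into None; valid ages become standardized ints.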
+
+
+def _check_if_invalid(new_data: pd.Series, pct_invalid: float, col_name: str):
+    """
+    Checks how many invalid data points there are. Invalid data points are flagged as "Nones" from the cleaning process (see data/cleaner.py for default).
+    If there are too many invalid data points (as specified by `pct_invalid`), an exception is raised. This is used as a safeguard for very messy data.
+
+    :param new_data: data to check for invalid values.
+    :param pct_invalid: maximum percentage of invalid values. If this threshold is surpassed, an exception is raised.
+    :param col_name: name of the column to analyze.
+
+    """  # noqa
+
+    chk_invalid = (
+        100
+        * (len(new_data) - len([x for x in new_data if x is not None]))
+        / len(new_data)
+    )
+
+    if chk_invalid > pct_invalid:
+        err = f'Too many ({chk_invalid}%) invalid values in column {col_name}'
+        log.error(err)
+        raise Exception(err)
+
+
+def get_cleaning_func(data_dtype: dtype, custom_cleaning_functions: Dict[str, str]) -> Callable:
+    """
+    For the provided data type, return the appropriate cleaning function. Below are the defaults; users can either override this function OR impose a custom block.
+
+    :param data_dtype: The data-type (inferred from a column) as prescribed from ``api.dtype``
+
+    :returns: The appropriate function that will pre-process (clean) data of specified dtype.
+    """  # noqa
+    if data_dtype in custom_cleaning_functions:
+        clean_func = eval(custom_cleaning_functions[data_dtype])
+
+    elif data_dtype in (dtype.date, dtype.datetime):
+        clean_func = _standardize_datetime
+
+    elif data_dtype in (dtype.float, dtype.tsarray):
+        clean_func = _clean_float
+
+    elif data_dtype in (dtype.integer, ):
+        clean_func = _clean_int
+
+    elif data_dtype in (dtype.array, ):
+        clean_func = _standardize_array
+
+    elif data_dtype in (dtype.tags, ):
+        clean_func = _tags_to_tuples
+
+    elif data_dtype in (dtype.quantity, ):
+        clean_func = _clean_quantity
+
+    elif data_dtype in (
+        dtype.short_text,
+        dtype.rich_text,
+        dtype.categorical,
+        dtype.binary,
+        dtype.audio,
+        dtype.image,
+        dtype.video
+    ):
+        clean_func = _clean_text
+
+    else:
+        raise ValueError(f"{data_dtype} is not supported. Check lightwood.api.dtype")
+
+    return clean_func
+
+
+# ------------------------- #
+# Temporal Cleaning
+# ------------------------- #
+
+
+def _standardize_datetime(element: object) -> Optional[float]:
+    """
+    Parses an expected date-time element. Intakes an element that can in theory be anything.
+    """
+    try:
+        date = parse_dt(str(element))
+    except Exception:
+        try:
+            date = datetime.datetime.utcfromtimestamp(element)
+        except Exception:
+            return None
+
+    return date.timestamp()
+
+
+# ------------------------- #
+# Tags/Sequences
+# ------------------------- #
+
+# TODO Make it split on something other than commas
+def _tags_to_tuples(tags_str: str) -> Tuple[str]:
+    """
+    Converts comma-separated values into a tuple to preserve a sequence/array.
+
+    Ex:
+    >> x = 'apples, oranges, bananas'
+    >> _tags_to_tuples(x)
+    >> ('apples', 'oranges', 'bananas')
+    """
+    try:
+        return tuple([x.strip() for x in tags_str.split(",")])
+    except Exception:
+        return tuple()
+
+
+def _standardize_array(element: object) -> Optional[Union[List[float], float]]:
+    """
+    Given an array of numbers in the form ``[1, 2, 3, 4]``, converts it into a numerical sequence.
+
+    :param element: An array-like element in a sequence
+    :returns: standardized array, OR a scalar number in the single-element edge case
+
+    Ex of edge case:
+    >> element = [1]
+    >> _standardize_array(element)
+    >> 1
+    """
+    try:
+        element = str(element)
+        element = element.rstrip("]").lstrip("[")
+        element = element.rstrip(" ").lstrip(" ")
+        element = element.replace(", ", " ").replace(",", " ")
+        # Handles cases where arrays are single numbers
+        if " " not in element:
+            element = _clean_float(element)
+        else:
+            element = [float(x) for x in element.split(" ")]
+    except Exception:
+        pass
+
+    return element
+
+
+# ------------------------- #
+# Integers/Floats/Quantities
+# ------------------------- #
+
+def _clean_float(element: object) -> Optional[float]:
+    """
+    Given an element, converts it into float numeric format. If the element is NaN or inf, returns None.
+    """
+    try:
+        cleaned_float = text.clean_float(element)
+        if is_nan_numeric(cleaned_float):
+            return None
+        return cleaned_float
+    except Exception:
+        return None
+
+
+def _clean_int(element: object) -> Optional[int]:
+    """
+    Given an element, converts it into integer numeric format. If the element is NaN or inf, returns None.
+    """
+    element = _clean_float(element)
+    if element is not None:
+        element = int(element)
+    return element
+
+
+def _clean_quantity(element: object) -> Optional[float]:
+    """
+    Given a quantity, cleans and converts it into float numeric format. If the element is NaN or inf, returns None.
+    """
+    element = float(re.sub("[^0-9.,]", "", str(element)).replace(",", "."))
+    return _clean_float(element)
+
+
+# ------------------------- #
+# Text
+# ------------------------- #
+def _clean_text(element: object) -> str:
+    return str(element)
+
+
+# ------------------------- #
+# Other helpers
+# ------------------------- #
+def _rm_rows_w_empty_targets(df: pd.DataFrame, target: str) -> pd.DataFrame:
+    """
+    Drop any rows whose target value is unknown. Targets are necessary to train.
+
+    :param df: The input dataframe including the target value
+    :param target: the column name that is the output target variable
+
+    :returns: The data with rows missing a target value dropped
+    """
+    # Compare length before/after
+    len_before = len(df)
+
+    # Use Pandas ``dropna`` to omit any rows with missing values for targets; these cannot be trained
+    df = df.dropna(subset=[target])
+
+    # Compare length after
+    len_after = len(df)
+    nr_removed = len_before - len_after
+
+    if nr_removed != 0:
+        log.warning(
+            f"Removed {nr_removed} rows because target was missing. Training on these rows is not possible."
+        )  # noqa
+
+    return df
+
+
+def _remove_columns(data: pd.DataFrame, identifiers: Dict[str, object], target: str,
+                    mode: str, timeseries_settings: TimeseriesSettings, anomaly_detection: bool,
+                    dtype_dict: Dict[str, dtype]) -> pd.DataFrame:
+    """
+    Drop columns we don't want to use in order to train or predict.
+
+    :param data: The raw data
+    :param dtype_dict: Type information for each column
+    :param identifiers: A dict containing all identifier typed columns
+    :param target: The target column
+    :param mode: Can be "predict" or "train"
+    :param timeseries_settings: Timeseries related settings, only relevant for timeseries predictors, otherwise can be the default object
+    :param anomaly_detection: Are we detecting anomalies with this predictor?
+
+    :returns: A (new) dataframe without the dropped columns
+    """  # noqa
+    data = deepcopy(data)
+    to_drop = [*[x for x in identifiers.keys() if x != target],
+               *[x for x in data.columns if x in dtype_dict and dtype_dict[x] == dtype.invalid]]
+    exceptions = ["__mdb_make_predictions"]
+    to_drop = [x for x in to_drop if x in data.columns]
+    data = data.drop(columns=to_drop)
+
+    if mode == "train":
+        data = _rm_rows_w_empty_targets(data, target)
+    if mode == "predict":
+        if (
+            target in data.columns
+            and not timeseries_settings.use_previous_target
+            and not anomaly_detection
+        ):
+            data = data.drop(columns=[target])
+
+    # Drop extra columns
+    for name in list(data.columns):
+        if name not in dtype_dict and name not in exceptions:
+            data = data.drop(columns=[name])
+
+    return data
+
+
+def _get_columns_to_clean(data: pd.DataFrame, dtype_dict: Dict[str, dtype], mode: str, target: str) -> List[str]:
+    """
+    :param data: The raw data
+    :param dtype_dict: Type information for each column
+    :param target: The target column
+    :param mode: Can be "predict" or "train"
+
+    :returns: A list of columns that we want to clean
+    """  # noqa
+
+    cleanable_columns = []
+    for name, _ in dtype_dict.items():
+        if mode == "predict":
+            if name == target:
+                continue
+        if name in data.columns:
+            cleanable_columns.append(name)
+    return cleanable_columns
\ No newline at end of file
diff --git a/docs/_modules/lightwood/data/encoded_ds.html b/docs/_modules/lightwood/data/encoded_ds.html new file mode 100644 index 000000000..1828dc278 --- /dev/null +++ b/docs/_modules/lightwood/data/encoded_ds.html @@ -0,0 +1,409 @@
+  lightwood.data.encoded_ds — lightwood 1.6.1 documentation

    Source code for lightwood.data.encoded_ds

    +import inspect
+from typing import Dict, List, Tuple
    +import torch
    +import numpy as np
    +import pandas as pd
    +from torch.utils.data import Dataset
    +from lightwood.encoder.base import BaseEncoder
    +
    +
    +
[docs]class EncodedDs(Dataset):
+    def __init__(self, encoders: Dict[str, BaseEncoder], data_frame: pd.DataFrame, target: str) -> None:
+        """
+        Create a Lightwood datasource from a data frame and some encoders. This class inherits from `torch.utils.data.Dataset`.
+
+        Note: normal behavior is to cache encoded representations to avoid duplicated computations. If you want an option to disable this, please open an issue.
+
+        :param encoders: dictionary of Lightwood encoders used to encode the data, one per column.
+        :param data_frame: original dataframe.
+        :param target: name of the target column to predict.
+        """  # noqa
+        self.data_frame = data_frame
+        self.encoders = encoders
+        self.target = target
+        self.cache_encoded = True
+        self.cache = [None] * len(self.data_frame)
+        self.encoder_spans = {}
+        self.input_length = 0
+
+        # save encoder span, has to use same iterator as in __getitem__ for correct indices
+        for col in self.data_frame:
+            if col != self.target and self.encoders.get(col, False):
+                self.encoder_spans[col] = (self.input_length,
+                                           self.input_length + self.encoders[col].output_size)
+                self.input_length += self.encoders[col].output_size
+
+    def __len__(self):
+        """
+        The length of an `EncodedDs` datasource equals the number of rows of the original dataframe.
+
+        :return: length of the `EncodedDs`
+        """
+        return int(self.data_frame.shape[0])
+
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        The getter yields a tuple (X, y), where:
+          - `X` is a concatenation of all encoded representations of the row
+          - `y` is the encoded target
+
+        :param idx: index of the row to access.
+
+        :return: tuple (X, y) with encoded data.
+
+        """  # noqa
+        if self.cache_encoded:
+            if self.cache[idx] is not None:
+                return self.cache[idx]
+
+        X = torch.FloatTensor()
+        Y = torch.FloatTensor()
+        for col in self.data_frame:
+            if self.encoders.get(col, None):
+                kwargs = {}
+                if 'dependency_data' in inspect.signature(self.encoders[col].encode).parameters:
+                    kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idx][dep]]
+                                                 for dep in self.encoders[col].dependencies}
+                if hasattr(self.encoders[col], 'data_window'):
+                    cols = [self.target] + [f'{self.target}_timestep_{i}'
+                                            for i in range(1, self.encoders[col].data_window)]
+                else:
+                    cols = [col]
+
+                data = self.data_frame[cols].iloc[idx].tolist()
+                encoded_tensor = self.encoders[col].encode(data, **kwargs)[0]
+                if col != self.target:
+                    X = torch.cat([X, encoded_tensor])
+                else:
+                    Y = encoded_tensor
+
+        if self.cache_encoded:
+            self.cache[idx] = (X, Y)
+
+        return X, Y
    [docs] def get_column_original_data(self, column_name: str) -> pd.Series: + """ + Gets the original data for any given column of the `EncodedDs`. + + :param column_name: name of the column. + :return: A `pd.Series` with the original data stored in the `column_name` column. + """ + return self.data_frame[column_name]
    + +
[docs] def get_encoded_column_data(self, column_name: str) -> torch.Tensor:
+        """
+        Gets the encoded data for any given column of the `EncodedDs`.
+
+        :param column_name: name of the column.
+        :return: A `torch.Tensor` with the encoded data of the `column_name` column.
+        """
+        kwargs = {}
+        if 'dependency_data' in inspect.signature(self.encoders[column_name].encode).parameters:
+            deps = [dep for dep in self.encoders[column_name].dependencies if dep in self.data_frame.columns]
+            kwargs['dependency_data'] = {dep: self.data_frame[dep].tolist() for dep in deps}
+        encoded_data = self.encoders[column_name].encode(self.data_frame[column_name], **kwargs)
+
+        if not isinstance(encoded_data, torch.Tensor):
+            raise Exception(
+                f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor!')
+        return encoded_data
    + +
    [docs] def get_encoded_data(self, include_target=True) -> torch.Tensor: + """ + Gets all encoded data. + + :param include_target: whether to include the target column in the output or not. + :return: A `torch.Tensor` with the encoded dataframe. + """ + encoded_dfs = [] + for col in self.data_frame.columns: + if (include_target or col != self.target) and self.encoders.get(col, False): + encoded_dfs.append(self.get_encoded_column_data(col)) + + return torch.cat(encoded_dfs, 1)
    + +
    [docs] def clear_cache(self): + """ + Clears the `EncodedDs` cache. + """ + self.cache = [None] * len(self.data_frame)
    + + +
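A minimal sketch of assembling an `EncodedDs` by hand (this assumes `NumericEncoder` is importable from `lightwood.encoder`; in a real pipeline the predictor builds the encoder dictionary for you):

import pandas as pd
from lightwood.encoder import NumericEncoder
from lightwood.data.encoded_ds import EncodedDs

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [2.0, 4.0, 6.0]})

# One encoder per column; NumericEncoder is rule-based, so prepare()
# only needs priming data.
encoders = {}
for col in df.columns:
    enc = NumericEncoder(is_target=(col == 'y'))
    enc.prepare(df[col])
    encoders[col] = enc

ds = EncodedDs(encoders, df, target='y')
X, y = ds[0]  # encoded features and encoded target for the first row
print(len(ds), X.shape, y.shape)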
[docs]class ConcatedEncodedDs(EncodedDs):
+    """
+    `ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity.
+    """  # noqa
+    def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
+        # @TODO: missing super() call here?
+        self.encoded_ds_arr = encoded_ds_arr
+        self.encoded_ds_lengths = [len(x) for x in self.encoded_ds_arr]
+        self.encoders = self.encoded_ds_arr[0].encoders
+        self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
+        self.target = self.encoded_ds_arr[0].target
+
+    def __len__(self):
+        """
+        See `lightwood.data.encoded_ds.EncodedDs.__len__()`.
+        """
+        # @TODO: behavior here is not intuitive
+        return max(0, np.sum(self.encoded_ds_lengths) - 2)
+
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        See `lightwood.data.encoded_ds.EncodedDs.__getitem__()`.
+        """
+        for ds_idx, length in enumerate(self.encoded_ds_lengths):
+            if idx - length < 0:
+                return self.encoded_ds_arr[ds_idx][idx]
+            else:
+                idx -= length
+        raise StopIteration()
+
+    @property
+    def data_frame(self) -> pd.DataFrame:
+        """
+        Property that concatenates all underlying `EncodedDs`'s dataframes and returns them.
+
+        Note: be careful not to modify a `ConcatedEncodedDs`; as you can see in the source, it will not have an effect.
+
+        :return: Dataframe with all original data.
+        """  # noqa
+        return pd.concat([x.data_frame for x in self.encoded_ds_arr])
    [docs] def get_column_original_data(self, column_name: str) -> pd.Series: + """ + See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`. + """ + encoded_df_arr = [x.get_column_original_data(column_name) for x in self.encoded_ds_arr] + return pd.concat(encoded_df_arr)
    + +
    [docs] def get_encoded_column_data(self, column_name: str) -> torch.Tensor: + """ + See `lightwood.data.encoded_ds.EncodedDs.get_encoded_column_data()`. + """ + encoded_df_arr = [x.get_encoded_column_data(column_name) for x in self.encoded_ds_arr] + return torch.cat(encoded_df_arr, 0)
    + +
    [docs] def clear_cache(self): + """ + See `lightwood.data.encoded_ds.EncodedDs.clear_cache()`. + """ + for ds in self.encoded_ds_arr: + ds.clear_cache()
\ No newline at end of file
diff --git a/docs/_modules/lightwood/data/splitter.html b/docs/_modules/lightwood/data/splitter.html new file mode 100644 index 000000000..b0976c547 --- /dev/null +++ b/docs/_modules/lightwood/data/splitter.html @@ -0,0 +1,361 @@
+  lightwood.data.splitter — lightwood 1.6.1 documentation

    Source code for lightwood.data.splitter

    +from typing import List, Dict
    +from itertools import product
    +
    +import numpy as np
    +import pandas as pd
    +
    +from lightwood.helpers.log import log
    +from lightwood.api.dtype import dtype
    +from lightwood.api.types import TimeseriesSettings
    +
    +
    +
    [docs]def splitter( + data: pd.DataFrame, + tss: TimeseriesSettings, + dtype_dict: Dict[str, str], + seed: int, + pct_train: float, + pct_dev: float, + pct_test: float, + target: str +) -> Dict[str, pd.DataFrame]: + """ + Splits data into training, dev and testing datasets. + + The proportion of data for each split must be specified (JSON-AI sets defaults to 80/10/10). First, rows in the dataset are shuffled randomly. Then a simple split is done. If a target value is provided and is of data type categorical/binary, then the splits will be stratified to maintain the representative populations of each class. + + :param data: Input dataset to be split + :param tss: time-series specific details for splitting + :param dtype_dict: Dictionary with the data type of all columns + :param seed: Random state for pandas data-frame shuffling + :param pct_train: training fraction of data; must be less than 1 + :param pct_dev: dev fraction of data; must be less than 1 + :param pct_test: testing fraction of data; must be less than 1 + :param target: Name of the target column; if specified, data will be stratified on this column + + :returns: A dictionary containing the keys train, test and dev with their respective data frames, as well as the "stratified_on" key indicating which columns the data was stratified on (None if it wasn't stratified on anything) + """ # noqa + pct_sum = pct_train + pct_dev + pct_test + if not (np.isclose(pct_sum, 1, atol=0.001) and np.less(pct_sum, 1 + 1e-5)): + raise Exception(f'The train, dev and test percentage of the data needs to sum up to 1 (got {pct_sum})') + + # Shuffle the data + np.random.seed(seed) + if not tss.is_timeseries: + data = data.sample(frac=1, random_state=seed).reset_index(drop=True) + + # Check if stratification should be done + stratify_on = [] + if target is not None: + if dtype_dict[target] in (dtype.categorical, dtype.binary) and not tss.is_timeseries: + stratify_on = [target] + if tss.is_timeseries and isinstance(tss.group_by, list): + stratify_on = tss.group_by + + # Split the data + if stratify_on: + reshuffle = not tss.is_timeseries + train, dev, test = stratify(data, pct_train, pct_dev, pct_test, stratify_on, seed, reshuffle) + else: + train, dev, test = simple_split(data, pct_train, pct_dev, pct_test) + + return {"train": train, "test": test, "dev": dev, "stratified_on": stratify_on}
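A usage sketch for `splitter` on a non-timeseries frame with a binary target (the data is synthetic; stratification kicks in because the target is categorical/binary):

import numpy as np
import pandas as pd
from lightwood.api.dtype import dtype
from lightwood.api.types import TimeseriesSettings
from lightwood.data.splitter import splitter

df = pd.DataFrame({'feature': np.random.rand(100),
                   'label': ['a'] * 50 + ['b'] * 50})

splits = splitter(
    data=df,
    tss=TimeseriesSettings(is_timeseries=False),
    dtype_dict={'feature': dtype.float, 'label': dtype.binary},
    seed=1,
    pct_train=0.8,
    pct_dev=0.1,
    pct_test=0.1,
    target='label',
)
print(len(splits['train']), len(splits['dev']), len(splits['test']))  # 80 10 10
print(splits['stratified_on'])  # ['label']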
    + + +def simple_split(data: pd.DataFrame, + pct_train: float, + pct_dev: float, + pct_test: float) -> List[pd.DataFrame]: + """ + Simple split method to separate data into training, dev and testing datasets. + + :param data: Input dataset to be split + :param pct_train: training fraction of data; must be less than 1 + :param pct_dev: dev fraction of data; must be less than 1 + :param pct_test: testing fraction of data; must be less than 1 + + :returns Train, dev, and test dataframes + """ + train_cutoff = round(data.shape[0] * pct_train) + dev_cutoff = round(data.shape[0] * pct_dev) + train_cutoff + test_cutoff = round(data.shape[0] * pct_test) + dev_cutoff + + train = data[:train_cutoff] + dev = data[train_cutoff:dev_cutoff] + test = data[dev_cutoff:test_cutoff] + + return [train, dev, test] + + +def stratify(data: pd.DataFrame, + pct_train: float, + pct_dev: float, + pct_test: float, + stratify_on: List[str], + seed: int, + reshuffle: bool) -> List[pd.DataFrame]: + """ + Stratified data splitter. + + The `stratify_on` columns yield a cartesian product by which every different subset will be stratified + independently from the others, and recombined at the end in fractions specified by `pcts`. + + For grouped time series tasks, stratification is done based on the group-by columns. + + :param data: dataframe with data to be split + :param pct_train: fraction of data to use for training split + :param pct_dev: fraction of data to use for dev split (used internally by mixers) + :param pct_test: fraction of data to use for test split (used post-training for analysis) + :param stratify_on: Columns to consider when stratifying + :param seed: Random state for pandas data-frame shuffling + :param reshuffle: specify if reshuffling should be done post-split + + :returns Stratified train, dev, test dataframes + """ # noqa + + train_st = pd.DataFrame(columns=data.columns) + dev_st = pd.DataFrame(columns=data.columns) + test_st = pd.DataFrame(columns=data.columns) + + all_group_combinations = list(product(*[data[col].unique() for col in stratify_on])) + for group in all_group_combinations: + df = data + for idx, col in enumerate(stratify_on): + df = df[df[col] == group[idx]] + + train_cutoff = round(df.shape[0] * pct_train) + dev_cutoff = round(df.shape[0] * pct_dev) + train_cutoff + test_cutoff = round(df.shape[0] * pct_test) + dev_cutoff + + train_st = train_st.append(df[:train_cutoff]) + dev_st = dev_st.append(df[train_cutoff:dev_cutoff]) + test_st = test_st.append(df[dev_cutoff:test_cutoff]) + + if reshuffle: + train_st, dev_st, test_st = [df.sample(frac=1, random_state=seed).reset_index(drop=True) + for df in [train_st, dev_st, test_st]] + + # check that stratified lengths conform to expected percentages + if not np.isclose(len(train_st) / len(data), pct_train, atol=0.01) or \ + not np.isclose(len(dev_st) / len(data), pct_dev, atol=0.01) or \ + not np.isclose(len(test_st) / len(data), pct_test, atol=0.01): + log.info("Could not stratify; reverting to simple split") + train_st, dev_st, test_st = simple_split(data, pct_train, pct_dev, pct_test) + + return [train_st, dev_st, test_st] +
\ No newline at end of file
diff --git a/docs/_modules/lightwood/data/timeseries_analyzer.html b/docs/_modules/lightwood/data/timeseries_analyzer.html new file mode 100644 index 000000000..b07328010 --- /dev/null +++ b/docs/_modules/lightwood/data/timeseries_analyzer.html @@ -0,0 +1,349 @@
+  lightwood.data.timeseries_analyzer — lightwood 1.6.1 documentation

    Source code for lightwood.data.timeseries_analyzer

    +from typing import Dict, Tuple, List
    +
    +import numpy as np
    +import pandas as pd
    +
    +from lightwood.api.types import TimeseriesSettings
    +from lightwood.api.dtype import dtype
    +from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers
    +from lightwood.helpers.general import get_group_matches
    +
    +
    +
    [docs]def timeseries_analyzer(data: pd.DataFrame, dtype_dict: Dict[str, str], + timeseries_settings: TimeseriesSettings, target: str) -> Dict: + """ + This module analyzes (pre-processed) time series data and stores a few useful insights used in the rest of Lightwood's pipeline. + + :param data: dataframe with time series dataset. + :param dtype_dict: dictionary with inferred types for every column. + :param timeseries_settings: A `TimeseriesSettings` object. For more details, check `lightwood.types.TimeseriesSettings`. + :param target: name of the target column. + + The following things are extracted from each time series inside the dataset: + - group_combinations: all observed combinations of values for the set of `group_by` columns. The length of this list determines how many time series are in the data. + - deltas: inferred sampling interval + - ts_naive_residuals: Residuals obtained from the data by a naive forecaster that repeats the last-seen value. + - ts_naive_mae: Mean residual value obtained from the data by a naive forecaster that repeats the last-seen value. + - target_normalizers: objects that may normalize the data within any given time series for effective learning. See `lightwood.encoder.time_series.helpers.common` for available choices. + + :return: Dictionary with the aforementioned insights and the `TimeseriesSettings` object for future references. + """ # noqa + info = { + 'original_type': dtype_dict[target], + 'data': data[target].values + } + if timeseries_settings.group_by is not None: + info['group_info'] = {gcol: data[gcol].tolist() for gcol in timeseries_settings.group_by} # group col values + else: + info['group_info'] = {} + + # @TODO: maybe normalizers should fit using only the training subsets?? + new_data = generate_target_group_normalizers(info) + + if dtype_dict[target] in (dtype.integer, dtype.float, dtype.tsarray): + naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(info, new_data['group_combinations']) + else: + naive_forecast_residuals, scale_factor = {}, {} + + deltas = get_delta(data[timeseries_settings.order_by], + info, + new_data['group_combinations'], + timeseries_settings.order_by) + + return {'target_normalizers': new_data['target_normalizers'], + 'deltas': deltas, + 'tss': timeseries_settings, + 'group_combinations': new_data['group_combinations'], + 'ts_naive_residuals': naive_forecast_residuals, + 'ts_naive_mae': scale_factor + }
+
+
+def get_delta(df: pd.DataFrame, ts_info: dict, group_combinations: list, order_cols: list) -> Dict[str, Dict]:
+    """
+    Infer the sampling interval of each time series by picking the most popular time interval observed in the training data.
+
+    :param df: Dataframe with time series data.
+    :param ts_info: Dictionary used internally by `timeseries_analyzer`. Contains group-wise series information, among other things.
+    :param group_combinations: all tuples with distinct values for `TimeseriesSettings.group_by` columns, defining all available time series.
+    :param order_cols: all columns specified in `TimeseriesSettings.order_by`.
+
+    :return:
+    Dictionary with group combination tuples as keys. Values are dictionaries with the inferred delta for each series, for each `order_col`.
+    """  # noqa
+    deltas = {"__default": {}}
+
+    # get default delta for all data
+    for col in order_cols:
+        series = pd.Series([x[-1] for x in df[col]])
+        rolling_diff = series.rolling(window=2).apply(lambda x: x.iloc[1] - x.iloc[0])
+        delta = rolling_diff.value_counts(ascending=False).keys()[0]  # pick most popular
+        deltas["__default"][col] = delta
+
+    # get group-wise deltas (if applicable)
+    if ts_info.get('group_info', False):
+        original_data = ts_info['data']
+        for group in group_combinations:
+            if group != "__default":
+                deltas[group] = {}
+                for col in order_cols:
+                    ts_info['data'] = pd.Series([x[-1] for x in df[col]])
+                    _, subset = get_group_matches(ts_info, group)
+                    if subset.size > 1:
+                        rolling_diff = pd.Series(
+                            subset.squeeze()).rolling(
+                            window=2).apply(
+                            lambda x: x.iloc[1] - x.iloc[0])
+                        delta = rolling_diff.value_counts(ascending=False).keys()[0]
+                        deltas[group][col] = delta
+                ts_info['data'] = original_data
+
+    return deltas
+
+
+def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, float]:
+    """
+    Computes forecasting residuals for the naive method (the forecast for time `t` is the value observed at `t-1`).
+    Useful for computing the MASE forecasting error.
+
+    Note: this method assumes predictions are all for the same group combination. For a dataframe that contains multiple
+    series, use `get_grouped_naive_residuals`.
+
+    :param target_data: observed time series targets
+    :param m: season length. The naive forecasts will be the m-th previously seen value for each series
+
+    :return: (list of naive residuals, average residual value)
+    """  # noqa
+    residuals = target_data.rolling(window=m + 1).apply(lambda x: abs(x.iloc[m] - x.iloc[0]))[m:].values.flatten()
+    scale_factor = np.average(residuals)
+    return residuals.tolist(), scale_factor
+
+
+def get_grouped_naive_residuals(info: Dict, group_combinations: List) -> Tuple[Dict, Dict]:
+    """
+    Wraps `get_naive_residuals` for a dataframe with multiple co-existing time series.
+    """  # noqa
+    group_residuals = {}
+    group_scale_factors = {}
+    for group in group_combinations:
+        idxs, subset = get_group_matches(info, group)
+        residuals, scale_factor = get_naive_residuals(pd.DataFrame(subset))  # @TODO: pass m once we handle seasonality
+        group_residuals[group] = residuals
+        group_scale_factors[group] = scale_factor
+    return group_residuals, group_scale_factors
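For intuition, the naive residuals are just absolute one-step (or m-step) differences; a tiny worked example:

import pandas as pd
from lightwood.data.timeseries_analyzer import get_naive_residuals

series = pd.DataFrame([3.0, 5.0, 4.0, 6.0])
residuals, scale = get_naive_residuals(series)  # default season length m=1
print(residuals)  # [2.0, 1.0, 2.0] -> |5-3|, |4-5|, |6-4|
print(scale)      # ~1.67, the mean naive error, later used as the MASE denominator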
\ No newline at end of file
diff --git a/docs/_modules/lightwood/data/timeseries_transform.html b/docs/_modules/lightwood/data/timeseries_transform.html new file mode 100644 index 000000000..2ed3dc2db --- /dev/null +++ b/docs/_modules/lightwood/data/timeseries_transform.html @@ -0,0 +1,562 @@
+  lightwood.data.timeseries_transform — lightwood 1.6.1 documentation

    Source code for lightwood.data.timeseries_transform

    +import copy
    +import datetime
    +import dateutil
    +import numpy as np
    +import pandas as pd
    +import multiprocessing as mp
    +from lightwood.helpers.parallelism import get_nr_procs
    +from functools import partial
    +from typing import Dict
    +from lightwood.api.types import TimeseriesSettings
    +from lightwood.helpers.log import log
    +from lightwood.api import dtype
    +
    +
    +
    [docs]def transform_timeseries( + data: pd.DataFrame, dtype_dict: Dict[str, str], + timeseries_settings: TimeseriesSettings, target: str, mode: str) -> pd.DataFrame: + """ + Block that transforms the dataframe of a time series task to a convenient format for use in posterior phases like model training. + + The main transformations performed by this block are: + - Type casting (e.g. to numerical for `order_by` columns). + - Windowing functions for historical context based on `TimeseriesSettings.window` parameter. + - Explicitly add target columns according to the `TimeseriesSettings.nr_predictions` parameter. + - Flag all rows that are "predictable" based on all `TimeseriesSettings`. + - Plus, handle all logic for the streaming use case (where forecasts are only emitted for the last observed data point). + + :param data: Dataframe with data to transform. + :param dtype_dict: Dictionary with the types of each column. + :param timeseries_settings: A `TimeseriesSettings` object. + :param target: The name of the target column to forecast. + :param mode: Either "train" or "predict", depending on what phase is calling this procedure. + + :return: A dataframe with all the transformations applied. + """ # noqa + + tss = timeseries_settings + original_df = copy.deepcopy(data) + gb_arr = tss.group_by if tss.group_by is not None else [] + ob_arr = tss.order_by + window = tss.window + + if '__mdb_make_predictions' in original_df.columns: + index = original_df[original_df['__mdb_make_predictions'].map( + {'True': True, 'False': False, True: True, False: False}).isin([True])] + infer_mode = index.shape[0] == 0 # condition to trigger: __mdb_make_predictions is set to False everywhere + # @TODO: dont drop and use instead of original_index? + original_df = original_df.reset_index(drop=True) if infer_mode else original_df + else: + infer_mode = False + + original_index_list = [] + idx = 0 + for row in original_df.itertuples(): + if _make_pred(row) or infer_mode: + original_index_list.append(idx) + idx += 1 + else: + original_index_list.append(None) + + original_df['original_index'] = original_index_list + + secondary_type_dict = {} + for col in ob_arr: + if dtype_dict[col] in (dtype.date, dtype.integer, dtype.float): + secondary_type_dict[col] = dtype_dict[col] + + # Convert order_by columns to numbers (note, rows are references to mutable rows in `original_df`) + for _, row in original_df.iterrows(): + for col in ob_arr: + # @TODO: Remove if the TS encoder can handle `None` + if row[col] is None or pd.isna(row[col]): + row[col] = 0.0 + else: + if dtype_dict[col] == dtype.date: + try: + row[col] = dateutil.parser.parse( + row[col], + **{} + ) + except (TypeError, ValueError): + pass + + if isinstance(row[col], datetime.datetime): + row[col] = row[col].timestamp() + + try: + row[col] = float(row[col]) + except ValueError: + raise ValueError(f'Failed to order based on column: "{col}" due to faulty value: {row[col]}') + + for oby in tss.order_by: + original_df[f'__mdb_original_{oby}'] = original_df[oby] + + group_lengths = [] + if len(gb_arr) > 0: + df_arr = [] + for _, df in original_df.groupby(gb_arr): + df_arr.append(df.sort_values(by=ob_arr)) + group_lengths.append(len(df)) + else: + df_arr = [original_df] + group_lengths.append(len(original_df)) + + n_groups = len(df_arr) + last_index = original_df['original_index'].max() + for i, subdf in enumerate(df_arr): + if '__mdb_make_predictions' in subdf.columns and mode == 'predict': + if infer_mode: + df_arr[i] = _ts_infer_next_row(subdf, ob_arr, last_index) 
+ last_index += 1 + + if len(original_df) > 500: + # @TODO: restore possibility to override this with args + nr_procs = get_nr_procs(original_df) + log.info(f'Using {nr_procs} processes to reshape.') + pool = mp.Pool(processes=nr_procs) + # Make type `object` so that dataframe cells can be python lists + df_arr = pool.map(partial(_ts_to_obj, historical_columns=ob_arr + tss.historical_columns), df_arr) + df_arr = pool.map(partial(_ts_order_col_to_cell_lists, + order_cols=ob_arr + tss.historical_columns), df_arr) + df_arr = pool.map( + partial( + _ts_add_previous_rows, order_cols=ob_arr + tss.historical_columns, window=window), + df_arr) + + df_arr = pool.map(partial(_ts_add_future_target, target=target, nr_predictions=tss.nr_predictions, + data_dtype=tss.target_type, mode=mode), + df_arr) + + if tss.use_previous_target: + df_arr = pool.map( + partial(_ts_add_previous_target, target=target, window=tss.window), + df_arr) + pool.close() + pool.join() + else: + for i in range(n_groups): + df_arr[i] = _ts_to_obj(df_arr[i], historical_columns=ob_arr + tss.historical_columns) + df_arr[i] = _ts_order_col_to_cell_lists(df_arr[i], order_cols=ob_arr + tss.historical_columns) + df_arr[i] = _ts_add_previous_rows(df_arr[i], + order_cols=ob_arr + tss.historical_columns, window=window) + df_arr[i] = _ts_add_future_target(df_arr[i], target=target, nr_predictions=tss.nr_predictions, + data_dtype=tss.target_type, mode=mode) + if tss.use_previous_target: + df_arr[i] = _ts_add_previous_target(df_arr[i], target=target, window=tss.window) + + combined_df = pd.concat(df_arr) + + if '__mdb_make_predictions' in combined_df.columns: + combined_df = pd.DataFrame(combined_df[combined_df['__mdb_make_predictions'].astype(bool).isin([True])]) + del combined_df['__mdb_make_predictions'] + + if not infer_mode and any([i < tss.window for i in group_lengths]): + if tss.allow_incomplete_history: + log.warning("Forecasting with incomplete historical context, predictions might be subpar") + else: + raise Exception(f'Not enough historical context to make a timeseries prediction. Please provide a number of rows greater or equal to the window size. If you can\'t get enough rows, consider lowering your window size. If you want to force timeseries predictions lacking historical context please set the `allow_incomplete_history` timeseries setting to `True`, but this might lead to subpar predictions.') # noqa + + df_gb_map = None + if n_groups > 1: + df_gb_list = list(combined_df.groupby(tss.group_by)) + df_gb_map = {} + for gb, df in df_gb_list: + df_gb_map['_' + '_'.join(gb)] = df + + timeseries_row_mapping = {} + idx = 0 + + if df_gb_map is None: + for _, row in combined_df.iterrows(): + if not infer_mode: + timeseries_row_mapping[idx] = int( + row['original_index']) if row['original_index'] is not None and not np.isnan( + row['original_index']) else None + else: + timeseries_row_mapping[idx] = idx + idx += 1 + else: + for gb in df_gb_map: + for _, row in df_gb_map[gb].iterrows(): + if not infer_mode: + timeseries_row_mapping[idx] = int( + row['original_index']) if row['original_index'] is not None and not np.isnan( + row['original_index']) else None + else: + timeseries_row_mapping[idx] = idx + + idx += 1 + + del combined_df['original_index'] + + # return combined_df, secondary_type_dict, timeseries_row_mapping, df_gb_map + return combined_df
    + + +def _ts_infer_next_row(df: pd.DataFrame, ob: str, last_index: int) -> pd.DataFrame: + """ + Adds an inferred next row for streaming mode purposes. + + :param df: dataframe from which next row is inferred. + :param ob: `order_by` column. + :param last_index: index number of the latest row in `df`. + + :return: Modified `df` with the inferred row appended to it. + """ + last_row = df.iloc[[-1]].copy() + if df.shape[0] > 1: + butlast_row = df.iloc[[-2]] + delta = (last_row[ob].values - butlast_row[ob].values).flatten()[0] + else: + delta = 1 + last_row.original_index = None + last_row.index = [last_index + 1] + last_row['__mdb_make_predictions'] = True + last_row['__mdb_ts_inferred'] = True + last_row[ob] += delta + return df.append(last_row) + + +def _make_pred(row) -> bool: + """ + Indicates whether a prediction should be made for `row` or not. + """ + return not hasattr(row, '__mdb_make_predictions') or row.make_predictions + + +def _ts_to_obj(df: pd.DataFrame, historical_columns: list) -> pd.DataFrame: + """ + Casts all historical columns in a dataframe to `object` type. + + :param df: Input dataframe + :param historical_columns: Historical columns to type cast + + :return: Dataframe with `object`-typed historical columns + """ + for hist_col in historical_columns: + df.loc[:, hist_col] = df[hist_col].astype(object) + return df + + +def _ts_order_col_to_cell_lists(df: pd.DataFrame, order_cols: list) -> pd.DataFrame: + """ + Casts all data in `order_by` columns into cells. + + :param df: Input dataframe + :param order_cols: `order_by` columns + + :return: Dataframe with all `order_cols` modified so that their values are cells, e.g. `1` -> `[1]` + """ + for order_col in order_cols: + for ii in range(len(df)): + label = df.index.values[ii] + df.at[label, order_col] = [df.at[label, order_col]] + return df + + +def _ts_add_previous_rows(df: pd.DataFrame, order_cols: list, window: int) -> pd.DataFrame: + """ + Adds previous rows (as determined by `TimeseriesSettings.window`) into the cells of all `order_by` columns. + + :param df: Input dataframe. + :param order_cols: `order_by` columns. + :param window: value of `TimeseriesSettings.window` parameter. + + :return: Dataframe with all `order_cols` modified so that their values are now arrays of historical context. + """ # noqa + for order_col in order_cols: + for i in range(len(df)): + previous_indexes = [*range(max(0, i - window), i)] + + for prev_i in reversed(previous_indexes): + df.iloc[i][order_col].append( + df.iloc[prev_i][order_col][-1] + ) + + # Zero pad + # @TODO: Remove since RNN encoder can do without (???) + df.iloc[i][order_col].extend( + [0] * (1 + window - len(df.iloc[i][order_col])) + ) + df.iloc[i][order_col].reverse() + return df + + +def _ts_add_previous_target(df: pd.DataFrame, target: str, window: int) -> pd.DataFrame: + """ + Adds previous rows (as determined by `TimeseriesSettings.window`) into the cells of the target column. + + :param df: Input dataframe. + :param target: target column name. + :param window: value of `TimeseriesSettings.window` parameter. + + :return: Dataframe with new `__mdb_ts_previous_{target}` column that contains historical target context. 
+    """  # noqa
+    if target not in df:
+        return df
+    previous_target_values = list(df[target])
+    del previous_target_values[-1]
+    previous_target_values = [None] + previous_target_values
+
+    previous_target_values_arr = []
+    for i in range(len(previous_target_values)):
+        prev_vals = previous_target_values[max(i - window, 0):i + 1]
+        arr = [None] * (window - len(prev_vals) + 1)
+        arr.extend(prev_vals)
+        previous_target_values_arr.append(arr)
+
+    df[f'__mdb_ts_previous_{target}'] = previous_target_values_arr
+    return df
+
+
+def _ts_add_future_target(df, target, nr_predictions, data_dtype, mode):
+    """
+    Adds as many columns to the input dataframe as the forecasting horizon asks for (as determined by `TimeseriesSettings.nr_predictions`).
+
+    :param df: Input dataframe.
+    :param target: target column name.
+    :param nr_predictions: value of `TimeseriesSettings.nr_predictions` parameter.
+    :param data_dtype: dtype of the target column.
+    :param mode: either "train" or "predict". "train" will drop rows with incomplete target info. "predict" has no effect, for now.
+
+    :return: Dataframe with new `{target}_timestep_{i}` columns that contain the target labels at timestep `i` of a total `TimeseriesSettings.nr_predictions`.
+    """  # noqa
+    if target not in df:
+        return df
+    if data_dtype in (dtype.integer, dtype.float, dtype.array, dtype.tsarray):
+        df[target] = df[target].astype(float)
+
+    for timestep_index in range(1, nr_predictions):
+        next_target_value_arr = list(df[target])
+        for del_index in range(0, min(timestep_index, len(next_target_value_arr))):
+            del next_target_value_arr[0]
+            next_target_value_arr.append(None)
+        col_name = f'{target}_timestep_{timestep_index}'
+        df[col_name] = next_target_value_arr
+        df[col_name] = df[col_name].fillna(value=np.nan)
+
+    # drop rows with incomplete target info.
+    if mode == 'train':
+        for col in [f'{target}_timestep_{i}' for i in range(1, nr_predictions)]:
+            if '__mdb_make_predictions' not in df.columns:
+                df['__mdb_make_predictions'] = True
+            df.loc[df[col].isna(), ['__mdb_make_predictions']] = False
+
+    return df
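A toy invocation of the block (the exact `TimeseriesSettings` constructor arguments here are an assumption, inferred from the attributes the code above accesses):

import pandas as pd
from lightwood.api.dtype import dtype
from lightwood.api.types import TimeseriesSettings
from lightwood.data.timeseries_transform import transform_timeseries

df = pd.DataFrame({'t': [1, 2, 3, 4, 5],
                   'y': [10.0, 11.0, 12.0, 13.0, 14.0]})

tss = TimeseriesSettings(is_timeseries=True, order_by=['t'],
                         window=2, nr_predictions=2)

out = transform_timeseries(
    data=df,
    dtype_dict={'t': dtype.integer, 'y': dtype.float},
    timeseries_settings=tss,
    target='y',
    mode='train',
)
# 't' cells now hold windows such as [0, 1, 2], and 'y_timestep_1' holds
# the next-step target label for each remaining (complete) row.
print(out.columns.tolist())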
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/array/array.html b/docs/_modules/lightwood/encoder/array/array.html new file mode 100644 index 000000000..46ed1afa5 --- /dev/null +++ b/docs/_modules/lightwood/encoder/array/array.html @@ -0,0 +1,305 @@
+  lightwood.encoder.array.array — lightwood 1.6.1 documentation

    Source code for lightwood.encoder.array.array

    +from typing import List, Union
    +import torch
    +import pandas as pd
    +import numpy as np
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.api import dtype
    +from lightwood.encoder.helpers import MinMaxNormalizer, CatNormalizer
    +from lightwood.helpers.general import is_none
    +
    +
    +
[docs]class ArrayEncoder(BaseEncoder):
+    """
+    Fits a normalizer for array data. To encode, `ArrayEncoder` returns a normalized window of previous data.
+    It can be used for generic arrays, as well as for handling historical target values in time series tasks.
+
+    Currently supported normalizing strategies are minmax for numerical arrays, and a simple ordinal scheme for categorical arrays. See `lightwood.encoder.helpers` for more details on each approach.
+
+    :param stop_after: time budget in seconds.
+    :param window: expected length of array data.
+    :param original_type: element-wise data type
+    """  # noqa
+
+    is_trainable_encoder: bool = True
+
+    def __init__(self, stop_after: int, window: int = None, is_target: bool = False, original_type: dtype = None):
+        super().__init__(is_target)
+        self.stop_after = stop_after
+        self.original_type = original_type
+        self._normalizer = None
+        if window is not None:
+            self.output_size = window + 1
+        else:
+            self.output_size = None
+
+    def _pad_and_strip(self, array: List[object]):
+        if len(array) < self.output_size:
+            array = array + [0] * (self.output_size - len(array))
+        if len(array) > self.output_size:
+            array = array[:self.output_size]
+        return array
+
+    def prepare(self, train_priming_data, dev_priming_data):
+        priming_data = pd.concat([train_priming_data, dev_priming_data])
+        priming_data = priming_data.values
+
+        if self.output_size is None:
+            self.output_size = np.max([len(x) for x in priming_data if x is not None])
+        for i in range(len(priming_data)):
+            if is_none(priming_data[i]):
+                priming_data[i] = [0] * self.output_size
+
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        if self.original_type in (dtype.categorical, dtype.binary):
+            self._normalizer = CatNormalizer(encoder_class='ordinal')
+        else:
+            self._normalizer = MinMaxNormalizer()
+
+        if isinstance(priming_data, pd.Series):
+            priming_data = priming_data.values
+
+        priming_data = [self._pad_and_strip(list(x)) for x in priming_data]
+
+        self._normalizer.prepare(priming_data)
+        self.output_size *= self._normalizer.output_size
+        self.is_prepared = True
+
+    def encode(self, column_data: Union[list, np.ndarray, torch.Tensor]) -> torch.Tensor:
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        if isinstance(column_data, pd.Series):
+            column_data = column_data.values
+
+        for i in range(len(column_data)):
+            if is_none(column_data[i]):
+                column_data[i] = [0] * self.output_size
+        column_data = [self._pad_and_strip(list(x)) for x in column_data]
+
+        data = torch.cat([self._normalizer.encode(column_data)], dim=-1)
+        data[torch.isnan(data)] = 0.0
+        data[torch.isinf(data)] = 0.0
+
+        return data
+
+    def decode(self, data) -> List[object]:
+        decoded = data.tolist()
+        return decoded
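A short usage sketch for `ArrayEncoder` on numerical arrays (the values are illustrative):

import pandas as pd
from lightwood.encoder.array.array import ArrayEncoder

train = pd.Series([[1.0, 2.0, 3.0], [2.0, 4.0, 6.0]])
dev = pd.Series([[3.0, 6.0, 9.0]])

enc = ArrayEncoder(stop_after=10, window=2)  # window + 1 == array length
enc.prepare(train, dev)

# Short arrays are zero-padded and long ones stripped to output_size.
encoded = enc.encode([[1.0, 2.0, 3.0], [9.0, 9.0]])
print(encoded.shape)  # torch.Size([2, 3]) with the minmax normalizer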
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/base.html b/docs/_modules/lightwood/encoder/base.html new file mode 100644 index 000000000..d6f37c120 --- /dev/null +++ b/docs/_modules/lightwood/encoder/base.html @@ -0,0 +1,275 @@
+  lightwood.encoder.base — lightwood 1.6.1 documentation

    Source code for lightwood.encoder.base

    +from typing import List
    +import torch
    +
    +
    +
    [docs]class BaseEncoder: + """ + Base class for all encoders. + + An encoder should return encoded representations of any columnar data. + The procedure for this is defined inside the `encode()` method. + + If this encoder is expected to handle an output column, then it also needs to implement the respective `decode()` method that handles the inverse transformation from encoded representations to the final prediction in the original column space. + + For encoders that learn representations (as opposed to rule-based), the `prepare()` method will handle all learning logic. + + The `to()` method is used to move PyTorch-based encoders to and from a GPU. + + :param is_target: Whether the data to encode is the target, as per the problem definition. + :param is_timeseries_encoder: Whether encoder represents sequential/time-series data. Lightwood must provide specific treatment for this kind of encoder + :param is_trainable_encoder: Whether the encoder must return learned representations. Lightwood checks whether this flag is present in order to pass data to the feature representation via the ``prepare`` statement. + + Class Attributes: + - is_prepared: Internal flag to signal that the `prepare()` method has been successfully executed. + - is_nn_encoder: Whether the encoder is neural network-based. + - dependencies: list of additional columns that the encoder might need to encode. + - output_size: length of each encoding tensor for a single data point. + + """ # noqa + is_target: bool + is_prepared: bool + + is_timeseries_encoder: bool = False + is_trainable_encoder: bool = False + + def __init__(self, is_target=False) -> None: + self.is_target = is_target + self.is_prepared = False + self.dependencies = [] + self.output_size = None + + # Not all encoders need to be prepared + def prepare(self, priming_data) -> None: + self.is_prepared = True + + def encode(self, column_data) -> torch.Tensor: + raise NotImplementedError + + def decode(self, encoded_data) -> List[object]: + raise NotImplementedError + + # Should work for all torch-based encoders, but custom behavior may have to be implemented for weird models + def to(self, device, available_devices): + # Find all nn.Module type objects and convert them + # @TODO: Make this work recursively + for v in vars(self): + attr = getattr(self, v) + if isinstance(attr, torch.nn.Module): + attr.to(device) + return self
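A minimal rule-based subclass sketch, showing the contract (`prepare`, `encode`, `decode`) described above; the encoder itself is a toy invention for illustration:

import torch
from lightwood.encoder.base import BaseEncoder

class SignEncoder(BaseEncoder):
    """Toy rule-based encoder: represents each number by its sign."""
    def __init__(self, is_target: bool = False):
        super().__init__(is_target)
        self.output_size = 1

    def prepare(self, priming_data) -> None:
        self.is_prepared = True  # nothing to learn for a rule-based encoder

    def encode(self, column_data) -> torch.Tensor:
        return torch.Tensor([[1.0 if x >= 0 else -1.0] for x in column_data])

    def decode(self, encoded_data):
        return ['positive' if row[0] >= 0 else 'negative'
                for row in encoded_data.tolist()]

enc = SignEncoder()
enc.prepare([])
print(enc.decode(enc.encode([3, -7])))  # ['positive', 'negative']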
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/categorical/autoencoder.html b/docs/_modules/lightwood/encoder/categorical/autoencoder.html new file mode 100644 index 000000000..c8b60f587 --- /dev/null +++ b/docs/_modules/lightwood/encoder/categorical/autoencoder.html @@ -0,0 +1,323 @@
+  lightwood.encoder.categorical.autoencoder — lightwood 1.6.1 documentation

    Source code for lightwood.encoder.categorical.autoencoder

    +import random
    +import numpy as np
    +import torch
    +from torch.utils.data import DataLoader
    +from lightwood.mixer.helpers.ranger import Ranger
    +from lightwood.encoder.categorical.onehot import OneHotEncoder
    +from lightwood.encoder.categorical.gym import Gym
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.log import log
    +from lightwood.mixer.helpers.default_net import DefaultNet
    +import pandas as pd
    +
    +
    +
    [docs]class CategoricalAutoEncoder(BaseEncoder): + is_trainable_encoder: bool = True + + def __init__(self, stop_after: int = 3600, is_target: bool = False, max_encoded_length: int = 100): + super().__init__(is_target) + self.is_prepared = False + self.name = 'Categorical Autoencoder' + self.net = None + self.encoder = None + self.decoder = None + self.onehot_encoder = OneHotEncoder(is_target=self.is_target) + self.desired_error = 0.01 + self.stop_after = stop_after + # @TODO stop using instead of ONEHOT !!!@! + self.output_size = None + self.max_encoded_length = max_encoded_length + + def _encoder_targets(self, data): + oh_encoded_categories = self.onehot_encoder.encode(data) + target = oh_encoded_categories.cpu().numpy() + target_indexes = np.where(target > 0)[1] + targets_c = torch.LongTensor(target_indexes) + labels = targets_c.to(self.net.device) + return labels + + def prepare(self, train_priming_data, dev_priming_data): + priming_data = pd.concat([train_priming_data, dev_priming_data]) + random.seed(len(priming_data)) + + if self.is_prepared: + raise Exception('You can only call "prepare" once for a given encoder.') + + self.onehot_encoder.prepare(priming_data) + + input_len = self.onehot_encoder._lang.n_words + + if self.is_target: + log.warning('You are trying to use an autoencoder for the target value! \ + This is very likely a bad idea') + log.info('Preparing a categorical autoencoder, this might take a while') + + embeddings_layer_len = self.max_encoded_length + + self.net = DefaultNet(shape=[input_len, embeddings_layer_len, input_len]) + + criterion = torch.nn.CrossEntropyLoss() + optimizer = Ranger(self.net.parameters()) + + gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion, + device=self.net.device, name=self.name, input_encoder=self.onehot_encoder.encode, + output_encoder=self._encoder_targets) + + batch_size = min(200, int(len(priming_data) / 50)) + + priming_data_str = [str(x) for x in priming_data] + train_data_loader = DataLoader( + list(zip(priming_data_str, priming_data_str)), + batch_size=batch_size, shuffle=True) + + test_data_loader = None + + best_model, _, _ = gym.fit(train_data_loader, + test_data_loader, + desired_error=self.desired_error, + max_time=self.stop_after, + eval_every_x_epochs=1, + max_unimproving_models=5) + + self.net = best_model.to(self.net.device) + + modules = [module for module in self.net.modules() if type( + module) != torch.nn.Sequential and type(module) != DefaultNet] + self.encoder = torch.nn.Sequential(*modules[0:2]).eval() + self.decoder = torch.nn.Sequential(*modules[2:3]).eval() + log.info('Categorical autoencoder ready') + + self.output_size = self.onehot_encoder._lang.n_words + self.output_size = self.max_encoded_length + self.is_prepared = True + + def encode(self, column_data): + oh_encoded_tensor = self.onehot_encoder.encode(column_data) + + with torch.no_grad(): + oh_encoded_tensor = oh_encoded_tensor.to(self.net.device) + embeddings = self.encoder(oh_encoded_tensor) + return embeddings.to('cpu') + + def decode(self, encoded_data): + with torch.no_grad(): + encoded_data = encoded_data.to(self.net.device) + oh_encoded_tensor = self.decoder(encoded_data) + oh_encoded_tensor = oh_encoded_tensor.to('cpu') + return self.onehot_encoder.decode(oh_encoded_tensor)
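A usage sketch (note this actually trains a small network, so it takes a few seconds; the category list is synthetic and the printed shapes assume training converges):

import pandas as pd
from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder

cats = pd.Series([f'cat_{i % 5}' for i in range(300)])

enc = CategoricalAutoEncoder(stop_after=20, max_encoded_length=16)
enc.prepare(cats[:250], cats[250:])

embeddings = enc.encode(['cat_0', 'cat_3'])
print(embeddings.shape)        # (2, 16)
print(enc.decode(embeddings))  # ['cat_0', 'cat_3'] if reconstruction succeeded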
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/categorical/binary.html b/docs/_modules/lightwood/encoder/categorical/binary.html
new file mode 100644
index 000000000..4f9a25d8d
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/categorical/binary.html
@@ -0,0 +1,287 @@

    Source code for lightwood.encoder.categorical.binary

    +import torch
    +import numpy as np
    +from scipy.special import softmax
    +from lightwood.encoder.base import BaseEncoder
    +
    +
    +# Exists mainly for datasets with loads of binary flags where OHE can be too slow to fit
    +
+class BinaryEncoder(BaseEncoder):
+
+    def __init__(self, is_target=False, target_class_distribution=None):
+        super().__init__(is_target)
+        self.map = {}
+        self.rev_map = {}
+        self.output_size = 2
+        if self.is_target:
+            self.target_class_distribution = target_class_distribution
+            self.index_weights = None
+
+    def prepare(self, priming_data):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        for x in priming_data:
+            x = str(x)
+            if x not in self.map:
+                self.map[x] = len(self.map)
+                self.rev_map[len(self.rev_map)] = x
+
+            if len(self.map) == 2:
+                break
+
+        if self.is_target:
+            self.index_weights = [None, None]
+            for word in self.map:
+                if self.target_class_distribution is not None:
+                    self.index_weights[self.map[word]] = 1 / self.target_class_distribution[word]
+                else:
+                    self.index_weights[self.map[word]] = 1
+
+            self.index_weights = torch.Tensor(self.index_weights)
+
+        self.is_prepared = True
+
+    def encode(self, column_data):
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+        ret = []
+
+        for word in column_data:
+            index = self.map.get(word, None)
+            ret.append([0, 0])
+            if index is not None:
+                ret[-1][index] = 1
+
+        return torch.Tensor(ret)
+
+    def decode(self, encoded_data, return_raw=False):
+        encoded_data_list = encoded_data.tolist()
+        ret = []
+        probs = []
+
+        for vector in encoded_data_list:
+            ret.append(self.rev_map[np.argmax(vector)])
+
+            if return_raw:
+                probs.append(softmax(vector).tolist())
+
+        if return_raw:
+            return ret, probs, self.rev_map
+        else:
+            return ret
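+
+A minimal usage sketch (illustrative; the column values are hypothetical):
+
+    from lightwood.encoder.categorical.binary import BinaryEncoder
+
+    enc = BinaryEncoder()
+    enc.prepare(['yes', 'no', 'yes', 'no'])  # learns the two categories
+    vecs = enc.encode(['no', 'yes'])         # (2, 2) one-hot rows
+    labels = enc.decode(vecs)                # ['no', 'yes']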
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/categorical/multihot.html b/docs/_modules/lightwood/encoder/categorical/multihot.html
new file mode 100644
index 000000000..1ed7045a3
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/categorical/multihot.html
@@ -0,0 +1,254 @@

    Source code for lightwood.encoder.categorical.multihot

    +import torch
    +import numpy as np
    +from lightwood.encoder import BaseEncoder
    +from sklearn.preprocessing import MultiLabelBinarizer
    +
    +
    +
+class MultiHotEncoder(BaseEncoder):
+    def __init__(self, is_target: bool = False):
+        super().__init__(is_target)
+        self._binarizer = MultiLabelBinarizer()
+        self._seen = set()
+        self.output_size = None
+
+    @staticmethod
+    def _clean_col_data(column_data):
+        column_data = [(arr if arr is not None else []) for arr in column_data]
+        column_data = [[str(x) for x in arr] for arr in column_data]
+        return column_data
+
+    def prepare(self, priming_data, max_dimensions=100):
+        priming_data = self._clean_col_data(priming_data)
+        self._binarizer.fit(priming_data + [('None',)])
+        for arr in priming_data:
+            for x in arr:
+                self._seen.add(x)
+        self.is_prepared = True
+        self.output_size = len(self.encode(priming_data[0:1])[0])
+
+    def encode(self, column_data):
+        column_data = self._clean_col_data(column_data)
+        data_array = self._binarizer.transform(column_data)
+        return torch.Tensor(data_array)
+
+    def decode(self, vectors):
+        # If these are logits output by the neural network, we need to threshold them into binary vectors
+        vectors = np.where(vectors > 0, 1, 0)
+        words_tuples = self._binarizer.inverse_transform(vectors)
+        return [list(w) for w in words_tuples]
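+
+A minimal usage sketch (illustrative; tag values are hypothetical). Note that
+decoded tag lists come back in the binarizer's sorted class order:
+
+    from lightwood.encoder.categorical.multihot import MultiHotEncoder
+
+    enc = MultiHotEncoder()
+    enc.prepare([['red', 'blue'], ['blue'], ['green']])
+    vecs = enc.encode([['red'], ['blue', 'green']])  # (2, n_tags) 0/1 tensor
+    tags = enc.decode(vecs.numpy())                  # [['red'], ['blue', 'green']]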
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/categorical/onehot.html b/docs/_modules/lightwood/encoder/categorical/onehot.html
new file mode 100644
index 000000000..3456469ca
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/categorical/onehot.html
@@ -0,0 +1,347 @@

    Source code for lightwood.encoder.categorical.onehot

    +import torch
    +import numpy as np
    +from scipy.special import softmax
    +from lightwood.encoder.text.helpers.rnn_helpers import Lang
    +from lightwood.helpers.log import log
    +from lightwood.encoder.base import BaseEncoder
    +
    +UNCOMMON_WORD = '__mdb_unknown_cat'
    +UNCOMMON_TOKEN = 0
    +
    +
    +
+class OneHotEncoder(BaseEncoder):
+
+    def __init__(self, is_target=False, target_class_distribution=None, handle_unknown='unknown_token'):
+        super().__init__(is_target)
+        self._lang = None
+        self.rev_map = {}
+
+        if handle_unknown not in {"unknown_token", "return_zeros"}:
+            raise ValueError(f"handle_unknown should be either 'unknown_token' or 'return_zeros', got {handle_unknown}")
+        else:
+            self.handle_unknown = handle_unknown
+
+        if self.is_target:
+            self.target_class_distribution = target_class_distribution
+            self.index_weights = None
+
+    def prepare(self, priming_data, max_dimensions=20000):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        self._lang = Lang('default')
+        if self.handle_unknown == "return_zeros":
+            priming_data = [x for x in priming_data if x is not None]
+            self._lang.index2word = {}
+            self._lang.word2index = {}
+            self._lang.n_words = 0
+        else:  # self.handle_unknown == "unknown_token"
+            priming_data = [x if x is not None else UNCOMMON_WORD for x in priming_data]
+            self._lang.index2word = {UNCOMMON_TOKEN: UNCOMMON_WORD}
+            self._lang.word2index = {UNCOMMON_WORD: UNCOMMON_TOKEN}
+            self._lang.word2count[UNCOMMON_WORD] = 0
+            self._lang.n_words = 1
+
+        for category in priming_data:
+            if category is not None:
+                self._lang.addWord(str(category))
+
+        while self._lang.n_words > max_dimensions:
+            if self.handle_unknown == "return_zeros":
+                necessary_words = []
+            else:  # self.handle_unknown == "unknown_token"
+                necessary_words = [UNCOMMON_WORD]
+            least_occurring_words = self._lang.getLeastOccurring(n=len(necessary_words) + 1)
+
+            word_to_remove = None
+            for word in least_occurring_words:
+                if word not in necessary_words:
+                    word_to_remove = word
+                    break
+
+            self._lang.removeWord(word_to_remove)
+
+        if self.is_target:
+            self.index_weights = [None] * self._lang.n_words
+            if self.target_class_distribution is not None:
+                self.index_weights[0] = np.mean(list(self.target_class_distribution.values()))
+            else:
+                self.index_weights[0] = 1
+            for word in set(priming_data):
+                if self.target_class_distribution is not None:
+                    self.index_weights[self._lang.word2index[str(word)]] = 1 / self.target_class_distribution[word]
+                else:
+                    self.index_weights[self._lang.word2index[str(word)]] = 1
+            self.index_weights = torch.Tensor(self.index_weights)
+
+        self.output_size = self._lang.n_words
+        self.rev_map = self._lang.index2word
+        self.is_prepared = True
+
+    def encode(self, column_data):
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        ret = []
+        v_len = self._lang.n_words
+
+        for word in column_data:
+            encoded_word = [0] * v_len
+            if word is not None:
+                word = str(word)
+                if self.handle_unknown == "return_zeros":
+                    if word in self._lang.word2index:
+                        index = self._lang.word2index[word]
+                        encoded_word[index] = 1
+                    else:
+                        # Encoding an unknown value will result in a vector of zeros
+                        log.warning('Trying to encode a value never seen before, returning vector of zeros')
+                else:  # self.handle_unknown == "unknown_token"
+                    index = self._lang.word2index[word] if word in self._lang.word2index else UNCOMMON_TOKEN
+                    encoded_word[index] = 1
+
+            ret.append(encoded_word)
+
+        return torch.Tensor(ret)
+
+    def decode(self, encoded_data, return_raw=False):
+        encoded_data_list = encoded_data.tolist()
+        ret = []
+        probs = []
+
+        for vector in encoded_data_list:
+            # Logits and onehots are not the same in definition.
+            # This explicitly operates on logits: it will take care of the one-hot
+            # (so you can pass something in the softmax logit space),
+            # but will not affect something that is already OHE.
+
+            all_zeros = not np.any(vector)
+            if self.handle_unknown == "return_zeros" and all_zeros:
+                ret.append(UNCOMMON_WORD)
+            else:  # self.handle_unknown == "unknown_token"
+                ohe_index = np.argmax(vector)
+                ret.append(self._lang.index2word[ohe_index])
+
+            if return_raw:
+                probs.append(softmax(vector).tolist())
+
+        if return_raw:
+            return ret, probs, self.rev_map
+        else:
+            return ret
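+
+A minimal usage sketch (illustrative; category values are hypothetical). With
+the default handle_unknown='unknown_token', unseen values map to the
+'__mdb_unknown_cat' token:
+
+    from lightwood.encoder.categorical.onehot import OneHotEncoder
+
+    enc = OneHotEncoder()
+    enc.prepare(['cat', 'dog', 'cat'])
+    vecs = enc.encode(['dog', 'bird'])  # 'bird' maps to the unknown token
+    labels, probs, rev_map = enc.decode(vecs, return_raw=True)
+    # labels == ['dog', '__mdb_unknown_cat']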
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/datetime/datetime.html b/docs/_modules/lightwood/encoder/datetime/datetime.html
new file mode 100644
index 000000000..9260491d9
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/datetime/datetime.html
@@ -0,0 +1,306 @@

    Source code for lightwood.encoder.datetime.datetime

    +import datetime
    +import calendar
    +from typing import Optional
    +import torch
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.general import is_none
    +
    +
    +
+class DatetimeEncoder(BaseEncoder):
+    """
+    This encoder produces an encoded representation for timestamps.
+
+    The approach consists of decomposing the timestamp objects into their constituent units (e.g. day-of-week, month, year, etc.), and describing each of those with a single value that represents the magnitude in a sensible cycle length.
+    """  # noqa
+    def __init__(self, is_target: bool = False):
+        super().__init__(is_target)
+        self.fields = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second']
+        self.constants = {'year': 3000.0, 'month': 12.0, 'weekday': 7.0,
+                          'hour': 24.0, 'minute': 60.0, 'second': 60.0}
+        self.output_size = 7
+
+    def prepare(self, priming_data):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        self.is_prepared = True
+
+    def encode(self, data):
+        """
+        :param data: # @TODO: receive a consistent data type here; currently either list of lists or pd.Series w/lists
+        :return: encoded data
+        """
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        ret = [self.encode_one(unix_timestamp) for unix_timestamp in data]
+
+        return torch.Tensor(ret)
+
+    def encode_one(self, unix_timestamp: Optional[float]):
+        """
+        Encodes a single unix timestamp into its normalized components.
+        :param unix_timestamp: a unix timestamp (resolution is seconds); None encodes to a zero vector
+        :return: a vector with one entry per field
+        """
+        if is_none(unix_timestamp):
+            vector = [0] * len(self.fields)
+        else:
+            c = self.constants
+            date = datetime.datetime.fromtimestamp(unix_timestamp)
+            day_constant = calendar.monthrange(date.year, date.month)[1]
+            vector = [date.year / c['year'], date.month / c['month'], date.day / day_constant,
+                      date.weekday() / c['weekday'], date.hour / c['hour'],
+                      date.minute / c['minute'], date.second / c['second']]
+        return vector
+
+    def decode(self, encoded_data, return_as_datetime=False):
+        ret = []
+        if len(encoded_data.shape) > 2 and encoded_data.shape[0] == 1:
+            encoded_data = encoded_data.squeeze(0)
+
+        for vector in encoded_data.tolist():
+            ret.append(self.decode_one(vector, return_as_datetime=return_as_datetime))
+
+        return ret
+
+    def decode_one(self, vector, return_as_datetime=False):
+        if sum(vector) == 0:
+            decoded = None
+
+        else:
+            c = self.constants
+
+            year = max(0, round(vector[0] * c['year']))
+            month = max(1, min(12, round(vector[1] * c['month'])))
+            day_constant = calendar.monthrange(year, month)[-1]
+            day = max(1, min(round(vector[2] * day_constant), day_constant))
+            hour = max(0, min(23, round(vector[4] * c['hour'])))
+            minute = max(0, min(59, round(vector[5] * c['minute'])))
+            second = max(0, min(59, round(vector[6] * c['second'])))
+
+            dt = datetime.datetime(year=year, month=month, day=day, hour=hour,
+                                   minute=minute, second=second)
+
+            if return_as_datetime is True:
+                decoded = dt
+            else:
+                decoded = round(dt.timestamp())
+
+        return decoded
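+
+A minimal usage sketch (illustrative timestamps). prepare() is a no-op here,
+but the base-class contract still requires calling it before encode/decode:
+
+    from lightwood.encoder.datetime.datetime import DatetimeEncoder
+
+    enc = DatetimeEncoder()
+    enc.prepare([])
+    vecs = enc.encode([86400, 1609459200])  # 7 normalized components per stamp
+    stamps = enc.decode(vecs)               # round-trips to the same timestamps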
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/datetime/datetime_sin_normalizer.html b/docs/_modules/lightwood/encoder/datetime/datetime_sin_normalizer.html
new file mode 100644
index 000000000..6b999d59a
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/datetime/datetime_sin_normalizer.html
@@ -0,0 +1,326 @@

    Source code for lightwood.encoder.datetime.datetime_sin_normalizer

    +import datetime
    +import calendar
    +import numpy as np
    +import pandas as pd  # @TODO: remove?
    +import torch
    +from lightwood.encoder.base import BaseEncoder
    +from collections.abc import Iterable
    +from lightwood.helpers.general import is_none
    +
    +
    +
+class DatetimeNormalizerEncoder(BaseEncoder):
+    def __init__(self, is_target: bool = False, sinusoidal: bool = False):
+        super().__init__(is_target)
+        self.sinusoidal = sinusoidal
+        self.fields = ['year', 'month', 'day', 'weekday', 'hour', 'minute', 'second']
+        self.constants = {'year': 3000.0, 'month': 12.0, 'weekday': 7.0,
+                          'hour': 24.0, 'minute': 60.0, 'second': 60.0}
+        if self.sinusoidal:
+            self.output_size = 2
+        else:
+            self.output_size = 7
+
+    def prepare(self, priming_data):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        self.is_prepared = True
+
+    def encode(self, data):
+        """
+        :param data: # @TODO: receive a consistent data type here; currently either list of lists or pd.Series w/lists
+        :return: encoded data
+        """
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        if isinstance(data, pd.Series):
+            data = data.values
+        if not isinstance(data[0], Iterable):
+            data = [data]
+
+        ret = [self.encode_one(row) for row in data]
+
+        return torch.Tensor(ret)
+
+    def encode_one(self, data):
+        """
+        Encodes a list of unix_timestamps, or a list of tensors with unix_timestamps
+        :param data: list of unix_timestamps (unix_timestamp resolution is seconds)
+        :return: a list of vectors
+        """
+        ret = []
+        for unix_timestamp in data:
+            if is_none(unix_timestamp):
+                if self.sinusoidal:
+                    vector = [0, 1] * len(self.fields)
+                else:
+                    vector = [0] * len(self.fields)
+            else:
+                c = self.constants
+                if isinstance(unix_timestamp, torch.Tensor):
+                    unix_timestamp = unix_timestamp.item()
+                date = datetime.datetime.fromtimestamp(unix_timestamp)
+                day_constant = calendar.monthrange(date.year, date.month)[1]
+                vector = [date.year / c['year'], date.month / c['month'], date.day / day_constant,
+                          date.weekday() / c['weekday'], date.hour / c['hour'],
+                          date.minute / c['minute'], date.second / c['second']]
+                if self.sinusoidal:
+                    vector = np.array([(np.sin(n), np.cos(n)) for n in vector]).flatten()
+
+            ret.append(vector)
+
+        return ret
+
+    def decode(self, encoded_data, return_as_datetime=False):
+        ret = []
+        if len(encoded_data.shape) > 2 and encoded_data.shape[0] == 1:
+            encoded_data = encoded_data.squeeze(0)
+
+        for vector in encoded_data.tolist():
+            ret.append(self.decode_one(vector, return_as_datetime=return_as_datetime))
+
+        return ret
+
+    def decode_one(self, vector, return_as_datetime=False):
+        if sum(vector) == 0:
+            decoded = None
+
+        else:
+            if self.sinusoidal:
+                vector = list(map(lambda x: np.arcsin(x), vector))[::2]
+            c = self.constants
+
+            year = max(0, round(vector[0] * c['year']))
+            month = max(1, min(12, round(vector[1] * c['month'])))
+            day_constant = calendar.monthrange(year, month)[-1]
+            day = max(1, min(round(vector[2] * day_constant), day_constant))
+            hour = max(0, min(23, round(vector[4] * c['hour'])))
+            minute = max(0, min(59, round(vector[5] * c['minute'])))
+            second = max(0, min(59, round(vector[6] * c['second'])))
+
+            dt = datetime.datetime(year=year, month=month, day=day, hour=hour,
+                                   minute=minute, second=second)
+
+            if return_as_datetime is True:
+                decoded = dt
+            else:
+                decoded = round(dt.timestamp())
+
+        return decoded
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/image/img_2_vec.html b/docs/_modules/lightwood/encoder/image/img_2_vec.html
new file mode 100644
index 000000000..eb1f05c20
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/image/img_2_vec.html
@@ -0,0 +1,285 @@

    Source code for lightwood.encoder.image.img_2_vec

    +from typing import List
    +import logging
    +import torch
    +import torchvision.transforms as transforms
    +from PIL import Image
    +import pandas as pd
    +from lightwood.encoder.image.helpers.img_to_vec import Img2Vec
    +from lightwood.encoder.base import BaseEncoder
    +
    +
    +
+class Img2VecEncoder(BaseEncoder):
+    is_trainable_encoder: bool = True
+
+    def __init__(self, stop_after: int = 3600, is_target: bool = False):
+        super().__init__(is_target)
+        # # I think we should make this an enum, something like: speed, balance, accuracy
+        # self.aim = aim
+        self.is_prepared = False
+
+        self._scaler = transforms.Resize((224, 224))
+        self._normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        self._to_tensor = transforms.ToTensor()
+        self._img_to_tensor = transforms.Compose([
+            self._scaler,
+            self._to_tensor,
+            self._normalize
+        ])
+        self.stop_after = stop_after
+
+        pil_logger = logging.getLogger('PIL')
+        pil_logger.setLevel(logging.ERROR)
+
+    def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series):
+        # @TODO: Add a bit of training here (maybe? depending on time aim)
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        self.model = Img2Vec()
+        self.output_size = self.model.output_size
+        self.is_prepared = True
+
+    def to(self, device, available_devices):
+        self.model.to(device, available_devices)
+        return self
+
+    def encode(self, images: List[str]) -> torch.Tensor:
+        """
+        Encode a list of images.
+
+        :param images: list of images, each image is a path to a file or a url
+        :return: a torch.FloatTensor
+        """
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        img_tensors = [self._img_to_tensor(
+            Image.open(img_path)
+        ) for img_path in images]
+        vec_arr = []
+
+        self.model.eval()
+        with torch.no_grad():
+            for img_tensor in img_tensors:
+                vec = self.model(img_tensor.unsqueeze(0), batch=False)
+                vec_arr.append(vec)
+        return torch.stack(vec_arr).to('cpu')
+
+    def decode(self, encoded_values_tensor):
+        raise Exception('This encoder is not bi-directional')
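+
+A minimal usage sketch (illustrative; the image paths are hypothetical and the
+pretrained backbone is downloaded on first use):
+
+    import pandas as pd
+    from lightwood.encoder.image.img_2_vec import Img2VecEncoder
+
+    enc = Img2VecEncoder()
+    enc.prepare(pd.Series([], dtype=object), pd.Series([], dtype=object))
+    vecs = enc.encode(['photo_1.jpg', 'photo_2.jpg'])  # (2, output_size) floats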
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/numeric/numeric.html b/docs/_modules/lightwood/encoder/numeric/numeric.html
new file mode 100644
index 000000000..ee564f7a2
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/numeric/numeric.html
@@ -0,0 +1,347 @@

    Source code for lightwood.encoder.numeric.numeric

    +import math
    +import torch
    +import numpy as np
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.log import log
    +from lightwood.helpers.general import is_none
    +
    +
    +
+class NumericEncoder(BaseEncoder):
+
+    def __init__(self, data_type=None, is_target: bool = False, positive_domain: bool = False):
+        super().__init__(is_target)
+        self._type = data_type
+        self._abs_mean = None
+        self.positive_domain = positive_domain
+        self.decode_log = False
+        self.output_size = 4 if not self.is_target else 3
+
+    def prepare(self, priming_data):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        value_type = 'int'
+        for number in priming_data:
+            try:
+                number = float(number)
+            except Exception:
+                continue
+
+            if np.isnan(number):
+                err = 'Lightwood does not support working with NaN values!'
+                log.warning(err)
+                continue
+
+            if int(number) != number:
+                value_type = 'float'
+
+        self._type = value_type if self._type is None else self._type
+        non_null_priming_data = [float(str(x).replace(',', '.')) for x in priming_data if not is_none(x)]
+        self._abs_mean = np.mean(np.abs(non_null_priming_data))
+        self.is_prepared = True
+
+    def encode(self, data):
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        ret = []
+        for real in data:
+            try:
+                real = float(real)
+            except Exception:
+                try:
+                    real = float(real.replace(',', '.'))
+                except Exception:
+                    real = None
+            if self.is_target:
+                vector = [0] * 3
+                if real is not None and self._abs_mean > 0:
+                    vector[0] = 1 if real < 0 and not self.positive_domain else 0
+                    vector[1] = math.log(abs(real)) if abs(real) > 0 else -20
+                    vector[2] = real / self._abs_mean
+                else:
+                    log.debug(f'Can\'t encode target value: {real}')
+
+            else:
+                vector = [0] * 4
+                try:
+                    if is_none(real):
+                        vector[0] = 0
+                    else:
+                        vector[0] = 1
+                        vector[1] = math.log(abs(real)) if abs(real) > 0 else -20
+                        vector[2] = 1 if real < 0 and not self.positive_domain else 0
+                        vector[3] = real / self._abs_mean
+                except Exception as e:
+                    vector = [0] * 4
+                    log.error(f'Can\'t encode input value: {real}, exception: {e}')
+
+            ret.append(vector)
+
+        return torch.Tensor(ret)
+
+    def decode(self, encoded_values, decode_log=None) -> list:
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        if decode_log is None:
+            decode_log = self.decode_log
+
+        ret = []
+        if isinstance(encoded_values, torch.Tensor):
+            encoded_values = encoded_values.tolist()
+
+        for vector in encoded_values:
+            if self.is_target:
+                if (np.isnan(vector[0]) or vector[0] == float('inf')
+                        or np.isnan(vector[1]) or vector[1] == float('inf')
+                        or np.isnan(vector[2]) or vector[2] == float('inf')):
+                    log.error(f'Got weird target value to decode: {vector}')
+                    real_value = pow(10, 63)
+                else:
+                    if decode_log:
+                        sign = -1 if vector[0] > 0.5 else 1
+                        try:
+                            real_value = math.exp(vector[1]) * sign
+                        except OverflowError:
+                            real_value = pow(10, 63) * sign
+                    else:
+                        real_value = vector[2] * self._abs_mean
+
+                    if self.positive_domain:
+                        real_value = abs(real_value)
+
+                    if self._type == 'int':
+                        real_value = int(real_value)
+
+            else:
+                if vector[0] < 0.5:
+                    ret.append(None)
+                    continue
+
+                real_value = vector[3] * self._abs_mean
+
+                if self._type == 'int':
+                    real_value = round(real_value)
+
+            if isinstance(real_value, torch.Tensor):
+                real_value = real_value.item()
+            ret.append(real_value)
+        return ret
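+
+A minimal usage sketch (illustrative values). For non-target columns each value
+becomes [is-present flag, log|x|, sign flag, x / mean(|x|)], and None round-trips:
+
+    from lightwood.encoder.numeric import NumericEncoder
+
+    enc = NumericEncoder()
+    enc.prepare([1, 2, 3, 100])
+    vecs = enc.encode([2, None])  # (2, 4) tensor
+    vals = enc.decode(vecs)       # [2, None]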
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/numeric/ts_array_numeric.html b/docs/_modules/lightwood/encoder/numeric/ts_array_numeric.html
new file mode 100644
index 000000000..61267f070
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/numeric/ts_array_numeric.html
@@ -0,0 +1,280 @@

    Source code for lightwood.encoder.numeric.ts_array_numeric

    +import torch
    +import torch.nn.functional as F
    +from lightwood.encoder import BaseEncoder
    +from lightwood.encoder.numeric import TsNumericEncoder
    +
    +
    +
+class TsArrayNumericEncoder(BaseEncoder):
+    """
+    Variant of vanilla numerical encoder, supports dynamic mean re-scaling
+    """
+
+    def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None):
+        super(TsArrayNumericEncoder, self).__init__(is_target=is_target)
+        # time series normalization params
+        self.normalizers = None
+        self.group_combinations = None
+        self.dependencies = grouped_by
+        self.data_window = timesteps
+        self.positive_domain = positive_domain
+        self.sub_encoder = TsNumericEncoder(is_target=is_target, positive_domain=positive_domain, grouped_by=grouped_by)
+        self.output_size = self.data_window * self.sub_encoder.output_size
+
+    def prepare(self, priming_data):
+        if self.is_prepared:
+            raise Exception('You can only call "prepare" once for a given encoder.')
+
+        self.sub_encoder.prepare(priming_data)
+        self.is_prepared = True
+
+    def encode(self, data, dependency_data={}):
+        """
+        :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum
+        :return: tensor with shape (batch, NxK) where N: self.data_window and K: sub-encoder # of output features
+        """  # noqa
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+        if not dependency_data:
+            dependency_data = {'__default': [None] * len(data)}
+
+        ret = []
+        for data_point in data:
+            ret.append(self.sub_encoder.encode([data_point], dependency_data=dependency_data))
+
+        ret = torch.hstack(ret)
+        padding_size = self.output_size - ret.shape[-1]
+
+        if padding_size > 0:
+            ret = F.pad(ret, (0, padding_size))
+
+        return ret
+
+    def decode(self, encoded_values, dependency_data=None, return_all=False):
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        encoded_values = encoded_values.reshape(encoded_values.shape[0],
+                                                self.data_window,
+                                                self.sub_encoder.output_size)
+
+        ret = []
+        for encoded_timestep in torch.split(encoded_values, 1, dim=1):
+            ret.extend(self.sub_encoder.decode(encoded_timestep.squeeze(1), dependency_data=dependency_data))
+
+        return ret
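+
+A minimal usage sketch (illustrative values; with no group-by metadata the
+default normalizer learned in prepare() is used). As implemented above, one
+encode() call processes a single window of `timesteps` values:
+
+    from lightwood.encoder.numeric.ts_array_numeric import TsArrayNumericEncoder
+
+    enc = TsArrayNumericEncoder(timesteps=3)
+    enc.prepare([1.0, 2.0, 3.0, 4.0])  # fits the underlying TsNumericEncoder
+    row = enc.encode([1.0, 2.0, 3.0])  # shape (1, 3 * sub_output_size)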
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/numeric/ts_numeric.html b/docs/_modules/lightwood/encoder/numeric/ts_numeric.html
new file mode 100644
index 000000000..9dc684e9c
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/numeric/ts_numeric.html
@@ -0,0 +1,341 @@

    Source code for lightwood.encoder.numeric.ts_numeric

    +import math
    +import torch
    +import numpy as np
    +from lightwood.encoder.numeric import NumericEncoder
    +from lightwood.helpers.log import log
    +
    +
    +
+class TsNumericEncoder(NumericEncoder):
+    """
+    Variant of vanilla numerical encoder, supports dynamic mean re-scaling
+    """
+    is_timeseries_encoder: bool = True
+
+    def __init__(self, is_target: bool = False, positive_domain: bool = False, grouped_by=None):
+        super(TsNumericEncoder, self).__init__(is_target=is_target, positive_domain=positive_domain)
+        # time series normalization params
+        self.normalizers = None
+        self.group_combinations = None
+        self.dependencies = grouped_by
+        self.output_size = 2 if is_target else 3
+
+    def encode(self, data, dependency_data={}):
+        """
+        :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum
+        """  # noqa
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+        if not dependency_data:
+            dependency_data = {'__default': [None] * len(data)}
+
+        ret = []
+        for real, group in zip(data, list(zip(*dependency_data.values()))):
+            try:
+                real = float(real)
+            except Exception:
+                try:
+                    real = float(real.replace(',', '.'))
+                except Exception:
+                    real = None
+            if self.is_target:
+                vector = [0] * 2
+                if group is not None and self.normalizers is not None:
+                    try:
+                        mean = self.normalizers[frozenset(group)].abs_mean
+                    except KeyError:
+                        # novel group-by, we use default normalizer mean
+                        mean = self.normalizers['__default'].abs_mean
+                else:
+                    mean = self._abs_mean
+
+                if real is not None:
+                    vector[0] = 1 if real < 0 and not self.positive_domain else 0
+                    vector[1] = real / mean if mean != 0 else real
+                else:
+                    raise Exception(f'Can\'t encode target value: {real}')
+
+            else:
+                vector = [0] * 3
+                try:
+                    if real is not None:
+                        vector[0] = 1
+                        vector[1] = 1 if real < 0 and not self.positive_domain else 0
+                        vector[2] = real / self._abs_mean
+                except Exception as e:
+                    log.error(f'Can\'t encode input value: {real}, exception: {e}')
+
+            ret.append(vector)
+
+        return torch.Tensor(ret)
+
+    def decode(self, encoded_values, decode_log=None, dependency_data=None):
+        if not self.is_prepared:
+            raise Exception('You need to call "prepare" before calling "encode" or "decode".')
+
+        if decode_log is None:
+            decode_log = self.decode_log
+
+        ret = []
+        if not dependency_data:
+            dependency_data = {'__default': [None] * len(encoded_values)}
+        if isinstance(encoded_values, torch.Tensor):
+            encoded_values = encoded_values.tolist()
+
+        for vector, group in zip(encoded_values, list(zip(*dependency_data.values()))):
+            if self.is_target:
+                if np.isnan(vector[0]) or vector[0] == float('inf') or np.isnan(vector[1]) or vector[1] == float('inf'):
+                    log.error(f'Got weird target value to decode: {vector}')
+                    real_value = pow(10, 63)
+                else:
+                    if decode_log:
+                        sign = -1 if vector[0] > 0.5 else 1
+                        try:
+                            real_value = math.exp(vector[1]) * sign
+                        except OverflowError:
+                            real_value = pow(10, 63) * sign
+                    else:
+                        if group is not None and self.normalizers is not None:
+                            try:
+                                mean = self.normalizers[frozenset(group)].abs_mean
+                            except KeyError:
+                                # decode new group with default normalizer
+                                mean = self.normalizers['__default'].abs_mean
+                        else:
+                            mean = self._abs_mean
+
+                        real_value = vector[1] * mean if mean != 0 else vector[1]
+
+                    if self.positive_domain:
+                        real_value = abs(real_value)
+
+                    if self._type == 'int':
+                        real_value = int(round(real_value, 0))
+
+            else:
+                if vector[0] < 0.5:
+                    ret.append(None)
+                    continue
+
+                real_value = vector[2] * self._abs_mean
+
+                if self._type == 'int':
+                    real_value = round(real_value)
+
+            ret.append(real_value)
+        return ret
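+
+A minimal usage sketch (illustrative values; without group-by metadata the
+default mean learned in prepare() is used for scaling):
+
+    from lightwood.encoder.numeric import TsNumericEncoder
+
+    enc = TsNumericEncoder()
+    enc.prepare([10, 20, 30])
+    vecs = enc.encode([10, None])  # rows: [present?, negative?, x / mean(|x|)]
+    vals = enc.decode(vecs)        # [10, None]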
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/text/pretrained.html b/docs/_modules/lightwood/encoder/text/pretrained.html
new file mode 100644
index 000000000..ef9f9ef43
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/text/pretrained.html
@@ -0,0 +1,590 @@

    Source code for lightwood.encoder.text.pretrained

    +"""
    +2021.07.16
    +Adding flag "embedmode".
    +
+Embed-mode is meant for when text is one of many columns in the model.
+If the model maps (text) -> output directly, then it's worth just using
+the fine-tuned encoder itself as the "mixer", so to speak; thus, turn embed-mode OFF.
    +
    +This means there are 3 possible modes:
    +
    +(1) Classification
    +    -> Fine tuned, output of encoder is [CLS] embedding
    +    -> Fine tuned, output of encoder is the class value
    +(2) Regression
    +    -> Untrained; output of encoder is [CLS] embedding
    +
+Training with regression is WIP; quantile-binning seems like the best approach,
+but fine-tuning with an MSE loss did not demonstrate decent results, particularly
+because the mixer already seems to address this.
    +
    +2021.03.18
    +
    +## Padding changes the answer slightly in the model.
    +
    +The following text encoder uses huggingface's
    +Distilbert. Internal benchmarks suggest
    +1 epoch of fine tuning is ideal [classification].
+Training ONLY occurs for classification; regression problems
+are not trained, and embeddings are generated directly.
    +
    +See: https://huggingface.co/transformers/training.html
    +for further details.
    +
    +Currently the model supports only distilbert.
    +
    +When instantiating the DistilBertForSeq.Class object,
    +num_labels indicates whether you use classification or regression.
    +
    +See: https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification
    +under the 'labels' command
    +
+For classification we use num_labels = num_classes + 1.
+
+The extra (LAST) label is reserved as the "unknown" label;
+this is different from the original
+distilbert model (prior to 2021.03).
    +
    +TODOs:
    ++ Regression
    ++ Batch encodes() tokenization step
    ++ Look into auto-encoding lower dimensional representations
    +of the output embedding
    ++ Look into regression tuning (will require grad. clipping)
    ++ Look into tuning to the encoded space of output.
    +"""
    +import time
    +import torch
    +from torch.utils.data import DataLoader
    +import os
    +import pandas as pd
    +from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed
    +from lightwood.helpers.device import get_devices
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.log import log
    +from lightwood.helpers.torch import LightwoodAutocast
    +from lightwood.api import dtype
    +from transformers import (
    +    DistilBertModel,
    +    DistilBertForSequenceClassification,
    +    DistilBertTokenizerFast,
    +    AdamW,
    +    get_linear_schedule_with_warmup,
    +)
    +from lightwood.helpers.general import is_none
    +
    +
    +
+class PretrainedLangEncoder(BaseEncoder):
+    """
+    Pretrained language models.
+    Option to train on a target encoding of choice.
+
+    Args:
+        is_target ::Bool; data column is the target of ML.
+        model_name ::str; name of pre-trained model
+        custom_tokenizer ::function; custom tokenizing function
+        batch_size ::int; size of batch
+        max_position_embeddings ::int; max sequence length of input text
+        custom_train ::Bool; If true, trains model on target provided
+        frozen ::Bool; If true, freezes transformer layers during training.
+        epochs ::int; number of epochs to train model with
+        embed_mode ::Bool; If true, assumes the output of the encode() step is the CLS embedding.
+    """
+    is_trainable_encoder: bool = True
+
+    def __init__(
+        self,
+        stop_after: int,
+        is_target=False,
+        model_name="distilbert",
+        custom_tokenizer=None,
+        batch_size=10,
+        max_position_embeddings=None,
+        frozen=False,
+        epochs=1,
+        output_type=None,
+        embed_mode=True,
+    ):
+        super().__init__(is_target)
+
+        self.output_type = output_type
+        self.name = model_name + " text encoder"
+        log.info(self.name)
+
+        self._max_len = max_position_embeddings
+        self._frozen = frozen
+        self._batch_size = batch_size
+        self._epochs = epochs
+
+        # Model setup
+        self._tokenizer = custom_tokenizer
+        self._model = None
+        self.model_type = None
+
+        # TODO: Other LMs; Distilbert is a good balance of speed/performance
+        self._classifier_model_class = DistilBertForSequenceClassification
+        self._embeddings_model_class = DistilBertModel
+        self._tokenizer_class = DistilBertTokenizerFast
+        self._pretrained_model_name = "distilbert-base-uncased"
+
+        self.device, _ = get_devices()
+        self.stop_after = stop_after
+
+        self.embed_mode = embed_mode
+        self.uses_target = True
+        self.output_size = None
+
+        # DEBUGGING!!!
+        if self.embed_mode:
+            log.info("Embedding mode on. [CLS] embedding dim output of encode()")
+        else:
+            log.info("Embedding mode off. Logits are output of encode()")
+    def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series, encoded_target_values: torch.Tensor):
+        """
+        Prepare the encoder by training on the target.
+
+        Expects train/dev priming data as pandas Series, plus the encoded
+        target values as a tensor (used as labels when fine-tuning).
+        """
+        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+        priming_data = pd.concat([train_priming_data, dev_priming_data])
+        priming_data = priming_data.values
+        if self.is_prepared:
+            raise Exception("Encoder is already prepared.")
+
+        # TODO: Make tokenizer custom with partial function; feed custom->model
+        if self._tokenizer is None:
+            self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
+
+        # Replace None values with empty strings
+        priming_data = [x if x is not None else "" for x in priming_data]
+
+        # Checks training data details
+        # TODO: Regression flag; currently training supported for categorical only
+
+        if self.output_type in (dtype.categorical, dtype.binary):
+            log.info("Training model.")
+
+            # Prepare priming data into tokenized form + attention masks
+            text = self._tokenizer(priming_data, truncation=True, padding=True)
+
+            log.info("\tOutput trained is categorical")
+
+            labels = encoded_target_values.argmax(dim=1)
+
+            # Construct the model
+            self._model = self._classifier_model_class.from_pretrained(
+                self._pretrained_model_name,
+                num_labels=len(encoded_target_values[0]),
+            ).to(self.device)
+
+            # Construct the dataset for training
+            xinp = TextEmbed(text, labels)
+            dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True)
+
+            # If max length not set, adjust
+            if self._max_len is None:
+                self._max_len = self._model.config.max_position_embeddings
+
+            if self._frozen:
+                log.info("\tFrozen Model + Training Classifier Layers")
+                """
+                Freeze the base transformer model and train
+                a linear layer on top
+                """
+                # Freeze all the transformer parameters
+                for param in self._model.base_model.parameters():
+                    param.requires_grad = False
+
+                optimizer_grouped_parameters = self._model.parameters()
+
+            else:
+                log.info("\tFine-tuning model")
+                """
+                Fine-tuning parameters with weight decay
+                """
+                no_decay = [
+                    "bias",
+                    "LayerNorm.weight",
+                ]  # decay on all terms EXCLUDING bias/layernorms
+                optimizer_grouped_parameters = [
+                    {
+                        "params": [
+                            p
+                            for n, p in self._model.named_parameters()
+                            if not any(nd in n for nd in no_decay)
+                        ],
+                        "weight_decay": 0.01,
+                    },
+                    {
+                        "params": [
+                            p
+                            for n, p in self._model.named_parameters()
+                            if any(nd in n for nd in no_decay)
+                        ],
+                        "weight_decay": 0.0,
+                    },
+                ]
+
+            optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
+            scheduler = get_linear_schedule_with_warmup(
+                optimizer,
+                num_warmup_steps=0,  # default value for GLUE
+                num_training_steps=len(dataset) * self._epochs,
+            )
+
+            # Train model; declare optimizer earlier if desired.
+            self._tune_model(
+                dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs
+            )
+
+        else:
+            log.info("Target is not classification; Embeddings Generator only")
+
+            self.model_type = "embeddings_generator"
+            self._model = self._embeddings_model_class.from_pretrained(
+                self._pretrained_model_name
+            ).to(self.device)
+
+            # TODO: Not a great flag
+            # Currently, if the task is not classification, you must have
+            # an embedding generator only.
+            if self.embed_mode is False:
+                log.info("Embedding mode must be ON for non-classification targets.")
+                self.embed_mode = True
+
+        self.is_prepared = True
+        encoded = self.encode(priming_data[0:1])
+        self.output_size = len(encoded[0])
+
+    def _tune_model(self, dataset, optim, scheduler, n_epochs=1):
+        """
+        Given a model, train for n_epochs.
+        Specifically intended for tuning; it does NOT use loss/
+        stopping criterion.
+
+        model - torch.nn model;
+        dataset - torch.DataLoader; dataset to train
+        device - torch.device; cuda/cpu
+        log - lightwood.logger.log; log.info output
+        optim - transformers.optimization.AdamW; optimizer
+        scheduler - scheduling params
+        n_epochs - number of epochs to train
+
+        """
+        self._model.train()
+
+        if optim is None:
+            log.info("No opt. provided, setting all params with AdamW.")
+            optim = AdamW(self._model.parameters(), lr=5e-5)
+        else:
+            log.info("Optimizer provided")
+
+        if scheduler is None:
+            log.info("No scheduler provided.")
+        else:
+            log.info("Scheduler provided.")
+
+        started = time.time()
+        for epoch in range(n_epochs):
+            total_loss = 0
+            if time.time() - started > self.stop_after:
+                break
+
+            for batch in dataset:
+                optim.zero_grad()
+
+                with LightwoodAutocast():
+                    inpids = batch["input_ids"].to(self.device)
+                    attn = batch["attention_mask"].to(self.device)
+                    labels = batch["labels"].to(self.device)
+                    outputs = self._model(inpids, attention_mask=attn, labels=labels)
+                    loss = outputs[0]
+
+                total_loss += loss.item()
+
+                loss.backward()
+                optim.step()
+                if scheduler is not None:
+                    scheduler.step()
+
+            self._train_callback(epoch, total_loss / len(dataset))
+
+    def _train_callback(self, epoch, loss):
+        log.info(f"{self.name} at epoch {epoch+1} and loss {loss}!")
+
+    def encode(self, column_data):
+        """
+        TODO: Maybe batch the text up; may take too long
+        Given column data, encode the dataset.
+
+        Currently, returns the embedding of the pre-classifier layer.
+
+        Args:
+            column_data:: [list[str]] list of text data in str form
+
+        Returns:
+            encoded_representation:: [torch.Tensor] N_sentences x Nembed_dim
+        """
+        if self.is_prepared is False:
+            raise Exception("You need to first prepare the encoder.")
+
+        # Set model to testing/eval mode.
+        self._model.eval()
+
+        encoded_representation = []
+
+        with torch.no_grad():
+            for text in column_data:
+
+                # Omit NaNs
+                if is_none(text):
+                    text = ""
+
+                # Tokenize the text with the built-in tokenizer.
+                inp = self._tokenizer.encode(
+                    text, truncation=True, return_tensors="pt"
+                ).to(self.device)
+
+                if self.embed_mode:  # Embedding mode ON; return [CLS]
+                    output = self._model.base_model(inp).last_hidden_state[:, 0]
+
+                    # If the model has a pre-classifier layer, use this embedding.
+                    if hasattr(self._model, "pre_classifier"):
+                        output = self._model.pre_classifier(output)
+
+                else:  # Embedding mode off; return classes
+                    output = self._model(inp).logits
+
+                encoded_representation.append(output.detach())
+
+        return torch.stack(encoded_representation).squeeze(1).to('cpu')
+
+    def decode(self, encoded_values_tensor, max_length=100):
+        raise Exception("Decoder not implemented.")
+
+    def to(self, device, available_devices):
+        for v in vars(self):
+            attr = getattr(self, v)
+            if isinstance(attr, torch.nn.Module):
+                attr.to(device)
+        return self
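+
+A minimal usage sketch (illustrative; downloads DistilBERT weights on first
+use). With a non-categorical output_type the encoder skips fine-tuning and
+acts as an embeddings generator, so the target tensor is only a placeholder:
+
+    import pandas as pd
+    import torch
+    from lightwood.api import dtype
+    from lightwood.encoder.text.pretrained import PretrainedLangEncoder
+
+    enc = PretrainedLangEncoder(stop_after=60, output_type=dtype.float)
+    enc.prepare(pd.Series(['some text', 'more text']), pd.Series(['dev text']),
+                encoded_target_values=torch.zeros(3, 1))
+    emb = enc.encode(['unseen text'])  # (1, embedding_dim) [CLS]-based embedding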
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/text/short.html b/docs/_modules/lightwood/encoder/text/short.html
new file mode 100644
index 000000000..2e2c3ceb9
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/text/short.html
@@ -0,0 +1,331 @@

    Source code for lightwood.encoder.text.short

    +from typing import List
    +import torch
    +from lightwood.encoder import BaseEncoder
    +from lightwood.encoder.categorical import CategoricalAutoEncoder
    +from lightwood.helpers.text import tokenize_text
    +from lightwood.helpers.torch import concat_vectors_and_pad, average_vectors
    +import pandas as pd
    +
    +
    +
+class ShortTextEncoder(BaseEncoder):
+    def __init__(self, is_target=False, mode=None):
+        """
+        :param is_target:
+        :param mode:
+            None or "concat" or "mean".
+            When None, it will be set automatically based on is_target:
+            (is_target) -> 'concat'
+            (not is_target) -> 'mean'
+        """
+        super().__init__(is_target)
+
+        if mode is None:
+            if is_target:
+                self._mode = 'concat'
+            else:
+                self._mode = 'mean'
+        else:
+            if mode not in ['concat', 'mean']:
+                self._unexpected_mode()
+
+            if is_target and mode != 'concat':
+                raise ValueError('mode must be "concat" when is_target=True')
+
+            self._mode = mode
+
+        # Defined in self.prepare()
+        self._combine_fn = None
+        self.max_words_per_sent = None
+        self.cae = CategoricalAutoEncoder(is_target=is_target, max_encoded_length=100)
+        self.is_prepared = False
+
+    def _unexpected_mode(self):
+        raise ValueError('unexpected combine value (must be "mean" or "concat")')
+
+    # defining both of these as normal functions because pickle can't deal with lambdas
+    def _combine_concat(self, vecs):
+        return concat_vectors_and_pad(vecs, self.max_words_per_sent)
+
+    def _combine_mean(self, vecs):
+        return average_vectors(vecs)
+
+    def prepare(self, priming_data):
+        no_null_sentences = (x if x is not None else '' for x in priming_data)
+        unique_tokens = set()
+        max_words_per_sent = 0
+        for sent in no_null_sentences:
+            tokens = tokenize_text(sent)
+            max_words_per_sent = max(max_words_per_sent, len(tokens))
+            for tok in tokens:
+                unique_tokens.add(tok)
+
+        self.cae.prepare(pd.Series(list(unique_tokens)), pd.Series([]))
+
+        if self._mode == 'concat':
+            self.max_words_per_sent = max_words_per_sent
+            self._combine_fn = self._combine_concat
+        elif self._mode == 'mean':
+            self._combine_fn = self._combine_mean
+        else:
+            self._unexpected_mode()
+
+        self.is_prepared = True
+        encoded = self.encode([priming_data[0]])
+        self.output_size = len(encoded[0])
+
+    def encode(self, column_data: List[str]) -> torch.Tensor:
+        no_null_sentences = (x if x is not None else '' for x in column_data)
+        output = []
+        for sent in no_null_sentences:
+            tokens = tokenize_text(sent)
+            encoded_words = self.cae.encode(tokens)
+            encoded_sent = self._combine_fn(encoded_words)
+            output.append(torch.Tensor(encoded_sent))
+        output = torch.stack(output)
+        return output
+
+    def decode(self, vectors):
+        if self._mode == 'concat':
+
+            vec_size = self.cae.max_encoded_length
+
+            output = []
+            for vec in vectors:
+
+                viewed_vec = vec.view(-1, vec_size)
+
+                # Find index of first padding vector
+                for index, v in enumerate(viewed_vec):
+                    if v.abs().sum() == 0:
+                        break
+                else:
+                    index = viewed_vec.size(0)
+
+                out = self.cae.decode(
+                    viewed_vec[:index]
+                )
+
+                output.append(out)
+
+            return output
+
+        elif self._mode == 'mean':
+            raise ValueError('decode is only defined for mode="concat"')
+        else:
+            self._unexpected_mode()
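+
+A minimal usage sketch (illustrative; `sentences` stands for a hypothetical
+corpus with a reasonably large vocabulary, since the inner autoencoder
+batches over the set of distinct tokens):
+
+    from lightwood.encoder.text.short import ShortTextEncoder
+
+    enc = ShortTextEncoder()    # non-target default: 'mean' combining
+    enc.prepare(sentences)      # sentences: list[str]
+    vecs = enc.encode(['a short sentence'])  # one averaged embedding per row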
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/text/vocab.html b/docs/_modules/lightwood/encoder/text/vocab.html
new file mode 100644
index 000000000..8901f32be
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/text/vocab.html
@@ -0,0 +1,252 @@

    Source code for lightwood.encoder.text.vocab

    +import os
    +import torch
    +from transformers import DistilBertTokenizer
    +from lightwood.encoder.base import BaseEncoder
    +
    +
    +
+class VocabularyEncoder(BaseEncoder):
+    def __init__(self, is_target: bool = False):
+        super().__init__(is_target)
+        self._tokenizer_class = DistilBertTokenizer
+        self._pretrained_model_name = 'distilbert-base-uncased'
+        self._max_len = None
+        self._tokenizer = None
+        self._pad_id = None
+
+    def prepare(self, priming_data):
+        os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+        self._max_len = max([len(x) for x in priming_data])
+        self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
+        self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0]
+
+    def encode(self, column_data):
+        vec = []
+        for text in column_data:
+            encoded = self._tokenizer.encode(text[:self._max_len], add_special_tokens=True)
+            encoded = torch.tensor(encoded + [self._pad_id] * (self._max_len - len(encoded)))
+            vec.append(encoded)
+        return torch.stack(vec)
+
+    def decode(self, encoded_values_tensor):
+        vec = []
+        for encoded in encoded_values_tensor:
+            decoded = self._tokenizer.decode(encoded)
+            decoded = decoded.split('[PAD]')[0].rstrip().lstrip().lstrip('[CLS] ').rstrip(' [SEP]')
+            vec.append(decoded)
+        return vec
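+
+A minimal usage sketch (illustrative; downloads the DistilBERT tokenizer on
+first use). Rows are token-id vectors padded to the learned maximum length:
+
+    from lightwood.encoder.text.vocab import VocabularyEncoder
+
+    enc = VocabularyEncoder()
+    enc.prepare(['hello world', 'another sentence here'])
+    ids = enc.encode(['hello world'])  # padded token-id row
+    texts = enc.decode(ids)            # ['hello world']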
\ No newline at end of file
diff --git a/docs/_modules/lightwood/encoder/time_series/rnn.html b/docs/_modules/lightwood/encoder/time_series/rnn.html
new file mode 100644
index 000000000..ccd8765fb
--- /dev/null
+++ b/docs/_modules/lightwood/encoder/time_series/rnn.html
@@ -0,0 +1,717 @@

    Source code for lightwood.encoder.time_series.rnn

    +import time
    +from math import gcd
    +from typing import List
    +from copy import deepcopy
    +
    +import numpy as np
    +import pandas as pd
    +import torch
    +import torch.nn as nn
    +from torch import optim
    +
    +from lightwood.api import dtype
    +from lightwood.helpers.log import log
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.device import get_devices
    +from lightwood.helpers.torch import LightwoodAutocast
    +from lightwood.encoder.datetime import DatetimeNormalizerEncoder
    +from lightwood.encoder.time_series.helpers.rnn_helpers import EncoderRNNNumerical, DecoderRNNNumerical
    +from lightwood.encoder.helpers import MinMaxNormalizer, CatNormalizer
    +from lightwood.helpers.general import get_group_matches
    +from lightwood.encoder.time_series.helpers.transformer_helpers import TransformerEncoder, get_chunk, len_to_mask
    +
    +
    +
+class TimeSeriesEncoder(BaseEncoder):
+    """
+    Time series encoder. This module can learn features for any `order_by` temporal column, both with and without accompanying target data.
+
+    The backbone of this encoder is either a recurrent neural network or a transformer; both structured in an encoder-decoder fashion.
+    """  # noqa
+    is_timeseries_encoder: bool = True
+    is_trainable_encoder: bool = True
+
+    def __init__(self, stop_after: int, is_target=False, original_type: str = None, target: str = None,
+                 grouped_by: List[str] = [], encoder_type='rnn'):
+        super().__init__(is_target)
+        self.device, _ = get_devices()
+        self.target = target
+        self.grouped_by = grouped_by
+        self._learning_rate = 0.01
+        self.output_size = 128
+        self._transformer_hidden_size = None
+        self._epochs = int(1e5)  # default training epochs
+        self._stop_on_n_bad_epochs = 5  # stop training after N epochs where loss is worse than running avg
+        self._epochs_running_avg = 5  # amount of epochs for running average
+        self._pytorch_wrapper = torch.FloatTensor
+        self.is_prepared = False
+        self._is_setup = False
+        self._max_ts_length = 0
+        self._sos = 0.0  # start of sequence for decoding
+        self._eos = 0.0  # end of input sequence -- padding value for batches
+        self._n_dims = 1
+        self._normalizer = None
+        self.dep_norms = {}  # dict of dict of normalizers for each dependency (can be grouped-by some column)
+        self._target_type = None
+        self._group_combinations = None
+        self.original_type = original_type
+        self.stop_after = stop_after
+        if encoder_type.lower() == 'rnn':
+            self.encoder_class = EncoderRNNNumerical
+        elif encoder_type.lower() == 'transformer':
+            self.encoder_class = TransformerEncoder
+
+    def setup_nn(self, ts_analysis, dependencies=None):
+        """This method must be executed after initializing, else types are unassigned"""
+        if self.original_type in (dtype.datetime, dtype.date):
+            self._normalizer = DatetimeNormalizerEncoder(sinusoidal=True)
+            self._n_dims *= len(self._normalizer.fields) * 2  # sinusoidal datetime components
+        elif self.original_type in (dtype.float, dtype.integer):
+            self._normalizer = MinMaxNormalizer()
+
+        total_dims = self._n_dims
+        dec_hsize = self.output_size
+
+        if dependencies:
+            for dep_name, dep in dependencies.items():
+                self.dependencies.append(dep_name)
+
+                if dep_name in self.grouped_by:
+                    continue  # we only use group column for indexing and selecting rows
+
+                assert dep['original_type'] in (dtype.categorical, dtype.binary,
+                                                dtype.integer, dtype.float, dtype.tsarray)
+
+                if f'__mdb_ts_previous_{self.target}' == dep_name:
+                    self.dep_norms[dep_name] = ts_analysis['target_normalizers']
+                    self._group_combinations = ts_analysis['group_combinations']
+                    self._target_type = dep['original_type']
+
+                # if TS analysis yields no normalizers for this dependency, we create a generic one based on its dtype
+                else:
+                    if dep['original_type'] in (dtype.categorical, dtype.binary):
+                        self.dep_norms[dep_name]['__default'] = CatNormalizer()
+                    else:
+                        self.dep_norms[dep_name]['__default'] = MinMaxNormalizer()
+
+                    self.dep_norms[dep_name]['__default'].prepare(dep['data'])
+                    self._group_combinations = {'__default': None}
+
+                # add descriptor size to the total encoder output dimensionality
+                if dep['original_type'] in (dtype.categorical, dtype.binary):
+                    total_dims += len(self.dep_norms[dep_name]['__default'].scaler.categories_[0])
+                elif dep['original_type'] in (dtype.integer, dtype.float, dtype.tsarray):
+                    total_dims += 1
+
+        if self.encoder_class == EncoderRNNNumerical:
+            self._enc_criterion = nn.MSELoss()
+            self._dec_criterion = self._enc_criterion
+            self._encoder = self.encoder_class(input_size=total_dims,
+                                               hidden_size=self.output_size).to(self.device)
+        elif self.encoder_class == TransformerEncoder:
+            self._enc_criterion = self._masked_criterion
+            self._dec_criterion = nn.MSELoss()
+            self._base_criterion = nn.MSELoss(reduction="none")
+            if self._transformer_hidden_size is None:
+                self._transformer_hidden_size = total_dims * 2  # arbitrary
+
+            self._encoder = self.encoder_class(ninp=total_dims,
+                                               nhead=gcd(dec_hsize, total_dims),
+                                               nhid=self._transformer_hidden_size,
+                                               nlayers=1).to(self.device)
+        else:
+            raise Exception(f"Time series encoder class not supported: {self.encoder_class}")
+
+        self._decoder = DecoderRNNNumerical(output_size=total_dims, hidden_size=dec_hsize).to(self.device)
+        self._parameters = list(self._encoder.parameters()) + list(self._decoder.parameters())
+        self._optimizer = optim.AdamW(self._parameters, lr=self._learning_rate, weight_decay=1e-4)
+        self._n_dims = total_dims
+        self._is_setup = True
+
+    def to(self, device, available_devices):
+        if self._is_setup:
+            self.device = device
+            return super().to(device, available_devices)
+        return self
+
+    def _prepare_raw_data(self, data):
+        """Convert to array and determine max length"""
+        out_data = []
+        for e in data:
+            if not isinstance(e, torch.Tensor):
+                e = np.array(e, dtype=float)
+                e[np.isnan(e)] = 0.0
+                t = torch.tensor(e, dtype=torch.float)
+            else:
+                t = e.float()
+                t[torch.isnan(t)] = 0.0
+            out_data.append(t)
+        lengths = torch.tensor([len(e) for e in data], dtype=torch.float)
+        return out_data, lengths
+
+    def _get_batch(self, source, start, end):
+        end = min(end, len(source))
+        return source[start:end]
+
[docs]    def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series, dependency_data={}, ts_analysis=None,
                feedback_hoop_function=log.info, batch_size=256):
        """
        :param train_priming_data: training split of the (self._n_dims)-dimensional time series [[dim1_data], ...]
        :param dev_priming_data: dev split of the time series; it is concatenated with the training split for priming
        :param dependency_data: raw data from other columns
        :param ts_analysis: dictionary with time analysis info (e.g. normalizers for each target group)
        :param feedback_hoop_function: method to use if you want to get feedback on the training process
        :param batch_size: training batch size
        """
        priming_data = pd.concat([train_priming_data, dev_priming_data])
        priming_data = list(priming_data.values)

        if self.is_prepared:
            raise Exception('You can only call "prepare" once for a given encoder.')
        else:
            self.setup_nn(ts_analysis, dependency_data)

        started = time.time()

        # Convert to array and determine max length
        priming_data, lengths_data = self._prepare_raw_data(priming_data)
        self._max_ts_length = int(lengths_data.max())

        if self._normalizer:
            self._normalizer.prepare(priming_data)
            priming_data = self._normalizer.encode(priming_data).to(self.device)
            if len(priming_data.shape) < 3:
                priming_data = priming_data.unsqueeze(-1)
        else:
            priming_data = torch.stack([d for d in priming_data]).unsqueeze(-1).to(self.device)

        # merge all normalized data into a training batch
        normalized_tensors = []
        for dep_name, dep_data in dependency_data.items():
            if dep_name in self.grouped_by:
                continue
            if dep_data['original_type'] in (dtype.integer, dtype.float):
                dep_data['group_info'] = {group: dependency_data[group]['data'] for group in self.grouped_by}
                data = torch.zeros((len(priming_data), lengths_data.max().int().item(), 1))
                all_idxs = set(range(len(data)))
                for group_name, normalizer in self.dep_norms[dep_name].items():
                    if group_name != '__default':
                        idxs, subset = get_group_matches(dep_data, normalizer.combination)
                        normalized = normalizer.encode(subset).unsqueeze(-1)
                        data[idxs, :, :] = normalized
                        all_idxs -= set(idxs)
                if len(all_idxs) > 0 and '__default' in self.dep_norms[dep_name].keys():
                    default_norm = self.dep_norms[dep_name]['__default']
                    subset = [dep_data['data'][idx] for idx in list(all_idxs)]
                    data[list(all_idxs), :, :] = torch.Tensor(default_norm.encode(subset)).unsqueeze(-1)

            else:
                # categorical has only one normalizer at all times
                normalizer = self.dep_norms[dep_name]['__default']
                data = normalizer.encode(dep_data['data'].values)
                if len(data.shape) < 3:
                    data = data.unsqueeze(-1)  # add feature dimension
            data[torch.isnan(data)] = 0.0
            normalized_tensors.append(data)

        if normalized_tensors:
            normalized_data = torch.cat(normalized_tensors, dim=-1).to(self.device)
            priming_data = torch.cat([priming_data, normalized_data], dim=-1)

        self._encoder.train()
        running_losses = np.full(self._epochs_running_avg, np.nan)
        bad_epochs = 0

        for epoch in range(self._epochs):
            average_loss = 0

            for batch_idx in range(0, len(priming_data), batch_size):
                # setup loss and optimizer
                self._optimizer.zero_grad()
                loss = 0

                # shape: (batch_size, timesteps, n_dims)
                batch = self._get_batch(priming_data, batch_idx, min(batch_idx + batch_size, len(priming_data)))

                # encode and decode through time
                with LightwoodAutocast():
                    if self.encoder_class == TransformerEncoder:
                        # pack batch length info tensor
                        len_batch = self._get_batch(lengths_data, batch_idx, min(
                            batch_idx + batch_size, len(priming_data)))
                        batch = batch, len_batch

                        next_tensor, hidden_state, enc_loss = self._encoder.bptt(
                            batch, self._enc_criterion, self.device)
                        loss += enc_loss

                    else:
                        next_tensor, hidden_state, enc_loss = self._encoder.bptt(
                            batch, self._enc_criterion, self.device)
                        loss += enc_loss

                        next_tensor, hidden_state, dec_loss = self._decoder.decode(
                            batch, next_tensor, self._dec_criterion, self.device, hidden_state=hidden_state)
                        loss += dec_loss

                loss.backward()

                self._optimizer.step()
                average_loss += loss.item()

            average_loss = average_loss / len(priming_data)

            if epoch > self._epochs_running_avg and average_loss > np.average(running_losses):
                bad_epochs += 1

            # update running loss
            running_losses[:-1] = running_losses[1:]
            running_losses[-1] = average_loss

            if feedback_hoop_function is not None:
                feedback_hoop_function(
                    "time series encoder epoch [{epoch_n}/{total}] average_loss = {average_loss}".format(
                        epoch_n=epoch + 1, total=self._epochs, average_loss=average_loss))

            if bad_epochs > self._stop_on_n_bad_epochs:
                break
            elif (time.time() - started) > self.stop_after:
                break

        self.is_prepared = True
    + + def _encode_one(self, data, previous=None, initial_hidden=None, return_next_value=False): + """ + This method encodes one single row of serial data + :param data: multidimensional time series as list of lists [[dim1_data], [dim2_data], ...] + (dim_data: string with format "x11, x12, ... x1n") + :param initial_hidden: if you want to encode from an initial hidden state other than 0s + :param return_next_value: if you want to return the next value in the time series too + + :return: either encoded_value or (encoded_value, next_value) + """ + self._encoder.eval() + with torch.no_grad(): + # Convert to array and determine max length + data, lengths_data = self._prepare_raw_data(data) + self._max_ts_length = int(lengths_data.max()) + + if self._normalizer: + data = self._normalizer.encode(data).to(self.device) + if len(data.shape) < 3: + data = data.unsqueeze(-1) + else: + data = torch.stack([d for d in data]).unsqueeze(-1).to(self.device) + + if previous is not None: + target_tensor = torch.stack(previous).to(self.device) + target_tensor[torch.isnan(target_tensor)] = 0.0 + if len(target_tensor.shape) < 3: + target_tensor = target_tensor.transpose(0, 1).unsqueeze(0) + data_tensor = torch.cat((data, target_tensor), dim=-1) + else: + data_tensor = data + + steps = data_tensor.shape[1] + + if self.encoder_class == EncoderRNNNumerical: + encoder_hidden = self._encoder.init_hidden(self.device) + encoder_hidden = encoder_hidden if initial_hidden is None else initial_hidden + + next_tensor = None + for tensor_i in range(steps): + next_tensor, encoder_hidden = self._encoder.forward(data_tensor[:, tensor_i, :].unsqueeze(dim=0), + encoder_hidden) + + else: + next_tensor = None + len_batch = self._get_batch(lengths_data, 0, len(data)) + batch_size, timesteps, _ = data_tensor.shape + + for start_chunk in range(0, timesteps, timesteps): + data, targets, lengths_chunk = get_chunk(data_tensor, len_batch, start_chunk, timesteps) + data = data.transpose(0, 1) + next_tensor, encoder_hidden = self._encoder.forward(data, lengths_chunk, self.device) + + if return_next_value: + return encoder_hidden, next_tensor + else: + return encoder_hidden + +
    [docs] def encode(self, column_data, dependency_data=None, get_next_count=None): + """ + Encode a list of time series data + :param column_data: a list of (self._n_dims)-dimensional time series [[dim1_data], ...] to encode + :param get_next_count: default None, but you can pass a number X and it will return the X following predictions + on the series for each ts_data_point in column_data + :return: a list of encoded time series or if get_next_count !=0 two lists (encoded_values, projected_numbers) + """ + + if not self.is_prepared: + raise Exception('You need to call "prepare" before calling "encode" or "decode".') + + if isinstance(column_data, pd.Series): + data = deepcopy(column_data.values) # get a copy to avoid modifying the actual data frame + else: + data = column_data + + for i in range(len(data)): + if not isinstance(data[i][0], list): + data[i] = [data[i]] # add dimension for 1D timeseries + + # include autoregressive target data + ptd = [] + if dependency_data is not None: + for dep, dep_data in dependency_data.items(): + if dep in self.grouped_by: + continue + # normalize numerical target per group-by + if self._target_type in (dtype.integer, dtype.float, dtype.tsarray): + dep_info = { + 'group_info': {group: dependency_data[group] for group in self.grouped_by}, + 'data': dep_data + } + tensor = torch.zeros((len(dep_data), len(dep_data[0]), 1)).to(self.device) + all_idxs = set(range(len(dep_data))) + + for combination in [c for c in self._group_combinations if c != '__default']: + normalizer = self.dep_norms[dep].get(frozenset(combination), None) + if normalizer is None: + normalizer = self.dep_norms[dep]['__default'] + idxs, subset = get_group_matches(dep_info, normalizer.combination) + if idxs: + tensor[idxs, :, :] = torch.Tensor(normalizer.encode(subset)).unsqueeze(-1).to(self.device) + all_idxs -= set(idxs) + + # encode all remaining rows (not belonging to any grouped combination) with default normalizer + if all_idxs: + default_norm = self.dep_norms[dep]['__default'] + subset = [dep_data[idx] for idx in all_idxs] + tensor[list(all_idxs), :, :] = torch.Tensor( + default_norm.encode(subset)).unsqueeze(-1).to(self.device) + tensor[torch.isnan(tensor)] = 0.0 + + # normalize categorical target + else: + normalizer = self.dep_norms[dep]['__default'] + tensor = normalizer.encode(dep_data) + tensor[torch.isnan(tensor)] = 0.0 + + ptd.append(tensor) + + ret = [] + next = [] + + for i, val in enumerate(data): + if get_next_count is None: + if dependency_data is not None and len(dependency_data) > 0 and len(ptd) > 0: + encoded = self._encode_one(val, previous=[values[i] for values in ptd]) + else: + encoded = self._encode_one(val) + + else: + if get_next_count <= 0: + raise Exception('get_next_count must be greater than 0') + + hidden = None + vector = val + next_i = [] + + for j in range(get_next_count): + hidden, next_reading = self._encode_one(vector, initial_hidden=hidden, return_next_value=True) + vector = [next_reading] + if j == 0: + encoded = hidden + next_i.append(next_reading) + + next_value = next_i[0][0].cpu() + + if self._normalizer: + next_value = torch.Tensor(self._normalizer.decode(next_value)) + + next.append(next_value) + + ret.append(encoded[0][0].cpu()) + + if get_next_count is None: + return torch.stack(ret) + else: + return torch.stack(ret), torch.stack(next)
    + + def _decode_one(self, hidden, steps): + """ + Decodes a single time series from its encoded representation. + :param hidden: time series embedded representation tensor, with size self.output_size + :param steps: as in decode(), defines how many values to output when reconstructing + :return: decoded time series list + """ + self._decoder.eval() + with torch.no_grad(): + ret = [] + next_tensor = torch.full((1, 1, self._n_dims), self._sos, dtype=torch.float32).to(self.device) + timesteps = steps if steps else self._max_ts_length + for _ in range(timesteps): + next_tensor, hidden = self._decoder.forward(next_tensor, hidden) + ret.append(next_tensor) + return torch.stack(ret) + +
[docs]    def decode(self, encoded_data, steps=None):
        """
        Decode a list of embedded multidimensional time series
        :param encoded_data: a list of embeddings [e1, e2, ...] to be decoded into time series
        :param steps: fixed number of timesteps to reconstruct from each embedding.
        If None, the decoder will output the largest length encountered during training.
        :return: a list of reconstructed time series
        """
        if not self.is_prepared:
            raise Exception('You need to call "prepare" before calling "encode" or "decode".')

        ret = []
        for _, val in enumerate(encoded_data):
            hidden = torch.unsqueeze(torch.unsqueeze(val, dim=0), dim=0).to(self.device)
            reconstruction = self._decode_one(hidden, steps).cpu().squeeze().T.numpy()

            if self._n_dims == 1:
                reconstruction = reconstruction.reshape(1, -1)

            if self._normalizer:
                reconstruction = self._normalizer.decode(reconstruction)

            ret.append(reconstruction)

        return torch.Tensor(ret)
    + + def _masked_criterion(self, output, targets, lengths): + """ Computes the loss of the first `lengths` items in the chunk """ + # Put in (B, T) format and zero-out the unnecessary values + mask = len_to_mask(lengths, zeros=False).t() + + # Inflate to feature dimension + mask = mask.unsqueeze(-1).repeat(1, 1, output.shape[-1]) + output = output * mask + targets = targets * mask + + # compute the loss with respect to the appropriate lengths and average across the batch-size + # We compute for every output (x_i)_i=1^L and target (y_i)_i=1^L, loss = 1/L \sum (x_i - y_i)^2 + # And average across the mini-batch + losses = self._base_criterion(output, targets).sum(dim=2).sum(dim=0) + + # The TBPTT will compute a slightly different loss, but it is not problematic + loss = torch.dot((1.0 / lengths.float()), losses) / len(losses) + + return loss
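A minimal end-to-end usage sketch of this encoder. All data, sizes and hyperparameters below are illustrative (not from the lightwood docs), and `stop_after` is a training time budget in seconds:

import pandas as pd
from lightwood.api import dtype
from lightwood.encoder.time_series.rnn import TimeSeriesEncoder

# Illustrative numeric windows of equal length
train = pd.Series([[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5]] * 20)
dev = pd.Series([[0.3, 0.4, 0.5, 0.6]] * 4)

enc = TimeSeriesEncoder(stop_after=10, original_type=dtype.float, encoder_type='rnn')
enc.prepare(train, dev, dependency_data={}, ts_analysis=None, batch_size=8)

emb = enc.encode(pd.Series([[0.4, 0.5, 0.6, 0.7]]))  # one embedding per series
recon = enc.decode(emb, steps=4)                     # reconstruct 4 timesteps per embedding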
diff --git a/docs/_modules/lightwood/ensemble/base.html b/docs/_modules/lightwood/ensemble/base.html
new file mode 100644
index 000000000..167773144
--- /dev/null
+++ b/docs/_modules/lightwood/ensemble/base.html
@@ -0,0 +1,253 @@

    Source code for lightwood.ensemble.base

    +from typing import List
    +
    +import pandas as pd
    +
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.api.types import PredictionArguments
    +
    +
    +
    [docs]class BaseEnsemble: + """ + Base class for all ensembles. + + Ensembles wrap sets of Lightwood mixers, with the objective of generating better predictions based on the output of each mixer. + + There are two important methods for any ensemble to work: + 1. `__init__()` should prepare all mixers and internal ensemble logic. + 2. `__call__()` applies any aggregation rules to generate final predictions based on the output of each mixer. + + Class Attributes: + - mixers: List of mixers the ensemble will use. + - supports_proba: For classification tasks, whether the ensemble supports yielding per-class scores rather than only returning the predicted label. + + """ # noqa + data: EncodedDs + mixers: List[BaseMixer] + best_index: int # @TODO: maybe only applicable to BestOf + supports_proba: bool + + def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs) -> None: + self.data = data + self.mixers = mixers + self.best_index = 0 + self.supports_proba = False + + def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame: + raise NotImplementedError()
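For orientation, a custom ensemble only needs to honor the contract described above: prepare mixers in `__init__()` and aggregate in `__call__()`. A minimal sketch; the `FirstMixerEnsemble` class and its trivial pass-through rule are illustrative, not part of lightwood:

import pandas as pd

from lightwood.ensemble.base import BaseEnsemble
from lightwood.data.encoded_ds import EncodedDs
from lightwood.api.types import PredictionArguments


class FirstMixerEnsemble(BaseEnsemble):
    """Illustrative ensemble: always defers to the first mixer in the list."""

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        # Aggregation rule: trust mixers[0] unconditionally
        return self.mixers[0](ds, args=args)[['prediction']]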
diff --git a/docs/_modules/lightwood/ensemble/best_of.html b/docs/_modules/lightwood/ensemble/best_of.html
new file mode 100644
index 000000000..325c121a5
--- /dev/null
+++ b/docs/_modules/lightwood/ensemble/best_of.html
@@ -0,0 +1,280 @@

    Source code for lightwood.ensemble.best_of

    +from typing import List, Optional
    +
    +import numpy as np
    +import pandas as pd
    +
    +from lightwood.helpers.log import log
    +from lightwood.helpers.numeric import is_nan_numeric
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.ensemble.base import BaseEnsemble
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.helpers.general import evaluate_accuracy
    +
    +
    +
[docs]class BestOf(BaseEnsemble):
    """
    This ensemble acts as a mixer selector.
    After evaluating accuracy for all internal mixers with the validation data, it sets the best mixer as the underlying model.
    """  # noqa
    indexes_by_accuracy: List[int]

    def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, accuracy_functions,
                 args: PredictionArguments, ts_analysis: Optional[dict] = None) -> None:
        super().__init__(target, mixers, data)

        score_list = []
        for _, mixer in enumerate(mixers):
            score_dict = evaluate_accuracy(
                data.data_frame,
                mixer(data, args)['prediction'],
                target,
                accuracy_functions,
                ts_analysis=ts_analysis
            )
            avg_score = np.mean(list(score_dict.values()))
            log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}')

            if is_nan_numeric(avg_score):
                avg_score = -pow(2, 63)
                log.warning(f'Invalid accuracy for mixer {type(mixer).__name__}; replaced with fallback value: {avg_score}')

            score_list.append(avg_score)

        self.indexes_by_accuracy = list(reversed(np.array(score_list).argsort()))
        self.supports_proba = self.mixers[self.indexes_by_accuracy[0]].supports_proba
        log.info(f'Picked best mixer: {type(self.mixers[self.indexes_by_accuracy[0]]).__name__}')

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        if args.all_mixers:
            predictions = {}
            for mixer in self.mixers:
                predictions[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction']
            return pd.DataFrame(predictions)
        else:
            for mixer_index in self.indexes_by_accuracy:
                mixer = self.mixers[mixer_index]
                try:
                    return mixer(ds, args=args)
                except Exception as e:
                    if mixer.stable:
                        raise e
                    else:
                        log.warning(f'Unstable mixer {type(mixer).__name__} failed with exception: {e}. '
                                    f'Trying next best')
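The selection order above is simply a reversed `argsort` over the per-mixer average accuracies, so the best-scoring mixer is tried first at inference time. A toy illustration (scores are made up):

import numpy as np

score_list = [0.61, 0.75, 0.48]                    # illustrative per-mixer accuracies
order = list(reversed(np.array(score_list).argsort()))
print(order)                                       # [1, 0, 2]: mixer 1 is tried first, then 0, then 2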
diff --git a/docs/_modules/lightwood/ensemble/mean_ensemble.html b/docs/_modules/lightwood/ensemble/mean_ensemble.html
new file mode 100644
index 000000000..02194ed9c
--- /dev/null
+++ b/docs/_modules/lightwood/ensemble/mean_ensemble.html
@@ -0,0 +1,241 @@

    Source code for lightwood.ensemble.mean_ensemble

    +from typing import List
    +
    +import pandas as pd
    +
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.ensemble.base import BaseEnsemble
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood import dtype
    +
    +
    +
[docs]class MeanEnsemble(BaseEnsemble):
    def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, dtype_dict: dict) -> None:
        super().__init__(target, mixers, data)
        if dtype_dict[target] not in (dtype.float, dtype.integer, dtype.quantity):
            raise Exception(
                f'This ensemble can only be used for regression problems! Got target dtype {dtype_dict[target]} instead!')

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        predictions_df = pd.DataFrame()
        for mixer in self.mixers:
            predictions_df[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction']

        return pd.DataFrame(predictions_df.mean(axis='columns'), columns=['prediction'])
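The aggregation itself is a plain row-wise mean over the per-mixer prediction columns. A toy illustration (mixer names and values are made up):

import pandas as pd

df = pd.DataFrame({'__mdb_mixer_Neural': [1.0, 2.0],
                   '__mdb_mixer_LightGBM': [3.0, 4.0]})    # illustrative mixer outputs
out = pd.DataFrame(df.mean(axis='columns'), columns=['prediction'])
print(out['prediction'].tolist())                          # [2.0, 3.0]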
diff --git a/docs/_modules/lightwood/ensemble/mode_ensemble.html b/docs/_modules/lightwood/ensemble/mode_ensemble.html
new file mode 100644
index 000000000..710aee7ea
--- /dev/null
+++ b/docs/_modules/lightwood/ensemble/mode_ensemble.html
@@ -0,0 +1,296 @@

    Source code for lightwood.ensemble.mode_ensemble

    +from typing import List, Optional, Dict
    +
    +import pandas as pd
    +import numpy as np
    +
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.ensemble.base import BaseEnsemble
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood import dtype
    +from lightwood.helpers.general import evaluate_accuracy
    +from lightwood.helpers.numeric import is_nan_numeric
    +from lightwood.helpers.log import log
    +
    +
    +
[docs]class ModeEnsemble(BaseEnsemble):
    mixer_scores: Dict[str, float]

    def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, dtype_dict: dict,
                 accuracy_functions, args: PredictionArguments, ts_analysis: Optional[dict] = None) -> None:
        super().__init__(target, mixers, data)
        self.mixer_scores = {}

        if dtype_dict[target] not in (dtype.binary, dtype.categorical, dtype.tags):
            raise Exception(
                'This ensemble can only be used in classification problems! ' +
                f'Got target dtype {dtype_dict[target]} instead!')

        for _, mixer in enumerate(mixers):
            score_dict = evaluate_accuracy(
                data.data_frame,
                mixer(data, args)['prediction'],
                target,
                accuracy_functions,
                ts_analysis=ts_analysis
            )
            avg_score = np.mean(list(score_dict.values()))
            log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}')

            if is_nan_numeric(avg_score):
                avg_score = -pow(2, 63)
                log.warning(f'Invalid accuracy for mixer {type(mixer).__name__}; replaced with fallback value: {avg_score}')

            self.mixer_scores[f'__mdb_mixer_{type(mixer).__name__}'] = avg_score

    def _pick_mode_highest_score(self, prediction: pd.Series):
        """If the predictions are unimodal, return the mode. If there are multiple modes, return the mode whose voting
        mixers have the highest score."""
        prediction_counts = prediction.value_counts()

        # If there is a clear winner, i.e. only one distinct prediction
        if len(prediction_counts) == 1:
            return prediction_counts.index[0]

        counts = prediction_counts.values  # how many times each prediction has appeared
        max_count = np.max(counts)  # how many times the most frequent predictions have appeared

        # most frequent predictions and how many times they appeared
        modes = prediction_counts[prediction_counts == max_count]

        modes_predictions = modes.index  # most frequent predictions

        # For each mode, get the sum of the scores of the predictors who voted for it
        modes_predictions_scores = {}
        for mode_prediction in modes_predictions:
            voting_mixers_name = prediction[prediction == mode_prediction].index.tolist()
            modes_predictions_scores[mode_prediction] = np.sum(
                [self.mixer_scores[mixer_name] for mixer_name in voting_mixers_name])

        # Return the mode with the maximum sum of accuracies
        return max(modes_predictions_scores, key=modes_predictions_scores.get)

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        predictions_df = pd.DataFrame()
        for mixer in self.mixers:
            predictions_df[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction']

        mode_df = predictions_df.apply(func=self._pick_mode_highest_score, axis='columns')

        return pd.DataFrame(mode_df, columns=['prediction'])
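The tie-breaking rule in `_pick_mode_highest_score` can be reproduced in isolation. A toy illustration (mixer names, votes and scores are made up):

import numpy as np
import pandas as pd

# Votes from three mixers for one row, plus their validation scores
votes = pd.Series({'__mdb_mixer_A': 'cat', '__mdb_mixer_B': 'dog', '__mdb_mixer_C': 'dog'})
scores = {'__mdb_mixer_A': 0.9, '__mdb_mixer_B': 0.6, '__mdb_mixer_C': 0.7}

counts = votes.value_counts()
modes = counts[counts == counts.values.max()].index        # the most frequent label(s)
best = max(modes, key=lambda m: np.sum([scores[n] for n in votes[votes == m].index]))
print(best)                                                # 'dog': two voters, combined score 1.3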
diff --git a/docs/_modules/lightwood/ensemble/weighted_mean_ensemble.html b/docs/_modules/lightwood/ensemble/weighted_mean_ensemble.html
new file mode 100644
index 000000000..831e4fc37
--- /dev/null
+++ b/docs/_modules/lightwood/ensemble/weighted_mean_ensemble.html
@@ -0,0 +1,273 @@

    Source code for lightwood.ensemble.weighted_mean_ensemble

    +from typing import List, Optional
    +
    +import numpy as np
    +import pandas as pd
    +
    +from lightwood.helpers.log import log
    +from lightwood.helpers.numeric import is_nan_numeric
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.ensemble.base import BaseEnsemble
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.helpers.general import evaluate_accuracy
    +from lightwood import dtype
    +
    +
    +
[docs]class WeightedMeanEnsemble(BaseEnsemble):
    def __init__(self, target, mixers: List[BaseMixer], data: EncodedDs, args: PredictionArguments,
                 dtype_dict: dict, accuracy_functions, ts_analysis: Optional[dict] = None) -> None:
        super().__init__(target, mixers, data)
        if dtype_dict[target] not in (dtype.float, dtype.integer, dtype.quantity):
            raise Exception(
                f'This ensemble can only be used for regression problems! Got target dtype {dtype_dict[target]} instead!')

        score_list = []
        for _, mixer in enumerate(mixers):
            score_dict = evaluate_accuracy(
                data.data_frame,
                mixer(data, args)['prediction'],
                target,
                accuracy_functions,
                ts_analysis=ts_analysis
            )
            avg_score = np.mean(list(score_dict.values()))
            log.info(f'Mixer: {type(mixer).__name__} got accuracy: {avg_score}')

            if is_nan_numeric(avg_score):
                log.warning(f'Could not compute a valid accuracy for mixer: {type(mixer).__name__}, '
                            f'functions: {accuracy_functions}, yielded invalid average score {avg_score}, '
                            f'resetting it to -pow(2, 63) instead.')
                avg_score = -pow(2, 63)

            score_list.append(avg_score)

        self.weights = self.accuracies_to_weights(np.array(score_list))

    def __call__(self, ds: EncodedDs, args: PredictionArguments) -> pd.DataFrame:
        df = pd.DataFrame()
        for mixer in self.mixers:
            df[f'__mdb_mixer_{type(mixer).__name__}'] = mixer(ds, args=args)['prediction']

        avg_predictions_df = df.apply(lambda x: np.average(x, weights=self.weights), axis='columns')
        return pd.DataFrame(avg_predictions_df, columns=['prediction'])

    def accuracies_to_weights(self, x: np.array) -> np.array:
        # Converts accuracies to weights using the softmax function.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()
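`accuracies_to_weights` is a standard softmax over the accuracy scores, so better mixers receive proportionally larger weights. A toy illustration (accuracies are made up):

import numpy as np

accs = np.array([0.5, 0.7, 0.9])      # illustrative mixer accuracies
e_x = np.exp(accs - np.max(accs))
weights = e_x / e_x.sum()             # softmax: higher accuracy -> larger weight
print(weights.round(3))               # approximately [0.269 0.329 0.402]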
diff --git a/docs/_modules/lightwood/mixer/base.html b/docs/_modules/lightwood/mixer/base.html
new file mode 100644
index 000000000..8ed89c2f1
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/base.html
@@ -0,0 +1,283 @@

    Source code for lightwood.mixer.base

    +import pandas as pd
    +
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.api.types import PredictionArguments
    +
    +
    +
[docs]class BaseMixer:
    """
    Base class for all mixers.

    Mixers are the backbone of all Lightwood machine learning models. They intake encoded feature representations for every column, and are tasked with learning to fulfill the predictive requirements stated in a problem definition.

    There are two important methods for any mixer to work:
        1. `fit()` contains all logic to train the mixer with the training data that has been encoded by all the (already trained) Lightwood encoders for any given task.
        2. `__call__()` is executed to generate predictions once the mixer has been trained using `fit()`.

    An additional `partial_fit()` method is used to update any mixer that has already been trained.

    Class Attributes:
    - stable: If set to `True`, this mixer should always work. Any mixer with `stable=False` can be expected to fail under some circumstances.
    - fit_data_len: Length of the training data.
    - supports_proba: For classification tasks, whether the mixer supports yielding per-class scores rather than only returning the predicted label.

    """  # noqa
    stable: bool
    fit_data_len: int  # @TODO (Patricio): should this really be in `BaseMixer`?
    supports_proba: bool

    def __init__(self, stop_after: int):
        """
        Initialize a mixer.

        :param stop_after: Time budget to train this mixer.
        """
        self.stop_after = stop_after
        self.supports_proba = False
    [docs] def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + """ + Fits/trains a mixer with training data. + + :param train_data: encoded representations of the training data subset. + :param dev_data: encoded representations of the "dev" data subset. This can be used as an internal validation subset (e.g. it is used for early stopping in the default `Neural` mixer). + + """ # noqa + raise NotImplementedError()
    def __call__(self, ds: EncodedDs,
                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
        """
        Calls a trained mixer to predict the target column given some input data.

        :param ds: encoded representations of input data.
        :param args: a `lightwood.api.types.PredictionArguments` object, including all relevant inference-time arguments to customize the behavior.
        :return: a dataframe with a `prediction` column containing the predicted target values.
        """  # noqa
        raise NotImplementedError()
[docs]    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
        """
        Partially fits/trains a mixer with new training data. This is a somewhat experimental method, and it aims at updating pre-existing Lightwood predictors.

        :param train_data: encoded representations of the new training data subset.
        :param dev_data: encoded representations of the new "dev" data subset. As in `fit()`, this can be used as an internal validation subset.

        """  # noqa
        pass
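For orientation, a custom mixer only needs to implement `fit()` and `__call__()` against this contract. A minimal sketch; the `MeanBaseline` class and its trivial strategy are illustrative, not part of lightwood:

import pandas as pd

from lightwood.mixer.base import BaseMixer
from lightwood.data.encoded_ds import EncodedDs
from lightwood.api.types import PredictionArguments


class MeanBaseline(BaseMixer):
    """Illustrative mixer: always predicts the training-set mean of the target."""

    def __init__(self, stop_after: int, target: str):
        super().__init__(stop_after)
        self.target = target
        self.stable = True

    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
        # "Training" is just memorizing the target mean
        self._mean = train_data.data_frame[self.target].mean()

    def __call__(self, ds: EncodedDs,
                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
        return pd.DataFrame({'prediction': [self._mean] * len(ds)})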
diff --git a/docs/_modules/lightwood/mixer/lightgbm.html b/docs/_modules/lightwood/mixer/lightgbm.html
new file mode 100644
index 000000000..776e044ac
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/lightgbm.html
@@ -0,0 +1,461 @@

    Source code for lightwood.mixer.lightgbm

    +import time
    +from typing import Dict, List, Set
    +
    +import torch
    +import optuna
    +import lightgbm
    +import numpy as np
    +import pandas as pd
    +from sklearn.preprocessing import OrdinalEncoder
    +import optuna.integration.lightgbm as optuna_lightgbm
    +
    +from lightwood.api import dtype
    +from lightwood.helpers.log import log
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.helpers.device import get_devices
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs
    +
    +
    +optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    +
    +
    +def check_gpu_support():
    +    try:
    +        data = np.random.rand(50, 2)
    +        label = np.random.randint(2, size=50)
    +        train_data = lightgbm.Dataset(data, label=label)
    +        params = {'num_iterations': 1, 'device': 'gpu'}
    +        lightgbm.train(params, train_set=train_data)
    +        device, nr_devices = get_devices()
    +        if nr_devices > 0 and str(device) != 'cpu':
    +            return True
    +        else:
    +            return False
    +    except Exception:
    +        return False
    +
    +
    +
[docs]class LightGBM(BaseMixer):
    model: lightgbm.LGBMModel
    ordinal_encoder: OrdinalEncoder
    label_set: Set[str]
    max_bin: int
    device: torch.device
    device_str: str
    num_iterations: int
    use_optuna: bool
    supports_proba: bool

    def __init__(
            self, stop_after: int, target: str, dtype_dict: Dict[str, str],
            input_cols: List[str],
            fit_on_dev: bool, use_optuna: bool = True):
        super().__init__(stop_after)
        self.model = None
        self.ordinal_encoder = None
        self.positive_domain = False
        self.label_set = set()
        self.target = target
        self.dtype_dict = dtype_dict
        self.input_cols = input_cols
        self.use_optuna = use_optuna
        self.params = {}
        self.fit_on_dev = fit_on_dev
        self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical]
        self.stable = True

        # GPU only available via --install-option=--gpu with opencl-dev and libboost-dev (a bunch of them) installed, so let's turn this off for now and we can put it behind some flag later  # noqa
        gpu_works = check_gpu_support()
        if not gpu_works:
            self.device = torch.device('cpu')
            self.device_str = 'cpu'
            log.warning('LightGBM running on CPU; this is somewhat slower than the GPU version, consider using a GPU instead')  # noqa
        else:
            self.device = torch.device('cuda')
            self.device_str = 'gpu'

        self.max_bin = 255

    def _to_dataset(self, data, output_dtype):
        for subset_name in data.keys():
            for input_col in self.input_cols:
                if data[subset_name]['data'] is None:
                    data[subset_name]['data'] = data[subset_name]['ds'].get_encoded_column_data(
                        input_col).to(self.device)
                else:
                    enc_col = data[subset_name]['ds'].get_encoded_column_data(input_col)
                    data[subset_name]['data'] = torch.cat((data[subset_name]['data'], enc_col.to(self.device)), 1)

            data[subset_name]['data'] = data[subset_name]['data'].numpy()

            label_data = data[subset_name]['ds'].get_column_original_data(self.target)

            if output_dtype in (dtype.categorical, dtype.binary):
                if subset_name == 'train':
                    self.ordinal_encoder = OrdinalEncoder()
                    self.label_set = set(label_data)
                    self.label_set.add('__mdb_unknown_cat')
                    self.ordinal_encoder.fit(np.array(list(self.label_set)).reshape(-1, 1))

                label_data = [x if x in self.label_set else '__mdb_unknown_cat' for x in label_data]
                label_data = self.ordinal_encoder.transform(np.array(label_data).reshape(-1, 1)).flatten()
            elif output_dtype == dtype.integer:
                label_data = label_data.clip(-pow(2, 63), pow(2, 63)).astype(int)
            elif output_dtype in (dtype.float, dtype.quantity):
                label_data = label_data.astype(float)

            data[subset_name]['label_data'] = label_data

        return data
[docs]    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
        log.info('Started fitting LGBM model')
        data = {
            'train': {'ds': train_data, 'data': None, 'label_data': {}},
            'dev': {'ds': dev_data, 'data': None, 'label_data': {}}
        }
        self.fit_data_len = len(data['train']['ds'])
        self.positive_domain = getattr(train_data.encoders.get(self.target, None), 'positive_domain', False)

        output_dtype = self.dtype_dict[self.target]

        data = self._to_dataset(data, output_dtype)

        if output_dtype not in (dtype.categorical, dtype.integer, dtype.float, dtype.binary, dtype.quantity):
            log.error(f'Lightgbm mixer not supported for type: {output_dtype}')
            raise Exception(f'Lightgbm mixer not supported for type: {output_dtype}')
        else:
            objective = 'regression' if output_dtype in (dtype.integer, dtype.float, dtype.quantity) else 'multiclass'
            metric = 'l2' if output_dtype in (dtype.integer, dtype.float, dtype.quantity) else 'multi_logloss'

        self.params = {
            'objective': objective,
            'metric': metric,
            'verbose': -1,
            'lambda_l1': 0.1,
            'lambda_l2': 0.1,
            'force_row_wise': True,
            'device_type': self.device_str,
        }

        if objective == 'multiclass':
            self.all_classes = self.ordinal_encoder.categories_[0]
            self.params['num_class'] = self.all_classes.size
        if self.device_str == 'gpu':
            self.params['gpu_use_dp'] = True

        # Determine time taken by a single iteration
        start = time.time()
        self.params['num_iterations'] = 1
        self.model = lightgbm.train(self.params, lightgbm.Dataset(
            data['train']['data'],
            label=data['train']['label_data']),
            verbose_eval=False)
        end = time.time()
        seconds_for_one_iteration = max(0.1, end - start)

        # Determine nr of iterations
        log.info(f'A single GBM iteration takes {seconds_for_one_iteration} seconds')
        self.num_iterations = int(self.stop_after * 0.8 / seconds_for_one_iteration)

        # Turn on grid search if training with it wouldn't take too long
        kwargs = {}
        if self.use_optuna and self.num_iterations >= 200:
            model_generator = optuna_lightgbm
            kwargs['time_budget'] = self.stop_after * 0.4
            self.num_iterations = int(self.num_iterations / 2)
            kwargs['optuna_seed'] = 0
        else:
            model_generator = lightgbm

        # Prepare the data
        train_dataset = lightgbm.Dataset(data['train']['data'], label=data['train']['label_data'])
        dev_dataset = lightgbm.Dataset(data['dev']['data'], label=data['dev']['label_data'])

        # Train the models
        log.info(
            f'Training GBM ({model_generator}) with {self.num_iterations} iterations given {self.stop_after} seconds constraint')  # noqa
        if self.num_iterations < 1:
            self.num_iterations = 1
        self.params['num_iterations'] = int(self.num_iterations)

        self.params['early_stopping_rounds'] = 5

        self.model = model_generator.train(
            self.params, train_dataset, valid_sets=[dev_dataset, train_dataset],
            valid_names=['dev', 'train'],
            verbose_eval=False, **kwargs)
        self.num_iterations = self.model.best_iteration
        log.info(f'Lightgbm model contains {self.model.num_trees()} weak estimators')

        if self.fit_on_dev:
            self.partial_fit(dev_data, train_data)
    + +
    [docs] def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + pct_of_original = len(train_data) / self.fit_data_len + iterations = max(1, int(self.num_iterations * pct_of_original) / 2) + + data = {'retrain': {'ds': train_data, 'data': None, 'label_data': {}}, 'dev': { + 'ds': dev_data, 'data': None, 'label_data': {}}} + + output_dtype = self.dtype_dict[self.target] + data = self._to_dataset(data, output_dtype) + + train_dataset = lightgbm.Dataset(data['retrain']['data'], label=data['retrain']['label_data']) + dev_dataset = lightgbm.Dataset(data['dev']['data'], label=data['dev']['label_data']) + + log.info(f'Updating lightgbm model with {iterations} iterations') + if iterations < 1: + iterations = 1 + self.params['num_iterations'] = int(iterations) + self.model = lightgbm.train( + self.params, train_dataset, valid_sets=[dev_dataset, train_dataset], + valid_names=['dev', 'retrain'], + verbose_eval=False, init_model=self.model) + log.info(f'Model now has a total of {self.model.num_trees()} weak estimators')
    + + def __call__(self, ds: EncodedDs, + args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: + data = None + for input_col in self.input_cols: + if data is None: + data = ds.get_encoded_column_data(input_col).to(self.device) + else: + data = torch.cat((data, ds.get_encoded_column_data(input_col).to(self.device)), 1) + + data = data.numpy() + raw_predictions = self.model.predict(data) + + if self.ordinal_encoder is not None: + decoded_predictions = self.ordinal_encoder.inverse_transform( + np.argmax(raw_predictions, axis=1).reshape(-1, 1)).flatten() + else: + decoded_predictions = raw_predictions + + if self.positive_domain: + decoded_predictions = [max(0, p) for p in decoded_predictions] + + ydf = pd.DataFrame({'prediction': decoded_predictions}) + + if args.predict_proba and self.ordinal_encoder is not None: + for idx, label in enumerate(self.ordinal_encoder.categories_[0].tolist()): + ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx] + + return ydf
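The iteration budget used in `fit()` above comes from timing a single boosting round and dividing the scaled time budget by it. A rough sketch of just that arithmetic, assuming an illustrative 60-second budget:

import time

stop_after = 60  # illustrative time budget, in seconds

start = time.time()
# ... train a single boosting iteration here, as fit() does above ...
seconds_for_one_iteration = max(0.1, time.time() - start)

# 80% of the budget goes to the main training run
num_iterations = max(1, int(stop_after * 0.8 / seconds_for_one_iteration))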
diff --git a/docs/_modules/lightwood/mixer/lightgbm_array.html b/docs/_modules/lightwood/mixer/lightgbm_array.html
new file mode 100644
index 000000000..fb715581b
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/lightgbm_array.html
@@ -0,0 +1,285 @@

    Source code for lightwood.mixer.lightgbm_array

    +import numpy as np
    +import pandas as pd
    +from typing import Dict, List, Union
    +
    +from lightwood.api import dtype
    +from lightwood.helpers.log import log
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.mixer.lightgbm import LightGBM
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs
    +
    +
    +
    [docs]class LightGBMArray(BaseMixer): + """LightGBM-based model, intended for usage in time series tasks.""" + models: List[LightGBM] + n_ts_predictions: int + submodel_stop_after: float + target: str + supports_proba: bool + + def __init__( + self, stop_after: int, target: str, dtype_dict: Dict[str, str], + input_cols: List[str], + n_ts_predictions: int, fit_on_dev: bool): + super().__init__(stop_after) + self.submodel_stop_after = stop_after / n_ts_predictions + self.target = target + dtype_dict[target] = dtype.float + self.models = [LightGBM(self.submodel_stop_after, target, dtype_dict, input_cols, fit_on_dev, use_optuna=False) + for _ in range(n_ts_predictions)] + self.n_ts_predictions = n_ts_predictions # for time series tasks, how long is the forecast horizon + self.supports_proba = False + self.stable = True + +
    [docs] def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + log.info('Started fitting LGBM models for array prediction') + + for timestep in range(self.n_ts_predictions): + if timestep > 0: + train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] + dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] + + self.models[timestep].fit(train_data, dev_data) # @TODO: this call could be parallelized
    + +
    [docs] def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: + log.info('Updating array of LGBM models...') + + for timestep in range(self.n_ts_predictions): + if timestep > 0: + train_data.data_frame[self.target] = train_data.data_frame[f'{self.target}_timestep_{timestep}'] + dev_data.data_frame[self.target] = dev_data.data_frame[f'{self.target}_timestep_{timestep}'] + + self.models[timestep].partial_fit(train_data, dev_data) # @TODO: this call could be parallelized
    + + def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs], + args: PredictionArguments = PredictionArguments()) -> pd.DataFrame: + if args.predict_proba: + log.warning('This model does not output probability estimates') + + length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds) + ydf = pd.DataFrame(0, # zero-filled + index=np.arange(length), + columns=[f'prediction_{i}' for i in range(self.n_ts_predictions)]) + + for timestep in range(self.n_ts_predictions): + ydf[f'prediction_{timestep}'] = self.models[timestep](ds, args) + + ydf['prediction'] = ydf.values.tolist() + return ydf[['prediction']]
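Each sub-model above is trained against a `{target}_timestep_{i}` column holding the target shifted `i` steps into the future; these columns are produced upstream by lightwood's time series transformations. A toy illustration of the relationship (column and data values are made up):

import pandas as pd

# Toy target column with a 3-step forecast horizon
df = pd.DataFrame({'traffic': [10, 12, 14, 16, 18]})
for i in range(1, 3):
    df[f'traffic_timestep_{i}'] = df['traffic'].shift(-i)
print(df)
# traffic_timestep_1 holds the value one step ahead; traffic_timestep_2, two steps ahead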
diff --git a/docs/_modules/lightwood/mixer/neural.html b/docs/_modules/lightwood/mixer/neural.html
new file mode 100644
index 000000000..bfb6ff25e
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/neural.html
@@ -0,0 +1,563 @@

    Source code for lightwood.mixer.neural

    +import time
    +from copy import deepcopy
    +from typing import Dict, List
    +
    +import torch
    +import numpy as np
    +import pandas as pd
    +from torch import nn
    +import torch_optimizer as ad_optim
    +from sklearn.metrics import r2_score
    +from torch.cuda.amp import GradScaler
    +from torch.utils.data import DataLoader
    +from torch.nn.modules.loss import MSELoss
    +from torch.optim.optimizer import Optimizer
    +
    +from lightwood.api import dtype
    +from lightwood.helpers.log import log
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.helpers.torch import LightwoodAutocast
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.mixer.helpers.ar_net import ArNet
    +from lightwood.mixer.helpers.default_net import DefaultNet
    +from lightwood.api.types import TimeseriesSettings, PredictionArguments
    +from lightwood.mixer.helpers.transform_corss_entropy_loss import TransformCrossEntropyLoss
    +
    +
    +
[docs]class Neural(BaseMixer):
    model: nn.Module
    dtype_dict: dict
    target: str
    epochs_to_best: int
    fit_on_dev: bool
    supports_proba: bool

    def __init__(
            self, stop_after: int, target: str, dtype_dict: Dict[str, str],
            timeseries_settings: TimeseriesSettings, target_encoder: BaseEncoder, net: str, fit_on_dev: bool,
            search_hyperparameters: bool):
        """
        The Neural mixer trains a fully connected dense network from concatenated encoded outputs of each of the features in the dataset to predict the encoded output.

        :param stop_after: How long the total fitting process should take
        :param target: Name of the target column
        :param dtype_dict: Data type dictionary
        :param timeseries_settings: TimeseriesSettings object for time-series tasks, refer to its documentation for available settings.
        :param target_encoder: Reference to the encoder used for the target
        :param net: The network type to use (`DefaultNet` or `ArNet`)
        :param fit_on_dev: If we should fit on the dev dataset
        :param search_hyperparameters: If the network should run a more thorough hyperparameter search (currently disabled)
        """  # noqa
        super().__init__(stop_after)
        self.dtype_dict = dtype_dict
        self.target = target
        self.timeseries_settings = timeseries_settings
        self.target_encoder = target_encoder
        self.epochs_to_best = 0
        self.fit_on_dev = fit_on_dev
        self.net_class = DefaultNet if net == 'DefaultNet' else ArNet
        self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical]
        self.search_hyperparameters = search_hyperparameters
        self.stable = True

    def _final_tuning(self, data):
        if self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity):
            self.model = self.model.eval()
            with torch.no_grad():
                acc_dict = {}
                for decode_log in [True, False]:
                    self.target_encoder.decode_log = decode_log
                    decoded_predictions = []
                    decoded_real_values = []
                    for X, Y in data:
                        X = X.to(self.model.device)
                        Y = Y.to(self.model.device)
                        Yh = self.model(X)

                        Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh
                        Y = torch.unsqueeze(Y, 0) if len(Y.shape) < 2 else Y

                        decoded_predictions.extend(self.target_encoder.decode(Yh))
                        decoded_real_values.extend(self.target_encoder.decode(Y))

                    acc_dict[decode_log] = r2_score(decoded_real_values, decoded_predictions)

            self.target_encoder.decode_log = acc_dict[True] > acc_dict[False]

    def _select_criterion(self) -> torch.nn.Module:
        if self.dtype_dict[self.target] in (dtype.categorical, dtype.binary):
            criterion = TransformCrossEntropyLoss(weight=self.target_encoder.index_weights.to(self.model.device))
        elif self.dtype_dict[self.target] in (dtype.tags,):
            criterion = nn.BCEWithLogitsLoss()
        elif (self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.tsarray, dtype.quantity)
                and self.timeseries_settings.is_timeseries):
            criterion = nn.L1Loss()
        elif self.dtype_dict[self.target] in (dtype.integer, dtype.float, dtype.quantity):
            criterion = MSELoss()
        else:
            criterion = MSELoss()

        return criterion

    def _select_optimizer(self) -> Optimizer:
        # ad_optim.Ranger
        # torch.optim.AdamW
        if self.timeseries_settings.is_timeseries:
            optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr)
        else:
            optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr, weight_decay=2e-2)

        return optimizer

    def _find_lr(self, dl):
        optimizer = self._select_optimizer()
        criterion = self._select_criterion()
        scaler = GradScaler()

        running_losses: List[float] = []
        cum_loss = 0
        lr_log = []
        best_model = self.model
        stop = False
        batches = 0
        for epoch in range(1, 101):
            if stop:
                break

            for i, (X, Y) in enumerate(dl):
                if stop:
                    break

                batches += len(X)
                X = X.to(self.model.device)
                Y = Y.to(self.model.device)
                with LightwoodAutocast():
                    optimizer.zero_grad()
                    Yh = self.model(X)
                    loss = criterion(Yh, Y)
                    if LightwoodAutocast.active:
                        scaler.scale(loss).backward()
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        loss.backward()
                        optimizer.step()
                cum_loss += loss.item()

                # Account for ranger lookahead update
                if (i + 1) * epoch % 6:
                    batches = 0
                    lr = optimizer.param_groups[0]['lr']
                    log.info(f'Loss of {cum_loss} with learning rate {lr}')
                    running_losses.append(cum_loss)
                    lr_log.append(lr)
                    cum_loss = 0
                    if len(running_losses) < 2 or np.mean(running_losses[:-1]) > np.mean(running_losses):
                        optimizer.param_groups[0]['lr'] = lr * 1.4
                        # Time saving since we don't have to start training fresh
                        best_model = deepcopy(self.model)
                    else:
                        stop = True

        best_loss_lr = lr_log[np.argmin(running_losses)]
        lr = best_loss_lr
        log.info(f'Found learning rate of: {lr}')
        return lr, best_model

    def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after):
        started = time.time()
        epochs_to_best = 0
        best_dev_error = pow(2, 32)
        running_errors = []
        best_model = self.model

        for epoch in range(1, return_model_after + 1):
            self.model = self.model.train()
            running_losses: List[float] = []
            for i, (X, Y) in enumerate(train_dl):
                X = X.to(self.model.device)
                Y = Y.to(self.model.device)
                with LightwoodAutocast():
                    optimizer.zero_grad()
                    Yh = self.model(X)
                    loss = criterion(Yh, Y)
                    if LightwoodAutocast.active:
                        scaler.scale(loss).backward()
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        loss.backward()
                        optimizer.step()

                running_losses.append(loss.item())

            train_error = np.mean(running_losses)
            epoch_error = self._error(dev_dl, criterion)
            running_errors.append(epoch_error)
            log.info(f'Loss @ epoch {epoch}: {epoch_error}')

            if np.isnan(train_error) or np.isnan(
                    running_errors[-1]) or np.isinf(train_error) or np.isinf(
                    running_errors[-1]):
                break

            if best_dev_error > running_errors[-1]:
                best_dev_error = running_errors[-1]
                best_model = deepcopy(self.model)
                epochs_to_best = epoch

            if len(running_errors) >= 5:
                delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)],
                                        weights=[(1 / 2)**i for i in range(1, 5)])
                if delta_mean <= 0:
                    break
            elif (time.time() - started) > stop_after:
                break
            elif running_errors[-1] < 0.0001 or train_error < 0.0001:
                break

        if np.isnan(best_dev_error):
            best_dev_error = pow(2, 32)
        return best_model, epochs_to_best, best_dev_error

    def _error(self, dev_dl, criterion) -> float:
        self.model = self.model.eval()
        running_losses: List[float] = []
        with torch.no_grad():
            for X, Y in dev_dl:
                X = X.to(self.model.device)
                Y = Y.to(self.model.device)
                Yh = self.model(X)
                running_losses.append(criterion(Yh, Y).item())
            return np.mean(running_losses)

    def _init_net(self, ds: EncodedDs):
        net_kwargs = {'input_size': len(ds[0][0]),
                      'output_size': len(ds[0][1]),
                      'num_hidden': self.num_hidden,
                      'dropout': 0}

        if self.net_class == ArNet:
            net_kwargs['encoder_span'] = ds.encoder_spans
            net_kwargs['target_name'] = self.target

        self.model = self.net_class(**net_kwargs)

    # @TODO: Compare partial fitting fully on and fully off on the benchmarks!
+ # @TODO: Writeup on the methodology for partial fitting +
+    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        """
+        Fits the Neural mixer on some data, making it ready to predict
+
+        :param train_data: The EncodedDs on which to train the network
+        :param dev_data: Data used for early stopping and hyperparameter determination
+        """
+        # ConcatedEncodedDs
+        self.batch_size = min(200, int(len(train_data) / 10))
+        self.batch_size = max(40, self.batch_size)
+
+        dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False)
+        train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False)
+
+        self.lr = 1e-4
+        self.num_hidden = 1
+
+        # Find learning rate
+        # keep the weights
+        self._init_net(train_data)
+        self.lr, self.model = self._find_lr(train_dl)
+
+        # Keep on training
+        optimizer = self._select_optimizer()
+        criterion = self._select_criterion()
+        scaler = GradScaler()
+
+        self.model, epoch_to_best_model, err = self._max_fit(
+            train_dl, dev_dl, criterion, optimizer, scaler, self.stop_after, return_model_after=20000)
+
+        self.epochs_to_best += epoch_to_best_model
+
+        if self.fit_on_dev:
+            self.partial_fit(dev_data, train_data)
+        self._final_tuning(dev_data)
+    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        """
+        Augments the mixer's fit with new data; the number of epochs is based on how many epochs the original fitting took
+
+        :param train_data: The EncodedDs on which to train the network
+        :param dev_data: Data used for early stopping and hyperparameter determination
+        """
+
+        # Base this on how long the initial training loop took; use a low learning rate so as not to muck anything up too badly  # noqa
+        train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
+        dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True)
+        optimizer = self._select_optimizer()
+        criterion = self._select_criterion()
+        scaler = GradScaler()
+
+        self.model, _, _ = self._max_fit(train_dl, dev_dl, criterion, optimizer, scaler,
+                                         self.stop_after, max(1, int(self.epochs_to_best / 3)))
+    def __call__(self, ds: EncodedDs,
+                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
+        """
+        Make predictions based on a datasource similar to the one used to fit (sans the target column)
+
+        :param ds: The EncodedDs for which to generate the predictions
+        :param args: Arguments for predicting
+
+        :returns: A dataframe containing the decoded predictions and (depending on the args) additional information such as the probabilities for each target class
+        """  # noqa
+        self.model = self.model.eval()
+        decoded_predictions: List[object] = []
+        all_probs: List[List[float]] = []
+        rev_map = {}
+
+        with torch.no_grad():
+            for idx, (X, Y) in enumerate(ds):
+                X = X.to(self.model.device)
+                Yh = self.model(X)
+                Yh = torch.unsqueeze(Yh, 0) if len(Yh.shape) < 2 else Yh
+
+                kwargs = {}
+                for dep in self.target_encoder.dependencies:
+                    kwargs['dependency_data'] = {dep: ds.data_frame.iloc[idx][[dep]].values}
+
+                if args.predict_proba and self.supports_proba:
+                    kwargs['return_raw'] = True
+                    decoded_prediction, probs, rev_map = self.target_encoder.decode(Yh, **kwargs)
+                    all_probs.append(probs)
+                else:
+                    decoded_prediction = self.target_encoder.decode(Yh, **kwargs)
+
+                if not self.timeseries_settings.is_timeseries or self.timeseries_settings.nr_predictions == 1:
+                    decoded_predictions.extend(decoded_prediction)
+                else:
+                    decoded_predictions.append(decoded_prediction)
+
+        ydf = pd.DataFrame({'prediction': decoded_predictions})
+
+        if args.predict_proba and self.supports_proba:
+            raw_predictions = np.array(all_probs).squeeze()
+            for idx, label in enumerate(rev_map.values()):
+                ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx]
+
+        return ydf
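For orientation, a minimal usage sketch. Rather than constructing a Neural mixer by hand (which requires pipeline objects such as the dtype dictionary and a prepared target encoder), the high-level API below builds and trains one as part of a full predictor; the dataset URL is the one used in the update tutorial later in this document, and it is assumed that the default JSON-AI generated for this problem includes this mixer.

import pandas as pd
from lightwood.api.high_level import ProblemDefinition, predictor_from_problem

# Concrete-strength dataset, as used in the update tutorial below
df = pd.read_csv('https://raw.githubusercontent.com/mindsdb/lightwood/staging/tests/data/concrete_strength.csv')

# Build a predictor (whose default mixer set is assumed to include Neural), then train and predict
predictor = predictor_from_problem(df, ProblemDefinition.from_dict({'target': 'concrete_strength'}))
predictor.learn(df)
print(predictor.predict(df.head()))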
\ No newline at end of file
diff --git a/docs/_modules/lightwood/mixer/regression.html b/docs/_modules/lightwood/mixer/regression.html
new file mode 100644
index 000000000..9871b21e3
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/regression.html
@@ -0,0 +1,279 @@
+lightwood.mixer.regression — lightwood 1.6.1 documentation
    Source code for lightwood.mixer.regression

    +import torch
    +import pandas as pd
    +from scipy.special import softmax
    +from sklearn.linear_model import LinearRegression
    +
    +from lightwood.helpers.log import log
    +from lightwood.api.dtype import dtype
    +from lightwood.mixer import BaseMixer
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.api.types import PredictionArguments
    +from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs
    +
    +
    +
+class Regression(BaseMixer):
+    model: LinearRegression
+    label_map: dict
+    supports_proba: bool
+
+    def __init__(self, stop_after: int, target_encoder: BaseEncoder, dtype_dict: dict, target: str):
+        super().__init__(stop_after)
+        self.target_encoder = target_encoder
+        self.target_dtype = dtype_dict[target]
+        self.supports_proba = self.target_dtype in [dtype.binary, dtype.categorical]
+        self.label_map = {}
+        self.stable = False
+    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        if self.target_dtype not in (dtype.float, dtype.integer, dtype.quantity):
+            raise Exception(f'Unsupported {self.target_dtype} type for regression')
+        log.info('Fitting Linear Regression model')
+        X = []
+        Y = []
+        for x, y in ConcatedEncodedDs([train_data, dev_data]):
+            X.append(x.tolist())
+            Y.append(y.tolist())
+
+        if self.supports_proba:
+            self.label_map = self.target_encoder.rev_map
+
+        self.model = LinearRegression().fit(X, Y)
+        log.info(f'Regression based correlation of: {self.model.score(X, Y)}')
+    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        self.fit(train_data, dev_data)
+    def __call__(self, ds: EncodedDs,
+                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
+        X = []
+        for x, _ in ds:
+            X.append(x.tolist())
+
+        Yh = self.model.predict(X)
+
+        decoded_predictions = self.target_encoder.decode(torch.Tensor(Yh))
+
+        ydf = pd.DataFrame({'prediction': decoded_predictions})
+
+        if args.predict_proba and self.label_map:
+            raw_predictions = softmax(Yh.squeeze(), axis=1)
+            for idx, label in enumerate(self.target_encoder.rev_map.values()):
+                ydf[f'__mdb_proba_{label}'] = raw_predictions[:, idx]
+
+        return ydf
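A standalone sketch of the idea behind this mixer, using synthetic stand-ins for the encoded data: a linear model is fit on the concatenated encoded feature vectors, and the "regression based correlation" logged above is simply sklearn's R^2 score.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.random((100, 8))              # stand-in for encoded feature vectors
y = X @ rng.random(8) + 0.1           # stand-in for the encoded target
model = LinearRegression().fit(X, y)
print(model.score(X, y))              # R^2, i.e. the "correlation" logged above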
\ No newline at end of file
diff --git a/docs/_modules/lightwood/mixer/sktime.html b/docs/_modules/lightwood/mixer/sktime.html
new file mode 100644
index 000000000..cfabba4ed
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/sktime.html
@@ -0,0 +1,342 @@
+lightwood.mixer.sktime — lightwood 1.6.1 documentation
    Source code for lightwood.mixer.sktime

    +import numpy as np
    +import pandas as pd
    +from typing import Dict, Union
    +from sktime.forecasting.arima import AutoARIMA
    +
    +from lightwood.api import dtype
    +from lightwood.helpers.log import log
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.api.types import PredictionArguments
    +from lightwood.helpers.general import get_group_matches
    +from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs
    +
    +
    +
+class SkTime(BaseMixer):
+    forecaster: str
+    n_ts_predictions: int
+    target: str
+    supports_proba: bool
+
+    def __init__(
+            self, stop_after: int, target: str, dtype_dict: Dict[str, str],
+            n_ts_predictions: int, ts_analysis: Dict):
+        super().__init__(stop_after)
+        self.target = target
+        dtype_dict[target] = dtype.float
+        self.model_class = AutoARIMA
+        self.models = {}
+        self.n_ts_predictions = n_ts_predictions
+        self.ts_analysis = ts_analysis
+        self.forecasting_horizon = np.arange(1, self.n_ts_predictions)
+        self.cutoff_index = {}  # marks index at which training data stops and forecasting window starts
+        self.grouped_by = ['__default'] if not ts_analysis['tss'].group_by else ts_analysis['tss'].group_by
+        self.supports_proba = False
+        self.stable = True
+        self.prepared = False
+    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        log.info('Started fitting sktime forecaster for array prediction')
+
+        all_subsets = ConcatedEncodedDs([train_data, dev_data])
+        df = all_subsets.data_frame.sort_values(by=f'__mdb_original_{self.ts_analysis["tss"].order_by[0]}')
+        data = {'data': df[self.target],
+                'group_info': {gcol: df[gcol].tolist()
+                               for gcol in self.grouped_by} if self.ts_analysis['tss'].group_by else {}}
+
+        for group in self.ts_analysis['group_combinations']:
+            # many warnings might be thrown inside of statsmodels during stepwise procedure
+            self.models[group] = self.model_class(suppress_warnings=True)
+
+            if self.grouped_by == ['__default']:
+                series_idxs = data['data'].index
+                series_data = data['data'].values
+            else:
+                series_idxs, series_data = get_group_matches(data, group)
+
+            if series_data.size > 0:
+                series = pd.Series(series_data.squeeze(), index=series_idxs)
+                series = series.sort_index(ascending=True)
+                series = series.reset_index(drop=True)
+                try:
+                    self.models[group].fit(series)
+                except ValueError:
+                    self.models[group] = self.model_class(deseasonalize=False)
+                    self.models[group].fit(series)
+
+                self.cutoff_index[group] = len(series)
+
+            if self.grouped_by == ['__default']:
+                break
+    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        """
+        Note: sktime asks for "specification of the time points for which forecasts are requested",
+        and this mixer complies by assuming forecasts will start immediately after the last observed
+        value.
+
+        Because of this, `partial_fit` ensures that both `dev` and `test` splits are used to fit the AutoARIMA model.
+
+        Due to how lightwood implements the `update` procedure, expected inputs are (for a train-dev-test split):
+
+        :param dev_data: original `test` split (used to validate and select model if ensemble is `BestOf`)
+        :param train_data: includes original `train` and `dev` split
+        """  # noqa
+        self.fit(dev_data, train_data)
+        self.prepared = True
+    def __call__(self, ds: Union[EncodedDs, ConcatedEncodedDs],
+                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
+        if args.predict_proba:
+            log.warning('This mixer does not output probability estimates')
+
+        length = sum(ds.encoded_ds_lenghts) if isinstance(ds, ConcatedEncodedDs) else len(ds)
+        ydf = pd.DataFrame(0,  # zero-filled
+                           index=np.arange(length),
+                           columns=['prediction'],
+                           dtype=object)
+
+        data = {'data': ds.data_frame[self.target].reset_index(drop=True),
+                'group_info': {gcol: ds.data_frame[gcol].tolist()
+                               for gcol in self.grouped_by} if self.ts_analysis['tss'].group_by else {}}
+
+        # all_idxs = list(range(length))  # @TODO: subtract, and assign empty predictions to remainder
+
+        for group in self.ts_analysis['group_combinations']:
+
+            if self.grouped_by == ['__default']:
+                series_idxs = data['data'].index
+                series_data = data['data'].values
+            else:
+                series_idxs, series_data = get_group_matches(data, group)
+
+            if series_data.size > 0:
+                forecaster = self.models[group] if self.models[group].is_fitted else self.models['__default']
+
+                series = pd.Series(series_data.squeeze(), index=series_idxs)
+                series = series.sort_index(ascending=True)
+                series = series.reset_index(drop=True)
+
+                for idx, _ in enumerate(series.iteritems()):
+                    ydf['prediction'].iloc[series_idxs[idx]] = forecaster.predict(
+                        np.arange(idx, idx + self.n_ts_predictions)).tolist()
+
+            if self.grouped_by == ['__default']:
+                break
+
+        return ydf[['prediction']]
    +
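A standalone sketch of the underlying sktime call pattern, on a synthetic series; as in `__call__` above, the forecasting horizon is passed as an array of future steps.

import numpy as np
import pandas as pd
from sktime.forecasting.arima import AutoARIMA

series = pd.Series(np.sin(np.arange(100) / 5))   # hypothetical ordered target series
forecaster = AutoARIMA(suppress_warnings=True)   # warnings suppressed, as in fit() above
forecaster.fit(series)
print(forecaster.predict(np.arange(1, 6)))       # forecast the next 5 steps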
\ No newline at end of file
diff --git a/docs/_modules/lightwood/mixer/unit.html b/docs/_modules/lightwood/mixer/unit.html
new file mode 100644
index 000000000..76071fc93
--- /dev/null
+++ b/docs/_modules/lightwood/mixer/unit.html
@@ -0,0 +1,262 @@
+lightwood.mixer.unit — lightwood 1.6.1 documentation
    Source code for lightwood.mixer.unit

    +"""
    +2021.07.16
    +
+For encoders that already fine-tune on the targets (namely text),
+the Unit mixer just arg-maxes the output of the encoder.
    +"""
    +
    +from typing import List
    +
    +import torch
    +import pandas as pd
    +
    +from lightwood.helpers.log import log
    +from lightwood.mixer.base import BaseMixer
    +from lightwood.encoder.base import BaseEncoder
    +from lightwood.data.encoded_ds import EncodedDs
    +from lightwood.api.types import PredictionArguments
    +
    +
    +
+class Unit(BaseMixer):
+    def __init__(self, stop_after: int, target_encoder: BaseEncoder):
+        super().__init__(stop_after)
+        self.target_encoder = target_encoder
+        self.supports_proba = False
+        self.stable = True
+    def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        log.info("Unit Mixer just borrows from encoder")
+    def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
+        pass
+    def __call__(self, ds: EncodedDs,
+                 args: PredictionArguments = PredictionArguments()) -> pd.DataFrame:
+        if args.predict_proba:
+            # @TODO: depending on the target encoder, this might be enabled
+            log.warning('This model does not output probability estimates')
+
+        decoded_predictions: List[object] = []
+
+        for X, _ in ds:
+            decoded_prediction = self.target_encoder.decode(torch.unsqueeze(X, 0))
+            decoded_predictions.extend(decoded_prediction)
+
+        ydf = pd.DataFrame({"prediction": decoded_predictions})
+        return ydf
    +
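A self-contained sketch of what "borrowing from the encoder" means here; `DummyTargetEncoder` is an illustrative stand-in for a target-aware encoder (e.g. a fine-tuned text encoder), not a lightwood class.

import torch

class DummyTargetEncoder:
    # Illustrative stand-in: decode() arg-maxes over encoded class scores
    classes = ['negative', 'neutral', 'positive']

    def decode(self, encoded):
        return [self.classes[int(torch.argmax(row))] for row in encoded]

enc = DummyTargetEncoder()
print(enc.decode(torch.unsqueeze(torch.tensor([0.1, 0.7, 0.2]), 0)))  # ['neutral']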
\ No newline at end of file
diff --git a/docs/_sources/analysis.rst.txt b/docs/_sources/analysis.rst.txt
new file mode 100644
index 000000000..2885844f5
--- /dev/null
+++ b/docs/_sources/analysis.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Analysis`
+==========================
+
+Analyse mixer ensembles to extract static insights and train predict-time models for dynamic insights.
+
+.. automodule:: analysis
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/api.rst.txt b/docs/_sources/api.rst.txt
index fd5df71d1..f31245f63 100644
--- a/docs/_sources/api.rst.txt
+++ b/docs/_sources/api.rst.txt
@@ -1,15 +1,15 @@
-:mod:`API Module`
+:mod:`API`
 ==========================
-The Lightwood API Table of Contents
---------------------------------------
 The API module is how Lightwood interfaces with the user.
 
 .. toctree::
    :maxdepth: 1
    :caption: Table of Contents:
 
+   api/high_level
    api/dtype
    api/types
    api/predictor
    api/json_ai
+   api/encode
\ No newline at end of file
diff --git a/docs/_sources/data.rst.txt b/docs/_sources/data.rst.txt
new file mode 100644
index 000000000..3ea9748d2
--- /dev/null
+++ b/docs/_sources/data.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Data`
+==========================
+
+The focus of these modules is on storing, transforming, cleaning, splitting, merging, getting and removing data.
+
+.. automodule:: data
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/data/cleaner.rst.txt b/docs/_sources/data/cleaner.rst.txt
new file mode 100644
index 000000000..09efa5de3
--- /dev/null
+++ b/docs/_sources/data/cleaner.rst.txt
@@ -0,0 +1,5 @@
+Data Cleaning
+--------------------
+
+.. automodule:: data.cleaner
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/encoder.rst.txt b/docs/_sources/encoder.rst.txt
new file mode 100644
index 000000000..839e7f770
--- /dev/null
+++ b/docs/_sources/encoder.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Encoders`
+==========================
+
+Used for encoding data into PyTorch tensors and decoding it from PyTorch tensors
+
+.. automodule:: encoder
+   :members:
diff --git a/docs/_sources/ensemble.rst.txt b/docs/_sources/ensemble.rst.txt
new file mode 100644
index 000000000..82c01f068
--- /dev/null
+++ b/docs/_sources/ensemble.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Ensemble`
+==========================
+
+Ensemble mixers together in order to generate predictions
+
+.. automodule:: ensemble
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/helpers.rst.txt b/docs/_sources/helpers.rst.txt
new file mode 100644
index 000000000..959ebb231
--- /dev/null
+++ b/docs/_sources/helpers.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Helpers`
+==========================
+
+Various helper functions
+
+.. automodule:: helpers
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
index 68b1df110..e6ac21efb 100644
--- a/docs/_sources/index.rst.txt
+++ b/docs/_sources/index.rst.txt
@@ -268,4 +268,10 @@ Other Links
 
    lightwood_philosophy
    tutorials
-   api
\ No newline at end of file
+   api
+   data
+   encoder
+   mixer
+   ensemble
+   analysis
+   helpers
\ No newline at end of file
diff --git a/docs/_sources/mixer.rst.txt b/docs/_sources/mixer.rst.txt
new file mode 100644
index 000000000..b1c56f273
--- /dev/null
+++ b/docs/_sources/mixer.rst.txt
@@ -0,0 +1,7 @@
+:mod:`Mixers`
+==========================
+
+Machine learning models which learn to predict the target value using the encoded representations.
+
+.. automodule:: mixer
+   :members:
\ No newline at end of file
diff --git a/docs/_sources/tutorials/tutorial_update_models/Tutorial -- Update a predictor.ipynb.txt b/docs/_sources/tutorials/tutorial_update_models/Tutorial -- Update a predictor.ipynb.txt
new file mode 100644
index 000000000..fcb2a4397
--- /dev/null
+++ b/docs/_sources/tutorials/tutorial_update_models/Tutorial -- Update a predictor.ipynb.txt
@@ -0,0 +1,703 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Introduction\n",
+    "\n",
+    "In this tutorial, we will go through an example to update a preexisting model. This might be useful when you come across additional data that you would want to consider, without having to train a model from scratch.\n",
+    "\n",
+    "The main abstraction that Lightwood offers for this is the `BaseMixer.partial_fit()` method. To call it, you need to pass new training data and a held-out dev subset for internal mixer usage (e.g. early stopping). If you are using an aggregate ensemble, it's likely you will want to do this for every single mixer. The convenient `PredictorInterface.adjust()` does this automatically for you.\n",
+    "\n",
+    "\n",
+    "# Initial model training\n",
+    "\n",
+    "First, let's train a Lightwood predictor for the `concrete strength` dataset:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lightwood.api.high_level import ProblemDefinition, json_ai_from_problem, predictor_from_json_ai\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train dataframe shape: (206, 10)\n",
+      "Update dataframe shape: (618, 10)\n",
+      "Test dataframe shape: (206, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load data\n",
+    "df = pd.read_csv('https://raw.githubusercontent.com/mindsdb/lightwood/staging/tests/data/concrete_strength.csv')\n",
+    "\n",
+    "df = df.sample(frac=1, random_state=1)\n",
+    "train_df = df[:int(0.2*len(df))]\n",
+    "update_df = df[int(0.2*len(df)):int(0.8*len(df))]\n",
+    "test_df = df[int(0.8*len(df)):]\n",
+    "\n",
+    "print(f'Train dataframe shape: {train_df.shape}')\n",
+    "print(f'Update dataframe shape: {update_df.shape}')\n",
+    "print(f'Test dataframe shape: {test_df.shape}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that we have three different data splits.\n",
+    "\n",
+    "We will use the `training` split for the initial model training. As you can see, it's only 20% of the total data we have. The `update` split will be used as training data to adjust/update our model. Finally, the held out `test` set will give us a rough idea of the impact our updating procedure has on the model's predictive capabilities."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32mINFO:lightwood-91181:Dropping features: []\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Analyzing a sample of 979\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:from a total population of 1030, this is equivalent to 95.0% of your data.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Using 15 processes to deduct types.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Starting statistical analysis\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Finished statistical analysis\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Unable to import black formatter, predictor code might be a bit ugly.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Dropping features: []\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Performing statistical analysis on data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Starting statistical analysis\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Finished statistical analysis\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Cleaning the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Splitting the data into train/test\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Preparing the encoders\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 1\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 2\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 3\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 4\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 5\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 6\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 7\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 8\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 9\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Encoder prepping dict length of: 10\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: concrete_strength\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: id\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: cement\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: slag\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: flyAsh\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: water\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: superPlasticizer\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: coarseAggregate\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: fineAggregate\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Done running for: age\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Featurizing the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Training the mixers\u001b[0m\n", + "torch.cuda.amp.GradScaler is enabled, but CUDA is not available. 
Disabling.\n", + "This overload of addcmul_ is deprecated:\n", + "\taddcmul_(Number value, Tensor tensor1, Tensor tensor2)\n", + "Consider using one of the following signatures instead:\n", + "\taddcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1005.)\n", + "\u001b[32mINFO:lightwood-91181:Loss of 7.69654655456543 with learning rate 0.0001\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 6.121406078338623 with learning rate 0.00014\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 5.7169036865234375 with learning rate 0.00019599999999999997\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 4.907417297363281 with learning rate 0.00027439999999999995\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 3.7602126598358154 with learning rate 0.0003841599999999999\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 1.8155415058135986 with learning rate 0.0005378239999999999\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 3.7833187580108643 with learning rate 0.0007529535999999998\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Loss of 8.216030836105347 with learning rate 0.0010541350399999995\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Found learning rate of: 0.0005378239999999999\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 1: 0.7302289009094238\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 2: 0.9203720092773438\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 3: 0.8405624628067017\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 4: 0.7608699202537537\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 5: 0.6823285222053528\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 6: 0.606808602809906\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 7: 0.4470987617969513\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 8: 0.3933545649051666\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 9: 0.3497759997844696\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 10: 0.3151411712169647\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 11: 0.2879962623119354\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 12: 0.2667108178138733\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 13: 0.23354031145572662\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 14: 0.21926474571228027\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 15: 0.20496906340122223\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 16: 0.19059491157531738\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 17: 0.17612512409687042\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 18: 0.161383256316185\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 19: 0.12839828431606293\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 20: 0.1162123903632164\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 21: 0.10669219493865967\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 22: 0.09954904764890671\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 23: 0.09420691430568695\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 24: 0.0900391936302185\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 25: 0.08349908888339996\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 26: 0.0822099968791008\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 27: 
0.08120812475681305\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 28: 0.0804857686161995\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 29: 0.07996372133493423\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 30: 0.07936403155326843\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 31: 0.07869081199169159\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 32: 0.07849359512329102\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 33: 0.07820077985525131\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 34: 0.07790301740169525\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 35: 0.07746117562055588\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 36: 0.0766073539853096\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 37: 0.07440945506095886\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 38: 0.07304742932319641\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 39: 0.07175709307193756\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 40: 0.0706694945693016\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 41: 0.06960804760456085\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 42: 0.0683063194155693\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 43: 0.06553898006677628\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 44: 0.06447519361972809\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 45: 0.06355087459087372\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 46: 0.06285689026117325\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 47: 0.0621829479932785\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 48: 0.06127836927771568\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 49: 0.05949181318283081\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 50: 0.058798886835575104\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 51: 0.058218929916620255\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 52: 0.057854749262332916\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 53: 0.05746406316757202\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 54: 0.056835610419511795\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 55: 0.05569766089320183\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 56: 0.05525219812989235\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 57: 0.05490746721625328\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 58: 0.054767243564128876\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 59: 0.05455196276307106\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 60: 0.0540977418422699\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 61: 0.05336076393723488\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 62: 0.053060129284858704\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 63: 0.05285469442605972\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 64: 0.0528554692864418\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 65: 0.05273965373635292\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 66: 0.05239948257803917\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 67: 0.05194811150431633\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 68: 0.05178629234433174\u001b[0m\n", + 
"\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 69: 0.05171119421720505\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 70: 0.05184203386306763\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 71: 0.05181184783577919\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 72: 0.05157444253563881\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 73: 0.05137106031179428\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 74: 0.05131785199046135\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 75: 0.05133713781833649\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 76: 0.05156172439455986\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Ensembling the mixer\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Mixer: Neural got accuracy: 0.5960601553597429\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Picked best mixer: Neural\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Analyzing the ensemble of mixers\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block ICP is now running its analyze() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block AccStats is now running its analyze() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block GlobalFeatureImportance is now running its analyze() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Adjustment on validation requested.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Updating the mixers\u001b[0m\n", + "torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 1: 0.06892643496394157\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 2: 0.06978078782558442\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 3: 0.06783530339598656\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 4: 0.07201590612530709\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 5: 0.0718848429620266\u001b[0m\n" + ] + } + ], + "source": [ + "# Define predictive task and predictor\n", + "target = 'concrete_strength'\n", + "pdef = ProblemDefinition.from_dict({'target': target, 'time_aim': 200})\n", + "jai = json_ai_from_problem(df, pdef)\n", + "\n", + "# We will keep the architecture simple: a single neural mixer, and a `BestOf` ensemble:\n", + "jai.outputs[target].mixers = [{\n", + " \"module\": \"Neural\",\n", + " \"args\": {\n", + " \"fit_on_dev\": False,\n", + " \"stop_after\": \"$problem_definition.seconds_per_mixer\",\n", + " \"search_hyperparameters\": False,\n", + " }\n", + "}]\n", + "\n", + "jai.outputs[target].ensemble = {\n", + " \"module\": \"BestOf\",\n", + " \"args\": {\n", + " \"args\": \"$pred_args\",\n", + " \"accuracy_functions\": \"$accuracy_functions\",\n", + " }\n", + "}\n", + "\n", + "# Build and train the predictor\n", + "predictor = predictor_from_json_ai(jai)\n", + "predictor.learn(train_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32mINFO:lightwood-91181:Dropping features: []\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Cleaning the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Featurizing the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block ICP is now running its explain() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block AccStats is now running its explain() method\u001b[0m\n", + 
"\u001b[32mINFO:lightwood-91181:AccStats.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block GlobalFeatureImportance is now running its explain() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:GlobalFeatureImportance.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    predictiontruthconfidencelowerupper
    051.19360371.300.999130.54044371.846764
    128.50339039.600.99917.85022949.156551
    218.35613910.790.99910.00000039.009300
    316.0620944.830.99910.00000036.715254
    432.62362947.710.999111.97046953.276790
    ..................
    20145.63381140.930.999124.98065066.286972
    20241.61320952.820.999120.96004862.266369
    20331.29704439.660.999110.64388351.950204
    20429.40925813.290.99918.75609750.062418
    20537.71213817.840.999117.05897758.365298
    \n", + "

    206 rows × 5 columns

    \n", + "
    " + ], + "text/plain": [ + " prediction truth confidence lower upper\n", + "0 51.193603 71.30 0.9991 30.540443 71.846764\n", + "1 28.503390 39.60 0.9991 7.850229 49.156551\n", + "2 18.356139 10.79 0.9991 0.000000 39.009300\n", + "3 16.062094 4.83 0.9991 0.000000 36.715254\n", + "4 32.623629 47.71 0.9991 11.970469 53.276790\n", + ".. ... ... ... ... ...\n", + "201 45.633811 40.93 0.9991 24.980650 66.286972\n", + "202 41.613209 52.82 0.9991 20.960048 62.266369\n", + "203 31.297044 39.66 0.9991 10.643883 51.950204\n", + "204 29.409258 13.29 0.9991 8.756097 50.062418\n", + "205 37.712138 17.84 0.9991 17.058977 58.365298\n", + "\n", + "[206 rows x 5 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train and get predictions for the held out test set\n", + "predictions = predictor.predict(test_df)\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Updating the predictor\n", + "\n", + "As previously mentioned, you can update any given mixer with a `BaseMixer.partial_fit()` call. If you have multiple mixers and want to update them all at once, you should use `PredictorInterface.adjust()`. \n", + "\n", + "For both of these methods, two encoded datasources are needed as input (for `adjust` you need to wrap them in a dictionary with 'old' and 'new' keys). \n", + "\n", + "Let's `adjust` our predictor:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32mINFO:lightwood-91181:Updating the mixers\u001b[0m\n", + "torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 1: 0.06545061928530534\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 2: 0.0679960281898578\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 3: 0.07171888339022796\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 4: 0.07307156516859929\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 5: 0.06360626469055812\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 6: 0.06457449619968732\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 7: 0.057915804286797844\u001b[0m\n", + "\u001b[37mDEBUG:lightwood-91181:Loss @ epoch 8: 0.06492673171063264\u001b[0m\n" + ] + } + ], + "source": [ + "from lightwood.data import EncodedDs\n", + "\n", + "train_ds = EncodedDs(predictor.encoders, train_df, target)\n", + "update_ds = EncodedDs(predictor.encoders, update_df, target)\n", + "\n", + "predictor.adjust({'old': train_ds, 'new': update_ds})" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32mINFO:lightwood-91181:Dropping features: []\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Cleaning the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:Featurizing the data\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block ICP is now running its explain() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block AccStats is now running its explain() method\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:AccStats.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n", + "\u001b[32mINFO:lightwood-91181:The block GlobalFeatureImportance is now running its explain() method\u001b[0m\n", + 
"\u001b[32mINFO:lightwood-91181:GlobalFeatureImportance.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    predictiontruthconfidencelowerupper
    053.39225371.300.999132.73909374.045414
    127.88629239.600.99917.23313248.539453
    216.30178810.790.99910.00000036.954948
    313.8628274.830.99910.00000034.515988
    431.42103547.710.999110.76787552.074196
    ..................
    20142.63103740.930.999121.97787663.284197
    20237.50244452.820.999116.84928358.155604
    20329.49148739.660.99918.83832650.144647
    20428.01357013.290.99917.36041048.666731
    20535.33604317.840.999114.68288355.989204
    \n", + "

    206 rows × 5 columns

    \n", + "
    " + ], + "text/plain": [ + " prediction truth confidence lower upper\n", + "0 53.392253 71.30 0.9991 32.739093 74.045414\n", + "1 27.886292 39.60 0.9991 7.233132 48.539453\n", + "2 16.301788 10.79 0.9991 0.000000 36.954948\n", + "3 13.862827 4.83 0.9991 0.000000 34.515988\n", + "4 31.421035 47.71 0.9991 10.767875 52.074196\n", + ".. ... ... ... ... ...\n", + "201 42.631037 40.93 0.9991 21.977876 63.284197\n", + "202 37.502444 52.82 0.9991 16.849283 58.155604\n", + "203 29.491487 39.66 0.9991 8.838326 50.144647\n", + "204 28.013570 13.29 0.9991 7.360410 48.666731\n", + "205 35.336043 17.84 0.9991 14.682883 55.989204\n", + "\n", + "[206 rows x 5 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_predictions = predictor.predict(test_df)\n", + "new_predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! Our predictor was updated, and new predictions are looking good. Let's compare the old and new accuracies:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Old Accuracy: 0.583\n", + "New Accuracy: 0.624\n" + ] + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "\n", + "old_acc = r2_score(predictions['truth'], predictions['prediction'])\n", + "new_acc = r2_score(new_predictions['truth'], new_predictions['prediction'])\n", + "\n", + "print(f'Old Accuracy: {round(old_acc, 3)}\\nNew Accuracy: {round(new_acc, 3)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After updating, we see an increase in the R2 score of predictions for the held out test set.\n", + "\n", + "## Conclusion\n", + "\n", + "We have gone through a simple example of how Lightwood predictors can leverage newly acquired data to improve their predictions. The interface for doing so is fairly simple, requiring only some new data and a single call to update.\n", + "\n", + "You can further customize the logic for updating your mixers by modifying the `partial_fit()` methods in them." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mdb", + "language": "python", + "name": "mdb" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js index 50f8de062..713a8d72f 100644 --- a/docs/_static/documentation_options.js +++ b/docs/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '1.6.0', + VERSION: '1.6.1', LANGUAGE: 'None', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/analysis.html b/docs/analysis.html new file mode 100644 index 000000000..c08b70afa --- /dev/null +++ b/docs/analysis.html @@ -0,0 +1,485 @@ + + + + + + + + + + Analysis — lightwood 1.6.1 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Analysis

    +

    Analyse mixer ensembles to extract static insights and train predict-time models for dynamic insights.

    +
    +
    +class analysis.AccStats(deps=('ICP',))[source]
    +

    Computes accuracy stats and a confusion matrix for the validation dataset

    +
    +
    +analyze(info, **kwargs)[source]
    +

    This method should be called once during the analysis phase, or not called at all. +It computes any information that the block may either output to the model analysis object, +or use at inference time when .explain() is called (in this case, make sure all needed +objects are added to the runtime analyzer so that .explain() can access them).

    +
    +
Parameters

• info (Dict[str, object]) – Dictionary where any new information or objects are added. The next analysis block will use the output of the previous block as a starting point.

• kwargs – Dictionary with named variables from either the core analysis or the rest of the prediction pipeline.

    +
    +
    Return type
    +

    Dict[str, object]

    +
    +
    +
    + +
    + +
    +
    +class analysis.BaseAnalysisBlock(deps=())[source]
    +

    Class to be inherited by any analysis/explainer block.

    +
    +
    +analyze(info, **kwargs)[source]
    +

    This method should be called once during the analysis phase, or not called at all. +It computes any information that the block may either output to the model analysis object, +or use at inference time when .explain() is called (in this case, make sure all needed +objects are added to the runtime analyzer so that .explain() can access them).

    +
    +
Parameters

• info (Dict[str, object]) – Dictionary where any new information or objects are added. The next analysis block will use the output of the previous block as a starting point.

• kwargs – Dictionary with named variables from either the core analysis or the rest of the prediction pipeline.

    +
    +
    Return type
    +

    Dict[str, object]

    +
    +
    +
    + +
    +
    +explain(row_insights, global_insights, **kwargs)[source]
    +

    This method should be called once during the explaining phase at inference time, or not called at all. +Additional explanations can be at an instance level (row-wise) or global. +For the former, return a data frame with any new insights. For the latter, a dictionary is required.

    +
    +
    Parameters
    +
      +
    • row_insights (DataFrame) – dataframe with previously computed row-level explanations.

    • +
    • global_insights (Dict[str, object]) – dict() with any explanations that concern all predicted instances or the model itself.

    • +
    +
    +
    Return type
    +

    Tuple[DataFrame, Dict[str, object]]

    +
    +
    Returns
    +

      +
    • row_insights: modified input dataframe with any new row insights added here.

    • +
    • global_insights: dict() with any explanations that concern all predicted instances or the model itself.

    • +
    +

    +
    +
    +
    + +
    + +
    +
    +class analysis.GlobalFeatureImportance(disable_column_importance)[source]
    +

    Analysis block that estimates column importance with a variant of the LOCO (leave-one-covariate-out) algorithm.

    +
    +
    Roughly speaking, the procedure:
      +
    • iterates over all input columns

    • +
• if the input column is optional, generate predictions with its values set to None

    • +
    • compare this accuracy with the accuracy obtained using all data

    • +
    • all accuracy differences are passed through a softmax and reported as estimated column importance scores

    • +
    +
    +
    +

    Note that, crucially, this method does not refit the predictor at any point.

    +
    +
    Reference:

    https://compstat-lmu.github.io/iml_methods_limitations/pfi.html
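A hedged, standalone sketch of this LOCO-style procedure; `predict` and `accuracy` are hypothetical callables standing in for a fitted predictor and an accuracy function, and the softmax normalization mirrors the step described above.

import numpy as np
from scipy.special import softmax

def column_importances(predict, accuracy, df, target, optional_cols):
    # baseline accuracy with all columns intact
    base_acc = accuracy(df[target], predict(df))
    drops = []
    for col in optional_cols:
        masked = df.copy()
        masked[col] = None            # leave one covariate out, without refitting
        drops.append(base_acc - accuracy(df[target], predict(masked)))
    # accuracy differences pass through a softmax to become importance scores
    return dict(zip(optional_cols, softmax(np.array(drops))))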

    +
    +
    +
    +
    +analyze(info, **kwargs)[source]
    +

    This method should be called once during the analysis phase, or not called at all. +It computes any information that the block may either output to the model analysis object, +or use at inference time when .explain() is called (in this case, make sure all needed +objects are added to the runtime analyzer so that .explain() can access them).

    +
    +
Parameters

• info (Dict[str, object]) – Dictionary where any new information or objects are added. The next analysis block will use the output of the previous block as a starting point.

• kwargs – Dictionary with named variables from either the core analysis or the rest of the prediction pipeline.

    +
    +
    Return type
    +

    Dict[str, object]

    +
    +
    +
    + +
    + +
    +
    +class analysis.ICP(fixed_significance, positive_domain, confidence_normalizer)[source]
    +

Confidence estimation block; uses inductive conformal predictors (ICPs) to remain model-agnostic
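For intuition, a minimal sketch of the inductive (split) conformal idea this block builds on, outside lightwood and ignoring finite-sample corrections: absolute residuals on a held-out calibration set yield a distribution-free interval at confidence 1 - alpha.

import numpy as np

def icp_interval(point_pred, calib_true, calib_pred, alpha=0.1):
    # nonconformity scores: absolute residuals on a held-out calibration set
    scores = np.abs(np.asarray(calib_true) - np.asarray(calib_pred))
    q = np.quantile(scores, 1 - alpha)   # calibration quantile
    return point_pred - q, point_pred + q

print(icp_interval(10.0, [9, 11, 10, 12], [10, 10, 10, 10]))  # -> (8.3, 11.7)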

    +
    +
    +analyze(info, **kwargs)[source]
    +

    This method should be called once during the analysis phase, or not called at all. +It computes any information that the block may either output to the model analysis object, +or use at inference time when .explain() is called (in this case, make sure all needed +objects are added to the runtime analyzer so that .explain() can access them).

    +
    +
Parameters

• info (Dict[str, object]) – Dictionary where any new information or objects are added. The next analysis block will use the output of the previous block as a starting point.

• kwargs – Dictionary with named variables from either the core analysis or the rest of the prediction pipeline.

    +
    +
    Return type
    +

    Dict[str, object]

    +
    +
    +
    + +
    +
    +explain(row_insights, global_insights, **kwargs)[source]
    +

    This method should be called once during the explaining phase at inference time, or not called at all. +Additional explanations can be at an instance level (row-wise) or global. +For the former, return a data frame with any new insights. For the latter, a dictionary is required.

    +
    +
    Parameters
    +
      +
    • row_insights (DataFrame) – dataframe with previously computed row-level explanations.

    • +
    • global_insights (Dict[str, object]) – dict() with any explanations that concern all predicted instances or the model itself.

    • +
    +
    +
    Return type
    +

    Tuple[DataFrame, Dict[str, object]]

    +
    +
    Returns
    +

      +
    • row_insights: modified input dataframe with any new row insights added here.

    • +
    • global_insights: dict() with any explanations that concern all predicted instances or the model itself.

    • +
    +

    +
    +
    +
    + +
    + +
    +
    +analysis.explain(data, encoded_data, predictions, timeseries_settings, analysis, target_name, target_dtype, positive_domain, fixed_confidence, anomaly_detection, anomaly_error_rate, anomaly_cooldown, explainer_blocks=[], ts_analysis={})[source]
    +

    This procedure runs at the end of every normal .predict() call. Its goal is to generate prediction insights, +potentially using information generated at the model analysis stage (e.g. confidence estimation).

    +

    As in analysis(), any user-specified analysis blocks (see class BaseAnalysisBlock) are also called here.

    +
    +
    Returns
    +

    +
    +
    +

    row_insights: a DataFrame containing predictions and all generated insights at a row-level.

    +
    + +
    +
    +analysis.model_analyzer(predictor, data, train_data, stats_info, target, ts_cfg, dtype_dict, accuracy_functions, analysis_blocks=[])[source]
    +

Analyses the model on a validation subset to evaluate accuracy, estimate feature importance, and generate a calibration model for estimating confidence in future predictions.

    +

    Additionally, any user-specified analysis blocks (see class BaseAnalysisBlock) are also called here.

    +
    +
    Return type
    +

    Tuple[ModelAnalysis, Dict[str, object]]

    +
    +
    Returns
    +

    +
    +
    +

    runtime_analyzer: This dictionary object gets populated in a sequential fashion with data generated from +any .analyze() block call. This dictionary object is stored in the predictor itself, and used when +calling the .explain() method of all analysis blocks when generating predictions.

    +

    model_analysis: ModelAnalysis object that contains core analysis metrics, not necessarily needed when predicting.

    +
    + +
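To tie the above together, a hedged sketch of a custom block under this interface; the import path and the kwarg names ('normal_predictions', 'analysis') are assumptions based on the documented signatures, and the measurement itself is purely illustrative.

from typing import Dict, Tuple
import pandas as pd
from lightwood.analysis.base import BaseAnalysisBlock  # assumed import path

class PredictionRangeBlock(BaseAnalysisBlock):
    def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
        preds = kwargs['normal_predictions']['prediction']  # assumed kwarg name
        info['pred_range'] = (min(preds), max(preds))       # stored for explain()
        return info

    def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object],
                **kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
        global_insights['validation_pred_range'] = kwargs['analysis']['pred_range']  # assumed kwarg name
        return row_insights, global_insights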
\ No newline at end of file
diff --git a/docs/api.html b/docs/api.html
index 3519e4817..911fa0ad0 100644
--- a/docs/api.html
+++ b/docs/api.html
@@ -7,7 +7,7 @@
-  API Module — lightwood 1.6.0 documentation
+  API — lightwood 1.6.1 documentation
@@ -67,7 +67,7 @@
    - 1.6.0 + 1.6.1
    @@ -97,16 +97,21 @@

    diff --git a/docs/api/encode.html b/docs/api/encode.html index 6714794ed..e21e0d6b2 100644 --- a/docs/api/encode.html +++ b/docs/api/encode.html @@ -7,7 +7,7 @@ - Encode your data — lightwood 1.6.0 documentation + Encode your data — lightwood 1.6.1 documentation @@ -40,7 +40,9 @@ - + + + @@ -67,7 +69,7 @@
    - 1.6.0 + 1.6.1
    @@ -92,10 +94,24 @@ -

    diff --git a/docs/api/high_level.html b/docs/api/high_level.html index c98fb81b7..42ef5251a 100644 --- a/docs/api/high_level.html +++ b/docs/api/high_level.html @@ -7,7 +7,7 @@ - JSON-AI Config — lightwood 1.6.0 documentation + JSON-AI Config — lightwood 1.6.1 documentation @@ -40,7 +40,9 @@ - + + + @@ -67,7 +69,7 @@
    - 1.6.0 + 1.6.1
    @@ -92,10 +94,34 @@ -