Merge pull request #1172 from mindsdb/staging
Release 23.7.1.0
paxcema authored Jul 3, 2023
2 parents 3262859 + 9eadd14 commit 2812838
Showing 21 changed files with 516 additions and 178 deletions.
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '23.6.4.0'
__version__ = '23.7.1.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
97 changes: 54 additions & 43 deletions lightwood/analysis/analyze.py
@@ -1,5 +1,6 @@
from typing import Dict, List, Tuple, Optional

import numpy as np
from dataprep_ml import StatisticalAnalysis

from lightwood.helpers.log import log
@@ -8,7 +9,7 @@
from lightwood.analysis.base import BaseAnalysisBlock
from lightwood.data.encoded_ds import EncodedDs
from lightwood.encoder.text.pretrained import PretrainedLangEncoder
from lightwood.api.types import ModelAnalysis, TimeseriesSettings, PredictionArguments
from lightwood.api.types import ModelAnalysis, ProblemDefinition, PredictionArguments


def model_analyzer(
@@ -17,7 +18,7 @@ def model_analyzer(
train_data: EncodedDs,
stats_info: StatisticalAnalysis,
target: str,
tss: TimeseriesSettings,
pdef: ProblemDefinition,
dtype_dict: Dict[str, str],
accuracy_functions,
ts_analysis: Dict,
@@ -39,54 +40,64 @@ def model_analyzer(

runtime_analyzer = {}
data_type = dtype_dict[target]
tss = pdef.timeseries_settings

# retrieve encoded data representations
encoded_train_data = train_data
encoded_val_data = data
data = encoded_val_data.data_frame
input_cols = list([col for col in data.columns if col != target])

# predictive task
is_numerical = data_type in (dtype.integer, dtype.float, dtype.num_tsarray, dtype.quantity)
is_classification = data_type in (dtype.categorical, dtype.binary, dtype.cat_tsarray)
is_multi_ts = tss.is_timeseries and tss.horizon > 1
has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder)
for enc in encoded_train_data.encoders.values()])

# raw predictions for validation dataset
args = {} if not is_classification else {"predict_proba": True}
filtered_df = encoded_val_data.data_frame
normal_predictions = None

if len(analysis_blocks) > 0:
normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)

# ------------------------- #
# Run analysis blocks, both core and user-defined
# ------------------------- #
kwargs = {
'predictor': predictor,
'target': target,
'input_cols': input_cols,
'dtype_dict': dtype_dict,
'normal_predictions': normal_predictions,
'data': filtered_df,
'train_data': train_data,
'encoded_val_data': encoded_val_data,
'is_classification': is_classification,
'is_numerical': is_numerical,
'is_multi_ts': is_multi_ts,
'stats_info': stats_info,
'tss': tss,
'ts_analysis': ts_analysis,
'accuracy_functions': accuracy_functions,
'has_pretrained_text_enc': has_pretrained_text_enc
}

for block in analysis_blocks:
log.info("The block %s is now running its analyze() method", block.__class__.__name__)
runtime_analyzer = block.analyze(runtime_analyzer, **kwargs)
if not pdef.embedding_only:
# predictive task
is_numerical = data_type in (dtype.integer, dtype.float, dtype.num_tsarray, dtype.quantity)
is_classification = data_type in (dtype.categorical, dtype.binary, dtype.cat_tsarray)
is_multi_ts = tss.is_timeseries and tss.horizon > 1
has_pretrained_text_enc = any([isinstance(enc, PretrainedLangEncoder)
for enc in encoded_train_data.encoders.values()])

# raw predictions for validation dataset
args = {} if not is_classification else {"predict_proba": True}
normal_predictions = None

if len(analysis_blocks) > 0:
if tss.is_timeseries:
# we retrieve the first entry per group (closest to supervision cutoff)
if tss.group_by:
encoded_val_data.data_frame['__mdb_val_idx'] = np.arange(len(encoded_val_data))
idxs = encoded_val_data.data_frame.groupby(by=tss.group_by).first()['__mdb_val_idx'].values
encoded_val_data.data_frame = encoded_val_data.data_frame.iloc[idxs, :]
if encoded_val_data.cache_built:
encoded_val_data.X_cache = encoded_val_data.X_cache[idxs, :]
encoded_val_data.Y_cache = encoded_val_data.Y_cache[idxs, :]
normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)

# ------------------------- #
# Run analysis blocks, both core and user-defined
# ------------------------- #
kwargs = {
'predictor': predictor,
'target': target,
'input_cols': input_cols,
'dtype_dict': dtype_dict,
'normal_predictions': normal_predictions,
'data': encoded_val_data.data_frame,
'train_data': train_data,
'encoded_val_data': encoded_val_data,
'is_classification': is_classification,
'is_numerical': is_numerical,
'is_multi_ts': is_multi_ts,
'stats_info': stats_info,
'tss': tss,
'ts_analysis': ts_analysis,
'accuracy_functions': accuracy_functions,
'has_pretrained_text_enc': has_pretrained_text_enc
}

for block in analysis_blocks:
log.info("The block %s is now running its analyze() method", block.__class__.__name__)
runtime_analyzer = block.analyze(runtime_analyzer, **kwargs)

# ------------------------- #
# Populate ModelAnalysis object
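For grouped time series, the analyzer now scores a single row per group — the first entry, closest to the supervision cutoff — and subsets any prebuilt encoder caches with the same positional indices. A minimal, self-contained sketch of the indexing trick (toy data; the 'store' column plays the role of tss.group_by and is not from the commit):

import numpy as np
import pandas as pd

# Toy stand-in for the encoded validation frame
df = pd.DataFrame({
    'store': ['a', 'a', 'b', 'b', 'b'],
    'sales': [10, 11, 20, 21, 22],
})

# Tag each row with its positional index, then keep the first row per group
df['__mdb_val_idx'] = np.arange(len(df))
idxs = df.groupby(by=['store']).first()['__mdb_val_idx'].values
first_per_group = df.iloc[idxs, :]
print(first_per_group)  # one row per store: positions 0 and 2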
68 changes: 41 additions & 27 deletions lightwood/api/json_ai.py
@@ -1,19 +1,20 @@
# TODO: _add_implicit_values unit test ensures NO changes for a fully specified file.
import inspect
from copy import deepcopy

from type_infer.dtype import dtype
from type_infer.base import TypeInformation
from dataprep_ml import StatisticalAnalysis

from lightwood.helpers.log import log
from lightwood.helpers.templating import call, inline_dict, align
from lightwood.helpers.templating import _consolidate_analysis_blocks
from type_infer.dtype import dtype
from lightwood.helpers.templating import _consolidate_analysis_blocks, _add_cls_kwarg
from lightwood.api.types import (
JsonAI,
ProblemDefinition,
)
import inspect
from lightwood.helpers.log import log
from lightwood.__about__ import __version__ as lightwood_version

import lightwood.ensemble

# For custom modules, we create a module loader with necessary imports below
IMPORT_EXTERNAL_DIRS = """
@@ -535,29 +536,29 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
problem_definition = json_ai.problem_definition
tss = problem_definition.timeseries_settings
is_ts = tss.is_timeseries
# tsa_val = "self.ts_analysis" if is_ts else None # TODO: remove
mixers = json_ai.model['args']['submodels']

# Add implicit ensemble arguments
json_ai.model["args"]["target"] = json_ai.model["args"].get("target", "$target")
json_ai.model["args"]["data"] = json_ai.model["args"].get("data", "encoded_test_data")
json_ai.model["args"]["mixers"] = json_ai.model["args"].get("mixers", "$mixers")
json_ai.model["args"]["fit"] = json_ai.model["args"].get("fit", True)
json_ai.model["args"]["args"] = json_ai.model["args"].get("args", "$pred_args") # TODO correct?

# @TODO: change this to per-parameter basis and signature inspection
if json_ai.model["module"] in ("BestOf", "ModeEnsemble", "WeightedMeanEnsemble"):
json_ai.model["args"]["accuracy_functions"] = json_ai.model["args"].get("accuracy_functions",
"$accuracy_functions")

if json_ai.model["module"] in ("BestOf", "TsStackedEnsemble", "WeightedMeanEnsemble"):
tsa_val = "self.ts_analysis" if is_ts else None
json_ai.model["args"]["ts_analysis"] = json_ai.model["args"].get("ts_analysis", tsa_val)
param_pairs = {
'target': json_ai.model["args"].get("target", "$target"),
'data': json_ai.model["args"].get("data", "encoded_test_data"),
'mixers': json_ai.model["args"].get("mixers", "$mixers"),
'fit': json_ai.model["args"].get("fit", True),
'args': json_ai.model["args"].get("args", "$pred_args"),
'accuracy_functions': json_ai.model["args"].get("accuracy_functions", "$accuracy_functions"),
'ts_analysis': json_ai.model["args"].get("ts_analysis", "self.ts_analysis" if is_ts else None),
'dtype_dict': json_ai.model["args"].get("dtype_dict", "$dtype_dict"),
}
ensemble_cls = getattr(lightwood.ensemble, json_ai.model["module"])
filtered_params = {}
for p_name, p_value in param_pairs.items():
_add_cls_kwarg(ensemble_cls, filtered_params, p_name, p_value)

if json_ai.model["module"] in ("MeanEnsemble", "ModeEnsemble", "StackedEnsemble", "TsStackedEnsemble",
"WeightedMeanEnsemble"):
json_ai.model["args"]["dtype_dict"] = json_ai.model["args"].get("dtype_dict", "$dtype_dict")
json_ai.model["args"] = filtered_params
json_ai.model["args"]['submodels'] = mixers # add mixers back in

# Add implicit mixer arguments
mixers = json_ai.model['args']['submodels']
for i in range(len(mixers)):
if not mixers[i].get("args", False):
mixers[i]["args"] = {}
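The hardcoded per-module argument lists are replaced by signature-based filtering: every candidate kwarg is offered to the ensemble class and kept only if its constructor accepts it. The real _add_cls_kwarg lives in lightwood.helpers.templating; the sketch below is an assumption reconstructed from the call site, with a hypothetical ensemble class standing in for the real ones:

import inspect

def _add_cls_kwarg(cls, kwargs: dict, key: str, value) -> None:
    # Assumed behavior: keep `key` only if it is a parameter of cls.__init__
    if key in inspect.signature(cls.__init__).parameters:
        kwargs[key] = value

class DummyEnsemble:
    # Hypothetical ensemble: accepts `target` and `mixers`, not `dtype_dict`
    def __init__(self, target, mixers):
        self.target, self.mixers = target, mixers

filtered = {}
candidates = {'target': '$target', 'mixers': '$mixers', 'dtype_dict': '$dtype_dict'}
for name, value in candidates.items():
    _add_cls_kwarg(DummyEnsemble, filtered, name, value)
print(filtered)  # {'target': '$target', 'mixers': '$mixers'}: dtype_dict dropped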
@@ -685,7 +686,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"module": "model_analyzer",
"args": {
"stats_info": "$statistical_analysis",
"tss": "$problem_definition.timeseries_settings",
"pdef": "$problem_definition",
"accuracy_functions": "$accuracy_functions",
"predictor": "$ensemble",
"data": "encoded_test_data",
@@ -1170,7 +1171,12 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
# Prepare mixers
log.info(f'[Learn phase 6/{n_phases}] - Mixer training')
self.fit(enc_train_test)
if not self.problem_definition.embedding_only:
self.fit(enc_train_test)
else:
self.mixers = []
self.ensemble = Embedder(self.target, mixers=list(), data=enc_train_test['train'])
self.supports_proba = self.ensemble.supports_proba
# Analyze the ensemble
log.info(f'[Learn phase 7/{n_phases}] - Ensemble analysis')
@@ -1221,9 +1227,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
encoded_data = encoded_ds.get_encoded_data(include_target=False)
log.info(f'[Predict phase 3/{{n_phases}}] - Calling ensemble')
df = self.ensemble(encoded_ds, args=self.pred_args)
if self.pred_args.return_embedding:
embedder = Embedder(self.target, mixers=list(), data=encoded_ds)
df = embedder(encoded_ds, args=self.pred_args)
else:
df = self.ensemble(encoded_ds, args=self.pred_args)
if not self.pred_args.all_mixers:
if not(any(
[self.pred_args.all_mixers,
self.pred_args.return_embedding,
self.problem_definition.embedding_only]
)):
log.info(f'[Predict phase 4/{{n_phases}}] - Analyzing output')
df, global_insights = {call(json_ai.explainer)}
self.global_insights = {{**self.global_insights, **global_insights}}
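From the caller's side, the branches above map to one training-time flag and one inference-time flag. A hedged usage sketch: the predictor_from_problem / learn / predict flow follows lightwood's high-level API, but passing args to predict this way is an assumption based on the generated code, and 'my_data.csv' / 'price' are placeholders:

import pandas as pd
from lightwood.api.high_level import predictor_from_problem
from lightwood.api.types import ProblemDefinition

df = pd.read_csv('my_data.csv')  # placeholder dataset with a 'price' target

# Embedding-only training: mixers are skipped and the ensemble is an Embedder
pdef = ProblemDefinition.from_dict({'target': 'price', 'embedding_only': True})
predictor = predictor_from_problem(df, pdef)
predictor.learn(df)

# A regular predictor can also return embeddings on a per-call basis
embeddings = predictor.predict(df, args={'return_embedding': True})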
6 changes: 6 additions & 0 deletions lightwood/api/types.py
@@ -185,6 +185,7 @@ class ProblemDefinition:
timeseries_settings: TimeseriesSettings
anomaly_detection: bool
use_default_analysis: bool
embedding_only: bool
dtype_dict: Optional[dict]
ignore_features: List[str]
fit_on_all: bool
@@ -220,6 +221,7 @@ def from_dict(obj: Dict):
ignore_features = obj.get('ignore_features', [])
fit_on_all = obj.get('fit_on_all', True)
use_default_analysis = obj.get('use_default_analysis', True)
embedding_only = obj.get('embedding_only', False)
strict_mode = obj.get('strict_mode', True)
seed_nr = obj.get('seed_nr', 1)
problem_definition = ProblemDefinition(
@@ -237,6 +239,7 @@ def from_dict(obj: Dict):
dtype_dict=dtype_dict,
ignore_features=ignore_features,
use_default_analysis=use_default_analysis,
embedding_only=embedding_only,
fit_on_all=fit_on_all,
strict_mode=strict_mode,
seed_nr=seed_nr
@@ -453,6 +456,7 @@ class PredictionArguments:
simple_ts_bounds: bool = False
time_format: str = ''
force_ts_infer: bool = False
return_embedding: bool = False

@staticmethod
def from_dict(obj: Dict):
@@ -474,6 +478,7 @@ def from_dict(obj: Dict):
simple_ts_bounds = obj.get('simple_ts_bounds', PredictionArguments.simple_ts_bounds)
time_format = obj.get('time_format', PredictionArguments.time_format)
force_ts_infer = obj.get('force_ts_infer', PredictionArguments.force_ts_infer)
return_embedding = obj.get('return_embedding', PredictionArguments.return_embedding)

pred_args = PredictionArguments(
predict_proba=predict_proba,
@@ -485,6 +490,7 @@ def from_dict(obj: Dict):
simple_ts_bounds=simple_ts_bounds,
time_format=time_format,
force_ts_infer=force_ts_infer,
return_embedding=return_embedding,
)

return pred_args
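Both new fields follow the existing from_dict pattern: absent keys fall back to the declared defaults, so older problem definitions and stored JSON-AI specs keep deserializing unchanged. A quick round-trip sketch (values chosen for illustration):

from lightwood.api.types import PredictionArguments, ProblemDefinition

args = PredictionArguments.from_dict({'return_embedding': True})
assert args.return_embedding is True
assert args.force_ts_infer is False  # omitted key, class default preserved

pdef = ProblemDefinition.from_dict({'target': 'price'})
assert pdef.embedding_only is False  # default when the key is absent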
39 changes: 20 additions & 19 deletions lightwood/data/timeseries_transform.py
@@ -109,6 +109,7 @@ def transform_timeseries(
secondary_type_dict[oby] = dtype_dict[oby]

original_df[f'__mdb_original_{oby}'] = original_df[oby]
original_df = _ts_to_obj(original_df, [oby] + tss.historical_columns)
group_lengths = []
if len(gb_arr) > 0:
df_arr = []
@@ -136,30 +137,30 @@ def transform_timeseries(
make_preds = [True for _ in range(len(df_arr[i]))]
df_arr[i]['__make_predictions'] = make_preds

if len(original_df) > 500:
if len(df_arr) > 1 and len(original_df) > 5000:
# @TODO: restore possibility to override this with args
nr_procs = get_nr_procs(original_df)
biggest_sub_df = df_arr[np.argmax(group_lengths)]
nr_procs = min(get_nr_procs(biggest_sub_df), len(df_arr))
log.info(f'Using {nr_procs} processes to reshape.')
pool = mp.Pool(processes=nr_procs)
# Make type `object` so that dataframe cells can be python lists
df_arr = pool.map(partial(_ts_to_obj, historical_columns=[oby] + tss.historical_columns), df_arr)
df_arr = pool.map(
partial(
_ts_add_previous_rows, order_cols=[oby] + tss.historical_columns, window=window),
df_arr)
df_arr = pool.map(partial(_ts_add_future_target, target=target, horizon=tss.horizon,
data_dtype=tss.target_type, mode=mode),
df_arr)

if tss.use_previous_target:
with mp.Pool(processes=nr_procs) as pool:
df_arr = pool.map(
partial(_ts_add_previous_target, target=target, window=tss.window),
df_arr)
pool.close()
pool.join()
partial(_ts_add_previous_rows, order_cols=[oby] + tss.historical_columns, window=window),
df_arr
)

df_arr = pool.map(
partial(_ts_add_future_target, target=target, horizon=tss.horizon,
data_dtype=tss.target_type, mode=mode),
df_arr
)

if tss.use_previous_target:
df_arr = pool.map(
partial(_ts_add_previous_target, target=target, window=tss.window),
df_arr
)
else:
for i in range(n_groups):
df_arr[i] = _ts_to_obj(df_arr[i], historical_columns=[oby] + tss.historical_columns)
df_arr[i] = _ts_add_previous_rows(df_arr[i],
order_cols=[oby] + tss.historical_columns, window=window)
df_arr[i] = _ts_add_future_target(df_arr[i], target=target, horizon=tss.horizon,
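The reshape now parallelizes only when it can pay off — more than one group and over 5000 rows — sizes the pool by the largest sub-frame, and reuses a single pool across the consecutive map calls instead of the previous one-pool-plus-nested-pool layout. A minimal sketch of the pattern, with a stub standing in for the real _ts_* helpers:

import multiprocessing as mp
from functools import partial

def _stage(rows, label=''):
    # Stub for _ts_add_previous_rows / _ts_add_future_target / etc.
    return [(label, r) for r in rows]

if __name__ == '__main__':
    df_arr = [list(range(4000)), list(range(3000))]  # one entry per group
    total = sum(len(d) for d in df_arr)

    if len(df_arr) > 1 and total > 5000:
        nr_procs = min(mp.cpu_count(), len(df_arr))
        with mp.Pool(processes=nr_procs) as pool:  # one pool, several maps
            df_arr = pool.map(partial(_stage, label='prev'), df_arr)
            df_arr = pool.map(partial(_stage, label='future'), df_arr)
    else:
        df_arr = [_stage(d) for d in df_arr]  # serial path for small inputs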
3 changes: 2 additions & 1 deletion lightwood/encoder/__init__.py
@@ -9,6 +9,7 @@
from lightwood.encoder.text.short import ShortTextEncoder
from lightwood.encoder.text.vocab import VocabularyEncoder
from lightwood.encoder.text.rnn import RnnEncoder as TextRnnEncoder
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
from lightwood.encoder.categorical.onehot import OneHotEncoder
from lightwood.encoder.categorical.binary import BinaryEncoder
from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder
@@ -23,5 +24,5 @@
__all__ = ['BaseEncoder', 'DatetimeEncoder', 'Img2VecEncoder', 'NumericEncoder', 'TsNumericEncoder',
'TsArrayNumericEncoder', 'ShortTextEncoder', 'VocabularyEncoder', 'TextRnnEncoder', 'OneHotEncoder',
'CategoricalAutoEncoder', 'TimeSeriesEncoder', 'ArrayEncoder', 'MultiHotEncoder', 'TsCatArrayEncoder',
'NumArrayEncoder', 'CatArrayEncoder',
'NumArrayEncoder', 'CatArrayEncoder', 'SimpleLabelEncoder',
'PretrainedLangEncoder', 'BinaryEncoder', 'DatetimeNormalizerEncoder', 'MFCCEncoder']
3 changes: 2 additions & 1 deletion lightwood/encoder/categorical/__init__.py
@@ -1,5 +1,6 @@
from lightwood.encoder.categorical.onehot import OneHotEncoder
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
from lightwood.encoder.categorical.multihot import MultiHotEncoder
from lightwood.encoder.categorical.autoencoder import CategoricalAutoEncoder

__all__ = ['OneHotEncoder', 'MultiHotEncoder', 'CategoricalAutoEncoder']
__all__ = ['OneHotEncoder', 'SimpleLabelEncoder', 'MultiHotEncoder', 'CategoricalAutoEncoder']
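SimpleLabelEncoder is now exported from both lightwood.encoder and lightwood.encoder.categorical. A hedged usage sketch: prepare/encode/decode follow lightwood's BaseEncoder convention, but the exact output format (tensor shape, label normalization) is an assumption:

from lightwood.encoder import SimpleLabelEncoder

enc = SimpleLabelEncoder()
enc.prepare(['red', 'green', 'blue', 'green'])  # fit the label vocabulary
vecs = enc.encode(['green', 'blue'])            # tensor of per-label codes
labels = enc.decode(vecs)                       # back to ['green', 'blue']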