AutoAI experiment#

AutoAI#

class ibm_watson_machine_learning.experiment.autoai.autoai.AutoAI(wml_credentials=None, project_id=None, space_id=None, verify=None)[source]#

Bases: BaseExperiment

AutoAI class for automating the optimization of pipeline models.

Parameters:
  • wml_credentials (dict) – credentials for the Watson Machine Learning instance

  • project_id (str, optional) – ID of the Watson Studio project

  • space_id (str, optional) – ID of the Watson Studio Space

  • verify (bool or str, optional) –

    as verify, the user can pass one of the following:

    • the path to a CA_BUNDLE file

    • the path of directory with certificates of trusted CAs

    • True - the default path to the truststore will be used

    • False - no verification will be performed

Example

from ibm_watson_machine_learning.experiment import AutoAI

experiment = AutoAI(
    wml_credentials={
        "apikey": "...",
        "iam_apikey_description": "...",
        "iam_apikey_name": "...",
        "iam_role_crn": "...",
        "iam_serviceid_crn": "...",
        "instance_id": "...",
        "url": "https://us-south.ml.cloud.ibm.com"
    },
    project_id="...",
    space_id="...")
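A constructor call against IBM Cloud Pak® for Data looks similar; the sketch below exists mainly to illustrate the verify option described above, and every credential value (url, username, apikey, version) is a placeholder, not a real instance:

# A minimal sketch for CP4D; all credential values are placeholders.
experiment = AutoAI(
    wml_credentials={
        "url": "https://cpd-host.example.com",
        "username": "...",
        "apikey": "...",
        "instance_id": "openshift",
        "version": "4.6"
    },
    space_id="...",
    verify=False  # or the path to a CA_BUNDLE file
)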
class ClassificationAlgorithms(value)#

Bases: Enum

Classification algorithms that AutoAI can use for IBM Cloud.

DT = 'DecisionTreeClassifier'#
EX_TREES = 'ExtraTreesClassifier'#
GB = 'GradientBoostingClassifier'#
LGBM = 'LGBMClassifier'#
LR = 'LogisticRegression'#
RF = 'RandomForestClassifier'#
SnapBM = 'SnapBoostingMachineClassifier'#
SnapDT = 'SnapDecisionTreeClassifier'#
SnapLR = 'SnapLogisticRegression'#
SnapRF = 'SnapRandomForestClassifier'#
SnapSVM = 'SnapSVMClassifier'#
XGB = 'XGBClassifier'#
class DataConnectionTypes#

Bases: object

Supported types of DataConnection.

CA = 'connection_asset'#
CN = 'container'#
DS = 'data_asset'#
FS = 'fs'#
S3 = 's3'#
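These type strings are populated internally when you construct DataConnection objects from the library's helper classes. A minimal sketch (the asset ID, bucket, and path values are placeholders):

from ibm_watson_machine_learning.helpers import DataConnection, S3Location

# 'data_asset' connection: references a data asset stored in the project/space
training_data = DataConnection(data_asset_id="...")

# 'connection_asset' connection: references a file behind a connection,
# here an S3 / Cloud Object Storage location
results = DataConnection(
    connection_asset_id="...",
    location=S3Location(bucket="my-bucket", path="results/")
)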
class ForecastingAlgorithms(value)#

Bases: Enum

Forecasting algorithms that AutoAI can use for IBM Cloud Pak® for Data (CP4D).

ARIMA = 'ARIMA'#
BATS = 'BATS'#
ENSEMBLER = 'Ensembler'#
HW = 'HoltWinters'#
LR = 'LinearRegression'#
RF = 'RandomForest'#
SVM = 'SVM'#
class Metrics#

Bases: object

Supported types of classification and regression metrics in AutoAI.

ACCURACY_AND_DISPARATE_IMPACT_SCORE = 'accuracy_and_disparate_impact'#
ACCURACY_SCORE = 'accuracy'#
AVERAGE_PRECISION_SCORE = 'average_precision'#
EXPLAINED_VARIANCE_SCORE = 'explained_variance'#
F1_SCORE = 'f1'#
F1_SCORE_MACRO = 'f1_macro'#
F1_SCORE_MICRO = 'f1_micro'#
F1_SCORE_WEIGHTED = 'f1_weighted'#
LOG_LOSS = 'neg_log_loss'#
MEAN_ABSOLUTE_ERROR = 'neg_mean_absolute_error'#
MEAN_SQUARED_ERROR = 'neg_mean_squared_error'#
MEAN_SQUARED_LOG_ERROR = 'neg_mean_squared_log_error'#
MEDIAN_ABSOLUTE_ERROR = 'neg_median_absolute_error'#
PRECISION_SCORE = 'precision'#
PRECISION_SCORE_MACRO = 'precision_macro'#
PRECISION_SCORE_MICRO = 'precision_micro'#
PRECISION_SCORE_WEIGHTED = 'precision_weighted'#
R2_AND_DISPARATE_IMPACT_SCORE = 'r2_and_disparate_impact'#
R2_SCORE = 'r2'#
RECALL_SCORE = 'recall'#
RECALL_SCORE_MACRO = 'recall_macro'#
RECALL_SCORE_MICRO = 'recall_micro'#
RECALL_SCORE_WEIGHTED = 'recall_weighted'#
ROC_AUC_SCORE = 'roc_auc'#
ROOT_MEAN_SQUARED_ERROR = 'neg_root_mean_squared_error'#
ROOT_MEAN_SQUARED_LOG_ERROR = 'neg_root_mean_squared_log_error'#
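A metric is passed through the scoring argument of optimizer(). A short sketch reusing the experiment object from above (the name and column are placeholders):

# Optimize a regression experiment for RMSE
optimizer = experiment.optimizer(
    name="regression example",
    prediction_type=AutoAI.PredictionType.REGRESSION,
    prediction_column="price",
    scoring=AutoAI.Metrics.ROOT_MEAN_SQUARED_ERROR
)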
class PipelineTypes#

Bases: object

Supported types of Pipelines.

LALE = 'lale'#
SKLEARN = 'sklearn'#
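These values select the object type returned when a trained pipeline is retrieved; a sketch assuming the optimizer's get_pipeline() method with its astype keyword (the pipeline name is a placeholder):

# Fetch a trained pipeline as a plain scikit-learn Pipeline
pipeline = optimizer.get_pipeline(
    pipeline_name="Pipeline_1",
    astype=AutoAI.PipelineTypes.SKLEARN  # default is AutoAI.PipelineTypes.LALE
)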
class PredictionType#

Bases: object

Supported types of learning.

BINARY = 'binary'#
CLASSIFICATION = 'classification'#
FORECASTING = 'forecasting'#
MULTICLASS = 'multiclass'#
REGRESSION = 'regression'#
TIMESERIES_ANOMALY_PREDICTION = 'timeseries_anomaly_prediction'#
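A sketch of a forecasting configuration that combines PredictionType with the ForecastingAlgorithms enum above (all column names are placeholders):

optimizer = experiment.optimizer(
    name="forecasting example",
    prediction_type=AutoAI.PredictionType.FORECASTING,
    prediction_columns=["sales"],
    timestamp_column_name="week",
    forecast_window=1,
    include_only_estimators=[
        AutoAI.ForecastingAlgorithms.ARIMA,
        AutoAI.ForecastingAlgorithms.BATS
    ]
)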
class RegressionAlgorithms(value)#

Bases: Enum

Regression algorithms that AutoAI can use for IBM Cloud.

DT = 'DecisionTreeRegressor'#
EX_TREES = 'ExtraTreesRegressor'#
GB = 'GradientBoostingRegressor'#
LGBM = 'LGBMRegressor'#
LR = 'LinearRegression'#
RF = 'RandomForestRegressor'#
RIDGE = 'Ridge'#
SnapBM = 'SnapBoostingMachineRegressor'#
SnapDT = 'SnapDecisionTreeRegressor'#
SnapRF = 'SnapRandomForestRegressor'#
XGB = 'XGBRegressor'#
class SamplingTypes#

Bases: object

Types of training data sampling.

FIRST_VALUES = 'first_n_records'#
LAST_VALUES = 'truncate'#
RANDOM = 'random'#
STRATIFIED = 'stratified'#
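Sampling is configured through the sampling_type and sample-limit arguments of optimizer(), documented below; a sketch (the column name and limit are placeholders):

# Random sampling capped at 100,000 rows
# (sample_rows_limit requires IBM Cloud Pak® for Data 4.6 and above)
optimizer = experiment.optimizer(
    name="sampled training",
    prediction_type=AutoAI.PredictionType.BINARY,
    prediction_column="y",
    sampling_type=AutoAI.SamplingTypes.RANDOM,
    sample_rows_limit=100000
)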
class TShirtSize#

Bases: object

Possible sizes of the AutoAI POD. Depending on the POD size, AutoAI can support different dataset sizes.

  • S - small (2vCPUs and 8GB of RAM)

  • M - Medium (4vCPUs and 16GB of RAM)

  • L - Large (8vCPUs and 32GB of RAM)

  • XL - Extra Large (16vCPUs and 64GB of RAM)

L = 'l'#
M = 'm'#
S = 's'#
XL = 'xl'#
class Transformers#

Bases: object

Supported types of cognito transformer names in AutoAI.

ABS = 'abs'#
CBRT = 'cbrt'#
COS = 'cos'#
CUBE = 'cube'#
DIFF = 'diff'#
DIVIDE = 'divide'#
FEATUREAGGLOMERATION = 'featureagglomeration'#
ISOFORESTANOMALY = 'isoforestanomaly'#
LOG = 'log'#
MAX = 'max'#
MINMAXSCALER = 'minmaxscaler'#
NXOR = 'nxor'#
PCA = 'pca'#
PRODUCT = 'product'#
ROUND = 'round'#
SIGMOID = 'sigmoid'#
SIN = 'sin'#
SQRT = 'sqrt'#
SQUARE = 'square'#
STDSCALER = 'stdscaler'#
SUM = 'sum'#
TAN = 'tan'#
optimizer(name, *, prediction_type, prediction_column=None, prediction_columns=None, timestamp_column_name=None, scoring=None, desc=None, test_size=None, holdout_size=None, max_number_of_estimators=None, train_sample_rows_test_size=None, include_only_estimators=None, daub_include_only_estimators=None, include_batched_ensemble_estimators=None, backtest_num=None, lookback_window=None, forecast_window=None, backtest_gap_length=None, feature_columns=None, pipeline_types=None, supporting_features_at_forecast=None, cognito_transform_names=None, data_join_graph=None, csv_separator=',', excel_sheet=None, encoding='utf-8', positive_label=None, data_join_only=False, drop_duplicates=True, outliers_columns=None, text_processing=None, word2vec_feature_number=None, daub_give_priority_to_runtime=None, fairness_info=None, sampling_type=None, sample_size_limit=None, sample_rows_limit=None, sample_percentage_limit=None, n_parallel_data_connections=None, number_of_batch_rows=None, categorical_imputation_strategy=None, numerical_imputation_strategy=None, numerical_imputation_value=None, imputation_threshold=None, retrain_on_holdout=None, categorical_columns=None, numerical_columns=None, test_data_csv_separator=',', test_data_excel_sheet=None, test_data_encoding='utf-8', confidence_level=None, incremental_learning=None, early_stop_enabled=None, early_stop_window_size=None, time_ordered_data=None, feature_selector_mode=None, **kwargs)[source]#

Initialize an AutoAI optimizer.

Parameters:
  • name (str) – name for the AutoPipelines

  • prediction_type (PredictionType) – type of the prediction

  • prediction_column (str, optional) – name of the target/label column, required for multiclass, binary and regression prediction types

  • prediction_columns (list[str], optional) – names of the target/label columns, required for forecasting prediction type

  • timestamp_column_name (str, optional) – name of timestamp column for time series forecasting

  • scoring (Metrics, optional) – type of the metric to optimize with, not used for forecasting

  • desc (str, optional) – description

  • test_size – deprecated, use holdout_size instead

  • holdout_size (float, optional) – percentage of the entire dataset to leave as a holdout

  • max_number_of_estimators (int, optional) – maximum number (top-K ranked by DAUB model selection) of estimator types, for example LGBMClassifierEstimator, XGBoostClassifierEstimator, or LogisticRegressionEstimator, to use in pipeline composition; the default is None, which means the effective value is determined internally and only the estimator type ranked highest by model selection is used

  • train_sample_rows_test_size (float, optional) – training data sampling percentage

  • daub_include_only_estimators – deprecated, use include_only_estimators instead

  • include_batched_ensemble_estimators (list[BatchedClassificationAlgorithms or BatchedRegressionAlgorithms], optional) – list of batched ensemble estimators to include in computation process, see: AutoAI.BatchedClassificationAlgorithms, AutoAI.BatchedRegressionAlgorithms

  • include_only_estimators (list[ClassificationAlgorithms or RegressionAlgorithms or ForecastingAlgorithms], optional) – list of estimators to include in the computation process, see: AutoAI.ClassificationAlgorithms, AutoAI.RegressionAlgorithms or AutoAI.ForecastingAlgorithms

  • backtest_num (int, optional) – number of backtests used for forecasting prediction type, default value: 4, value from range [0, 20]

  • lookback_window (int, optional) – length of lookback window used for forecasting prediction type, default value: 10, if set to -1 lookback window will be auto-detected

  • forecast_window (int, optional) – length of forecast window used for forecasting prediction type, default value: 1, value from range [1, 60]

  • backtest_gap_length (int, optional) – gap between backtests used for forecasting prediction type, default value: 0, value from range [0, data length / 4]

  • feature_columns (list[str], optional) – list of feature columns used for the forecasting prediction type, may contain the target column and/or supporting feature columns; for the timeseries anomaly prediction type, the list of columns to be checked for anomalies

  • pipeline_types (list[ForecastingPipelineTypes or TimeseriesAnomalyPredictionPipelineTypes], optional) – list of pipeline types to be used for forecasting or timeseries anomaly prediction type

  • supporting_features_at_forecast (bool, optional) – enables usage of future supporting feature values during forecast

  • cognito_transform_names (list[Transformers], optional) – list of transformers to include in the feature engineering computation process, see: AutoAI.Transformers

  • csv_separator (list[str] or str, optional) – the separator, or list of separators to try for separating columns in a CSV file, not used if the file_name is not a CSV file, default is ‘,’

  • excel_sheet (str, optional) – name of the Excel sheet to use, only applicable when an xlsx file is the input; support for passing the sheet number is deprecated; by default the first sheet is used

  • encoding (str, optional) – encoding type for CSV training file

  • positive_label (str, optional) – the positive class to report for binary classification; ignored for multiclass and regression

  • t_shirt_size (TShirtSize, optional) – the size of the remote AutoAI POD instance (computing resources), only applicable to a remote scenario, see: AutoAI.TShirtSize

  • data_join_graph (DataJoinGraph, optional) – a graph object with definition of join structure for multiple input data sources, data preprocess step for multiple files

  • data_join_only (bool, optional) – if True only preprocessing will be executed

  • drop_duplicates (bool, optional) – if True duplicated rows in data will be removed before further processing

  • outliers_columns (list, optional) – replace outliers with NaN using the IQR method for the specified columns; by default, turned on for the regression learning type and the target column; to turn it off, pass an empty list of columns

  • text_processing (bool, optional) – if True text processing will be enabled, applicable only on Cloud

  • word2vec_feature_number (int, optional) – number of features which will be generated from text column, will be applied only if text_processing is True, if None the default value will be taken

  • daub_give_priority_to_runtime (float, optional) – the importance of runtime over score when ranking pipelines, can take values between 0 and 5; if set to 0.0, only the score is used; if set to 1, score and runtime are weighted equally; if set to a value higher than 1, runtime is given more importance than score

  • fairness_info (fairness_info) – dictionary that specifies the metadata needed for measuring fairness; it contains three keys: favorable_labels, unfavorable_labels, and protected_attributes; favorable_labels indicates that when the class column contains one of the listed values, the outcome is considered positive; unfavorable_labels is the opposite of favorable_labels and is obligatory for the regression learning type; a protected attribute is a list of features that partition the population into groups whose outcomes should have parity; if the protected attributes list is empty, automatic detection of protected attributes is run; if fairness_info is passed, a fairness metric is calculated

  • n_parallel_data_connections (int, optional) – number of maximum parallel connection to data source, supported only for IBM Cloud Pak® for Data 4.0.1 and above

  • categorical_imputation_strategy (ImputationStrategy, optional) –

    missing values imputation strategy for categorical columns

    Possible values (only non-forecasting scenario):

    • ImputationStrategy.MEAN

    • ImputationStrategy.MEDIAN

    • ImputationStrategy.MOST_FREQUENT (default)

  • numerical_imputation_strategy (ImputationStrategy, optional) –

    missing values imputation strategy for numerical columns

    Possible values (non-forecasting scenario):

    • ImputationStrategy.MEAN

    • ImputationStrategy.MEDIAN (default)

    • ImputationStrategy.MOST_FREQUENT

    Possible values (forecasting scenario):

    • ImputationStrategy.MEAN

    • ImputationStrategy.MEDIAN

    • ImputationStrategy.BEST_OF_DEFAULT_IMPUTERS (default)

    • ImputationStrategy.VALUE

    • ImputationStrategy.FLATTEN_ITERATIVE

    • ImputationStrategy.LINEAR

    • ImputationStrategy.CUBIC

    • ImputationStrategy.PREVIOUS

    • ImputationStrategy.NEXT

    • ImputationStrategy.NO_IMPUTATION

  • numerical_imputation_value (float, optional) – value for filling missing values if numerical_imputation_strategy is set to ImputationStrategy.VALUE, for forecasting only

  • imputation_threshold (float, optional) – maximum threshold of missing values imputation, for forecasting only

  • retrain_on_holdout (bool, optional) – if True, final pipelines will also be trained on the holdout data

  • categorical_columns (list, optional) – list of column names that must be treated as categorical

  • numerical_columns (list, optional) – list of column names that must be treated as numerical

  • sampling_type (str, optional) – type of sampling data for training, one of the SamplingTypes enum values, default is SamplingTypes.FIRST_VALUES ('first_n_records'), supported only for IBM Cloud Pak® for Data 4.0.1 and above

  • sample_size_limit (int, optional) – the size of sample upper bound (in bytes). The default value is 1GB, supported only for IBM Cloud Pak® for Data 4.5 and above

  • sample_rows_limit (int, optional) – the size of sample upper bound (in rows), supported only for IBM Cloud Pak® for Data 4.6 and above

  • sample_percentage_limit (float, optional) – the size of sample upper bound (as fraction of dataset size), supported only for IBM Cloud Pak® for Data 4.6 and above

  • number_of_batch_rows (int, optional) – number of rows to read in each batch when reading from flight connection

  • test_data_csv_separator (list[str] or str, optional) – the separator, or list of separators to try for separating columns in a CSV user-defined holdout/test file, not used if the file_name is not a CSV file, default is ‘,’

  • test_data_excel_sheet (str or int, optional) – name of the Excel sheet to use for user-defined holdout/test data, only applicable when an xlsx file is the test dataset file; by default the first sheet is used

  • test_data_encoding (str, optional) – encoding type for CSV user-defined holdout/test file

  • confidence_level (float, optional) – when the pipeline “PointwiseBoundedHoltWinters” or “PointwiseBoundedBATS” is used, the prediction interval is calculated at a given confidence_level to decide if a data record is an anomaly or not, optional for timeseries anomaly prediction

  • incremental_learning (bool, optional) – triggers incremental learning process for supported pipelines

  • early_stop_enabled (bool, optional) – enables early stop for incremental learning process

  • early_stop_window_size (int, optional) – the number of iterations without score improvement before training stops

  • time_ordered_data (bool, optional) – defines the user preference about time-based analysis; if True, the analysis will consider the data as time-ordered and time-based; supported only for regression

  • feature_selector_mode (str, optional) – defines whether the feature selector should be triggered [“on”, “off”, “auto”]. The “auto” mode analyses the impact of removing insignificant features; if there is a drop in accuracy, PCA is applied to the insignificant features and principal components explaining 30% or more of the variance are selected in their place, then the model is evaluated again; if there is still a drop in accuracy, all features are used. The “on” mode removes all insignificant features (0.0 importance). The feature selector is applied during the cognito phase (applicable to pipelines with a feature engineering stage).

Returns:

RemoteAutoPipelines or LocalAutoPipelines, depending on how you initialized the AutoAI object

Return type:

RemoteAutoPipelines or LocalAutoPipelines

Examples

from ibm_watson_machine_learning.experiment import AutoAI
experiment = AutoAI(...)

fairness_info = {
           "protected_attributes": [
               {"feature": "Sex", "reference_group": ['male'], "monitored_group": ['female']},
               {"feature": "Age", "reference_group": [[50,60]], "monitored_group": [[18, 49]]}
           ],
           "favorable_labels": ["No Risk"],
           "unfavorable_labels": ["Risk"],
           }

optimizer = experiment.optimizer(
       name="name of the optimizer.",
       prediction_type=AutoAI.PredictionType.BINARY,
       prediction_column="y",
       scoring=AutoAI.Metrics.ROC_AUC_SCORE,
       desc="Some description.",
       holdout_size=0.1,
       max_number_of_estimators=1,
       fairness_info=fairness_info,
       cognito_transform_names=[AutoAI.Transformers.SUM, AutoAI.Transformers.MAX],
       train_sample_rows_test_size=1,
       include_only_estimators=[AutoAI.ClassificationAlgorithms.LGBM, AutoAI.ClassificationAlgorithms.XGB],
       t_shirt_size=AutoAI.TShirtSize.L
   )

optimizer = experiment.optimizer(
       name="name of the optimizer.",
       prediction_type=AutoAI.PredictionType.MULTICLASS,
       prediction_column="y",
       scoring=AutoAI.Metrics.ROC_AUC_SCORE,
       desc="Some description.",
   )
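Once configured, the optimizer is trained against DataConnection references and the results can be inspected. A sketch of the typical next steps, assuming the training_data and results connections built in the DataConnectionTypes example above; the fit() keyword names follow IBM's published AutoAI examples and should be checked against your client version:

# Train the pipelines; keyword names are assumptions based on IBM's examples.
run_details = optimizer.fit(
    training_data_reference=[training_data],   # list of DataConnection
    training_results_reference=results,        # DataConnection for outputs
    background_mode=False                      # block until training finishes
)

summary_df = optimizer.summary()               # pandas DataFrame ranking pipelines
print(summary_df.head())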
runs(*, filter)[source]#

Get historical runs, filtered by WML Pipeline name (remote scenario) or by experiment name (local scenario).

Parameters:

filter (str) – WML Pipeline name to filter the historical runs or experiment name to filter the local historical runs

Returns:

object managing the list of runs

Return type:

AutoPipelinesRuns or LocalAutoPipelinesRuns

Example

from ibm_watson_machine_learning.experiment import AutoAI

experiment = AutoAI(...)
experiment.runs(filter='Test').list()
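A historical run can also be rehydrated into an optimizer object; a sketch using the runs API, with a placeholder run ID:

# Restore the optimizer from a previous run to inspect or reuse its pipelines
historical_optimizer = experiment.runs.get_optimizer(run_id='...')
historical_optimizer.summary()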