Usage of an AutoML loop and hyperparams with sklearn models.

This demonstrates how to build an AutoML loop that finds the best possible sklearn classifier. It also shows how to add hyperparams to sklearn steps using SKLearnWrapper. This example was derived and simplified from the following repository: https://github.com/Neuraxio/Kata-Clean-Machine-Learning-From-Dirty-Code. Here, 2D data is fitted, whereas in the original example, 3D (sequential / time series) data is preprocessed and then fitted with the same models.
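In short, SKLearnWrapper turns a sklearn estimator into a Neuraxle step, and a HyperparameterSpace maps hyperparam names to distributions that the AutoML loop samples from at each trial. Here is a minimal sketch of that core idea, using only classes that appear in the full example below:

from neuraxle.hyperparams.distributions import Choice, RandInt
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.steps.sklearn import SKLearnWrapper
from sklearn.tree import DecisionTreeClassifier

# Wrap the sklearn estimator as a Neuraxle step and attach a search space.
step = SKLearnWrapper(
    DecisionTreeClassifier(),
    HyperparameterSpace({
        'criterion': Choice(['gini', 'entropy']),  # categorical hyperparam
        'min_samples_leaf': RandInt(2, 5),         # integer hyperparam
    }))

# Each distribution can also be sampled on its own, which is essentially
# what the random search does once per trial:
sampled_value = RandInt(2, 5).rvs()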

Out:

/home/gui/Documents/GIT/www.neuraxle.org-builder/docs/Neuraxle/neuraxle/logging/warnings.py:42: UserWarning:  Visit https://www.neuraxle.org/stable/api.html for more information. If you want to disable these warnings, call `neuraxle.logging.warnings.silence_all_deprecation_warnings()`.
  warnings.warn(
/home/gui/Documents/GIT/www.neuraxle.org-builder/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py:141: FutureWarning: 'normalize' was deprecated in version 1.0 and will be removed in 1.2.
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), RidgeClassifier())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples.
  warnings.warn(
/home/gui/Documents/GIT/www.neuraxle.org-builder/venv/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:1113: UserWarning: Setting penalty='none' will ignore the C and l1_ratio parameters
  warnings.warn(
Test accuracy score: 0.9845

import shutil

from neuraxle.base import ExecutionContext as CX
from neuraxle.hyperparams.distributions import (Boolean, Choice, LogUniform,
                                                RandInt)
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.metaopt.auto_ml import (AutoML, RandomSearchSampler,
                                      ValidationSplitter)
from neuraxle.metaopt.callbacks import ScoringCallback
from neuraxle.metaopt.repositories.json import HyperparamsOnDiskRepository
from neuraxle.pipeline import Pipeline
from neuraxle.steps.flow import ChooseOneStepOf
from neuraxle.steps.numpy import NumpyRavel
from neuraxle.steps.output_handlers import OutputTransformerWrapper
from neuraxle.steps.sklearn import SKLearnWrapper
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier


def main(tmpdir: str):
    # Define classification models, and hyperparams.

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))
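
    # sklearn's linear models expect 1D labels, so the
    # OutputTransformerWrapper(NumpyRavel()) step below ravels the expected
    # outputs before they reach the model. set_name() gives each wrapped
    # pipeline a readable name that ChooseOneStepOf can refer to.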

    ridge_classifier = Pipeline([OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper(
        RidgeClassifier(),
        HyperparameterSpace({
            'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
            'fit_intercept': Boolean(),
            'normalize': Boolean()  # deprecated in sklearn >= 1.0; causes the FutureWarning shown in the output above
        }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper(
        LogisticRegression(),
        HyperparameterSpace({
            'C': LogUniform(0.01, 10.0),
            'fit_intercept': Boolean(),
            'penalty': Choice(['none', 'l2']),  # 'none' makes sklearn ignore C and l1_ratio, as warned in the output above
            'max_iter': RandInt(20, 200)
        }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper(
        RandomForestClassifier(),
        HyperparameterSpace({
            'n_estimators': RandInt(50, 600),
            'criterion': Choice(['gini', 'entropy']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4),
            'bootstrap': Boolean()
        }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifiers.
    # See also ChooseOneStepOf documentation: https://www.neuraxle.org/stable/api/steps/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf

    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier,
            extra_tree_classifier,
            ridge_classifier,
            logistic_regression,
            random_forest_classifier
        ])
    ])
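
    # ChooseOneStepOf exposes a 'choice' hyperparameter whose value is the
    # name of one of the steps above, and the AutoML loop samples it like any
    # other hyperparam. Presumably (per the ChooseOneStepOf docs linked
    # above), a model could also be forced manually:
    #
    #     pipeline.set_hyperparams({'ChooseOneStepOf__choice': 'RidgeClassifier'})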

    # Create the AutoML loop object.
    # See also AutoML documentation: https://www.neuraxle.org/stable/api/metaopt/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchSampler(),
        validation_splitter=ValidationSplitter(validation_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsOnDiskRepository(cache_folder=tmpdir),
        refit_best_trial=True,
        continue_loop_on_error=False
    )
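
    # In other words: 7 trials of random search, each training a sampled
    # pipeline on 80% of the training data and scoring accuracy on the
    # remaining 20%; trial results are persisted on disk under `tmpdir`, and
    # the best pipeline is refit on the full training set at the end.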

    # Load data, and launch the AutoML loop!

    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict, as per the `refit_best_trial=True` argument to AutoML.
    y_pred = auto_ml.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    shutil.rmtree(tmpdir)


def generate_classification_data():
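    # Generate a synthetic 3-class, 4-feature classification dataset,
    # then split it 80/20 into train and test sets.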
    data_inputs, expected_outputs = make_classification(
        n_samples=10000,
        n_repeated=0,
        n_classes=3,
        n_features=4,
        n_clusters_per_class=1,
        class_sep=1.5,
        flip_y=0,
        weights=[0.5, 0.5, 0.5]
    )

    X_train, X_test, y_train, y_test = train_test_split(
        data_inputs,
        expected_outputs,
        test_size=0.20
    )

    return X_train, y_train, X_test, y_test


if __name__ == '__main__':
    main(CX.get_new_cache_folder())

Total running time of the script: ( 0 minutes 1.606 seconds)

Gallery generated by Sphinx-Gallery