Usage of Checkpoints in Automatic Machine Learning (AutoML)

This example demonstrates how to use checkpoints in a pipeline to save computing time during a hyperparameter search. The same random search is run twice: first with a classic Pipeline, then with a ResumablePipeline that contains checkpoints, and the elapsed time of each run is printed so the two can be compared.

import time

import numpy as np
from sklearn.metrics import mean_squared_error

from neuraxle.checkpoints import DefaultCheckpoint
from neuraxle.hyperparams.distributions import RandInt
from neuraxle.hyperparams.space import HyperparameterSpace
from neuraxle.metaopt.auto_ml import RandomSearch
from neuraxle.pipeline import ResumablePipeline, DEFAULT_CACHE_FOLDER, Pipeline
from neuraxle.steps.flow import ExpandDim
from neuraxle.steps.loop import ForEachDataInput
from neuraxle.steps.misc import Sleep
from neuraxle.steps.numpy import MultiplyByN


def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
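    """
    Run the same random hyperparameter search twice on a toy task: once with a
    classic Pipeline, and once with a ResumablePipeline that contains checkpoints,
    printing the elapsed time of each run for comparison.
    """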
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

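    # Each MultiplyByN step gets its own integer multiplier, sampled from RandInt(1, 2).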
    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')

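    # The Sleep steps stand in for slow computations; the classic Pipeline
    # re-runs every step from scratch on every trial of the search.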
    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(HYPERPARAMETER_SPACE)

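    # Time a random hyperparameter search over the classic pipeline.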
    time_a = time.time()
    best_model = RandomSearch(
        pipeline,
        n_iter=n_iter,
        higher_score_is_better=True
    ).fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = best_model.transform(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')

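    # DefaultCheckpoint steps persist intermediate results to disk under cache_folder.
    # When a trial shares its upstream hyperparameters with a previous trial, the
    # ResumablePipeline resumes from the latest checkpoint instead of recomputing
    # the earlier (slow) steps.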
    pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN())
    ], cache_folder=tmpdir).set_hyperparams_space(HYPERPARAMETER_SPACE)

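    # Time the same random search over the resumable pipeline; checkpoint hits
    # should make this run faster than the classic pipeline above.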
    time_a = time.time()
    best_model = RandomSearch(
        pipeline,
        n_iter=n_iter,
        higher_score_is_better=True
    ).fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = best_model.transform(DATA_INPUTS)
    time_b = time.time()
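    # Clear the checkpoint cache from disk now that the search is done.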
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)


if __name__ == "__main__":
    main(DEFAULT_CACHE_FOLDER, sleep_time=0.001, n_iter=50)

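If you would rather not write checkpoints into the library's default cache folder, a minimal sketch (the use of Python's tempfile module here is an assumption, not part of the original example) is to point the search at a throwaway directory instead:

import tempfile

# Hypothetical alternative entry point: checkpoints go to a temporary folder
# that can simply be deleted afterwards.
main(tempfile.mkdtemp(), sleep_time=0.001, n_iter=10)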