// JSDoc: Source: automl.js

const nn = require('./nn')
const model_selection = require('./model_selection')
const prep = require('./preprocessing')
const op = require('optimization-js')
const base = require('./base')
const opt = require('optimization-js')
const aml = require('./automljs')
const utils = require('./utils')

/**
 * Picks the neural network estimator that matches the kind of output
 * described by `output`.
 * @param {Object} output Description of the output column; only its
 * 'type' entry is read.
 * @returns {Object} A new nn.MLPRegressor when the type is exactly
 * 'number', a new nn.MLPClassifier for anything else.
 */
function determine_estimator(output){
    const is_numeric = output['type'] === 'number'
    const Estimator = is_numeric ? nn.MLPRegressor : nn.MLPClassifier
    return new Estimator()
}

/**
 * Fills in the missing values of the last column of a table, in place.
 * Rows whose last value is not the empty string are used to train and
 * evaluate a neural network; rows whose last value is exactly '' get
 * the prediction of the best model found so far written into them.
 * The number of epochs and the learning rate are tuned with
 * op.OMGOptimizer for `n_iter` iterations.
 * @param {Array} dataset Array of rows. The last element of every row
 * is the target value; the remaining elements are the input features.
 * This array is modified in place.
 * @param {Function} [callback] Called after every tuning iteration
 * with the current state object. May be async. When it returns (or
 * resolves to) the string 'cancel', the tuning loop stops.
 * @param {Number} n_iter Maximum number of tuning iterations.
 * @param {Number} [max_epochs=100] Upper bound of the searched range
 * for the number of training epochs.
 * @returns {Object} State object with key 'dataset' and, once at least
 * one iteration has run, keys 'best_score' and 'iteration'.
 */
async function fill_last_column(dataset, callback, n_iter, max_epochs=100){
    // split every row into input features and the target (last) value
    const X = dataset.map((row)=>row.slice(0, -1))
    const y = dataset.map((row)=>row.slice(-1)[0])

    // a target is considered missing only when it is the empty string
    const is_missing = (v, ix)=>y[ix]===''
    const is_known = (v, ix)=>y[ix]!==''

    // row indices in `dataset` whose target needs filling in
    const missing = y.map((v, ix)=>ix).filter(is_missing)

    // inputs to predict for, and the labelled samples to learn from
    let X_miss = X.filter(is_missing)
    const Xs = X.filter(is_known)
    const ys = y.filter(is_known)

    let [X_train, X_test, y_train, y_test] = model_selection.train_test_split([Xs, ys])

    // extract features; the transformer is fitted on the training part only
    const feats = new prep.TableFeaturesTransformer()
    await feats.fit(X_train, y_train)
    X_train = await feats.transform(X_train)
    X_test = await feats.transform(X_test)
    X_miss = await feats.transform(X_miss)

    // scale features; the scaler is fitted on the training part only
    const scaler = new prep.StandardScaler()
    await scaler.fit(X_train, y_train)
    X_train = await scaler.transform(X_train)
    X_test = await scaler.transform(X_test)
    X_miss = await scaler.transform(X_miss)

    // set up tuning algo: number of epochs and log10 of the learning rate
    const tuner = new op.OMGOptimizer([
        new op.Integer(1, max_epochs),
        new op.Real(-4, -1)
    ])

    let best_score = null
    const state = {'dataset': dataset}

    // run the optimization cycle
    for(let iteration=0; iteration<n_iter; iteration++){
        const point = await tuner.ask()
        const [epochs, lr] = point

        const model = determine_estimator(feats.state['output'])

        model.params['epochs'] = epochs
        model.params['lr'] = Math.pow(10, lr)

        await model.fit(X_train, y_train)
        const score = await model.score(X_test, y_test)

        // the optimizer is given the negated score
        // (presumably because it minimizes its objective - TODO confirm)
        tuner.tell([point], [-score])

        if((best_score === null) || (best_score < score)){
            best_score = score

            // predict may be asynchronous (fit and score are awaited
            // as well), so the result has to be awaited before use
            const y_pred = await model.predict(X_miss)

            const last_feature = dataset[0].length-1

            // write predictions back into the rows they belong to
            for(const [iy, value] of y_pred.entries()){
                dataset[missing[iy]][last_feature] = value
            }
        }

        state['best_score'] = best_score
        state['iteration'] = iteration

        // the callback is optional and may be asynchronous
        if(typeof callback === 'function'){
            const result = await callback(state)
            if(result === 'cancel'){
                break
            }
        }
    }

    return state
}

// public API: expose fill_last_column to users of this module
module.exports.fill_last_column = fill_last_column


/**
 * A fully automated class for fitting to the data.
 * Extracts features from the table, then tunes a number of model
 * families (small / large decision tree, linear SGD model, gradient
 * boosting) in round robin fashion using OMGSearchCV, and keeps track
 * of the best scoring search object.
 * The communication of the class to the outside world is
 * done via the events. A new event handler is registered using
 * this.on_improve[event_name] = event_handler or
 * this.on_step[event_name] = event_handler. Handlers receive an object
 * with keys 'iteration', 'n_iter' and 'model_name'. A handler may be
 * async; when it returns (or resolves to) a truthy value, the search
 * is terminated.
 */
class AutoMLModel extends base.BaseEstimator{
    /**
     * @param {Object} params Configuration of AutoML algorithm.
     * @param {Number} [params.max_iter=1000] Total number of search
     * steps, shared between all model families.
     */
    constructor(params){
        super(params, {
            'max_iter': 1000,
        })

        // named events for the situation where a score for some 
        // type of model has been improved
        this.on_improve = {}

        // some ML fitting step has been done. This does not 
        // necessarily imply any kind of improvement.
        this.on_step = {}
    }

    /**
     * Run the automated algorithm search.
     * Current algorithm: 
     * 1. Extract features from the inputs
     * 2. Optimize progressively a number of models, some specifically 
     * smaller for interpretation, some larger "black box" models for
     * high accuracy
     * After fitting, this.state contains 'features' (the fitted feature
     * transformer), one entry per model family name holding its search
     * object, and 'best_score_' / 'best_estimator_' once some search
     * step reported an improvement.
     * @param {Array} X Array of input samples
     * @param {Array} y Array of target outputs
     */
    async fit(X, y){
        /*
        ToDo: Desired algorithm:
        1. Is the data historical?
        Y: split over time stamp
        N: split at random

        2. If the data is historical, does historical context
        help for estimation of the outputs?
        Y: Use a context for training. Context is determined by sequence ID.
        N: Use a single sample for training
        
        3. Optimize a number of models.
        */

        // extract features
        // train small decision tree
        // train large decision tree (bb)
        // train linear model with L1 regularization
        // train complete linear model (bb)
        // train gradient boosting model (bb)

        const features = new prep.TableFeaturesTransformer({})

        // extract features
        await features.fit(X, y)
        const X_feats = await features.transform(X, y)

        // forget the outcome of any previous fit: otherwise a stale
        // best score could prevent the new search from ever registering
        // its best estimator, and predict would combine the new feature
        // transformer with an estimator trained on the old features
        delete this.state['best_score_']
        delete this.state['best_estimator_']

        // remember the feature transformer
        this.state['features'] = features

        // determine type of the learning problem
        const regression = features.state['output']['type'] === 'number'

        // search parameters
        // NOTE(review): the two tree grids pass a single bound to
        // opt.Integer while every other dimension passes (low, high) -
        // confirm this is what optimization-js expects.
        const grids = {
            'tree_small': {
                'max_depth': new opt.Integer(4)
            },
            'tree': {
                'max_depth': new opt.Integer(8)
            },
            'lsgd': {
                'alpha': new opt.Real(0.0001, 10.0),
                'max_iter': new opt.Integer(16, 64),
                'l1_ratio': new opt.Real(0.0, 1.0)
            },
            'gbdt': {
                'learning_rate': new opt.Real(0.001, 1.0),
                'n_estimators': new opt.Integer(16, 256),
                'max_depth': new opt.Integer(1, 4)
            }
        }

        const estimators = {}

        if(regression){
            estimators['tree_small'] = new aml.tree.DecisionTreeRegressor()
            estimators['tree'] = new aml.tree.DecisionTreeRegressor()
            estimators['lsgd'] = new aml.linear_model.SGDRegressor()
            estimators['gbdt'] = new aml.ensemble.GradientBoostingRegressor()
        }else{
            estimators['tree_small'] = new aml.tree.DecisionTreeClassifier()
            estimators['tree'] = new aml.tree.DecisionTreeClassifier()
            estimators['lsgd'] = new aml.linear_model.SGDClassifier()
            estimators['gbdt'] = new aml.ensemble.GradientBoostingClassifier()
        }

        // initialize the corresponding *SearchCV models
        const optimizers = {}

        for(const model_name in grids){
            const scv = new aml.model_selection.OMGSearchCV({
                'estimator': estimators[model_name],
                'param_grid': grids[model_name],
                'refit_on_improvement': true,  // search will be done manually
                'cv': 3
            })
            
            await scv.init(X_feats, y)

            optimizers[model_name] = scv
        }

        // Calls every registered handler with the payload and reports
        // whether at least one of them asked to terminate the search.
        // Handlers are awaited so that async handlers are judged by the
        // value they resolve to, not by the (always truthy) promise.
        const notify = async (handlers, payload)=>{
            let stop_requested = false
            for(const name in handlers){
                // null (or otherwise non-callable) entries are
                // treated as unregistered handlers
                if(typeof handlers[name] !== 'function'){
                    continue
                }
                if(await handlers[name](payload)){
                    stop_requested = true
                }
            }
            return stop_requested
        }

        // run the parameter search algorithm
        const n_iter = this.params['max_iter']
        let i = 0
        let stop = false

        // optimize the algorithms in round robin fashion
        while(i < n_iter && !stop){
            for(const model_name in optimizers){
                if(stop || !(i < n_iter)){
                    break
                }

                const iter_result = {
                    'iteration': i,
                    'n_iter': n_iter,
                    'model_name': model_name
                }

                const scv = optimizers[model_name]
                const result = await scv.step()

                // expose the search object of this model family
                this.state[model_name] = scv

                if(result['improved']){
                    const score = scv.state['best_score_']
                    const current_best = this.state['best_score_']
                    
                    // check for improvement overall; a best score of 0
                    // is a valid score and must not count as "no score"
                    const no_best_yet = current_best === undefined || current_best === null
                    if(no_best_yet || current_best < score){
                        this.state['best_score_'] = score
                        this.state['best_estimator_'] = scv
                    }

                    // terminate if some callback says so
                    if(await notify(this.on_improve, iter_result)){
                        stop = true
                    }
                }

                // step handlers run even if termination was already
                // requested by an improvement handler
                if(await notify(this.on_step, iter_result)){
                    stop = true
                }

                i++
            }
        }

    }

    /**
     * Get estimations with the best performing model.
     * The inputs are passed through the feature transformer stored
     * during fit before being given to the best search object.
     * @param {Array} X Array of input observations
     * @returns {Promise} Resolves to the predictions of the best model.
     */
    async predict(X){
        utils.check_is_fitted(this, ['best_estimator_'])
        const features = this.state['features']
        const estimator = this.state['best_estimator_']
        const X_feats = await features.transform(X)
        return await estimator.predict(X_feats)
    }

    /**
     * Calculates the score of the best performing model
     * on the dataset provided.
     * @param {Array} X Array of input observations
     * @param {Array} y Array of output values.
     * @returns {Promise} Resolves to the score reported by the best model.
     */
    async score(X, y){
        utils.check_is_fitted(this, ['best_estimator_'])
        const features = this.state['features']
        const estimator = this.state['best_estimator_']
        const X_feats = await features.transform(X)
        return await estimator.score(X_feats, y)
    }
}

// public API: expose the AutoMLModel class to users of this module
module.exports.AutoMLModel = AutoMLModel