Source: model_selection.js

const utils = require('./utils')
const base = require('./base')
const prep = require('./preprocessing')
const io = require('./io')
const opt = require('optimization-js')

/**
 * Splits the data into training and testing partitions for model 
 * fitting. The split is done at random.
 * @param {Array} arrays List of arrays to be split into training
 * and testing parts.
 * @param {Number} test_split Value in the range [0, 1] specifying
 * the fraction of the data assigned to the test partition.
 */
function train_test_split(arrays, test_split=0.25){
    var N = arrays[0].length
    var partition = utils.random_partitioning(N, [1.0 - test_split, test_split])

    var result = []

    // for arrays = [X, y], the function should return
    // [X_train, X_test, y_train, y_test]
    for(var arr of arrays){
        for(var I of partition){
            var part = I.map((ix)=>arr[ix])
            result.push(part)
        }
    }

    return result
}

module.exports.train_test_split = train_test_split
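
// Example usage (illustrative sketch; X and y stand for parallel arrays of
// input and output samples defined elsewhere):
//
//   var [X_train, X_test, y_train, y_test] = train_test_split([X, y], 0.2)
//   // the same random 80% / 20% row split is applied to every array passed in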

/**
 * Evaluates estimator performance via cross-validation.
 * Interface similar to the `cross_val_score` function of sklearn.
 * @param {BaseEstimator} estimator Model to be scored.
 * @param {Array} X Input values to be used for scoring.
 * @param {Array} y Output values to be used for scoring.
 * @param {Object} groups Ignored.
 * @param {Object} scoring Ignored for the time being.
 * @param {Number} cv Number of folds used for cross-validation.
 * @returns {Array} Scores on various partitions.
 */
async function cross_val_score(estimator, X, y=null, groups=null, scoring=null, cv=3){
    var length = []

    for(var i=0; i<cv; i++){
        length.push(1.0) // all partitions are made equal at this point
    }

    var partitions = utils.random_partitioning(X.length, length)
    var scores = []

    for(var fold=0; fold<cv; fold++){
        var p = partitions[fold]
        var test_set = {}
        
        // convert the fold indices into a set for fast membership checks
        for(var i=0; i<p.length; i++){
            test_set[p[i]] = true
        }

        // split the samples into training and testing sets
        var [X_train, y_train, X_test, y_test] = [[], [], [], []]

        for(var i=0; i<X.length; i++){
            if(i in test_set){
                X_test.push(X[i])
                y_test.push(y[i])
            }else{
                X_train.push(X[i])
                y_train.push(y[i])
            }
        }

        // clone the estimator and evaluate it on this fold
        var fold_estim = await io.clone(estimator, false)
        await fold_estim.fit(X_train, y_train)
        var score = await fold_estim.score(X_test, y_test)
        scores.push(score)
    }

    return scores
}

module.exports.cross_val_score = cross_val_score
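
// Example usage (illustrative sketch; `model` stands for any estimator
// implementing the BaseEstimator fit/score interface):
//
//   var scores = await cross_val_score(model, X, y, null, null, 5)
//   var mean_score = scores.reduce((a, b) => a + b) / scores.length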


/**
 * Optimizes hyperparameters for supplied estimator.
 * A very simple version of a genetic algorithm is used
 * for this optimization. See more in optimization-js,
 * in particular in the OMGOptimizer docs.
 */
class OMGSearchCV extends base.BaseEstimator{
    /**
     * Creates an instance of OMGSearchCV.
     * @param {Object} params Parameters of the hyperparameter
     * optimization algorithm.
     * @param {BaseEstimator} [params.estimator] Estimator instance,
     * whose hyperparameters are optimized.
     * @param {Object} [params.param_grid] Specification of the search
     * space, over which to optimize the estimator parameters. The 
     * specification should be given in the following format:
     * name_of_parameter: optimization-js dimension object instance
     * Here dimension object should be one of Integer, Real or 
     * Categorical class instances from optimization-js package.
     * @param {Number} [params.max_iter] Maximum number of iterations
     * the algorithm is allowed to execute. The algorithm can be
     * terminated prematurely via callbacks.
     * @param {Number} [params.cv] Number of cross-validation folds
     * used to evaluate a particular hyperparameter configuration.
     * @param {Boolean} [params.refit_on_improvement] Whether to fit
     * the final model on the whole dataset as soon as improvement in 
     * hyperparameter configuration is obtained, w.r.t. CV score.
     * @param {Boolean} [params.refit] Whether to fit the final model
     * after the hyperparameter search has concluded.
     */
    constructor(params){
        super(params, {
            'estimator': null,
            'param_grid': null,
            'max_iter': 100,
            'cv': 3,
            'refit_on_improvement': false,
            'refit': true
        })
    }

    /**
     * Gets the vector of dimensions that represents the search
     * space flattened to a vector, together with the sorted
     * parameter names.
     */
    get_dim_vector(){
        var grid = this.params['param_grid']
        var grid_keys = []
        for(var key in grid){
            grid_keys.push(key)
        }
        grid_keys.sort()

        var dimensions = []

        for(var key of grid_keys){
            dimensions.push(grid[key])
        }

        return [dimensions, grid_keys]
    }

    /**
     * Initialize the search algorithm.
     */
    async init(X, y){
        var [dimensions, grid_keys] = this.get_dim_vector()

        // get dimensions from grid
        var optimizer = new opt.OMGOptimizer(dimensions)
        
        // temporary values
        this.grid_keys = grid_keys
        this.optimizer = optimizer
        this.X = X
        this.y = y
        this.improved = false

        this.state['best_params_'] = null
        this.state['best_score_'] = null
        this.state['best_estimator_'] = null
    }

    /**
     * Updates the ranges of hyperparameters to search over.
     * This is useful when low-complexity models should be tried
     * first: supply narrower ranges for the complexity-controlling
     * parameters in the initial iterations, and widen them later.
     * Note: only the ranges of existing dimensions should be updated.
     * Adding new dimensions will likely lead to unexpected behavior.
     * @param {Object} values Dictionary with keys being
     * names of the parameters, and values being dimensions
     * of the optimization-js type. 
     */
    update_ranges(values){
        for(var key in values){
            this.params['param_grid'][key] = values[key]
        }

        // create dimensions with updated ranges
        var [dimensions, grid_keys] = this.get_dim_vector()

        // create a new space
        var space = new opt.Space(dimensions)
        
        // update space in optimizer
        this.optimizer.space = space
    }
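
    // Example (illustrative sketch): narrow the search range of a
    // hypothetical `alpha` hyperparameter once low-complexity models have
    // been tried. Assumes the Real dimension constructor from
    // optimization-js referenced in the param_grid docs above:
    //
    //   search.update_ranges({'alpha': new opt.Real(0.001, 0.1)})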

    /**
     * Runs a single step of the search algorithm.
     * @returns {Object} Object that indicates the result of the step.
     * Properties of the object are as follows:
     * - {Boolean} improved: whether there was an improvement in the
     * CV score.
     */
    async step(){
        var optimizer = this.optimizer
        var grid_keys = this.grid_keys

        var p = optimizer.ask()
        var estim_params = {}

        // mix with parameters
        for(var i=0; i<grid_keys.length; i++){
            estim_params[grid_keys[i]] = p[i]
        }

        var estimator = this.params['estimator']
        var cv = this.params['cv']

        // set the current parameters
        await estimator.set_params(estim_params)
        
        // calculate CV score
        var cv_scores = await cross_val_score(estimator, this.X, this.y, null, null, cv)
        var avg_score = cv_scores.reduce((p, c)=>p+c) / cv
        var best_score_ = this.state['best_score_']

        if(best_score_ === null || best_score_ < avg_score){
            this.state['best_score_'] = avg_score
            this.state['best_params_'] = estim_params

            if(this.params['refit_on_improvement']){
                await this.fit_final(this.X, this.y)
            }

            return {"improved": true}
        }else{
            return {"improved": false}
        }
    }
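
    // Example (illustrative sketch): driving the search manually with a
    // simple early-stopping rule instead of calling fit(). Only methods
    // defined in this class (init, step, finish_search) are used:
    //
    //   await search.init(X, y)
    //   var stale = 0
    //   for(var i=0; i<search.params['max_iter'] && stale < 10; i++){
    //       var res = await search.step()
    //       stale = res.improved ? 0 : stale + 1
    //   }
    //   await search.finish_search()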

    /**
     * Fits the model with the best found parameters to the data.
     * @param {Array} X input samples
     * @param {Array} y output samples
     */
    async fit_final(X, y){
        var estimator = this.params['estimator']
        var best_params_ = this.state['best_params_']

        var estim = await io.clone(estimator)
        await estim.set_params(best_params_)
        await estim.fit(X, y)
        this.state['best_estimator_'] = estim
    }

    /**
     * Clears the temporarily stored variables
     * used by the hyperparameter search.
     */
    async finish_search(){
        // fit the final model
        if(this.params['refit']){
            await this.fit_final(this.X, this.y)
        }

        // release the references to temporary objects
        this.X = null
        this.y = null
        this.optimizer = null
        this.grid_keys = null
    }

    async fit(X, y){
        await this.init(X, y)

        var n_iter = this.params['max_iter']
        for(var i=0; i<n_iter; i++){
            await this.step()
        }
        
        await this.finish_search()
    }

    async predict(X){
        utils.check_is_fitted(this, ['best_estimator_'])
        var estimator = this.state['best_estimator_']

        var y_pred = await estimator.predict(X)
        return y_pred
    }

    async score(X, y){
        utils.check_is_fitted(this, ['best_estimator_'])
        var estimator = this.state['best_estimator_']

        var score = await estimator.score(X, y)
        return score
    }
}

module.exports.OMGSearchCV = OMGSearchCV
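
// Example usage (illustrative sketch; `SomeEstimator`, `alpha` and
// `n_components` are placeholder names, and the Real / Integer dimension
// constructors are assumed to take (low, high) bounds as in the
// optimization-js package referenced in the param_grid docs above):
//
//   var search = new OMGSearchCV({
//       'estimator': new SomeEstimator(),
//       'param_grid': {
//           'alpha': new opt.Real(0.001, 1.0),
//           'n_components': new opt.Integer(1, 20)
//       },
//       'max_iter': 50,
//       'cv': 3
//   })
//   await search.fit(X_train, y_train)
//   var y_pred = await search.predict(X_test)
//   console.log(search.state['best_params_'], search.state['best_score_'])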