Source: ensemble.js

const base = require('./base')
const RegressorMixin = base.RegressorMixin
const BaseEstimator = base.BaseEstimator
const utils = require('./utils')
const tf = require('@tensorflow/tfjs')
const prep = require('./preprocessing')
const tree = require('./tree')

/**
 * Gradient boosting model, for regression problems.
 * A decision tree is used as the weak learner. Inspired by:
 * https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
 */
class GradientBoostingRegressor extends RegressorMixin(BaseEstimator){
    /**
     * Instantiates GradientBoostingRegressor.
     * @param {Object} params Dictionary with parameters for
     * the model.
     * @param {String} [params.loss] Type of loss to use. Currently
     * only 'squared_loss' is available.
     * @param {Number} [params.learning_rate] How much a single
     * weak learner affects the final ensemble. Smaller values might
     * require a larger number of estimators for satisfactory performance,
     * but usually lead to better generalization.
     * @param {Number} [params.n_estimators] How many weak learners
     * to use in the final ensemble.
     * @param {Number} [params.min_samples_split] Hyperparameter of
     * the underlying weak learner. Minimum number of samples in the
     * decision tree required to form a split. A value in the range
     * [0.0, 1.0] indicates a fraction of the overall dataset;
     * a value greater than or equal to 2 specifies an absolute
     * number of samples.
     * @param {Number} [params.max_depth] Maximum depth of the decision
     * tree weak learner.
     * @param {Number} [params.min_impurity_decrease] Minimum decrease
     * in impurity required to form a new split. Used in the training
     * heuristic of the tree weak learner. Recommended to leave
     * as 0.0.
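     *
     * @example
     * // A minimal usage sketch on toy data (illustrative only): the require
     * // path is an assumption, and the calls must run in an async context.
     * const { GradientBoostingRegressor } = require('./ensemble')
     * const gbr = new GradientBoostingRegressor({'n_estimators': 50, 'max_depth': 2})
     * await gbr.fit([[0], [1], [2], [3]], [0.0, 1.0, 2.0, 3.0])
     * const y_pred = await gbr.predict([[1.5]]) // array with a single predicted value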
     */
    constructor(params){
        super(params, {
            'loss': 'squared_loss',
            'learning_rate': 0.1,
            'n_estimators': 100,
            'min_samples_split': 2,
            'max_depth': 3,
            'min_impurity_decrease': 0.0
        })
    }

    async fit(X, y){
        var [X, y] = utils.check_2dXy(X, y, false)

        var loss = this.params['loss']
        if(loss === 'ls'){
            loss = 'squared_loss'
        }
        var loss_id = {
            'squared_loss': 0,
            'hinge': 1
        }[loss]
        var learning_rate = this.params['learning_rate']
        var n_estimators = this.params['n_estimators']
        var min_samples_split = this.params['min_samples_split']
        var max_depth = this.params['max_depth']
        var min_impurity_decrease = this.params['min_impurity_decrease']

        // calculate the initial prediction: the mean of the targets
        var init_ = 0.0
        for(var i=0; i<y.length; i++){
            init_ += y[i]
        }
        init_ /= y.length

        // predictions of the model, and residuals
        var y_pred = []
        var rm = []
        for(var i=0; i<y.length; i++){
            y_pred.push(init_)
            rm.push(0.0)
        }

        var estimators = []

        var aux0 = 0.0

        // main fitting loop
        for(var i=0; i<n_estimators; i++){
            // calculate the residuals - negative gradients of loss
            // w.r.t. the estimator score.
            for(var j=0; j<y_pred.length; j++){
                if(loss_id == 0){ // sq. loss
                    rm[j] = -2*(y[j] - y_pred[j])
                }else if(loss_id == 1){ // hinge
                    aux0 = 1.0 - y[j] * y_pred[j]
                    rm[j] = aux0 > 0.0? -y[j] : 0.0
                }
                rm[j] = -rm[j] // negate the gradient to obtain the residual (negative gradient)
            }

            // fit the regression tree
            var estimator = new tree.DecisionTreeRegressor({
                'max_depth': max_depth,
                'min_impurity_decrease': min_impurity_decrease,
                'min_samples_split': min_samples_split
            })

            // fit the tree to the residuals (negative gradients)
            await estimator.fit(X, rm)

            // tree's estimates of the residuals on the training data
            var h = await estimator.predict(X)
            var gamma_base = 0.0
            var step = 1.0

            var total_iter = 64

            // find optimal gamma; assumes convex loss w.r.t. gamma!
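            // Bracketing search over gamma: the derivative of the total loss
            // is evaluated at gamma_base + step; a non-negative derivative
            // means the minimum was passed, so the step is halved, while a
            // negative derivative means the minimum lies further right, so
            // gamma_base is advanced by the step.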
            while(total_iter > 0){   
                // calculate the total gradient w.r.t. gamma
                var grad = 0.0
                var gamma = gamma_base + step
                for(var j=0; j<h.length; j++){
                    // gradient of the loss L(y, f(x) + gamma*h(x)) w.r.t. gamma
                    if(loss_id === 0){ // sq. loss
                        grad += -2*h[j]*(y[j] - y_pred[j] - gamma*h[j])
                    }else if(loss_id === 1){ // hinge loss
                        // d max(0, 1 - y*(f + g*h)) / dg = I(1 - y*(f + g*h) > 0) * (-y*h)
                        aux0 = 1.0 - y[j]*(y_pred[j] + gamma*h[j]) 
                        grad += aux0 > 0.0 ? -y[j]*h[j] : 0.0
                    }
                }
                
                // find the smallest optimal gamma; when several values are
                // equivalent, prefer the smaller one, since smaller steps
                // are expected to give a more stable descent
                if(grad >= 0.0){
                    step /= 2.0
                }else{
                    gamma_base += step
                }
                total_iter -= 1
            }

            var gamma = gamma_base + 0.5*step

            // update the predictions
            for(var j=0; j<y_pred.length; j++){
                y_pred[j] = y_pred[j] + learning_rate * gamma * h[j]
            }

            // add estimator
            estimators.push([estimator, gamma])

        }

        this.state['estimators_'] = estimators
        this.state['init_'] = init_
        
        return this // sklearn convention
    }

    async predict(X){
        utils.check_is_fitted(this, ['estimators_', 'init_'])
        var X = utils.check_2dX(X, false)

        var estimators = this.state['estimators_']
        var init_ = this.state['init_']
        var learning_rate = this.params['learning_rate']

        // initialize with init_
        var y_pred = []
        for(var x of X){
            y_pred.push(init_)
        }

        for(var eg of estimators){
            var [estim, gamma] = eg
            var h = await estim.predict(X)

            for(var j=0; j<y_pred.length; j++){
                y_pred[j] += learning_rate * gamma * h[j]
            }
        }

        return y_pred
    }
}

module.exports.GradientBoostingRegressor = GradientBoostingRegressor


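/**
 * Gradient boosting model, for classification problems.
 * Labels are binarized into one-vs-rest targets in {-1, +1}, and one
 * GradientBoostingRegressor (with hinge loss by default) is fit per class;
 * at prediction time, the class with the highest score is returned.
 */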
class GradientBoostingClassifier extends base.ClassifierMixin(BaseEstimator){
    /**
     * Instantiates GradientBoostingClassifier.
     * @param {Object} params Dictionary with parameters for
     * the model.
     * @param {String} [params.loss] Type of loss to use. Currently
     * only 'hinge' is available.
     * @param {Number} [params.learning_rate] How much a single
     * weak learner affects the final ensemble. Smaller values might
     * require a larger number of estimators for satisfactory performance,
     * but usually lead to better generalization.
     * @param {Number} [params.n_estimators] How many weak learners
     * to use in the final ensemble.
     * @param {Number} [params.min_samples_split] Hyperparameter of
     * the underlying weak learner. Minimum number of samples in the
     * decision tree required to form a split. A value in the range
     * [0.0, 1.0] indicates a fraction of the overall dataset;
     * a value greater than or equal to 2 specifies an absolute
     * number of samples.
     * @param {Number} [params.max_depth] Maximum depth of the decision
     * tree weak learner.
     * @param {Number} [params.min_impurity_decrease] Minimum decrease
     * in impurity required to form a new split. Used in the training
     * heuristic of the tree weak learner. Recommended to leave
     * as 0.0.
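     *
     * @example
     * // A minimal usage sketch on toy data (illustrative only): the require
     * // path is an assumption, and the calls must run in an async context.
     * const { GradientBoostingClassifier } = require('./ensemble')
     * const gbc = new GradientBoostingClassifier({'n_estimators': 25})
     * await gbc.fit([[0], [1], [2], [3], [4], [5]], ['a', 'a', 'b', 'b', 'c', 'c'])
     * const labels = await gbc.predict([[0.5], [4.5]]) // e.g. ['a', 'c']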
     */
    constructor(params){
        super(params, {
            'loss': 'hinge',
            'learning_rate': 0.1,
            'n_estimators': 100,
            'min_samples_split': 2,
            'max_depth': 3,
            'min_impurity_decrease': 0.0
        })
    }

    async fit(X, y){
        var [X, y] = utils.check_2dXy(X, y, false)
        var binarizer = new prep.LabelBinarizer()
        binarizer.fit(y)
        var y = binarizer.transform(y).mul(2).add(-1)

        var n_models = y.shape[1]

        var models = []
        for(var yi=0; yi<n_models; yi++){
            var y_model = y.slice([0, yi], [y.shape[0], 1])
            y_model = Array.from(await y_model.data())
            var model = new GradientBoostingRegressor(this.params)
            await model.fit(X, y_model)
            models.push(model)
        }

        this.state['models'] = models
        this.state['binarizer'] = binarizer
        return this
    }

    async predict(X){
        utils.check_is_fitted(this, ['models', 'binarizer'])
        var X = utils.check_2dX(X, false)

        var y_models = []
        for(var model of this.state['models']){
            y_models.push(await model.predict(X))
        }

        var y_pred = []
        var inverse = this.state['binarizer'].state['inverse']

        for(var i=0; i<X.length; i++){
            var max_score = Number.NEGATIVE_INFINITY;
            var max_j = 0
            for(var j=0; j<y_models.length; j++){
                if(y_models[j][i] > max_score){
                    max_score = y_models[j][i]
                    max_j = j
                }
            }
            y_pred.push(inverse[max_j])
        }

        return y_pred
    }

}

module.exports.GradientBoostingClassifier = GradientBoostingClassifier