// JSDoc: Source: preprocessing.js

const tf = require('@tensorflow/tfjs');
const utils = require('./utils')

/**
 * Converts a set of inputs, represented as a matrix X with columns
 * corresponding to the different features, into a numeric matrix X'.
 * Numeric columns are kept as numbers, with missing values ('')
 * replaced by the column mean learned in `fit`; every other column is
 * one hot encoded. The type of each column is detected automatically.
 */
class TableFeaturesTransformer{
    /**
     * @param {Object} params Settings; stored on `this.params` as
     * returned by `utils.set_defaults` (defined in ./utils).
     */
    constructor(params){
        this.params = utils.set_defaults(params)
        // filled in by `fit`; stays null until then
        this.state = null
    }

    /**
     * Determine if the column is numeric or categorical, and collect
     * the parameters needed to transform it.
     * @param {Array} C 1d array of values, represents single column.
     * The empty string '' denotes a missing value.
     * @returns {Object} Description of the column. For a numeric
     * column: `{type: 'number', mean}`, where `mean` is the average of
     * the non-missing values (0 when every value is missing).
     * Otherwise: `{type: 'category', categories, n_classes}`, where
     * `categories` maps each distinct value (as a string key) to its
     * index in sorted order. In a categorical column '' is treated as
     * a category of its own.
     */
    column_type(C){
        const feature = {'type': 'skip'}
        const values = Array.from(C)

        // Strict comparison on purpose: with `==` a numeric 0 (or false)
        // would also count as missing and be left out of the mean.
        const present = values.filter((x) => x !== '')

        // numeric only if every non-missing value converts to a number
        const is_num = present.every((x) => !Number.isNaN(Number(x)))

        if(is_num){
            let sum = 0.0
            for(const x of present){
                sum += Number(x)
            }

            // used for imputation; fall back to 0 for a column without
            // any value so that NaN is never imputed
            feature['type'] = 'number'
            feature['mean'] = present.length > 0 ? sum / present.length : 0.0
            return feature
        }

        // Collect distinct values. A Set keyed by the string form is
        // used because `x in {}` is also true for inherited names such
        // as 'constructor', which would silently drop such categories.
        const seen = new Set()
        const cats_l = []
        for(const x of values){
            const key = String(x)
            if(seen.has(key)){
                continue
            }
            seen.add(key)
            cats_l.push(x)
        }

        // ensure that categories are in sorted order
        cats_l.sort()

        const cats = {}
        for(let j=0; j<cats_l.length; j++){
            // defineProperty instead of assignment, so that every
            // category (even one named '__proto__') becomes an own key
            Object.defineProperty(cats, String(cats_l[j]), {
                value: j,
                writable: true,
                enumerable: true,
                configurable: true
            })
        }

        feature['categories'] = cats
        feature['n_classes'] = cats_l.length
        feature['type'] = 'category'
        return feature
    }

    /**
     * Select a single column of matrix X.
     * @param {Array} X nested array, represents matrix of raw inputs.
     * @param {Integer} i Index of the column to select
     * @returns {Array} Values of the i-th column, one per row of X.
     */
    get_column(X, i){
        return Array.from(X, (row) => row[i])
    }

    /**
     * Determines the type of columns, and calculates all the necessary
     * parameters for imputation of missing values and conversion of 
     * categorical values, if any. The result is stored in `this.state`
     * under 'features' (per column), 'output' (for y) and, when
     * `feature_names` is given, 'feature_params' (one entry per
     * output column of `transform`).
     * @param {Array} X Matrix of raw inputs, where columns correspond
     * to different features. May be empty.
     * @param {Array} y A vector of outputs. A type of problem could be
     * determined from the type of the output.
     * @param {Array} feature_names An array that contains names of the
     * features as strings.
     * @returns {TableFeaturesTransformer} this instance.
     */
    fit(X, y, feature_names=null){
        const features = {}

        this.state = {
            'features': features
        }

        // an empty X has no first row to take the width from
        const n_cols = X.length > 0 ? X[0].length : 0

        for(let i=0; i<n_cols; i++){
            features[i] = this.column_type(this.get_column(X, i))
        }

        this.state['output'] = this.column_type(y)

        if(feature_names !== null){
            const feature_params = []
            for(let j=0; j<n_cols; j++){
                const f = features[j]
                if(f['type'] === 'number'){
                    feature_params.push({
                        name: feature_names[j],
                        type: 'number'
                    })
                    continue
                }

                // invert the mapping: {0: 'cat_a', 1: 'cat_b', ...}
                const cats = f['categories']
                const ivcats = {}
                for(const c of Object.keys(cats)){
                    ivcats[cats[c]] = c
                }

                // one boolean output column per category
                for(let k=0; k<f['n_classes']; k++){
                    feature_params.push({
                        name: feature_names[j] + "==" + ivcats[k],
                        type: 'boolean'
                    })
                }
            }

            this.state['feature_params'] = feature_params
        }

        return this
    }

    /**
     * Convert some inputs to purely numerical features, suitable for 
     * further use in ML pipelines. 
     * @param {Array} X Matrix of raw inputs. These should be of the same
     * format as the inputs used to `fit` method.
     * @param {Array} y Vector of raw outputs. Not used by this method.
     * @returns {Array} Nested array with one numeric row per row of X.
     * @throws {Error} If called before `fit`.
     */
    transform(X, y=null){
        if(this.state == null){
            throw new Error('TableFeaturesTransformer: fit must be called before transform')
        }

        const features = this.state['features']
        const has_own = Object.prototype.hasOwnProperty

        // for every sample ...
        return Array.from(X, (row) => {
            const x_feat = []

            // for every column in sample ...
            for(let j=0; j<row.length; j++){
                const feature = features[j]
                const xv = row[j]

                if(feature['type'] === 'number'){
                    const x_num = Number(xv)

                    // do imputation if number is not convertable or missing
                    const missing = xv === '' || Number.isNaN(x_num)
                    x_feat.push(missing ? feature['mean'] : x_num)
                    continue
                }

                // one hot encoding of categorical data; a value that was
                // not seen during fit yields a row of zeros. Only own
                // keys count, so inherited names never match.
                const cats = feature['categories']
                const ix = has_own.call(cats, xv) ? cats[xv] : -1
                for(let cat_ix=0; cat_ix<feature['n_classes']; cat_ix++){
                    x_feat.push(cat_ix === ix ? 1.0 : 0.0)
                }
            }

            return x_feat
        })
    }
}

// Expose the class on this module's CommonJS exports.
module.exports.TableFeaturesTransformer = TableFeaturesTransformer

/**
 * Standardize features by removing the mean and scaling to unit variance.
 * The statistics are computed per column in `fit` and stored in
 * `this.state` as the tensors 'mean_' and 'scale_'.
 */
class StandardScaler{
    /**
     * @param {Object} params Settings; stored on `this.params` as
     * returned by `utils.set_defaults` (defined in ./utils).
     */
    constructor(params){
        this.params = utils.set_defaults(params)
        this.state = {}
    }

    /**
     * Compute the mean and variance to be used for later scaling.
     * @param {Array} X array-like, shape [n_samples, n_features]
     *     The data used to compute the mean and standard deviation
     *     used for later scaling along the features axis.
     * @param {Array} y Passthrough for ``Pipeline`` compatibility.
     * @returns {StandardScaler} this instance.
     */
    fit(X, y=null){
        // tf.tidy releases every intermediate tensor created inside the
        // callback; only the returned mean and scale are kept.
        const [mean, scale] = tf.tidy(() => {
            // NOTE(review): utils.t2d presumably converts the input to
            // a 2d tensor - confirm in ./utils
            const Xt = utils.t2d(X)

            const mean = tf.mean(Xt, 0)

            // calculate scaler similar to how sklearn does it - sq.root of variance
            const variance = tf.mean(tf.pow(tf.abs(tf.sub(Xt, mean)), 2), 0)
            const std = tf.sqrt(variance)

            // A constant feature has zero deviation; dividing by it in
            // `transform` would give NaN. As in sklearn, such features
            // get a scale of 1 and are only centered.
            const is_zero = tf.equal(std, tf.zerosLike(std))
            const scale = tf.where(is_zero, tf.onesLike(std), std)

            return [mean, scale]
        })

        // naming similar to sklearn
        this.state['mean_'] = mean
        this.state['scale_'] = scale

        // sklearn convention
        return this
    }

    /**
     * Perform standardization by centering and scaling values of 
     * the features.
     * @param {Array} X array-like, shape [n_samples, n_features]
     *    The data used to scale along the features axis.
     * @param {Array} y ignored
     * @returns {Object} Result of `(X - mean_) / scale_`, as produced
     * by `tf.div`.
     * @throws {Error} If called before `fit`.
     */
    transform(X, y=null){
        const mean = this.state['mean_']
        const scale = this.state['scale_']

        if(mean == null || scale == null){
            throw new Error('StandardScaler: fit must be called before transform')
        }

        // intermediates (converted input, difference) are released by tidy
        return tf.tidy(() => tf.div(tf.sub(utils.t2d(X), mean), scale))
    }
}

// Expose the class on this module's CommonJS exports.
module.exports.StandardScaler = StandardScaler

/**
 * Convert labels to one hot encoded representation.
 */
class LabelBinarizer {
    /**
     * @param {Object} params Settings; stored on `this.params` as
     * returned by `utils.set_defaults` (defined in ./utils).
     */
    constructor(params){
        this.params = utils.set_defaults(params)
        // filled in by `fit`; stays null until then
        this.state = null
    }

    /**
     * Learn the set of distinct labels. Stores in `this.state` the
     * mapping label -> index ('categories'), the mapping
     * index -> label ('inverse') and the number of distinct labels
     * ('n_classes'). Indices follow the sorted order of the labels.
     * @param {Array} y 1d array of labels.
     * @returns {LabelBinarizer} this instance.
     */
    fit(y){
        // A Set keyed by the string form is used for de-duplication,
        // because `x in {}` is also true for inherited names such as
        // 'constructor', which would silently drop such labels.
        const seen = new Set()
        const categories_list = []

        for(let j=0; j<y.length; j++){
            const key = String(y[j])
            if(seen.has(key)){
                continue
            }
            seen.add(key)
            categories_list.push(y[j])
        }

        // ensure that categories are in sorted order
        categories_list.sort()

        const categories = {}
        const inversecat = {}

        for(let j=0; j<categories_list.length; j++){
            // defineProperty instead of assignment, so that every label
            // (even one named '__proto__') becomes an own key
            Object.defineProperty(categories, String(categories_list[j]), {
                value: j,
                writable: true,
                enumerable: true,
                configurable: true
            })
            inversecat[j] = categories_list[j]
        }

        this.state = {
            'categories': categories,
            'inverse': inversecat,
            'n_classes': categories_list.length
        }

        return this
    }

    /**
     * One hot encode labels using the categories learned in `fit`.
     * A label that was not seen during `fit` is encoded as all zeros.
     * @param {Array} y Iterable of labels.
     * @returns {Object} The rows of 0/1 values (one row per label, one
     * column per class) after being passed through `utils.t2d`.
     * @throws {Error} If called before `fit`.
     */
    transform(y){
        if(this.state == null){
            throw new Error('LabelBinarizer: fit must be called before transform')
        }

        const categories = this.state['categories']
        const n_classes = this.state['n_classes']
        const has_own = Object.prototype.hasOwnProperty
        const y_new = []

        for(const yv of y){
            // only own keys count, so inherited names never match
            const ix = has_own.call(categories, yv) ? categories[yv] : -1
            const y_feat = []
            for(let cat_ix=0; cat_ix<n_classes; cat_ix++){
                y_feat.push(cat_ix === ix ? 1.0 : 0.0)
            }
            y_new.push(y_feat)
        }

        // NOTE(review): utils.t2d presumably converts the nested array
        // to a 2d tensor - confirm in ./utils
        return utils.t2d(y_new)
    }

    /**
     * Convert one hot encoded (or per-class score) rows back to labels
     * by taking, for each row, the class with the largest value.
     * @param {Object} y 2d input accepted by `tf.argMax`, one row per
     * sample and one column per class.
     * @returns {Array} The label for every row of y.
     * @throws {Error} If called before `fit`.
     */
    inverse_transform(y){
        if(this.state == null){
            throw new Error('LabelBinarizer: fit must be called before inverse_transform')
        }

        const categories_inverse = this.state['inverse']

        // tidy releases the argMax tensor once its values are read out
        const indices = tf.tidy(() => tf.argMax(y, 1).dataSync())

        return Array.from(indices, (i) => categories_inverse[i])
    }
}

// Expose the class on this module's CommonJS exports.
module.exports.LabelBinarizer = LabelBinarizer