// RosettaCodeData/Task/K-means++-clustering/JavaScript/k-means++-clustering.js

/**
* kmeans module
*
* cluster(model, maximumIterations = 200, converged = assignmentsConverged)
* distance(p, q)
* distanceSquared(p, q)
* centroidsConverged(delta)
* assignmentsConverged(model, newModel)
* assignmentsToClusters(model)
*/
define(function () {
"use strict";
/**
* @public
* Calculate the squared distance between two vectors.
*
* @param [number] p vector with same dimension as q
* @param [number] q vector with same dimension as p
* @return {number} the distance between p and q squared
*/
function distanceSquared(p, q) {
const d = p.length; // dimension of vectors
if(d !== q.length) throw Error("p and q vectors must be the same length")
let sum = 0;
for(let i = 0; i < d; i += 1) {
sum += (p[i] - q[i])**2
}
return sum;
}
/**
* @public
* Calculate the distance between two vectors of the same dimension.
*
* @param [number] p vector of same dimension as q
* @param [number] q vector of same dimension as p
* @return the distance between vectors p and q
*/
function distance(p, q) {
return Math.sqrt(distanceSquared(p, q));
}
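// Illustrative example (not part of the original module):
// distanceSquared([0, 0], [3, 4]) === 25 and distance([0, 0], [3, 4]) === 5,
// i.e. the squared and plain Euclidean (L2) distances between the two points.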
/**
* @private
* find the closest centroid for the given observation and return its index.
*
* @param [[number]] centroids - array of k vectors, each with the same dimension as the observations;
* these are the centers of the k clusters
* @param [number] observation - vector with the same dimension as the centroids;
* this is the observation to be clustered.
* @return {number} the index of the closest centroid in centroids
*/
function findClosestCentroid(centroids, observation) {
const k = centroids.length; // number of clusters/centroids
let centroid = 0;
let minDistance = distance(centroids[0], observation);
for(let i = 1; i < k; i += 1) {
const dist = distance(centroids[i], observation);
if(dist < minDistance) {
centroid = i;
minDistance = dist;
}
}
return centroid;
}
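// Illustrative example (not part of the original module):
// findClosestCentroid([[0, 0], [10, 10]], [2, 3]) returns 0,
// since [2, 3] is closer to [0, 0] than to [10, 10].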
/**
* @private
* Calculate the centroid for the given observations.
* This takes the average of all observations (at each dimension).
* This average vector is the centroid for those observations.
*
* @param [[number]] observations - array of observations (each observation is a vector)
* @return [number] centroid for given observations (vector of same dimension as observations)
*/
function calculateCentroid(observations) {
const n = observations.length; // number of observations
const d = observations[0].length; // dimension of vectors
// create zero vector of same dimension as observation
let centroid = [];
for(let i = 0; i < d; i += 1) {
centroid.push(0.0);
}
//
// sum all observations at each dimension
//
for(let i = 0; i < n; i += 1) {
//
// add the observation to the sum vector, element by element
// to prepare to calculate the average at each dimension.
//
for(let j = 0; j < d; j += 1) {
centroid[j] += observations[i][j];
}
}
//
// divide each dimension by the number of observations
// to create the average vector.
//
for(let j = 0; j < d; j += 1) {
centroid[j] /= n;
}
return centroid;
}
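// Illustrative example (not part of the original module):
// calculateCentroid([[1, 2], [3, 4], [5, 6]]) returns [3, 4],
// the element-wise mean of the three observations.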
/**
* @private
* calculate the cluster assignments for the observations, given the centroids.
*
* @param [[number]] centroids - list of vectors with same dimension as observations
* @param [[number]] observations - list of vectors with same dimension as centroids
* @return [number] list of indices into centroids; one per observation.
*/
function assignClusters(centroids, observations) {
const n = observations.length; // number of observations
const assignments = [];
for(let i = 0; i < n; i += 1) {
assignments.push(findClosestCentroid(centroids, observations[i]));
}
return assignments; // centroid index for each observation
}
/**
* @private
* calculate one step of the k-means algorithm;
* - assign each observation to the nearest centroid to create clusters
* - calculate a new centroid for each cluster given the observations in the cluster.
*
* @param [[number]] centroids - list of vectors with same dimension as observations
* @param [[number]] observations - list of vectors with same dimension as centroids
* @return a new model with observations, centroids and assignments
*/
function kmeansStep(centroids, observations) {
const k = centroids.length; // number of clusters/centroids
// assign each observation to the nearest centroid to create clusters
const assignments = assignClusters(centroids, observations); // array of cluster indices that correspond to the observations
// calculate a new centroid for each cluster given the observations in the cluster
const newCentroids = [];
for(let i = 0; i < k; i += 1) {
// get the observations for this cluster/centroid
const clusteredObservations = observations.filter((v, j) => assignments[j] === i);
// calculate a new centroid for the observations
newCentroids.push(calculateCentroid(clusteredObservations));
}
return {'observations': observations, 'centroids': newCentroids, 'assignments': assignments }
}
/**
* @public
* Run k-means on the given model until the convergence test passes
* or the maximum number of iterations is reached.
* The initial model is NOT modified by the algorithm; rather, a new model is returned.
*
* @param {*} model - object with
* observations: array, length n, of data points; each data point is
* itself an array of numbers (a vector).
* The length (d) of each data point vector should be the same.
* centroids: array of data points.
* The length of the centroids array indicates the number
* of desired clusters (k).
* Each data point is an array (vector) of numbers
* with the same dimension as the data points in observations.
* assignments: array of integers, one per observation,
* with values 0..centroids.length - 1
* @param {number} maximumIterations - upper bound on the number of k-means steps (default 200)
* @param {function} converged - function(model, newModel) that returns true once the run has converged
* (defaults to assignmentsConverged)
* @return {*} - result with
* model: model, as described above, with updated centroids and assignments,
* iterations: number of iterations,
* durationMs: elapsed time in milliseconds
*/
function kmeans(model, maximumIterations = 200, converged = assignmentsConverged) {
const start = new Date();
// calculate new centroids and cluster assignments
let newModel = kmeansStep(model.centroids, model.observations);
// continue until the convergence test passes or the iteration limit is reached
let i = 0;
while((i < maximumIterations) && !converged(model, newModel)) {
model = newModel; // new model is our model now
// console.log(model);
// calculate new centroids and cluster assignments
newModel = kmeansStep(model.centroids, model.observations);
i += 1;
}
// console.log(newModel);
const finish = new Date();
return {'model': newModel, 'iterations': i, 'durationMs': (finish.getTime() - start.getTime())};
}
/**
* @public
* Return a function that determines convergence based on the centroids.
* If two consecutive sets of centroids remain within a given delta,
* then the algorithm is converged.
*
* @param {number} delta - the maximum distance each centroid may move between consecutive iterations for convergence
* @return {function} function to use as the converged callback in the kmeans call.
*/
function centroidsConverged(delta) {
/**
* determine if two consecutive sets of centroids are converged given a maximum delta.
*
* @param {*} model - object with observations, centroids, assignments
* @param {*} newModel - object with observations, centroids, assignments
* @return {boolean} true if no centroid moved by more than delta
*/
return function(model, newModel) {
const centroids = model.centroids;
const newCentroids = newModel.centroids;
const k = centroids.length; // number of clusters/centroids
for(let i = 0; i < k; i += 1) {
if(distance(centroids[i], newCentroids[i]) > delta) {
return false;
}
}
return true;
}
}
/**
* @public
* determine if two consecutive sets of clusters are converged;
* the clusters are converged if the cluster assignments are the same.
*
* @param {*} model - object with observations, centroids, assignments
* @param {*} newModel - object with observations, centroids, assignments
* @return {boolean} true if the cluster assignments are identical
*/
function assignmentsConverged(model, newModel) {
function arraysEqual(a, b) {
if (a === b) return true;
if (a === undefined || b === undefined) return false;
if (a === null || b === null) return false;
if (a.length !== b.length) return false;
// order matters here: assignments are compared position by position
for (let i = 0; i < a.length; ++i) {
if (a[i] !== b[i]) return false;
}
return true;
}
return arraysEqual(model.assignments, newModel.assignments);
}
/**
* @public
* Use the model assignments to create an
* array of observation indices for each centroid.
*
* @param {object} model with observations, centroids and assignments
* @return [[number]] array of observation indices for each cluster
*/
function assignmentsToClusters(model) {
//
// put the index of each data point into its cluster using the assignments
//
const n = model.observations.length;
const k = model.centroids.length;
const assignments = model.assignments;
const clusters = [];
for(let i = 0; i < k; i += 1) {
clusters.push([])
}
for(let i = 0; i < n; i += 1) {
clusters[assignments[i]].push(i);
}
return clusters;
}
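// Illustrative example (not part of the original module): for a model with
// two centroids and assignments [0, 1, 0, 1], assignmentsToClusters(model)
// returns [[0, 2], [1, 3]] -- the observation indices grouped by cluster.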
//
// return public methods
//
return {
'cluster': kmeans,
'distance': distance,
'distanceSquared': distanceSquared,
'centroidsConverged': centroidsConverged,
'assignmentsConverged': assignmentsConverged,
"assignmentsToClusters": assignmentsToClusters
};
});
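/*
* Usage sketch (illustrative only, assuming this define() is registered as the
* "kmeans" AMD module and loaded elsewhere via require("./kmeans")):
*
*   const observations = [[0, 0], [0, 1], [10, 10], [10, 11]];
*   const initialModel = {
*     observations: observations,
*     centroids: [observations[0], observations[2]],   // two initial centers
*     assignments: observations.map((x, i) => i % 2)   // dummy initial assignments
*   };
*   // run until the cluster assignments stop changing (the default), or pass
*   // kmeans.centroidsConverged(0.001) to converge on centroid movement instead
*   const result = kmeans.cluster(initialModel, 200);
*   // result.model.centroids   -> final centroids (near [0, 0.5] and [10, 10.5])
*   // result.iterations        -> number of k-means steps taken
*   // result.durationMs        -> elapsed time in milliseconds
*/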
/**
* kmeans++ initialization module
*/
define(function (require) {
"use strict";
const kmeans = require("./kmeans");
/**
* @public
* create an initial model given the data and the number of clusters.
*
* This uses the kmeans++ algorithm:
* 1. Choose one center uniformly at random from among the data points.
* 2. For each data point x, compute D(x), the distance between x and
* the nearest center that has already been chosen.
* 3. Choose one new data point at random as a new center,
* using a weighted probability distribution where a point x is chosen with probability proportional to D(x)^2.
* 4. Repeat Steps 2 and 3 until k centers have been chosen.
* 5. Now that the initial centers have been chosen, proceed using
* standard k-means clustering.
*
* @param {[[number]]} observations the data as an array of vectors (arrays of numbers)
* @param {integer} k the number of clusters
*/
return function(observations, k) {
/**
* given a set of n weights,
* choose an index in the range 0..n-1
* at random using the weights as a distribution.
*
* @param {[number]} weights
* @param {number} normalizationWeight optional sum of the weights; computed if not supplied
*/
function weightedRandomIndex(weights, normalizationWeight) {
const n = weights.length;
if(typeof normalizationWeight !== 'number') {
normalizationWeight = 0.0;
for(let i = 0; i < n; i += 1) {
normalizationWeight += weights[i];
}
}
const r = Math.random(); // uniformly random number 0..1 (a probability)
let index = 0;
let cumulativeWeight = 0.0;
for(let i = 0; i < n; i += 1) {
//
// use the uniform probability to search
// within the normalized weighting (we divide by normalizationWeight to normalize).
// once we hit the probability, we have found our index.
//
cumulativeWeight += weights[i] / normalizationWeight;
if(cumulativeWeight > r) {
return i;
}
}
throw Error("algorithmic failure choosing weighted random index");
}
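// Illustrative example (not part of the original module):
// weightedRandomIndex([1, 3]) returns 0 about 25% of the time and
// 1 about 75% of the time, since the weights are normalized into a
// probability distribution before sampling.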
const n = observations.length;
const distanceToCloseCentroid = []; // distance D(x) to closest centroid for each observation
const centroids = []; // indices of observations that are chosen as centroids
//
// keep a list of all observations' indices;
// centroids that have already been chosen are excluded from
// re-selection by giving them zero weight in the distribution below
//
const index = [];
for(let i = 0; i < n; i += 1) {
index[i] = i;
}
//
// 1. Choose one center uniformly at random from among the data points.
//
let centroidIndex = Math.floor(Math.random() * n);
centroids.push(centroidIndex);
for(let c = 1; c < k; c += 1) {
// give the previously chosen centroid zero weight so it is
// effectively removed from the probability distribution and cannot be chosen again
distanceToCloseCentroid[centroids[c - 1]] = 0;
//
// 2. For each data point x, compute D(x), the distance between x and
// the nearest center that has already been chosen.
//
// NOTE: we use the squared Euclidean distance (squared L2 norm)
//
let totalWeight = 0.0;
for(let i = 0; i < index.length; i += 1) {
//
// if this is the first time through, the distance is undefined, so just set it.
// Otherwise, choose the minimum of the prior closest and this new centroid
//
const distanceToCentroid = kmeans.distanceSquared(observations[index[i]], observations[centroids[c - 1]]);
distanceToCloseCentroid[index[i]] =
(typeof distanceToCloseCentroid[index[i]] === 'number')
? Math.min(distanceToCloseCentroid[index[i]], distanceToCentroid)
: distanceToCentroid;
totalWeight += distanceToCloseCentroid[index[i]];
}
//
// 3. Choose one new data point at random as a new center,
// using a weighted probability distribution where a point x is chosen with probability proportional to D(x)^2.
//
centroidIndex = index[weightedRandomIndex(distanceToCloseCentroid, totalWeight)];
centroids.push(centroidIndex);
// 4. Repeat Steps 2 and 3 until k centers have been chosen.
}
//
// 5. Now that the initial centers have been chosen, proceed using
// standard k-means clustering. Return the model so that
// kmeans can continue.
//
return {
'observations': observations,
'centroids': centroids.map(x => observations[x]), // map centroid index to centroid value
'assignments': observations.map((x, i) => i % centroids.length) // dummy initial assignments; recalculated on the first k-means step
}
}
});
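/*
* Usage sketch (illustrative only, assuming this define() is registered as the
* "kmeanspp" AMD module alongside "./kmeans"):
*
*   const observations = [[0, 0], [0, 1], [10, 10], [10, 11], [5, 5]];
*   const initialModel = kmeanspp(observations, 2);  // k-means++ seeding, k = 2
*   // initialModel.centroids holds 2 of the observations, chosen so that the
*   // second is likely to be far from the first (weighted by squared distance)
*   const result = kmeans.cluster(initialModel);
*/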
/**
* Extra Credit #1
* module for creating random models for kmeans clustering
*/
define(function (require) {
"use strict";
const kmeans = require("./kmeans");
/**
* @return {number} an approximately normally distributed random number with mean 0
*/
function randomNormal() {
// summing n = 6 uniform random numbers gives a good enough approximation (central limit theorem)
return ((Math.random() + Math.random() + Math.random() + Math.random() + Math.random() + Math.random()) - 3) / 3;
}
/**
* Generate a uniform random unit vector
*
* @param {Integer} d dimension of data
* @return [number] a random datapoint of dimension d with length == 1
*/
function randomUnitVector(d) {
let magnitude = 0.0;
const observation = [];
// uniform random for each dimension
for(let j = 0; j < d; j += 1) {
const x = Math.random();
observation[j] = x;
magnitude += x * x;
}
// normalize to unit length
magnitude = Math.sqrt(magnitude);
for(let j = 0; j < d; j += 1) {
observation[j] /= magnitude;
}
return observation;
}
/**
* Generate a uniform random unit vectors for clustering
*
* @param {Integer} n number of data points
* @param {Integer} d dimension of data
* @return n random datapoints of dimension d with length == 1
*/
function randomUnitVectors(n, d) {
// create n random observations, each of dimension d
const observations = [];
for(let i = 0; i < n; i += 1) {
// create random observation of dimension d
const observation = randomUnitVector(d);
observations.push(observation);
}
return observations;
}
/**
* Generate a spherical random vector
*
* @param {Integer} d dimension of data
* @param {Number} r radius from center for the data point
* @return [number] a random datapoint of dimension d at radius r from the origin
*/
function randomSphericalVector(d, r) {
const observation = [];
let magnitude = 0.0;
for(let j = 0; j < d; j += 1)
{
const x = randomNormal();
observation[j] = x;
magnitude += x * x;
}
// normalize
magnitude = Math.sqrt(magnitude);
for(let j = 0; j < d; j += 1) {
observation[j] = observation[j] * r / magnitude;
}
return observation;
}
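// Illustrative example (not part of the original module):
// randomSphericalVector(3, 2.0) returns a 3-element vector whose
// Euclidean length is 2.0, pointing in a uniformly random direction.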
/**
* Generate spherical random vectors for clustering
*
* @param {Integer} n number of data points
* @param {Integer} d dimension of data
* @param {Number} r maximum radius from center for data points
* @return n random datapoints of dimension d
*/
function randomSphericalVectors(n, d, r) {
// create n random observations, each of dimension d
const observations = [];
for(let i = 0; i < n; i += 1) {
// create random observation of dimension d with random radius
const observation = randomSphericalVector(d, Math.random() * r);
observations.push(observation);
}
return observations;
}
/**
* Generate uniform random vectors for clustering
*
* @param {Integer} n number of data points
* @param {Integer} d dimension of data
* @param {Number} min minimum value for each element
* @param {Number} max maximum value for each element
* @return n random datapoints of dimension d with elements in [min, max)
*/
function randomVectors(n, d, min, max) {
// create n random observations, each of dimension d
const observations = [];
for(let i = 0; i < n; i += 1) {
// create random observation of dimension d
const observation = randomVector(d, min, max);
observations.push(observation);
}
return observations;
}
/**
* Generate a uniform random vector
*
* @param {Integer} d dimension of data
* @param {Number} min minimum value for each element
* @param {Number} max maximum value for each element
* @return [number] a random datapoint of dimension d with each element in [min, max)
*/
function randomVector(d, min, max) {
// create random observation of dimension d
const range = max - min;
const observation = [];
for(let j = 0; j < d; j += 1) {
observation.push(min + Math.random() * range);
}
return observation;
}
return {
'randomVector': randomVector,
'randomUnitVector': randomUnitVector,
'randomSphericalVector': randomSphericalVector,
'randomVectors': randomVectors,
'randomUnitVectors': randomUnitVectors,
'randomSphericalVectors': randomSphericalVectors
}
});
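/*
* Usage sketch (illustrative only, assuming this define() is registered as the
* "kmeansRandomModel" AMD module):
*
*   // 100 points of dimension 2, scattered within a sphere of radius 10
*   const spherical = kmeansRandomModel.randomSphericalVectors(100, 2, 10.0);
*   // 100 points of dimension 3, each element drawn uniformly from [-1, 1)
*   const uniform = kmeansRandomModel.randomVectors(100, 3, -1, 1);
*   // a single unit-length vector of dimension 4
*   const unit = kmeansRandomModel.randomUnitVector(4);
*/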
/**
* Extra Credit #4
* Application to cluster random data using kmeans++
*
* cluster(k, n, d) - cluster n data points of dimension d into k clusters
* plot(canvas, result) - plot the results of cluster() to the given html5 canvas using clusterjs
*/
define(function (require) {
"use strict";
const kmeans = require("./kmeans/kmeans");
const kmeanspp = require("./kmeans/kmeanspp");
const randomCentroidInitializer = require("./kmeans/randomCentroidInitializer");
const kmeansRandomModel = require("./kmeans/kmeansRandomModel");
/**
* @public
* Generate random spherical data and cluster it with k-means++, given the number of clusters
*
* @param {integer} k number of clusters to create
* @param {integer} n number of data points to generate
* @param {integer} d dimension of the data points
*/
function cluster(k, n, d) {
//
// generate n random data points of dimension d within a sphere of radius 10
//
const observations = kmeansRandomModel.randomSphericalVectors(n, d, 10.0);
//
// create the initial model using k-means++ seeding
//
// const initialModel = randomCentroidInitializer(observations, k);
const initialModel = kmeanspp(observations, k);
//
// cluster into given number of clusters
//
const results = kmeans.cluster(initialModel);
//
// do this for the convenience of the plotting functions
//
results.clusters = kmeans.assignmentsToClusters(results.model);
return results;
}
const clusterColor = ['red', 'green', 'blue', 'yellow', 'purple', 'cyan', 'magenta', 'pink', 'brown', 'black'];
let chart = undefined;
/**
* plot the clustered data model as a scatter chart.
*
* @param {object} canvas html5 canvas element to render the chart into
* @param {object} results of cluster(), with model and clusters
*/
function plot(canvas, results) {
//
// pull the observations, assignments and centroids out of the clustered model
//
const model = results.model;
const observations = model.observations;
const assignments = model.assignments;
const centroids = model.centroids;
const d = observations[0].length;
const n = observations.length;
const k = centroids.length;
//
// put offset of each data points into clusters using the assignments
//
const clusters = results.clusters;
//
// plot the clusters
//
const chartData = {
// for the purposes of plotting in 2 dimensions, we will use
// x = dimension 0 and y = dimension 1
datasets: clusters.map(function(c, i) {
return {
label: "cluster" + i,
data: c.map(d => ({'x': observations[d][0], 'y': observations[d][1]})),
backgroundColor: clusterColor[i % clusterColor.length],
pointBackgroundColor: clusterColor[i % clusterColor.length],
pointBorderColor: clusterColor[i % clusterColor.length]
};
})
};
const chartOptions = {
responsive: true,
maintainAspectRatio: false,
title: {
display: true,
text: 'Random spherical data set (d=$d, n=$n) clustered using K-Means (k=$k)'
.replace("$d", d)
.replace('$n', n)
.replace('$k', k)
},
legend: {
position: 'bottom',
display: true
},
scales: {
xAxes: [{
type: 'linear',
position: 'bottom',
scaleLabel: {
labelString: 'x axis',
display: false,
}
}],
yAxes: [{
type: 'linear',
position: 'left',
scaleLabel: {
labelString: 'y axis',
display: false
}
}]
}
};
//
// we need to destroy the previous chart so its interactivity
// does not continue to run
//
if(undefined !== chart) {
chart.destroy()
}
chart = new Chart(canvas, {
type: 'scatter',
data: chartData,
options: chartOptions,
});
}
return {'cluster': cluster, 'plot': plot};
});
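/*
* Usage sketch (illustrative only, assuming this define() is loaded as the
* application module -- here called "app" -- and that an HTML5 canvas element
* and Chart.js are available on the page):
*
*   const results = app.cluster(3, 500, 2);  // 3 clusters, 500 points, 2 dimensions
*   app.plot(document.getElementById('canvas'), results);
*/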