All files main.js

72.68% Statements 314/432
85.36% Branches 35/41
100% Functions 4/4
72.68% Lines 314/432

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 4331x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 17x 17x 17x 17x 17x 17x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 119x 25x 25x 25x 25x 25x 20x 20x 20x 119x 58x 58x 58x 24x 24x 58x 24x 24x 24x 94x 36x 36x 59x 59x 59x 59x 59x 59x 59x 59x 59x 119x 44x 44x 38x 38x 44x 119x 4x 4x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 119x             119x                     17x 17x 17x 17x 119x 11x 11x 11x 11x 11x 119x 6x 6x 6x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 14x 14x 14x 14x 14x 14x 14x 14x 14x 14x 14x 14x 14x           14x 14x 10x 10x 4x 4x 4x                                                                                                                     14x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 17x 42x 42x 42x 42x 42x 42x 42x 42x 42x 42x 42x 42x 28x 10x 10x 18x 18x 42x 14x 14x 42x 20x 20x 42x 8x 8x 42x   42x 4x 4x                                                                             42x 119x 1x 1x 1x 1x 1x  
/**
* @license Apache-2.0
*
* Copyright (c) 2018 The Stdlib Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
 
'use strict';
 
// MODULES //
 
var isPositiveInteger = require( '@stdlib/assert/is-positive-integer' ).isPrimitive;
var isMatrixLike = require( '@stdlib/assert/is-matrix-like' );
var isVectorLike = require( '@stdlib/assert/is-vector-like' );
var setReadOnly = require( '@stdlib/utils/define-nonenumerable-read-only-property' );
var format = require( '@stdlib/string/format' );
var minstd = require( '@stdlib/random/base/minstd-shuffle' );
var floor = require( '@stdlib/math/base/special/floor' );
var ln = require( '@stdlib/math/base/special/ln' );
var dcopy = require( '@stdlib/blas/base/dcopy' );
var createMatrix = require( './matrix.js' );
var copyMatrix = require( './copy_matrix.js' );
var createVector = require( './vector.js' );
var copyVector = require( './copy_vector.js' );
var validate = require( './validate.js' );
var INIT_DEFAULTS = require( './init_defaults.json' );
var initialization = require( './init.js' );
var statistics = require( './stats.js' );
var incrstatistics = require( './incrstats.js' );
var squaredEuclidean = require( './squared_euclidean.js' );
var squaredCosine = require( './squared_cosine.js' );
var squaredCorrelation = require( './squared_correlation.js' );
var closestCentroid = require( './find_closest_centroid.js' );
var updateCentroid = require( './update_centroid.js' );
var normalize = require( './normalize.js' );
var normalizeMatrix = require( './normalize_matrix.js' );
var standardize = require( './standardize.js' );
var standardizeMatrix = require( './standardize_matrix.js' );
 
 
// VARIABLES //
 
// Number of cluster statistics:
var NSTATS = 4; // [ n_obs, sum_squared_dist, mean_squared_dist, stdev_squared_dist ]
 
 
// FUNCTIONS //
 
/**
* Returns a results object.
*
* @private
* @param {PositiveInteger} k - number of clusters
* @param {PositiveInteger} ndims - number of dimensions
* @returns {Object} results object
*/
function createResults( k, ndims ) {
	var out = {};
	out.centroids = createMatrix( k, ndims, false ); // high-level
	out.stats = createMatrix( k, NSTATS, false ); // high-level
	return out;
}
 
 
// MAIN //
 
/**
* Returns an accumulator function which incrementally partitions data into `k` clusters.
*
* @param {(PositiveInteger|ndarray)} k - number of clusters or a `k x ndims` matrix containing initial centroids
* @param {PositiveInteger} [ndims] - number of dimensions (should only be provided if provided a numeric `k` argument)
* @param {Options} [options] - function options
* @param {string} [options.metric="euclidean"] - distance metric
* @param {ArrayLikeObject} [options.init] - method for determining initial centroids
* @param {boolean} [options.normalize=true] - boolean indicating whether to normalize incoming data (only relevant for non-Euclidean distance metrics)
* @param {boolean} [options.copy=true] - boolean indicating whether to copy incoming data to prevent mutation during normalization
* @param {*} [options.seed] - PRNG seed
* @throws {TypeError} first argument must be a positive integer
* @throws {TypeError} second argument must be a positive integer
* @throws {TypeError} options argument must be an object
* @throws {TypeError} must provide valid options
* @throws {RangeError} when using sampling to generate initial centroids, the sample size must be greater than or equal to the number of clusters
* @returns {Function} accumulator function
*
* @example
* var Float64Array = require( '@stdlib/array/float64' );
* var ndarray = require( '@stdlib/ndarray/ctor' );
*
* // Define initial centroid locations:
* var buffer = [
*     0.0, 0.0,
*     1.0, 1.0,
*     1.0, -1.0,
*     -1.0, -1.0,
*     -1.0, 1.0
* ];
* var shape = [ 5, 2 ];
* var strides = [ 2, 1 ];
* var offset = 0;
* var order = 'row-major';
*
* var centroids = ndarray( 'float64', buffer, shape, strides, offset, order );
*
* // Create a k-means accumulator:
* var accumulator = incrkmeans( centroids );
*
* var out = accumulator();
* // returns {...}
*
* // Create a data vector:
* buffer = new Float64Array( 2 );
* shape = [ 2 ];
* strides = [ 1 ];
*
* var vec = ndarray( 'float64', buffer, shape, strides, offset, order );
*
* // Provide data to the accumulator:
* vec.set( 0, 2.0 );
* vec.set( 1, 1.0 );
*
* out = accumulator( vec );
* // returns {...}
*
* vec.set( 0, -5.0 );
* vec.set( 1, 3.14 );
*
* out = accumulator( vec );
* // returns {...}
*
* // Retrieve the current cluster results:
* out = accumulator();
* // returns {...}
*/
function incrkmeans() {
	var clusterstats;
	var centroids;
	var incrstats;
	var options;
	var results;
	var vcopy;
	var stats;
	var ndims;
	var dist;
	var opts;
	var init;
	var err;
	var FLG;
	var k;
 
	if ( isMatrixLike( arguments[ 0 ] ) ) {
		k = arguments[ 0 ].shape[ 0 ];
		ndims = arguments[ 0 ].shape[ 1 ];
		centroids = createMatrix( k, ndims, true ); // low-level
		centroids = copyMatrix( centroids, arguments[ 0 ] );
		if ( arguments.length > 1 ) {
			options = arguments[ 1 ];
			FLG = true;
		}
	} else if ( isPositiveInteger( arguments[ 0 ] ) ) {
		k = arguments[ 0 ];
		ndims = arguments[ 1 ];
		if ( !isPositiveInteger( ndims ) ) {
			throw new TypeError( format( 'invalid argument. Argument specifying number of dimensions must be a positive integer. Value: `%s`.', ndims ) );
		}
		if ( arguments.length > 2 ) {
			options = arguments[ 2 ];
			FLG = true;
		}
	} else {
		throw new TypeError( format( 'invalid argument. First argument must either be a positive integer specifying the number of clusters or a matrix containing initial centroids. Value: `%s`.', arguments[ 0 ] ) );
	}
	opts = {
		'metric': 'euclidean',
		'init': INIT_DEFAULTS[ 'kmeans++' ].slice(),
		'seed': minstd(),
		'normalize': true,
		'copy': true
	};
	opts.init[ 1 ] = k; // Note: this default applies to all initialization methods
	opts.init[ 2 ] = 2 + floor( ln( k ) ); // Note: from Arthur's and Vassilvitskii's paper "kmeans++: The Advantages of Careful Seeding" (see conclusion)
	if ( FLG ) {
		err = validate( opts, options );
		if ( err ) {
			throw err;
		}
	}
	if ( opts.init[ 1 ] < k ) {
		throw new RangeError( format( 'invalid option. First `%s` parameter option must be greater than or equal to the number of clusters. Options: `%f`.', 'init', opts.init[ 1 ] ) );
	}
	// Initialize a results object:
	results = createResults( k, ndims );
 
	// Initialize an internal matrix for tabulating cluster statistics:
	stats = createMatrix( k, NSTATS, true ); // low-level
 
	// Initialize an internal cluster statistics accumulator:
	clusterstats = statistics( stats, k );
 
	// Initialize metric-related variables...
	if ( opts.metric === 'cosine' ) {
		dist = squaredCosine;

		// Initialize a scratch vector for copying input vectors:
		if ( opts.copy ) {
			vcopy = createVector( ndims, true ); // low-level
		}
	} else if ( opts.metric === 'correlation' ) {
		dist = squaredCorrelation;

		// Initialize an accumulator for computing the mean vector and associated standard deviation along each dimension:
		if ( opts.normalize ) {
			incrstats = incrstatistics( ndims );
		}
		// Initialize a scratch vector for copying input vectors:
		if ( opts.copy ) {
			vcopy = createVector( ndims, true ); // low-level
		}
	} else {
		dist = squaredEuclidean;
	}
	// Check if we need to compute initial centroids...
	if ( centroids === void 0 ) {
		// Initialize an internal matrix for storing centroids:
		centroids = createMatrix( k, ndims, true ); // low-level
 
		// Initialize an accumulator for computing initial centroids:
		init = initialization( centroids, stats, clusterstats, incrstats, dist, opts ); // eslint-disable-line max-len
	} else {
		// Update cluster results to include the initial centroids (why? so that, even if no data is provided, the `results` object contains the provided centroids):
		copyMatrix( results.centroids, centroids );
	}
	// Attach properties and methods to the accumulator:
	setReadOnly( accumulator, 'seed', opts.seed );
	setReadOnly( accumulator, 'predict', predict );
 
	return accumulator;
 
	/**
	* If provided a data point vector, the accumulator function returns updated cluster results. If not provided a data point vector, the accumulator function returns the current cluster results.
	*
	* @private
	* @param {ndarray} [vec] - data vector
	* @throws {TypeError} must provide a 1-dimensional ndarray
	* @throws {Error} vector length must match centroid dimensions
	* @returns {(Object|null)} cluster results or null
	*/
	function accumulator( vec ) {
		var bool;
		var cbuf;
		var vbuf;
		var sbuf;
		var sv;
		var sc;
		var ov;
		var oc;
		var v;
		var N;
		var d;
		var c;
		if ( arguments.length === 0 ) {
			if ( init ) {
				return null;
			}
			return results;
		}
		v = vec; // Why? We mention `arguments` in the function and perform a subsequent reassignment.
		if ( !isVectorLike( v ) ) {
			throw new TypeError( format( 'invalid argument. Must provide a one-dimensional ndarray. Value: `%s`.', v ) );
		}
		if ( v.shape[ 0 ] !== ndims ) {
			throw new Error( format( 'invalid argument. Vector length must match centroid dimensions. Expected: `%u``. Actual: `%u``.', ndims, v.shape[ 0 ] ) );
		}
		// Check if we need to update the data point mean vector...
		if ( incrstats ) {
			incrstats( v );
		}
		// Check if we have yet to compute initial centroids...
		if ( init ) {
			bool = init( v );
			if ( bool === false ) {
				return null;
			}
			// De-reference `init` so that it and its internal variables can be garbage collected:
			init = void 0;
		} else {
			// If required by the metric, normalize the data vector...
			if ( opts.normalize ) {
				if ( opts.metric === 'cosine' ) {
					if ( opts.copy ) {
						v = copyVector( vcopy, v );
					}
					normalize( ndims, v.data, v.strides[ 0 ], v.offset );
				} else if ( opts.metric === 'correlation' ) {
					if ( opts.copy ) {
						v = copyVector( vcopy, v );
					}
					sbuf = incrstats();

					// Magic numbers come from knowing that `sbuf` is an interleaved strided array...
					standardize( ndims, v.data, v.strides[ 0 ], v.offset, sbuf, 2, 0, sbuf, 2, 1 ); // eslint-disable-line max-len
				}
			}
			cbuf = centroids.data;
			sc = centroids.strides[ 0 ];

			vbuf = v.data;
			sv = v.strides[ 0 ];
			ov = v.offset;

			// Find the closest centroid by computing the distance from the provided data point to each centroid:
			c = closestCentroid( dist, k, ndims, cbuf, sc, 0, vbuf, sv, ov ); // Magic number `0` for offset as we know that the matrix view begins at the first buffer element

			// Compute the centroids buffer index offset to point to the closest centroid:
			oc = sc * c;

			// Update the closest centroid:
			N = stats.get( c, 0 ) + 1;
			updateCentroid( ndims, N, cbuf, 1, oc, vbuf, sv, ov ); // Magic number `1` as we know that the matrix is row-major single-segment contiguous

			// Recompute the distance based on the updated centroid position:
			d = dist( ndims, cbuf, 1, oc, vbuf, sv, ov ); // Magic number `1` as we know that the matrix is row-major single-segment contiguous

			// Update cluster statistics:
			clusterstats( c, d );
		}
		// Update the results object:
		dcopy( centroids.length, centroids.data, 1, results.centroids.data, 1 ); // Magic number `1` as we know that these matrices are row-major single-segment contiguous
		dcopy( stats.length, stats.data, 1, results.stats.data, 1 ); // Magic number `1` as we know that these matrices are row-major single-segment contiguous

		return results;
	}
 
	/**
	* Computes data point distances to centroids and returns centroid assignment predictions.
	*
	* @private
	* @param {ndarray} [out] - output vector for storing centroid assignment predictions
	* @param {ndarray} X - matrix containing data points (`n x d`, where `n` is the number of data points and `d` is the number of dimensions)
	* @throws {TypeError} output argument must be a vector
	* @throws {TypeError} must provide a matrix
	* @throws {Error} vector length must match number of data points
	* @throws {Error} number of matrix columns must match centroid dimensions
	* @returns {(ndarray|null)} vector containing centroid (index) predictions or null
	*/
	function predict( out, X ) {
		var xbuf;
		var cbuf;
		var npts;
		var sx1;
		var sx2;
		var sc;
		var ox;
		var x;
		var o;
		var c;
		var i;
		if ( arguments.length > 1 ) {
			if ( !isVectorLike( out ) ) {
				throw new TypeError( format( 'invalid argument. Output argument must be a one-dimensional ndarray. Value: `%s`.', out ) );
			}
			o = out;
			x = X;
		} else {
			x = out;
		}
		if ( !isMatrixLike( x ) ) {
			throw new TypeError( format( 'invalid argument. Must provide a two-dimensional ndarray. Value: `%s`.', x ) );
		}
		if ( x.shape[ 1 ] !== ndims ) {
			throw new Error( format( 'invalid argument. Number of matrix columns must match centroid dimensions. Expected: `%u`. Actual: `%u`.', ndims, x.shape[ 1 ] ) );
		}
		if ( o === void 0 ) {
			o = createVector( x.shape[ 0 ], false ); // high-level
		} else if ( o.length !== x.shape[ 0 ] ) {
			throw new Error( format( 'invalid argument. Output vector length must match the number of data points. Expected: `%u`. Actual: `%u`.', x.shape[ 0 ], o.length ) );
		}
		if ( init ) {
			return null;
		}
		npts = x.shape[ 0 ];

		// If required by the metric, normalize the data vectors along the dimensions...
		if ( opts.normalize ) {
			if ( opts.metric === 'cosine' ) {
				if ( opts.copy ) {
					x = copyMatrix( createMatrix( npts, ndims, true ), x ); // low-level
				}
				x = normalizeMatrix( x );
			} else if ( opts.metric === 'correlation' ) {
				if ( opts.copy ) {
					x = copyMatrix( createMatrix( npts, ndims, true ), x ); // low-level
				}
				x = standardizeMatrix( x, incrstats() );
			}
		}
		cbuf = centroids.data;
		sc = centroids.strides[ 0 ];

		xbuf = x.data;
		sx1 = x.strides[ 0 ];
		sx2 = x.strides[ 1 ];
		ox = x.offset;

		// For each data point, find the closest centroid...
		for ( i = 0; i < npts; i++ ) {
			c = closestCentroid( dist, k, ndims, cbuf, sc, 0, xbuf, sx2, ox ); // Magic number `0` for offset as we know that the matrix view begins at the first buffer element

			// Update the output vector:
			o.set( i, c );

			// Compute the data point buffer index offset to point to the next data point:
			ox += sx1;
		}
		return o;
	}
}
 
 
// EXPORTS //
 
module.exports = incrkmeans;