Source code for evomap.mapping._mds

"""
Stress-Based Multidimensional Scaling.
"""
import numpy as np
from scipy.spatial.distance import cdist
from numba import jit
from ._optim import gradient_descent_line_search
from ._cmds import CMDS
from sklearn.linear_model import LinearRegression
from ._regression import IsotonicRegression
import inspect

EPSILON = 1e-10

[docs] class MDS(): def __init__( self, n_dims = 2, mds_type = None, n_iter = 2000, n_iter_check = 50, init = None, verbose = 0, input_type = 'distance', max_halves = 5, tol = 1e-3, n_inits = 1, step_size = 1 ): self.n_dims = n_dims self.mds_type = mds_type self.n_iter = n_iter self.n_iter_check = n_iter_check self.init = init self.verbose = verbose self.input_type = input_type self.max_halves = max_halves self.tol = tol self.n_inits = n_inits self.step_size = step_size self.method_str = "MDS"
[docs] def __str__(self): """Create a string representation of the MDS instance. Displays the key attributes and all parameters modified by the user. Returns ------- str A summary of the key attributes of this MDS object, including modified parameters. """ # Initial part of the output with always-displayed attributes result = f"MDS Type: {self.mds_type}, Input Type: {self.input_type}" # Get the signature of the __init__ method signature = inspect.signature(self.__init__) # Get the default values of the parameters from the __init__ method defaults = {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty} # Collect all attributes that differ from the default changed_attrs = [] for attr, default_val in defaults.items(): current_val = getattr(self, attr) if current_val != default_val: changed_attrs.append(f"{attr}={current_val}") # If any attributes were changed, append them to the result if changed_attrs: result += "\nUser-modified attributes: " + ", ".join(changed_attrs) return result
[docs] def fit(self, X): """Fit the MDS model to the input data, without returning the transformed positions. Parameters ---------- X : np.array of shape (n_samples, n_features) or (n_samples, n_samples) The input data. If `input_type` is 'vector', X should be the feature vectors of the samples. If `input_type` is 'distance', X should be a pairwise distance matrix. Returns ------- self : object The instance of the MDS class, after fitting the model to the input data. """ self.fit_transform(X) return self
[docs] def fit_transform(self, X): """Fit the MDS model to the input data and return transformed positions. Dependning on 'input_type', the input data is either interpreted as a distance matrix or feature vectors. The method uses gradient descent to optimize the lower-dimensional positions such that a Stress function, measuring the discrepancy between the input distances and resulting configuration, is minimized. Parameters ---------- X : np.array of shape (n_samples, n_features) or (n_samples, n_samples) The input data. If `input_type` is 'vector', X should be the feature vectors of the samples. If `input_type` is 'distance', X should be a pairwise distance matrix. Returns ------- np.array of shape (n_samples, n_dims) The transformed positions in the lower-dimensional space. Raises ------ ValueError If `input_type` is neither 'distance' nor 'vector', a ValueError is raised. """ if self.input_type == 'distance': D = X elif self.input_type == 'vector': D = cdist(X,X) else: raise ValueError("Input type should be 'distance' or 'vector', not {0}".format(self.input_type)) n_samples = D.shape[0] n_dims = self.n_dims # If initialization is provided, use it and run model (once). # Else, try n_init different initializiations if not self.init is None: self.n_inits = 1 best_cost = np.inf for i in range(self.n_inits): if self.verbose > 0: if self.n_inits > 1: print("[{0}] Initialization {1}/{2}".format(self.method_str,i+1, self.n_inits)) if self.init is None: init = np.random.normal(0,.1,(n_samples, n_dims)) elif type(self.init) == str and self.init == "cmds": init = CMDS().fit_transform(D) else: init = self.init # Set gradient descent arguments opt_args = { 'init': init, 'method_str': self.method_str, 'n_iter': self.n_iter, 'n_iter_check': self.n_iter_check, 'step_size': self.step_size, 'max_halves': self.max_halves, 'min_grad_norm': self.tol, 'verbose': self.verbose, 'args': [D], 'kwargs': {'mds_type': self.mds_type} } Y, cost = gradient_descent_line_search(_normalized_stress_function, **opt_args) if cost < best_cost: self.Y_ = Y self.cost_ = cost best_cost = cost # Enforce consistent signs of coordinates for dim in range(n_dims): # Check the sign of the first element in the dimension if self.Y_[0, dim] > 0: self.Y_[:, dim] *= -1 return self.Y_
[docs] def _normalized_stress_function( positions, disparities, mds_type = None, compute_error = True, compute_grad = True): """Compute normalized stress and its gradient. The stress function quantifies the goodness-of-fit between the input disparities (or distances) and the Euclidean distances in the low-dimensional space, with options for different MDS types: absolute, ratio, interval, and ordinal scaling. The input distances are transformed to disparities according to the mds type. Optionally, the function also computes the gradient to be used in optimization. Parameters ---------- positions : np.array of shape (n_samples, n_dims) The estimated positions in the low-dimensional space. disparities : np.array of shape (n_samples, n_samples) The input distances or disparities matrix, depending on the MDS type. mds_type : str, optional The type of MDS scaling to use: 'absolute', 'ratio', 'interval', or 'ordinal'. If None, 'absolute' scaling is used by default. compute_error : bool, optional Whether to compute the normalized stress value, by default True. compute_grad : bool, optional Whether to compute the gradient of the stress function, by default True. Returns ------- float or None The computed stress value, or None if `compute_error` is False. np.array of shape (n_samples, n_dims) or None The computed gradient of the stress function, or None if `compute_grad` is False. Raises ------ ValueError If an invalid `mds_type` is provided, or if `mds_type` is not recognized. """ def normalize_dhat(d_hat, n_samples): """Normalize the flattened disparities to match the total variance of the data. This method scales the input disparities so that their total sum of squares matches the total number of unique pairwise distances. The normalization ensures that the disparities are comparable to the distances. Parameters ---------- d_hat : np.array of shape (n_samples * (n_samples - 1) / 2) The flattened array of pairwise disparities. n_samples : int The number of samples or points in the dataset. Returns ------- np.array of shape (n_samples * (n_samples - 1) / 2) The normalized disparities. """ return d_hat * np.sqrt((n_samples * (n_samples - 1) / 2) / (d_hat**2).sum()) def rebuild_matrix(disp_flat, n_samples): """Rebuild a full symmetric matrix from a flattened array of lower triangular entries. Given a 1D array containing the lower triangular elements of a matrix, this method reconstructs the full symmetric matrix where the upper triangular part mirrors the lower triangular part. Parameters ---------- disp_flat : np.array of shape (n_samples * (n_samples - 1) / 2) The flattened array of lower triangular matrix entries (disparities). n_samples : int The number of samples or points, which determines the size of the matrix. Returns ------- np.array of shape (n_samples, n_samples) The symmetric matrix reconstructed from the lower triangular entries. """ D_hat = np.zeros((n_samples, n_samples)) D_hat[np.tril_indices(n_samples,-1)] = disp_flat D_hat = D_hat + D_hat.transpose() return D_hat distances = cdist(positions, positions, metric = 'euclidean') n_samples = distances.shape[0] if mds_type is None or mds_type == 'absolute': disp_hat = disparities elif mds_type == 'ratio': # Get n * (n-1) / 2 unique distances and disparities distances_flat = distances[np.tril_indices(len(distances),-1)] disparities_flat = disparities[np.tril_indices(len(disparities),-1)] lr = LinearRegression(positive=True, fit_intercept=False) X = disparities_flat.reshape(-1,1) y = distances_flat.reshape(-1,1) lr = lr.fit(X,y) disp_hat = lr.predict(X).reshape(-1) disp_hat = normalize_dhat(disp_hat, n_samples) # Rebuild matrix of (fitted) disparities disp_hat = rebuild_matrix(disp_hat, n_samples) elif mds_type == 'interval': distances_flat = distances[np.tril_indices(len(distances),-1)] disparities_flat = disparities[np.tril_indices(len(disparities),-1)] lr = LinearRegression(positive=True) X = disparities_flat.reshape(-1,1) y = distances_flat.reshape(-1,1) lr = lr.fit(X,y) disp_hat = lr.predict(X).reshape(-1) disp_hat = normalize_dhat(disp_hat, n_samples) # Rebuild matrix of (fitted) disparities disp_hat = rebuild_matrix(disp_hat, n_samples) elif mds_type == 'ordinal': ir = IsotonicRegression() distances_flat = distances[np.tril_indices(len(distances),-1)] disparities_flat = disparities[np.tril_indices(len(disparities),-1)] disp_hat = ir.fit_transform(X = disparities_flat, y = distances_flat) disp_hat = disp_hat.reshape(-1) disp_hat = normalize_dhat(disp_hat, n_samples) # Rebuild matrix of (fitted) disparities disp_hat = rebuild_matrix(disp_hat, n_samples) else: raise ValueError("Unknown MDS type {0}!".format(mds_type)) if compute_grad: grad = _normalized_stress_gradient(positions, distances, disp_hat) else: grad = None if compute_error: dis = np.triu(distances) disp = np.triu(disp_hat) stress = np.sqrt(np.sum((disp - dis)**2)/np.sum(dis**2)) else: stress = None return stress, grad
[docs] @jit(nopython=True) def _normalized_stress_gradient( positions, distances, disparities): """Calculate the gradient of the normalized stress function for MDS. Parameters ---------- positions : np.array of shape (n_samples, n_dims) The estimated positions in the low-dimensional space. distances : np.array of shape (n_samples, n_samples) The Euclidean distances among the estimated positions. disparities : np.array of shape (n_samples, n_samples) The input disparities (or distance) matrix. Returns ------- np.array of shape (n_samples, n_dims) The gradient of the stress function with respect to the positions. """ n_samples = distances.shape[0] n_dims = positions.shape[1] gradient = np.zeros(shape = (n_samples, n_dims)) for i in range(n_samples): for l in range(n_dims): grad_il = 0 for j in range(n_samples): if j != i: dist_ij = distances[i,j] if dist_ij == 0: dist_ij += EPSILON grad_il -= (1- disparities[i,j] / dist_ij) * (positions[j,l] - positions[i,l]) gradient[i,l] = grad_il return gradient / (n_samples - 1)