Source code for evomap.mapping._sammon

"""
Nonlinear Sammon Mapping, as proposed in: 

Sammon, J. W. (1969). A nonlinear mapping for data structure analysis. IEEE Transactions on computers, 100(5), 401-409.
"""
import numpy as np
from scipy.spatial.distance import cdist
from numba import jit
from ._optim import gradient_descent_line_search
import inspect


[docs]
class Sammon():

    def __init__(
        self,
        n_dims = 2, 
        n_iter = 2000,  
        n_iter_check = 50,
        init = None, 
        verbose = 0, 
        input_type = 'distance', 
        max_halves = 5, 
        tol = 1e-3,  
        n_inits = 1, 
        step_size = 1
    ):

        self.n_dims = n_dims
        self.n_iter = n_iter
        self.n_iter_check = n_iter_check
        self.init = init
        self.verbose = verbose
        self.input_type = input_type
        self.max_halves = max_halves
        self.tol = tol
        self.n_inits = n_inits
        self.step_size = step_size
        self.method_str = "SAMMON"


[docs]
    def __str__(self):
        """Return a string representation of the Sammon instance with input_type and user-modified parameters."""
        # Get the signature of the __init__ method
        signature = inspect.signature(self.__init__)
        
        # Get the default values of the parameters from the __init__ method
        defaults = {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty}
        
        modified_params = []
        
        # Always show the 'input_type' parameter
        result = f"Sammon(input_type={self.input_type}"
        
        # Check if any current attributes differ from their default values and append them
        for param, default_value in defaults.items():
            current_value = getattr(self, param)
            if current_value != default_value and param != "input_type":
                modified_params.append(f"{param}={current_value}")
        
        # If there are modified parameters, append them to the result
        if modified_params:
            result += ", " + ", ".join(modified_params)
        
        result += ")"
        return result



[docs]
    def fit(self, X):
        """Fit the Sammon model to the input data.

        Parameters
        ----------
        X : np.array of shape (n_samples, n_features) or (n_samples, n_samples)
            The input data. If `input_type` is 'vector', `X` should be the feature 
            vectors of the samples. If `input_type` is 'distance', `X` should be 
            the pairwise distance matrix.

        Returns
        -------
        self : object
            Returns the instance of the Sammon class with the configuration matrix 
            `Y_` stored as an attribute.
        """
        self.fit_transform(X)
        return self



[docs]
    def fit_transform(self, X):
        """Fit the Sammon mapping model and return the transformed coordinates.

        Parameters
        ----------
        X : np.array of shape (n_samples, n_features) or (n_samples, n_samples)
            The input data. If `input_type` is 'vector', `X` should be the feature 
            vectors of the samples. If `input_type` is 'distance', `X` should be 
            the pairwise distance matrix.

        Returns
        -------
        np.array of shape (n_samples, n_dims)
            The transformed coordinates of the samples in the reduced-dimensional space.
        
        Raises
        ------
        ValueError
            If the `input_type` is not 'distance' or 'vector'.
        """
        if self.input_type == 'distance':
            # Note: When using sammon with pre-computed distances, note that
            # optimzation can sometimes be tricky (gradient norm increases 
            # inverse proportionally to distance size. Thus, very small distances
            # can let the gradient explode). 
            D = X
        elif self.input_type == 'vector':
            D = cdist(X,X)
        else:
            raise ValueError("Input type should be 'distance' or 'vector', not {0}".format(self.input_type))

        # Validate input
        D = _check_prepare_input_sammon(D)

        n_samples = D.shape[0]
        n_dims = self.n_dims

        # If initialization is provided, use it and run model (once).
        # Else, try n_init different initializiations
        if not self.init is None:
            self.n_inits = 1

        best_cost = np.inf
        for i in range(self.n_inits):
            if self.verbose > 0:
                print("[{0}] Initialization {1}/{2}".format(self.method_str,i+1, self.n_inits))

            if self.init is None:
                init = np.random.normal(0,1,(n_samples, n_dims))
            else:
                init = self.init

            # Set gradient descent arguments
            opt_args = {
                'init': init,
                'method_str': self.method_str,
                'n_iter': self.n_iter,
                'n_iter_check': self.n_iter_check,
                'step_size': self.step_size,
                'max_halves': self.max_halves,
                'min_grad_norm': self.tol,
                'verbose': self.verbose,
                'args': [D]
            }

            Y, cost = gradient_descent_line_search(_sammon_stress_function, **opt_args)

            if cost < best_cost:
                self.Y_ = Y
                self.cost_ = cost
                best_cost = cost

        return self.Y_




[docs]
def _check_prepare_input_sammon(D):
    """Check and prepare the input distance matrix for Sammon Mapping.

    This function ensures that the input distance matrix is valid for Sammon Mapping.
    It checks for strictly positive off-diagonal dissimilarities and adds a small 
    diagonal correction if necessary.

    Parameters
    ----------
    D : ndarray of shape (n_samples, n_samples)
        Input distance matrix. Should be a square, symmetric matrix representing 
        pairwise dissimilarities between samples.

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        Prepared distance matrix with a diagonal correction applied.

    Raises
    ------
    ValueError
        If any off-diagonal entries in the distance matrix are non-positive.
    """
    n_samples = D[0].shape[0]
    D = D + np.eye(n_samples)     
    if np.count_nonzero(D<=0) > 0:
        raise ValueError("Off-diagonal dissimilarities must be strictly positive")  
    return D



[docs]
def _sammon_stress_function(positions, disparities, compute_error = True, compute_grad = True):
    """Compute the Sammon stress function and its gradient.

    The Sammon stress function measures the discrepancy between the input distances 
    (disparities) and the distances among the estimated positions in the reduced space. 
    Optionally, it can also compute the gradient of the stress function for optimization purposes.

    Parameters
    ----------
    positions : ndarray of shape (n_samples, n_dims)
        The estimated positions of the samples in the reduced-dimensional space.
    disparities : ndarray of shape (n_samples, n_samples)
        The input distance (or dissimilarity) matrix.
    compute_error : bool, optional
        Whether to compute the stress (error) value, by default True.
    compute_grad : bool, optional
        Whether to compute the gradient of the stress function, by default True.

    Returns
    -------
    float or None
        The Sammon stress value (cost), or None if `compute_error` is False.
    ndarray or None
        The gradient of the stress function, or None if `compute_grad` is False.
    """
    distances = cdist(positions, positions, metric = 'euclidean')
    if compute_error:    

        n_samples = positions.shape[0]
        D_map = distances.copy()
        D_map = D_map[np.triu_indices(n_samples, k = 1)]
        D_data = disparities[np.triu_indices(n_samples, k = 1)]

        delta = D_data - D_map
        D_data_inv = 1. / D_data
        cost = np.sum((delta**2) * D_data_inv) / np.sum(D_data)
    else:
        cost = None

    if compute_grad:
        grad = _sammon_stress_gradient(positions, distances, disparities)
        # Need to reshape gradient. Raveled for speed optim.
        grad = np.reshape(grad, (-1,positions.shape[1]),order='F')
    else:
        grad = None
        
    return cost, grad



[docs]
@jit(nopython=True)
def _sammon_stress_gradient(Y, D_map, D):
    """Compute the gradient of the Sammon stress function.

    Parameters
    ----------
    Y : ndarray of shape (n_samples, n_dims)
        The current positions of the samples in the reduced-dimensional space.
    D_map : ndarray of shape (n_samples, n_samples)
        The pairwise Euclidean distances between the estimated positions.
    D : ndarray of shape (n_samples, n_samples)
        The input distance (or dissimilarity) matrix.

    Returns
    -------
    ndarray of shape (n_samples * n_dims)
        The computed gradient of the Sammon stress function, flattened for optimization.
    """    
    n_samples = Y.shape[0]
    n_dims = Y.shape[1]
    D_map = D_map + np.eye(n_samples)
    D_map_inv = 1. / D_map
    D_inv = 1. / D

    delta = D_map_inv - D_inv

    one = np.ones((n_samples,n_dims))
    deltaone = np.dot(delta,one) # sum of distances, (reapated n_dims times)

    dY = np.dot(delta,Y) - (Y * deltaone)
    dinv3 = D_map_inv ** 3
    y2 = Y ** 2
    H = np.dot(dinv3,y2) - deltaone - 2*Y * np.dot(dinv3,Y) + y2 * np.dot(dinv3,one)
    iY = dY.transpose().flatten() / np.abs(H.transpose().flatten()) # Gradient / Hessian

    return iY