Source code for evomap.mapping._sammon
"""
Nonlinear Sammon Mapping, as proposed in:
Sammon, J. W. (1969). A nonlinear mapping for data structure analysis. IEEE Transactions on computers, 100(5), 401-409.
"""
import numpy as np
from scipy.spatial.distance import cdist
from numba import jit
from ._optim import gradient_descent_line_search
[docs]class Sammon():
def __init__(
self,
n_dims = 2,
n_iter = 2000,
n_iter_check = 50,
init = None,
verbose = 0,
input_type = 'distance',
max_halves = 5,
tol = 1e-3,
n_inits = 1,
step_size = 1
):
self.n_dims = n_dims
self.n_iter = n_iter
self.n_iter_check = n_iter_check
self.init = init
self.verbose = verbose
self.input_type = input_type
self.max_halves = max_halves
self.tol = tol
self.n_inits = n_inits
self.step_size = step_size
self.method_str = "SAMMON"
[docs] def fit_transform(self, X):
if self.input_type == 'distance':
# Note: When using sammon with pre-computed distances, note that
# optimzation can sometimes be tricky (gradient norm increases
# inverse proportionally to distance size. Thus, very small distances
# can let the gradient explode).
D = X
elif self.input_type == 'vector':
D = cdist(X,X)
else:
raise ValueError("Input type should be 'distance' or 'vector', not {0}".format(self.input_type))
# Validate input
D = _check_prepare_input_sammon(D)
n_samples = D.shape[0]
n_dims = self.n_dims
# If initialization is provided, use it and run model (once).
# Else, try n_init different initializiations
if not self.init is None:
self.n_inits = 1
best_cost = np.inf
for i in range(self.n_inits):
if self.verbose > 0:
print("[{0}] Initialization {1}/{2}".format(self.method_str,i+1, self.n_inits))
if self.init is None:
init = np.random.normal(0,1,(n_samples, n_dims))
else:
init = self.init
# Set gradient descent arguments
opt_args = {
'init': init,
'method_str': self.method_str,
'n_iter': self.n_iter,
'n_iter_check': self.n_iter_check,
'step_size': self.step_size,
'max_halves': self.max_halves,
'min_grad_norm': self.tol,
'verbose': self.verbose,
'args': [D]
}
Y, cost = gradient_descent_line_search(_sammon_stress_function, **opt_args)
if cost < best_cost:
self.Y_ = Y
self.cost_ = cost
best_cost = cost
return self.Y_
[docs]def _check_prepare_input_sammon(D):
""" Check and, if necessary, prepare data for Sammon Mapping.
Parameters
----------
D : ndarray of shape (n_samples, n_samples)
Input distance matrix.
Returns
------
ndarray of shape (n_samples, n_samples)
Prepared input data
"""
n_samples = D[0].shape[0]
D = D + np.eye(n_samples)
if np.count_nonzero(D<=0) > 0:
raise ValueError("Off-diagonal dissimilarities must be strictly positive")
return D
[docs]def _sammon_stress_function(positions, disparities, compute_error = True, compute_grad = True):
distances = cdist(positions, positions, metric = 'euclidean')
if compute_error:
n_samples = positions.shape[0]
D_map = distances.copy()
D_map = D_map[np.triu_indices(n_samples, k = 1)]
D_data = disparities[np.triu_indices(n_samples, k = 1)]
delta = D_data - D_map
D_data_inv = 1. / D_data
cost = np.sum((delta**2) * D_data_inv) / np.sum(D_data)
else:
cost = None
if compute_grad:
grad = _sammon_stress_gradient(positions, distances, disparities)
# Need to reshape gradient. Raveled for speed optim.
grad = np.reshape(grad, (-1,positions.shape[1]),order='F')
else:
grad = None
return cost, grad
[docs]@jit(nopython=True)
def _sammon_stress_gradient(Y, D_map, D):
n_samples = Y.shape[0]
n_dims = Y.shape[1]
D_map = D_map + np.eye(n_samples)
D_map_inv = 1. / D_map
D_inv = 1. / D
delta = D_map_inv - D_inv
one = np.ones((n_samples,n_dims))
deltaone = np.dot(delta,one) # sum of distances, (reapated n_dims times)
dY = np.dot(delta,Y) - (Y * deltaone)
dinv3 = D_map_inv ** 3
y2 = Y ** 2
H = np.dot(dinv3,y2) - deltaone - 2*Y * np.dot(dinv3,Y) + y2 * np.dot(dinv3,one)
iY = dY.transpose().flatten() / np.abs(H.transpose().flatten()) # Gradient / Hessian
return iY