Source code for evomap.mapping._tsne

""" 
T-Distributed Stochastic Neighborhood Embedding, as propsoed in:

Van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE. Journal of machine learning research, 9(11).
"""
import numpy as np
from scipy.spatial.distance import cdist
from numba import jit
from ._optim import gradient_descent_with_momentum
import inspect

EPSILON = 1e-12

[docs] class TSNE(): def __init__( self, n_dims = 2, perplexity = 15, n_iter = 2000, stop_lying_iter = 250, early_exaggeration = 4, initial_momentum = .5, final_momentum = .8, eta = 'auto', n_iter_check = 50, init = None, verbose = 0, input_type = 'distance', max_halves = 5, tol = 1e-3, n_inits = 1, step_size = 1 ): self.n_dims = n_dims self.perplexity = perplexity self.n_iter = n_iter self.stop_lying_iter = stop_lying_iter self.early_exaggeration = early_exaggeration self.initial_momentum = initial_momentum self.final_momentum = final_momentum self.eta = eta self.n_iter_check = n_iter_check self.init = init self.verbose = verbose self.input_type = input_type self.max_halves = max_halves self.tol = tol self.n_inits = n_inits self.step_size = step_size self.method_str = "TSNE"
[docs] def __str__(self): """Return a string representation of the TSNE instance with key parameters and user-modified values.""" # Get the signature of the __init__ method signature = inspect.signature(self.__init__) # Get the default values of the parameters from the __init__ method defaults = {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty} # Always show key parameters result = f"TSNE(perplexity={self.perplexity}, input_type={self.input_type})" # Collect all attributes that differ from the default changed_attrs = [] for attr, default_val in defaults.items(): current_val = getattr(self, attr) if current_val != default_val: changed_attrs.append(f"{attr}={current_val}") # If any attributes were changed, append them to the result if changed_attrs: result += "\nUser-modified attributes: " + ", ".join(changed_attrs) return result
[docs] def fit(self, X): """Fit the TSNE model to the input data, without returning the transformed coordinates. Parameters ---------- X : np.array of shape (n_samples, n_features) or (n_samples, n_samples) The input data. If `input_type` is 'vector', `X` should be the feature vectors of the samples. If `input_type` is 'distance', `X` should be the pairwise distance matrix. Returns ------- self : object Returns the instance of the TSNE class with the configuration matrix `Y_` stored as an attribute. """ self.fit_transform(X) return self
[docs] def fit_transform(self, X): """Fit the TSNE model and return the transformed coordinates. Parameters ---------- X : np.array of shape (n_samples, n_features) or (n_samples, n_samples) The input data. If `input_type` is 'vector', `X` should be the feature vectors of the samples. If `input_type` is 'distance', `X` should be the pairwise distance matrix. Returns ------- np.array of shape (n_samples, n_dims) The transformed coordinates in the reduced-dimensional space. Raises ------ ValueError If the `input_type` is not 'distance' or 'vector'. """ if self.input_type == 'distance': D = X elif self.input_type == 'vector': D = cdist(X,X) else: raise ValueError("Input type should be 'distance' or 'vector', not {0}".format(self.input_type)) # Get P-Matrix if self.verbose > 1: print("[TSNE] Calculating P matrix ...") P = _check_prepare_tsne(self, X) n_samples = P.shape[0] n_dims = self.n_dims # Initialize if self.init is None: init = np.random.normal(0,1,(n_samples, n_dims)) else: init = self.init # Set optimization arguments for early Exaggeration opt_args = { 'init': init, 'method_str': self.method_str, 'n_iter': self.stop_lying_iter, 'n_iter_check': self.n_iter_check, 'eta': self.eta, 'momentum': self.initial_momentum, 'start_iter': 0, 'min_grad_norm': self.tol, 'verbose': self.verbose, 'kwargs': {'P' : P*self.early_exaggeration} } Y, cost = gradient_descent_with_momentum(_kl_divergence, **opt_args) opt_args.update({ 'init': Y, 'n_iter': self.n_iter, 'momentum': self.final_momentum, 'start_iter': self.stop_lying_iter, 'kwargs': {'P': P} }) Y, cost = gradient_descent_with_momentum(_kl_divergence, **opt_args) self.Y_ = Y self.cost_ = cost return self.Y_
[docs] def _calc_p_matrix(X, included, input_type, perplexity): """Calculate the probability matrix (P-matrix) for t-SNE. The P-matrix is a joint probability distribution over pairwise similarities. Depending on the input type, this function calculates the matrix from either feature vectors, distance matrices, or similarity matrices. It also handles cases where certain rows are excluded from the calculation. Parameters ---------- X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) The input data, which can either be feature vectors (if `input_type` is 'vector'), a distance matrix (if `input_type` is 'distance'), or a similarity matrix (if `input_type` is 'similarity'). included : ndarray of shape (n_samples,), optional A binary array (0/1) indicating whether each sample should be included in the P-matrix calculation. If None, all samples are included by default. input_type : str Specifies the type of input. Should be one of {'vector', 'distance', 'similarity'}. perplexity : float The desired perplexity, used for tuning the distribution of the P-matrix. Perplexity determines the effective number of neighbors for each point. Returns ------- P : ndarray of shape (n_samples, n_samples) The joint probability distribution matrix over pairwise similarities. Raises ------ AssertionError If any rows of the input matrix contain only zeros, which would indicate invalid data for the calculation. ValueError If the input type is not recognized or if the P-matrix contains invalid values. """ from evomap.mapping.evomap import _utils # Import here to avoid circular dependency when initializing the module n = X.shape[0] if included is None: included = np.ones(n) if input_type == 'vector': X = X[included == 1, :] D = cdist(X, X) P = _utils._binary_search_perplexity(D.astype(np.float32), perplexity, 0) for i in range(n): if included[i] == 0: P = np.insert(P, i, 0*np.ones((1,P.shape[1])), 0) P = np.insert(P, i, 0*np.ones((1, P.shape[0])), 1) elif input_type == 'similarity': assert np.all(np.sum(X, axis = 0) != 0), "Zero Row(s) in Input Matrix" P = X elif input_type == 'distance': assert np.all(np.sum(X, axis = 0) != 0), "Zero Row(s) in Input Matrix" D = X D = D[included == 1, :][:, included == 1] P = _utils._binary_search_perplexity(D.astype(np.float32), perplexity, 0) for i in range(n): if included[i] == 0: P = np.insert(P, i, 0*np.ones((1,P.shape[1])), 0) P = np.insert(P, i, 0*np.ones((1, P.shape[0])), 1) P = cond_to_joint(P) P = np.maximum(P, EPSILON) np.fill_diagonal(P,0) assert np.all(np.isfinite(P)), "All probabilities should be finite. \ Check if object at index {0} has sufficient non-zero \ neighbors.".format(np.where(np.isfinite(P) == False)[0][0]) assert np.all(P >= 0), "All probabilities should be non-negative" assert np.all(P <= 1), ("All probabilities should be less than or equal \ to one") return P
[docs] def _check_prepare_tsne(model, X): """Check and prepare the input data for t-SNE. This function validates and prepares the input data for t-SNE by calculating the appropriate learning rate (`eta`) and generating the P-matrix based on the input data. Parameters ---------- model : TSNE or EvoMap The t-SNE or EvoMap model instance. The function will check and set the learning rate (`eta`) and other parameters from this model. X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) The input data, which can be feature vectors or a distance matrix. Returns ------- P : ndarray of shape (n_samples, n_samples) The prepared P-matrix representing pairwise similarities. Raises ------ ValueError If the learning rate (`eta`) is invalid. """ n_samples = X.shape[0] if model.eta == "auto": model.eta = n_samples / model.early_exaggeration model.eta = np.maximum(model.eta, 50) else: if not (model.eta > 0): raise ValueError("learning_rate 'eta' must be a positive number or 'auto'.") P = _calc_p_matrix(X, included = None, input_type = model.input_type, perplexity = model.perplexity) return P
[docs] @jit(nopython=True) def sqeuclidean_dist(Y): """Calculate the pairwise squared Euclidean distance matrix. Parameters ---------- Y : np.ndarray of shape (n_samples, n_dims) The coordinates of the points in the low-dimensional space. Returns ------- D : np.ndarray of shape (n_samples, n_samples) The squared Euclidean distance matrix. """ n = Y.shape[0] d = Y.shape[1] D = np.zeros((n,n)) for i in range(n): for j in range(n): for k in range(d): D[i,j] += (Y[i,k] - Y[j,k])**2 return D
[docs] @jit(nopython=True) def calc_q_matrix(Y, inclusions): """Calculate the Q-matrix of joint probabilities in low-dimensional space. The Q-matrix represents the joint probabilities in the low-dimensional space based on the pairwise Euclidean distances between points. A small constant is added to avoid division by zero. The method also allows excluding certain points from the calculation. Parameters ---------- Y : np.ndarray of shape (n_samples, n_dims) Array of map coordinates in the low-dimensional space. inclusions : np.ndarray of shape (n_samples,), optional A binary array where 1 indicates the point is included and 0 indicates the point is excluded from the probability matrix. If None, all points are included by default. Returns ------- Q : np.ndarray of shape (n_samples, n_samples) The joint probability matrix in the low-dimensional space. dist : np.ndarray of shape (n_samples, n_samples) The squared Euclidean distance matrix used to compute Q. """ n = Y.shape[0] if inclusions is None: inclusions = np.ones(n) dist = sqeuclidean_dist(Y) dist += 1 dist **= -1 # Mask non-included objects for idx in np.where(inclusions == 0)[0]: dist[idx, :] = 0 dist[:, idx] = 0 sum_dist = np.sum(dist) Q = np.maximum(dist/sum_dist, EPSILON) # Also return dist, to avoid recomputing it for the gradient return Q, dist
[docs] def _kl_divergence(Y, P, compute_error = True, compute_grad = True): """Calculate the KL-divergence between high-dimensional and low-dimensional joint probabilities. This function computes the Kullback-Leibler (KL) divergence between the joint probability distribution in the high-dimensional space (P) and the low-dimensional space (Q). Optionally, it also computes the gradient of the KL-divergence with respect to the low-dimensional coordinates. Parameters ---------- Y : np.ndarray of shape (n_samples, n_dims) Array of map coordinates in the low-dimensional space. P : np.ndarray of shape (n_samples, n_samples) The joint probability matrix in the high-dimensional space. compute_error : bool, optional Whether to compute the KL-divergence value, by default True. compute_grad : bool, optional Whether to compute the gradient of the KL-divergence, by default True. Returns ------- error : float or None The KL-divergence value, or None if `compute_error` is False. grad : np.ndarray or None The gradient of the KL-divergence, or None if `compute_grad` is False. """ Q, dist = calc_q_matrix(Y, None) if compute_error: n = P.shape[0] mask = np.eye(n, dtype = 'bool') p = P[~mask] q = Q[~mask] error = np.sum(p*np.log(p/q)) else: error = None if compute_grad: grad = _kl_divergence_grad(Y, P, Q, dist) else: grad = None return error, grad
[docs] @jit(nopython=True) def _kl_divergence_grad(Y, P, Q, dist): """Calculate the gradient of the KL-divergence with respect to the low-dimensional coordinates. Parameters ---------- Y : np.ndarray of shape (n_samples, n_dims) Array of map coordinates in the low-dimensional space. P : np.ndarray of shape (n_samples, n_samples) The joint probability matrix in the high-dimensional space. Q : np.ndarray of shape (n_samples, n_samples) The joint probability matrix in the low-dimensional space. dist : np.ndarray of shape (n_samples, n_samples) The squared Euclidean distance matrix in the low-dimensional space. Returns ------- dY : np.ndarray of shape (n_samples, n_dims) The gradient of the KL-divergence with respect to the map coordinates. """ # Gradient: dC/dY (n_samples, n_dims) = Y.shape dY = np.zeros((n_samples, n_dims), dtype = np.float32) PQd = (P - Q) * dist for i in range(n_samples): dY[i] = np.dot(np.ravel(PQd[i]), Y[i] - Y) dY *= 4.0 return dY
[docs] @jit(nopython=True) def cond_to_joint(P): """Convert a conditional probability matrix to a symmetric joint probability matrix. This function takes an asymmetric conditional probability matrix (P) and converts it into a symmetric joint probability matrix by averaging the pairwise probabilities and normalizing the result. Parameters ---------- P : np.ndarray of shape (n_samples, n_samples) The conditional probability matrix. Returns ------- P : np.ndarray of shape (n_samples, n_samples) The symmetric joint probability matrix. """ np.fill_diagonal(P,0) P = P + P.T sum_P = np.maximum(np.sum(P), EPSILON) P = np.maximum(P / sum_P, EPSILON) return P