# Source code for evomap.preprocessing

"""
Useful transformations for data pre-processing.
"""

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

def diss2sim(diss_mat, transformation = 'inverse', eps = 1e-3):
    """ Convert pairwise dissimilarities into pairwise similarities.

    Parameters
    ----------
    diss_mat : ndarray of shape (n_samples, n_samples)
        Matrix of pairwise dissimilarities.
    transformation : str, optional
        Either 'inverse' (``1 / (1 + d)``) or 'mirror' (``1 - d / (max(d) + eps)``),
        by default 'inverse'
    eps : float, optional
        Constant added to the maximum dissimilarity in the 'mirror'
        transformation, by default 1e-3. Not used by 'inverse'.

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        Matrix of pairwise similarities with ones on the diagonal.

    Raises
    ------
    ValueError
        If ``transformation`` is neither 'inverse' nor 'mirror'.
    """

    if transformation not in ("inverse", "mirror"):
        raise ValueError('Unknown transformation type')

    if transformation == "mirror":
        # Scale dissimilarities into [0, 1), then flip them around 1.
        scale = np.max(diss_mat) + eps
        similarities = 1 - diss_mat / scale
    else:
        similarities = 1 / (1 + diss_mat)

    # Every object is maximally similar to itself.
    np.fill_diagonal(similarities, 1)
    return similarities

def sim2diss(sim_mat, transformation = 'inverse', eps = 1e-4):
    """ Transform a similarity matrix to a dissimilarity matrix.

    Parameters
    ----------
    sim_mat : array-like of shape (n_samples, n_samples)
        Matrix of pairwise similarities. The input is not modified.
    transformation : str, optional
        Transformation function, either 'inverse' or 'mirror', by default 'inverse'
    eps : float, optional
        For 'inverse': lower bound applied to the similarities to avoid
        division by zero. For 'mirror': constant added to the maximum
        similarity when rescaling. By default 1e-4

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        Matrix of pairwise dissimilarities with zeros on the diagonal.

    Raises
    ------
    ValueError
        If ``transformation`` is neither 'inverse' nor 'mirror'.
    """

    sim_mat = np.asarray(sim_mat)

    if transformation == 'inverse':
        # Clip from below so that zero similarities do not divide by zero.
        diss_mat = 1 / np.maximum(sim_mat, eps)

    elif transformation == 'mirror':
        # Rescale only if the similarities exceed 1; values already within
        # [0, 1] are mirrored as they are.
        max_sim = np.max(sim_mat)
        if max_sim > 1:
            sim_mat = sim_mat / (max_sim + eps)

        diss_mat = 1 - sim_mat

    else:
        # Mirror diss2sim: fail loudly instead of hitting an unbound local.
        raise ValueError('Unknown transformation type')

    np.fill_diagonal(diss_mat, 0)
    return diss_mat

def coocc2sim(coocc_mat):
    """ Transform a matrix with co-occurrence counts to a similarity matrix.

    Each row of off-diagonal counts is divided by its row sum. The result is
    then made symmetric by copying the lower triangle onto the upper triangle.

    Parameters
    ----------
    coocc_mat : array-like of shape (n_samples, n_samples)
        Matrix of co-occurrence counts. The input is not modified.

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        Symmetric matrix of pairwise similarities with a zero diagonal.
        Rows without any off-diagonal co-occurrence yield zeros.
    """

    # Work on a float copy so the caller's matrix keeps its diagonal.
    counts = np.array(coocc_mat, dtype = float)
    np.fill_diagonal(counts, 0)

    row_totals = counts.sum(axis = 1, keepdims = True)
    # Skip rows whose total is zero; they stay at 0 instead of becoming NaN.
    sim_mat = np.divide(counts, row_totals,
                        out = np.zeros_like(counts),
                        where = row_totals != 0)

    # Make symmetric: the lower triangle defines both halves.
    lower = np.tril(sim_mat, k = -1)
    return lower + lower.T

def edgelist2matrix(df, score_var, id_var_i, id_var_j, time_var = None, time_selected = None):
    """ Transform an edgelist to a relationship matrix.

    Parameters
    ----------
    df : DataFrame
        Data containing the edgelist. Each row should include a pair. Needs to include
        two id variables and a score variable. Can also include a time variable.
    score_var : string
        The score variable.
    id_var_i : string
        The first id variable.
    id_var_j : string
        The second id variable.
    time_var : string, optional
        The time variable (int), by default None. If given, only rows whose
        time equals ``time_selected`` are used.
    time_selected : int, optional
        The selected time, by default None

    Returns
    -------
    S: ndarray of shape (n_samples, n_samples)
        A matrix of pairwise relationships. Entry (i, j) holds the score of
        the pair (ids[i], ids[j]); scores of repeated pairs are summed,
        missing pairs and NaN scores are 0.

    ids: ndarray of shape (n_samples, )
        Sorted unique identifiers for each element of the matrix.
    """

    if time_var is not None:
        df = df[df[time_var] == time_selected]

    # Stack both id columns, then let np.unique return the sorted ids and,
    # for every endpoint, its position among them (one vectorized pass
    # instead of a list.index() lookup per edge).
    n_edges = len(df)
    endpoints = np.concatenate([df[id_var_i], df[id_var_j]], axis = 0)
    ids, positions = np.unique(endpoints, return_inverse = True)
    positions = positions.reshape(-1)
    row = positions[:n_edges]
    col = positions[n_edges:]
    n = len(ids)

    scores = list(df[score_var])

    S = coo_matrix((scores, (row, col)), shape=(n, n)).toarray()
    # The second positional argument of nan_to_num is `copy`, so spell out
    # both arguments by keyword.
    S = np.nan_to_num(S, copy = False, nan = 0.0)
    return S, np.array(list(ids))

def edgelist2matrices(df, score_var, id_var_i, id_var_j, time_var):
    """Transform a time-indexed edgelist to a sequence of relationship matrices.

    The edgelist is split by the distinct values of the time variable, taken
    in ascending order, and each slice is converted with ``edgelist2matrix``.

    Parameters
    ----------
    df : DataFrame
        Data containing the edgelist. Each row should include a pair. Needs to include
        two id variables, a score variable, and a time variable.
    score_var : string
        The score variable.
    id_var_i : string
        The first id variable.
    id_var_j : string
        The second id variable.
    time_var : string
        The time variable (int)

    Returns
    -------
    S_t: list of ndarrays, one per period
        A sequence of relationship matrices.

    ids_t: list of ndarrays, one per period
        Identifiers for the rows/columns of the matching matrix in ``S_t``.
    """
    ordered = df.sort_values(by = time_var)

    S_t, ids_t = [], []
    for period in ordered[time_var].unique():
        snapshot = ordered[ordered[time_var] == period]
        matrix, labels = edgelist2matrix(
            snapshot,
            score_var = score_var,
            id_var_i = id_var_i,
            id_var_j = id_var_j)
        S_t.append(matrix)
        ids_t.append(labels)

    return S_t, ids_t

def normalize_diss_mat(D):
    """ Normalize a dissimilarity matrix by its largest entry.

    Parameters
    ----------
    D : ndarray of shape (n_samples, n_samples)
        Matrix of pairwise dissimilarities.

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        The input divided by its maximum value.
    """
    return D / np.max(D)

def normalize_diss_mats(D_ts):
    """ Normalize a sequence of dissimilarity matrices by a common factor
    (the max. dissimilarity within the sequence).

    Parameters
    ----------
    D_ts : list of ndarrays, each of shape (n_samples, n_samples)
        Sequence of dissimilarity matrices. The input sequence and its
        matrices are left unchanged.

    Returns
    -------
    list of ndarrays, each of shape (n_samples, n_samples)
        New list with every matrix divided by the maximum dissimilarity
        found anywhere in the input sequence. Empty input gives an empty list.
    """
    max_diss = - np.inf
    for D_t in D_ts:
        max_diss = max(max_diss, np.max(D_t))

    # Build a fresh list rather than assigning back into D_ts: writing into
    # the caller's container mutates their data and, for an integer ndarray
    # stack, truncates the normalized values to integers.
    return [D_t / max_diss for D_t in D_ts]


def expand_matrices(Xts, names_t):
    """ Expand a list of similarity matrices to equal shape and calculate inclusion vectors.

    Parameters
    ----------
    Xts : list of ndarrays
        One square matrix per period; the matrix for period t is indexed by
        the labels in ``names_t[t]``.
    names_t : list of sequences
        Labels of the objects contained in each period's matrix.

    Returns
    -------
    S_ts : list of ndarrays
        One matrix per period, each indexed by all labels. Cells for labels
        not present in a period are zero.
    Inc_ts : list of ndarrays
        One 0/1 integer vector per period marking which labels are present.
    all_labels : list
        All labels across periods, in order of first appearance.
    """

    # De-duplicate while keeping first-seen order (dict keys keep insertion
    # order, whereas list(set(...)) would not give a stable ordering).
    all_labels = list(dict.fromkeys(
        label for names in names_t for label in names))
    n_labels = len(all_labels)

    S_ts = []    # expanded matrices
    Inc_ts = []  # inclusion vectors
    for t, X_t in enumerate(Xts):
        labels_t = names_t[t]

        # Zero-filled frame over all labels; drop this period's block into it.
        frame = pd.DataFrame(data = np.zeros((n_labels, n_labels)),
                             index = all_labels, columns = all_labels)
        frame.loc[labels_t, labels_t] = X_t
        S_ts.append(frame.values)

        present = [1*(label in labels_t) for label in all_labels]
        Inc_ts.append(np.array(present, dtype = int))

    return S_ts, Inc_ts, all_labels

def calc_distances(X, metric = 'euclidean'):
    """ Calculate the matrix of pairwise distances among the rows of an input
    matrix.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_dims)
        Input matrix.

    metric: string
        The distance metric to use, passed on to ``scipy.spatial.distance.pdist``.
        Can be any of
        'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
        'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
        'kulsinski', 'kulczynski1', 'mahalanobis', 'matching', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule'.

    Returns
    -------
    ndarray of shape (n_samples, n_samples)
        Matrix of pairwise distances.
    """

    from scipy.spatial.distance import squareform, pdist

    # pdist returns the condensed (upper-triangle) form; expand it to square.
    condensed = pdist(X, metric = metric)
    return squareform(condensed)
# evomap documentation

# Source code for evomap.preprocessing