"""
Useful transformation for data pre-processing.
"""
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
[docs]def diss2sim(diss_mat, transformation = 'inverse', eps = 1e-3):
""" Transform a dissimilarity matrix to a similarity matrix
Parameters
----------
diss_mat : ndarray of shape (n_samples, n_samples)
Matrix of pairwise dissimilarities.
transformation : str, optional
Transformation function, either 'inverse' or 'mirror', by default 'inverse'
eps : float, optional
Incremental constant to avoid division by zero, by default 1e-3
Returns
-------
ndarray of shape (n_samples, n_samples)
Matrix of pairwise similarities.
"""
if transformation == "inverse":
sim_mat = 1/(1+diss_mat)
elif transformation == "mirror":
# Normalize dissimilarities to [0,1), than mirror it
diss_mat = diss_mat / (np.max(diss_mat)+eps)
sim_mat = 1 - diss_mat
else:
raise ValueError('Unknown transformation type')
np.fill_diagonal(sim_mat, 1)
return sim_mat
[docs]def sim2diss(sim_mat, transformation = 'inverse', eps = 1e-4):
""" Transform a similarity matrix to a dissimilarity matrix.
Parameters
----------
sim_mat : ndarray of shape (n_samples, n_samples)
Matrix of pairwise similarities
transformation : str, optional
Transformation function, either 'inverse' or 'mirror', by default 'inverse'
eps : float, optional
Incremental constant to avoid division by zero, by default 1e-3
Returns
-------
ndaray of shape (n_samples, n_samples)
Matrix of pairwise dissimilarities.
"""
if transformation == 'inverse':
sim_mat = np.maximum(sim_mat, eps)
diss_mat = (1/sim_mat)
elif transformation == 'mirror':
# Normalize similarities to [0,1]
if np.max(sim_mat)>1:
sim_mat = sim_mat / (np.max(sim_mat)+eps)
diss_mat = 1 - sim_mat
np.fill_diagonal(diss_mat, 0)
return diss_mat
[docs]def coocc2sim(coocc_mat):
""" Transform a matrix with co-occurrence counts to a similarity matrix.
Parameters
----------
coocc_mat : ndarray of shape (n_samples, n_samples)
Matrix of co-occurrence counts.
Returns
-------
ndarray of shape (n_samples, n_samples)
Matrix of pairwise similarities.
"""
np.fill_diagonal(coocc_mat,0)
sim_mat = coocc_mat / np.sum(coocc_mat, axis = 1).reshape((-1,1))
# Make symmetric:
n = sim_mat.shape[0]
i_upper = np.triu_indices(n, 1)
sim_mat[i_upper] = sim_mat.T[i_upper]
return sim_mat
[docs]def edgelist2matrix(df, score_var, id_var_i, id_var_j, time_var = None, time_selected = None):
""" Transform an edgelist to a relationship matrix.
Parameters
----------
df : DataFrame
Data containing the edgelist. Each row should include a pair. Needs to include
two id variables and a score variable. Can also include a time variable.
score_var : string
The score variable.
id_var_i : string
The first id variable.
id_var_j : string
The second id variable.
time_var : string, optional
The time variable (int), by default None
time_selected : int, optional
The selected time, by default None
Returns
-------
S: ndarray of shape (n_samples, n_samples)
A matrix of pairwise relationships.
ids: ndarray of shape (n_samles, )
Identifiers for each element of the matrix.
"""
if not time_var is None:
df = df[df[time_var] == time_selected]
ids = np.unique(np.concatenate([df[id_var_i], df[id_var_j]], axis = 0))
ids = list(ids)
n = len(ids)
df = df[(df[id_var_i].isin(ids)) & (df[id_var_j].isin(ids))]
row = [ids.index(id) for id in df[id_var_i]]
col = [ids.index(id) for id in df[id_var_j]]
scores = list(df[score_var])
S = coo_matrix((scores, (row, col)), shape=(n, n))
S = S.toarray()
S = np.nan_to_num(S, 0)
return S, np.array(ids)
[docs]def edgelist2matrices(df, score_var, id_var_i, id_var_j, time_var):
"""Transform a time-indexed edgelist to a sequence of relationship matrices.
Parameters
----------
df : DataFrame
Data containing the edgelist. Each row should include a pair. Needs to include
two id variables, a score variable, and a time variable.
score_var : string
The score variable.
id_var_i : string
The first id variable.
id_var_j : string
The second id variable.
time_var : string
The time variable (int)
Returns
-------
S_t: list of ndarrays of shape (n_samples, n_samples) with length (n_periods)
A sequence of relationship matrices.
ids_t: ndarray of shape (n_samles, )
Identifiers for each element of the matrix.
"""
df = df.sort_values(by = time_var)
periods = df[time_var].unique()
S_t = []
ids_t = []
for period in periods:
data_t = df[df[time_var] == period]
S, ids = edgelist2matrix(data_t,
score_var = score_var,
id_var_i = id_var_i,
id_var_j = id_var_j)
S_t.append(S)
ids_t.append(ids)
return S_t, ids_t
[docs]def normalize_diss_mat(D):
max_diss = np.max(D)
D_norm = D / max_diss
return D_norm
[docs]def normalize_diss_mats(D_ts):
""" Normalize a sequence of dissimilarity matrices by a common factor
(the max. dissimilarity within the sequence).
Parameters
----------
D_ts : list of ndarrays, each of shape (n_samples, n_samples)
Sequence of dissimilarity matrices.
Returns
-------
D_ts: ndarray of shape (n_samples, n_samples)
Sequence of dissimilarity matrices, normalized by the maximum dissimilarity within
the input sequence.
"""
n_periods = len(D_ts)
max_diss = - np.inf
for t in range(n_periods):
max_diss_t = np.max(D_ts[t])
if max_diss_t > max_diss:
max_diss = max_diss_t
for t in range(n_periods):
D_ts[t] = D_ts[t] / max_diss
return D_ts
[docs]def expand_matrices(Xts, names_t):
""" Exand list of similarity matrices to equal shape and calculate inclusion vectors.
Args:
Returns:
(list, list, list): list of similarity matrices (equal size), list of inclusion vectors (0/1) and list of all labels.
"""
# Step 1: Construct a big similarity matrix
# Get unique list while preserving the order
# careful: list(set(list)) does NOT produce a stable ordering (across runs / seeds)!
all_labels = [[label for label in names] for names in names_t]
all_labels = [item for sublist in all_labels for item in sublist]
seen = set()
seen_add = seen.add
all_labels = [label for label in all_labels if not (label in seen or seen_add(label))]
n_periods = len(Xts)
S_ts = [] # similarity matrices
Inc_ts = [] # inclusion vectors (initialized with -1)
for t in range(n_periods):
Inc_ts.append(np.ones(len(all_labels), dtype = int) * -1)
for t in range(n_periods):
# Step 2: Fill it for each period
S_t = pd.DataFrame(data = np.zeros((len(all_labels), len(all_labels))), index = all_labels, columns = all_labels)
labels_t = names_t[t]
S_t.loc[labels_t,labels_t] = Xts[t]
Inc_t = [1*(label in labels_t) for label in all_labels]
S_ts.append(S_t)
Inc_ts[t][:] = Inc_t
S_ts = [S_t.values for S_t in S_ts]
return S_ts, Inc_ts, all_labels
[docs]def calc_distances(X, metric = 'euclidean'):
""" Caluclate matrix of pairwise distances among the rows of an input
matrix.
Parameters
----------
X : ndarray of shape (n_samples, n_dims)
Input matrix.
metric: string
The distance metric to use. Can be any of
'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation',
'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon',
'kulsinski', 'kulczynski1', 'mahalanobis', 'matching', 'minkowski',
'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
'sokalsneath', 'sqeuclidean', 'yule'.
Returns:
ndarray of shape (n_samples, n_samples): Matrix of pairwise distances.
"""
from scipy.spatial.distance import squareform, pdist
dist_mat = squareform(pdist(X, metric = metric))
return dist_mat