Module mudcod.community_detection
Expand source code
import warnings
from copy import deepcopy
import numpy as np
from numpy.linalg import eigvals, inv, svd
from scipy.linalg import sqrtm
from scipy.sparse.linalg import eigs
from scipy.special import comb
from sklearn.cluster import KMeans
from mudcod.network_distances import Distances
_eps = 10 ** (-10)
CONVERGENCE_CRITERIA = 10 ** (-5)
warnings.filterwarnings(action="ignore", category=np.ComplexWarning)
class CommunityDetectionMixin:
def __init__(self, method, verbose=False):
self.verbose = verbose
self.method = method
assert isinstance(method, str) and method.lower() in [
"pisces",
"mudcod",
"staticspectralcod",
]
self._embeddings = None
self._model_order_K = None
@property
def embeddings(self):
if self._embeddings is None:
raise ValueError("Embeddings are not computed yet, run 'fit' first.")
return self._embeddings
@property
def model_order_K(self):
if self._model_order_K is None:
raise ValueError("Model order K is not computed yet, run 'fit' first.")
return self._model_order_K
@embeddings.setter
def embeddings(self, value):
if not isinstance(value, np.ndarray):
raise ValueError("Embeddings must be instance 'np.ndarray'.")
else:
self._embeddings = value
@model_order_K.setter
def model_order_K(self, value):
if self.method in ["PisCES", "MuDCoD"] and not isinstance(value, np.ndarray):
raise ValueError("Model order K must be an instance of 'np.ndarray'.")
elif self.method in ["StaticSpectralCoD"] and not isinstance(value, int):
raise ValueError("Model order K must be an instance of 'int'.")
else:
self._model_order_K = value
@staticmethod
def eigen_complete(adj, cv_idx, epsilon, k):
m = np.zeros(adj.shape)
while True:
adj_cv = (adj * (1 - cv_idx)) + (m * cv_idx)
u, s, vh = svd(adj_cv)
s[k:] = 0
m2 = u @ np.diag(s) @ vh
m2 = np.where(m2 < 0, 0, m2)
m2 = np.where(m2 > 1, 1, m2)
dn = np.sqrt(np.sum((m - m2) ** 2))
if dn < epsilon:
break
else:
m = m2
return m
@staticmethod
def choose_model_order_K(reprs, degrees, max_K, opt="empirical"):
"""
Predicts number of communities/modules in a network.
Parameters
----------
reprs : `np.ndarray` of shape (n, n)
Laplacianized adjacency matrix representation with dimension (n,n).
degrees : `np.ndarray` of shape (n)
Vector of vertex degrees, of length n.
max_K : `int`
Determines the maximum number of communities to predict.
opt : `string`, default='empirical'
Chooses the technique to estimate K, i.e., the number of communities;
one of 'null', 'empirical', or 'full'.
Returns
-------
model_order_pred : `int`
Number of modules to consider.
"""
opt_list = ["null", "empirical", "full"]
if opt == opt_list[2]:
model_order_pred = max_K - 1
else:
n = reprs.shape[0]
sorted_eigenvalues = np.sort(eigvals(np.eye(n) - reprs))
gaprw = np.diff(sorted_eigenvalues)[1:max_K]
assert isinstance(gaprw, np.ndarray) and gaprw.ndim == 1
if opt == opt_list[0]:
pin = np.sum(degrees) / comb(n, 2) / 2
threshold = 3.5 / pin ** (0.58) / n ** (1.15)
idx = np.nonzero(gaprw > threshold)[0]
if idx.shape[0] == 0:
model_order_pred = 1
else:
model_order_pred = np.max(idx) + 1
elif opt == opt_list[1]:
if (gaprw == 0).any():
idx = -2
else:
idx = -1
model_order_pred = np.argsort(gaprw, axis=0)[idx] + 1
else:
raise ValueError(
f"Unkown option {opt} is given for predicting number of communities.\n"
f"Use one of {opt_list}."
)
return int(model_order_pred) + 1
@staticmethod
def modularity(adj_test, adj_train, z_pred, cv_idx, resolution=1):
"""
Calculates modularity given imputed training adjacency matrix,
community membership predictions and the actual training matrix.
Parameters
----------
adj_test : `np.ndarray` of shape (n, n)
Test adjacency matrix of shape (n,n), corresponding to the actual network.
adj_train : `np.ndarray` of shape (n, n)
Training adjacency matrix of shape (n,n); whose edges are removed
and then imputed, where n is the number of vertices.
z_pred : np.ndarray of shape (n)
Predicted community membership label vector of size (n) of each
vertex to compute loss.
cv_idx : `np.ndarray` of shape (n, n)
A matrix of size (n,n), indicating the index of removed edges. Value
of the entry is 1 for test and 0 for training edges.
Returns
-------
modu : `float`
Modularity value computed based on the predicted community
memberships.
"""
modu = 0
np.fill_diagonal(cv_idx, 0)
d_out_train = np.sum(adj_train, axis=0)
d_in_train = np.sum(adj_train, axis=1)
m = np.sum(cv_idx * adj_test) / 2
norm = 1 / (np.sum(adj_train) / 2) ** 2
for i, j in zip(*np.nonzero(cv_idx.astype(bool))):
if (cv_idx[i, j] > 0) and (z_pred[i] == z_pred[j]) and (i != j):
c = z_pred[i]  # z_pred[i] == z_pred[j] is guaranteed by the check above
out_degree_sum = np.sum(d_out_train[z_pred == c])
in_degree_sum = np.sum(d_in_train[z_pred == c])
modu += adj_test[i, j] / m - (
(resolution * out_degree_sum * in_degree_sum * norm)
/ np.sum(cv_idx[z_pred == c][:, z_pred == c])
/ 2
)
return modu / 2
@staticmethod
def loglikelihood(adj_test, adj_train, z_pred, cv_idx):
"""
Calculates the log-likelihood given the imputed training adjacency matrix,
community membership predictions and the actual training matrix.
Parameters
----------
adj_test : `np.ndarray` of shape (n, n)
Test adjacency matrix of shape (n,n), corresponding to the actual network.
adj_train : `np.ndarray` of shape (n, n)
Training adjacency matrix of shape (n,n); whose edges are removed
and then imputed, where n is the number of vertices.
z_pred : np.ndarray of shape (n)
Predicted community membership label vector of size (n) of each
vertex to compute loss.
cv_idx : `np.ndarray` of shape (n, n)
A matrix of size (n,n), indicating the index of removed edges. Value
of the entry is 1 for test and 0 for training edges.
Returns
-------
logllh : `float`
DCBM loglikelihood value computed based on the predicted community
memberships.
"""
logllh = 0
num_communities = np.max(z_pred) + 1
d_out_train = np.sum(adj_train[:, :], axis=0)
# dp_out_train = d_out_train[:, np.newaxis] @ d_out_train[np.newaxis, :]
# hatB = np.zeros((num_communities, num_communities), dtype=int)
hat0 = np.zeros((num_communities, num_communities), dtype=int)
for comm_k in range(num_communities):
for comm_l in range(num_communities):
z_ix_kl = np.ix_(z_pred == comm_k, z_pred == comm_l)
total_degree = np.sum(adj_train[z_ix_kl])
# sum_degree_product = np.sum(dp_out_train[z_ix_kl])
# hatB[comm_k, comm_l] = total_degree / sum_degree_product
hat0[comm_k, comm_l] = total_degree
for i, j in zip(*np.nonzero(cv_idx.astype(bool))):
# prob = d_out_train[i] * d_out_train[j] * hatB[z_pred[i], z_pred[j]]
prob = (
d_out_train[i]
/ np.sum(hat0[z_pred[i], :])
* d_out_train[j]
/ np.sum(hat0[z_pred[j], :])
* hat0[z_pred[i], z_pred[j]]
* 0.8  # empirical damping factor applied to the estimated edge probability
)
if prob == 0 or np.isnan(prob):
prob = _eps
elif prob >= 1:
prob = 1 - _eps
else:
pass
logllh = (
logllh
+ np.log(prob) * (adj_test[i, j] >= _eps)
+ np.log(1 - prob) * (adj_test[i, j] < _eps)
)
# return logllh
return logllh / np.sum(adj_train)
class StaticSpectralCoD(CommunityDetectionMixin):
def __init__(self, verbose=False):
super().__init__("StaticSpectralCoD", verbose=verbose)
def fit(self, adj, max_K=None, opt_K="empirical"):
"""
Computes the spectral embeddings from the adjacency matrix of the
network.
Parameters
----------
adj : `np.ndarray` of shape (n, n)
Adjacency matrices of size (n, n), n is the number of vertices.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
Returns
-------
embeddings : `np.ndarray` of shape (n, max_K)
Computed spectral embedding of the adjacency matrix.
"""
self.adj = adj.astype(float)
self.num_vertices = self.adj.shape[0]
self.degrees = np.empty(self.adj.shape[:-1])
self.lapl_adj = np.empty_like(self.adj)
if max_K is None:
max_K = np.ceil(self.num_vertices / 10).astype(int)
assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 2
assert self.adj.shape[0] == self.adj.shape[1]
n = self.num_vertices
self.degrees = np.sum(np.abs(self.adj), axis=0) + _eps
sqinv_degree = sqrtm(inv(np.diag(self.degrees)))
self.lapl_adj = sqinv_degree @ self.adj @ sqinv_degree
v_col = np.zeros((n, max_K))
k = self.choose_model_order_K(self.lapl_adj, self.degrees, max_K, opt=opt_K)
_, v_col[:, :k] = eigs(self.lapl_adj, k=k, which="LM")
self.embeddings = v_col
self.model_order_K = k
return self.embeddings
def predict(self):
"""
Predicts community memberships of vertices.
Parameters
----------
Returns
-------
z_pred : np.ndarray of shape (n)
Predicted community membership labels of each vertex.
"""
kmeans = KMeans(n_clusters=self.model_order_K)
z_pred = kmeans.fit_predict(self.embeddings[:, : self.model_order_K])
return z_pred
def fit_predict(self, adj, max_K=None, opt_K="empirical"):
"""
Predicts community memberships of vertices given the adjacency matrix
of the network.
Parameters
----------
adj : `np.ndarray` of shape (n, n)
Adjacency matrices of size (n, n), n is the number of vertices.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
Returns
-------
z_pred : np.ndarray of shape (n)
Predicted community membership labels of each vertex.
"""
self.fit(adj, max_K=max_K, opt_K=opt_K)
return self.predict()
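# Illustrative usage sketch (an assumption, not part of the original module):
# fit a single symmetric adjacency matrix and read back the labels and the
# estimated number of communities. `adj0` is a hypothetical (n, n) array.
#
#   sscod = StaticSpectralCoD(verbose=False)
#   z = sscod.fit_predict(adj0, max_K=10, opt_K="empirical")
#   K = sscod.model_order_K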
class PisCES(CommunityDetectionMixin):
def __init__(self, verbose=False):
super().__init__("PisCES", verbose=verbose)
self.convergence_monitor = []
def fit(
self,
adj,
alpha=None,
n_iter=30,
max_K=None,
opt_K="empirical",
monitor_convergence=False,
):
"""
Computes the spectral embeddings from given time series of matrices of
the dynamic network.
Parameters
----------
adj : `np.ndarray` of shape (th, n, n)
Time series of adjacency matrices of size (th,n,n), where n is the
number of vertices, and th is the time horizon.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
n_iter : `int`, default=30
Determines the number of iterations to run PisCES.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
monitor_convergence : `bool`, default=False
Controls whether the method saves ||U_{t} - U_{t-1}|| and
|obj_t - obj_{t-1}| at each iteration to monitor convergence.
Returns
-------
embeddings : `np.ndarray` of shape (th, n, max_K)
Computed and smoothed spectral embeddings of the time series of the
adjacency matrices, with shape (th, n, max_K).
"""
self.adj = adj.astype(float)
self.num_vertices = self.adj.shape[1]
self.time_horizon = self.adj.shape[0]
self.degrees = np.empty(self.adj.shape[:-1])
self.lapl_adj = np.empty_like(self.adj)
if alpha is None:
alpha = 0.05 * np.ones((self.time_horizon, 2))
if max_K is None:
max_K = np.ceil(self.num_vertices / 10).astype(int)
if self.time_horizon < 2:
raise ValueError(
"Time horizon must be at least 2, otherwise use static spectral clustering."
)
assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 3
assert self.adj.shape[1] == self.adj.shape[2]
assert isinstance(alpha, np.ndarray) and alpha.shape == (self.time_horizon, 2)
assert max_K > 0
th = self.time_horizon
n = self.num_vertices
u = np.zeros((th, n, n))
v_col = np.zeros((th, n, max_K))
k = np.zeros(th).astype(int) + max_K
objective = np.zeros(n_iter)
self.convergence_monitor = []
diffU = 0
for t in range(th):
adj_t = self.adj[t, :, :]
self.degrees[t, :] = np.sum(np.abs(adj_t), axis=0) + _eps
sqinv_degree = sqrtm(inv(np.diag(self.degrees[t, :])))
self.lapl_adj[t, :, :] = sqinv_degree @ adj_t @ sqinv_degree
# Initialization of k, v_col.
for t in range(th):
lapl_adj_t = self.lapl_adj[t, :, :]
k[t] = self.choose_model_order_K(
lapl_adj_t, self.degrees[t, :], max_K, opt=opt_K
)
_, v_col[t, :, : k[t]] = eigs(lapl_adj_t, k=k[t], which="LM")
u[t, :, :] = v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T
if monitor_convergence:
diffU = diffU + (
Distances.hamming_distance(
self.lapl_adj[t, :, :],
v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T,
)
)
if monitor_convergence:
self.convergence_monitor.append((-np.inf, diffU))
total_itr = 0
for itr in range(n_iter):
if self.verbose:
print(f"Iteration {itr}/{n_iter} is running.")
total_itr += 1
diffU = 0
v_col_pv = deepcopy(v_col)
for t in range(th):
# reprs = u[t, :, :]
reprs = self.lapl_adj[t, :, :]
if t == 0:
v_col_pv_ktn = v_col_pv[t + 1, :, : k[t + 1]]
reprs_bar = reprs + alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T)
elif t == th - 1:
v_col_pv_ktp = v_col_pv[t - 1, :, : k[t - 1]]
reprs_bar = reprs + alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T)
else:
v_col_pv_ktp = v_col_pv[t - 1, :, : k[t - 1]]
v_col_pv_ktn = v_col_pv[t + 1, :, : k[t + 1]]
reprs_bar = (
reprs
+ (alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T))
+ (alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T))
)
k[t] = self.choose_model_order_K(
reprs_bar, self.degrees[t, :], max_K, opt=opt_K
)
_, v_col[t, :, : k[t]] = eigs(reprs_bar, k=k[t], which="LM")
eig_val = eigvals(v_col[t, :, : k[t]].T @ v_col_pv[t, :, : k[t]])
objective[itr] = objective[itr] + np.sum(np.abs(eig_val), axis=0)
if monitor_convergence:
diffU = diffU + (
Distances.hamming_distance(
v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T,
v_col_pv[t, :, : k[t]] @ v_col_pv[t, :, : k[t]].T,
)
)
if monitor_convergence:
self.convergence_monitor.append((objective[itr], diffU))
if itr >= 1:
diff_obj = objective[itr] - objective[itr - 1]
if abs(diff_obj) < CONVERGENCE_CRITERIA:
break
if (
(total_itr > 1)
and (total_itr == n_iter)
and (objective[-1] - objective[-2] >= CONVERGENCE_CRITERIA)
):
warnings.warn("PisCES does not converge!", RuntimeWarning)
self.embeddings = v_col
self.model_order_K = k
return self.embeddings
def predict(self):
"""
Predicts community memberships of vertices at each time point.
Parameters
----------
Returns
-------
z_pred : np.ndarray of shape (th, n)
Predicted community membership labels of vertices at each time
point, where n is the number of vertices and th is the time
horizon.
"""
th = self.time_horizon
n = self.num_vertices
z_pred = np.empty((th, n), dtype=int)
for t in range(th):
kmeans = KMeans(n_clusters=self.model_order_K[t])
z_pred[t, :] = kmeans.fit_predict(
self.embeddings[t, :, : self.model_order_K[t]]
)
return z_pred
def fit_predict(
self,
adj,
alpha=None,
n_iter=30,
max_K=None,
opt_K="empirical",
monitor_convergence=False,
):
"""
Predicts time series of community memberships given the adjacency
matrices of the dynamic network.
Parameters
----------
adj : `np.ndarray` of shape (th, n, n)
Time series of adjacency matrices of size (th,n,n), where n
is the number of vertices, and th is the time horizon.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
n_iter : `int`, default=30
Determines the number of iterations to run PisCES.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
monitor_convergence : `bool`, default=False
Controls whether the method saves ||U_{t} - U_{t-1}|| and
|obj_t - obj_{t-1}| at each iteration to monitor convergence.
Returns
-------
z_pred : np.ndarray of shape (th, n)
Predicted community membership labels of vertices at each time
point, where n is the number of vertices and th is the time
horizon.
"""
self.fit(
adj,
alpha=alpha,
max_K=max_K,
opt_K=opt_K,
n_iter=n_iter,
monitor_convergence=monitor_convergence,
)
return self.predict()
@classmethod
def cross_validation(
cls,
adj,
num_folds=5,
alpha=None,
n_iter=30,
max_K=None,
opt_K="empirical",
n_jobs=1,
):
"""
Performs cross validation to choose the best value for the alpha parameter.
Parameters
----------
adj : `np.ndarray` of shape (th, n, n)
Time series of adjacency matrices of size (th,n,n), where n is the
number of vertices, and th is the time horizon.
num_folds : `int`, default=5
Number of folds to perform in the cross validation.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
n_iter : `int`, default=30
Determines the number of iterations to run PisCES.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
n_jobs : `int`, default=1
The number of parallel `joblib` threads.
Returns
-------
modu : `float`
Modularity summed over folds and averaged over the th time points,
with respect to the given alpha.
logllh : `float`
Log-likelihood summed over folds and averaged over the th time
points, with respect to the given alpha.
"""
adj = adj.astype(float)
num_vertices = adj.shape[1]
time_horizon = adj.shape[0]
if alpha is None:
alpha = 0.05 * np.ones((time_horizon, 2))
if max_K is None:
max_K = np.ceil(num_vertices / 10).astype(int)
if time_horizon < 2:
raise ValueError(
"Time horizon must be at least 2, otherwise use static spectral clustering."
)
assert type(adj) in [np.ndarray, np.memmap] and adj.ndim == 3
assert adj.shape[1] == adj.shape[2]
assert alpha.shape == (time_horizon, 2)
assert max_K > 0
n = num_vertices
th = time_horizon
idx_n = np.arange(n)
idx = np.c_[np.repeat(idx_n, idx_n.shape), np.tile(idx_n, idx_n.shape)]
r = np.random.choice(n**2, size=n**2, replace=False)
pisces_kwargs = {
"alpha": alpha,
"n_iter": n_iter,
"max_K": max_K,
"opt_K": opt_K,
}
def compute_for_fold(adj, idx_split, n, th, pisces_kwargs={}):
cv_idx = np.zeros((th, n, n), dtype=bool)
adj_train = np.zeros((th, n, n))
adj_train_imputed = np.zeros((th, n, n))
for t in range(th):
idx1, idx2 = idx_split[t, :, 0], idx_split[t, :, 1]
cv_idx_t = np.zeros((n, n), dtype=bool)
cv_idx_t[idx1, idx2] = True
cv_idx_t = np.triu(cv_idx_t) + np.triu(cv_idx_t).T
cv_idx[t, :, :] = cv_idx_t
adj_train[t, :, :] = adj[t, :, :]
adj_train[t, idx1, idx2] = 0
adj_train[t, :, :] = (
np.triu(adj_train[t, :, :]) + np.triu(adj_train[t, :, :]).T
)
adj_train_imputed[t, :, :] = cls.eigen_complete(
adj_train[t, :, :], cv_idx_t, 10, 10
)
z_pred = cls().fit_predict(
deepcopy(adj_train_imputed[:, :, :]),
**pisces_kwargs,
)
modu_fold, logllh_fold = 0, 0
for t in range(th):
modu_fold = modu_fold + cls.modularity(
adj[t, :, :],
adj_train[t, :, :],
z_pred[t, :],
cv_idx[t, :, :],
)
logllh_fold = logllh_fold + cls.loglikelihood(
adj[t, :, :],
adj_train[t, :, :],
z_pred[t, :],
cv_idx[t, :, :],
)
return modu_fold, logllh_fold
modu_total = 0
logllh_total = 0
def split_train_test(fold_idx):
pfold = n**2 // num_folds
start, end = pfold * fold_idx, (fold_idx + 1) * pfold
test = r[start:end]
idx_split = idx[test, :]
return np.tile(idx_split, (th, 1, 1))
if n_jobs > 1:
from joblib import Parallel, delayed
with Parallel(n_jobs=n_jobs) as parallel: # prefer="processes"
loss_zipped = parallel(
delayed(compute_for_fold)(
adj,
split_train_test(fold_idx),
n,
th,
pisces_kwargs=pisces_kwargs,
)
for fold_idx in range(num_folds)
)
modu_fold, logllh_fold = map(np.array, zip(*loss_zipped))
modu_total = sum(modu_fold)
logllh_total = sum(logllh_fold)
else:
for fold_idx in range(num_folds):
modu_fold, logllh_fold = compute_for_fold(
adj, split_train_test(fold_idx), n, th, pisces_kwargs=pisces_kwargs
)
modu_total = modu_total + modu_fold
logllh_total = logllh_total + logllh_fold
num_adj = th
return modu_total / num_adj, logllh_total / num_adj
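# Illustrative usage sketch (an assumption, not part of the original module):
# smooth and cluster a hypothetical dynamic network `adj_dyn` of shape
# (th, n, n), then score a candidate alpha by cross-validation.
#
#   pisces = PisCES(verbose=False)
#   z = pisces.fit_predict(adj_dyn, n_iter=30)  # z has shape (th, n)
#   modu, logllh = PisCES.cross_validation(adj_dyn, num_folds=5)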
class MuDCoD(CommunityDetectionMixin):
def __init__(self, verbose=False):
super().__init__("MuDCoD", verbose=verbose)
self.convergence_monitor = []
def fit(
self,
adj,
alpha=None,
beta=None,
n_iter=30,
max_K=None,
opt_K="empirical",
monitor_convergence=False,
):
"""
Computes the spectral embeddings from given multi-subject time series
of matrices of the dynamic networks of different subjects.
Parameters
----------
adj : `np.ndarray` of shape (ns, th, n, n)
Multi-subject time series of adjacency matrices of size (ns,
th,n,n), where n is the number of vertices, th is the time horizon,
and ns is the number of subjects.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
beta : `np.ndarray` of shape (ns), default=0.01J(ns)
Tuning parameter for smoothing along the subject axis.
n_iter : `int`, default=30
Determines the number of iterations to run MuDCoD.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
monitor_convergence : `bool`, default=False
Controls whether the method saves ||U_{t} - U_{t-1}|| and
|obj_t - obj_{t-1}| at each iteration to monitor convergence.
Returns
-------
embeddings : `np.ndarray` of shape (ns, th, n, max_K)
Computed and smoothed spectral embeddings of the multi-subject time
series of the adjacency matrices, with shape (ns, th, n, max_K).
"""
self.adj = adj.astype(float)
self.num_subjects = self.adj.shape[0]
self.time_horizon = self.adj.shape[1]
self.num_vertices = self.adj.shape[2]
self.degrees = np.empty(self.adj.shape[:-1])
self.lapl_adj = np.empty_like(self.adj)
if alpha is None:
alpha = 0.05 * np.ones((self.time_horizon, 2))
if beta is None:
beta = 0.01 * np.ones(self.num_subjects)
if max_K is None:
max_K = np.ceil(self.num_vertices / 10).astype(int)
if self.time_horizon < 2:
raise ValueError(
"Time horizon must be at least 2, otherwise use static spectral clustering."
)
assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 4
assert self.adj.shape[2] == self.adj.shape[3]
assert isinstance(alpha, np.ndarray) and alpha.shape == (self.time_horizon, 2)
assert isinstance(beta, np.ndarray) and beta.shape == (self.num_subjects,)
assert max_K > 0
ns = self.num_subjects
th = self.time_horizon
n = self.num_vertices
k = np.zeros((ns, th)).astype(int) + max_K
v_col = np.zeros((ns, th, n, max_K))
u = np.zeros((ns, th, n, n))
objective = np.zeros((n_iter))
self.convergence_monitor = []
diffU = 0
for t in range(th):
for sbj in range(ns):
adj_t = self.adj[sbj, t, :, :]
self.degrees[sbj, t, :] = np.sum(np.abs(adj_t), axis=0) + _eps
sqinv_degree = sqrtm(inv(np.diag(self.degrees[sbj, t, :])))
self.lapl_adj[sbj, t, :, :] = sqinv_degree @ adj_t @ sqinv_degree
# Initialization of k, v_col.
for t in range(th):
for sbj in range(ns):
lapl_adj_t = self.lapl_adj[sbj, t, :, :]
k[sbj, t] = self.choose_model_order_K(
lapl_adj_t, self.degrees[sbj, t, :], max_K, opt=opt_K
)
_, v_col[sbj, t, :, : k[sbj, t]] = eigs(
lapl_adj_t, k=k[sbj, t], which="LM"
)
u[sbj, t, :, :] = v_col[sbj, t, :, :] @ v_col[sbj, t, :, :].T
if monitor_convergence:
diffU = diffU + (
Distances.hamming_distance(
self.lapl_adj[sbj, t, :, :],
v_col[sbj, t, :, : k[sbj, t]]
@ v_col[sbj, t, :, : k[sbj, t]].T,
)
)
if monitor_convergence:
self.convergence_monitor.append((-np.inf, diffU))
total_itr = 0
for itr in range(n_iter):
if self.verbose:
print(f"Iteration {itr}/{n_iter} is running.")
total_itr += 1
diffU = 0
v_col_pv = deepcopy(v_col)
for t in range(th):
v_col_t = v_col_pv[:, t, :, :]
swp_v_col_t = np.swapaxes(v_col_t, 1, 2)
u_bar_t = v_col_t @ swp_v_col_t
for sbj in range(ns):
# reprs = u[sbj, t, :, :]
mu_u_bar_t = np.mean(np.delete(u_bar_t, sbj, 0), axis=0)
reprs = self.lapl_adj[sbj, t, :, :]
if t == 0:
v_col_pv_ktn = v_col_pv[sbj, t + 1, :, : k[sbj, t + 1]]
reprs_bar = (
reprs
+ alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T)
+ beta[sbj] * mu_u_bar_t
)
elif t == th - 1:
v_col_pv_ktp = v_col_pv[sbj, t - 1, :, : k[sbj, t - 1]]
reprs_bar = (
reprs
+ alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T)
+ beta[sbj] * mu_u_bar_t
)
else:
v_col_pv_ktp = v_col_pv[sbj, t - 1, :, : k[sbj, t - 1]]
v_col_pv_ktn = v_col_pv[sbj, t + 1, :, : k[sbj, t + 1]]
reprs_bar = (
reprs
+ (alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T))
+ (alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T))
+ beta[sbj] * mu_u_bar_t
)
k[sbj, t] = self.choose_model_order_K(
reprs_bar,
self.degrees[sbj, t, :],
max_K,
opt=opt_K,
)
_, v_col[sbj, t, :, : k[sbj, t]] = eigs(
reprs_bar, k=k[sbj, t], which="LM"
)
eig_val = eigvals(
v_col[sbj, t, :, : k[sbj, t]].T
@ v_col_pv[sbj, t, :, : k[sbj, t]]
)
objective[itr] = objective[itr] + np.sum(np.abs(eig_val), axis=0)
if monitor_convergence:
diffU = diffU + (
Distances.hamming_distance(
v_col[sbj, t, :, : k[sbj, t]]
@ v_col[sbj, t, :, : k[sbj, t]].T,
v_col_pv[sbj, t, :, : k[sbj, t]]
@ v_col_pv[sbj, t, :, : k[sbj, t]].T,
)
)
if monitor_convergence:
self.convergence_monitor.append((objective[itr], diffU))
if itr >= 1:
diff_obj = objective[itr] - objective[itr - 1]
if abs(diff_obj) < CONVERGENCE_CRITERIA:
break
if (
(total_itr > 1)
and (total_itr == n_iter)
and (objective[-1] - objective[-2] >= CONVERGENCE_CRITERIA)
):
warnings.warn("MuDCoD does not converge!", RuntimeWarning)
self.embeddings = v_col
self.model_order_K = k
return self.embeddings
def predict(
self,
):
"""
Predicts community memberships of vertices at each time point for each
subject.
Parameters
----------
Returns
-------
z_pred : np.ndarray of shape (ns, th, n)
Predicted community membership labels of vertices at each time
point for each subject, where n is the number of vertices, th is
the time horizon, and ns is the number of subjects.
"""
ns = self.num_subjects
th = self.time_horizon
n = self.num_vertices
z_pred = np.empty((ns, th, n), dtype=int)
for t in range(th):
for sbj in range(ns):
kmeans = KMeans(n_clusters=self.model_order_K[sbj, t])
z_pred[sbj, t, :] = kmeans.fit_predict(
self.embeddings[sbj, t, :, : self.model_order_K[sbj, t]]
)
return z_pred
def fit_predict(
self,
adj,
alpha=None,
beta=None,
n_iter=30,
max_K=None,
opt_K="empirical",
monitor_convergence=False,
):
"""
Predicts multi-subject time series of community memberships given the
adjacency matrices of the dynamic networks for each subject.
Parameters
----------
adj : `np.ndarray` of shape (ns, th, n, n)
Multi-subject time series of adjacency matrices of size (ns,th,n,n),
where n is the number of vertices, th is the time horizon, and ns is
the number of subjects.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
beta : `np.ndarray` of shape (ns), default=0.01J(ns)
Tuning parameter for smoothing along the subject axis.
n_iter : `int`, default=30
Determines the number of iterations to run MuDCoD.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
monitor_convergence : `bool`, default=False
Controls whether the method saves ||U_{t} - U_{t-1}|| and
|obj_t - obj_{t-1}| at each iteration to monitor convergence.
Returns
-------
z_pred : np.ndarray of shape (ns, th, n)
Predicted community membership labels of vertices at each time
point for each subject, where n is the number of vertices, th is
the time horizon, and ns is the number of subjects.
"""
self.fit(
adj,
alpha=alpha,
beta=beta,
max_K=max_K,
opt_K=opt_K,
n_iter=n_iter,
monitor_convergence=monitor_convergence,
)
return self.predict()
@classmethod
def cross_validation(
cls,
adj,
num_folds=5,
alpha=None,
beta=None,
n_iter=30,
max_K=None,
opt_K="empirical",
n_jobs=1,
):
"""
Performs cross validation to choose the best pair of values for the alpha
and the beta parameters.
Parameters
----------
adj : `np.ndarray` of shape (ns, th, n, n)
Multi-subject time series of adjacency matrices, where n is the
number of vertices, th is the time horizon, and ns is the number of
subjects.
num_folds : `int`, default=5
Number of folds to perform in the cross validation.
alpha : `np.ndarray` of shape (th, 2), default=0.05J(th,2)
Tuning parameter for smoothing along the time axis.
beta : `np.ndarray` of shape (ns), default=0.01J(ns)
Tuning parameter for smoothing along the subject axis.
n_iter : `int`, default=30
Determines the number of iterations to run MuDCoD.
max_K : `int`, default=n/10
Determines the maximum number of communities to predict.
opt_K : `string`, default='empirical'
Chooses the technique to estimate k, i.e., number of communities.
n_jobs : `int`, default=1
The number of parallel `joblib` threads.
Returns
-------
modu : `float`
Modularity summed over folds and averaged over the ns*th networks,
with respect to the given alpha and beta.
logllh : `float`
Log-likelihood summed over folds and averaged over the ns*th
networks, with respect to the given alpha and beta.
"""
adj = adj.astype(float)
num_vertices = adj.shape[2]
time_horizon = adj.shape[1]
num_subjects = adj.shape[0]
if alpha is None:
alpha = 0.05 * np.ones((time_horizon, 2))
if beta is None:
beta = 0.01 * np.ones(num_subjects)
if max_K is None:
max_K = np.ceil(num_vertices / 10).astype(int)
if time_horizon < 2:
raise ValueError(
"Time horizon must be at least 2, otherwise use static spectral clustering."
)
assert type(adj) in [np.ndarray, np.memmap] and adj.ndim == 4
assert adj.shape[2] == adj.shape[3]
assert alpha.shape == (time_horizon, 2)
assert beta.shape == (num_subjects,)
assert max_K > 0
ns = num_subjects
th = time_horizon
n = num_vertices
idx_n = np.arange(n)
idx = np.c_[np.repeat(idx_n, idx_n.shape), np.tile(idx_n, idx_n.shape)]
r = np.random.choice(n**2, size=n**2, replace=False)
mudcod_kwargs = {
"alpha": alpha,
"beta": beta,
"n_iter": n_iter,
"max_K": max_K,
"opt_K": opt_K,
}
def compute_for_fold(adj, idx_split, n, th, ns, mudcod_kwargs={}):
cv_idx = np.empty((ns, th, n, n), dtype=bool)
adj_train = np.zeros((ns, th, n, n))
adj_train_imputed = np.zeros((ns, th, n, n))
for t in range(th):
for sbj in range(ns):
idx1, idx2 = idx_split[sbj, t, :, 0], idx_split[sbj, t, :, 1]
cv_idx_t = np.zeros((n, n), dtype=bool)
cv_idx_t[idx1, idx2] = True
cv_idx_t = np.triu(cv_idx_t) + np.triu(cv_idx_t).T
cv_idx[sbj, t, :, :] = cv_idx_t
adj_train[sbj, t, :, :] = adj[sbj, t, :, :]
adj_train[sbj, t, idx1, idx2] = 0
adj_train[sbj, t] = (
np.triu(adj_train[sbj, t]) + np.triu(adj_train[sbj, t]).T
)
adj_train_imputed[sbj, t, :, :] = cls.eigen_complete(
adj_train[sbj, t], cv_idx_t, 10, 10
)
z_pred = cls().fit_predict(
deepcopy(adj_train_imputed[:, :, :, :]),
**mudcod_kwargs,
)
modu_fold, logllh_fold = 0, 0
for t in range(th):
for sbj in range(ns):
modu_fold = modu_fold + cls.modularity(
adj[sbj, t, :, :],
adj_train[sbj, t, :, :],
z_pred[sbj, t, :],
cv_idx[sbj, t, :, :],
)
logllh_fold = logllh_fold + cls.loglikelihood(
adj[sbj, t, :, :],
adj_train[sbj, t, :, :],
z_pred[sbj, t, :],
cv_idx[sbj, t, :, :],
)
return modu_fold, logllh_fold
modu_total = 0
logllh_total = 0
def split_train_test(fold_idx):
pfold = n**2 // num_folds
start, end = pfold * fold_idx, (fold_idx + 1) * pfold
test = r[start:end]
idx_split = idx[test, :]
return np.tile(idx_split, (ns, th, 1, 1))
if n_jobs > 1:
from joblib import Parallel, delayed
with Parallel(n_jobs=n_jobs) as parallel: # prefer="processes"
loss_zipped = parallel(
delayed(compute_for_fold)(
adj,
split_train_test(fold_idx),
n,
th,
ns,
mudcod_kwargs=mudcod_kwargs,
)
for fold_idx in range(num_folds)
)
modu_fold, logllh_fold = map(np.array, zip(*loss_zipped))
modu_total = sum(modu_fold)
logllh_total = sum(logllh_fold)
else:
for fold_idx in range(num_folds):
modu_fold, logllh_fold = compute_for_fold(
adj,
split_train_test(fold_idx),
n,
th,
ns,
mudcod_kwargs=mudcod_kwargs,
)
modu_total = modu_total + modu_fold
logllh_total = logllh_total + logllh_fold
num_adj = ns * th
return modu_total / num_adj, logllh_total / num_adj
Classes
class CommunityDetectionMixin (method, verbose=False)
-
Base mixin shared by PisCES, MuDCoD, and StaticSpectralCoD. It stores the fitted embeddings and model order K, and provides the shared utilities for model-order selection, matrix completion, modularity, and log-likelihood scoring.
Subclasses
- MuDCoD
- PisCES
- StaticSpectralCoD
Static methods
def choose_model_order_K(reprs, degrees, max_K, opt='empirical')
-
Predicts the number of communities/modules in a network.
Parameters
reprs : np.ndarray of shape (n, n)
- Laplacianized adjacency matrix representation with dimension (n,n).
degrees : np.ndarray of shape (n)
- Vector of vertex degrees, of length n.
max_K : int
- Determines the maximum number of communities to predict.
opt : string, default='empirical'
- Chooses the technique to estimate K, i.e., the number of communities; one of 'null', 'empirical', or 'full'.
Returns
model_order_pred : int
- Number of modules to consider.
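A minimal sketch of calling this static method directly on a random graph; the graph, its size and density, and max_K are illustrative assumptions, and the normalization mirrors what fit() does internally.

import numpy as np
from numpy.linalg import inv
from scipy.linalg import sqrtm
from mudcod.community_detection import CommunityDetectionMixin

rng = np.random.default_rng(0)
n = 100
adj = (rng.random((n, n)) < 0.1).astype(float)
adj = np.triu(adj, 1)
adj = adj + adj.T  # symmetric adjacency matrix with an empty diagonal

# Normalize as in fit(): D^{-1/2} A D^{-1/2}, with a small epsilon on degrees.
degrees = np.sum(np.abs(adj), axis=0) + 1e-10
sqinv_degree = sqrtm(inv(np.diag(degrees)))
lapl_adj = sqinv_degree @ adj @ sqinv_degree

K = CommunityDetectionMixin.choose_model_order_K(lapl_adj, degrees, max_K=10, opt="empirical")
print(K)  # estimated number of communities, a small integer >= 2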
def eigen_complete(adj, cv_idx, epsilon, k)
-
Imputes the entries of adj marked by cv_idx: starting from zeros, it repeatedly replaces the masked entries with a rank-k truncated-SVD reconstruction clipped to [0, 1], until the Frobenius norm of the update falls below epsilon.
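A hypothetical sketch of the completion loop: hide a random symmetric set of entries of a binary network and impute them. The epsilon and k values match the ones the cross-validation routines pass internally; everything else is an illustrative assumption.

import numpy as np
from mudcod.community_detection import CommunityDetectionMixin

rng = np.random.default_rng(1)
n = 50
adj = (rng.random((n, n)) < 0.2).astype(float)
adj = np.triu(adj, 1)
adj = adj + adj.T

mask = np.triu(rng.random((n, n)) < 0.1, 1)
cv_idx = (mask | mask.T).astype(float)  # 1 marks a removed (test) entry

# Entries where cv_idx == 1 are ignored and iteratively re-estimated from a
# rank-10 SVD truncation, clipped to [0, 1].
imputed = CommunityDetectionMixin.eigen_complete(adj, cv_idx, epsilon=10, k=10)
print(imputed[cv_idx.astype(bool)][:5])  # imputed values for held-out entries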
def loglikelihood(adj_test, adj_train, z_pred, cv_idx)
-
Calculates the log-likelihood given the imputed training adjacency matrix, community membership predictions, and the actual training matrix.
Parameters
adj_test : np.ndarray of shape (n, n)
- Test adjacency matrix of shape (n,n), corresponding to the actual network.
adj_train : np.ndarray of shape (n, n)
- Training adjacency matrix of shape (n,n) whose edges are removed and then imputed, where n is the number of vertices.
z_pred : np.ndarray of shape (n)
- Predicted community membership label vector of size (n) for each vertex, used to compute the loss.
cv_idx : np.ndarray of shape (n, n)
- A matrix of size (n,n) indicating the removed edges; an entry is 1 for test and 0 for training edges.
Returns
logllh : float
- DCBM log-likelihood value computed based on the predicted community memberships.
def modularity(adj_test, adj_train, z_pred, cv_idx, resolution=1)
-
Calculates modularity given imputed training adjacency matrix, community membership predictions and the actual training matrix.
Parameters
adj_test : np.ndarray of shape (n, n)
- Test adjacency matrix of shape (n,n), corresponding to the actual network.
adj_train : np.ndarray of shape (n, n)
- Training adjacency matrix of shape (n,n) whose edges are removed and then imputed, where n is the number of vertices.
z_pred : np.ndarray of shape (n)
- Predicted community membership label vector of size (n) for each vertex, used to compute the loss.
cv_idx : np.ndarray of shape (n, n)
- A matrix of size (n,n) indicating the removed edges; an entry is 1 for test and 0 for training edges.
Returns
modu : float
- Modularity value computed based on the predicted community memberships.
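Both scoring methods are used by the cross-validation routines; a self-contained sketch of that pattern on random data follows. The sizes, densities, and max_K are illustrative assumptions, and cv_idx is copied before modularity() because that method zeroes the diagonal of its argument in place.

import numpy as np
from mudcod.community_detection import CommunityDetectionMixin, StaticSpectralCoD

rng = np.random.default_rng(2)
n = 60
adj = (rng.random((n, n)) < 0.15).astype(float)
adj = np.triu(adj, 1)
adj = adj + adj.T

mask = np.triu(rng.random((n, n)) < 0.1, 1)
cv_idx = (mask | mask.T).astype(float)  # 1 marks held-out (test) entries

# Zero out the held-out entries, impute them, then cluster the imputed network.
adj_train = adj * (1 - cv_idx)
adj_imputed = CommunityDetectionMixin.eigen_complete(adj_train, cv_idx, 10, 10)
z_pred = StaticSpectralCoD().fit_predict(adj_imputed, max_K=6)

modu = CommunityDetectionMixin.modularity(adj, adj_train, z_pred, cv_idx.copy())
logllh = CommunityDetectionMixin.loglikelihood(adj, adj_train, z_pred, cv_idx)
print(modu, logllh)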
Instance variables
var embeddings
-
The spectral embeddings computed by fit; accessing this property before fit raises a ValueError.
var model_order_K
-
The estimated model order K (number of communities) computed by fit; accessing this property before fit raises a ValueError.
class MuDCoD (verbose=False)
-
Multi-subject dynamic community detection. Each subject's dynamic network is clustered spectrally while the eigenvector subspaces are smoothed across neighboring time points (weighted by alpha) and across subjects (weighted by beta).
Ancestors
- CommunityDetectionMixin
Static methods
def cross_validation(adj, num_folds=5, alpha=None, beta=None, n_iter=30, max_K=None, opt_K='empirical', n_jobs=1)
- Performs cross-validation to choose the best pair of values for the alpha and beta parameters.

Parameters
- adj : np.ndarray of shape (ns, th, n, n)
  Multi-subject time series of adjacency matrices, where n is the number of vertices, th is the time horizon, and ns is the number of subjects.
- num_folds : int, default=5
  Number of folds to perform in the cross-validation.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- beta : np.ndarray of shape (ns,), default=0.01 * ones(ns)
  Tuning parameter for smoothing along the subject axis.
- n_iter : int, default=30
  Determines the number of iterations to run MuDCoD.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- n_jobs : int, default=1
  The number of parallel joblib workers.

Returns
- modu : float
  Sum over folds of the modularity values, averaged over the ns * th networks, for the given alpha and beta.
- logllh : float
  Sum over folds of the log-likelihood values, averaged over the ns * th networks, for the given alpha and beta.
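Note that a single call scores one (alpha, beta) pair; selecting the best pair is left to the caller, who loops over a grid. A minimal sketch, assuming the multi-subject estimator is exported as MuDCoD from this module (sizes, seeds, and the random adjacency matrices are purely illustrative):

    import numpy as np
    from mudcod.community_detection import MuDCoD

    ns, th, n = 4, 3, 60  # subjects, time points, vertices (illustrative)
    rng = np.random.default_rng(0)
    adj = np.zeros((ns, th, n, n))
    for s in range(ns):
        for t in range(th):
            a = np.triu((rng.random((n, n)) < 0.1).astype(float), 1)
            adj[s, t] = a + a.T  # symmetric, zero diagonal

    # Score a few (alpha, beta) pairs; keep the one with the best modularity.
    best, best_score = None, -np.inf
    for a_val, b_val in [(0.01, 0.01), (0.05, 0.01), (0.05, 0.05)]:
        alpha = a_val * np.ones((th, 2))
        beta = b_val * np.ones(ns)
        modu, logllh = MuDCoD.cross_validation(adj, num_folds=5, alpha=alpha, beta=beta)
        if modu > best_score:
            best, best_score = (a_val, b_val), modu

Both returned scores are per-network averages summed over folds; higher is better for each.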
Methods
def fit(self, adj, alpha=None, beta=None, n_iter=30, max_K=None, opt_K='empirical', monitor_convergence=False)
- Computes the spectral embeddings from a given multi-subject time series of adjacency matrices of the subjects' dynamic networks.

Parameters
- adj : np.ndarray of shape (ns, th, n, n)
  Multi-subject time series of adjacency matrices of size (ns, th, n, n), where n is the number of vertices, th is the time horizon, and ns is the number of subjects.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- beta : np.ndarray of shape (ns,), default=0.01 * ones(ns)
  Tuning parameter for smoothing along the subject axis.
- n_iter : int, default=30
  Determines the number of iterations to run MuDCoD.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- monitor_convergence : bool, default=False
  Controls whether the method saves ||U_t - U_{t-1}|| and |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

Returns
- embeddings : np.ndarray of shape (ns, th, n, max_K)
  Computed and smoothed spectral embeddings of the multi-subject time series of the adjacency matrices.
Source code

    def fit(
        self,
        adj,
        alpha=None,
        beta=None,
        n_iter=30,
        max_K=None,
        opt_K="empirical",
        monitor_convergence=False,
    ):
        """
        Computes the spectral embeddings from a given multi-subject time
        series of adjacency matrices of the subjects' dynamic networks.

        Parameters
        ----------
        adj : `np.ndarray` of shape (ns, th, n, n)
            Multi-subject time series of adjacency matrices of size
            (ns, th, n, n), where n is the number of vertices, th is the
            time horizon, and ns is the number of subjects.
        alpha : `np.ndarray` of shape (th, 2), default=0.05 * ones((th, 2))
            Tuning parameter for smoothing along the time axis.
        beta : `np.ndarray` of shape (ns,), default=0.01 * ones(ns)
            Tuning parameter for smoothing along the subject axis.
        n_iter : `int`, default=30
            Determines the number of iterations to run MuDCoD.
        max_K : `int`, default=ceil(n/10)
            Determines the maximum number of communities to predict.
        opt_K : `string`, default='empirical'
            Chooses the technique to estimate K, i.e., the number of communities.
        monitor_convergence : `bool`, default=False
            Controls whether the method saves ||U_t - U_{t-1}|| and
            |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

        Returns
        -------
        embeddings : `np.ndarray` of shape (ns, th, n, max_K)
            Computed and smoothed spectral embeddings of the multi-subject
            time series of the adjacency matrices.
        """
        self.adj = adj.astype(float)
        self.num_subjects = self.adj.shape[0]
        self.time_horizon = self.adj.shape[1]
        self.num_vertices = self.adj.shape[2]
        self.degrees = np.empty(self.adj.shape[:-1])
        self.lapl_adj = np.empty_like(self.adj)
        if alpha is None:
            alpha = 0.05 * np.ones((self.time_horizon, 2))
        if beta is None:
            beta = 0.01 * np.ones(self.num_subjects)
        if max_K is None:
            max_K = np.ceil(self.num_vertices / 10).astype(int)
        if self.time_horizon < 2:
            raise ValueError(
                "Time horizon must be at least 2, otherwise use static spectral clustering."
            )
        assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 4
        assert self.adj.shape[2] == self.adj.shape[3]
        assert isinstance(alpha, np.ndarray) and alpha.shape == (self.time_horizon, 2)
        assert isinstance(beta, np.ndarray) and beta.shape == (self.num_subjects,)
        assert max_K > 0

        ns = self.num_subjects
        th = self.time_horizon
        n = self.num_vertices
        k = np.zeros((ns, th)).astype(int) + max_K
        v_col = np.zeros((ns, th, n, max_K))
        u = np.zeros((ns, th, n, n))
        objective = np.zeros((n_iter))
        self.convergence_monitor = []
        diffU = 0

        # Degree-normalized Laplacian representation per subject and time.
        for t in range(th):
            for sbj in range(ns):
                adj_t = self.adj[sbj, t, :, :]
                self.degrees[sbj, t, :] = np.sum(np.abs(adj_t), axis=0) + _eps
                sqinv_degree = sqrtm(inv(np.diag(self.degrees[sbj, t, :])))
                self.lapl_adj[sbj, t, :, :] = sqinv_degree @ adj_t @ sqinv_degree

        # Initialization of k, v_col.
        for t in range(th):
            for sbj in range(ns):
                lapl_adj_t = self.lapl_adj[sbj, t, :, :]
                k[sbj, t] = self.choose_model_order_K(
                    lapl_adj_t, self.degrees[sbj, t, :], max_K, opt=opt_K
                )
                _, v_col[sbj, t, :, : k[sbj, t]] = eigs(
                    lapl_adj_t, k=k[sbj, t], which="LM"
                )
                u[sbj, t, :, :] = v_col[sbj, t, :, :] @ v_col[sbj, t, :, :].T
                if monitor_convergence:
                    diffU = diffU + (
                        Distances.hamming_distance(
                            self.lapl_adj[sbj, t, :, :],
                            v_col[sbj, t, :, : k[sbj, t]]
                            @ v_col[sbj, t, :, : k[sbj, t]].T,
                        )
                    )
        if monitor_convergence:
            self.convergence_monitor.append((-np.inf, diffU))

        total_itr = 0
        for itr in range(n_iter):
            if self.verbose:
                print(f"Iteration {itr}/{n_iter} is running.")
            total_itr += 1
            diffU = 0
            v_col_pv = deepcopy(v_col)
            for t in range(th):
                v_col_t = v_col_pv[:, t, :, :]
                swp_v_col_t = np.swapaxes(v_col_t, 1, 2)
                u_bar_t = v_col_t @ swp_v_col_t
                for sbj in range(ns):
                    # reprs = u[sbj, t, :, :]
                    # Average of the other subjects' projections at time t.
                    mu_u_bar_t = np.mean(np.delete(u_bar_t, sbj, 0), axis=0)
                    reprs = self.lapl_adj[sbj, t, :, :]
                    if t == 0:
                        v_col_pv_ktn = v_col_pv[sbj, t + 1, :, : k[sbj, t + 1]]
                        reprs_bar = (
                            reprs
                            + alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T)
                            + beta[sbj] * mu_u_bar_t
                        )
                    elif t == th - 1:
                        v_col_pv_ktp = v_col_pv[sbj, t - 1, :, : k[sbj, t - 1]]
                        reprs_bar = (
                            reprs
                            + alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T)
                            + beta[sbj] * mu_u_bar_t
                        )
                    else:
                        v_col_pv_ktp = v_col_pv[sbj, t - 1, :, : k[sbj, t - 1]]
                        v_col_pv_ktn = v_col_pv[sbj, t + 1, :, : k[sbj, t + 1]]
                        reprs_bar = (
                            reprs
                            + (alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T))
                            + (alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T))
                            + beta[sbj] * mu_u_bar_t
                        )
                    k[sbj, t] = self.choose_model_order_K(
                        reprs_bar,
                        self.degrees[sbj, t, :],
                        max_K,
                        opt=opt_K,
                    )
                    _, v_col[sbj, t, :, : k[sbj, t]] = eigs(
                        reprs_bar, k=k[sbj, t], which="LM"
                    )
                    eig_val = eigvals(
                        v_col[sbj, t, :, : k[sbj, t]].T
                        @ v_col_pv[sbj, t, :, : k[sbj, t]]
                    )
                    objective[itr] = objective[itr] + np.sum(np.abs(eig_val), axis=0)
                    if monitor_convergence:
                        diffU = diffU + (
                            Distances.hamming_distance(
                                v_col[sbj, t, :, : k[sbj, t]]
                                @ v_col[sbj, t, :, : k[sbj, t]].T,
                                v_col_pv[sbj, t, :, : k[sbj, t]]
                                @ v_col_pv[sbj, t, :, : k[sbj, t]].T,
                            )
                        )
            if monitor_convergence:
                self.convergence_monitor.append((objective[itr], diffU))
            if itr >= 1:
                diff_obj = objective[itr] - objective[itr - 1]
                if abs(diff_obj) < CONVERGENCE_CRITERIA:
                    break

        if (
            (total_itr > 1)
            and (total_itr == n_iter)
            and (objective[-1] - objective[-2] >= CONVERGENCE_CRITERIA)
        ):
            warnings.warn("MuDCoD does not converge!", RuntimeWarning)

        self.embeddings = v_col
        self.model_order_K = k
        return self.embeddings
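A short usage sketch for fit, again assuming the class is exported as MuDCoD (all sizes illustrative). With monitor_convergence=True, convergence_monitor holds one (objective, ||U_t - U_{t-1}||) tuple for the initialization plus one per iteration, as in the code above:

    import numpy as np
    from mudcod.community_detection import MuDCoD

    ns, th, n = 4, 3, 60
    rng = np.random.default_rng(1)
    adj = np.zeros((ns, th, n, n))
    for s in range(ns):
        for t in range(th):
            a = np.triu((rng.random((n, n)) < 0.1).astype(float), 1)
            adj[s, t] = a + a.T

    model = MuDCoD(verbose=False)
    emb = model.fit(adj, max_K=6, monitor_convergence=True)
    print(emb.shape)                       # (ns, th, n, 6)
    print(model.model_order_K.shape)       # (ns, th), per-network K estimates
    print(len(model.convergence_monitor))  # initialization entry + iterations run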
def fit_predict(self, adj, alpha=None, beta=None, n_iter=30, max_K=None, opt_K='empirical', monitor_convergence=False)
- Predicts multi-subject time series of community memberships given the adjacency matrices of the dynamic networks for each subject.

Parameters
- adj : np.ndarray of shape (ns, th, n, n)
  Multi-subject time series of adjacency matrices, where n is the number of vertices, th is the time horizon, and ns is the number of subjects.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- beta : np.ndarray of shape (ns,), default=0.01 * ones(ns)
  Tuning parameter for smoothing along the subject axis.
- n_iter : int, default=30
  Determines the number of iterations to run MuDCoD.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- monitor_convergence : bool, default=False
  Controls whether the method saves ||U_t - U_{t-1}|| and |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

Returns
- z_pred : np.ndarray of shape (ns, th, n)
  Predicted community membership labels of vertices at each time point for each subject, where n is the number of vertices, th is the time horizon, and ns is the number of subjects.
def predict(self)
- Predicts community memberships of vertices at each time point for each subject.

Returns
- z_pred : np.ndarray of shape (ns, th, n)
  Predicted community membership labels of vertices at each time point for each subject, where n is the number of vertices, th is the time horizon, and ns is the number of subjects.
Source code

    def predict(self):
        """
        Predicts community memberships of vertices at each time point for
        each subject.

        Returns
        -------
        z_pred : `np.ndarray` of shape (ns, th, n)
            Predicted community membership labels of vertices at each time
            point for each subject, where n is the number of vertices, th is
            the time horizon, and ns is the number of subjects.
        """
        ns = self.num_subjects
        th = self.time_horizon
        n = self.num_vertices
        z_pred = np.empty((ns, th, n), dtype=int)
        for t in range(th):
            for sbj in range(ns):
                kmeans = KMeans(n_clusters=self.model_order_K[sbj, t])
                z_pred[sbj, t, :] = kmeans.fit_predict(
                    self.embeddings[sbj, t, :, : self.model_order_K[sbj, t]]
                )
        return z_pred
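One caveat worth noting: KMeans is fit independently for every (subject, time) pair, so the integer labels in z_pred are only identifiable up to permutation within each network. If downstream analysis compares memberships across time points or subjects, align the labelings first. A small illustrative helper (not part of this module) that matches two label vectors via the Hungarian method:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def align_labels(z_ref, z):
        """Permute labels in z to best agree with z_ref."""
        k = max(z_ref.max(), z.max()) + 1
        confusion = np.zeros((k, k), dtype=int)
        for a, b in zip(z_ref, z):
            confusion[a, b] += 1
        # Negate to turn the min-cost assignment into max-agreement matching.
        row, col = linear_sum_assignment(-confusion)
        mapping = {c: r for r, c in zip(row, col)}
        return np.array([mapping[b] for b in z])

    # e.g. align subject 0's labels at t=1 to its labels at t=0:
    # z_aligned = align_labels(z_pred[0, 0], z_pred[0, 1])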
Inherited members
- CommunityDetectionMixin
class PisCES (verbose=False)
Source code

    class PisCES(CommunityDetectionMixin):
        def __init__(self, verbose=False):
            super().__init__("PisCES", verbose=verbose)
            self.convergence_monitor = []

        def fit(
            self,
            adj,
            alpha=None,
            n_iter=30,
            max_K=None,
            opt_K="empirical",
            monitor_convergence=False,
        ):
            """
            Computes the spectral embeddings from a given time series of
            adjacency matrices of the dynamic network.

            Parameters
            ----------
            adj : `np.ndarray` of shape (th, n, n)
                Time series of adjacency matrices of size (th, n, n), where n
                is the number of vertices and th is the time horizon.
            alpha : `np.ndarray` of shape (th, 2), default=0.05 * ones((th, 2))
                Tuning parameter for smoothing along the time axis.
            n_iter : `int`, default=30
                Determines the number of iterations to run PisCES.
            max_K : `int`, default=ceil(n/10)
                Determines the maximum number of communities to predict.
            opt_K : `string`, default='empirical'
                Chooses the technique to estimate K, i.e., the number of communities.
            monitor_convergence : `bool`, default=False
                Controls whether the method saves ||U_t - U_{t-1}|| and
                |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

            Returns
            -------
            embeddings : `np.ndarray` of shape (th, n, max_K)
                Computed and smoothed spectral embeddings of the time series
                of the adjacency matrices.
            """
            self.adj = adj.astype(float)
            self.num_vertices = self.adj.shape[1]
            self.time_horizon = self.adj.shape[0]
            self.degrees = np.empty(self.adj.shape[:-1])
            self.lapl_adj = np.empty_like(self.adj)
            if alpha is None:
                alpha = 0.05 * np.ones((self.time_horizon, 2))
            if max_K is None:
                max_K = np.ceil(self.num_vertices / 10).astype(int)
            if self.time_horizon < 2:
                raise ValueError(
                    "Time horizon must be at least 2, otherwise use static spectral clustering."
                )
            assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 3
            assert self.adj.shape[1] == self.adj.shape[2]
            assert isinstance(alpha, np.ndarray) and alpha.shape == (self.time_horizon, 2)
            assert max_K > 0

            th = self.time_horizon
            n = self.num_vertices
            u = np.zeros((th, n, n))
            v_col = np.zeros((th, n, max_K))
            k = np.zeros(th).astype(int) + max_K
            objective = np.zeros(n_iter)
            self.convergence_monitor = []
            diffU = 0

            # Degree-normalized Laplacian representation per time point.
            for t in range(th):
                adj_t = self.adj[t, :, :]
                self.degrees[t, :] = np.sum(np.abs(adj_t), axis=0) + _eps
                sqinv_degree = sqrtm(inv(np.diag(self.degrees[t, :])))
                self.lapl_adj[t, :, :] = sqinv_degree @ adj_t @ sqinv_degree

            # Initialization of k, v_col.
            for t in range(th):
                lapl_adj_t = self.lapl_adj[t, :, :]
                k[t] = self.choose_model_order_K(
                    lapl_adj_t, self.degrees[t, :], max_K, opt=opt_K
                )
                _, v_col[t, :, : k[t]] = eigs(lapl_adj_t, k=k[t], which="LM")
                u[t, :, :] = v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T
                if monitor_convergence:
                    diffU = diffU + (
                        Distances.hamming_distance(
                            self.lapl_adj[t, :, :],
                            v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T,
                        )
                    )
            if monitor_convergence:
                self.convergence_monitor.append((-np.inf, diffU))

            total_itr = 0
            for itr in range(n_iter):
                if self.verbose:
                    print(f"Iteration {itr}/{n_iter} is running.")
                total_itr += 1
                diffU = 0
                v_col_pv = deepcopy(v_col)
                for t in range(th):
                    # reprs = u[t, :, :]
                    reprs = self.lapl_adj[t, :, :]
                    if t == 0:
                        v_col_pv_ktn = v_col_pv[t + 1, :, : k[t + 1]]
                        reprs_bar = reprs + alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T)
                    elif t == th - 1:
                        v_col_pv_ktp = v_col_pv[t - 1, :, : k[t - 1]]
                        reprs_bar = reprs + alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T)
                    else:
                        v_col_pv_ktp = v_col_pv[t - 1, :, : k[t - 1]]
                        v_col_pv_ktn = v_col_pv[t + 1, :, : k[t + 1]]
                        reprs_bar = (
                            reprs
                            + (alpha[t, 0] * (v_col_pv_ktp @ v_col_pv_ktp.T))
                            + (alpha[t, 1] * (v_col_pv_ktn @ v_col_pv_ktn.T))
                        )
                    k[t] = self.choose_model_order_K(
                        reprs_bar, self.degrees[t, :], max_K, opt=opt_K
                    )
                    _, v_col[t, :, : k[t]] = eigs(reprs_bar, k=k[t], which="LM")
                    eig_val = eigvals(v_col[t, :, : k[t]].T @ v_col_pv[t, :, : k[t]])
                    objective[itr] = objective[itr] + np.sum(np.abs(eig_val), axis=0)
                    if monitor_convergence:
                        diffU = diffU + (
                            Distances.hamming_distance(
                                v_col[t, :, : k[t]] @ v_col[t, :, : k[t]].T,
                                v_col_pv[t, :, : k[t]] @ v_col_pv[t, :, : k[t]].T,
                            )
                        )
                if monitor_convergence:
                    self.convergence_monitor.append((objective[itr], diffU))
                if itr >= 1:
                    diff_obj = objective[itr] - objective[itr - 1]
                    if abs(diff_obj) < CONVERGENCE_CRITERIA:
                        break

            if (
                (total_itr > 1)
                and (total_itr == n_iter)
                and (objective[-1] - objective[-2] >= CONVERGENCE_CRITERIA)
            ):
                warnings.warn("PisCES does not converge!", RuntimeWarning)

            self.embeddings = v_col
            self.model_order_K = k
            return self.embeddings

        def predict(self):
            """
            Predicts community memberships of vertices at each time point.

            Returns
            -------
            z_pred : `np.ndarray` of shape (th, n)
                Predicted community membership labels of vertices at each
                time point, where n is the number of vertices and th is the
                time horizon.
            """
            th = self.time_horizon
            n = self.num_vertices
            z_pred = np.empty((th, n), dtype=int)
            for t in range(th):
                kmeans = KMeans(n_clusters=self.model_order_K[t])
                z_pred[t, :] = kmeans.fit_predict(
                    self.embeddings[t, :, : self.model_order_K[t]]
                )
            return z_pred

        def fit_predict(
            self,
            adj,
            alpha=None,
            n_iter=30,
            max_K=None,
            opt_K="empirical",
            monitor_convergence=False,
        ):
            """
            Predicts a time series of community memberships given the
            adjacency matrices of the dynamic network.

            Parameters
            ----------
            adj : `np.ndarray` of shape (th, n, n)
                Time series of adjacency matrices of size (th, n, n), where n
                is the number of vertices and th is the time horizon.
            alpha : `np.ndarray` of shape (th, 2), default=0.05 * ones((th, 2))
                Tuning parameter for smoothing along the time axis.
            n_iter : `int`, default=30
                Determines the number of iterations to run PisCES.
            max_K : `int`, default=ceil(n/10)
                Determines the maximum number of communities to predict.
            opt_K : `string`, default='empirical'
                Chooses the technique to estimate K, i.e., the number of communities.
            monitor_convergence : `bool`, default=False
                Controls whether the method saves ||U_t - U_{t-1}|| and
                |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

            Returns
            -------
            z_pred : `np.ndarray` of shape (th, n)
                Predicted community membership labels of vertices at each
                time point, where n is the number of vertices and th is the
                time horizon.
            """
            self.fit(
                adj,
                alpha=alpha,
                max_K=max_K,
                opt_K=opt_K,
                n_iter=n_iter,
                monitor_convergence=monitor_convergence,
            )
            return self.predict()

        @classmethod
        def cross_validation(
            cls,
            adj,
            num_folds=5,
            alpha=None,
            n_iter=30,
            max_K=None,
            opt_K="empirical",
            n_jobs=1,
        ):
            """
            Performs cross-validation to choose the best value for the alpha
            parameter.

            Parameters
            ----------
            adj : `np.ndarray` of shape (th, n, n)
                Time series of adjacency matrices of size (th, n, n), where n
                is the number of vertices and th is the time horizon.
            num_folds : `int`, default=5
                Number of folds to perform in the cross-validation.
            alpha : `np.ndarray` of shape (th, 2), default=0.05 * ones((th, 2))
                Tuning parameter for smoothing along the time axis.
            n_iter : `int`, default=30
                Determines the number of iterations to run PisCES.
            max_K : `int`, default=ceil(n/10)
                Determines the maximum number of communities to predict.
            opt_K : `string`, default='empirical'
                Chooses the technique to estimate K, i.e., the number of communities.
            n_jobs : `int`, default=1
                The number of parallel `joblib` workers.

            Returns
            -------
            modu : `float`
                Sum over folds of the modularity values, averaged over the th
                networks, for the given alpha.
            logllh : `float`
                Sum over folds of the log-likelihood values, averaged over
                the th networks, for the given alpha.
            """
            adj = adj.astype(float)
            num_vertices = adj.shape[1]
            time_horizon = adj.shape[0]
            if alpha is None:
                alpha = 0.05 * np.ones((time_horizon, 2))
            if max_K is None:
                max_K = np.ceil(num_vertices / 10).astype(int)
            if time_horizon < 2:
                raise ValueError(
                    "Time horizon must be at least 2, otherwise use static spectral clustering."
                )
            assert type(adj) in [np.ndarray, np.memmap] and adj.ndim == 3
            assert adj.shape[1] == adj.shape[2]
            assert alpha.shape == (time_horizon, 2)
            assert max_K > 0

            n = num_vertices
            th = time_horizon
            idx_n = np.arange(n)
            idx = np.c_[np.repeat(idx_n, idx_n.shape), np.tile(idx_n, idx_n.shape)]
            # Random permutation of all n**2 entry indices; folds are
            # contiguous slices of it.
            r = np.random.choice(n**2, size=n**2, replace=False)
            pisces_kwargs = {
                "alpha": alpha,
                "n_iter": n_iter,
                "max_K": max_K,
                "opt_K": opt_K,
            }

            def compute_for_fold(adj, idx_split, n, th, pisces_kwargs={}):
                cv_idx = np.zeros((th, n, n), dtype=bool)
                adj_train = np.zeros((th, n, n))
                adj_train_imputed = np.zeros((th, n, n))
                for t in range(th):
                    idx1, idx2 = idx_split[t, :, 0], idx_split[t, :, 1]
                    cv_idx_t = np.zeros((n, n), dtype=bool)
                    cv_idx_t[idx1, idx2] = True
                    cv_idx_t = np.triu(cv_idx_t) + np.triu(cv_idx_t).T
                    cv_idx[t, :, :] = cv_idx_t
                    # Mask the held-out entries, then impute them by
                    # low-rank eigen-completion.
                    adj_train[t, :, :] = adj[t, :, :]
                    adj_train[t, idx1, idx2] = 0
                    adj_train[t, :, :] = (
                        np.triu(adj_train[t, :, :]) + np.triu(adj_train[t, :, :]).T
                    )
                    adj_train_imputed[t, :, :] = cls.eigen_complete(
                        adj_train[t, :, :], cv_idx_t, 10, 10
                    )
                z_pred = cls().fit_predict(
                    deepcopy(adj_train_imputed[:, :, :]),
                    **pisces_kwargs,
                )
                modu_fold, logllh_fold = 0, 0
                for t in range(th):
                    modu_fold = modu_fold + cls.modularity(
                        adj[t, :, :],
                        adj_train[t, :, :],
                        z_pred[t, :],
                        cv_idx[t, :, :],
                    )
                    logllh_fold = logllh_fold + cls.loglikelihood(
                        adj[t, :, :],
                        adj_train[t, :, :],
                        z_pred[t, :],
                        cv_idx[t, :, :],
                    )
                return modu_fold, logllh_fold

            modu_total = 0
            logllh_total = 0

            def split_train_test(fold_idx):
                pfold = n**2 // num_folds
                start, end = pfold * fold_idx, (fold_idx + 1) * pfold
                test = r[start:end]
                idx_split = idx[test, :]
                return np.tile(idx_split, (th, 1, 1))

            if n_jobs > 1:
                from joblib import Parallel, delayed

                with Parallel(n_jobs=n_jobs) as parallel:  # prefer="processes"
                    loss_zipped = parallel(
                        delayed(compute_for_fold)(
                            adj,
                            split_train_test(fold_idx),
                            n,
                            th,
                            pisces_kwargs=pisces_kwargs,
                        )
                        for fold_idx in range(num_folds)
                    )
                modu_fold, logllh_fold = map(np.array, zip(*loss_zipped))
                modu_total = sum(modu_fold)
                logllh_total = sum(logllh_fold)
            else:
                for fold_idx in range(num_folds):
                    modu_fold, logllh_fold = compute_for_fold(
                        adj, split_train_test(fold_idx), n, th, pisces_kwargs=pisces_kwargs
                    )
                    modu_total = modu_total + modu_fold
                    logllh_total = logllh_total + logllh_fold

            num_adj = th
            return modu_total / num_adj, logllh_total / num_adj
Ancestors
- CommunityDetectionMixin
Static methods
def cross_validation(adj, num_folds=5, alpha=None, n_iter=30, max_K=None, opt_K='empirical', n_jobs=1)
- Performs cross-validation to choose the best value for the alpha parameter.

Parameters
- adj : np.ndarray of shape (th, n, n)
  Time series of adjacency matrices of size (th, n, n), where n is the number of vertices and th is the time horizon.
- num_folds : int, default=5
  Number of folds to perform in the cross-validation.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- n_iter : int, default=30
  Determines the number of iterations to run PisCES.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- n_jobs : int, default=1
  The number of parallel joblib workers.

Returns
- modu : float
  Sum over folds of the modularity values, averaged over the th networks, for the given alpha.
- logllh : float
  Sum over folds of the log-likelihood values, averaged over the th networks, for the given alpha.
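As with the multi-subject variant, each call scores a single alpha setting. A sketch of a small grid search follows; the adj array is assumed to be a (th, n, n) stack of symmetric adjacency matrices, e.g. built as in the PisCES example above:

    import numpy as np
    from mudcod.community_detection import PisCES

    scores = {}
    for a_val in (0.01, 0.05, 0.1):
        alpha = a_val * np.ones((adj.shape[0], 2))
        scores[a_val] = PisCES.cross_validation(adj, num_folds=5, alpha=alpha)

    # scores[a] is a (modularity, log-likelihood) pair; pick, e.g., the
    # alpha maximizing cross-validated modularity.
    best_alpha = max(scores, key=lambda a: scores[a][0])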
Methods
def fit(self, adj, alpha=None, n_iter=30, max_K=None, opt_K='empirical', monitor_convergence=False)
- Computes the spectral embeddings from a given time series of adjacency matrices of the dynamic network.

Parameters
- adj : np.ndarray of shape (th, n, n)
  Time series of adjacency matrices of size (th, n, n), where n is the number of vertices and th is the time horizon.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- n_iter : int, default=30
  Determines the number of iterations to run PisCES.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- monitor_convergence : bool, default=False
  Controls whether the method saves ||U_t - U_{t-1}|| and |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

Returns
- embeddings : np.ndarray of shape (th, n, max_K)
  Computed and smoothed spectral embeddings of the time series of the adjacency matrices.
def fit_predict(self, adj, alpha=None, n_iter=30, max_K=None, opt_K='empirical', monitor_convergence=False)
- Predicts a time series of community memberships given the adjacency matrices of the dynamic network.

Parameters
- adj : np.ndarray of shape (th, n, n)
  Time series of adjacency matrices of size (th, n, n), where n is the number of vertices and th is the time horizon.
- alpha : np.ndarray of shape (th, 2), default=0.05 * ones((th, 2))
  Tuning parameter for smoothing along the time axis.
- n_iter : int, default=30
  Determines the number of iterations to run PisCES.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.
- monitor_convergence : bool, default=False
  Controls whether the method saves ||U_t - U_{t-1}|| and |obj_t - obj_{t-1}| values at each iteration to monitor convergence.

Returns
- z_pred : np.ndarray of shape (th, n)
  Predicted community membership labels of vertices at each time point, where n is the number of vertices and th is the time horizon.
def predict(self)
- Predicts community memberships of vertices at each time point.

Returns
- z_pred : np.ndarray of shape (th, n)
  Predicted community membership labels of vertices at each time point, where n is the number of vertices and th is the time horizon.
Inherited members
- CommunityDetectionMixin
class StaticSpectralCoD (verbose=False)
Source code

    class StaticSpectralCoD(CommunityDetectionMixin):
        def __init__(self, verbose=False):
            super().__init__("StaticSpectralCoD", verbose=verbose)

        def fit(self, adj, max_K=None, opt_K="empirical"):
            """
            Computes the spectral embeddings from the adjacency matrix of the
            network.

            Parameters
            ----------
            adj : `np.ndarray` of shape (n, n)
                Adjacency matrix of size (n, n), where n is the number of
                vertices.
            max_K : `int`, default=ceil(n/10)
                Determines the maximum number of communities to predict.
            opt_K : `string`, default='empirical'
                Chooses the technique to estimate K, i.e., the number of communities.

            Returns
            -------
            embeddings : `np.ndarray` of shape (n, max_K)
                Computed spectral embedding of the adjacency matrix.
            """
            self.adj = adj.astype(float)
            self.num_vertices = self.adj.shape[0]
            self.degrees = np.empty(self.adj.shape[:-1])
            self.lapl_adj = np.empty_like(self.adj)
            if max_K is None:
                max_K = np.ceil(self.num_vertices / 10).astype(int)
            assert type(self.adj) in [np.ndarray, np.memmap] and self.adj.ndim == 2
            assert self.adj.shape[0] == self.adj.shape[1]

            n = self.num_vertices
            # Degree-normalized Laplacian representation.
            self.degrees = np.sum(np.abs(self.adj), axis=0) + _eps
            sqinv_degree = sqrtm(inv(np.diag(self.degrees)))
            self.lapl_adj = sqinv_degree @ self.adj @ sqinv_degree

            v_col = np.zeros((n, max_K))
            k = self.choose_model_order_K(self.lapl_adj, self.degrees, max_K, opt=opt_K)
            _, v_col[:, :k] = eigs(self.lapl_adj, k=k, which="LM")

            self.embeddings = v_col
            self.model_order_K = k
            return self.embeddings

        def predict(self):
            """
            Predicts community memberships of vertices.

            Returns
            -------
            z_pred : `np.ndarray` of shape (n,)
                Predicted community membership labels of each vertex.
            """
            kmeans = KMeans(n_clusters=self.model_order_K)
            z_pred = kmeans.fit_predict(self.embeddings[:, : self.model_order_K])
            return z_pred

        def fit_predict(self, adj, max_K=None, opt_K="empirical"):
            """
            Predicts community memberships of vertices given the adjacency
            matrix of the network.

            Parameters
            ----------
            adj : `np.ndarray` of shape (n, n)
                Adjacency matrix of size (n, n), where n is the number of
                vertices.
            max_K : `int`, default=ceil(n/10)
                Determines the maximum number of communities to predict.
            opt_K : `string`, default='empirical'
                Chooses the technique to estimate K, i.e., the number of communities.

            Returns
            -------
            z_pred : `np.ndarray` of shape (n,)
                Predicted community membership labels of each vertex.
            """
            self.fit(adj, max_K=max_K, opt_K=opt_K)
            return self.predict()
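For a single network snapshot, usage reduces to one call. A small sketch on a toy two-block graph (all sizes and probabilities illustrative):

    import numpy as np
    from mudcod.community_detection import StaticSpectralCoD

    rng = np.random.default_rng(1)
    n = 80  # two planted blocks of 40 vertices each
    probs = np.full((n, n), 0.05)
    probs[:40, :40] = 0.3
    probs[40:, 40:] = 0.3

    a = np.triu((rng.random((n, n)) < probs).astype(float), 1)
    adj = a + a.T

    z = StaticSpectralCoD().fit_predict(adj, max_K=4)
    print(np.bincount(z))  # typically two roughly balanced communities,
                           # though not guaranteed for any single random draw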
Ancestors
- CommunityDetectionMixin
Methods
def fit(self, adj, max_K=None, opt_K='empirical')
- Computes the spectral embeddings from the adjacency matrix of the network.

Parameters
- adj : np.ndarray of shape (n, n)
  Adjacency matrix of size (n, n), where n is the number of vertices.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.

Returns
- embeddings : np.ndarray of shape (n, max_K)
  Computed spectral embedding of the adjacency matrix.
def fit_predict(self, adj, max_K=None, opt_K='empirical')
- Predicts community memberships of vertices given the adjacency matrix of the network.

Parameters
- adj : np.ndarray of shape (n, n)
  Adjacency matrix of size (n, n), where n is the number of vertices.
- max_K : int, default=ceil(n/10)
  Determines the maximum number of communities to predict.
- opt_K : string, default='empirical'
  Chooses the technique to estimate K, i.e., the number of communities.

Returns
- z_pred : np.ndarray of shape (n,)
  Predicted community membership labels of each vertex.
def predict(self)
- Predicts community memberships of vertices.

Returns
- z_pred : np.ndarray of shape (n,)
  Predicted community membership labels of each vertex.
Inherited members
- CommunityDetectionMixin