Massive univariate linear models: source code for the ``mulm.models`` module.
# -*- coding: utf-8 -*-
##########################################################################
# Created on Tue Jun 25 13:25:41 2013
# Copyright (c) 2013-2021, CEA/DRF/Joliot/NeuroSpin. All rights reserved.
# @author: Edouard Duchesnay
# @email: edouard.duchesnay@cea.fr
# @license: BSD 3-clause.
##########################################################################
"""
Linear models for massive univariate statistics.
"""
import numpy as np
import scipy
from sklearn.preprocessing import scale
from scipy import stats
class MUPairwiseCorr:
    """Mass-univariate pairwise correlations.

    Given two arrays X (n_samples, p) and Y (n_samples, q), fit the p x q
    independent simple linear models. Statistics are returned as (p, q)
    arrays.

    Examples
    --------
    >>> import numpy as np
    >>> from mulm import MUPairwiseCorr
    >>> X = np.random.randn(10, 5)
    >>> Y = np.random.randn(10, 3)
    >>> corr = MUPairwiseCorr(X, Y).fit()
    >>> f, p, df = corr.stats_f()
    >>> print(f.shape)
    (5, 3)
    """

    def __init__(self, X, Y):
        """
        Parameters
        ----------
        X : numpy array (n_samples, p)
            First block of variables.
        Y : numpy array (n_samples, q)
            Second block of variables.

        Raises
        ------
        ValueError
            If X and Y do not have the same number of samples (rows).
        """
        if X.shape[0] != Y.shape[0]:
            raise ValueError('matrices are not aligned')
        # Standardize columns (zero mean, unit variance with ddof=0) so the
        # cross-product X'Y / n directly yields Pearson correlations.
        # Equivalent to sklearn.preprocessing.scale(..., copy=True) but
        # avoids the hard sklearn dependency.
        self.Xs = self._scale(X)
        self.Ys = self._scale(Y)
        self.n_samples = X.shape[0]

    @staticmethod
    def _scale(arr):
        """Return a standardized copy of arr: column-wise zero mean, unit
        (biased, ddof=0) variance. Constant columns are only centered."""
        arr = np.asarray(arr, dtype=float)
        mean = arr.mean(axis=0)
        std = arr.std(axis=0)
        std[std == 0] = 1.0  # avoid division by zero for constant columns
        return (arr - mean) / std

    def fit(self):
        """Compute the (p, q) matrix of pairwise Pearson correlations.

        BUGFIX(restored): ``stats_f`` reads ``self.Corr_`` and the class
        example calls ``fit()``, but the method was missing from this file.

        Returns
        -------
        self
        """
        # Correlation of standardized variables is the mean cross-product.
        self.Corr_ = np.dot(self.Xs.T, self.Ys) / self.n_samples
        return self

    def stats_f(self, pval=True):
        """Compute the F-statistic associated to each pairwise correlation.

        Parameters
        ----------
        pval : boolean
            If True (default), also compute p-values.

        Returns
        -------
        fstats (p, q) array, pvals (p, q) array (or None), df int
        """
        R2 = self.Corr_ ** 2
        df_res = self.n_samples - 2
        # F-test of a simple regression slope: F = R2 * df / (1 - R2).
        f_stats = R2 * df_res / (1 - R2)
        if not pval:
            # BUGFIX: always return a 3-tuple, consistent with the
            # pval=True branch and with the documented return value.
            return f_stats, None, df_res
        p_vals = stats.f.sf(f_stats, 1, df_res)
        return f_stats, p_vals, df_res
class MUOLS:
    """Mass-univariate linear modeling based on Ordinary Least Squares.

    Given two arrays X (n_samples, p) and Y (n_samples, q), fit q
    independent linear models, i.e., for each column y of Y fit: lm(y ~ X).

    Example
    -------
    >>> import numpy as np
    >>> import mulm
    >>> np.random.seed(42)
    >>> # n_samples, nb of features that depend on X and that are pure noise
    >>> n_samples, n_info, n_noise = 100, 2, 100
    >>> beta = np.array([1, 0, 0.5, 0, 2])[:, np.newaxis]
    >>> X = np.random.randn(n_samples, 5)  # Design matrix
    >>> X[:, -1] = 1  # Intercept
    >>> Y = np.random.randn(n_samples, n_info + n_noise)
    >>> Y[:, :n_info] += np.dot(X, beta)  # n_info features depend on X
    >>> contrasts = np.identity(X.shape[1])[:4]  # don't test the intercept
    >>> mod = mulm.MUOLS(Y, X).fit()
    >>> tvals, pvals, df = mod.t_test(contrasts, two_tailed=True)
    >>> print(pvals.shape)
    (4, 102)
    >>> print("Nb of uncorrected p-values <5%:", np.sum(pvals < 0.05))
    Nb of uncorrected p-values <5%: 18
    """

    def __init__(self, Y, X):
        """
        Parameters
        ----------
        Y : numpy array (n_samples, p)
            Dependent (target) variables.
        X : numpy array (n_samples, q)
            Design matrix.

        Raises
        ------
        ValueError
            If X and Y do not have the same number of rows.
        """
        # Coefficients only become available after calling fit().
        self.coef = None
        if X.shape[0] != Y.shape[0]:
            raise ValueError('matrices are not aligned')
        self.X = X  # TODO: perform basic array checks
        self.Y = Y  # TODO: perform basic array checks
def _block_slices(self, dim_size, block_size):
"""Generator that yields slice objects for indexing into
sequential blocks of an array along a particular axis
"""
count = 0
while True:
yield slice(count, count + block_size, 1)
count += block_size
if count >= dim_size:
return
[docs] def fit(self, block=False, max_elements=2 ** 27):
"""Fit p independent linear models, ie., for all y in Y fit: lm(y ~ X).
Parameters
----------
block : boolean
Use block=True for huge matrices Y.
Operations block by block to optimize time and memory.
max_elements : int
block dimension (2**27 corresponds to 1Go)
Returns
-------
self
"""
self.block = block
self.max_elements = max_elements
self.pinv = scipy.linalg.pinv2(self.X)
n, p = self.Y.shape
q = self.X.shape[1]
if self.block:
if self.max_elements < n:
raise ValueError('the maximum number of elements is too small')
max_cols = int(self.max_elements / n)
else:
max_cols = p
self.coef = np.zeros((q, p))
self.err_ss = np.zeros(p)
for pp in self._block_slices(p, max_cols):
if isinstance(self.Y, np.memmap):
Y_block = self.Y[:, pp].copy() # copy to force a read
else: Y_block = self.Y[:, pp]
#Y_block = self.Y[:, pp]
self.coef[:, pp] = np.dot(self.pinv, Y_block)
y_hat = np.dot(self.X, self.coef[:, pp])
err = Y_block - y_hat
del Y_block, y_hat
self.err_ss[pp] = np.sum(err ** 2, axis=0)
del err
return self
[docs] def predict(self, X):
"""Predict Y given a new design matrix X.
Parameters
----------
X : numpy array (n_samples, q)
design matrix of new predictors.
Returns
-------
(n_samples, 1) array of predicted values (X beta)
"""
#from sklearn.utils import safe_asarray
import numpy as np
#X = safe_asarray(X) # TODO PERFORM BASIC CHECK ARRAY
pred_y = np.dot(X, self.coef)
return pred_y
[docs] def t_test(self, contrasts, pval=False, two_tailed=True):
"""Compute T-statistics (t-scores and p-value associated to contrast).
The code has been cloned from the SPM MATLAB implementation.
Parameters
----------
contrasts: array (q, ) or list of arrays or array 2D.
Single contrast (array) or list of contrasts or array of contrasts.
The k contrasts to be tested.
pval: boolean
compute pvalues (default is false)
two_tailed: boolean
one-tailed test or a two-tailed test (default True)
Returns
-------
tstats (k, p) array, pvals (k, p) array, df (k,) array
Example
-------
>>> import numpy as np
>>> import mulm
>>> np.random.seed(42)
>>> # n_samples, nb of features that depends on X and that are pure noise
>>> n_samples, n_info, n_noise = 100, 2, 100
>>> beta = np.array([1, 0, 0.5, 0, 2])[:, np.newaxis]
>>> X = np.random.randn(n_samples, 5) # Design matrix
>>> X[:, -1] = 1 # Intercept
>>> Y = np.random.randn(n_samples, n_info + n_noise)
>>> Y[:, :n_info] += np.dot(X, beta) # n_info features depend from X
>>> contrasts = np.identity(X.shape[1])[:4] # don't test the intercept
>>> mod = mulm.MUOLS(Y, X).fit()
>>> tvals, pvals, df = mod.t_test(contrasts, two_tailed=True)
>>> print(pvals.shape)
(4, 102)
>>> print("Nb of uncorrected p-values <5%:", np.sum(pvals < 0.05))
Nb of uncorrected p-values <5%: 18
"""
contrasts = np.atleast_2d(np.asarray(contrasts))
n = self.X.shape[0]
t_stats_ = list()
p_vals_ = list()
df_ = list()
for contrast in contrasts:
# contrast = contrasts[0]
#ccontrasts = np.asarray(contrasts)
# t = c'beta / std(c'beta)
# std(c'beta) = sqrt(var_err (c'X+)(X+'c))
#Xpinv = scipy.linalg.pinv(X)
cXpinv = np.dot(contrast, self.pinv)
R = np.eye(n) - np.dot(self.X, self.pinv)
df = np.trace(R)
## Broadcast over ss errors
var_errors = self.err_ss / df
std_cbeta = np.sqrt(var_errors * np.dot(cXpinv, cXpinv.T))
t_stats = np.dot(contrast, self.coef) / std_cbeta
p_vals = None
if pval is not None:
if two_tailed:
p_vals = stats.t.sf(np.abs(t_stats), df) * 2
else:
p_vals = stats.t.sf(t_stats, df)
t_stats_.append(t_stats)
p_vals_.append(p_vals)
df_.append(df)
return np.asarray(t_stats_), np.asarray(p_vals_), np.asarray(df_)
[docs] def t_test_maxT(self, contrasts, nperms=1000, two_tailed=True, **kwargs):
"""Correct for multiple comparisons using Westfall and Young, 1993 a.k.a maxT procedure.
It is based on permutation tests procedure. This is the procedure used by FSL (https://fsl.fmrib.ox.ac.uk/).
It should be used when the test statistics, and hence the unadjusted p-values, are dependent.
This is the case when groups of dependant variables (in Y) tend to have highly correlated measures.
Westfall and Young (1993) proposed adjusted p-values for less conservative multiple testing procedures which
take into account the dependence structure among test statistics.
References:
- Anderson M. Winkler "Statistical analysis of areal quantities in the brain through
permutation tests" Ph.D 2017.
- Dudoit et al. "Multiple Hypothesis Testing in Microarray Experiments", Statist. Sci. 2003
Parameters
----------
contrasts: array (q, ) or list of arrays or array 2D.
Single contrast (array) or list of contrasts or array of contrasts.
The k contrasts to be tested.
nperms: int
permutation tests (default 1000).
two_tailed: boolean
one-tailed test or a two-tailed test (default True)
Returns
-------
tstats (k, p) array, pvals (k, p) array corrected for multiple comparisons
df (k,) array.
Examples
--------
>>> import numpy as np
>>> import mulm
>>> np.random.seed(42)
>>> # n_samples, nb of features that depends on X and that are pure noise
>>> n_samples, n_info, n_noise = 100, 2, 100
>>> beta = np.array([1, 0, 0.5, 0, 2])[:, np.newaxis]
>>> X = np.random.randn(n_samples, 5) # Design matrix
>>> X[:, -1] = 1 # Intercept
>>> Y = np.random.randn(n_samples, n_info + n_noise)
>>> Y[:, :n_info] += np.dot(X, beta) # n_info features depend from X
>>> contrasts = np.identity(X.shape[1])[:4] # don't test the intercept
>>> mod = mulm.MUOLS(Y, X).fit()
>>> tvals, pvals, df = mod.t_test(contrasts, two_tailed=True)
>>> print(pvals.shape)
(4, 102)
>>> print("Nb of uncorrected p-values <5%:", np.sum(pvals < 0.05))
Nb of uncorrected p-values <5%: 18
>>> tvals, pvals_corrmaxT, df = mod.t_test_maxT(contrasts, two_tailed=True)
>>> print("Nb of corrected pvalues <5%:", np.sum(pvals_corrmaxT < 0.05))
Nb of corrected pvalues <5%: 4
"""
#contrast = [0, 1] + [0] * (X.shape[1] - 2)
contrasts = np.atleast_2d(np.asarray(contrasts))
tvals, _, df = self.t_test(contrasts=contrasts, pval=False, **kwargs)
max_t = list()
for i in range(nperms):
perm_idx = np.random.permutation(self.X.shape[0])
Xp = self.X[perm_idx, :]
muols = MUOLS(self.Y, Xp).fit(block=self.block,
max_elements=self.max_elements)
tvals_perm, _, _ = muols.t_test(contrasts=contrasts, pval=False,
two_tailed=two_tailed)
if two_tailed:
tvals_perm = np.abs(tvals_perm)
max_t.append(np.max(tvals_perm, axis=1))
del muols
max_t = np.array(max_t)
tvals_ = np.abs(tvals) if two_tailed else tvals
pvalues = np.array(
[np.array([np.sum(max_t[:, con] >= t) for t in tvals_[con, :]])\
/ float(nperms) for con in range(contrasts.shape[0])])
return tvals, pvalues, df
[docs] def t_test_minP(self, contrasts, nperms=10000, two_tailed=True, **kwargs):
"""Correct for multiple comparisons using minP procedure.
References:
- Dudoit et al. "Multiple Hypothesis Testing in Microarray Experiments", Statist. Sci. 2003
Parameters
----------
contrasts: array (q, ) or list of arrays or array 2D.
Single contrast (array) or list of contrasts or array of contrasts.
The k contrasts to be tested.
nperms: int
permutation tests (default 10000).
two_tailed: boolean
one-tailed test or a two-tailed test (default True)
Returns
-------
tstats (k, p) array, pvals (k, p) array corrected for multiple comparisons
df (k,) array.
Examples
--------
>>> import numpy as np
>>> import mulm
>>> np.random.seed(42)
>>> # n_samples, nb of features that depends on X and that are pure noise
>>> n_samples, n_info, n_noise = 100, 2, 100
>>> beta = np.array([1, 0, 0.5, 0, 2])[:, np.newaxis]
>>> X = np.random.randn(n_samples, 5) # Design matrix
>>> X[:, -1] = 1 # Intercept
>>> Y = np.random.randn(n_samples, n_info + n_noise)
>>> Y[:, :n_info] += np.dot(X, beta) # n_info features depend from X
>>> contrasts = np.identity(X.shape[1])[:4] # don't test the intercept
>>> mod = mulm.MUOLS(Y, X).fit()
>>> tvals, pvals, df = mod.t_test(contrasts, two_tailed=True)
>>> print(pvals.shape)
(4, 102)
>>> print("Nb of uncorrected p-values <5%:", np.sum(pvals < 0.05))
Nb of uncorrected p-values <5%: 18
>>> tvals, pval_corrminp, df = mod.t_test_minP(contrasts, two_tailed=True)
>>> print("Nb of corrected pvalues <5%:", np.sum(pval_corrminp < 0.05))
Nb of corrected pvalues <5%: 4
"""
tvals, pvals, df = self.t_test(contrasts=contrasts, pval=True, **kwargs)
min_p = np.ones((contrasts.shape[0], nperms))
perm_idx = np.zeros((self.X.shape[0], nperms + 1), dtype='int')
for i in range(self.Y.shape[1]):
Y_curr = self.Y[:, i]
Yp_curr = np.zeros((self.X.shape[0], nperms + 1))
for j in range(nperms + 1):
if i == 0:
perm_idx[:, j] = np.random.permutation(self.X.shape[0])
Yp_curr[:, j] = Y_curr[perm_idx[:, j]]
muols = MUOLS(Yp_curr, self.X).fit()
tvals_perm, _, _ = muols.t_test(contrasts=contrasts, pval=False,
two_tailed=two_tailed)
if two_tailed:
tvals_perm = np.abs(tvals_perm)
pval_perm = np.array(
[np.array([((np.sum(tvals_perm[con, :] >= tvals_perm[con, k])) - 1) \
for k in range(nperms)]) / float(nperms) \
for con in range(contrasts.shape[0])])
min_p = np.array(
[(np.min(np.vstack((min_p[con, :], pval_perm[con, :])), axis=0)) \
for con in range(contrasts.shape[0])])
pvalues = np.array(
[np.array([np.sum(min_p[con, :] <= p) \
for p in pvals[con, :]]) / float(nperms) \
for con in range(contrasts.shape[0])])
return tvals, pvalues, df
[docs] def f_test(self, contrast, pval=False):
"""Compute F-statistics (F-scores and p-value associated to contrast).
The code has been cloned from the SPM MATLAB implementation.
Parameters
----------
contrasts: array (q, ) or list of arrays or array 2D.
Single contrast (array) or list of contrasts or array of contrasts.
The k contrasts to be tested.
pval: boolean
compute pvalues (default is false)
two_tailed: boolean
one-tailed test or a two-tailed test (default True)
Returns
-------
tstats (k, p) array, pvals (k, p) array, df (k,) array
Example
-------
>>> import numpy as np
>>> import mulm
>>> X = np.random.randn(100, 5)
>>> Y = np.random.randn(100, 10)
>>> beta = np.random.randn(5, 1)
>>> Y[:, :2] += np.dot(X, beta)
>>> contrasts = np.identity(X.shape[1])
>>> mod = mulm.MUOLS(Y, X).fit()
>>> fvals, pvals, df = mod.f_test(contrasts, pval=True)
"""
C1 = np.atleast_2d(contrast).T
n, p = self.X.shape
rank_x = np.linalg.matrix_rank(self.pinv)
C0 = np.eye(p) - np.dot(C1, scipy.linalg.pinv2(C1)) # Ortho. cont. to C1
X0 = np.dot(self.X, C0) # Design matrix of the reduced model
X0pinv = scipy.linalg.pinv2(X0)
rank_x0 = np.linalg.matrix_rank(X0pinv)
# Find the subspace (X1) of Xc1, which is orthogonal to X0
# The projection matrix M due to X1 can be derived from the residual
# forming matrix of the reduced model X0
# R0 is the residual forming matrix of the reduced model
R0 = np.eye(n) - np.dot(X0, X0pinv)
# R is the residual forming matrix of the full model
R = np.eye(n) - np.dot(self.X, self.pinv)
# compute the projection matrix
M = R0 - R
#Ypred = np.dot(self.X, betas)
y_hat = self.predict(self.X)
SS = np.sum(y_hat * np.dot(M, y_hat), axis=0)
df_c1 = rank_x - rank_x0
df_res = n - rank_x
## Broadcast over self.err_ss of Y
f_stats = (SS * df_res) / (self.err_ss * df_c1)
if not pval:
return f_stats, None, df_res
else:
p_vals = stats.f.sf(f_stats, df_c1, df_res)
return f_stats, p_vals, df_res
[docs] def stats_f_coefficients(self, X, Y, contrast, pval=False):
return self.stats_f(contrast, pval=pval)
© 2013-2021, pylearn-mulm developers. Documentation theme inspired by the AZMIND template.