Massive univariate linear model.
Source code for mulm.residualizer.residualizer
# -*- coding: utf-8 -*-
##########################################################################
# Created on Created on Thu Feb 6 15:15:14 2020
# Copyright (c) 2013-2021, CEA/DRF/Joliot/NeuroSpin. All rights reserved.
# @author: Edouard Duchesnay
# @email: edouard.duchesnay@cea.fr
# @license: BSD 3-clause.
##########################################################################
"""
Residualization of a Y data on possibly adjusted for other variables.
"""
import numpy as np
import mulm
import pandas as pd
[docs]class Residualizer:
""" Residualization of a Y data on possibly adjusted for other variables.
Example: Y is a (n, p) array of p-dependant variables, we want to residualize
for "site" adjusted for "age + sex".
1) Use of DataFrame and formula:
1.1) `Residualizer(data=df, formula_res="site", formula_full=site + age + sex")`
1.2) `Z = get_design_mat(data)` will return the numpy (n, k) array design matrix.
Row selection can be done on both Y and design_mat (Cross-val., etc.)
2) Use of raw arrays: if you choose to manually write your design matrix.
In this case provide res_mask ie, the residualization mask within your full.
model. For example: `Residualizer(mask=[False, True, False, False])` will
fit the whole model and residualize on the second regressor, ie, site.
3) `fit(Y, X)` fits the model:
Y = b0 + b1 site + b2 age + b3 sex + eps
=> learn and store b1, b2, b3
4) `transform(Y, X)` residualize Y on X, ie, returns Y - b1 site
Example
-------
>>> import numpy as np
>>> import pandas as pd
>>> import scipy.stats as stats
>>> from mulm.residualizer import Residualizer
>>> import seaborn as sns
>>> np.random.seed(1)
>>>
>>> # Dataset with site effect on age
>>> site = np.array([-1] * 50 + [1] * 50)
>>> age = np.random.uniform(10, 40, size=100) + 5 * site
>>> y = -0.1 * age + site + np.random.normal(size=100)
>>> data = pd.DataFrame(dict(y=y, age=age, site=site.astype(object)))
>>>
>>> # Simple residualization on site
>>> res_spl = Residualizer(data, formula_res="site")
>>> yres = res_spl.fit_transform(y[:, None], res_spl.get_design_mat())
>>>
>>> # Site residualization adjusted for age
>>> res_adj = Residualizer(data, formula_res="site", formula_full="age + site")
>>> yadj = res_adj.fit_transform(y[:, None], res_adj.get_design_mat())
>>>
>>> # Site residualization adjusted for age provides higher correlation,
>>> # and lower stderr than simple residualization
>>> lm_res = stats.linregress(age, yres.ravel())
>>> lm_adj = stats.linregress(age, yadj.ravel())
>>>
>>> np.allclose((lm_res.slope, lm_res.rvalue, lm_res.stderr),
>>> (-0.079187578, -0.623733003, 0.0100242219))
True
>>> np.allclose((lm_adj.slope, lm_adj.rvalue, lm_adj.stderr),
>>> (-0.110779913, -0.7909219758, 0.00865778640))
True
"""
def __init__(self, data=None, formula_res=None, formula_full=None,
contrast_res=None):
"""
Parameters
----------
data: DataFrame
DataFrame containing column to build the design matrix (default None).
formula_res: str
Residualisation formula. Ex: "site" (default None).
formula_full: str
Full model (formula) of residualisation containing other variables
to adjust for. Ex.: "site + age + sex" (default None).
cont_res: boolean array
the contrast for residualisation (matches formula_res).
Ex: [False, True, False, False]. The default None corresponds to True
everywhere.
"""
if isinstance(data, pd.DataFrame) and isinstance(formula_res, str):
if formula_full is None:
formula_full = formula_res
self.formula_full = formula_full
res_terms = mulm.design_matrix(formula=formula_res, data=data)[1].keys()
_, self.t_contrasts, self.f_contrasts = \
mulm.design_matrix(formula=formula_full, data=data)
# mask of terms in residualize formula within full model
self.contrast_res = np.array([cont for term, cont in self.t_contrasts.items()
if term in res_terms]).sum(axis=0) == 1
else:
self.contrast_res = contrast_res
[docs] def get_design_mat(self, data):
design_mat, t_contrasts, f_contrasts = \
mulm.design_matrix(formula=self.formula_full, data=data)
assert np.all([self.t_contrasts[k] == t_contrasts[k]
for k in self.t_contrasts]), "new data doesn't"
return design_mat
[docs] def fit(self, Y, X):
"""Fit parameters of p linear models where each Y is regressed on X.
Parameters
----------
Y: array (n, p)
Dependant variables
X: array(n, k)
Design matrix of independant variables
"""
if self.contrast_res is None:
self.contrast_res = np.ones(X.shape[1]).astype(bool)
assert Y.shape[0] == X.shape[0]
assert self.contrast_res.shape[0] == X.shape[1], "contrast doesn't match design matrix"
self.mod_mulm = mulm.MUOLS(Y, X).fit()
return self
[docs] def transform(self, Y, X):
"""Residualize Y on X.
Parameters
----------
Y: array (n, p)
Dependant variables
X: array(n, k)
Design matrix of independant variables
Returns
-------
Yres: array (n, p)
Residualized Y data.
"""
assert Y.shape[0] == X.shape[0]
assert self.contrast_res.shape[0] == X.shape[1], "contrast doesn't match design matrix"
return Y - np.dot(X[:, self.contrast_res],
self.mod_mulm.coef[self.contrast_res, :])
[docs] def fit_transform(self, Y, X):
"""Fit parameters of p linear models where each Y is regressed on X.
Residualize Y on X.
Parameters
----------
Y: array (n, p)
Dependant variables
X: array(n, k)
Design matrix of independant variables
Returns
-------
Yres: array (n, p)
Residualized Y data.
"""
self.fit(Y, X)
return self.transform(Y, X)
[docs]def residualize(Y, data, formula_res, formula_full=None):
"""Helper function. See Residualizer.
"""
res = Residualizer(data=data, formula_res=formula_res, formula_full=formula_full)
return res.fit_transform(Y, res.get_design_mat(data))
[docs]class ResidualizerEstimator:
"""Wrap Residualizer into an estimator compatible with sklearn API.
Note that to be consistant with sklearn API, here X contains the input variable
and Z is the design matrix for residualization.
"""
def __init__(self, residualizer):
"""
Parameters
----------
residualizer: Residualizer
"""
self.residualizer = residualizer
self.design_mat_ncol = self.residualizer.contrast_res.shape[0]
[docs] def fit_transform(self, X, y=None):
Z, Y = self.upack(X)
self.residualizer.fit(Y, Z)
return self.residualizer.transform(Y, Z)
[docs] def pack(self, Z, X):
"""Pack (concat) Z (design matrix) and X to match scikit-learn pipelines.
Parameters
----------
Z: array (n, k)
the design_matrix
X: array (n, p)
the input data for scikit-learn: fit(X, y) or transform(X)
Returns
-------
(n, (k+p)) array: [design_matrix, X]
"""
return np.hstack([Z, X])
[docs] def upack(self, ZX):
"""Unpack X and Z (design matrix) from X.
Parameters
----------
ZX: array (n, (k+p))
array: [Z, X]
Returns
-------
Z (design_matrix), X
"""
return ZX[:, :self.design_mat_ncol], ZX[:, self.design_mat_ncol:]
Follow us
© 2021,
pylearn-mulm developers
.
Inspired by AZMIND template.
Inspired by AZMIND template.