Menu

Massive univariate linear model.

Source code for mulm.residualizer.residualizer

# -*- coding: utf-8 -*-
##########################################################################
# Created on Created on Thu Feb  6 15:15:14 2020
# Copyright (c) 2013-2021, CEA/DRF/Joliot/NeuroSpin. All rights reserved.
# @author:  Edouard Duchesnay
# @email:   edouard.duchesnay@cea.fr
# @license: BSD 3-clause.
##########################################################################

"""
Residualization of a Y data on possibly adjusted for other variables.
"""

import numpy as np
import mulm
import pandas as pd

[docs]class Residualizer: """ Residualization of a Y data on possibly adjusted for other variables. Example: Y is a (n, p) array of p-dependant variables, we want to residualize for "site" adjusted for "age + sex". 1) Use of DataFrame and formula: 1.1) `Residualizer(data=df, formula_res="site", formula_full=site + age + sex")` 1.2) `Z = get_design_mat(data)` will return the numpy (n, k) array design matrix. Row selection can be done on both Y and design_mat (Cross-val., etc.) 2) Use of raw arrays: if you choose to manually write your design matrix. In this case provide res_mask ie, the residualization mask within your full. model. For example: `Residualizer(mask=[False, True, False, False])` will fit the whole model and residualize on the second regressor, ie, site. 3) `fit(Y, X)` fits the model: Y = b0 + b1 site + b2 age + b3 sex + eps => learn and store b1, b2, b3 4) `transform(Y, X)` residualize Y on X, ie, returns Y - b1 site Example ------- >>> import numpy as np >>> import pandas as pd >>> import scipy.stats as stats >>> from mulm.residualizer import Residualizer >>> import seaborn as sns >>> np.random.seed(1) >>> >>> # Dataset with site effect on age >>> site = np.array([-1] * 50 + [1] * 50) >>> age = np.random.uniform(10, 40, size=100) + 5 * site >>> y = -0.1 * age + site + np.random.normal(size=100) >>> data = pd.DataFrame(dict(y=y, age=age, site=site.astype(object))) >>> >>> # Simple residualization on site >>> res_spl = Residualizer(data, formula_res="site") >>> yres = res_spl.fit_transform(y[:, None], res_spl.get_design_mat()) >>> >>> # Site residualization adjusted for age >>> res_adj = Residualizer(data, formula_res="site", formula_full="age + site") >>> yadj = res_adj.fit_transform(y[:, None], res_adj.get_design_mat()) >>> >>> # Site residualization adjusted for age provides higher correlation, >>> # and lower stderr than simple residualization >>> lm_res = stats.linregress(age, yres.ravel()) >>> lm_adj = stats.linregress(age, yadj.ravel()) >>> >>> np.allclose((lm_res.slope, lm_res.rvalue, lm_res.stderr), >>> (-0.079187578, -0.623733003, 0.0100242219)) True >>> np.allclose((lm_adj.slope, lm_adj.rvalue, lm_adj.stderr), >>> (-0.110779913, -0.7909219758, 0.00865778640)) True """ def __init__(self, data=None, formula_res=None, formula_full=None, contrast_res=None): """ Parameters ---------- data: DataFrame DataFrame containing column to build the design matrix (default None). formula_res: str Residualisation formula. Ex: "site" (default None). formula_full: str Full model (formula) of residualisation containing other variables to adjust for. Ex.: "site + age + sex" (default None). cont_res: boolean array the contrast for residualisation (matches formula_res). Ex: [False, True, False, False]. The default None corresponds to True everywhere. """ if isinstance(data, pd.DataFrame) and isinstance(formula_res, str): if formula_full is None: formula_full = formula_res self.formula_full = formula_full res_terms = mulm.design_matrix(formula=formula_res, data=data)[1].keys() _, self.t_contrasts, self.f_contrasts = \ mulm.design_matrix(formula=formula_full, data=data) # mask of terms in residualize formula within full model self.contrast_res = np.array([cont for term, cont in self.t_contrasts.items() if term in res_terms]).sum(axis=0) == 1 else: self.contrast_res = contrast_res
[docs] def get_design_mat(self, data): design_mat, t_contrasts, f_contrasts = \ mulm.design_matrix(formula=self.formula_full, data=data) assert np.all([self.t_contrasts[k] == t_contrasts[k] for k in self.t_contrasts]), "new data doesn't" return design_mat
[docs] def fit(self, Y, X): """Fit parameters of p linear models where each Y is regressed on X. Parameters ---------- Y: array (n, p) Dependant variables X: array(n, k) Design matrix of independant variables """ if self.contrast_res is None: self.contrast_res = np.ones(X.shape[1]).astype(bool) assert Y.shape[0] == X.shape[0] assert self.contrast_res.shape[0] == X.shape[1], "contrast doesn't match design matrix" self.mod_mulm = mulm.MUOLS(Y, X).fit() return self
[docs] def transform(self, Y, X): """Residualize Y on X. Parameters ---------- Y: array (n, p) Dependant variables X: array(n, k) Design matrix of independant variables Returns ------- Yres: array (n, p) Residualized Y data. """ assert Y.shape[0] == X.shape[0] assert self.contrast_res.shape[0] == X.shape[1], "contrast doesn't match design matrix" return Y - np.dot(X[:, self.contrast_res], self.mod_mulm.coef[self.contrast_res, :])
[docs] def fit_transform(self, Y, X): """Fit parameters of p linear models where each Y is regressed on X. Residualize Y on X. Parameters ---------- Y: array (n, p) Dependant variables X: array(n, k) Design matrix of independant variables Returns ------- Yres: array (n, p) Residualized Y data. """ self.fit(Y, X) return self.transform(Y, X)
[docs]def residualize(Y, data, formula_res, formula_full=None): """Helper function. See Residualizer. """ res = Residualizer(data=data, formula_res=formula_res, formula_full=formula_full) return res.fit_transform(Y, res.get_design_mat(data))
[docs]class ResidualizerEstimator: """Wrap Residualizer into an estimator compatible with sklearn API. Note that to be consistant with sklearn API, here X contains the input variable and Z is the design matrix for residualization. """ def __init__(self, residualizer): """ Parameters ---------- residualizer: Residualizer """ self.residualizer = residualizer self.design_mat_ncol = self.residualizer.contrast_res.shape[0]
[docs] def fit(self, X, y=None): Z, Y = self.upack(X) return self.residualizer.fit(Y, Z)
[docs] def transform(self, X): Z, Y = self.upack(X) return self.residualizer.transform(Y, Z)
[docs] def fit_transform(self, X, y=None): Z, Y = self.upack(X) self.residualizer.fit(Y, Z) return self.residualizer.transform(Y, Z)
[docs] def pack(self, Z, X): """Pack (concat) Z (design matrix) and X to match scikit-learn pipelines. Parameters ---------- Z: array (n, k) the design_matrix X: array (n, p) the input data for scikit-learn: fit(X, y) or transform(X) Returns ------- (n, (k+p)) array: [design_matrix, X] """ return np.hstack([Z, X])
[docs] def upack(self, ZX): """Unpack X and Z (design matrix) from X. Parameters ---------- ZX: array (n, (k+p)) array: [Z, X] Returns ------- Z (design_matrix), X """ return ZX[:, :self.design_mat_ncol], ZX[:, self.design_mat_ncol:]

Follow us

© 2021, pylearn-mulm developers .
Inspired by AZMIND template.