Massive univariate linear model.

Univariate T and F tests¶

Credit: E Duchesnay

Univariate statistics: F-test and T-test with MULM (Massive univariate linear model) compared with statsmodel.

import numpy as np
import mulm
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smfrmla
from collections import OrderedDict

Example 1: Salary dataset¶

Fit model a single model: salary ~ experience + education + management

url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'
df = pd.read_csv(url)

Fit with MULM

Y = np.asarray(df.salary)[:, None].astype(float)
X, t_contrasts, f_contrasts = mulm.design_matrix(formula="experience + education + management", data=df)
mod_mulm = mulm.MUOLS(Y, X).fit()
tstat_mulm = OrderedDict((term, mod_mulm.t_test(t_contrasts[term], pval=True)) for term in t_contrasts)
fstat_mulm = OrderedDict((term, mod_mulm.f_test(f_contrasts[term], pval=True)) for term in f_contrasts)

print(mod_mulm.coef)
print(pd.DataFrame(tstat_mulm, index=['tstat', 'pval', 'df']).T)
# print(pd.DataFrame(fstat_mulm, index=['fstat', 'pval']).T)

Out:

[[8035.59762964]
 [3144.03520995]
 [2996.21026198]
 [6883.53100596]
 [ 546.1840189 ]]
                                      tstat                        pval                   df
Intercept             [[20.78050375184039]]  [[2.1992317660106647e-23]]  [41.00000000000001]
education[T.Master]   [[8.685941421521713]]   [[7.733200691899716e-11]]  [41.00000000000001]
education[T.Ph.D]     [[7.276722572205208]]   [[6.722373055820008e-09]]  [41.00000000000001]
management[T.Y]      [[21.927731174885658]]  [[2.9014441220138002e-24]]  [41.00000000000001]
experience           [[17.896410931586338]]   [[5.546313438178072e-21]]  [41.00000000000001]

Fit with statsmodel

mod_sm = smfrmla.ols('salary ~ experience + education + management', df).fit()
print(mod_sm.summary())

fstat_sm = sm.stats.anova_lm(mod_sm, typ=2) # Type 2 ANOVA DataFrame
tstat_sm = OrderedDict((term, (mod_sm.t_test(t_contrasts[term]).tvalue,
                               mod_sm.t_test(t_contrasts[term]).pvalue,
                               mod_sm.t_test(t_contrasts[term]).df_denom)) for term in t_contrasts)


print(fstat_sm)
print(pd.DataFrame(tstat_mulm, index=['tstat', 'pval', 'df']).T)

Out:

OLS Regression Results
==============================================================================
Dep. Variable:                 salary   R-squared:                       0.957
Model:                            OLS   Adj. R-squared:                  0.953
Method:                 Least Squares   F-statistic:                     226.8
Date:              ven., 19 mars 2021   Prob (F-statistic):           2.23e-27
Time:                        14:04:16   Log-Likelihood:                -381.63
No. Observations:                  46   AIC:                             773.3
Df Residuals:                      41   BIC:                             782.4
Df Model:                           4
Covariance Type:            nonrobust
=======================================================================================
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept            8035.5976    386.689     20.781      0.000    7254.663    8816.532
education[T.Master]  3144.0352    361.968      8.686      0.000    2413.025    3875.045
education[T.Ph.D]    2996.2103    411.753      7.277      0.000    2164.659    3827.762
management[T.Y]      6883.5310    313.919     21.928      0.000    6249.559    7517.503
experience            546.1840     30.519     17.896      0.000     484.549     607.819
==============================================================================
Omnibus:                        2.293   Durbin-Watson:                   2.237
Prob(Omnibus):                  0.318   Jarque-Bera (JB):                1.362
Skew:                          -0.077   Prob(JB):                        0.506
Kurtosis:                       2.171   Cond. No.                         33.5
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                  sum_sq    df           F        PR(>F)
education   9.152624e+07   2.0   43.351589  7.672450e-11
management  5.075724e+08   1.0  480.825394  2.901444e-24
experience  3.380979e+08   1.0  320.281524  5.546313e-21
Residual    4.328072e+07  41.0         NaN           NaN
                                      tstat                        pval                   df
Intercept             [[20.78050375184039]]  [[2.1992317660106647e-23]]  [41.00000000000001]
education[T.Master]   [[8.685941421521713]]   [[7.733200691899716e-11]]  [41.00000000000001]
education[T.Ph.D]     [[7.276722572205208]]   [[6.722373055820008e-09]]  [41.00000000000001]
management[T.Y]      [[21.927731174885658]]  [[2.9014441220138002e-24]]  [41.00000000000001]
experience           [[17.896410931586338]]   [[5.546313438178072e-21]]  [41.00000000000001]

Check equality of model parameters

assert np.allclose(np.asarray(mod_sm.params), mod_mulm.coef[:, 0])

Check equality of F-statistics

assert np.allclose(np.asarray(fstat_sm.loc[:, "F"][:-1]),
    np.asarray([fstat_mulm[iv][0][0] for iv in list(f_contrasts.keys())[1:]]))

Check equality of P-values

assert np.allclose(np.asarray(fstat_sm.loc[:, "PR(>F)"][:-1]),
    np.asarray([fstat_mulm[iv][1][0] for iv in list(f_contrasts.keys())[1:]]))

Example 2: Multiple targets: y_i = age + sex + site¶

dataset with 3 dependant variables yi’s defined as: yi = b0 + b1 * age + b2 * sex + b3 * site

age = np.random.normal(size=100)
sex = np.random.choice([0, 1], 100)
sex_c = ["X%i" % i for i in sex]
site = np.array([2] * 25 + [1] * 25 + [-1] * 50)
site_c = ["S%i" % i for i in site]

# Independent variables
x_df = pd.DataFrame(OrderedDict(age=age, sex=sex_c, site=site_c))

# Dependent variables
y_dict = OrderedDict(
    y0 = 0.1 * age + 0.1 * sex + site + np.random.normal(size=100), # age and sex
    y1 = 0.1 * age + 0.0 * sex + site + np.random.normal(size=100), # age only
    y2 = 0.0 * age + 0.1 * sex + site + np.random.normal(size=100)) # sex only

# for i in range(3, 10):
#    y_dict["y%i" % i] = 0.0 * age + 0.0 * sex + site + np.random.normal(size=100)

y_df = pd.DataFrame(y_dict)
Y = np.asarray(y_df)

data = pd.concat((y_df, x_df), axis=1)

Fit with MULM

X, t_contrasts, f_contrasts = mulm.design_matrix(formula="age + sex + site", data=x_df)
mod_mulm = mulm.MUOLS(Y, X).fit()
fstat_mulm = OrderedDict((term, mod_mulm.f_test(f_contrasts[term], pval=True)) for term in f_contrasts)

With statsmodels

import statsmodels.api as sm
import statsmodels.formula.api as smfrmla

mod_sm = {dv:smfrmla.ols("%s ~ %s" % (dv, "age + sex + site"), data=data).fit() for dv in y_df.columns}