import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import xarray as xr
%matplotlib inline

# Use ArviZ's "arviz-darkgrid" style for every figure drawn in this notebook.
az.style.use("arviz-darkgrid")
# Seeded NumPy generator: all random draws below come from `rng`, so the
# notebook is reproducible from a fresh kernel.
rng = np.random.default_rng(101010)

/home/xian/anaconda3/envs/pymc-dev-py39/lib/python3.9/site-packages/pkg_resources/__init__.py:123: PkgResourcesDeprecationWarning: main is an invalid version and will not be supported in a future release
  warnings.warn(


# Draw a plain 2x3 NumPy array of standard-normal values and display it.
npdata = rng.standard_normal(size=(2, 3))
npdata

array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])


# Wrap the raw array in an xarray.DataArray: the first axis is named "user",
# the second "day", and each position along an axis gets a string label.
data = xr.DataArray(
    npdata,
    dims=("user", "day"),
    coords={
        "user": ["Alice", "Bob"],
        "day": ["yesterday", "today", "tomorrow"],
    },
)
data

<xarray.DataArray (user: 2, day: 3)>
array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'

array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])

array(['Alice', 'Bob'], dtype='<U5')

array(['yesterday', 'today', 'tomorrow'], dtype='<U9')


# Names of the array's dimensions, in axis order.
data.dims

('user', 'day')


# Coordinate labels attached to each dimension.
data.coords

Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'


# Positional indexing on the raw NumPy array: first row, first column.
npdata[0,0]

-0.9141210682943073


# Label-based selection on the DataArray; picks the same element as
# npdata[0, 0] but by coordinate label instead of integer position.
data.sel(user="Alice", day="yesterday")

<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'

array(-0.91412107)

array('Alice', dtype='<U5')

array('yesterday', dtype='<U9')


# Because selection is by dimension name, keyword order is irrelevant:
# this returns the same element as with user given first.
data.sel(day="yesterday", user="Alice")

<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'

array(-0.91412107)

array('Alice', dtype='<U5')

array('yesterday', dtype='<U9')


# Same label-based selection again, with the keywords in (user, day) order.
data.sel(user="Alice", day="yesterday")

<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'

array(-0.91412107)

array('Alice', dtype='<U5')

array('yesterday', dtype='<U9')


# Two further 2x2 arrays, each with its own pair of dimension names and
# coordinate labels (integer, float, and string labels are all allowed).
var2 = xr.DataArray(
    rng.standard_normal(size=(2, 2)),
    dims=("x", "y"),
    coords={"x": [0, 1], "y": [11, 42]},
)
var3 = xr.DataArray(
    rng.standard_normal(size=(2, 2)),
    dims=("a", "b"),
    coords={"a": [4.2, 11.8], "b": ["Geneva", "London"]},
)

# A Dataset is a dict-like collection of DataArrays; the three variables
# here do not share any dimensions.
ds = xr.Dataset({"orig": data, "v2": var2, "v3": var3})
ds

<xarray.Dataset>
Dimensions:  (user: 2, day: 3, x: 2, y: 2, a: 2, b: 2)
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'
  * x        (x) int64 0 1
  * y        (y) int64 11 42
  * a        (a) float64 4.2 11.8
  * b        (b) <U6 'Geneva' 'London'
Data variables:
    orig     (user, day) float64 -0.9141 -0.324 1.292 0.5233 0.9571 0.5214
    v2       (x, y) float64 -0.2851 -1.626 -0.08508 0.9506
    v3       (a, b) float64 -0.4798 1.463 0.07777 0.09208

array(['Alice', 'Bob'], dtype='<U5')

array(['yesterday', 'today', 'tomorrow'], dtype='<U9')

array([0, 1])

array([11, 42])

array([ 4.2, 11.8])

array(['Geneva', 'London'], dtype='<U6')

array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])

array([[-0.28513891, -1.62619068],
       [-0.08508324,  0.95058622]])

array([[-0.47979087,  1.46327566],
       [ 0.07776975,  0.09207604]])


# Pull one variable back out of the Dataset by name; it is returned as a
# DataArray with its dims and coords intact.
ds['orig']

<xarray.DataArray 'orig' (user: 2, day: 3)>
array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'

array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])

array(['Alice', 'Bob'], dtype='<U5')

array(['yesterday', 'today', 'tomorrow'], dtype='<U9')


# Simulate a small blood-pressure study: `group_size` female and
# `group_size` male patients. The order of the `rng` calls is significant
# for reproducibility and must not be changed.
group_size = 100

# Baseline systolic pressure per sex (different means, same spread).
systolic_f = rng.normal(loc=123, scale=5, size=group_size)
systolic_m = rng.normal(loc=127, scale=5, size=group_size)
systolic = np.concatenate((systolic_f, systolic_m))
sex = ['female'] * group_size + ['male'] * group_size

# Per-patient covariates, one value for every simulated patient.
n_patients = len(systolic)
age = rng.normal(loc=50, scale=7, size=n_patients)
treat_group = rng.integers(low=1, high=3, size=n_patients)  # values 1 or 2
patient_id = rng.choice(1000, size=n_patients, replace=False)  # unique IDs

# Add a small linear age effect to the baseline pressure.
systolic = systolic + 0.05 * age

# Assemble the table, indexed by the (unique) patient ID.
df = pd.DataFrame(
    {
        'bp': systolic,
        'sex': sex,
        'age': age,
        'treat_group': treat_group,
    },
    index=pd.Index(patient_id, name="patientID"),
)
df.head()


# Integer-encode the 'sex' column: sex_idx holds one integer code per row,
# sex_codes holds the unique labels those codes point to.
sex_idx,sex_codes = pd.factorize(df["sex"])
# Number of distinct groups; used as the shape of the per-group parameter.
n_sex = len(sex_codes)


# "Old" approach: the per-group parameter is sized with a bare `shape`, so
# its dimension carries no name or labels.
with pm.Model() as unlabeled_model:
    # Shared hyper-prior for the group means.
    mu_hyper = pm.Normal("mu_hyper", mu=120, sigma=15)

    # One mean per sex, centred on the hyper-prior.
    mu = pm.Normal("mu", mu=mu_hyper, sigma=15, shape=n_sex)

    # Each observation uses the mean of its own group, looked up via the
    # integer codes in sex_idx.
    likelihood = pm.Normal(
        "likelihood",
        mu=mu[sex_idx],
        sigma=15,
        observed=df["bp"],
    )


# Sample the posterior with pm.sample()'s defaults; the recorded run used
# NUTS with 2 chains of 1,000 tuning + 1,000 draw iterations.
with unlabeled_model:
    unlabeled_idata = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [mu_hyper, mu]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 3 seconds.


# The posterior group is an xarray Dataset. Note the auto-generated
# dimension name `mu_dim_0` with integer coordinates 0 and 1.
unlabeled_idata.posterior

<xarray.Dataset>
Dimensions:   (chain: 2, draw: 1000, mu_dim_0: 2)
Coordinates:
  * chain     (chain) int64 0 1
  * draw      (draw) int64 0 1 2 3 4 5 6 7 8 ... 992 993 994 995 996 997 998 999
  * mu_dim_0  (mu_dim_0) int64 0 1
Data variables:
    mu_hyper  (chain, draw) float64 123.0 114.9 126.2 ... 130.0 123.8 125.5
    mu        (chain, draw, mu_dim_0) float64 124.9 129.1 124.0 ... 125.9 127.9
Attributes:
    created_at:                 2022-06-04T19:46:36.434190
    arviz_version:              0.12.1
    inference_library:          pymc
    inference_library_version:  4.0.0b6
    sampling_time:              3.3800315856933594
    tuning_steps:               1000

array([0, 1])

array([  0,   1,   2, ..., 997, 998, 999])

array([0, 1])

array([[123.00435007, 114.86059768, 126.22443518, ..., 106.6128855 ,
        123.30495539, 123.30495539],
       [120.54586625, 116.84453927, 137.36321143, ..., 130.00975918,
        123.78130594, 125.50826176]])

array([[[124.85311034, 129.06437058],
        [124.04599364, 129.73002699],
        [124.74487894, 130.18939123],
        ...,
        [126.171595  , 130.55412392],
        [125.55071625, 129.97903848],
        [125.55071625, 129.97903848]],

       [[124.74847894, 129.09550215],
        [122.20259379, 129.8071807 ],
        [126.15732607, 129.2269007 ],
        ...,
        [126.18684081, 129.06940765],
        [124.92729948, 130.69099774],
        [125.88147002, 127.89700098]]])


# Tabular posterior summary (mean, sd, HDI, MCSE, ESS, r_hat); the group
# means appear only as mu[0] and mu[1].
az.summary(unlabeled_idata)


# Posterior mean of each group's mu: average over the chain and draw
# dimensions, leaving only the (unlabeled) mu_dim_0 dimension.
unlabeled_idata.posterior['mu'].mean(dim=['chain', 'draw'])

<xarray.DataArray 'mu' (mu_dim_0: 2)>
array([124.98313775, 129.87418843])
Coordinates:
  * mu_dim_0  (mu_dim_0) int64 0 1

array([124.98313775, 129.87418843])

array([0, 1])


# Fraction of posterior draws in which mu for group 0 is below mu for
# group 1. The reader has to remember which integer means which sex.
( unlabeled_idata.posterior['mu'].sel(mu_dim_0=0) < 
  unlabeled_idata.posterior['mu'].sel(mu_dim_0=1) ).mean()

<xarray.DataArray 'mu' ()>
array(0.993)

array(0.993)


# The labels behind the integer codes: position in this Index == code.
sex_codes

Index(['female', 'male'], dtype='object')


# Integer code of the first row's sex, i.e. its position in sex_codes.
sex_idx[0]

0


# The first row's original label in the 'sex' column; it should equal
# sex_codes[sex_idx[0]].
df['sex'].iloc[0]

'female'


# Coordinate labels for a named 'sex' dimension. The order must match the
# integer codes in sex_idx (female -> 0, male -> 1, as shown by sex_codes).
coords = {'sex':['female','male']}


# "New" approach: register the coordinates on the model and size the
# per-group parameter with a named dimension instead of a bare shape.
with pm.Model(coords=coords) as labeled_model:
    # Shared hyper-prior for the group means.
    mu_hyper = pm.Normal("mu_hyper", mu=120, sigma=15)

    # One mean per entry of the 'sex' coordinate.
    mu = pm.Normal("mu", mu=mu_hyper, sigma=15, dims="sex")

    # Observations still pick their group's mean via the integer codes.
    likelihood = pm.Normal(
        "likelihood",
        mu=mu[sex_idx],
        sigma=15,
        observed=df["bp"],
    )


# Look up the 'sex' coordinate label for a few rows of the data.
# NOTE: these are row positions into sex_idx, not values of the patientID index.
patient_ids = [0, 75, 150]
sex_labels = coords['sex']
for idx in sex_idx[patient_ids]:
    print(sex_labels[idx])

female
female
male


# Sample the labeled model with the same default settings as before.
with labeled_model:
    labeled_idata = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [mu_hyper, mu]

Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 3 seconds.


# Same summary table, but rows are now labeled mu[female] / mu[male].
az.summary(labeled_idata)


# Posterior mean of each group's mu: average over chains and draws; the
# remaining dimension is now the named, labeled 'sex' dimension.
labeled_idata.posterior['mu'].mean(dim=['chain', 'draw'])

<xarray.DataArray 'mu' (sex: 2)>
array([125.01577402, 129.84107401])
Coordinates:
  * sex      (sex) <U6 'female' 'male'

array([125.01577402, 129.84107401])

array(['female', 'male'], dtype='<U6')


# Posterior mean of mu for the 'female' group, selected by label.
labeled_idata.posterior['mu'].sel(sex='female').mean()

<xarray.DataArray 'mu' ()>
array(125.01577402)
Coordinates:
    sex      <U6 'female'

array(125.01577402)

array('female', dtype='<U6')


# Posterior mean of mu for the 'male' group, selected by label.
labeled_idata.posterior['mu'].sel(sex='male').mean()

<xarray.DataArray 'mu' ()>
array(129.84107401)
Coordinates:
    sex      <U6 'male'

array(129.84107401)

array('male', dtype='<U6')


# Fraction of posterior draws in which the female mean is below the male
# mean; the comparison now reads directly from the labels.
( labeled_idata.posterior['mu'].sel(sex='female') <
  labeled_idata.posterior['mu'].sel(sex='male') ).mean()

<xarray.DataArray 'mu' ()>
array(0.9915)

array(0.9915)


# Integer-encode the categorical predictors so they can enter the linear
# predictor. pd.factorize numbers labels in order of first appearance, so
# code 0 is whichever label occurs first in the column.
group_idx,group_codes = pd.factorize(df['treat_group'])
sex_idx,sex_codes = pd.factorize(df['sex'])

# One coordinate value per patient: the DataFrame's patientID index.
coords = {"patientID": df.index}
with pm.Model(coords=coords) as fullModel:
    # Hyper-prior: common centre for all per-patient intercepts.
    # (Variable name fixed from the misspelled 'intercent_hyper'; outputs
    # recorded before this fix still show the old spelling until re-run.)
    intercept_group = pm.Normal('intercept_hyper', mu=120, sigma=5)
    # Per-patient prior: one intercept for each entry of the patientID coord.
    intercept_patient = pm.Normal("intercept_patient", mu=intercept_group, sigma=15, dims="patientID")
    # Scalar regression coefficients shared by all patients.
    b_sex = pm.Normal("b_sex", mu=0, sigma=5)
    b_age = pm.Normal("b_age", mu=0, sigma=5)
    b_treatment = pm.Normal("b_treatment", mu=0, sigma=5)

    # Linear predictor: patient intercept plus sex, age and treatment effects.
    mu = intercept_patient + (b_sex * sex_idx) + (b_age * df['age'].to_numpy()) + (b_treatment * group_idx)
    # Likelihood of the observed blood pressures around the linear predictor.
    likelihood = pm.Normal('likelihood', mu=mu, sigma=5, observed=df['bp'])

    # Longer tuning/sampling and a higher target_accept than the defaults.
    idata = pm.sample(4000, tune=4000, target_accept=0.9, chains=4)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 2 jobs)
NUTS: [intercent_hyper, intercept_patient, b_sex, b_age, b_treatment]

Sampling 4 chains for 4_000 tune and 4_000 draw iterations (16_000 + 16_000 draws total) took 123 seconds.


# Inspect the posterior summary statistics. The leading "~" in var_names
# excludes that variable, so the per-patient intercepts (one row per
# patient) are left out of the table.
az.summary(idata, var_names=['~intercept_patient'])


# Plot the posterior of the same variables (per-patient intercepts excluded)
# on a 2x2 grid. The trailing semicolon keeps the notebook from echoing the
# call's return value.
az.plot_posterior(idata, var_names=['~intercept_patient'], grid=(2,2));


# Record the versions of the key packages used for this run (IPython magics
# from the `watermark` extension).
%load_ext watermark
%watermark -p pymc,arviz,numpy,pandas,matplotlib

pymc      : 4.0.0b6
arviz     : 0.12.1
numpy     : 1.22.4
pandas    : 1.4.2
matplotlib: 3.5.2

	bp	sex	age	treat_group
patientID
979	128.750951	female	53.901474	2
924	123.935881	female	50.596643	1
522	121.683066	female	57.149881	1
912	135.144287	female	50.263664	2
787	121.420322	female	47.802665	1

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
mu_hyper	125.107	9.012	107.842	141.253	0.171	0.122	2773.0	1555.0	1.0
mu[0]	124.983	1.484	122.392	127.974	0.027	0.019	2898.0	1531.0	1.0
mu[1]	129.874	1.461	127.117	132.524	0.027	0.019	2902.0	1537.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
mu_hyper	124.935	9.011	107.734	141.650	0.173	0.123	2728.0	1361.0	1.0
mu[female]	125.016	1.464	122.195	127.528	0.030	0.021	2347.0	1394.0	1.0
mu[male]	129.841	1.527	127.085	132.664	0.028	0.020	2989.0	1338.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
intercent_hyper	120.746	4.137	112.950	128.308	0.298	0.211	195.0	469.0	1.03
b_sex	4.226	2.047	0.241	8.006	0.033	0.024	3798.0	6611.0	1.00
b_age	0.092	0.085	-0.064	0.254	0.006	0.004	201.0	517.0	1.03
b_treatment	-0.082	2.018	-3.844	3.736	0.033	0.023	3745.0	6279.0	1.00

PyMC & Inference Data

xarray

Example Data

Old Method

New Method

Full Example

Wrap-Up