Coordinates in PyMC & InferenceData Objects
PyMC v4 is here and one of the big changes is that the inference routines (e.g., pm.sample()) will return an ArviZ InferenceData object by default (recent releases of PyMC3 have made this optional). This post is intended to explore a bit about how the InferenceData object works, how it intersects with PyMC parameter arrays, and how the labeling of dimensions and coordinates can make life easier. If you haven't already checked out Oriol Abril's fantastic post about PyMC, coords, and dims, I would strongly encourage you to do so.
Oriol's post focuses on the use of labeled dimensions in PyMC models and how ArviZ acknowledges and utilizes the labels when processing the prior/posterior (e.g., plotting). Here, I want to focus on how named dimensions/coordinates can be used to create parameter arrays, contrasting this with the way things were done in PyMC3. I will also illustrate how these coordinates structure the InferenceData objects in which our posterior is stored.
import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
import xarray as xr
rng = np.random.default_rng(101010)
xarray ¶
The new(ish) ArviZ InferenceData object is basically an xarray Dataset object. xarray describes itself as a generalization of both NumPy arrays and pandas DataFrames. Its design also borrows heavily from the netCDF file specification. But let's see what xarray objects actually look like. Here is a standard numpy array.
npdata = rng.standard_normal(size=(2, 3))
npdata
array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])
Here we will use this numpy array to create an xarray DataArray.
data = xr.DataArray(npdata,
dims=("user", "day"),
coords={"user": ['Alice', 'Bob'],
"day": ["yesterday", "today", "tomorrow"]
}
)
data
<xarray.DataArray (user: 2, day: 3)>
array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'
As you can see, the data hasn't changed at all, but it now comes with all sorts of extra information. Let's get a bit of terminology out of the way. Our xarray object is a DataArray. The array has 2 dimensions: one labeled "user" and the other labeled "day".
data.dims
('user', 'day')
Each dimension has several coordinates. The "user" dimension includes coordinates "Alice" and "Bob". The "day" dimension includes coordinates "yesterday", "today", and "tomorrow".
data.coords
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'
These coordinates can be used instead of standard numerical indices. Here is how we select the value corresponding to "Alice" and "yesterday" from our numpy array.
npdata[0,0]
-0.9141210682943073
And here is how we select that same value from our xarray DataArray.
data.sel(user="Alice", day="yesterday")
<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'
So one benefit of xarray objects is that they have nice, semantically labeled coordinates. You no longer have to remember that npdata[:,0] corresponds to "yesterday" and that npdata[1,:] corresponds to "Bob". But it's even better than that. You don't even need to remember which axis is which. The order in which the dimensions are stored is irrelevant because you are just using the dimensions' labels (though you can still use numerical indexing if you wish).
data.sel(day="yesterday", user="Alice")
<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'
data.sel(user="Alice", day="yesterday")
<xarray.DataArray ()>
array(-0.91412107)
Coordinates:
    user     <U5 'Alice'
    day      <U9 'yesterday'
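Label-based selection isn't limited to scalars either: selecting along just one dimension returns the whole slice across the remaining dimensions, and positional indexing is still available through .isel(). Here is a small sketch using a stand-in array with made-up values:

```python
import numpy as np
import xarray as xr

# Stand-in for the array above, with made-up values
data = xr.DataArray(
    np.arange(6.0).reshape(2, 3),
    dims=("user", "day"),
    coords={"user": ["Alice", "Bob"],
            "day": ["yesterday", "today", "tomorrow"]},
)

# Selecting only on "day" keeps the full "user" dimension
yesterday = data.sel(day="yesterday")
print(yesterday.dims)  # ('user',)

# .isel() provides the old positional behavior when you want it
assert float(data.isel(user=0, day=0)) == float(data.sel(user="Alice", day="yesterday"))
```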
Finally, xarray also permits the creation of objects that include multiple DataArray objects. xarray calls such an object a Dataset. This is essentially what we will be dealing with when we get an ArviZ InferenceData object back after sampling.
var2 = xr.DataArray(rng.standard_normal(size=(2, 2)),
dims=("x", "y"),
coords={"x": [0,1],
"y": [11,42]}
)
var3 = xr.DataArray(rng.standard_normal(size=(2, 2)),
dims=("a", "b"),
coords={"a": [4.2, 11.8],
"b": ['Geneva','London']}
)
ds = xr.Dataset(dict(orig=data,
v2=var2,
v3=var3)
)
ds
<xarray.Dataset>
Dimensions:  (user: 2, day: 3, x: 2, y: 2, a: 2, b: 2)
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'
  * x        (x) int64 0 1
  * y        (y) int64 11 42
  * a        (a) float64 4.2 11.8
  * b        (b) <U6 'Geneva' 'London'
Data variables:
    orig     (user, day) float64 -0.9141 -0.324 1.292 0.5233 0.9571 0.5214
    v2       (x, y) float64 -0.2851 -1.626 -0.08508 0.9506
    v3       (a, b) float64 -0.4798 1.463 0.07777 0.09208
Note that we have 6 dimensions (user, day, x, y, a, and b), but that each variable in our Dataset only uses 2 of them.
To pull each individual variable out as a DataArray, we can use dictionary-style indexing:
ds['orig']
<xarray.DataArray 'orig' (user: 2, day: 3)>
array([[-0.91412107, -0.32402107,  1.29165606],
       [ 0.52327975,  0.95708885,  0.52139111]])
Coordinates:
  * user     (user) <U5 'Alice' 'Bob'
  * day      (day) <U9 'yesterday' 'today' 'tomorrow'
xarray provides loads of indexing options, dimension/coordinate-aware computation, split-apply-combine operations, and much more. Go read up on xarray to discover all of its functionality.
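As a taste of that dimension-aware computation, reductions and arithmetic can refer to dimensions by name rather than by numeric axis. A minimal sketch, again with made-up values:

```python
import numpy as np
import xarray as xr

data = xr.DataArray(
    np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]]),
    dims=("user", "day"),
    coords={"user": ["Alice", "Bob"],
            "day": ["yesterday", "today", "tomorrow"]},
)

# Collapse the "day" dimension by name: one mean per user
per_user = data.mean(dim="day")
print(per_user.sel(user="Alice").item())  # 2.0

# Arithmetic broadcasts by dimension name, so subtracting the
# per-user means centers each user's values with no axis juggling
centered = data - per_user
print(centered.sel(user="Alice").values)  # [-1.  0.  1.]
```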
Example Data ¶
In the examples below, we'll be working with some synthetic data involving two groups: one group receiving an experimental medication and one group receiving a placebo. For each patient, we have their age, sex, systolic blood pressure, and a flag indicating which treatment they have received. Each patient also has a unique ID. Let's generate it.
group_size = 100
systolic_f = rng.normal(loc=123, scale=5, size=group_size)
systolic_m = rng.normal(loc=127, scale=5, size=group_size)
systolic = np.hstack((systolic_f, systolic_m))
sex = (['female'] * group_size) + (['male'] * group_size)
age = rng.normal(loc=50, scale=7, size=len(systolic))
treat_group = rng.integers(low=1, high=3, size=len(systolic))
patient_id = rng.choice(1000, size=len(systolic), replace=False)
systolic = systolic + (.05 * age)
df = pd.DataFrame({'bp':systolic,
'sex':sex,
'age':age,
'treat_group':treat_group})
df.index = patient_id
df.index.name = "patientID"
df.head()
| patientID | bp | sex | age | treat_group |
|---|---|---|---|---|
| 979 | 128.750951 | female | 53.901474 | 2 |
| 924 | 123.935881 | female | 50.596643 | 1 |
| 522 | 121.683066 | female | 57.149881 | 1 |
| 912 | 135.144287 | female | 50.263664 | 2 |
| 787 | 121.420322 | female | 47.802665 | 1 |
Old Method ¶
Let's construct a model in which we estimate the mean of each group. But let's do it the way we might have in PyMC3, using an unlabeled parameter array. Because we need to index into the parameter array numerically, the first thing we have to do is use pandas' factorize to extract category codes (e.g., 0, 1, 2, ...) that numerically indicate group membership.
sex_idx,sex_codes = pd.factorize(df["sex"])
n_sex = len(sex_codes)
with pm.Model() as unlabeled_model:
    # hyper prior
    mu_hyper = pm.Normal('mu_hyper', mu=120, sigma=15)

    # per-group prior
    mu = pm.Normal('mu', mu=mu_hyper, sigma=15, shape=n_sex)

    # likelihood
    likelihood = pm.Normal('likelihood',
                           mu=mu[sex_idx],
                           sigma=15,
                           observed=df['bp'])
with unlabeled_model:
    unlabeled_idata = pm.sample()
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [mu_hyper, mu]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 5 seconds.
unlabeled_idata.posterior
<xarray.Dataset>
Dimensions:   (chain: 2, draw: 1000, mu_dim_0: 2)
Coordinates:
  * chain     (chain) int64 0 1
  * draw      (draw) int64 0 1 2 3 4 5 6 7 8 ... 992 993 994 995 996 997 998 999
  * mu_dim_0  (mu_dim_0) int64 0 1
Data variables:
    mu_hyper  (chain, draw) float64 123.8 128.3 126.2 ... 128.1 132.3 116.9
    mu        (chain, draw, mu_dim_0) float64 124.1 128.6 125.5 ... 123.5 130.5
Attributes:
    created_at:                 2022-06-13T20:09:04.820492
    arviz_version:              0.12.1
    inference_library:          pymc
    inference_library_version:  4.0.0b6
    sampling_time:              4.526270627975464
    tuning_steps:               1000
az.summary(unlabeled_idata, kind="stats", round_to=2)
|  | mean | sd | hdi_3% | hdi_97% |
|---|---|---|---|---|
| mu_hyper | 124.93 | 9.19 | 107.58 | 141.55 |
| mu[0] | 124.95 | 1.54 | 122.00 | 127.76 |
| mu[1] | 129.89 | 1.46 | 127.34 | 132.91 |
We can also investigate the posterior directly. Let's calculate the means of each group.
# average over chains and draws
unlabeled_idata.posterior['mu'].mean(dim=['chain', 'draw'])
<xarray.DataArray 'mu' (mu_dim_0: 2)>
array([124.94933509, 129.89383768])
Coordinates:
  * mu_dim_0  (mu_dim_0) int64 0 1
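Because the posterior is an ordinary xarray Dataset, generic xarray operations apply to it as well. For example, the chain and draw dimensions can be flattened into a single sample dimension with stack(). Here is a sketch using a toy Dataset standing in for unlabeled_idata.posterior:

```python
import numpy as np
import xarray as xr

# Toy stand-in for the posterior: 2 chains, 4 draws, 2 group means
posterior = xr.Dataset(
    {"mu": (("chain", "draw", "mu_dim_0"),
            np.arange(16.0).reshape(2, 4, 2))},
    coords={"chain": [0, 1], "draw": np.arange(4), "mu_dim_0": [0, 1]},
)

# Flatten chain and draw into one "sample" dimension
flat = posterior.stack(sample=("chain", "draw"))
print(dict(flat["mu"].sizes))  # {'mu_dim_0': 2, 'sample': 8}
```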
More practically, we can calculate the probability that the females' mean blood pressure is lower than the males'.
( unlabeled_idata.posterior['mu'].sel(mu_dim_0=0) <
unlabeled_idata.posterior['mu'].sel(mu_dim_0=1) ).mean()
<xarray.DataArray 'mu' ()>
array(0.991)
Ok. We have our posterior and everything seems to be working. But dealing with the posterior is cumbersome. In particular, we have this annoying mu_dim_0. What is that? In our model, mu is a parameter array of shape (2,), so it has one dimension. Because we have not labeled this dimension explicitly, it is automatically labeled for us in the InferenceData object: mu_dim_0. The two coordinates on this dimension are mu_dim_0=0 and mu_dim_0=1. Which of these two refers to the female mean and which refers to the male mean? Good question! It's not immediately clear. We would have to go back and see how we handled our data. Looking back, we can check the categorical codes that pandas generated from our df['sex'] variable.
sex_codes
Index(['female', 'male'], dtype='object')
# first patient's index into the sex_codes map
sex_idx[0]
0
So pandas has assigned the first patient an index of 0, and sex_codes[0]=='female', suggesting that the first patient is female. We can verify this by checking the first patient's entry in the 'sex' column of our original data.
# first patient's value in the 'sex' column
df['sex'].iloc[0]
'female'
So mu_dim_0=0 corresponds to the females. Thus, mu_dim_0=1 must correspond to the males. Got all that? No? It's not particularly easy. Let's see how labeled dimensions and coordinates can help.
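If you do stick with the old method, one way to make that bookkeeping explicit is to build the code-to-label mapping up front, right after calling factorize. A small sketch using a toy column standing in for df['sex']:

```python
import pandas as pd

# Toy stand-in for df['sex']
sex = pd.Series(["female"] * 3 + ["male"] * 2)
sex_idx, sex_codes = pd.factorize(sex)

# Explicit mapping from numeric code to label
code_to_label = dict(enumerate(sex_codes))
print(code_to_label)  # {0: 'female', 1: 'male'}

# Indexing with the codes recovers the original labels
recovered = [code_to_label[i] for i in sex_idx]
assert recovered == list(sex)
```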
New Method ¶
Let's begin by defining sex as a dimension and labeling it as such. We will simultaneously label the coordinates, or values, on this dimension.
coords = {'sex':['female','male']}
Easy enough. What do we do with this? We pass it along to our PyMC model, which will allow us to use it when defining parameter arrays (among other things). Let's see.
with pm.Model(coords=coords) as labeled_model:
    # hyper prior
    mu_hyper = pm.Normal('mu_hyper', mu=120, sigma=15)

    # per-group prior
    mu = pm.Normal('mu', mu=mu_hyper, sigma=15, dims='sex')

    # likelihood
    likelihood = pm.Normal('likelihood',
                           mu=mu[sex_idx],
                           sigma=15,
                           observed=df['bp'])
Notice that we are again asking for an array of parameters, mu. But instead of asking for shape=n_sex, we now ask for dims='sex'. This tells PyMC that we want an array of normally-distributed parameters, one for each coordinate on our 'sex' dimension.
Just like in the previous example, we will index into this array using the sex_idx that pandas generated for us earlier. But how do the values in sex_idx correspond to the coordinates?
# check the sex coordinate for a few patients (by row position)
patient_ids = [0, 75, 150]
for idx in sex_idx[patient_ids]:
    print(coords['sex'][idx])
female
female
male
So we have verified that the indices in sex_idx map onto our coordinates in the desired manner. Let's sample and see how our labeled dimensions and coordinates help us to unpack our posterior.
with labeled_model:
    labeled_idata = pm.sample()
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [mu_hyper, mu]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 4 seconds.
az.summary(labeled_idata, kind="stats", round_to=2)
|  | mean | sd | hdi_3% | hdi_97% |
|---|---|---|---|---|
| mu_hyper | 124.82 | 8.72 | 108.21 | 140.74 |
| mu[female] | 125.02 | 1.48 | 122.23 | 127.67 |
| mu[male] | 129.88 | 1.51 | 127.25 | 132.76 |
We can already see that the entries in our parameter array mu are now indexed by the meaningful labels 'female' and 'male', so we don't have to wonder which summary corresponds to which group. These labels are also useful when interrogating the InferenceData object itself.
# average over chains and draws
labeled_idata.posterior['mu'].mean(dim=['chain', 'draw'])
<xarray.DataArray 'mu' (sex: 2)>
array([125.01960648, 129.87574546])
Coordinates:
  * sex      (sex) <U6 'female' 'male'
Alternatively, we can pull out the two parameters manually, referring to sex and each of the two coordinates ('female' and 'male') instead of the unlabeled dimension mu_dim_0:
labeled_idata.posterior['mu'].sel(sex='female').mean()
<xarray.DataArray 'mu' ()>
array(125.01960648)
Coordinates:
    sex      <U6 'female'
labeled_idata.posterior['mu'].sel(sex='male').mean()
<xarray.DataArray 'mu' ()>
array(129.87574546)
Coordinates:
    sex      <U6 'male'
And here we can compare the two posteriors:
( labeled_idata.posterior['mu'].sel(sex='female') <
labeled_idata.posterior['mu'].sel(sex='male') ).mean()
<xarray.DataArray 'mu' ()>
array(0.9885)
So nearly 99% of the samples are consistent with the conclusion that the females' blood pressures were lower than the males'.
pm.Data ¶
Dimensions/coordinates can also be used when defining pm.Data objects, the standard method of making PyMC models aware of the data you are using (e.g., the data passed to any observed variables). First, let's look at how we might have incorporated data into a model in PyMC3.
with pm.Model() as model:
    # define our one parameter
    coef = pm.Normal('coef', mu=0, sigma=15)

    # our data is used here...
    mean = coef * df['age'].to_numpy()

    # ...and also here, as our observed data
    # likelihood
    likelihood = pm.Normal('likelihood',
                           mu=mean,
                           sigma=15,
                           observed=df['bp'].to_numpy())

    idata = pm.sample()
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [coef]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 4 seconds.
Let's see what's in the resulting InferenceData object.
idata.groups
<bound method InferenceData.groups of Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data>
Now let's see how we can use pm.Data and incorporate a data-relevant dimension.
coords = {'patient_id':df.index}
with pm.Model(coords=coords) as model:
    # here we define pm.Data objects so that PyMC is aware
    # of our model-relevant data
    age = pm.MutableData('age', df['age'], dims='patient_id')
    bloodpressure = pm.MutableData('bloodpressure', df['bp'], dims='patient_id')

    # define our one parameter
    coef = pm.Normal('coef', mu=0, sigma=15)

    # our data is still used here...
    mean = coef * age

    # ...and here, as our observed data
    # likelihood
    likelihood = pm.Normal('likelihood',
                           mu=mean,
                           sigma=15,
                           observed=bloodpressure)

    idata = pm.sample()
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [coef]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 3 seconds.
So what's the big deal? Nothing seems to have changed. But when we look inside the InferenceData object, we can now see that the data we used to define the model is carried around with the posterior (and everything else).
idata.groups
<bound method InferenceData.groups of Inference data with groups:
	> posterior
	> log_likelihood
	> sample_stats
	> observed_data
	> constant_data>
We can see that a new group, constant_data, is included in our InferenceData object. This group contains the data we used when defining the model. Let's take a look.
idata.constant_data
<xarray.Dataset>
Dimensions:        (patient_id: 200)
Coordinates:
  * patient_id     (patient_id) int64 979 924 522 912 787 ... 783 523 896 968 98
Data variables:
    age            (patient_id) float64 53.9 50.6 57.15 ... 52.35 35.73 50.27
    bloodpressure  (patient_id) float64 128.8 123.9 121.7 ... 128.4 138.2 129.2
Attributes:
    created_at:                 2022-06-13T20:09:32.929479
    arviz_version:              0.12.1
    inference_library:          pymc
    inference_library_version:  4.0.0b6
The fact that this data was associated with the patient_id dimension when we created it means that we can now use this dimension to index into the data.
idata.constant_data.sel(patient_id=979)
<xarray.Dataset>
Dimensions:        ()
Coordinates:
    patient_id     int64 979
Data variables:
    age            float64 53.9
    bloodpressure  float64 128.8
Attributes:
    created_at:                 2022-06-13T20:09:32.929479
    arviz_version:              0.12.1
    inference_library:          pymc
    inference_library_version:  4.0.0b6
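Selection with a list of coordinate values works as well, returning a smaller Dataset rather than scalars. A sketch using a hand-built stand-in for idata.constant_data (with the first few patient IDs hard-coded):

```python
import numpy as np
import xarray as xr

# Hand-built stand-in for idata.constant_data
constant_data = xr.Dataset(
    {
        "age": ("patient_id", np.array([53.9, 50.6, 57.1])),
        "bloodpressure": ("patient_id", np.array([128.8, 123.9, 121.7])),
    },
    coords={"patient_id": [979, 924, 522]},
)

# A list of labels yields a sub-Dataset along patient_id
subset = constant_data.sel(patient_id=[979, 522])
print(subset.sizes["patient_id"])  # 2
```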
Wrap-Up ¶
Hopefully this has given you a quick look into the advantages of using coordinates and dimensions and illustrated some basic, but illustrative scenarios. Head over to the PyMC Discourse if you have questions or need additional help!
%load_ext watermark
%watermark -p pymc,arviz,numpy,pandas
pymc  : 4.0.0b6
arviz : 0.12.1
numpy : 1.22.4
pandas: 1.4.2