In [1]:
import sys
import numpy as np
import pandas as pd
import scipy.optimize
import time
from itertools import combinations
from joblib import Parallel, delayed
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
# utility functions
def rand(a, sd):
    return a + np.random.normal(0, sd, a.shape)

def sse(params, data):
    sse = np.sum(np.power(model(params, data) - data['y'], 2))
    return sse

Broken Line Model Definition¶

In [3]:
# model definition
# note the new specification

def switch(params, data):
    intercept = params[3]
    slope = params[4]
    switchpoint = intercept + (slope * data['z1'])
    switchpoint = np.clip(switchpoint, 0, 1)
    return switchpoint


def model_mean(params, data):
    # params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]

    intercept = params[0]
    slope = params[1]
    slopeDiff = params[2]
    switchPoint = switch(params, data)
    isPost = data['x1'] > switchPoint

    mu = intercept + (slope * data['x1']) + (isPost * (slopeDiff * (data['x1']-switchPoint)))

    return np.exp(mu)


def model_gen(params, data):
    return scipy.stats.poisson.rvs(model_mean(params, data))


def model_like(params, data):
    return - np.sum(np.log(scipy.stats.poisson.pmf(data['y'], model_mean(params, data))))
In [4]:
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params = [3.5, -0.1, -.25, .4, .5]

zs = np.linspace(0, 1, 5, endpoint=True)
plt.figure(figsize=(10,8))
plt.gca().set_facecolor('xkcd:gray')
plt.gca().set_prop_cycle('color',plt.cm.hot(np.linspace(0,1,len(zs))))

for z in zs:
    x1 = pd.Series(np.linspace(0, 1, 100, endpoint=True), name='x1')
    z1 = pd.Series(z * np.ones(len(x1)), name='z1')
    df = pd.concat([x1, z1], axis=1)
    plt.plot(df['x1'], model_mean(params, df), label='z='+str(z), linewidth=4)

plt.xlabel('x1', fontsize=18)
plt.ylabel('y', fontsize=18)
plt.gca().legend(prop={'size': 16}).get_frame().set_facecolor('xkcd:gray')
plt.show()

Simulated Data¶

Let's generate some synthetic data and see if we can recover the parameter values.

In [5]:
# generate some synthetic data

# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params = [3.5, -1.5, 1.8, .5, 0]

nDataPoints = 1000
df = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints), \
                        'z1': np.random.random(size=nDataPoints)})
df['y'] = model_gen(params, df)
In [6]:
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)
plt.scatter(df['x1'], df['y'], s=20, c=df['z1'], alpha=0.85, cmap='hot')
plt.plot(df['x1'].sort_values(), model_mean(params, pd.DataFrame(data={'x1': df['x1'].sort_values(), 'z1': np.ones(len(df['x1']))*df['z1'].quantile(q=.5)})), color='k', linewidth=4)
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
In [7]:
# optimize
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
paramsInit = [0, -1, -1, 0, 0]
ranges = (slice(-100, 100, 30), slice(-10, 10, 3), slice(-10, 10, 3), slice(-100, 100, 10), slice(-10, 10, 1))
i = (-5,5)
p = (0,5)
n = (-5,0)
bounds = [p,n,p,i,i]

start = time.time()
# hand the sse function to an optimization routine
#result = scipy.optimize.basinhopping(model_like, paramsInit, minimizer_kwargs={'args':df})
#result = scipy.optimize.differential_evolution(model_like, bounds, args=(df,), popsize=100)
result = scipy.optimize.differential_evolution(model_like, bounds, args=(df,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
/home/luhmann/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:31: RuntimeWarning: divide by zero encountered in log
/home/luhmann/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:117: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
Optimization took: 153.39s
In [8]:
#result = scipy.optimize.differential_evolution(model_like, bounds, args=(replicateDF,), popsize=100, mutation=(.7,1), recombination=.5)
[result.x, np.mean(switch(result.x, df)), result.fun, model_like(params, df)]
Out[8]:
[array([ 3.52226715, -1.5589465 ,  1.95016129,  0.50878192,  0.01748064]),
 0.5175969304970778,
 2942.5876629842282,
 2945.9240003820087]
In [9]:
print(result.message)

print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (params - result.x)))
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(params - result.x)/params))
print(f'Log-likelihood:\t\t\t-{result.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params, df):.2f}')
Optimization terminated successfully.
True parameter values:		3.50, -1.50, 1.80, 0.50, 0.00
Estimates:			3.52, -1.56, 1.95, 0.51, 0.02
Differences:			-0.02, 0.06, -0.15, -0.01, -0.02
% Differences:			-0.6%, -3.9%, -8.3%, -1.8%, -inf%
Log-likelihood:			-2942.59
Ground Truth Log-likelihood:	-2945.92
/home/luhmann/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in true_divide
  
In [10]:
# plot simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Fit to Broken Line Data', fontsize=20)
plt.scatter(df['x1'], df['y'], s=20, c=df['z1'], alpha=0.85, cmap='hot')
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')

# plot model fit
#xs = np.arange(df['x1'].min(), df['x1'].max())
xs = np.linspace(df['x1'].min(), df['x1'].max(), len(df['x1']), endpoint=False)
plt.plot(xs, model_mean(result.x, pd.DataFrame(data={'x1': xs, 'z1': np.ones(len(xs))*df['z1'].quantile(q=.5)})), color='r', linewidth=4)
plt.plot(df['x1'].sort_values(), model_mean(params, pd.DataFrame(data={'x1': df['x1'].sort_values(), 'z1': np.ones(len(df['x1']))*df['z1'].quantile(q=.5)})), color='k', linewidth=4)

plt.show()

Simulation Study¶

Use a single set of parameter values to generate a large number of data sets. Use the broken line model to estimate parameters for each synthetic data set. Quantify mean/median of as well as measures of variability of these estimates.

In [11]:
# threaded bootstrap
bootParams = [3.5, -1.5, 1.8, .75, 0]
nDataPoints = 1000

def longFunc():
    # generate replicate data set
    replicateDF = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints), \
                                     'z1': np.random.random(size=nDataPoints)})
    replicateDF['y'] = model_gen(bootParams, replicateDF)

    # optimize
    i = (-5,5)
    p = (0,5)
    n = (-5,0)
    bounds = [p,n,p,[0,1],i]
    result = scipy.optimize.differential_evolution(model_like, bounds, args=(replicateDF,), popsize=100, mutation=(.7,1), recombination=.5)
    return [result.x, np.mean(switch(result.x, replicateDF)), result.fun, model_like(bootParams, replicateDF)]
    #return [result.x, result.fun, model_like(bootParams, replicateDF)]

numReplicates = 100

bootOut = Parallel(n_jobs=3)(delayed(longFunc)() for i in tqdm(range(numReplicates)))
100%|██████████| 100/100 [31:22<00:00, 18.82s/it]
In [12]:
# unpack output of the simulations

parameter_estimates = np.array([i[0] for i in bootOut], dtype=np.float32)
switchpoint_estimates = np.array([i[1] for i in bootOut], dtype=np.float32)
lnLs = np.array([i[2] for i in bootOut], dtype=np.float32)
groundTruthlnL = np.array([i[3] for i in bootOut], dtype=np.float32)
In [13]:
# let's see what the batch of parameter estimates looks like

print('Params:\t\t\t'+', '.join('{:.3f}'.format(f) for f in bootParams))
print('Means:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.mean(parameter_estimates, axis=0)))
print('Medians:\t\t'+', '.join('{:.3f}'.format(f) for f in np.median(parameter_estimates, axis=0)))
print('SDs:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.std(parameter_estimates, axis=0)))
print('MSE:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.mean(np.power(np.array(params)-np.array(parameter_estimates), 2), axis=0)))
print('')
print('Switch Point')
print(f'mean:\t\t\t{np.mean(switchpoint_estimates):.3f}')
print(f'median:\t\t\t{np.median(switchpoint_estimates):.3f}')
print(f'SD:\t\t\t{np.std(switchpoint_estimates):.3f}')
print(f'MSE:\t\t\t{np.mean(np.power(np.array(switchpoint_estimates)-np.array(bootParams[3]), 2), axis=0):.3f}')
print('')
print(f'mean lnL:\t\t{-np.mean(lnLs, axis=0):.2f}')
print(f'mean truelnL:\t\t{-np.mean(groundTruthlnL, axis=0):.2f}')
print(f'mean lnL - truelnL:\t{np.mean(lnLs - groundTruthlnL, axis=0):.2f}')
Params:			3.500, -1.500, 1.800, 0.750, 0.000
Means:			3.499, -1.501, 1.805, 0.750, -0.001
Medians:		3.499, -1.505, 1.776, 0.752, -0.001
SDs:			0.017, 0.044, 0.317, 0.038, 0.042
MSE:			0.000, 0.002, 0.100, 0.064, 0.002

Switch Point
mean:			0.749
median:			0.751
SD:			0.030
MSE:			0.001

mean lnL:		-2814.87
mean truelnL:		-2816.97
mean lnL - truelnL:	-2.10
In [14]:
# let's plot it all to see better

# let's first get the (static) switchpoints and pack them in with the preIntercept, preSlope, and slopeDiff params

paramsForPlotting = np.hstack([np.array(parameter_estimates)[:,0:3], switchpoint_estimates.reshape(-1,1)])
estimatesDF = pd.DataFrame(paramsForPlotting, index=None, columns = ['preIntercept', 'preSlope', 'slopeDiff', 'switchPoint'], dtype=np.float32)

sns.pairplot(estimatesDF)

#for i, col in enumerate(estimatesDF.columns):
#    plt.figure(i)
#    sns.distplot(estimatesDF[col])
Out[14]:
<seaborn.axisgrid.PairGrid at 0x7f3b646b67f0>

More Simulations¶

Let's simulate some data in which z is actually relevant to the switchpoint

In [15]:
# generate some more synthetic data

# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params2 = [3.5, -1.5, 1.8, .33, .33]

nDataPoints = 1000
df2 = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints), \
                        'z1': np.random.random(size=nDataPoints)})
df2['y'] = model_gen(params2, df2)
In [16]:
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)
plt.scatter(df2['x1'], df2['y'], s=20, c=df2['z1'], alpha=0.85, cmap='hot')
#plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(df2['x1']))*df2['z1'].quantile(q=.5)})), color='k', linewidth=4)

#plt.plot(df['x1'].sort_values(), model_mean(params, pd.DataFrame(data={'x1': df['x1'].sort_values(), 'z1': np.ones(len(df['x1']))*df['z1'].quantile(q=.5)})), color='k', linewidth=4)

plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(xs))*df2['z1'].min()})), color='k', linewidth=4)
plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(xs))*df2['z1'].quantile(q=.5)})), color='r', linewidth=4)
plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(xs))*df2['z1'].max()})), color='w', linewidth=4)

plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
In [17]:
# optimize
i = (-5,5)
p = (0,5)
n = (-5,0)
bounds = [p,n,p,[0,1],i]

start = time.time()
result2 = scipy.optimize.differential_evolution(model_like, bounds, args=(df2,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
/home/luhmann/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:31: RuntimeWarning: divide by zero encountered in log
/home/luhmann/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:117: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
Optimization took: 55.76s
/home/luhmann/anaconda3/lib/python3.7/site-packages/scipy/optimize/optimize.py:663: RuntimeWarning: invalid value encountered in double_scalars
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
In [18]:
print(result.message)

print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params2))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result2.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (params2 - result2.x)))
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(params2 - result2.x)/params2))
print(f'Log-likelihood:\t\t\t-{result2.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params2, df2):.2f}')
Optimization terminated successfully.
True parameter values:		3.50, -1.50, 1.80, 0.33, 0.33
Estimates:			3.51, -1.48, 1.66, 0.33, 0.34
Differences:			-0.01, -0.02, 0.14, 0.00, -0.01
% Differences:			-0.4%, 1.2%, 7.9%, 0.0%, -2.7%
Log-likelihood:			-2893.42
Ground Truth Log-likelihood:	-2891.36

Simulations with dependency between x and z¶

In [19]:
# generate some synthetic data

# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params3 = [3.5, -1.5, 1.8, .33, .33]

nDataPoints = 1000
mean = [1,1]
cov = [[.2, .0075], [.0075, .2]]
cov = [[.15, .1], [.1, .15]]
x1, z1 = np.random.multivariate_normal(mean, cov, nDataPoints).T
print(f'x1-z1 correlation:\t-{scipy.stats.pearsonr(x1,z1)[0]:.2f}')

df3 = pd.DataFrame(data={'x1': x1, \
                        'z1': z1})
df3['y'] = model_gen(params3, df3)
x1-z1 correlation:	-0.63
In [20]:
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)

plt.scatter(df3['x1'], df3['y'], s=20, c=df3['z1'], alpha=0.85, cmap='hot', zorder=10)
#plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(df2['x1']))*df2['z1'].quantile(q=.5)})), color='k', linewidth=4)

plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(xs))*df3['z1'].quantile(q=.05)})), color='k', linewidth=4, zorder=5)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(xs))*df3['z1'].quantile(q=.5)})), color='r', linewidth=4, zorder=4)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(xs))*df3['z1'].quantile(q=.95)})), color='w', linewidth=4, zorder=3)

plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
In [21]:
# optimize
i = (-5,5)
p = (0,5)
n = (-5,0)
bounds = [p,n,p,[0,1],i]

start = time.time()
result3 = scipy.optimize.differential_evolution(model_like, bounds, args=(df3,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
/home/luhmann/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:31: RuntimeWarning: divide by zero encountered in log
/home/luhmann/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:117: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
Bootstrapping took: 93.09s
In [22]:
print(result.message)

print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params3))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result3.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (params3 - result3.x)))
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(params3 - result3.x)/params3))
print(f'Log-likelihood:\t\t\t-{result3.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params3, df3):.2f}')
Optimization terminated successfully.
True parameter values:		3.50, -1.50, 1.80, 0.33, 0.33
Estimates:			3.53, -1.55, 1.87, 0.35, 0.32
Differences:			-0.03, 0.05, -0.07, -0.02, 0.01
% Differences:			-0.9%, -3.4%, -3.9%, -6.0%, 2.9%
Log-likelihood:			-2701.88
Ground Truth Log-likelihood:	-2703.09
In [23]:
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)

plt.scatter(df3['x1'], df3['y'], s=20, c=df3['z1'], alpha=0.85, cmap='hot', zorder=10)
#plt.plot(df2['x1'].sort_values(), model_mean(params2, pd.DataFrame(data={'x1': df2['x1'].sort_values(), 'z1': np.ones(len(df2['x1']))*df2['z1'].quantile(q=.5)})), color='k', linewidth=4)

plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.01)})), color='k', linewidth=4, zorder=5)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.5)})), color='r', linewidth=4, zorder=4)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.99)})), color='w', linewidth=4, zorder=3)

plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.01)})), color='k', linewidth=3, ls='dashed', zorder=5)
plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.5)})), color='r', linewidth=3, ls='dashed', zorder=4)
plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.99)})), color='w', linewidth=2, ls='dashed', zorder=3)

plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()