import sys
import numpy as np
import pandas as pd
import scipy.optimize
import scipy.stats
import time
from itertools import combinations
from joblib import Parallel, delayed
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# utility functions
def rand(a, sd):
    return a + np.random.normal(0, sd, a.shape)

def sse(params, data):
    # sum of squared errors; an alternative objective to the Poisson likelihood used below
    return np.sum(np.power(model_mean(params, data) - data['y'], 2))
# model definition
# note the new specification: the switchpoint is now a linear function of z1, clipped to [0, 1]
def switch(params, data):
    intercept = params[3]
    slope = params[4]
    switchpoint = intercept + (slope * data['z1'])
    switchpoint = np.clip(switchpoint, 0, 1)
    return switchpoint
def model_mean(params, data):
    # params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
    intercept = params[0]
    slope = params[1]
    slopeDiff = params[2]
    switchPoint = switch(params, data)
    isPost = data['x1'] > switchPoint
    mu = intercept + (slope * data['x1']) + (isPost * slopeDiff * (data['x1'] - switchPoint))
    return np.exp(mu)  # log link keeps the Poisson mean positive
def model_gen(params, data):
    return scipy.stats.poisson.rvs(model_mean(params, data))

def model_like(params, data):
    # negative log-likelihood; logpmf avoids the underflow that np.log(pmf) hits for extreme counts
    return -np.sum(scipy.stats.poisson.logpmf(data['y'], model_mean(params, data)))
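To make the new specification concrete, here is a quick spot check of model_mean on either side of a fixed switchpoint (a minimal sketch; the parameter values are arbitrary, chosen only for illustration).
# hypothetical spot check: with switchSlope = 0 the switchpoint sits at switchIntercept = 0.5
checkParams = [1.0, 0.5, 2.0, 0.5, 0.0]
checkDF = pd.DataFrame({'x1': [0.25, 0.75], 'z1': [0.0, 0.0]})
# pre-switch mean: exp(1 + 0.5*0.25); post-switch mean: exp(1 + 0.5*0.75 + 2*(0.75 - 0.5))
print(model_mean(checkParams, checkDF).values)
print(np.exp([1 + 0.5*0.25, 1 + 0.5*0.75 + 2.0*(0.75 - 0.5)]))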
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params = [3.5, -0.1, -.25, .4, .5]
zs = np.linspace(0, 1, 5, endpoint=True)
plt.figure(figsize=(10,8))
plt.gca().set_facecolor('xkcd:gray')
plt.gca().set_prop_cycle('color',plt.cm.hot(np.linspace(0,1,len(zs))))
for z in zs:
    x1 = pd.Series(np.linspace(0, 1, 100, endpoint=True), name='x1')
    z1 = pd.Series(z * np.ones(len(x1)), name='z1')
    df = pd.concat([x1, z1], axis=1)
    plt.plot(df['x1'], model_mean(params, df), label='z='+str(z), linewidth=4)
plt.xlabel('x1', fontsize=18)
plt.ylabel('y', fontsize=18)
plt.gca().legend(prop={'size': 16}).get_frame().set_facecolor('xkcd:gray')
plt.show()
Let's generate some synthetic data and see if we can recover the parameter values.
# generate some synthetic data
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params = [3.5, -1.5, 1.8, .5, 0]
nDataPoints = 1000
df = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints),
                        'z1': np.random.random(size=nDataPoints)})
df['y'] = model_gen(params, df)
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)
plt.scatter(df['x1'], df['y'], s=20, c=df['z1'], alpha=0.85, cmap='hot')
xSorted = df['x1'].sort_values()
zMedian = np.full(len(xSorted), df['z1'].quantile(q=.5))
plt.plot(xSorted, model_mean(params, pd.DataFrame(data={'x1': xSorted, 'z1': zMedian})), color='k', linewidth=4)
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
# optimize
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
# paramsInit is only needed by the (commented-out) basinhopping run below
paramsInit = [0, -1, -1, 0, 0]
# slice grid left over from a brute-force search; unused by differential_evolution
ranges = (slice(-100, 100, 30), slice(-10, 10, 3), slice(-10, 10, 3), slice(-100, 100, 10), slice(-10, 10, 1))
i = (-5, 5)
p = (0, 5)
n = (-5, 0)
# note: later fits tighten the switchIntercept bound to [0, 1], since switch() clips to that range anyway
bounds = [p, n, p, i, i]
start = time.time()
# hand the sse function to an optimization routine
#result = scipy.optimize.basinhopping(model_like, paramsInit, minimizer_kwargs={'args':df})
#result = scipy.optimize.differential_evolution(model_like, bounds, args=(df,), popsize=100)
result = scipy.optimize.differential_evolution(model_like, bounds, args=(df,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
print(f'Mean fitted switchpoint:\t{np.mean(switch(result.x, df)):.3f}')
print(result.message)
print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (np.array(params) - result.x)))
# note: the % difference is undefined where the true value is 0 (switchSlope here)
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(np.array(params) - result.x)/np.array(params)))
print(f'Log-likelihood:\t\t\t-{result.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params, df):.2f}')
# plot simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Fit to Broken Line Data', fontsize=20)
plt.scatter(df['x1'], df['y'], s=20, c=df['z1'], alpha=0.85, cmap='hot')
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
# plot model fit
xs = np.linspace(df['x1'].min(), df['x1'].max(), len(df['x1']), endpoint=False)
zMedian = np.full(len(xs), df['z1'].quantile(q=.5))
plt.plot(xs, model_mean(result.x, pd.DataFrame(data={'x1': xs, 'z1': zMedian})), color='r', linewidth=4)
plt.plot(df['x1'].sort_values(), model_mean(params, pd.DataFrame(data={'x1': df['x1'].sort_values(), 'z1': zMedian})), color='k', linewidth=4)
plt.show()
Use a single set of parameter values to generate a large number of data sets, then use the broken line model to estimate parameters for each synthetic data set. Quantify the mean and median of these estimates, as well as measures of their variability.
# parallel bootstrap (note: joblib defaults to process-based workers, not threads)
bootParams = [3.5, -1.5, 1.8, .75, 0]
nDataPoints = 1000
def longFunc():
    # generate a replicate data set
    replicateDF = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints),
                                     'z1': np.random.random(size=nDataPoints)})
    replicateDF['y'] = model_gen(bootParams, replicateDF)
    # optimize
    i = (-5, 5)
    p = (0, 5)
    n = (-5, 0)
    bounds = [p, n, p, [0, 1], i]
    result = scipy.optimize.differential_evolution(model_like, bounds, args=(replicateDF,),
                                                   popsize=100, mutation=(.7, 1), recombination=.5)
    return [result.x, np.mean(switch(result.x, replicateDF)), result.fun, model_like(bootParams, replicateDF)]
numReplicates = 100
bootOut = Parallel(n_jobs=3)(delayed(longFunc)() for i in tqdm(range(numReplicates)))
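One caveat: with process-based workers and the global numpy RNG, the replicate draws above are not reproducible run-to-run. A minimal sketch of a seeded variant, assuming a scipy recent enough that rvs and differential_evolution accept a numpy Generator:
# hypothetical reproducible variant: pass a per-replicate seed into the worker
def longFuncSeeded(seed):
    rng = np.random.default_rng(seed)
    replicateDF = pd.DataFrame(data={'x1': rng.random(nDataPoints),
                                     'z1': rng.random(nDataPoints)})
    replicateDF['y'] = scipy.stats.poisson.rvs(model_mean(bootParams, replicateDF), random_state=rng)
    bounds = [(0, 5), (-5, 0), (0, 5), (0, 1), (-5, 5)]
    result = scipy.optimize.differential_evolution(model_like, bounds, args=(replicateDF,),
                                                   popsize=100, mutation=(.7, 1), recombination=.5, seed=rng)
    return [result.x, np.mean(switch(result.x, replicateDF)), result.fun, model_like(bootParams, replicateDF)]
#bootOut = Parallel(n_jobs=3)(delayed(longFuncSeeded)(s) for s in tqdm(range(numReplicates)))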
# unpack output of the simulations
parameter_estimates = np.array([i[0] for i in bootOut], dtype=np.float32)
switchpoint_estimates = np.array([i[1] for i in bootOut], dtype=np.float32)
lnLs = np.array([i[2] for i in bootOut], dtype=np.float32)
groundTruthlnL = np.array([i[3] for i in bootOut], dtype=np.float32)
# let's see what the batch of parameter estimates looks like
print('Params:\t\t\t'+', '.join('{:.3f}'.format(f) for f in bootParams))
print('Means:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.mean(parameter_estimates, axis=0)))
print('Medians:\t\t'+', '.join('{:.3f}'.format(f) for f in np.median(parameter_estimates, axis=0)))
print('SDs:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.std(parameter_estimates, axis=0)))
print('MSE:\t\t\t'+', '.join('{:.3f}'.format(f) for f in np.mean(np.power(np.array(bootParams) - parameter_estimates, 2), axis=0)))
print('')
print('Switch Point')
print(f'mean:\t\t\t{np.mean(switchpoint_estimates):.3f}')
print(f'median:\t\t\t{np.median(switchpoint_estimates):.3f}')
print(f'SD:\t\t\t{np.std(switchpoint_estimates):.3f}')
print(f'MSE:\t\t\t{np.mean(np.power(np.array(switchpoint_estimates)-np.array(bootParams[3]), 2), axis=0):.3f}')
print('')
print(f'mean lnL:\t\t{-np.mean(lnLs, axis=0):.2f}')
print(f'mean truelnL:\t\t{-np.mean(groundTruthlnL, axis=0):.2f}')
print(f'mean lnL - truelnL:\t{np.mean(lnLs - groundTruthlnL, axis=0):.2f}')
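Percentile intervals across the replicates are another natural summary of variability; a minimal sketch using the estimates above:
# 95% percentile intervals of the bootstrap estimates, one pair per parameter
ciLow, ciHigh = np.percentile(parameter_estimates, [2.5, 97.5], axis=0)
for name, lo, hi in zip(['preIntercept', 'preSlope', 'slopeDiff', 'switchIntercept', 'switchSlope'], ciLow, ciHigh):
    print(f'{name}:\t[{lo:.3f}, {hi:.3f}]')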
# let's plot it all to see better
# let's first get the (static) switchpoints and pack them in with the preIntercept, preSlope, and slopeDiff params
paramsForPlotting = np.hstack([np.array(parameter_estimates)[:,0:3], switchpoint_estimates.reshape(-1,1)])
estimatesDF = pd.DataFrame(paramsForPlotting, index=None, columns = ['preIntercept', 'preSlope', 'slopeDiff', 'switchPoint'], dtype=np.float32)
sns.pairplot(estimatesDF)
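If per-parameter marginals are easier to read than the full pairplot, seaborn's histplot (the successor to the deprecated distplot) works; a minimal sketch:
# marginal distribution of each bootstrap estimate
for i, col in enumerate(estimatesDF.columns):
    plt.figure(i)
    sns.histplot(estimatesDF[col], kde=True)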
Let's simulate some data in which z is actually relevant to the switchpoint.
# generate some more synthetic data
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params2 = [3.5, -1.5, 1.8, .33, .33]
nDataPoints = 1000
df2 = pd.DataFrame(data={'x1': np.random.random(size=nDataPoints),
                         'z1': np.random.random(size=nDataPoints)})
df2['y'] = model_gen(params2, df2)
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)
plt.scatter(df2['x1'], df2['y'], s=20, c=df2['z1'], alpha=0.85, cmap='hot')
xSorted2 = df2['x1'].sort_values()
plt.plot(xSorted2, model_mean(params2, pd.DataFrame(data={'x1': xSorted2, 'z1': np.full(len(xSorted2), df2['z1'].min())})), color='k', linewidth=4)
plt.plot(xSorted2, model_mean(params2, pd.DataFrame(data={'x1': xSorted2, 'z1': np.full(len(xSorted2), df2['z1'].quantile(q=.5))})), color='r', linewidth=4)
plt.plot(xSorted2, model_mean(params2, pd.DataFrame(data={'x1': xSorted2, 'z1': np.full(len(xSorted2), df2['z1'].max())})), color='w', linewidth=4)
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
# optimize
i = (-5,5)
p = (0,5)
n = (-5,0)
bounds = [p,n,p,[0,1],i]
start = time.time()
result2 = scipy.optimize.differential_evolution(model_like, bounds, args=(df2,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
print(result2.message)
print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params2))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result2.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (np.array(params2) - result2.x)))
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(np.array(params2) - result2.x)/np.array(params2)))
print(f'Log-likelihood:\t\t\t-{result2.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params2, df2):.2f}')
# generate some synthetic data
# params = [preIntercept, preSlope, slopeDiff, switchIntercept, switchSlope]
params3 = [3.5, -1.5, 1.8, .33, .33]
nDataPoints = 1000
mean = [1, 1]
cov = [[.15, .1], [.1, .15]]  # off-diagonal terms induce an x1-z1 correlation of about 0.67
x1, z1 = np.random.multivariate_normal(mean, cov, nDataPoints).T
print(f'x1-z1 correlation:\t{scipy.stats.pearsonr(x1, z1)[0]:.2f}')
df3 = pd.DataFrame(data={'x1': x1,
                         'z1': z1})
df3['y'] = model_gen(params3, df3)
# plot the simulated data
plt.figure(figsize=(10,8))
plt.title('Broken Line Data', fontsize=20)
plt.scatter(df3['x1'], df3['y'], s=20, c=df3['z1'], alpha=0.85, cmap='hot', zorder=10)
xSorted3 = df3['x1'].sort_values()
plt.plot(xSorted3, model_mean(params3, pd.DataFrame(data={'x1': xSorted3, 'z1': np.full(len(xSorted3), df3['z1'].quantile(q=.05))})), color='k', linewidth=4, zorder=5)
plt.plot(xSorted3, model_mean(params3, pd.DataFrame(data={'x1': xSorted3, 'z1': np.full(len(xSorted3), df3['z1'].quantile(q=.5))})), color='r', linewidth=4, zorder=4)
plt.plot(xSorted3, model_mean(params3, pd.DataFrame(data={'x1': xSorted3, 'z1': np.full(len(xSorted3), df3['z1'].quantile(q=.95))})), color='w', linewidth=4, zorder=3)
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()
# optimize
i = (-5,5)
p = (0,5)
n = (-5,0)
bounds = [p,n,p,[0,1],i]
start = time.time()
result3 = scipy.optimize.differential_evolution(model_like, bounds, args=(df3,), popsize=100, mutation=(.7,1), recombination=.5)
end = time.time()
print(f'Optimization took: {end - start:.2f}s')
print(result3.message)
print('True parameter values:\t\t'+', '.join('{:.2f}'.format(f) for f in params3))
print('Estimates:\t\t\t'+', '.join('{:.2f}'.format(f) for f in result3.x))
print('Differences:\t\t\t'+', '.join('{:.2f}'.format(f) for f in (np.array(params3) - result3.x)))
print('% Differences:\t\t\t'+', '.join('{:.1f}%'.format(f) for f in 100*(np.array(params3) - result3.x)/np.array(params3)))
print(f'Log-likelihood:\t\t\t-{result3.fun:.2f}')
print(f'Ground Truth Log-likelihood:\t-{model_like(params3, df3):.2f}')
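Since the real question is whether z1 drives the switchpoint, a likelihood-ratio comparison against a constant-switchpoint null (switchSlope fixed at 0) is a natural follow-up. A minimal sketch, assuming standard chi-squared asymptotics apply; model_like_null is a helper introduced here for illustration:
# null model: the same broken line, but the switchpoint ignores z1 (switchSlope = 0)
def model_like_null(p4, data):
    return model_like(np.append(p4, 0.0), data)
nullBounds = [(0, 5), (-5, 0), (0, 5), (0, 1)]
nullResult = scipy.optimize.differential_evolution(model_like_null, nullBounds, args=(df3,),
                                                   popsize=100, mutation=(.7, 1), recombination=.5)
# model_like returns a negative log-likelihood, so 2*(lnL_full - lnL_null) = 2*(nullResult.fun - result3.fun)
lrStat = 2 * (nullResult.fun - result3.fun)
print(f'LR statistic: {lrStat:.2f}, p = {scipy.stats.chi2.sf(lrStat, df=1):.4f}')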
# plot the simulated data with the true (solid) and fitted (dashed) mean functions
plt.figure(figsize=(10,8))
plt.title('Broken Line Fit to Broken Line Data', fontsize=20)
plt.scatter(df3['x1'], df3['y'], s=20, c=df3['z1'], alpha=0.85, cmap='hot', zorder=10)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.01)})), color='k', linewidth=4, zorder=5)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.5)})), color='r', linewidth=4, zorder=4)
plt.plot(df3['x1'].sort_values(), model_mean(params3, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.99)})), color='w', linewidth=4, zorder=3)
plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.01)})), color='k', linewidth=3, ls='dashed', zorder=5)
plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.5)})), color='r', linewidth=3, ls='dashed', zorder=4)
plt.plot(df3['x1'].sort_values(), model_mean(result3.x, pd.DataFrame(data={'x1': df3['x1'].sort_values(), 'z1': np.ones(len(df3['x1']))*df3['z1'].quantile(q=.99)})), color='w', linewidth=2, ls='dashed', zorder=3)
plt.xlabel('x1', fontsize=16)
plt.ylabel('y', fontsize=16)
plt.gca().set_facecolor('xkcd:gray')
plt.show()