import pandas as pd
import numpy as np
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn import preprocessing
import scipy.stats as stats
import corrstats # allows for comparison of 2 correlation coefficients
import a12 # provides Vargha and Delaney's A (non-parametric common language effect size)
import geometric
import seaborn as sns
# Use seaborn's white-grid theme for all figures produced below.
sns.set_style("whitegrid")
def cohen_d(x, y):
    """Cohen's d effect size for two independent samples.

    Uses the sample standard deviation (ddof=1) of each group and the
    root-mean of the two variances as the pooled denominator.
    Positive when mean(x) > mean(y).

    BUGFIX: the original called math.sqrt, but `math` is never imported
    in this file; np.sqrt is used instead (numpy is already imported).
    """
    return (np.mean(x) - np.mean(y)) / np.sqrt(
        (np.std(x, ddof=1) ** 2. + np.std(y, ddof=1) ** 2.) / 2.)
# Load data; the negative codes are survey "non-response" values, so map
# them straight to NaN on read.
path = 'C:\\Users\\cadlab\\Desktop\\xian\\research\\projects\\risk-delay\\'
df = pd.read_csv(path + '6-17-15.csv', na_values=[-5, -4, -3, -2, -1])
# Map the survey's opaque question codes onto readable column names.
column_map = {
    'T0961700': 'PATIENCE_MONTH',
    'T0962000': 'PATIENCE_YEAR',
    'T0960500': 'RISK_10K',
    'T0961100': 'RISK_1K',
    'R4395800': 'RISK_INCOME_MID_RISK',
    'R4395900': 'RISK_INCOME_MORE_RISK',
    'R4396000': 'RISK_INCOME_LESS_RISK',
    'T3094800': 'RISK',
    'T3094900': 'DRIVING',
    'T3094901': 'FINANCIAL',
    'T3094902': 'OCCUPATION',
    'T3094903': 'HEALTH',
    'T3094904': 'FAITH',
    'T3094905': 'ROMANCE',
    'T3094906': 'MAJOR_LIFE',
    'T3095000': 'BETS',
    'T0911100': 'ALCOHOL',
    'T0911400': 'BINGE',
    'T2074900': 'CIGARETTE',
    'R3914300': 'AGE_START_SMOKE',
    'R3914200': 'SMOKE_100_CIGS',
    'H0020800': 'EVER_DEPRESSED',
    'H0021000': 'DEPRESSED_IN_LAST_YEAR',
    'H0008600': 'WORRY_ANX_DEPRESS',
    'R6430800': 'COCAINE',
    'R6431200': 'CRACK',
    'R6432000': 'SEDATIVES',
    'R6432100': 'TRANQUILIZERS',
    'R6432200': 'STIMULANTS',
    'R6432300': 'PAINKILLERS',
    'R6432400': 'INHALANTS',
    'R6432500': 'HALLUCINOGENS',
    'R6432600': 'HEROIN',
    'R6432700': 'STEROIDS',
    'R6432800': 'INJECTIONS',
    'R6432900': 'MDMA',
    'R6433000': 'METH',
    'R0006500': 'SES',
    'T0988600': 'EDU',
    'T0989000': 'AGE',
    'T0990700': 'NJOBS',
    'T0988200': 'EMPLOYED',
    'T0910800': 'MEMORY',
    'T0899200': 'TRYING_LOSE_WEIGHT',
    'T0897300': 'WEIGHT',
    'T0897400': 'HEIGHT_FT',
    'T0897500': 'HEIGHT_IN',
    'T0897600': 'EXERCISE_VIG_FREQ',
    'T0897700': 'EXERCISE_VIG_UNIT',
    'T0898100': 'EXERCISE_MOD_FREQ',
    'T0898200': 'EXERCISE_MOD_UNIT',
    'T0898800': 'DOC_VISIT',
    'R0988000': 'AGE_FIRST_SEX',
    'T0899900': 'HEALTH_INSUR',
    'T2176700': 'CCDEBT',
    'T2181800': 'CCMISSPAY',
    'R8417600': 'CCMAX04',
    'T2181900': 'CCMAX08',
    'R6945200': 'RETIREMENT',
    'R8046500': 'INVEST_STOCK',
    'R8046501': 'INVEST_PRIV_BOND',
    'R8046502': 'INVEST_GOV_BOND',
    'T4100000': 'EMERGENCY_FUND',
    'T4100300': 'FIT_LIT_4',
    'T4100400': 'FIT_LIT_5',
    'T4100500': 'FIT_LIT_6',
    'T4100600': 'FIT_LIT_7',
    'T4100700': 'FIT_LIT_8',
    'T2463100': 'ENTREPRENEUR',
    'R0214800': 'SEX',
    'R0214700': 'RACE',
    'T0987900': 'POVERTY',
    'T0987800': 'INCOME',
    'R0618301': 'IQ',
    'R0305100': 'SHOPLIFT',
    'R0305200': 'STEAL_SMALL',
    'R0305300': 'STEAL_LARGE',
    'R0307800': 'CONVICT',
    'R0305000': 'FIGHT',
}
df.rename(columns=column_map, inplace=True)
# Frequency of 0 responses to various combinations of the risk items
risk_zero_queries = [
    '(RISK_10K > 0) & (RISK_1K > 0)',
    '(RISK_10K == 0) | (RISK_1K == 0)',
    '(RISK_10K == 0) & (RISK_1K == 0)',
    '(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_MONTH == 0)',
    '(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_YEAR == 0)',
    '(RISK_1K >= 0) & (PATIENCE_MONTH >= 0)',
    '(RISK_1K == 0) | (PATIENCE_MONTH == 0)',
    '(RISK_1K == 0) & (PATIENCE_MONTH == 0)',
    '(RISK_1K > 0) & (PATIENCE_MONTH > 0)',
    '(RISK_10K == 0) | (PATIENCE_YEAR == 0)',
    '(RISK_10K > 0) & (PATIENCE_YEAR > 0)',
]
# Frequency of 0 responses to various combinations of the delay items
delay_zero_queries = [
    '(PATIENCE_YEAR > 0) & (PATIENCE_MONTH > 0)',
    '(PATIENCE_MONTH == 0)',
    '(PATIENCE_YEAR == 0)',
    '(PATIENCE_YEAR == 0) & (PATIENCE_MONTH == 0)',
    '(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_YEAR == 0) & (PATIENCE_MONTH == 0)',
    '(PATIENCE_YEAR > 0) | (PATIENCE_MONTH > 0)',
    '(RISK_10K > 0) | (RISK_1K > 0)',
]
# Frequency of maximal responses to various combinations of the beh. econ. items
max_response_queries = [
    '(RISK_10K == 10000) & (RISK_1K == 1000)',
    '(RISK_10K == 10000) | (RISK_1K == 1000)',
    '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_MONTH == 0)',
    '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_YEAR == 0)',
    '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_MONTH == 0) & (PATIENCE_YEAR == 0)',
]
# Print one count per query, in the same order as the lists above.
for query in risk_zero_queries + delay_zero_queries + max_response_queries:
    print(len(df.query(query)))
# Correlation of raw responses
print(df['PATIENCE_MONTH'].corr(df['PATIENCE_YEAR']))
print(df['RISK_1K'].corr(df['RISK_10K']))
# Work on a copy so the raw responses in df stay untouched.
data = df.copy()
# A 0 on the beh. econ. items is treated as missing; one nested-dict call
# replaces the four per-column calls (equivalent result).
data.replace({'RISK_1K': {0: np.nan},
              'RISK_10K': {0: np.nan},
              'PATIENCE_MONTH': {0: np.nan},
              'PATIENCE_YEAR': {0: np.nan}}, inplace=True)
# Modify DOC_VISIT so that it's monotonic with recency of doctor visit
data.replace({'DOC_VISIT': {0: 6}}, inplace=True)
# Collapse EMPLOYED to a binary employed/not indicator; codes 3 and 4
# become missing.
data.replace({'EMPLOYED': {2: 0, 3: np.nan, 4: np.nan}}, inplace=True)
# Log transform the patience items
data['PATIENCE_MONTH'] = np.log(data['PATIENCE_MONTH'].values)
data['PATIENCE_YEAR'] = np.log(data['PATIENCE_YEAR'].values)
# Log transform income; some respondents report $0, so add $1 first.
print(len(data.query('(INCOME == 0)')))
data['INCOME'] = np.log(1 + data['INCOME'])
# Construct a binary hard-drug-use variable from the individual items.
# (SEDATIVES, TRANQUILIZERS, STEROIDS, and INJECTIONS are deliberately
# left out, matching the original selection.)  Any-column-positive is
# identical to the original chain of |'d comparisons: NaN > 0 is False,
# so all-missing respondents come out as 0.0 here too.
drug_items = ['COCAINE', 'CRACK', 'STIMULANTS', 'PAINKILLERS', 'INHALANTS',
              'HALLUCINOGENS', 'HEROIN', 'MDMA', 'METH']
data['DRUG'] = (data[drug_items] > 0).any(axis=1).astype(float)
# Binary maxed-out-credit-card indicator (2004 item).
data['CCMAX'] = (data['CCMAX04'] > 0).astype(float)
# BMI from weight (lb) and height (ft + in); 703.06958 converts the
# lb/in^2 ratio to kg/m^2.  Stored on the log scale.
height_in = data['HEIGHT_FT'] * 12 + data['HEIGHT_IN']
data['BMI'] = np.log(703.06958 * data['WEIGHT'] / height_in ** 2)
p = plt.hist(data.dropna(subset=['BMI'])['BMI'].values, 100)
# Convert self-reported vigorous-exercise frequency to a per-day rate,
# then collapse it to a binary exercises/doesn't indicator on `data`.
# Assumed coding: FREQ code 5 means "never"; UNIT codes 2/3/4 mean the
# frequency is per week / per month / per year -- TODO confirm against
# the survey codebook.
exerData = data.copy()
exerData['EXERCISE'] = exerData['EXERCISE_VIG_FREQ']
# BUGFIX: the original wrote `exerData[mask] = ...`, which assigns to
# every column of the masked rows (and its RHS was a whole row-sliced
# DataFrame); only the EXERCISE column should be set, via .loc.
exerData.loc[exerData['EXERCISE_VIG_FREQ'] == 5, 'EXERCISE'] = 0.
exerData.loc[exerData['EXERCISE_VIG_UNIT'] == 2, 'EXERCISE'] = exerData['EXERCISE_VIG_FREQ'] / 7.
exerData.loc[exerData['EXERCISE_VIG_UNIT'] == 3, 'EXERCISE'] = exerData['EXERCISE_VIG_FREQ'] / 30.
exerData.loc[exerData['EXERCISE_VIG_UNIT'] == 4, 'EXERCISE'] = exerData['EXERCISE_VIG_FREQ'] / 365.
exerData['EXERCISE'] = exerData['EXERCISE'].astype(float)
labels = ['EXERCISE']
# BUGFIX: dropna(subset=...) takes a flat list of column names; the
# original passed the list wrapped in another list ([labels]).
p = plt.hist(exerData.dropna(subset=labels)[labels].values, 10, range=[0, 10])
print(len(exerData[exerData['EXERCISE'] == 0]))
print(len(exerData[exerData['EXERCISE'] > 0]))
# convert exercise to a binary variable
data['EXERCISE'] = (exerData['EXERCISE'] > 0).astype(float)
labels = ['EXERCISE']
p = plt.hist(data.dropna(subset=labels)[labels].values, 2, range=[0, 1])
# How many would accept (1) / decline (0) the initial job offer
print(len(data[data['RISK_INCOME_MID_RISK'] == 0]))
print(len(data[data['RISK_INCOME_MID_RISK'] == 1]))
# ... the worse job offer
print(len(data[data['RISK_INCOME_MORE_RISK'] == 0]))
print(len(data[data['RISK_INCOME_MORE_RISK'] == 1]))
# ... the better job offer
print(len(data[data['RISK_INCOME_LESS_RISK'] == 0]))
print(len(data[data['RISK_INCOME_LESS_RISK'] == 1]))
# Bin Rs by how they responded to the income risk items (branching
# logic), from -1.5 (presumably least income-risk-tolerant) to 1.5
# (most) -- ordering inferred from the item names, TODO confirm.
data.loc[data['RISK_INCOME_LESS_RISK'] == 0, 'RISK_INCOME'] = -1.5
data.loc[data['RISK_INCOME_LESS_RISK'] == 1, 'RISK_INCOME'] = -.5
data.loc[data['RISK_INCOME_MORE_RISK'] == 0, 'RISK_INCOME'] = .5
data.loc[data['RISK_INCOME_MORE_RISK'] == 1, 'RISK_INCOME'] = 1.5
labels = ['RISK_INCOME']
# BUGFIX: dropna(subset=...) takes a flat list of column names; the
# original passed [labels], a nested list.
p = plt.hist(data.dropna(subset=labels)[labels].values, 4, range=[-2, 2])
# Correlate the income-risk bin with the other beh. econ. items
print(data['RISK_INCOME'].corr(data['RISK_1K']))
print(data['RISK_INCOME'].corr(data['RISK_10K']))
print(data['RISK_INCOME'].corr(data['PATIENCE_MONTH']))
print(data['RISK_INCOME'].corr(data['PATIENCE_YEAR']))
g = sns.PairGrid(data, y_vars=['RISK_1K', 'RISK_10K', 'PATIENCE_MONTH', 'PATIENCE_YEAR'], x_vars='RISK_INCOME')
g.map(sns.pointplot)
# Distribution of binge-drinking responses
labels = ['BINGE']
# BUGFIX: dropna(subset=...) takes a flat list of column names; the
# original wrapped the list again ([labels] / [['COL']]) throughout.
p = plt.hist(data.dropna(subset=labels)[labels].values, 10, range=[0, 10])
p = plt.hist(data.dropna(subset=['AGE_FIRST_SEX'])['AGE_FIRST_SEX'].values, 25)
p = plt.hist(data.dropna(subset=['AGE_START_SMOKE'])['AGE_START_SMOKE'].values, 35)
# 0 presumably codes never-smoked; treated as missing from here on.
data.replace({'AGE_START_SMOKE': {0: np.nan}}, inplace=True)
# Raw distributions of the two risk items, side by side
fig, axes = plt.subplots(nrows=1, ncols=2)
p = sns.violinplot(data.dropna(subset=['RISK_1K'])[['RISK_1K']], ax=axes[0])
p.axes.set_ylim(0, 1000)
p = sns.violinplot(data.dropna(subset=['RISK_10K'])[['RISK_10K']], ax=axes[1])
p.axes.set_ylim(0, 10000)
# Kernel-density overlays of the two (log-transformed) patience items
labels = ['PATIENCE_MONTH', 'PATIENCE_YEAR']
plt.figure()
for label in labels:
    p = sns.distplot(data.dropna(subset=[label])[[label]], hist=False, label=label)
plt.xlim(0, 16)
plt.legend()
# Compare each beh. econ. measure between respondents who would take the
# riskier income gamble (RISK_INCOME > 0) and those who would not
# (RISK_INCOME < 0): side-by-side violin plots, group means, and a
# Welch t-test.  The four near-identical sections of the original are
# factored into one helper.
takers = data[data['RISK_INCOME'] > 0]
nontakers = data[data['RISK_INCOME'] < 0]

def _compare_risk_groups(varname, ymax):
    """Plot taker/non-taker distributions of one column and print the
    group means plus an unequal-variance t-test."""
    fig, axes = plt.subplots(nrows=1, ncols=2)
    print(np.mean(takers[varname]))
    # BUGFIX: dropna(subset=...) takes a flat list of column names; the
    # original passed a nested list ([['COL']]).
    p = sns.violinplot(takers.dropna(subset=[varname])[[varname]], ax=axes[0])
    p.axes.set_ylim(0, ymax)
    print(np.mean(nontakers[varname]))
    p = sns.violinplot(nontakers.dropna(subset=[varname])[[varname]], ax=axes[1])
    p.axes.set_ylim(0, ymax)
    # Welch's t-test (does not assume equal group variances)
    print(stats.ttest_ind(takers.dropna(subset=[varname])[varname],
                          nontakers.dropna(subset=[varname])[varname],
                          equal_var=False))

for varname, ymax in [('RISK_1K', 1000), ('RISK_10K', 10000),
                      ('PATIENCE_MONTH', 14), ('PATIENCE_YEAR', 14)]:
    _compare_risk_groups(varname, ymax)
# Distributions of IQ, credit-card max-out, income, and the self-reported
# risk-attitude items.
# BUGFIX: dropna(subset=...) takes a flat list of column names; the
# original passed [labels], a nested list, in each call below.
labels = ['IQ']
p = sns.violinplot(data.dropna(subset=labels)[labels])
p.axes.set_ylim(0, 120000)
labels = ['CCMAX04']
p = plt.hist(data.dropna(subset=labels)[labels].values, 10, range=[0, 10])
labels = ['INCOME']
p = sns.violinplot(data.dropna(subset=labels)[labels])
p.axes.set_ylim(0, 15)
labels = ['RISK', 'DRIVING', 'FINANCIAL', 'OCCUPATION', 'HEALTH', 'FAITH', 'ROMANCE', 'MAJOR_LIFE', 'BETS']
plt.figure(figsize=(12, 5))
p = sns.violinplot(data.dropna(subset=labels)[labels])
p.axes.set_ylim(0, 10)
# correlation matrix
# some of the variables are categorical; beware of interpretation there
labels = ['RISK_1K', 'RISK_10K', 'RISK_INCOME', 'PATIENCE_MONTH', 'PATIENCE_YEAR', 'AGE', 'SEX', 'IQ',
          'INCOME', 'EXERCISE', 'DOC_VISIT', 'HEALTH_INSUR', 'INVEST_STOCK', 'INVEST_PRIV_BOND', 'INVEST_GOV_BOND',
          'CCMAX', 'DRUG', 'ALCOHOL', 'BINGE', 'CONVICT', 'CIGARETTE',
          'RISK', 'DRIVING', 'FINANCIAL', 'OCCUPATION', 'HEALTH', 'FAITH', 'ROMANCE', 'MAJOR_LIFE', 'BETS']
justcorrs = data[labels].corr()

def _plot_corr_matrix(corrs, tick_labels):
    """Heat-map of a correlation matrix, column labels on top, row labels
    on the left.  Shared by the raw and thresholded plots below."""
    fig, ax = plt.subplots()
    fig.set_size_inches(12, 8)
    ax.pcolor(corrs, cmap='RdBu', edgecolors='k', vmin=-1., vmax=1.)
    # pcolor draws row 0 at the bottom; flip so it reads top-down
    ax.invert_yaxis()
    # Hide the tick marks, keep labels on top (x) and left (y) only.
    # (String 'off'/'on' values match the matplotlib vintage this file
    # was written for; newer releases expect booleans.)
    plt.tick_params(axis='x', which='both', bottom='off', top='off',
                    labelbottom='off', labeltop='on')
    plt.tick_params(axis='y', which='both', left='off', right='off',
                    labelright='off', labelleft='on')
    # Center each label on its cell (cells are 1 unit wide).
    plt.yticks(np.arange(0.5, len(tick_labels), 1), tick_labels)
    plt.xticks(np.arange(0.5, len(tick_labels), 1), tick_labels, rotation=90)

_plot_corr_matrix(justcorrs, labels)
# where are the large correlations?  Slam everything above |threshold|
# to +/-1 and the rest to 0.
threshold = .1
slammedcorrs = justcorrs.copy()
slammedcorrs[slammedcorrs[:] < -threshold] = -1
slammedcorrs[slammedcorrs[:] > threshold] = 1
slammedcorrs[(slammedcorrs[:] < threshold) & (slammedcorrs[:] > -threshold)] = 0
_plot_corr_matrix(slammedcorrs, labels)
# $1,000 lottery ticket vs. waiting a month.
# FIX: pass x/y/data to lmplot as keywords -- the positional form is
# removed in newer seaborn and the keywords work on old releases too.
print(data[['PATIENCE_MONTH', 'RISK_1K']].corr())
p = sns.lmplot(x='RISK_1K', y='PATIENCE_MONTH', data=data)
p.axes[0, 0].set_xlim(0, 1000)
p.axes[0, 0].set_ylim(0, 3)
# Waiting a year
print(data[['PATIENCE_YEAR', 'RISK_1K']].corr())
p = sns.lmplot(x='RISK_1K', y='PATIENCE_YEAR', data=data)
p.axes[0, 0].set_xlim(0, 1000)
p.axes[0, 0].set_ylim(0,)
# $10,000 lottery ticket
print(data[['PATIENCE_MONTH', 'RISK_10K']].corr())
p = sns.lmplot(x='RISK_10K', y='PATIENCE_MONTH', data=data)
p.axes[0, 0].set_xlim(0, 10000)
# Waiting a year and $10,000 lottery ticket
print(data[['PATIENCE_YEAR', 'RISK_10K']].corr())
p = sns.lmplot(x='RISK_10K', y='PATIENCE_YEAR', data=data)
p.axes[0, 0].set_xlim(0, 10000)
p.axes[0, 0].set_ylim(0,)
# Self-reported risk items available for this comparison: RISK, DRIVING,
# FINANCIAL, OCCUPATION, HEALTH, FAITH, ROMANCE, MAJOR_LIFE, BETS.
# Subjects not reporting the chosen item are dropped implicitly by
# corr/lmplot (both ignore NaN).
r = 'RISK'
# beh econ risk vs. self-reported risk
print(data[r].corr(data['RISK_1K']))
# FIX: keyword x/y/data -- positional lmplot args are removed in newer
# seaborn; jitter only affects the display, not the fitted line.
p = sns.lmplot(x=r, y='RISK_1K', data=data, x_jitter=.3, y_jitter=50)
p.set(ylim=(0, 1000), xlim=(0, 10))