Analyses of NLSY data set

Ground work

In [1]:
import pandas as pd
import numpy as np
import sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn import preprocessing
import scipy.stats as stats
import corrstats # allows for comparison of 2 correlation coefficients
import a12 # provides Vargha and Delaney's A (non-parametric common language effect size)
import geometric
import seaborn as sns
sns.set_style("whitegrid")
In [2]:
def cohen_d(x, y):
    """Cohen's d effect size for two independent samples.

    d = (mean(x) - mean(y)) / s, where s is the square root of the
    mean of the two sample variances (ddof=1). Note this simple
    average-of-variances pooling implicitly assumes similar group sizes.

    Parameters
    ----------
    x, y : array-like of numbers

    Returns
    -------
    float
        Positive when mean(x) > mean(y).
    """
    # Bug fix: the original used math.sqrt, but `math` is never imported
    # in this notebook, so the first call raised NameError. np.sqrt is
    # equivalent for a scalar and numpy is already imported.
    pooled_sd = np.sqrt((np.std(x, ddof=1) ** 2. + np.std(y, ddof=1) ** 2.) / 2.)
    return (np.mean(x) - np.mean(y)) / pooled_sd
In [3]:
# Load data, excluding "non-response" data
# (the codes -5..-1 are read in as NaN via na_values)
# NOTE(review): hardcoded absolute Windows path -- not portable; consider a
# configurable DATA_DIR relative path.
path = 'C:\\Users\\cadlab\\Desktop\\xian\\research\\projects\\risk-delay\\'
df = pd.read_csv(path+'6-17-15.csv', na_values=[-5,-4,-3,-2,-1])
In [4]:
# Rename (some) columns to something sensible
# Map the raw NLSY question identifiers onto readable column names.
COLUMN_NAMES = {
    'T0961700': 'PATIENCE_MONTH',
    'T0962000': 'PATIENCE_YEAR',
    'T0960500': 'RISK_10K',
    'T0961100': 'RISK_1K',
    'R4395800': 'RISK_INCOME_MID_RISK',
    'R4395900': 'RISK_INCOME_MORE_RISK',
    'R4396000': 'RISK_INCOME_LESS_RISK',
    'T3094800': 'RISK',
    'T3094900': 'DRIVING',
    'T3094901': 'FINANCIAL',
    'T3094902': 'OCCUPATION',
    'T3094903': 'HEALTH',
    'T3094904': 'FAITH',
    'T3094905': 'ROMANCE',
    'T3094906': 'MAJOR_LIFE',
    'T3095000': 'BETS',
    'T0911100': 'ALCOHOL',
    'T0911400': 'BINGE',
    'T2074900': 'CIGARETTE',
    'R3914300': 'AGE_START_SMOKE',
    'R3914200': 'SMOKE_100_CIGS',
    'H0020800': 'EVER_DEPRESSED',
    'H0021000': 'DEPRESSED_IN_LAST_YEAR',
    'H0008600': 'WORRY_ANX_DEPRESS',
    'R6430800': 'COCAINE',
    'R6431200': 'CRACK',
    'R6432000': 'SEDATIVES',
    'R6432100': 'TRANQUILIZERS',
    'R6432200': 'STIMULANTS',
    'R6432300': 'PAINKILLERS',
    'R6432400': 'INHALANTS',
    'R6432500': 'HALLUCINOGENS',
    'R6432600': 'HEROIN',
    'R6432700': 'STEROIDS',
    'R6432800': 'INJECTIONS',
    'R6432900': 'MDMA',
    'R6433000': 'METH',
    'R0006500': 'SES',
    'T0988600': 'EDU',
    'T0989000': 'AGE',
    'T0990700': 'NJOBS',
    'T0988200': 'EMPLOYED',
    'T0910800': 'MEMORY',
    'T0899200': 'TRYING_LOSE_WEIGHT',
    'T0897300': 'WEIGHT',
    'T0897400': 'HEIGHT_FT',
    'T0897500': 'HEIGHT_IN',
    'T0897600': 'EXERCISE_VIG_FREQ',
    'T0897700': 'EXERCISE_VIG_UNIT',
    'T0898100': 'EXERCISE_MOD_FREQ',
    'T0898200': 'EXERCISE_MOD_UNIT',
    'T0898800': 'DOC_VISIT',
    'R0988000': 'AGE_FIRST_SEX',
    'T0899900': 'HEALTH_INSUR',
    'T2176700': 'CCDEBT',
    'T2181800': 'CCMISSPAY',
    'R8417600': 'CCMAX04',
    'T2181900': 'CCMAX08',
    'R6945200': 'RETIREMENT',
    'R8046500': 'INVEST_STOCK',
    'R8046501': 'INVEST_PRIV_BOND',
    'R8046502': 'INVEST_GOV_BOND',
    'T4100000': 'EMERGENCY_FUND',
    'T4100300': 'FIT_LIT_4',
    'T4100400': 'FIT_LIT_5',
    'T4100500': 'FIT_LIT_6',
    'T4100600': 'FIT_LIT_7',
    'T4100700': 'FIT_LIT_8',
    'T2463100': 'ENTREPRENEUR',
    'R0214800': 'SEX',
    'R0214700': 'RACE',
    'T0987900': 'POVERTY',
    'T0987800': 'INCOME',
    'R0618301': 'IQ',
    'R0305100': 'SHOPLIFT',
    'R0305200': 'STEAL_SMALL',
    'R0305300': 'STEAL_LARGE',
    'R0307800': 'CONVICT',
    'R0305000': 'FIGHT',
}
df.rename(columns=COLUMN_NAMES, inplace=True)

Questionable responses

In [5]:
# Frequency of 0 responses to various combinations of the risk items
for q in ['(RISK_10K > 0) & (RISK_1K > 0)',
          '(RISK_10K == 0) | (RISK_1K == 0)',
          '(RISK_10K == 0) & (RISK_1K == 0)',
          '(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_MONTH == 0)',
          '(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_YEAR == 0)']:
    print(len(df.query(q)))
5860
1189
688
263
249
In [6]:
# Joint response patterns for RISK_1K and PATIENCE_MONTH
for q in ['(RISK_1K >= 0) & (PATIENCE_MONTH >= 0)',
          '(RISK_1K == 0) | (PATIENCE_MONTH == 0)',
          '(RISK_1K == 0) & (PATIENCE_MONTH == 0)',
          '(RISK_1K > 0) & (PATIENCE_MONTH > 0)']:
    print(len(df.query(q)))
7006
1416
330
5648
In [7]:
# Joint response patterns for RISK_10K and PATIENCE_YEAR
for q in ['(RISK_10K == 0) | (PATIENCE_YEAR == 0)',
          '(RISK_10K > 0) & (PATIENCE_YEAR > 0)']:
    print(len(df.query(q)))
1336
5653
In [8]:
# Frequency of 0 responses to various combinations of the delay items
for q in ['(PATIENCE_YEAR > 0) & (PATIENCE_MONTH > 0)',
          '(PATIENCE_MONTH == 0)',
          '(PATIENCE_YEAR == 0)',
          '(PATIENCE_YEAR == 0) & (PATIENCE_MONTH == 0)']:
    print(len(df.query(q)))
6164
801
697
498
In [9]:
# All-zero responders vs. respondents with at least one nonzero answer
for q in ['(RISK_10K == 0) & (RISK_1K == 0) & (PATIENCE_YEAR == 0) & (PATIENCE_MONTH == 0)',
          '(PATIENCE_YEAR > 0) | (PATIENCE_MONTH > 0)',
          '(RISK_10K > 0) | (RISK_1K > 0)']:
    print(len(df.query(q)))
214
6774
6504
In [10]:
# Frequency of maximal responses to various combinations of the beh. econ. items
for q in ['(RISK_10K == 10000) & (RISK_1K == 1000)',
          '(RISK_10K == 10000) | (RISK_1K == 1000)',
          '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_MONTH == 0)',
          '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_YEAR == 0)',
          '(RISK_10K == 10000) & (RISK_1K == 1000) & (PATIENCE_MONTH == 0) & (PATIENCE_YEAR == 0)']:
    print(len(df.query(q)))
956
1699
88
70
46
In [11]:
# Correlation of raw responses within each item family
for left, right in [('PATIENCE_MONTH', 'PATIENCE_YEAR'),
                    ('RISK_1K', 'RISK_10K')]:
    print(df[left].corr(df[right]))
0.498386157731
0.674114469672

Data cleaning

In [12]:
data = df.copy()

# One consolidated pass of per-column recodes:
#  * zero responses on the beh. econ. items are treated as missing
#  * DOC_VISIT: recode 0 -> 6 so the scale is monotonic with recency of visit
#  * EMPLOYED: collapse to employed (1) / not (0); codes 3 and 4 -> missing
data.replace({'RISK_1K': {0: np.nan},
              'RISK_10K': {0: np.nan},
              'PATIENCE_MONTH': {0: np.nan},
              'PATIENCE_YEAR': {0: np.nan},
              'DOC_VISIT': {0: 6},
              'EMPLOYED': {2: 0, 3: np.nan, 4: np.nan}},
             inplace=True)

Transforming Variables

In [13]:
# Log transform the patience items
for col in ('PATIENCE_MONTH', 'PATIENCE_YEAR'):
    data[col] = np.log(data[col].values)

# Log transform income
# some respondents report income as $0, so add $1 before transforming
print(len(data.query('(INCOME == 0)')))
data['INCOME'] = np.log(data['INCOME'] + 1)
289

Composite Variables

In [14]:
# Construct a substance use variable from individual items:
# 1 if the respondent reported any use of the listed drugs, else 0.
# NOTE(review): SEDATIVES, TRANQUILIZERS, STEROIDS and INJECTIONS are renamed
# above but not folded in here -- confirm the omission is intentional.
drug_items = ['COCAINE', 'CRACK', 'STIMULANTS', 'PAINKILLERS', 'INHALANTS',
              'HALLUCINOGENS', 'HEROIN', 'MDMA', 'METH']
data['DRUG'] = (data[drug_items] > 0).any(axis=1).astype(float)

# Construct a maxed-out credit card variable (ever maxed out, 2004 item)
data['CCMAX'] = (data['CCMAX04'] > 0).astype(float)
In [15]:
# compute (log) BMI; 703.06958 converts lb/in^2 to kg/m^2
total_height_in = data['HEIGHT_FT'] * 12 + data['HEIGHT_IN']
data['BMI'] = np.log(703.06958 * data['WEIGHT'] / total_height_in ** 2)
p = plt.hist(data.dropna(subset = ['BMI'])['BMI'].values, 100)

Deal with Exercise

In [16]:
# Derive a (roughly) per-day vigorous-exercise frequency from the
# FREQ/UNIT item pair, then inspect its distribution.
# NOTE(review): `exerData[mask] = value` assigns to EVERY column of the
# matching rows, not just EXERCISE -- it clobbers all other variables in
# those rows of exerData. The intent was presumably
# `exerData.loc[mask, 'EXERCISE'] = ...`; confirm before reusing exerData
# for anything other than the EXERCISE column.
# NOTE(review): the masks condition on EXERCISE_VIG_FREQ == 2/3/4 but the
# right-hand sides index by EXERCISE_VIG_UNIT == 2/3/4 (presumably
# week/month/year codes) -- verify which column actually encodes the unit.
exerData = data.copy()
exerData['EXERCISE'] = exerData['EXERCISE_VIG_FREQ']
exerData[exerData['EXERCISE_VIG_FREQ'] == 5] = 0.
exerData[exerData['EXERCISE_VIG_FREQ'] == 2] = exerData[exerData['EXERCISE_VIG_UNIT'] == 2] / 7.
exerData[exerData['EXERCISE_VIG_FREQ'] == 3] = exerData[exerData['EXERCISE_VIG_UNIT'] == 3] / 30.
exerData[exerData['EXERCISE_VIG_FREQ'] == 4] = exerData[exerData['EXERCISE_VIG_UNIT'] == 4] / 365.
exerData['EXERCISE'] = exerData['EXERCISE'].astype(float)
labels = ['EXERCISE']
p = plt.hist(exerData.dropna(subset = [labels])[labels].values, 10, range=[0,10])
# counts of non-exercisers vs. exercisers
print(len(exerData[exerData['EXERCISE'] == 0]))
print(len(exerData[exerData['EXERCISE'] > 0]))
2993
2787
In [17]:
# convert exercise to a binary variable: any vigorous exercise (1) vs. none (0)
data['EXERCISE'] = (exerData['EXERCISE'] > 0).astype(float)
labels = ['EXERCISE']
p = plt.hist(data.dropna(subset = [labels])[labels].values, 2, range=[0,1])

Deal with Income Risk

In [18]:
# how many would accept/decline the initial job offer
for answer in (0, 1):
    print(len(data[data['RISK_INCOME_MID_RISK'] == answer]))
5193
3774
In [19]:
# how many would accept/decline the worse job offer
for answer in (0, 1):
    print(len(data[data['RISK_INCOME_MORE_RISK'] == answer]))
1504
2257
In [20]:
# how many would accept/decline the better job offer
for answer in (0, 1):
    print(len(data[data['RISK_INCOME_LESS_RISK'] == answer]))
4162
1035
In [21]:
# Bin Rs by how they responded to the income risk items (branching logic).
# Each respondent answered only one follow-up branch; assignment order is
# preserved from the original (LESS_RISK first, then MORE_RISK).
income_risk_bins = [('RISK_INCOME_LESS_RISK', 0, -1.5),
                    ('RISK_INCOME_LESS_RISK', 1, -.5),
                    ('RISK_INCOME_MORE_RISK', 0, .5),
                    ('RISK_INCOME_MORE_RISK', 1, 1.5)]
for item, answer, score in income_risk_bins:
    data.loc[data[item] == answer, 'RISK_INCOME'] = score
In [22]:
# Distribution of the binned income-risk variable (four bins, -1.5 .. 1.5)
labels = ['RISK_INCOME']
p = plt.hist(data.dropna(subset = [labels])[labels].values, 4, range=[-2,2])
In [23]:
# RISK_INCOME vs. each beh. econ. item (correlations are near zero)
for col in ('RISK_1K', 'RISK_10K', 'PATIENCE_MONTH', 'PATIENCE_YEAR'):
    print(data['RISK_INCOME'].corr(data[col]))
0.00614310914635
0.0055190069437
0.00241784387466
-0.00482971088635
In [24]:
# Mean of each beh. econ. item within each RISK_INCOME bin (point plots)
g = sns.PairGrid(data, y_vars=['RISK_1K', 'RISK_10K', 'PATIENCE_MONTH', 'PATIENCE_YEAR'], x_vars='RISK_INCOME')
g.map(sns.pointplot)
Out[24]:
<seaborn.axisgrid.PairGrid at 0xb5d5b00>

Inspect Variables

In [25]:
# Distribution of BINGE drinking responses (0-10 range shown)
labels = ['BINGE']
p = plt.hist(data.dropna(subset = [labels])[labels].values, 10, range=[0,10])
In [26]:
# Distribution of age at first sex
p = plt.hist(data.dropna(subset = ['AGE_FIRST_SEX'])['AGE_FIRST_SEX'].values, 25)
In [27]:
# Distribution of age started smoking (0 values are recoded to NaN in the next cell)
p = plt.hist(data.dropna(subset = ['AGE_START_SMOKE'])['AGE_START_SMOKE'].values, 35)
In [28]:
# Treat AGE_START_SMOKE == 0 as missing (0 is not a plausible starting age)
data.replace({'AGE_START_SMOKE': {0: np.nan}}, inplace=True)
In [26]:
# Side-by-side violin plots of the two raw risk items
# NOTE(review): this uses the pre-0.6 seaborn violinplot calling convention
# (see the deprecation warning in the output below); update for modern seaborn.
fig, axes = plt.subplots(nrows=1, ncols=2)
p = sns.violinplot(data.dropna(subset = [['RISK_1K']])[['RISK_1K']], ax=axes[0])
p.axes.set_ylim(0,1000)
p = sns.violinplot(data.dropna(subset = [['RISK_10K']])[['RISK_10K']], ax=axes[1])
p.axes.set_ylim(0,10000)
C:\Users\cadlab\AppData\Local\Enthought\Canopy\User\lib\site-packages\seaborn\categorical.py:2296: UserWarning: The violinplot API has been changed. Attempting to adjust your arguments for the new API (which might not work). Please update your code. See the version 0.6 release notes for more info.
  warnings.warn(msg, UserWarning)
Out[26]:
(0, 10000)
In [27]:
# Overlaid density estimates of the two (log-transformed) patience items
labels = ['PATIENCE_MONTH', 'PATIENCE_YEAR']
plt.figure()
for i in range(len(labels)):
    p = sns.distplot(data.dropna(subset = [[labels[i]]])[[labels[i]]], hist=False, label=labels[i])
plt.xlim(0, 16)
plt.legend()
Out[27]:
<matplotlib.legend.Legend at 0xe998198>
In [28]:
# RISK_1K as related to income risk: violins + Welch's t-test (equal_var=False)
# NOTE(review): this cell and the next three repeat the same
# violin + t-test pattern for different columns; a shared helper function
# would reduce copy-paste drift.
fig, axes = plt.subplots(nrows=1, ncols=2)
takers = data[data['RISK_INCOME'] > 0]      # accepted the riskier job offer
nontakers = data[data['RISK_INCOME'] < 0]   # declined it
print(np.mean(takers['RISK_1K']))
p = sns.violinplot(takers.dropna(subset=[['RISK_1K']])[['RISK_1K']], ax=axes[0])
p.axes.set_ylim(0,1000)
print(np.mean(nontakers['RISK_1K']))
p = sns.violinplot(nontakers.dropna(subset=[['RISK_1K']])[['RISK_1K']], ax=axes[1])
p.axes.set_ylim(0,1000)
print(stats.ttest_ind(takers.dropna(subset=[['RISK_1K']])['RISK_1K'], \
                      nontakers.dropna(subset=[['RISK_1K']])['RISK_1K'], equal_var=False))
615.327634195
613.467079347
Ttest_indResult(statistic=0.25463761126139706, pvalue=0.79901262051257471)
In [29]:
# RISK_10K as related to income risk (same pattern as the RISK_1K cell;
# takers/nontakers carry over from that cell)
fig, axes = plt.subplots(nrows=1, ncols=2)
print(np.mean(takers['RISK_10K']))
p = sns.violinplot(takers.dropna(subset=[['RISK_10K']])[['RISK_10K']], ax=axes[0])
p.axes.set_ylim(0,10000)
print(np.mean(nontakers['RISK_10K']))
p = sns.violinplot(nontakers.dropna(subset=[['RISK_10K']])[['RISK_10K']], ax=axes[1])
p.axes.set_ylim(0,10000)
print(stats.ttest_ind(takers.dropna(subset=[['RISK_10K']])['RISK_10K'], \
                      nontakers.dropna(subset=[['RISK_10K']])['RISK_10K'], equal_var=False))
5664.92109437
5594.80756014
Ttest_indResult(statistic=0.87488306439506813, pvalue=0.38167555108061479)
In [30]:
# PATIENCE_MONTH as related to income risk (same pattern as above;
# takers/nontakers carry over from the RISK_1K cell)
fig, axes = plt.subplots(nrows=1, ncols=2)
print(np.mean(takers['PATIENCE_MONTH']))
p = sns.violinplot(takers.dropna(subset=[['PATIENCE_MONTH']])[['PATIENCE_MONTH']], ax=axes[0])
p.axes.set_ylim(0,14)
print(np.mean(nontakers['PATIENCE_MONTH']))
p = sns.violinplot(nontakers.dropna(subset=[['PATIENCE_MONTH']])[['PATIENCE_MONTH']], ax=axes[1])
p.axes.set_ylim(0,14)
print(stats.ttest_ind(takers.dropna(subset=[['PATIENCE_MONTH']])['PATIENCE_MONTH'], \
                      nontakers.dropna(subset=[['PATIENCE_MONTH']])['PATIENCE_MONTH'], equal_var=False))
5.68267117848
5.66831413124
Ttest_indResult(statistic=0.3587772504100703, pvalue=0.71977516009263709)
In [31]:
# PATIENCE_YEAR as related to income risk (same pattern as above;
# takers/nontakers carry over from the RISK_1K cell)
fig, axes = plt.subplots(nrows=1, ncols=2)
print(np.mean(takers['PATIENCE_YEAR']))
p = sns.violinplot(takers.dropna(subset=[['PATIENCE_YEAR']])[['PATIENCE_YEAR']], ax=axes[0])
p.axes.set_ylim(0,14)
print(np.mean(nontakers['PATIENCE_YEAR']))
p = sns.violinplot(nontakers.dropna(subset=[['PATIENCE_YEAR']])[['PATIENCE_YEAR']], ax=axes[1])
p.axes.set_ylim(0,14)
print(stats.ttest_ind(takers.dropna(subset=[['PATIENCE_YEAR']])[['PATIENCE_YEAR']]['PATIENCE_YEAR'], \
                      nontakers.dropna(subset=[['PATIENCE_YEAR']])[['PATIENCE_YEAR']]['PATIENCE_YEAR'], equal_var=False))
6.81986367642
6.83009515285
Ttest_indResult(statistic=-0.26927627853950914, pvalue=0.78772702691942698)
In [32]:
# Distribution of the IQ variable (R0618301)
# NOTE(review): scale runs to 120000 per the ylim -- presumably a scaled raw
# score rather than a conventional IQ; confirm against the NLSY codebook.
labels = ['IQ']
p = sns.violinplot(data.dropna(subset = [labels])[labels])
p.axes.set_ylim(0,120000)
Out[32]:
(0, 120000)
In [33]:
# Distribution of the 2004 maxed-out-credit-card item (see codebook for coding)
labels = ['CCMAX04']
p = plt.hist(data.dropna(subset = [labels])[labels].values, 10, range=[0,10])
In [35]:
# Distribution of income (log-transformed in an earlier cell, hence the 0-15 range)
labels = ['INCOME']
p = sns.violinplot(data.dropna(subset = [labels])[labels])
p.axes.set_ylim(0,15)
Out[35]:
(0, 15)
In [36]:
# Violin plots of the nine self-reported risk-attitude items (0-10 per the ylim)
labels = ['RISK', 'DRIVING', 'FINANCIAL', 'OCCUPATION', 'HEALTH', 'FAITH', 'ROMANCE', 'MAJOR_LIFE', 'BETS']
plt.figure(figsize=(12, 5))
p = sns.violinplot(data.dropna(subset = [labels])[labels])
p.axes.set_ylim(0,10)
Out[36]:
(0, 10)
In [54]:
# correlation matrix heatmap over the main variables
# some of these are categorical; beware of interpretation in those cases
labels = ['RISK_1K', 'RISK_10K', 'RISK_INCOME', 'PATIENCE_MONTH', 'PATIENCE_YEAR', 'AGE', 'SEX', 'IQ',\
          'INCOME', 'EXERCISE', 'DOC_VISIT', 'HEALTH_INSUR', 'INVEST_STOCK', 'INVEST_PRIV_BOND', 'INVEST_GOV_BOND',\
          'CCMAX', 'DRUG', 'ALCOHOL', 'BINGE', 'CONVICT', 'CIGARETTE',\
          'RISK', 'DRIVING', 'FINANCIAL', 'OCCUPATION', 'HEALTH', 'FAITH', 'ROMANCE', 'MAJOR_LIFE', 'BETS']
justcorrs = data[labels]
# pairwise Pearson correlations (pandas drops NaNs pairwise)
justcorrs = justcorrs.corr()
fig, ax = plt.subplots()
fig.set_size_inches(12,8)
ax.pcolor(justcorrs, cmap='RdBu', edgecolors='k', vmin=-1., vmax=1.)
ax.invert_yaxis()
# NOTE(review): string 'off'/'on' values for tick_params are a deprecated
# matplotlib idiom; newer versions expect booleans.
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off',
    labeltop='on') # labels along the bottom edge are off
plt.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left='off',      # ticks along the left edge are off
    right='off',         # ticks along the right edge are off
    labelright='off',
    labelleft='on') # labels along the right edge are off
ytick = plt.yticks(np.arange(0.5, len(labels), 1), labels)
xtick = plt.xticks(np.arange(0.5, len(labels), 1), labels, rotation=90)
In [55]:
# where are the large correlations?
# Slam every correlation to -1 / 0 / +1 around a |r| = .1 threshold.
# The assignment order is safe: values set to +/-1 first fall outside the
# (-threshold, threshold) band checked last.
fig, ax = plt.subplots()
fig.set_size_inches(12,8)
threshold = .1
slammedcorrs = justcorrs.copy()
slammedcorrs[slammedcorrs[:] < -threshold] = -1
slammedcorrs[slammedcorrs[:] > threshold] = 1
slammedcorrs[(slammedcorrs[:] < threshold) & (slammedcorrs[:] > -threshold)] = 0
ax.pcolor(slammedcorrs, cmap='RdBu', edgecolors='k', vmin=-1., vmax=1.)
ax.invert_yaxis()
# NOTE(review): same deprecated 'off'/'on' tick_params idiom as the cell above.
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off',
    labeltop='on') # labels along the bottom edge are off
plt.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left='off',      # ticks along the left edge are off
    right='off',         # ticks along the right edge are off
    labelright='off',
    labelleft='on') # labels along the right edge are off
ytick = plt.yticks(np.arange(0.5, len(labels), 1), labels)
xtick = plt.xticks(np.arange(0.5, len(labels), 1), labels, rotation=90)

Risk-Delay Correlation

In [71]:
# $1,000 lottery ticket vs. waiting a month (r ~ .15 per the output below)
# NOTE(review): positional x/y args to lmplot are an old seaborn API.
print(data[['PATIENCE_MONTH', 'RISK_1K']].corr())
p = sns.lmplot('RISK_1K', 'PATIENCE_MONTH', data)
p.axes[0,0].set_xlim(0,1000)
p.axes[0,0].set_ylim(0,3)
                PATIENCE_MONTH   RISK_1K
PATIENCE_MONTH        1.000000  0.146902
RISK_1K               0.146902  1.000000
Out[71]:
(0, 3)
In [68]:
# Waiting a year vs. the $1,000 lottery ticket
print(data[['PATIENCE_YEAR', 'RISK_1K']].corr())
p = sns.lmplot('RISK_1K', 'PATIENCE_YEAR', data)
p.axes[0,0].set_xlim(0,1000)
p.axes[0,0].set_ylim(0,)
               PATIENCE_YEAR   RISK_1K
PATIENCE_YEAR       1.000000  0.156027
RISK_1K             0.156027  1.000000
Out[68]:
(0, 3.0)
In [69]:
# $10,000 lottery ticket vs. waiting a month
print(data[['PATIENCE_MONTH', 'RISK_10K']].corr())
p = sns.lmplot('RISK_10K', 'PATIENCE_MONTH', data)
p.axes[0,0].set_xlim(0,10000)
                PATIENCE_MONTH  RISK_10K
PATIENCE_MONTH        1.000000  0.106218
RISK_10K              0.106218  1.000000
Out[69]:
(0, 10000)
In [70]:
# Waiting a year and $10,000 lottery ticket
print(data[['PATIENCE_YEAR', 'RISK_10K']].corr())
p = sns.lmplot('RISK_10K', 'PATIENCE_YEAR', data)
p.axes[0,0].set_xlim(0,10000)
p.axes[0,0].set_ylim(0,)
               PATIENCE_YEAR  RISK_10K
PATIENCE_YEAR       1.000000  0.135245
RISK_10K            0.135245  1.000000
Out[70]:
(0, 3.0)

Self-Reported Risk

In [782]:
# Self-reported risk items available for this section:
#RISK
#DRIVING
#FINANCIAL
#OCCUPATION
#HEALTH
#FAITH
#ROMANCE
#MAJOR_LIFE
#BETS

# Eliminate subjects not reporting risk
# NOTE(review): the comment above doesn't match the code -- nothing is
# eliminated here; `r` just selects which self-report item the next
# cells analyze.
r = 'RISK'
In [783]:
# beh econ risk vs. self-reported risk (item selected by `r` above);
# jitter spreads the discrete responses so the point cloud is visible
print(data[r].corr(data['RISK_1K']))
p = sns.lmplot(r, 'RISK_1K', data, x_jitter=.3, y_jitter=50)
p.set(ylim=(0, 1000), xlim=(0, 10))
0.0151567157412
Out[783]:
<seaborn.axisgrid.FacetGrid at 0x363e28d0>