In [2]:
%matplotlib inline

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization


## Resampling Technique -- The Bootstrap¶

In [166]:
#bootstrapping on confidence interval on the difference in means:

iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1/10, alpha=.7)
X2.hist(bins = N2/10, alpha=.5)
#plt.show()

#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff

#sys.exit(1)

reDiffs = list()
for i in range(iters):
#draw a random resampling of #drawing from the hat:
reX1indices = [int(d) for d in np.random.uniform(0, N1-1, N1)]
reX2indices = [int(d) for d in np.random.uniform(0, N2-1, N2)]
#print reX1indices
#sys.exit(1)
reX1 = X1[reXindices] #resampled X1
reX2 = X2[reX2indices] #resampled X2
#reX1.hist(bins = N1/10, alpha=.3)
reDiff = reX1.mean() - reX2.mean()
#print "resampled diff", reDiff
reDiffs.append(reDiff)

plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
sortedReDiffs.hist(bins = iters/6)
print "histogram of bootstrapped diffs"
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])

original diff: 0.927254809004

histogram of bootstrapped diffs

The difference in means is 0.9273 with 95% CI: [0.8357, 1.0122]


## Prediction: From m < ~10 to m > ~100¶

(m is number of features)

In [153]:
#load the iris data:
print iris.describe()

iris_train = iris[:5]
iris_test = iris[5:]
y = iris_train['SepalLength'] #from now on, lowercase => vector
X = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
#X = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix

import statsmodels.api as sm
lr_result = sm.OLS(y, X).fit()
print lr_result.summary()
betas = lr_result.params
print "betas", betas

   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa
SepalLength  SepalWidth  PetalLength  PetalWidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.054000     3.758667    1.198667
std       0.828066    0.433594     1.764420    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.350000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000
OLS Regression Results
==============================================================================
Dep. Variable:            SepalLength   R-squared:                       0.469
Model:                            OLS   Adj. R-squared:                 -0.063
Method:                 Least Squares   F-statistic:                    0.8817
Date:                Thu, 31 Mar 2016   Prob (F-statistic):              0.531
Time:                        15:42:26   Log-Likelihood:                 2.9100
No. Observations:                   5   AIC:                            0.1800
Df Residuals:                       2   BIC:                           -0.9917
Df Model:                           2
Covariance Type:            nonrobust
===============================================================================
coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
SepalWidth      0.5361      0.417      1.286      0.327        -1.258     2.330
PetalLength    -0.2319      1.526     -0.152      0.893        -6.798     6.334
PetalWidth     17.1312     13.454      1.273      0.331       -40.758    75.021
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   1.374
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.601
Skew:                           0.254   Prob(JB):                        0.740
Kurtosis:                       1.379   Cond. No.                         506.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
betas SepalWidth      0.536122
PetalLength    -0.231939
PetalWidth     17.131179
dtype: float64

In [154]:
y_test = iris_test['SepalLength'] #from now on, lowercase => vector
X_test = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
#X_test = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix

y_hat_test = np.dot(X_test, betas)
error = np.mean((y_test - y_hat_test)**2)
print error

393.44278132