%matplotlib inline

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization

Resampling Technique -- The Bootstrap¶

#bootstrapping on confidence interval on the difference in means:

iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1/10, alpha=.7)
X2.hist(bins = N2/10, alpha=.5)
#plt.show()

#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff

#sys.exit(1)

reDiffs = list()
for i in range(iters):
    #draw a random resampling of #drawing from the hat: 
    reX1indices = [int(d) for d in np.random.uniform(0, N1-1, N1)]
    reX2indices = [int(d) for d in np.random.uniform(0, N2-1, N2)]
    #print reX1indices
    #sys.exit(1)
    reX1 = X1[reX1indices] #resampled X1
    reX2 = X2[reX2indices] #resampled X2
    #reX1.hist(bins = N1/10, alpha=.3)
    reDiff = reX1.mean() - reX2.mean()
    #print "resampled diff", reDiff
    reDiffs.append(reDiff)
    
plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
#print sortedReDiffs.head()
sortedReDiffs.hist(bins = iters/6)
print "histogram of bootstrapped diffs"
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])

original diff: 1.08572565467

histogram of bootstrapped diffs

The difference in means is 1.0857 with 95% CI: [0.9526, 1.2283]

Prediction: From m < ~10 to m > ~100¶

(m is number of features)

import statsmodels.api as sm#load the iris data: 
iris = pd.read_csv('iris.csv')
print iris.head()
print iris.describe()

#example: predicting Sepal Length from the other iris characteristics:
iris_train = iris[:130] #training data
iris_test = iris[130:150]  #testing data
model = sm.OLS(iris_train['SepalLength'], iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']]).fit()
y_hat = np.dot(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']], model.params) #model.params are betas...
#y true,  y predicted,   error
print np.array([iris_test['SepalLength'], y_hat, iris_test['SepalLength'] - y_hat]).T
print "mean squared error: %.3f" % np.mean((iris_test['SepalLength'] - y_hat)**2)

   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa
       SepalLength  SepalWidth  PetalLength  PetalWidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.054000     3.758667    1.198667
std       0.828066    0.433594     1.764420    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.350000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000
[[ 7.4         7.12396497  0.27603503]
 [ 7.9         8.41835817 -0.51835817]
 [ 6.4         6.27534882  0.12465118]
 [ 6.3         6.56320208 -0.26320208]
 [ 6.1         6.96383735 -0.86383735]
 [ 7.7         6.89006419  0.80993581]
 [ 6.3         6.7101159  -0.4101159 ]
 [ 6.4         6.959432   -0.559432  ]
 [ 6.          6.13834305 -0.13834305]
 [ 6.9         6.51695611  0.38304389]
 [ 6.7         6.37908542  0.32091458]
 [ 6.9         5.98505703  0.91494297]
 [ 5.8         5.99827082 -0.19827082]
 [ 6.8         6.90768105 -0.10768105]
 [ 6.7         6.58766053  0.11233947]
 [ 6.7         5.9762486   0.7237514 ]
 [ 6.3         5.67604876  0.62395124]
 [ 6.5         6.31718943  0.18281057]
 [ 6.2         6.62069271 -0.42069271]
 [ 5.9         6.44294824 -0.54294824]]
mean squared error: 0.245

#an example of overfitting by using too many features compared to training observations: 

#Let's setup a train and test (no development because we aren't setting any extra parameters)
iris_train = iris[:10] #training data
iris_test = iris[100:]  #testing data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_2feats = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix

#Train (fit) the models: 
lr_result_3f = sm.OLS(y_train, X_3feats).fit()  
lr_result_2f = sm.OLS(y_train, X_2feats).fit()  
betas_3f = lr_result_3f.params
print "\nbetas for 3 features:\n", betas_3f
betas_2f = lr_result_2f.params
print "\nbetas for 2 features:\n", betas_2f

#setup test
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_2f = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']

#apply model to test and check eror:
y_hat_test_3f = np.dot(X_test_3f, betas_3f)
y_hat_train_3f = np.dot(X_3feats, betas_3f)
y_hat_test_2f = np.dot(X_test_2f, betas_2f)
error_3f = np.mean((y_test - y_hat_test_3f)**2)
error_3ftrain = np.mean((y_train - y_hat_train_3f)**2)
error_2f = np.mean((y_test - y_hat_test_2f)**2)
print "\nMSerror when using 3 feats(test): %.2f" % error_3f
print "MSerror when using 3 feats(train): %.2f" % error_3ftrain
print "MSerror when using 2 feats: %.2f" % error_2f

betas for 3 features:
SepalWidth     0.979918
PetalLength    1.429045
PetalWidth    -2.086311
dtype: float64

betas for 2 features:
PetalWidth   -2.452010
SepalWidth    1.627737
dtype: float64

MSerror when using 3 feats(test): 0.32
MSerror when using 3 feats(train): 0.02
MSerror when using 2 feats: 45.76

#Regularization Comparison

from sklearn.linear_model import Ridge, Lasso

#RIDGE:
weights = []
mses = []
#alphas = ([.001, .01, .1, 1, 10]
alphas = [(2**i)/10000.0 for i in xrange(20)]
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    #print model.coef_
    #figure out accuracy:
    y_hat = model.predict(X_test_3f)
    print y_hat.std()
    mses.append(np.sum((y_test - y_hat)**2))
    
plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)#
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
                
plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()


#LASSO:
weights = []
mses = []
alphas = alphas[:13]
for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    y_hat = model.predict(X_test_3f)
    mses.append(np.sum((y_test - y_hat)**2))
    

plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()

plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()

0.605594198186
0.604279611313
0.601681540688
0.596606403731
0.586914401569
0.569179238929
0.539110689769
0.494067750435
0.437160049804
0.377088450755
0.320826539772
0.268341000979
0.215102507904
0.1593096816
0.106374043306
0.0642056018456
0.035861701378
0.0190530196901
0.00983519989978
0.00499871005247

#clustering:
X = iris[['PetalWidth','SepalWidth']]
X.plot(kind='scatter', x = 'PetalWidth', y ='SepalWidth')

<matplotlib.axes._subplots.AxesSubplot at 0xc6dd390>