In [122]:
%matplotlib inline

#Python standard-library imports
import sys

#standard probability / statistics imports:
import numpy as np #arrays, matrices, and random sampling
import scipy.stats as ss #standard statistical operations
import pandas as pd #labeled data structures (Series, DataFrame)
import matplotlib
import matplotlib.pyplot as plt #plotting and visualization

Resampling Technique -- The Bootstrap

In [123]:
#bootstrapping a confidence interval for the difference in means:

iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1/10, alpha=.7)
X2.hist(bins = N2/10, alpha=.5)
#plt.show()

#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff

#sys.exit(1)

reDiffs = list()
for i in range(iters):
    #draw a random resample with replacement ("drawing from the hat"):
    reX1indices = np.random.randint(0, N1, N1)
    reX2indices = np.random.randint(0, N2, N2)
    #print reX1indices
    #sys.exit(1)
    reX1 = X1[reX1indices] #resampled X1
    reX2 = X2[reX2indices] #resampled X2
    #reX1.hist(bins = N1/10, alpha=.3)
    reDiff = reX1.mean() - reX2.mean()
    #print "resampled diff", reDiff
    reDiffs.append(reDiff)
    
plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
#print sortedReDiffs.head()
sortedReDiffs.hist(bins = iters/6)
print "histogram of bootstrapped diffs"
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])
original diff: 0.995722858479
histogram of bootstrapped diffs
The difference in means is 0.9957 with 95% CI: [0.8689, 1.1364]
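
As a quick cross-check, the same percentile interval can be read directly from the resampled differences with np.percentile. A minimal sketch, assuming reDiffs from the cell above is still in memory (the values may differ in the last digit from describe()'s quantiles because of interpolation):

lower, upper = np.percentile(reDiffs, [2.5, 97.5])
print "95%% CI from np.percentile: [%.4f, %.4f]" % (lower, upper)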

Prediction: From m < ~10 to m > ~100

(m is the number of features)

In [124]:
import statsmodels.api as sm

#load the iris data:
iris = pd.read_csv('iris.csv')
print iris.head()
print iris.describe()

#example: predicting Sepal Length from the other iris characteristics:
iris_train = iris[:130] #training data
iris_test = iris[130:150]  #testing data
model = sm.OLS(iris_train['SepalLength'], iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']]).fit()
y_hat = np.dot(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']], model.params) #model.params are betas...
#y true,  y predicted,   error
print np.array([iris_test['SepalLength'], y_hat, iris_test['SepalLength'] - y_hat]).T
print "mean squared error: %.3f" % np.mean((iris_test['SepalLength'] - y_hat)**2)
   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa
       SepalLength  SepalWidth  PetalLength  PetalWidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.054000     3.758667    1.198667
std       0.828066    0.433594     1.764420    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.350000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000
[[ 7.4         7.12396497  0.27603503]
 [ 7.9         8.41835817 -0.51835817]
 [ 6.4         6.27534882  0.12465118]
 [ 6.3         6.56320208 -0.26320208]
 [ 6.1         6.96383735 -0.86383735]
 [ 7.7         6.89006419  0.80993581]
 [ 6.3         6.7101159  -0.4101159 ]
 [ 6.4         6.959432   -0.559432  ]
 [ 6.          6.13834305 -0.13834305]
 [ 6.9         6.51695611  0.38304389]
 [ 6.7         6.37908542  0.32091458]
 [ 6.9         5.98505703  0.91494297]
 [ 5.8         5.99827082 -0.19827082]
 [ 6.8         6.90768105 -0.10768105]
 [ 6.7         6.58766053  0.11233947]
 [ 6.7         5.9762486   0.7237514 ]
 [ 6.3         5.67604876  0.62395124]
 [ 6.5         6.31718943  0.18281057]
 [ 6.2         6.62069271 -0.42069271]
 [ 5.9         6.44294824 -0.54294824]]
mean squared error: 0.245
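
One caveat with the model above: sm.OLS does not add an intercept unless you include a constant column yourself. A minimal sketch of the same fit with an intercept via sm.add_constant, reusing the iris_train/iris_test split from the cell above:

feats = ['SepalWidth', 'PetalLength', 'PetalWidth']
Xc_train = sm.add_constant(iris_train[feats]) #adds a 'const' column of 1s
Xc_test = sm.add_constant(iris_test[feats])
model_c = sm.OLS(iris_train['SepalLength'], Xc_train).fit()
y_hat_c = model_c.predict(Xc_test)
print "mean squared error with intercept: %.3f" % np.mean((iris_test['SepalLength'] - y_hat_c)**2)
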
In [125]:
#an example of overfitting by using too many features compared to training observations: 

#Let's setup a train and test (no development because we aren't setting any extra parameters)
iris_train = iris[:10] #training data
iris_test = iris[100:]  #testing data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_2feats = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix

#Train (fit) the models: 
lr_result_3f = sm.OLS(y_train, X_3feats).fit()  
lr_result_2f = sm.OLS(y_train, X_2feats).fit()  
betas_3f = lr_result_3f.params
print "\nbetas for 3 features:\n", betas_3f
betas_2f = lr_result_2f.params
print "\nbetas for 2 features:\n", betas_2f

#setup test
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_2f = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']

#apply the models to the test set and check error:
y_hat_test_3f = np.dot(X_test_3f, betas_3f)
y_hat_train_3f = np.dot(X_3feats, betas_3f)
y_hat_test_2f = np.dot(X_test_2f, betas_2f)
error_3f = np.mean((y_test - y_hat_test_3f)**2)
error_3ftrain = np.mean((y_train - y_hat_train_3f)**2)
error_2f = np.mean((y_test - y_hat_test_2f)**2)
print "\nMSerror when using 3 feats(test): %.2f" % error_3f
print "MSerror when using 3 feats(train): %.2f" % error_3ftrain
print "MSerror when using 2 feats: %.2f" % error_2f
betas for 3 features:
SepalWidth     0.979918
PetalLength    1.429045
PetalWidth    -2.086311
dtype: float64

betas for 2 features:
PetalWidth   -2.452010
SepalWidth    1.627737
dtype: float64

MSerror when using 3 feats(test): 0.32
MSerror when using 3 feats(train): 0.02
MSerror when using 2 feats: 45.76
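
The gap between train and test error above is the signature of overfitting: 3 free parameters fit to only 10 observations. A minimal sketch of the same 3-feature fit with 100 training rows instead of 10; the exact numbers depend on the split, but train and test error should sit much closer together:

iris_train_big = iris[:100]
X_big = iris_train_big[['SepalWidth', 'PetalLength', 'PetalWidth']]
betas_big = sm.OLS(iris_train_big['SepalLength'], X_big).fit().params
print "MSerror, 100 training rows (test): %.2f" % np.mean((y_test - np.dot(X_test_3f, betas_big))**2)
print "MSerror, 100 training rows (train): %.2f" % np.mean((iris_train_big['SepalLength'] - np.dot(X_big, betas_big))**2)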

Regularization

In [126]:
#Regularization Comparison

from sklearn.linear_model import Ridge, Lasso

#RIDGE:
weights = []
mses = []
#alphas = [.001, .01, .1, 1, 10]
alphas = [(2**i)/10000.0 for i in xrange(20)]
print "standard deviation of y_hats as alpha increases"
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    #print model.coef_
    #evaluate prediction error on the test set:
    y_hat = model.predict(X_test_3f)
    print y_hat.std()
    mses.append(np.mean((y_test - y_hat)**2)) #mean squared error
    
plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
                
plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()


#LASSO:
weights = []
mses = []
alphas = alphas[:13]
for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2)) #mean squared error
    

plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()

plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
standard deviation of y_hats as alpha increases
0.605594198186
0.604279611313
0.601681540688
0.596606403731
0.586914401569
0.569179238929
0.539110689769
0.494067750435
0.437160049804
0.377088450755
0.320826539772
0.268341000979
0.215102507904
0.1593096816
0.106374043306
0.0642056018456
0.035861701378
0.0190530196901
0.00983519989978
0.00499871005247
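
Rather than eyeballing these curves, alpha is usually chosen by cross-validation. A minimal sketch with sklearn's RidgeCV, reusing X_3feats, y_train, and the alphas list from above:

from sklearn.linear_model import RidgeCV
cv_model = RidgeCV(alphas=alphas)
cv_model.fit(X_3feats, y_train)
print "alpha selected by cross-validation:", cv_model.alpha_
print "coefficients at that alpha:", cv_model.coef_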

Classification

In [127]:
#re-setup data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']

#turn y into a classification problem:
split_point = y_train.mean()
y_train = [1 if i else 0 for i in y_train > split_point]
y_test = [1 if i else 0 for i in y_test > split_point]
print y_train

plt.scatter(X_3feats['SepalWidth'], y_train)
plt.xlabel('SepalWidth')
plt.ylabel('Is Long?')
[1, 1, 0, 0, 1, 1, 0, 1, 0, 1]
Out[127]:
<matplotlib.text.Text at 0xcf90b00>
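
A straight line is an awkward fit to a 0/1 target; logistic regression instead passes the linear combination through the logistic (sigmoid) function, so predictions stay between 0 and 1. A quick sketch of that curve:

z = np.linspace(-6, 6, 100)
plt.plot(z, 1.0 / (1.0 + np.exp(-z)))
plt.xlabel('z = X dot beta')
plt.ylabel('P(y = 1)')
plt.show()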

Regularized Classification (Logistic Regression)

In [128]:
from sklearn.linear_model import LogisticRegression

#L2:
weights = []
mses = []
#alphas = np.array([.1, 1, 10, 100, 1000, 10000, 100000])
alphas = [.1*3**i for i in xrange(14)]
Cs = alphas[::-1] #reverse
for c in Cs:
    model = LogisticRegression(penalty='l2', C=c)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print model.coef_
    #evaluate test error (with 0/1 labels, the mean squared error is the misclassification rate):
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
                
# plt.plot([i for i in xrange(len(Cs))], mses)
#plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
# plt.show()


#L1
weights = []
mses = []
Cs = Cs[3:-2]
for c in Cs:
    model = LogisticRegression(penalty='l1', C=c)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print model.coef_
    #evaluate test error (with 0/1 labels, the mean squared error is the misclassification rate):
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
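
For a classifier, plain accuracy is a more natural test metric than squared error. A minimal sketch using the last model fit in the L1 loop above (LogisticRegression.score returns mean accuracy on the given data):

print "test accuracy of the last L1 model: %.3f" % model.score(X_test_3f, y_test)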

Clustering

In [129]:
#setup data:
X = iris[['PetalWidth','SepalWidth']] #'SepalLength', 'PetalLength'
X.plot(kind='scatter', x = 'PetalWidth', y ='SepalWidth', alpha=.3)
plt.show()

#run kmeans using sklearn:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2)
model.fit(X) #run KMeans
clusters = model.predict(X)
print "Clusters assigned", clusters

#plot the results
plt.scatter(X['PetalWidth'], X['SepalWidth'], c = clusters, alpha=.4) #color-coded points
centers = model.cluster_centers_
#mark the centers
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=5, color='teal', alpha=.6)
Clusters assigned [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
Out[129]:
<matplotlib.collections.PathCollection at 0xd0769b0>
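
KMeans needs the number of clusters up front. One common heuristic is to plot the within-cluster sum of squares (the fitted model's inertia_) for several values of k and look for an "elbow". A minimal sketch reusing X from above:

inertias = []
ks = range(1, 8)
for k in ks:
    km = KMeans(n_clusters=k)
    km.fit(X)
    inertias.append(km.inertia_)
plt.plot(ks, inertias)
plt.xlabel('number of clusters k')
plt.ylabel('within-cluster sum of squares')
plt.show()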

PCA

In [130]:
#setup data (could use same as above but changing it up for variety)
X = iris[['SepalLength', 'PetalWidth']]
#X = iris[['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
X = (X - X.mean()) #mean center the data
X.plot(kind='scatter', x ='SepalLength', y = 'PetalWidth', alpha=.4)
#sys.exit(1)

#Run PCA, using sklearn:
from sklearn.decomposition import PCA
model = PCA(n_components = 2)
model.fit(X)

#let's look at the components (directions)
V = model.components_ #components = directions
print "v martrix:\n", V
plt.scatter([V[0][0]], [V[0][1]], c = [9])
#sys.exit(1)

#draw each direction:
for c in V:
    slope = c[1]/c[0]
    xpoints = np.linspace(X['SepalLength'].min(), X['SepalLength'].max(), 10)
    UDc = [xp*slope for xp in xpoints] #the line through the origin along this principal direction
    plt.plot(xpoints, UDc)
#plt.ylim([-3,3])
#plt.xlim([-3,3])
plt.show()
#sys.exit(1)

#let's apply the transform based on v:
X_lowdim = model.transform(X)
X_lowdim2 = np.dot(X,V.T) #same as above
print V
print X.head()
print X_lowdim[:5]
print X_lowdim2[:5]
Xld = pd.Series(X_lowdim[:,0])
Xld.hist()
plt.scatter(Xld, [-1]*len(Xld), alpha = 0.3)
plt.show()
#sys.exit(1)

#explained variance
print "percentage variance explained:" , model.explained_variance_ratio_ 
var = X_lowdim.std(axis=0)**2
print "variance per component ", var
print "our percentage variance explained", [vi / sum(var) for vi in var]
v matrix:
[[ 0.7414199   0.67104138]
 [-0.67104138  0.7414199 ]]
[[ 0.7414199   0.67104138]
 [-0.67104138  0.7414199 ]]
   SepalLength  PetalWidth
0    -0.743333   -0.998667
1    -0.943333   -0.998667
2    -1.143333   -0.998667
3    -1.243333   -0.998667
4    -0.843333   -0.998667
[[-1.22126878 -0.24162391]
 [-1.36955276 -0.10741563]
 [-1.51783674  0.02679264]
 [-1.59197873  0.09389678]
 [-1.29541077 -0.17451977]]
[[-1.22126878 -0.24162391]
 [-1.36955276 -0.10741563]
 [-1.51783674  0.02679264]
 [-1.59197873  0.09389678]
 [-1.29541077 -0.17451977]]
percentage variance explained: [ 0.90964722  0.09035278]
variance per component  [ 1.14584055  0.11381323]
our percentage variance explained [0.90964721614072486, 0.090352783859275143]
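
To see what a 1-dimensional projection actually throws away, we can keep only the first component and map it back to the original space with inverse_transform; the residual corresponds to the variance the first component does not capture (about 9% of the total above). A minimal sketch:

model1 = PCA(n_components=1)
X_1d = model1.fit_transform(X)
X_back = model1.inverse_transform(X_1d) #back to (SepalLength, PetalWidth) space
print "reconstruction MSE from 1 component: %.4f" % np.mean((np.asarray(X) - X_back)**2)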

Naive Bayes

In [135]:
#setup class names to be flower names
print iris[:3]
names = list(set(iris['Name'].tolist()))
nameToIndex = dict([(names[i],i) for i in range(len(names))])
print nameToIndex

#setup X and y:
iris = iris.iloc[np.random.permutation(len(iris))]
iris_train = iris[:100] #training data
iris_test = iris[100:]  #testing data
Xtrain = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtrain = iris_train[['SepalWidth']]
Xtest = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtest = iris_test[['SepalWidth']]
ytrain = np.array([nameToIndex[n] for n in iris_train['Name'].tolist()])
ytest = np.array([nameToIndex[n] for n in iris_test['Name'].tolist()])
Xtrain[['SepalWidth']].hist()
plt.show()

#setup
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
print "Class priors: ", clf.class_prior_
print "\nClass means: \n", clf.theta_
print "\nClass variance: \n", clf.sigma_
    SepalLength  SepalWidth  PetalLength  PetalWidth             Name
34          4.9         3.1          1.5         0.1      Iris-setosa
61          5.9         3.0          4.2         1.5  Iris-versicolor
95          5.7         3.0          4.2         1.2  Iris-versicolor
{'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}
Class priors:  [ 0.34  0.32  0.34]

Class means: 
[[ 3.04117647  5.61470588  2.07058824  6.67352941]
 [ 3.421875    1.4625      0.225       4.990625  ]
 [ 2.78823529  4.29705882  1.33823529  5.92941176]]

Class variance: 
[[ 0.10536332  0.24831315  0.06207613  0.30488755]
 [ 0.11608399  0.03671875  0.0075      0.10959961]
 [ 0.09633218  0.23263841  0.04236159  0.26972319]]
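
Under the hood, GaussianNB scores a point by multiplying the class prior by one normal density per feature, using the per-class means and variances printed above. A minimal sketch of that computation with scipy's norm.pdf (these are unnormalized scores, so they will not sum to 1):

x = Xtest.iloc[6].values #same example used in the next cell
for c in range(len(clf.classes_)):
    score = clf.class_prior_[c] * np.prod(ss.norm.pdf(x, clf.theta_[c], np.sqrt(clf.sigma_[c])))
    print "class %d (%s): unnormalized score %g" % (clf.classes_[c], names[clf.classes_[c]], score)
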
In [137]:
#let's just look at one test example:
oneExample = Xtest.iloc[6]
print oneExample
print "\npredicted class: ", clf.predict([oneExample])
print "\nprobabilities for each class:", ["%.6f"%p for p in clf.predict_proba([oneExample])[0]]

#check accuracy for all predictions
y_hat = clf.predict(Xtest)
accuracy = np.mean(y_hat == ytest) #fraction of correct predictions
print "accuracy: %.2f" % accuracy
SepalWidth     2.8
PetalLength    4.8
PetalWidth     1.8
SepalLength    6.2
Name: 126, dtype: float64

predicted class:  [0]

probabilities for each class: ['0.573733', '0.000000', '0.426267']
accuracy: 0.94
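
Accuracy alone hides which species get confused with which. A minimal sketch with sklearn's confusion_matrix (rows are true classes, columns are predicted classes, with label order 0, 1, 2 as in nameToIndex above):

from sklearn.metrics import confusion_matrix
print confusion_matrix(ytest, y_hat)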