%matplotlib inline
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
# Bootstrapping a confidence interval on the difference in means.
# Both samples are resampled with replacement `iters` times; the 2.5th and
# 97.5th percentiles of the resampled differences form a 95% percentile CI.
iters = 1000  # bootstrap iterations
N1 = 500  # observations in sample 1
N2 = 100  # observations in sample 2

# pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
# The bin count must be an integer, so use floor division.
X1.hist(bins=N1 // 10, alpha=.7)
X2.hist(bins=N2 // 10, alpha=.5)

# compute original difference
origDiff = X1.mean() - X2.mean()
print("original diff: %s" % (origDiff,))

reDiffs = []
for i in range(iters):
    # "Drawing from the hat": pick positions with replacement. randint's
    # upper bound is exclusive, so every position 0..N-1 (including the
    # last one) is equally likely to be drawn.
    reX1indices = np.random.randint(0, N1, N1)
    reX2indices = np.random.randint(0, N2, N2)
    # .iloc selects by position, independent of the Series' index labels.
    reX1 = X1.iloc[reX1indices]  # resampled X1
    reX2 = X2.iloc[reX2indices]  # resampled X2
    reDiff = reX1.mean() - reX2.mean()
    reDiffs.append(reDiff)
# Display the X1 / X2 histograms drawn before the loop.
plt.show()

sortedReDiffs = pd.Series(sorted(reDiffs))
sortedReDiffs.hist(bins=iters // 6)
print("histogram of bootstrapped diffs")
# describe() labels the requested percentiles '2.5%' and '97.5%'.
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print("The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff, sRDdesc['2.5%'], sRDdesc['97.5%']))
# (m is the number of features)
# Ordinary least squares on the iris data: predict SepalLength from the
# other three measurements, then report mean squared error on held-out rows.
import statsmodels.api as sm

# Predictor columns shared by the training and test design matrices.
# (Alternative feature set to try: ['PetalWidth', 'SepalWidth'])
feature_cols = ['SepalWidth', 'PetalLength', 'PetalWidth']

# load the iris data:
iris = pd.read_csv('iris.csv')
print(iris.head())
print(iris.describe())

# The first 5 rows are used for fitting; every remaining row is the test set.
iris_train = iris.iloc[:5]
iris_test = iris.iloc[5:]

# Naming convention from here on: lowercase => vector, uppercase => matrix.
y = iris_train['SepalLength']
X = iris_train[feature_cols]

# NOTE(review): no constant column is added to X, so per the statsmodels
# docs this fits a model without an intercept -- confirm that is intended.
lr_result = sm.OLS(y, X).fit()
print(lr_result.summary())
betas = lr_result.params
print("betas %s" % (betas,))

y_test = iris_test['SepalLength']
X_test = iris_test[feature_cols]

# Predictions are the design matrix times the fitted coefficients.
y_hat_test = np.dot(X_test, betas)
# Mean squared error on the held-out rows.
error = np.mean((y_test - y_hat_test) ** 2)
print(error)