In [1]:
%matplotlib inline

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized; works well with labeled, tabular data
import matplotlib
import matplotlib.pyplot as plt #plot visualization

Resampling Technique -- The Bootstrap

In [2]:
#bootstrapping a confidence interval on the difference in means:

iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1/10, alpha=.7)
X2.hist(bins = N2/10, alpha=.5)
#plt.show()

#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff

#sys.exit(1)

reDiffs = list()
for i in range(iters):
    #draw a random resampling with replacement (drawing from the hat):
    reX1indices = [int(d) for d in np.random.uniform(0, N1-1, N1)]
    reX2indices = [int(d) for d in np.random.uniform(0, N2-1, N2)]
    #print reX1indices
    #sys.exit(1)
    reX1 = X1[reX1indices] #resampled X1
    reX2 = X2[reX2indices] #resampled X2
    #reX1.hist(bins = N1/10, alpha=.3)
    reDiff = reX1.mean() - reX2.mean()
    #print "resampled diff", reDiff
    reDiffs.append(reDiff)
    
plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
#print sortedReDiffs.head()
sortedReDiffs.hist(bins = iters/6)
print "histogram of bootstrapped diffs"
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])
original diff: 1.02191769844
histogram of bootstrapped diffs
The difference in means is 1.0219 with 95% CI: [0.8945, 1.1574]
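
One detail worth flagging in the resampling above: drawing indices with np.random.uniform and truncating to int can never actually select the last index (N-1). np.random.choice (NumPy 1.7+) draws with-replacement index samples directly; a minimal equivalent sketch, reusing X1, X2, N1, N2, and iters from above:

#sketch: the same resampling loop, but drawing indices with np.random.choice
reDiffs_alt = []
for i in range(iters):
    reX1 = X1[np.random.choice(N1, size=N1, replace=True)] #N1 indices drawn with replacement
    reX2 = X2[np.random.choice(N2, size=N2, replace=True)]
    reDiffs_alt.append(reX1.mean() - reX2.mean())
#the 2.5th and 97.5th percentiles of reDiffs_alt give the same style of 95% CI as above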

Prediction: From m < ~10 to m > ~100

(m is the number of features)

In [3]:
import statsmodels.api as sm

#load the iris data:
iris = pd.read_csv('iris.csv')
print iris.head()
print iris.describe()

#example: predicting Sepal Length from the other iris characteristics:
iris_train = iris[:130] #training data
iris_test = iris[130:150]  #testing data
model = sm.OLS(iris_train['SepalLength'], iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']]).fit()
y_hat = np.dot(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']], model.params) #model.params are betas...
#y true,  y predicted,   error
print np.array([iris_test['SepalLength'], y_hat, iris_test['SepalLength'] - y_hat]).T
print "mean squared error: %.3f" % np.mean((iris_test['SepalLength'] - y_hat)**2)
   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa
       SepalLength  SepalWidth  PetalLength  PetalWidth
count   150.000000  150.000000   150.000000  150.000000
mean      5.843333    3.054000     3.758667    1.198667
std       0.828066    0.433594     1.764420    0.763161
min       4.300000    2.000000     1.000000    0.100000
25%       5.100000    2.800000     1.600000    0.300000
50%       5.800000    3.000000     4.350000    1.300000
75%       6.400000    3.300000     5.100000    1.800000
max       7.900000    4.400000     6.900000    2.500000
[[ 7.4         7.12396497  0.27603503]
 [ 7.9         8.41835817 -0.51835817]
 [ 6.4         6.27534882  0.12465118]
 [ 6.3         6.56320208 -0.26320208]
 [ 6.1         6.96383735 -0.86383735]
 [ 7.7         6.89006419  0.80993581]
 [ 6.3         6.7101159  -0.4101159 ]
 [ 6.4         6.959432   -0.559432  ]
 [ 6.          6.13834305 -0.13834305]
 [ 6.9         6.51695611  0.38304389]
 [ 6.7         6.37908542  0.32091458]
 [ 6.9         5.98505703  0.91494297]
 [ 5.8         5.99827082 -0.19827082]
 [ 6.8         6.90768105 -0.10768105]
 [ 6.7         6.58766053  0.11233947]
 [ 6.7         5.9762486   0.7237514 ]
 [ 6.3         5.67604876  0.62395124]
 [ 6.5         6.31718943  0.18281057]
 [ 6.2         6.62069271 -0.42069271]
 [ 5.9         6.44294824 -0.54294824]]
mean squared error: 0.245
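
For reference, the fitted results object can produce these predictions directly; the explicit np.dot above just makes the linear algebra visible. A quick check, reusing model and iris_test from above:

#sketch: statsmodels can compute y_hat for us; should match the np.dot version above
y_hat_alt = model.predict(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']])
print np.allclose(y_hat, y_hat_alt) #expected to print True
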
In [4]:
#an example of overfitting: too many features relative to the number of training observations

#Let's set up a train and test split (no development set because we aren't tuning any extra parameters)
iris_train = iris[:10] #training data
iris_test = iris[100:]  #testing data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_2feats = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix

#Train (fit) the models: 
lr_result_3f = sm.OLS(y_train, X_3feats).fit()  
lr_result_2f = sm.OLS(y_train, X_2feats).fit()  
betas_3f = lr_result_3f.params
print "\nbetas for 3 features:\n", betas_3f
betas_2f = lr_result_2f.params
print "\nbetas for 2 features:\n", betas_2f

#setup test
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_2f = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']

#apply the models to the test data and check error:
y_hat_test_3f = np.dot(X_test_3f, betas_3f)
y_hat_train_3f = np.dot(X_3feats, betas_3f)
y_hat_test_2f = np.dot(X_test_2f, betas_2f)
error_3f = np.mean((y_test - y_hat_test_3f)**2)
error_3ftrain = np.mean((y_train - y_hat_train_3f)**2)
error_2f = np.mean((y_test - y_hat_test_2f)**2)
print "\nMSerror when using 3 feats(test): %.2f" % error_3f
print "MSerror when using 3 feats(train): %.2f" % error_3ftrain
print "MSerror when using 2 feats: %.2f" % error_2f
betas for 3 features:
SepalWidth     0.979918
PetalLength    1.429045
PetalWidth    -2.086311
dtype: float64

betas for 2 features:
PetalWidth   -2.452010
SepalWidth    1.627737
dtype: float64

MSerror when using 3 feats(test): 0.32
MSerror when using 3 feats(train): 0.02
MSerror when using 2 feats: 45.76
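
A side note on these fits: statsmodels' OLS does not add an intercept unless you include one yourself, so every model above is forced through the origin. A minimal sketch of refitting the 2-feature model with an intercept (sm.add_constant prepends a column of ones), reusing the variables above:

#sketch: the same 2-feature model, with an intercept column added
X_2f_c = sm.add_constant(X_2feats)       #train matrix gains a 'const' column of ones
X_test_2f_c = sm.add_constant(X_test_2f) #same for the test matrix
betas_2f_c = sm.OLS(y_train, X_2f_c).fit().params
y_hat_2f_c = np.dot(X_test_2f_c, betas_2f_c)
print "MSerror when using 2 feats + intercept: %.2f" % np.mean((y_test - y_hat_2f_c)**2)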

Regularization

In [5]:
#Regularization Comparison

from sklearn.linear_model import Ridge, Lasso

#RIDGE:
weights = []
mses = []
#alphas = [.001, .01, .1, 1, 10]
alphas = [(2**i)/10000.0 for i in xrange(20)]
print "standard deviation of y_hats as alpha increases"
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    #print model.coef_
    #figure out accuracy:
    y_hat = model.predict(X_test_3f)
    print y_hat.std()
    mses.append(np.sum((y_test - y_hat)**2))
    
plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)#
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
                
plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()


#LASSO:
weights = []
mses = []
alphas = alphas[:13]
for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    y_hat = model.predict(X_test_3f)
    mses.append(np.sum((y_test - y_hat)**2))
    

plt.plot([[i]*len(weights[0]) for i in xrange(len(alphas))], weights)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()

plt.plot([i for i in xrange(len(alphas))], mses)
plt.xticks([i for i in xrange(len(alphas))], alphas, rotation='vertical')
plt.show()
standard deviation of y_hats as alpha increases
0.605594198186
0.604279611313
0.601681540688
0.596606403731
0.586914401569
0.569179238929
0.539110689769
0.494067750435
0.437160049804
0.377088450755
0.320826539772
0.268341000979
0.215102507904
0.1593096816
0.106374043306
0.0642056018456
0.035861701378
0.0190530196901
0.00983519989978
0.00499871005247
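
For intuition about what Ridge is doing: with the L2 penalty, the coefficients have a closed-form solution, beta = (X'X + alpha*I)^-1 X'y. A minimal sketch checking that against sklearn, reusing X_3feats and y_train from above (sklearn's Ridge fits an intercept by default, so fit_intercept=False is set here for a direct comparison):

#sketch: closed-form ridge coefficients vs. sklearn's
alpha = 1.0
XtX = np.dot(X_3feats.T, X_3feats)
Xty = np.dot(X_3feats.T, y_train)
beta_closed = np.linalg.solve(XtX + alpha*np.eye(3), Xty)
beta_sklearn = Ridge(alpha=alpha, fit_intercept=False).fit(X_3feats, y_train).coef_
print beta_closed   #should agree with beta_sklearn up to numerical precision
print beta_sklearn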

Classification

In [6]:
#re-setup data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']

#turn y into a classification problem:
split_point = y_train.mean()
y_train = [1 if i else 0 for i in y_train > split_point]
y_test = [1 if i else 0 for i in y_test > split_point]
print y_train

plt.scatter(X_3feats[['SepalWidth']], y_train) 
plt.xlabel('SepalWidth')
plt.ylabel('Is Long?')
[1, 1, 0, 0, 1, 1, 0, 1, 0, 1]
Out[6]:
<matplotlib.text.Text at 0x9207748>

Regularized Classification (Logistic Regression)

In [7]:
from sklearn.linear_model import LogisticRegression

#L2:
weights = []
mses = []
#alphas = np.array([.1, 1, 10, 100, 1000, 10000, 100000])
alphas = [.1*3**i for i in xrange(14)]
Cs = alphas[::-1] #reverse
for c in Cs:
    model = LogisticRegression(penalty='l2', C=c)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print model.coef_
    #figure out accuracy:
    y_hat = model.predict(X_test_3f)
    mses.append(np.sum((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)#
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
                
# plt.plot([i for i in xrange(len(Cs))], mses)
#plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
# plt.show()


#L1
weights = []
mses = []
Cs = Cs[3:-2]
for c in Cs:
    model = LogisticRegression(penalty='l1', C=c)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print model.coef_
    #figure out accuracy:
    y_hat = model.predict(X_test_3f)
    mses.append(np.sum((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in xrange(len(Cs))], weights)#
plt.xticks([i for i in xrange(len(Cs))], Cs, rotation='vertical')
plt.show()
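
A note on the C parameter: in sklearn's LogisticRegression, C is the inverse of the regularization strength (roughly C ~ 1/alpha), which is why the list is reversed above so the plots read from weak to strong regularization, matching the ridge/lasso plots. Under the hood the model is just a sigmoid of a linear score; a minimal sketch using the last model fit in the L1 loop above:

#sketch: predicted probabilities by hand, using the coefficients of the last fitted model
z = np.dot(X_test_3f, model.coef_[0]) + model.intercept_[0] #linear score
p_manual = 1.0/(1.0 + np.exp(-z))                           #sigmoid
print np.allclose(p_manual, model.predict_proba(X_test_3f)[:, 1]) #expected to print True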

Clustering

In [8]:
#setup data:
X = iris[['PetalWidth','SepalWidth']] #'SepalLength', 'PetalLength'
X.plot(kind='scatter', x = 'PetalWidth', y ='SepalWidth', alpha=.3)
plt.show()

#run kmeans using sklearn:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2)
model.fit(X) #run KMeans
clusters = model.predict(X)
print "Clusters assigned", clusters

#plot the results
plt.scatter(X['PetalWidth'], X['SepalWidth'], c = clusters, alpha=.4) #color-coded points
centers = model.cluster_centers_
#mark the centers
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=5, color='teal', alpha=.6)
Clusters assigned [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]
Out[8]:
<matplotlib.collections.PathCollection at 0xc6a7588>
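
For a sense of what KMeans' fit is doing internally, here is a minimal sketch of a single Lloyd iteration (assign each point to its nearest center, then move each center to the mean of its assigned points), reusing X, centers, and clusters from above; since the model has already converged, the assignments and centers should barely move:

#sketch: one k-means iteration by hand
pts = X.values
dists = np.array([[np.sum((p - c)**2) for c in centers] for p in pts]) #squared distance to each center
assign = dists.argmin(axis=1)                                          #nearest center per point
new_centers = np.array([pts[assign == k].mean(axis=0) for k in range(len(centers))])
print (assign == clusters).mean() #fraction matching model.predict; expected to be 1.0
print new_centers                 #expected to sit (essentially) on model.cluster_centers_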

PCA

In [9]:
#setup data (could use same as above but changing it up for variety)
X = iris[['SepalLength', 'PetalWidth']]
#X = iris[['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
X = (X - X.mean()) #mean center the data
X.plot(kind='scatter', x ='SepalLength', y = 'PetalWidth', alpha=.4)
#sys.exit(1)

#Run PCA, using sklearn:
from sklearn.decomposition import PCA
model = PCA(n_components = 2)
model.fit(X)

#let's look at the components (directions)
V = model.components_ #components = directions
print "v martrix:\n", V
plt.scatter([V[0][0]], [V[0][1]], c = [9])
#sys.exit(1)

#draw each direction:
for c in V:
    slope = c[1]/c[0]    
    xpoints = np.linspace(float(X['SepalLength'].min()), float(X['SepalLength'].max()), 10)
    UDc = [xp*slope for xp in xpoints] #the principal component scores for the given direction
    plt.plot(xpoints, UDc)
#plt.ylim([-3,3])
#plt.xlim([-3,3])
plt.show()
#sys.exit(1)

#let's apply the transform based on v:
X_lowdim = model.transform(X)
X_lowdim2 = np.dot(X,V.T) #same as above
print V
print X.head()
print X_lowdim[:5]
print X_lowdim2[:5]
Xld = pd.Series(X_lowdim[:,0])
Xld.hist()
plt.scatter(Xld, [-1]*len(Xld), alpha = 0.3)
plt.show()
#sys.exit(1)

#explained variance
print "percentage variance explained:" , model.explained_variance_ratio_ 
var = X_lowdim.std(axis=0)**2
print "variance per component ", var
print "our percentage variance explained", [vi / sum(var) for vi in var]
V matrix:
[[ 0.7414199   0.67104138]
 [-0.67104138  0.7414199 ]]
[[ 0.7414199   0.67104138]
 [-0.67104138  0.7414199 ]]
   SepalLength  PetalWidth
0    -0.743333   -0.998667
1    -0.943333   -0.998667
2    -1.143333   -0.998667
3    -1.243333   -0.998667
4    -0.843333   -0.998667
[[-1.22126878 -0.24162391]
 [-1.36955276 -0.10741563]
 [-1.51783674  0.02679264]
 [-1.59197873  0.09389678]
 [-1.29541077 -0.17451977]]
[[-1.22126878 -0.24162391]
 [-1.36955276 -0.10741563]
 [-1.51783674  0.02679264]
 [-1.59197873  0.09389678]
 [-1.29541077 -0.17451977]]
percentage variance explained: [ 0.90964722  0.09035278]
variance per component  [ 1.14584055  0.11381323]
our percentage variance explained [0.90964721614072486, 0.090352783859275143]
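
The components that sklearn returns are the eigenvectors of the covariance matrix of the mean-centered data, sorted by eigenvalue. A minimal sketch verifying that with numpy's eigendecomposition, reusing the centered X from above (eigenvectors are only defined up to sign, so rows may differ by a factor of -1):

#sketch: PCA directions as eigenvectors of the covariance matrix
cov = np.cov(X.T)                   #2x2 covariance matrix of the centered data
evals, evecs = np.linalg.eigh(cov)  #eigh returns eigenvalues in ascending order
order = np.argsort(evals)[::-1]     #largest eigenvalue first, to match PCA's ordering
print evecs[:, order].T             #rows should match model.components_ (up to sign)
print evals[order]/evals.sum()      #should match model.explained_variance_ratio_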

Naive Bayes

In [10]:
#setup class names to be flower names
print iris[:3]
names = list(set(iris['Name'].tolist()))
nameToIndex = dict([(names[i],i) for i in range(len(names))])
print nameToIndex

#setup X and y:
iris = iris.iloc[np.random.permutation(len(iris))]
iris_train = iris[:100] #training data
iris_test = iris[100:]  #testing data
Xtrain = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtrain = iris_train[['SepalWidth']]
Xtest = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtest = iris_test[['SepalWidth']]
ytrain = np.array([nameToIndex[n] for n in iris_train['Name'].tolist()])
ytest = np.array([nameToIndex[n] for n in iris_test['Name'].tolist()])
Xtrain[['SepalWidth']].hist()
plt.show()

#setup
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
print "Class priors: ", clf.class_prior_
print "\nClass means: \n", clf.theta_
print "\nClass variance: \n", clf.sigma_
   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
{'Iris-virginica': 0, 'Iris-setosa': 1, 'Iris-versicolor': 2}
Class priors:  [ 0.39  0.31  0.3 ]

Class means: 
[[ 2.93333333  5.52051282  2.02564103  6.51538462]
 [ 3.45806452  1.49032258  0.2483871   5.04193548]
 [ 2.78333333  4.28333333  1.32333333  5.98      ]]

Class variance: 
[[ 0.09145299  0.27496384  0.07780408  0.3218146 ]
 [ 0.18953174  0.0240999   0.01217482  0.138564  ]
 [ 0.10072223  0.26205556  0.04112223  0.3056    ]]
In [11]:
#let's just look at one test example:
oneExample = Xtest.iloc[6]
print oneExample
print "\npredicted class: ", clf.predict([oneExample])
print "\nprobabilities for each class:", ["%.6f"%p for p in clf.predict_proba([oneExample])[0]]

#check accuracy for all predictions
y_hat = clf.predict(Xtest)
accuracy = sum([1 if y_hat[i] == ytest[i] else 0 for i in range(len(ytest))]) / float(len(ytest))
print accuracy
SepalWidth     3.2
PetalLength    4.8
PetalWidth     1.8
SepalLength    5.9
Name: 70, dtype: float64

predicted class:  [0]

probabilities for each class: ['0.862690', '0.000000', '0.137310']
0.98
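
To see what predict_proba is doing under the hood: for each class, Gaussian NB multiplies the class prior by the per-feature Gaussian likelihoods (using the means and variances printed above), then normalizes across classes. A minimal sketch for oneExample, reusing clf and ss from above:

#sketch: Gaussian NB posterior for one example, by hand
joint = np.array([clf.class_prior_[k] *
                  np.prod(ss.norm(clf.theta_[k], np.sqrt(clf.sigma_[k])).pdf(oneExample))
                  for k in range(len(clf.class_prior_))])
print joint / joint.sum() #should (very nearly) match clf.predict_proba([oneExample])[0]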

Time Series Analysis

In [13]:
#Let's make up some time series data:
#each index is a day; a day is ~75% likely to continue in the same direction as its trend over the last 3 days

def MakeFakeTimeSeries(mod = None): 
    #start with a random 3 days. 
    x = ss.norm(10, 1).rvs(size=3)
    #print x
    for i in range(3, 30):
        #current trend direction:
        currentTrendDir = 1 if x[i-1] - x[i-3] > 0 else -1

        #Find a magnitude of change:
        magnitudeOfChange = ss.norm(1, .7).rvs(size=1)[0]

        #75% chance it continues in the same direction
        changeDirection = 1 if ss.bernoulli(.75).rvs(size=1)[0]>0 else -1
        delta = magnitudeOfChange * (currentTrendDir * changeDirection)
        #print (currentTrendDir, magnitudeOfChange, changeDirection, delta)

        x = np.append(x, x[i-1] + delta)
        #print x
    return pd.Series(x)

x1 = MakeFakeTimeSeries()
x2 = MakeFakeTimeSeries()
x1.plot(kind='line')
x2.plot(kind='line')

print ss.pearsonr(x1, x2)

#Run this multiple times to see how often independently generated time-series-like data correlate with each other
(0.96013398088665491, 4.9133080635818904e-17)

Autocorrelation

In [14]:
def corrWithSelf(y, lag = 1):
    return ss.pearsonr(y[:-1*lag], y[lag:])

fakeTS = x1
print "fake TS autocorrelation with lag = 1: ", corrWithSelf(fakeTS)
fake TS autocorrelation with lag = 1:  (0.98240530547556293, 3.2670737373533364e-21)
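
pandas Series also have this built in: Series.autocorr(lag) computes the same lag-k Pearson correlation (without the p-value). A one-line check against corrWithSelf, reusing fakeTS from above:

#sketch: pandas' built-in equivalent of corrWithSelf
print fakeTS.autocorr(lag=1) #should match the r value printed above
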
In [15]:
#white noise:
wnoise = pd.Series(ss.norm(0, 1).rvs(size=100))
wnoise.plot(kind='line')
plt.show()
print "With noise autocorrelation with lag = 1: ", corrWithSelf(wnoise, 1)
White noise autocorrelation with lag = 1:  (-0.32868542709750792, 0.00089490406200044381)
In [16]:
#periodic data (sinusoidal):
x = np.linspace(0, 50, 100)
ssdal = pd.Series(np.sin(x) + np.random.random(100))
ssdal.plot(kind='line', lw=3, alpha=.5)
print "sinusoidal autocorrelation with lag = 1: ", corrWithSelf(ssdal)
print "sinusoidal autocorrelation with lag = 4: ", corrWithSelf(ssdal, 4)
print "sinusoidal autocorrelation with lag = 7: ", corrWithSelf(ssdal, 7)
print "sinusoidal autocorrelation with lag = 10: ", corrWithSelf(ssdal, 10)
print "sinusoidal autocorrelation with lag = 13: ", corrWithSelf(ssdal, 13)
#plot the lag=13 version: 
pd.Series(ssdal[13:].tolist()).plot(kind='line', lw=3, alpha=.6)
plt.show()
sinusoidal autocorrelation with lag = 1:  (0.75891044359011617, 8.9958915006473709e-20)
sinusoidal autocorrelation with lag = 4:  (-0.32869492916909698, 0.0010766015651224104)
sinusoidal autocorrelation with lag = 7:  (-0.80273751117274905, 3.833235950062747e-22)
sinusoidal autocorrelation with lag = 10:  (0.25028654431882447, 0.017348523672109612)
sinusoidal autocorrelation with lag = 13:  (0.82651205323817778, 6.3726702045595101e-23)
In [30]:
fcs = {'2003-12': 0.1991, '2003-10': 0.2686, '2003-11': 0.1956, '2014-09': 1.9471, '2014-08': 1.9398, '2014-05': 1.6808, '2014-04': 1.7328, '2014-07': 1.983, '2014-06': 1.8564, '2001-10': 0.25, '2001-11': 0.2393, '2001-12': 0.1948, '2014-02': 1.6022, '2010-01': 2.1771, '2010-03': 2.3553, '2010-02': 2.1249, '2010-05': 2.1108, '2010-04': 2.3586, '2002-08': 0.529, '2002-09': 0.5384, '2002-06': 0.3795, '2002-07': 0.4673, '2002-04': 0.2561, '2002-05': 0.3084, '2002-02': 0.203, '2002-03': 0.2409, '2013-02': 1.0462, '2002-01': 0.2237, '2000-01': 0.4537, '2000-02': 0.4531, '2000-03': 0.4669, '2000-04': 0.4489, '2000-05': 0.4922, '2000-06': 0.4004, '2000-07': 0.3288, '2000-08': 0.2904, '2000-09': 0.3035, '2012-07': 0.7055, '2004-12': 0.684, '2004-11': 0.6087, '2004-10': 0.5991, '2012-04': 0.8279, '2015-06': 2.5328, '2015-07': 2.4714, '2015-04': 2.7388, '2015-05': 2.5827, '2015-02': 2.3967, '2015-03': 2.6437, '2015-01': 2.1805, '2014-12': 2.2094, '2014-01': 1.5996, '2014-10': 2.1816, '2014-11': 2.163, '2011-11': 0.8462, '2011-10': 0.7601, '2015-08': 2.6307, '2011-12': 0.9108, '2012-09': 0.9374, '2012-08': 0.7143, '2014-03': 1.6035, '2012-03': 0.8842, '2012-02': 0.8243, '2012-01': 0.8573, '2010-12': 1.2324, '2012-06': 0.7444, '2010-10': 2.1948, '2010-11': 1.6971, '2012-05': 0.7355, '2004-04': 0.7475, '2004-05': 0.71, '2004-06': 0.7159, '2004-07': 0.7272, '2004-01': 0.4498, '2004-02': 0.6706, '2004-03': 0.7329, '2013-08': 1.2974, '2004-08': 0.6687, '2004-09': 0.5591, '2013-09': 1.3191, '2011-08': 0.722, '2011-09': 0.7176, '2010-07': 1.8553, '2011-02': 0.7774, '2011-03': 0.7912, '2010-06': 1.9288, '2011-06': 0.6414, '2011-07': 0.6254, '2011-04': 0.7136, '2011-05': 0.6154, '2010-09': 2.2013, '2007-02': 0.8849, '2010-08': 2.1603, '2012-10': 0.9287, '2012-11': 0.8851, '2012-12': 0.9112, '2013-06': 1.0281, '2007-09': 1.2881, '2007-08': 1.3326, '2013-07': 1.1001, '2006-11': 0.7659, '2001-09': 0.2584, '2006-12': 0.7898, '2007-01': 0.8339, '2007-03': 0.9444, '2001-08': 0.2581, '2007-05': 1.1781, '2007-04': 1.056, '2007-07': 1.294, '2007-06': 1.2395, '2005-03': 0.7924, '2005-02': 0.7441, '2005-01': 0.7297, '2005-07': 0.727, '2005-06': 0.7982, '2005-05': 0.7518, '2005-04': 0.7811, '2005-09': 0.6982, '2005-08': 0.7234, '2003-09': 0.3893, '2003-08': 0.5029, '2007-12': 1.5599, '2007-10': 1.5662, '2007-11': 1.599, '2006-02': 0.8533, '2006-03': 0.9117, '2006-01': 0.9119, '2006-06': 0.8982, '2006-07': 0.9514, '2006-04': 0.869, '2006-05': 0.8514, '2008-12': 1.8907, '2006-08': 0.9158, '2006-09': 0.8749, '2005-10': 0.7011, '2005-11': 0.6953, '2005-12': 0.7685, '2008-08': 1.8377, '2008-09': 1.911, '2013-04': 0.9641, '2008-01': 1.6453, '2008-02': 1.7691, '2008-03': 1.945, '2008-04': 1.9918, '2008-05': 1.9785, '2008-06': 1.9643, '2008-07': 1.8963, '2013-01': 0.9678, '2013-10': 1.3612, '2013-03': 1.0694, '2011-01': 0.9035, '2015-11': 2.9898, '1998-09': 0.3616, '1998-08': 0.3926, '1998-05': 0.3818, '1998-04': 0.3728, '1998-07': 0.431, '1998-06': 0.4074, '1998-01': 0.4211, '1998-03': 0.4276, '1998-02': 0.4382, '2015-10': 2.8921, '1999-11': 0.3886, '1999-10': 0.4095, '1999-12': 0.4362, '2008-11': 1.9555, '2008-10': 2.1204, '1999-06': 0.4582, '1999-07': 0.4643, '1999-04': 0.488, '1999-05': 0.4591, '1999-02': 0.3732, '1999-03': 0.4864, '1999-01': 0.3894, '1998-12': 0.4039, '1998-10': 0.3764, '1998-11': 0.3995, '1999-08': 0.456, '1999-09': 0.4113, '2015-12': 3.2485, '2009-07': 2.1571, '2009-06': 2.0605, '2009-05': 2.0474, '2009-04': 2.1199, '2009-03': 2.1778, '2009-02': 1.8791, '2009-01': 1.8403, '2009-09': 2.1493, '2009-08': 
2.2106, '2003-05': 0.4944, '2003-04': 0.5394, '2003-07': 0.4775, '2003-06': 0.4736, '2003-01': 0.5452, '2003-03': 0.5142, '2003-02': 0.4805, '2001-07': 0.2446, '2001-06': 0.2633, '2001-05': 0.3091, '2001-04': 0.3604, '2001-03': 0.3872, '2001-02': 0.3901, '2001-01': 0.3247, '2002-11': 0.5827, '2002-10': 0.5392, '2002-12': 0.6221, '2015-09': 2.6822, '2013-12': 1.4755, '2013-11': 1.412, '2006-10': 0.8813, '2000-12': 0.3148, '2000-11': 0.269, '2000-10': 0.3237, '2013-05': 1.0311, '2009-10': 2.0288, '2009-11': 2.0994, '2009-12': 2.0438}
fcs=pd.DataFrame(fcs.items()).sort_values(0)
fcs=fcs.set_index(0)
fcs.plot(kind="line")
plt.xlabel("year-month")
print "NY Foreclosures autocorrelation with lag = 1: ", corrWithSelf(ssdal)
NY Foreclosures autocorrelation with lag = 1:  (0.75891044359011617, 8.9958915006473709e-20)
In [60]:
def autocorrelate_plot(y):
    series = [corrWithSelf(y, lag = lag)[0] for lag in range(1, int(len(y)-15))]
    return pd.Series(series)

ac_fcs = autocorrelate_plot(fcs.loc[:,1])
ac_fcs.plot(kind="line")
plt.xlabel("lag")
Out[60]:
<matplotlib.text.Text at 0xd768e10>
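
Recent pandas versions also ship a ready-made version of this plot, with confidence bands, under pandas.plotting (older versions expose it as pandas.tools.plotting). A minimal sketch on the foreclosure series, assuming a pandas new enough to have pandas.plotting:

#sketch: pandas' built-in autocorrelation plot
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(fcs.loc[:, 1])
plt.show()
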
In [63]:
##more autocorrelation plots:
ac_fts = autocorrelate_plot(fakeTS)
ac_fts.plot(kind="line")
plt.xlabel("lag")
plt.show()

ac_wn = autocorrelate_plot(wnoise)
ac_wn.plot(kind="line")
plt.xlabel("lag")
plt.show()

ac_ssdal = autocorrelate_plot(ssdal)
ac_ssdal.plot(kind="line")
plt.xlabel("lag")
plt.show()