In [1]:
%matplotlib inline

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
In [7]:
##ASSIGNMENT 1 Solution. Some steps from this setup the data for our hypothesis test so they are kept in this cell.
## A. Read in the CSV.
file = '2015 CHR Analytic Data.csv'
if (len(sys.argv) > 1) and (sys.argv[1][-4:].lower() == 'csv'):
    file = sys.argv[1]
print "loading %s" % file
data = pd.read_csv(file,sep=',',low_memory=False)

#print "\n (1) COLUMN HEADERS:"
valsColumn = [ currColumn for currColumn in data.columns  if "Value" in currColumn 
              or "COUNTYCODE" in currColumn or "County" in currColumn]
data = data[data.COUNTYCODE != 0] #filter data frame to only county rows (those with countycode != 0)
data = data[valsColumn] #filter to "value" columns
valsColumn = [v for v in valsColumn if "Value" in v] #drop the non-value columns
#print valsColumn
#A.2) TOTAL COUNTIES IN FILE: The total number of counties in the file. (see “NOTE” above)
print("\t\t\t\t%d "%(len(data.index)))
#A.3) TOTAL RANKED COUNTIES: The total number of counties without a “1” in the field “County that was not ranked”
notRankedCounties = [ idx for idx,isRanked in enumerate(data['County that was not ranked']) if isRanked==1 ]
rankedCounties = [ idx for idx,isRanked in enumerate(data['County that was not ranked']) if isRanked!=1 ]
#print("\t\t\t\t%d "%(len(rankedCounties)))

## B. Model whether a county was ranked based on its population.  
#B.4) HISTOGRAM OF POPULATION: a1_4_histpop.png: 
#A histogram of the field “'2011 population estimate Value'”. Choose an appropriate number of bins

#png_file = 'a1_4_histpop.png'
#print "\n(4) HISTOGRAM OF POPULATION: %s" % png_file
import string
if data['2011 population estimate Value'].dtype != 'int':
    data['2011 population estimate Value'] = data['2011 population estimate Value'].apply(lambda val: int(string.replace(val,',',''))) #make sure all ints
    data['2011 population estimate Value'] = data['2011 population estimate Value'].astype('int')
#hist = data['2011 population estimate Value'].hist(bins=60)

#B.5) HISTOGRAM OF LOG POPULATION: a1_5_histlog.png: Add a column, “log_pop” = log(“2011 population value”). (Side-note: log transforming the data makes it easier to model.)
#png_file = 'a1_5_histlog.png'
#print "\n(5) HISTOGRAM OF LOG POPULATION: %s" % png_file
data['log_pop'] = np.log(data['2011 population estimate Value'])
#lhist = data['log_pop'].hist(bins=30)

#B.6) KERNEL DENSITY ESTIMATES: a1_6_KDE.png: 2 kernel density plots based on log_pop: (a) counties not ranked, and (b) counties ranked. Overlay the density plots over each other into a single graph. Zoom in if necessary to see the the non-ranked distribution clearly. 
#png_file = 'a1_6_KDE.png'
#print "\n(6) KERNEL DENSITY ESTIMATES: %s" % png_file
dataRanked = data.iloc[rankedCounties]
dataNotRanked = data.iloc[notRankedCounties]
#kde = dataNotRanked['log_pop'].plot(kind='kde')

#B.7) PROBABILITY RANKED GIVEN POP: Three probabilities 
#     --The estimated probability that an unseen county would be ranked, 
#     given the following (non-logged) populations: 300, 3100, 5000. 
def probGivenPop (RPOP_data, NRPOP_data, log_pop):
    """#Approach: The two sets of data (ranked counties and not ranked) can be modeled as normal CRVs: RPOP and NRPOP
       while the question is asking about something being true (i.e. a Bernoulli): P(ranked | pop)
       where pop is population of the given county
       P(RPOP = pop) = 0 and P(RPOP = pop) = 0, since they are CRVs, but we can simply add an interval
       in order to get non-zero values. 
       Then, we can look at the ratio of P(pop -.5 < RPOP < pop + .5) to P(pop -.5 < NRPOP < pop + .5)
       and normalize by the total count of each. (we will work with everything logged)"""
    #first we compute the probabilities under each population 
    bw = .5 #bandwidth for estimates
    P_RPOP = ss.norm.cdf(log_pop+bw, RPOP_data.mean(), RPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, RPOP_data.mean(), RPOP_data.std())#probability among ranked
    P_NRPOP = ss.norm.cdf(log_pop+bw, NRPOP_data.mean(), NRPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, NRPOP_data.mean(), NRPOP_data.std())#probability among not ranked
    #next normalize by population of each to get an estimated number of counties with each population:
    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
    #finally compute the probability: counties ranked / all counties (in the given population)
    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)

#print "\t 300: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(300))
#print "\t 3100: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(3100))
#print "\t 5000: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(5000))

## C. Model the health scores as normal. As in (1), consider each column ending in “Value”. 

#    For each value column, output its mean and standard deviation according to MLE, 
#    assuming a normal distribution (pprint a dictionary of {column: (mean, std-dev), … }).
#mean_sd = data[valsColumn].describe()[1:3]
#mean_sd_dict = dict([
#    (c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4), )) for c in mean_sd.columns
#from pprint import pprint
loading 2015 CHR Analytic Data.csv



For the purposes of this assignment, we will call two continuous random variables, A and B “pseudo-independent” iff
$| E(A| B < \mu_B) - E(A| B > \mu_B) | < 0.5*\sigma_A$.

In [8]:
#since we're about to use numbers in every value column, let's make sure we're dealing with types where means can counted
for c in valsColumn:
    if not (data[c].dtype == np.float64 or data[c].dtype == np.int64):
        data[c] = data[c].apply(lambda val: float(string.replace(str(val),',',''))) ##change to string then floats... 
        data[c] = data[c].astype('float')

#    List of columns which appear to be dependent on log_pop.  
#    For the purposes of this assignment, we will call two continuous random variables, 
#    A and B “pseodo-independent” iff  | E(A| B<muB) - E(A| B>muB) | < 0.5*sigmaA.  

# Let's designate population as variable B, and every other value column as A
# frist find where B<muB and B>muB:
dataLt = data[data['log_pop']  < data['log_pop'].mean()]
dataGt = data[data['log_pop']  > data['log_pop'].mean()]

#now iterate through each potential A (value column) and test if pseudo-independent
dep_cols = list()
for c in valsColumn:
    expected_diff = np.abs(dataLt[c].mean() - dataGt[c].mean()) # | E(A| B<muB) - E(A| B>muB) |
    if expected_diff > 0.5*data[c].std(): #greater than because we're looking for dependent
print sorted(dep_cols)
['2011 population estimate Value', 'Access to exercise opportunities Value', 'Child mortality Value', 'Dentists Value', 'Drug poisoning deaths Value', 'Homicide rate Value', 'Infant mortality Value', 'Injury deaths Value', 'Mammography screening Value', 'Median household income Value', 'Motor vehicle crash deaths Value', 'Physical inactivity Value', 'Population living in a rural area Value', 'Primary care physicians Value', 'Severe housing problems Value', 'Social associations Value', 'Some college Value', 'Uninsured children Value', 'Violent crime Value']

Hypothesis Testing

Hypothesis: The coin is biased.

Formally, $H_0$: A coin is not biased. In other words: $X \sim \mathrm{Binomial}(n, 0.5)$

We want to be 95% sure before rejecting $H_0$. We can flip the coin 1,000 times and check whether the head count falls outside the middle 95% of the binomial pmf.

In [9]:
#We plan to try 100 tosses (the prose above says 1,000 -- the stale comment is fixed here):
n_tosses = 100

#Let's find the middle 95% range under the null:
#NOTE(review): the markdown states H0: X ~ Binomial(n, 0.5), but 0.16 is used
#as the null proportion here -- confirm which null was intended.
lower_bound = ss.binom.ppf(0.025, n_tosses, 0.16) ##ppf is inverse cdf (takes percentile, returns x (i.e. count))
upper_bound = ss.binom.ppf(0.975, n_tosses, 0.16)
print("We want a count outside [%d, %d] to reject the null" % (lower_bound, upper_bound))

#now let's flip a coin
def coin_flip():
    #one Bernoulli(0.25) flip: returns 1 for heads, 0 for tails
    return 1 if np.random.rand() < 0.25 else 0 #CHANGE THE COIN BIAS HERE

observed_head_count = np.sum([coin_flip() for _ in range(n_tosses)])#assume we took this from the real world...

print("After %d tosses, we ended up with a count of %d" % (n_tosses, observed_head_count))
#FIX: both prints originally sat inside the if-body, so "Null rejected!" also
#printed when we failed to reject; the rendered output shows an else was intended.
if observed_head_count >= lower_bound and observed_head_count <= upper_bound:
    print("Failed to reject the null!")
else:
    print("Null rejected!")
We want a count outside [9, 23] to reject the null
After 100 tosses, we ended up with a count of 31
Null rejected!

Hypothesis: Communities with higher population have different amounts of violent crimes (per capita) than those with lower population.

In [10]:
# Going back to the assignment 1 data:
# dataLt holds the counties in the lower half of population,
# dataGt those in the upper half.

#zoom in on Violent crime: drop nans (not a number / nulls), keep the first 100 of each group
dataLtVC = dataLt['Violent crime Value'].dropna()[:100]
dataGtVC = dataGt['Violent crime Value'].dropna()[:100]
#dataVC = pd.concat([dataLtVC, dataGtVC])

#let's see as a kde:
plt.axis([-500,1500,0,0.0035]) #zoom in to these dimensions

Using a t-test to compare means

In [11]:
##gather means, ns, and standard deviation:
mean1, mean2 = dataLtVC.mean(), dataGtVC.mean()
sd1, sd2 = dataLtVC.std(), dataGtVC.std() #standard deviation across both
n1, n2 = len(dataLtVC), len(dataGtVC)
df1, df2 = (n1 - 1), (n2 - 1)
print "Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % \
    (mean1, n1, mean2, n2)

##two sample t-test, assuming equal variance:
pooled_var = (df1*sd1**2 + df2*sd2**2) / (df1 + df2) #pooled variance
t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2)) 
print t
p = 1 - ss.t.cdf(np.abs(t), df1+df2)
print "t: %.4f, df: %.1f, p: %.5f" % (t, df1+df2, p) #one tail (if you hypothesize)
print 't-statistic = %6.3f pvalue = %6.4f' %  ss.ttest_ind(dataLtVC, dataGtVC)#two tails
Mean of lower 50%: 318.1 (100) 
Mean of upper 50%: 407.6 (100) 
t: -2.8981, df: 198.0, p: 0.00209
t-statistic = -2.898 pvalue = 0.0042

Shape of a Student's t distribution

In [12]:
##what does a t distribution look like?
xpoints = np.linspace(-5, 5, 200)
y_3df = ss.t.pdf(xpoints, 3)      #few degrees of freedom: heavier tails
y_8df = ss.t.pdf(xpoints, 8)      #FIX: renamed from y_6df -- the df actually used here is 8
y_bigdf = ss.t.pdf(xpoints, 1000) #large df: should hug the standard normal
plt.plot(xpoints, y_3df, linewidth=2, color='#5555ff')#blue
plt.plot(xpoints, y_8df, linewidth=2, color='#22cc22')#green
plt.plot(xpoints, y_bigdf, linewidth=2, color='#ff5555')#red

##how does it compare to a normal?
y_znormal = ss.norm.pdf(xpoints, 0, 1)
plt.plot(xpoints, y_znormal, linewidth=2, color='#dddd33')#yellow
[<matplotlib.lines.Line2D at 0x7f630e106f50>]

Linear Regression

In [28]:
#let's get back to working with the original data
data = data[~data['Violent crime Value'].isnull()]#drop nas
data = data[data['Violent crime Value']!=0]#drop zeros
data['vc']= data['Violent crime Value']

#let's just see how this data lays out:
data.plot(kind='scatter', x = 'log_pop', y='vc', alpha=0.3)

#if we wanted to see the scatter without logging population:
#data.plot(kind='scatter', x = '2011 population estimate Value', y='vc', alpha=0.3)

##if we want to have scipy figure out the regression coefficients:
beta1, beta0, r, p, stderr = ss.linregress(data['log_pop'], data['vc'])# we will talk about r, p, and stderr next
#(assume beta1 and beta0 are estimate; i.e. beta1hat, beta0hat; we will almost never know non-estimated betas)
print "yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1)
#now plot the line:
xpoints = np.linspace(data['log_pop'].min()*.90, data['log_pop'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
yhat = beta0 + beta1*x = -253.33 + 48.65x
[[22 28]
 [49 64]]
[[14 32]
 [32 77]]
[[ 1.42592593 -0.59259259]
 [-0.59259259  0.25925926]]

Matrix Linear Algebra Functions

In [29]:
X1 = np.array([[1, 2, 3],[4, 5 ,6]])
X2 = np.array([[1, 2],[3, 4], [5, 6]])
X1X2 =, X2) # dot product multiplication
print X1X2
X1X1t =, np.transpose(X1))#transpose
print X1X1t
X1X1t_inv = np.linalg.inv(X1X1t) #matrix inversion: X^{-1}
print X1X1t_inv
[[22 28]
 [49 64]]
[[14 32]
 [32 77]]
[[ 1.42592593 -0.59259259]
 [-0.59259259  0.25925926]]
In [36]:
#back to regression: 
#good to transform vc (violent crime) to be more normal before fitting

#before transform: correlation of log population with raw violent crime
print "before logging VC; r = %.3f, p = %.5f" % ss.pearsonr(data['log_pop'], data['vc'])
#+1 guards against log(0) (zeros were dropped earlier, but this keeps it safe)
data['log_vc'] = np.log(data['vc']+1)
data['log_vc'].plot(kind='kde')#after transform
print "after logging VC; r = %.3f, p = %.5f" % ss.pearsonr(data['log_pop'], data['log_vc'])

data.plot(kind='scatter', x = 'log_pop', y='log_vc', alpha=0.3)
beta1, beta0, r, p, stderr = ss.linregress(data['log_pop'], data['log_vc'])# slope, intercept, correlation r, p-value, std error
xpoints = np.linspace(data['log_pop'].min()*.90, data['log_pop'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
print "from linregress: r = %.3f, p = %.5f" % (r, p)

#note that the red line has a steeper slope now => greater r
before logging VC; r = 0.354, p = 0.00000
after logging VC; r = 0.400, p = 0.00000
from linregress: r = 0.400, p = 0.00000
In [6]:
def logistic_function(x):
    """Standard logistic (sigmoid): maps any real x into (0, 1).

    FIX: written as 1/(1 + exp(-x)) rather than exp(x)/(1 + exp(x)) so that a
    large positive x does not overflow np.exp and return nan (inf/inf).
    """
    return 1.0 / (1.0 + np.exp(-x))

#evaluate the logistic over a grid and plot the s-curve
xs = np.linspace(-10, 10, 100)
plt.plot(xs, [logistic_function(v) for v in xs])
[<matplotlib.lines.Line2D at 0x7f630f93f050>]