In [3]:
%matplotlib inline

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized (DataFrames); plays well with numpy and matplotlib
import matplotlib
import matplotlib.pyplot as plt #plot visualization
In [4]:
##ASSIGNMENT 1 Solution. Some steps from this set up the data for our hypothesis test, so they are kept in this cell.
## A. Read in the CSV.
file = '2015 CHR Analytic Data.csv'
if (len(sys.argv) > 1) and (sys.argv[1][-4:].lower() == 'csv'):
    file = sys.argv[1]
print "loading %s" % file
data = pd.read_csv(file,sep=',',low_memory=False)

#A.1) COLUMN HEADERS:
#print "\n (1) COLUMN HEADERS:"
valsColumn = [ currColumn for currColumn in data.columns  if "Value" in currColumn 
              or "COUNTYCODE" in currColumn or "County" in currColumn]
data = data[data.COUNTYCODE != 0] #filter data frame to only county rows (those with countycode != 0)
data = data[valsColumn] #filter to "value" columns
valsColumn = [v for v in valsColumn if "Value" in v] #drop the non-value columns
#print valsColumn
#A.2) TOTAL COUNTIES IN FILE: The total number of counties in the file. (see “NOTE” above)
print "\n(2) TOTAL COUNTIES IN FILE:"
print("\t\t\t\t%d "%(len(data.index)))
#A.3) TOTAL RANKED COUNTIES: The total number of counties without a “1” in the field “County that was not ranked”
notRankedCounties = [ idx for idx,isRanked in enumerate(data['County that was not ranked']) if isRanked==1 ]
rankedCounties = [ idx for idx,isRanked in enumerate(data['County that was not ranked']) if isRanked!=1 ]
#print "\n(3) TOTAL RANKED COUNTIES:"
#print("\t\t\t\t%d "%(len(rankedCounties)))

## B. Model whether a county was ranked based on its population.  
#B.4) HISTOGRAM OF POPULATION: a1_4_histpop.png: 
#A histogram of the field “'2011 population estimate Value'”. Choose an appropriate number of bins

#png_file = 'a1_4_histpop.png'
#print "\n(4) HISTOGRAM OF POPULATION: %s" % png_file
if data['2011 population estimate Value'].dtype != 'int':
    data['2011 population estimate Value'] = data['2011 population estimate Value'].apply(lambda val: int(str(val).replace(',',''))) #strip commas so all values parse as ints
    data['2011 population estimate Value'] = data['2011 population estimate Value'].astype('int')
#hist = data['2011 population estimate Value'].hist(bins=60)
#hist.get_figure().savefig(png_file)
#plt.show()

#B.5) HISTOGRAM OF LOG POPULATION: a1_5_histlog.png: Add a column, “log_pop” = log(“2011 population value”). (Side-note: log transforming the data makes it easier to model.)
#png_file = 'a1_5_histlog.png'
#print "\n(5) HISTOGRAM OF LOG POPULATION: %s" % png_file
data['log_pop'] = np.log(data['2011 population estimate Value'])
#lhist = data['log_pop'].hist(bins=30)
#lhist.get_figure().savefig(png_file)
#plt.show()

#B.6) KERNEL DENSITY ESTIMATES: a1_6_KDE.png: 2 kernel density plots based on log_pop: (a) counties not ranked, and (b) counties ranked. Overlay the density plots over each other into a single graph. Zoom in if necessary to see the non-ranked distribution clearly. 
#png_file = 'a1_6_KDE.png'
#print "\n(6) KERNEL DENSITY ESTIMATES: %s" % png_file
dataRanked = data.iloc[rankedCounties]
dataNotRanked = data.iloc[notRankedCounties]
#dataRanked['log_pop'].plot(kind='kde')
#kde = dataNotRanked['log_pop'].plot(kind='kde')
#kde.get_figure().savefig(png_file)
#plt.show()

#B.7) PROBABILITY RANKED GIVEN POP: Three probabilities 
#     --The estimated probability that an unseen county would be ranked, 
#     given the following (non-logged) populations: 300, 3100, 5000. 
#print "\n(7) PROBABILITY RANKED GIVEN POP:"
def probGivenPop (RPOP_data, NRPOP_data, log_pop):
    """#Approach: The two sets of data (ranked counties and not ranked) can be modeled as normal CRVs: RPOP and NRPOP
       while the question is asking about something being true (i.e. a Bernoulli): P(ranked | pop)
       where pop is population of the given county
       P(RPOP = pop) = 0 and P(NRPOP = pop) = 0, since they are CRVs, but we can simply add an interval
       in order to get non-zero values. 
       Then, we can look at the ratio of P(pop -.5 < RPOP < pop + .5) to P(pop -.5 < NRPOP < pop + .5)
       and normalize by the total count of each. (we will work with everything logged)"""
    
    #first we compute the probabilities under each population 
    bw = .5 #bandwidth for estimates
    P_RPOP = ss.norm.cdf(log_pop+bw, RPOP_data.mean(), RPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, RPOP_data.mean(), RPOP_data.std())#probability among ranked
    P_NRPOP = ss.norm.cdf(log_pop+bw, NRPOP_data.mean(), NRPOP_data.std()) - \
        ss.norm.cdf(log_pop-bw, NRPOP_data.mean(), NRPOP_data.std())#probability among not ranked
    
    #next scale by the number of counties in each group to get an estimated count of counties at this population:
    Est_Counties_Ranked_at_pop = P_RPOP * len(RPOP_data)
    Est_Counties_NotRanked_at_pop = P_NRPOP * len(NRPOP_data)
    
    #finally compute the probability: counties ranked / all counties (in the given population)
    return Est_Counties_Ranked_at_pop / (Est_Counties_Ranked_at_pop + Est_Counties_NotRanked_at_pop)

#print "\t 300: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(300))
#print "\t 3100: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(3100))
#print "\t 5000: %.4f" % probGivenPop(dataRanked['log_pop'], dataNotRanked['log_pop'], np.log(5000))


## C. Model the health scores as normal. As in (1), consider each column ending in “Value”. 

#C.8) LIST MEAN AND STD_DEV PER COLUMN: 
#    For each value column, output its mean and standard deviation according to MLE, 
#    assuming a normal distribution (pprint a dictionary of {column: (mean, std-dev), … }).
#print "\n(8) LIST MEAN AND STD_DEV PER COLUMN:"
#mean_sd = data[valsColumn].describe()[1:3]
#mean_sd_dict = dict([
#    (c, (round(mean_sd[c]['mean'], 4), round(mean_sd[c]['std'], 4), )) for c in mean_sd.columns
#])
#from pprint import pprint
#pprint(mean_sd_dict)
loading 2015 CHR Analytic Data.csv

(2) TOTAL COUNTIES IN FILE:
				3141 

Pseudo-independence:

For the purposes of this assignment, we will call two continuous random variables, A and B, “pseudo-independent” iff
$| E(A \mid B < \mu_B) - E(A \mid B > \mu_B) | < 0.5\,\sigma_A$.

In [5]:
#since we're about to use numbers in every value column, let's make sure we're dealing with types where means can be computed
for c in valsColumn:
    if not (data[c].dtype == np.float64 or data[c].dtype == np.int64):
        data[c] = data[c].apply(lambda val: float(str(val).replace(',',''))) ##cast to string, strip commas, then convert to float
        data[c] = data[c].astype('float')

#c.9) PSEUDO-POP-DEPENDENT COLUMNS:
#    List of columns which appear to be dependent on log_pop.  
#    For the purposes of this assignment, we will call two continuous random variables, 
#    A and B “pseudo-independent” iff  | E(A| B<muB) - E(A| B>muB) | < 0.5*sigmaA.  

print "\n(9) PSEUDO_POP_DEPENDENT COLUMNS:"
# Let's designate population as variable B, and every other value column as A
# first find where B<muB and B>muB:
dataLt = data[data['log_pop']  < data['log_pop'].mean()]
dataGt = data[data['log_pop']  > data['log_pop'].mean()]

#now iterate through each potential A (value column) and test if pseudo-independent
dep_cols = list()
for c in valsColumn:
    expected_diff = np.abs(dataLt[c].mean() - dataGt[c].mean()) # | E(A| B<muB) - E(A| B>muB) |
    if expected_diff > 0.5*data[c].std(): #greater than because we're looking for dependent
        dep_cols.append(c)
        
print sorted(dep_cols)
(9) PSEUDO_POP_DEPENDENT COLUMNS:
['2011 population estimate Value', 'Access to exercise opportunities Value', 'Child mortality Value', 'Dentists Value', 'Drug poisoning deaths Value', 'Homicide rate Value', 'Infant mortality Value', 'Injury deaths Value', 'Mammography screening Value', 'Median household income Value', 'Motor vehicle crash deaths Value', 'Physical inactivity Value', 'Population living in a rural area Value', 'Primary care physicians Value', 'Severe housing problems Value', 'Social associations Value', 'Some college Value', 'Uninsured children Value', 'Violent crime Value']

Hypothesis Testing

Hypothesis: The coin is biased.

Formally, $H_0$: the coin is not biased. In other words, $X \sim \mathrm{Binomial}(n, 0.5)$, where $X$ is the number of heads in $n$ tosses.

We want to be 95% confident before we reject $H_0$. We flip the coin a fixed number of times (100 in the cell below) and reject $H_0$ if the observed head count falls outside the middle 95% of the binomial pmf under the null.

In [6]:
#We plan to try 100 tosses:
n_tosses = 100

#Let's find the middle 95% range under the null probability
#(set to 0.16 here; use 0.5 for the fair-coin null stated above):
lower_bound = ss.binom.ppf(0.025, n_tosses, 0.16) ##ppf is inverse cdf (takes percentile, returns x (i.e. count))
upper_bound = ss.binom.ppf(0.975, n_tosses, 0.16)
print "We want a count outside [%d, %d] to reject the null" % (lower_bound, upper_bound)

#now let's flip a coin
def coin_flip():
    return 1 if np.random.rand() < 0.25 else 0 #CHANGE THE COIN BIAS HERE


observed_head_count = np.sum([coin_flip() for _ in range(n_tosses)])#assume we took this from the real world...

print "After %d tosses, we ended up with a count of %d" % (n_tosses, observed_head_count)
if lower_bound <= observed_head_count <= upper_bound:
    print "Failed to reject the null!"
else:
    print "Null rejected!"
We want a count outside [9, 23] to reject the null
After 100 tosses, we ended up with a count of 25
Null rejected!
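
As a cross-check (a sketch, not part of the original demo), scipy.stats also provides an exact binomial test; it should roughly agree with the interval approach above when given the same null probability.

In [ ]:
#sketch: exact two-sided binomial test under the same null probability used for the bounds above
p_exact = ss.binom_test(observed_head_count, n_tosses, 0.16)
print "exact binomial test p-value: %.5f" % p_exact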

Hypothesis: Communities with higher populations have different rates of violent crime (per capita) than those with lower populations.

In [7]:
# Going back to the assignment 1 data: 
#dataLt is data for the counties in the lower half of population
#dataGt is data for the counties in the upper half of population

#let's zoom in on Violent crime and remove nans  (not a number / nulls)
dataLtVC = dataLt['Violent crime Value'][~dataLt['Violent crime Value'].isnull()][:100] #or .dropna()
dataGtVC = dataGt['Violent crime Value'][~dataGt['Violent crime Value'].isnull()][:100]
#dataVC = pd.concat([dataLtVC, dataGtVC])

#let's see as a kde:
dataLtVC.plot(kind='kde')
dataGtVC.plot(kind='kde')
plt.axis([-500,1500,0,0.0035]) #zoom in to these dimensions
plt.show()

Using a t-test to compare means

In [8]:
##gather means, ns, and standard deviation:
mean1, mean2 = dataLtVC.mean(), dataGtVC.mean()
sd1, sd2 = dataLtVC.std(), dataGtVC.std() #standard deviation of each sample
n1, n2 = len(dataLtVC), len(dataGtVC)
df1, df2 = (n1 - 1), (n2 - 1)
print "Mean of lower 50%%: %.1f (%d) \nMean of upper 50%%: %.1f (%d) \n " % \
    (mean1, n1, mean2, n2)

##two sample t-test, assuming equal variance:
pooled_var = (df1*sd1**2 + df2*sd2**2) / (df1 + df2) #pooled variance
t = (mean1 - mean2) / np.sqrt(pooled_var * (1.0/n1 + 1.0/n2)) 
print t
p = 1 - ss.t.cdf(np.abs(t), df1+df2)
print "t: %.4f, df: %.1f, p: %.5f" % (t, df1+df2, p) #one tail (if you hypothesize)
print 't-statistic = %6.3f pvalue = %6.4f' %  ss.ttest_ind(dataLtVC, dataGtVC)#two tails
Mean of lower 50%: 318.1 (100) 
Mean of upper 50%: 407.6 (100) 
 
-2.89806489604
t: -2.8981, df: 198.0, p: 0.00209
t-statistic = -2.898 pvalue = 0.0042
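
The pooled-variance test above assumes the two groups have equal variance. If that assumption is doubtful, Welch's t-test (a small sketch below) drops it and adjusts the degrees of freedom; scipy supports this via the equal_var flag.

In [ ]:
#sketch: Welch's t-test, which does not assume equal variances
print 't (Welch) = %6.3f, pvalue = %6.4f' % ss.ttest_ind(dataLtVC, dataGtVC, equal_var=False)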

Shape of a Student's t distribution

In [9]:
##what does a t distribution look like?
xpoints = np.linspace(-5, 5, 200)
y_3df = ss.t.pdf(xpoints, 3)
y_8df = ss.t.pdf(xpoints, 8)
y_bigdf = ss.t.pdf(xpoints, 1000)
plt.plot(xpoints, y_3df, linewidth=2, color='#5555ff')#blue: 3 df
plt.plot(xpoints, y_8df, linewidth=2, color='#22cc22')#green: 8 df
plt.plot(xpoints, y_bigdf, linewidth=2, color='#ff5555')#red: 1000 df

##how does it compare to a normal?
y_znormal = ss.norm.pdf(xpoints, 0, 1)
plt.plot(xpoints, y_znormal, linewidth=2, color='#dddd33')#yellow
Out[9]:
[<matplotlib.lines.Line2D at 0x7f5937cbac50>]
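
To put numbers on the same picture (a small sketch): the two-tailed .05 critical value of the t distribution shrinks toward the standard normal's as the degrees of freedom grow.

In [ ]:
#sketch: t critical values approach the normal critical value as df increases
for df in (3, 8, 30, 1000):
    print "two-tailed .05 critical value, t with df=%4d: %.3f" % (df, ss.t.ppf(0.975, df))
print "two-tailed .05 critical value, standard normal: %.3f" % ss.norm.ppf(0.975)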

Linear Regression

In [10]:
#let's get back to working with the original data
data = data[~data['Violent crime Value'].isnull()]#drop nas
data = data[data['Violent crime Value']!=0]#drop zeros
data['vc']= data['Violent crime Value']

#let's just see how this data lays out:
data.plot(kind='scatter', x = 'log_pop', y='vc', alpha=0.3)

#if we wanted to see the scatter without logging population:
#data.plot(kind='scatter', x = '2011 population estimate Value', y='vc', alpha=0.3)

##if we want to have scipy figure out the regression coefficients:
beta1, beta0, r, p, stderr = ss.linregress(data['log_pop'], data['vc'])# we will talk about r, p, and stderr next
#(assume beta1 and beta0 are estimates; i.e. beta1hat, beta0hat; we will almost never know the non-estimated betas)
print "yhat = beta0 + beta1*x = %.2f + %.2fx" % (beta0, beta1)
#now plot the line:
xpoints = np.linspace(data['log_pop'].min()*.90, data['log_pop'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
yhat = beta0 + beta1*x = -253.33 + 48.65x
Out[10]:
[<matplotlib.lines.Line2D at 0x7f59381b0250>]
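
To read the fitted line back on the original scale (a sketch, using a made-up county population of 50,000): plug log(pop) into yhat = beta0 + beta1*x.

In [ ]:
#sketch: predicted violent crime value for a hypothetical county of 50,000 people
print "predicted vc at pop=50,000: %.1f" % (beta0 + beta1*np.log(50000))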

Matrix Linear Algebra Functions

In [11]:
X1 = np.array([[1, 2, 3],[4, 5 ,6]])
X2 = np.array([[1, 2],[3, 4], [5, 6]])
X1X2 = np.dot(X1, X2) # matrix multiplication
print X1X2
X1X1t = np.dot(X1, np.transpose(X1))#transpose
print X1X1t
X1X1t_inv = np.linalg.inv(X1X1t) #matrix inversion: X^{-1}
print X1X1t_inv
[[22 28]
 [49 64]]
[[14 32]
 [32 77]]
[[ 1.42592593 -0.59259259]
 [-0.59259259  0.25925926]]
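
These three operations are enough to compute OLS coefficients directly. The sketch below applies the normal equations, $\hat\beta = (X^TX)^{-1}X^Ty$, to the log_pop vs. vc regression from the previous section; it should reproduce the linregress estimates.

In [ ]:
#sketch: OLS via the normal equations, using the matrix ops above
Xd = np.array([np.ones(len(data)), data['log_pop']]).T #design matrix: intercept column + log_pop
y = np.array(data['vc'])
betahat = np.dot(np.linalg.inv(np.dot(Xd.T, Xd)), np.dot(Xd.T, y))
print "normal equations: yhat = %.2f + %.2fx" % (betahat[0], betahat[1])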
In [12]:
#back to regression: 
#it helps to transform vc to be more normal (less skewed) before regressing

#before transform
data['vc'].plot(kind='kde') 
print "before logging VC; r = %.3f, p = %.5f" % ss.pearsonr(data['log_pop'], data['vc'])
plt.show()
data['log_vc'] = np.log(data['vc']+1)
data['log_vc'].plot(kind='kde')#after transform
print "after logging VC; r = %.3f, p = %.5f" % ss.pearsonr(data['log_pop'], data['log_vc'])

data.plot(kind='scatter', x = 'log_pop', y='log_vc', alpha=0.3)
beta1, beta0, r, p, stderr = ss.linregress(data['log_pop'], data['log_vc'])# we will talk about r, p, and stderr next
xpoints = np.linspace(data['log_pop'].min()*.90, data['log_pop'].max()*1.1, 100)
plt.plot(xpoints, beta0 + beta1*xpoints, color='red', linewidth=2) #note: "beta0 + beta1*xpoints" is using vector algebra
plt.show()
print "from linregress: r = %.3f, p = %.5f" % (r, p)

#note that the points track the red line more closely after the log transform (more linear relationship) => greater r
before logging VC; r = 0.354, p = 0.00000
after logging VC; r = 0.400, p = 0.00000
from linregress: r = 0.400, p = 0.00000
In [13]:
def logistic_function(x):
    return np.exp(x) / (1+np.exp(x))

xpoints = np.linspace(-10, 10, 100)
plt.plot(xpoints, [logistic_function(x) for x in xpoints])
Out[13]:
[<matplotlib.lines.Line2D at 0x7f5938bfa910>]
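
The logistic function is the link used in logistic regression: it squashes a linear predictor into a probability. Below is a minimal synthetic sketch (made-up data; assumes statsmodels' Logit, imported inside the cell) that recovers known coefficients.

In [ ]:
#sketch: logistic regression on synthetic data, recovering known coefficients
import statsmodels.api as sm
np.random.seed(0)
x_syn = np.random.normal(size=500)
p_syn = logistic_function(-1 + 2*x_syn)           #true model: beta0=-1, beta1=2
y_syn = (np.random.rand(500) < p_syn).astype(int) #draw 0/1 outcomes from those probabilities
logit_fit = sm.Logit(y_syn, sm.add_constant(x_syn)).fit(disp=0)
print logit_fit.params #should come out near the true values (-1, 2)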

Mediation, Moderation, HLM

In [14]:
#working with vc, log_pop, and premature death (pd)
data['pd'] = data['Premature death Value']
data = data[~data['pd'].isnull()]#drop nas
data = data[data['pd']!=0]#drop zeros
data['pd'].hist(bins=20)
plt.show()
data['log_pd'] = np.log(data['pd']+1)
data['log_pd'].hist(bins=20)
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5938b66d50>
In [15]:
import statsmodels.api as sm

#mediation analysis: trying to predict rates of premature death from rates of violent crime, with (log) population as the mediator M
X = ss.zscore(data['log_vc'])
Y = ss.zscore(data['log_pd'])
M = ss.zscore(data['log_pop'])
print X
                
result1 = sm.OLS(Y, X).fit()
print result1.summary()




result2 = sm.OLS(X, M).fit()
print result2.summary()




#result3 = sm.OLS(Y, X).fit()
XM = np.array([X,M]).T
print XM
result3 = sm.OLS(Y, XM).fit()
print result3.summary()
[ 0.3461505   0.1716063  -0.33784993 ..., -1.79807896 -1.26295055
 -0.8436973 ]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.060
Method:                 Least Squares   F-statistic:                     181.5
Date:                Thu, 24 Mar 2016   Prob (F-statistic):           3.84e-40
Time:                        15:21:39   Log-Likelihood:                -3958.8
No. Observations:                2852   AIC:                             7920.
Df Residuals:                    2851   BIC:                             7926.
Df Model:                           1                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.2446      0.018     13.471      0.000         0.209     0.280
==============================================================================
Omnibus:                       10.755   Durbin-Watson:                   1.360
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               13.221
Skew:                           0.053   Prob(JB):                      0.00135
Kurtosis:                       3.316   Cond. No.                         1.00
==============================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.144
Model:                            OLS   Adj. R-squared:                  0.144
Method:                 Least Squares   F-statistic:                     480.4
Date:                Thu, 24 Mar 2016   Prob (F-statistic):           1.51e-98
Time:                        15:21:39   Log-Likelihood:                -3824.7
No. Observations:                2852   AIC:                             7651.
Df Residuals:                    2851   BIC:                             7657.
Df Model:                           1                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.3798      0.017     21.919      0.000         0.346     0.414
==============================================================================
Omnibus:                       50.360   Durbin-Watson:                   1.388
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               52.694
Skew:                          -0.322   Prob(JB):                     3.61e-12
Kurtosis:                       3.167   Cond. No.                         1.00
==============================================================================
[[ 0.3461505   0.33717488]
 [ 0.1716063   1.27606381]
 [-0.33784993 -0.1925588 ]
 ..., 
 [-1.79807896 -0.37899474]
 [-1.26295055 -1.05640533]
 [-0.8436973  -1.18080604]]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.223
Model:                            OLS   Adj. R-squared:                  0.223
Method:                 Least Squares   F-statistic:                     409.2
Date:                Thu, 24 Mar 2016   Prob (F-statistic):          5.85e-157
Time:                        15:21:39   Log-Likelihood:                -3686.8
No. Observations:                2852   AIC:                             7378.
Df Residuals:                    2850   BIC:                             7390.
Df Model:                           2                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4105      0.018     23.000      0.000         0.375     0.445
x2            -0.4368      0.018    -24.473      0.000        -0.472    -0.402
==============================================================================
Omnibus:                       65.185   Durbin-Watson:                   1.330
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              142.232
Skew:                          -0.054   Prob(JB):                     1.30e-31
Kurtosis:                       4.089   Cond. No.                         1.49
==============================================================================
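
A common follow-up to the three regressions above is a Sobel test of the indirect effect. The sketch below assumes the roles implied by the variable names (X = log_vc as predictor, M = log_pop as mediator, Y = log_pd as outcome) and refits the a path as M ~ X so the coefficients line up with the usual notation.

In [ ]:
#sketch: Sobel test of the indirect effect for the mediation above
path_a = sm.OLS(M, X).fit()                  #a path: predictor (log_vc) -> mediator (log_pop)
a, se_a = path_a.params[0], path_a.bse[0]
b, se_b = result3.params[1], result3.bse[1]  #b path: mediator -> outcome, controlling for the predictor
sobel_z = (a * b) / np.sqrt(b**2 * se_a**2 + a**2 * se_b**2)
print "indirect effect a*b = %.4f, Sobel z = %.2f, p = %.5f" % (a*b, sobel_z, 2*(1 - ss.norm.cdf(np.abs(sobel_z))))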
In [16]:
#moderation
# X: vc
# Y: PD
# M: pop

meanM = np.mean(M)
Mbin = np.array([0 if m < meanM else 1 for m in M])
print Mbin

XMbin = X*Mbin #element-wise multiplication: X where M is above its mean, 0 otherwise
print XMbin

Xconcat = np.array([[1]*X.shape[0], X, XMbin, Mbin]).T
print Xconcat
print Y.shape
print Xconcat.shape
result = sm.OLS(Y, Xconcat).fit()
print result.summary()
[1 1 0 ..., 0 0 0]
[ 0.3461505  0.1716063 -0.        ..., -0.        -0.        -0.       ]
[[ 1.          0.3461505   0.3461505   1.        ]
 [ 1.          0.1716063   0.1716063   1.        ]
 [ 1.         -0.33784993 -0.          0.        ]
 ..., 
 [ 1.         -1.79807896 -0.          0.        ]
 [ 1.         -1.26295055 -0.          0.        ]
 [ 1.         -0.8436973  -0.          0.        ]]
(2852,)
(2852, 4)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.171
Model:                            OLS   Adj. R-squared:                  0.170
Method:                 Least Squares   F-statistic:                     195.7
Date:                Thu, 24 Mar 2016   Prob (F-statistic):          2.18e-115
Time:                        15:21:39   Log-Likelihood:                -3779.6
No. Observations:                2852   AIC:                             7567.
Df Residuals:                    2848   BIC:                             7591.
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.2957      0.024     12.463      0.000         0.249     0.342
x1             0.2931      0.022     13.195      0.000         0.250     0.337
x2             0.1410      0.037      3.789      0.000         0.068     0.214
x3            -0.7024      0.036    -19.472      0.000        -0.773    -0.632
==============================================================================
Omnibus:                       31.732   Durbin-Watson:                   1.351
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               53.224
Skew:                          -0.014   Prob(JB):                     2.77e-12
Kurtosis:                       3.669   Cond. No.                         2.78
==============================================================================
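
The median split above throws away information in the moderator. A common alternative (sketched below, not part of the original demo) keeps M continuous and tests the X*M product term directly.

In [ ]:
#sketch: moderation with a continuous moderator, via an X*M interaction term
Xint = np.array([[1]*X.shape[0], X, M, X*M]).T #intercept, predictor, moderator, interaction
result_int = sm.OLS(Y, Xint).fit()
print result_int.summary() #the x3 (X*M) coefficient tests moderation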

Discrete Variable Comparison Metrics

In [97]:
#each row is a person; columns: cancer, smoker
people = [[0, 0],
          [0, 0], 
          [1, 1],#tp 
          [0, 0],#tn
          [1, 1], 
          [1, 1], 
          [0, 1],#fp
          [0, 0],
          [1, 0],#fn
          [0, 1],
          [0, 0],
          [1, 1],
          [1, 1],
          [0, 0],
          [0, 0], 
          [0, 0]]
people = pd.DataFrame(np.array(people), columns=['cancer', 'smoker'])

#build contingency table:
a = people['cancer']>0
b = people['smoker']>0
cntg_people = people.groupby([a,b]).count()#['cancer']
X = cntg_people['cancer'].unstack().as_matrix()
print "Observed:"
print X

#calculate chi^2
E = np.array([[(np.sum(X[i,:])*np.sum(X[:,j]))/float(len(people))
         for j in range(X.shape[1])] for i in range(X.shape[0])])
print "Expected given the base rates:"
print E
U = np.sum( [((X[i,j]-E[i,j])**2)/E[i,j] for i in range(X.shape[0]) for j in range(X.shape[1]) ]) 
df = (X.shape[0]-1)*(X.shape[1]-1)
print "chi2: %.3f, df: %.3f" % (U, df)
print "p*(u < U) ~= %.5f" % float(1-ss.chi2(df).cdf(U))

#check against scipy stats' version:
U2, p2, df2, expected = ss.chi2_contingency(X, False)
print "from scipy.stats:"
print "chi2: %.3f, df: %.3f" % (U2, df2)
print "p*(u < U) ~= %.5f" % p
Observed:
[[8 2]
 [1 5]]
Expected given the base rates:
[[ 5.625  4.375]
 [ 3.375  2.625]]
chi2: 6.112, df: 1.000
P(u > U) ~= 0.01343
from scipy.stats:
chi2: 6.112, df: 1.000
P(u > U) ~= 0.01343
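
With expected cell counts this small (several below 5), the chi-square approximation is rough; Fisher's exact test, sketched below, avoids it for a 2x2 table.

In [ ]:
#sketch: Fisher's exact test on the same 2x2 contingency table
odds_ratio, p_fisher = ss.fisher_exact(X)
print "Fisher's exact: odds ratio = %.2f, p = %.5f" % (odds_ratio, p_fisher)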
In [ ]: