In [1]:
%matplotlib inline 

#above allows plots to display inline in the notebook. 

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with tabular data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
In [2]:
#let's just look at what happens to the logistic function as we change the beta coefficients

def logistic_function(x):
    return np.exp(x) / (1+np.exp(x))

def logistic_function_with_betas(x, beta0=0, beta1=1):
    #now using linear function: beta0 + beta1*x for the exponent:
    return np.exp(beta0 + beta1*x) / (1+np.exp(beta0 + beta1*x))

xpoints = np.linspace(-10, 10, 100)
plt.plot(xpoints, [logistic_function(x) for x in xpoints])
plt.plot(xpoints, [logistic_function_with_betas(x, 2, 1) for x in xpoints]) #beta0=2 shifts the curve left (midpoint moves to x = -2)
plt.plot(xpoints, [logistic_function_with_betas(x, 0, 3.145914159653) for x in xpoints]) #larger beta1 makes the transition steeper
plt.plot(xpoints, [logistic_function_with_betas(x, 0, 1/3.145914159653) for x in xpoints]) #smaller beta1 makes the transition more gradual
Out[2]:
[<matplotlib.lines.Line2D at 0x21aea534cc0>]
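As a quick check on the comments in the cell above (a minimal sketch reusing the function just defined): the curve crosses 0.5 where beta0 + beta1*x = 0, i.e. at x = -beta0/beta1, so beta0 slides the curve along the x-axis while beta1 only controls how steep the transition is.

for beta0, beta1 in [(0, 1), (2, 1), (0, 3.145914159653)]:
    midpoint = -beta0 / beta1  #x value where the exponent is zero
    print("beta0=%g, beta1=%g -> value at x=%g is" % (beta0, beta1, midpoint),
          logistic_function_with_betas(midpoint, beta0, beta1))  #always 0.5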

Load the data

In [67]:
#load sklearn and our data sets: 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split #used for splitting data into train and test
from sklearn.preprocessing import StandardScaler #used for scaling data to the same range
from sklearn import metrics  #for scoring performance

from sklearn.datasets import load_wine, load_breast_cancer, load_digits
#data = load_wine()
#data = load_breast_cancer()
data = load_digits()

#what is in the file:
print(data.keys(), "\n")

#print(data.DESCR)

print("classes for y:         ", data.target_names) #names of the classes
    
#print("feature (X) names:     ", data.feature_names) #names of the features

#print("number of features:    ", len(data.feature_names)) #how many features?

print("actual feature dimensions (n, m):", data.data.shape) #actual feature data
dict_keys(['data', 'target', 'target_names', 'images', 'DESCR']) 

classes for y:          [0 1 2 3 4 5 6 7 8 9]
actual feature dimensions (n, m): (1797, 64)
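A quick, optional peek at the raw data (a minimal sketch, not part of the original cell): the 'images' key listed above stores each digit as an 8x8 grid of pixel intensities, which is flattened into the 64 columns of data.data.

plt.imshow(data.images[0], cmap='gray_r')   #show the first digit as an image
plt.title("label: " + str(data.target[0]))  #its true class
plt.show()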
In [68]:
# Make a train/test split using 20% test size
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.20,
                                                    random_state=42)

print("training data: ", X_train.shape, y_train.shape) 
print("testing data:  ", X_test.shape, y_test.shape) 
training data:  (1437, 64) (1437,)
testing data:   (360, 64) (360,)
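StandardScaler was imported above but the cells below fit on the raw pixel values; here is a minimal sketch of how it could be applied if scaling were wanted (X_train_scaled and X_test_scaled are hypothetical names, not used later):

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  #learn the mean/std from the training data only
X_test_scaled = scaler.transform(X_test)        #apply the same transform to the test data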
In [135]:
#set hyperparameters for the model
#try the following values for penaltyC to see a difference: 10000, 1000, 100, 10, 1, .1, .01, .001, .0001
penaltyC = 0.0001  #regularization parameter (higher yields less regularization)
penaltyType = 'l2' #regularization function type

logreg = LogisticRegression(C=penaltyC, penalty=penaltyType, random_state=42,\
                           solver="liblinear", multi_class="auto") #instantiate a class

#learn the betas (i.e., the parameters or coefficients)
logreg.fit(X_train, y_train)
print("betas for 0vAll classifier: \n", logreg.coef_[0])
##with the liblinear solver this learns one one-vs-rest model per class (10 for digits):
#0: class_0 or not
#1: class_1 or not
#...
#9: class_9 or not


#produce class predictions: 
y_test_pred = logreg.predict(X_test) #run the model on the test set
#produce probabilities
y_test_probs = logreg.predict_proba(X_test) #run the model and get the prediction probabilities

#print accuracy: 
print('\nModel Accuracy: {:.2%}\n'.format(metrics.accuracy_score(y_test, y_test_pred)))

#print the probabilities, the predicted class, and the true class: 
np.set_printoptions(precision=6, suppress=True) #decrease decimals and scientific notation
print("",["0vAll", "1vAll", "2vAll", "...", "kvAll", "pred", "true"])
print(np.c_[y_test_probs, y_test_pred, y_test]) 
betas for 0vAll classifier: 
 [ 0.       -0.001825 -0.009848 -0.00352  -0.015155 -0.02188  -0.008859
 -0.00111  -0.000023 -0.008056  0.000131 -0.003361  0.003714  0.011588
 -0.00679  -0.001066 -0.00003   0.003444  0.007307 -0.013437 -0.027022
  0.018018  0.004135 -0.000413 -0.000012  0.011382  0.003448 -0.040871
 -0.057175 -0.003354  0.015899 -0.000038  0.        0.012687  0.011369
 -0.044007 -0.059775 -0.017442  0.016092  0.       -0.000178  0.003317
  0.025756 -0.034256 -0.03827   0.000767  0.003018 -0.000164 -0.000255
 -0.003346  0.018461 -0.002136  0.001792  0.010393 -0.015832 -0.001109
 -0.000006 -0.001829 -0.012216 -0.004175 -0.010668 -0.019519 -0.012095
 -0.001809]

Model Accuracy: 92.22%

 ['0vAll', '1vAll', '2vAll', '...', 'kvAll', 'pred', 'true']
[[0.054136 0.062127 0.035904 ... 0.06433  6.       6.      ]
 [0.094366 0.03005  0.036173 ... 0.365759 9.       9.      ]
 [0.017686 0.045614 0.087777 ... 0.108796 3.       3.      ]
 ...
 [0.091881 0.05119  0.102221 ... 0.099769 8.       8.      ]
 [0.083545 0.040892 0.136813 ... 0.203112 3.       3.      ]
 [0.090689 0.02148  0.042277 ... 0.254759 5.       5.      ]]
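Beyond overall accuracy, the metrics module imported above can break performance down per class; a minimal sketch of two extra diagnostics (not part of the original cell):

print(metrics.confusion_matrix(y_test, y_test_pred))        #rows = true class, columns = predicted class
print(metrics.classification_report(y_test, y_test_pred))   #precision, recall, and F1 for each digit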
In [132]:
#let's explore what happens to the betas
Cs = [100000, 10000, 1000, 100, 10, 1, .1, .01, .001, .0001]
penaltyTypes = ['l1', 'l2']

from numpy import log

#plot the beta values for the first 15 features, and the accuracies
# while trying different C values: 
for penaltyType in penaltyTypes:
    model0coefs = []#will hold all the coefficients
    modelAccs = []
    numCoefs = 15
    for penaltyC in Cs:
        logreg = LogisticRegression(C=penaltyC, penalty=penaltyType, random_state=42,\
                                    solver="liblinear", multi_class="auto")
        logreg.fit(X_train, y_train)
        model0coefs.append(logreg.coef_[0][:numCoefs])
        modelAccs.append(100*metrics.accuracy_score(y_test, logreg.predict(X_test)))
    
    ##PLOT the coefficients (betas)
    plt.plot(Cs, model0coefs, linewidth=2, alpha=0.75)
    plt.xscale('log')
    plt.xticks(Cs)
    plt.title("Coefficient values when increasing C for "+penaltyType+" regularization")
    plt.show()
    
    ##PLOT the accuracy
    plt.plot(Cs, modelAccs, linewidth=3)
    plt.xscale('log')
    plt.xticks(Cs)
    plt.ylim([max(np.min(modelAccs), 80), 100])
    plt.title("Accuracy % for Cs of "+penaltyType+" regularization")
    plt.show()
    
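One way to see why the l1 and l2 coefficient plots differ (a minimal sketch, using an arbitrary C=0.1): l1 regularization drives many coefficients exactly to zero, while l2 only shrinks them toward zero.

for penaltyType in penaltyTypes:
    logreg = LogisticRegression(C=0.1, penalty=penaltyType, random_state=42,
                                solver="liblinear", multi_class="auto")
    logreg.fit(X_train, y_train)
    print(penaltyType, "nonzero betas in the 0vAll model:", np.count_nonzero(logreg.coef_[0]))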
In [127]:
print("NLP is for me.")
NLP is for me.