%matplotlib inline
#above allows plots to display on the screen.
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
#let's just look at what happens to the logit function as we change the beta coefficients
def logistic_function(x):
    return np.exp(x) / (1 + np.exp(x))

def logistic_function_with_betas(x, beta0=0, beta1=1):
    #now using linear function: beta0 + beta1*x for the exponent:
    return np.exp(beta0 + beta1*x) / (1 + np.exp(beta0 + beta1*x))
xpoints = np.linspace(-10, 10, 100)
plt.plot(xpoints, [logistic_function(x) for x in xpoints])
plt.plot(xpoints, [logistic_function_with_betas(x, 2, 1) for x in xpoints]) #beta0=2 shifts the curve left (moves the midpoint away from zero)
plt.plot(xpoints, [logistic_function_with_betas(x, 0, 3.145914159653) for x in xpoints]) #larger beta1 makes the curve rise more steeply
plt.plot(xpoints, [logistic_function_with_betas(x, 0, 1/3.145914159653) for x in xpoints]) #smaller beta1 makes the curve rise more gradually
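#added illustration (not part of the original walkthrough): the curve's midpoint,
#where the output equals 0.5, sits at x = -beta0/beta1, so beta0 shifts that
#midpoint while beta1 controls how sharply the curve rises around it.
beta0_demo, beta1_demo = 2, 1  #hypothetical values chosen just for this check
print("value at x = -beta0/beta1:",
      logistic_function_with_betas(-beta0_demo/beta1_demo, beta0_demo, beta1_demo))  #expect 0.5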
#load sklearn and our data sets:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split #used for splitting data into train and test
from sklearn.preprocessing import StandardScaler #used for scaling data to the same range
from sklearn import metrics #for scoring performance
from sklearn.datasets import load_wine, load_breast_cancer, load_digits
#data = load_wine()
#data = load_breast_cancer()
data = load_digits()
#what is in the file:
print(data.keys(), "\n")
#print(data.DESCR)
print("classes for y: ", data.target_names) #names of the classes
#print("feature (X) names: ", data.feature_names) #names of the features
#print("number of features: ", len(data.feature_names)) #how many fetaures?
print("actual feature dimensions (n, m):", data.data.shape) #actual feature data
# Make a train/test split using 20% test size
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
test_size=0.20,
random_state=42)
print("training data: ", X_train.shape, y_train.shape)
print("testing data: ", X_test.shape, y_test.shape)
#model documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#set hyperparameters for the model
#try the following values for penaltyC to see a difference: 10000, 1000, 100, 10, 1, .1, .01, .001, .0001
penaltyC = 0.0001 #regularization parameter (higher yields less regularization)
penaltyType = 'l2' #regularization function type
logreg = LogisticRegression(C=penaltyC, penalty=penaltyType, random_state=42,\
solver="liblinear", multi_class="auto") #instantiate a class
#learn the betas (i.e., parameters or coefficients)
logreg.fit(X_train, y_train)
print("betas for 0vAll classifier: \n", logreg.coef_[0])
##one-vs-rest learns one binary model per class (k models; 10 for digits):
#0: class_0 or not
#1: class_1 or not
#2: class_2 or not
#... and so on for the remaining classes
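#added sanity check (not in the original): with one-vs-rest, coef_ holds one row
#of betas per class, so for the digits data we expect a (10, 64) shape
print("coef_ shape (n_classes, n_features):", logreg.coef_.shape)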
#produce class predictions:
y_test_pred = logreg.predict(X_test) #run the model on the test set
#produce probabilities
y_test_probs = logreg.predict_proba(X_test) #run the model and get the prediction probabilities
#print accuracy:
print('\nModel Accuracy: {:.2%}\n'.format(metrics.accuracy_score(y_test, y_test_pred)))
#print the probabilities, the predicted class, and the true class:
np.set_printoptions(precision=6, suppress=True) #decrease decimals and scientific notation
print("",["0vAll", "1vAll", "2vAll", "...", "kvAll", "pred", "true"])
print(np.c_[y_test_probs, y_test_pred, y_test])
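#added illustration (an addition, not from the original): predict() should agree
#with taking the highest-probability column of predict_proba for each row
print("predictions match argmax of probabilities:",
      np.array_equal(y_test_pred, logreg.classes_[np.argmax(y_test_probs, axis=1)]))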
#let's explore what happens to the betas
Cs = [100000, 10000, 1000, 100, 10, 1, .1, .01, .001, .0001]
penaltyTypes = ['l1', 'l2']
from numpy import log
#plot the beta values for the first 15 features, and the accuracies
# while trying different C values:
for penaltyType in penaltyTypes:
    model0coefs = [] #will hold all the coefficients
    modelAccs = []
    numCoefs = 15
    for penaltyC in Cs:
        logreg = LogisticRegression(C=penaltyC, penalty=penaltyType, random_state=42,\
                                    solver="liblinear", multi_class="auto")
        logreg.fit(X_train, y_train)
        model0coefs.append(logreg.coef_[0][:numCoefs])
        modelAccs.append(100*metrics.accuracy_score(y_test, logreg.predict(X_test)))
    ##PLOT the coefficients (betas)
    plt.plot(Cs, model0coefs, linewidth=2, alpha=0.75)
    plt.xscale('log')
    plt.xticks(Cs)
    plt.title("Coefficient values when increasing C for "+penaltyType+" regularization")
    plt.show()
    ##PLOT the accuracy
    plt.plot(Cs, modelAccs, linewidth=3)
    plt.xscale('log')
    plt.xticks(Cs)
    plt.ylim([max(np.min(modelAccs), 80), 100])
    plt.title("Accuracy % for Cs of "+penaltyType+" regularization")
    plt.show()
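#optional follow-up sketch (an addition; C=1 here is an assumed, not tuned, value):
#beyond accuracy, sklearn.metrics can summarize per-class performance
logreg_demo = LogisticRegression(C=1, penalty='l2', random_state=42,
                                 solver="liblinear", multi_class="auto")
logreg_demo.fit(X_train, y_train)
y_pred_demo = logreg_demo.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred_demo))       #rows: true class, cols: predicted class
print(metrics.classification_report(y_test, y_pred_demo))  #per-class precision/recall/F1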
print("NLP is for me.")