#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

Digits Classifier

Goal:

  • Digit classification.
    Data: sklearn.datasets.load_digits

  • Using PCA and GridSearchCV for hyperparameter tuning.

Data Exploration

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.datasets import load_digits
In [2]:
digits = load_digits()
digits.images.shape
Out[2]:
(1797, 8, 8)

First 30 digits (3 rows of 10)

In [3]:
def plot_image(data):
    # 3 x 10 grid of the first 30 digit images, labeled with their target values
    fig, axes = plt.subplots(3, 10, figsize=(8, 8),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))

    for i, ax in enumerate(axes.flat):
        ax.imshow(data.images[i], cmap='binary', interpolation='nearest')
        ax.text(0.05, 0.05, str(data.target[i]),
                transform=ax.transAxes, color='green')
    plt.show()
plot_image(digits)
In [4]:
# Features and Target
X = digits.data
y = digits.target
X.shape
Out[4]:
(1797, 64)

Model Selection

In [5]:
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
In [6]:
models = []
models.append(('LR',LogisticRegression(solver = 'newton-cg')))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))

Using KFold and cross-validation:

In [7]:
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=10, random_state=17, shuffle=True)
In [8]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[8]:
  Model  Accuracy
0    LR  0.963833
1    NB  0.838097
2   DTC  0.852585
3   RFC  0.973873
4   GBC  0.972191
5   SVC  0.987213
6   KNN  0.986660
In [9]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

Best model: SVC -> 0.987213

Feature Engineering

PCA

In [10]:
from sklearn.decomposition import PCA
In [11]:
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X)

#Plotting the Cumulative Summation of the Explained Variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()

45 components explain more than 99% of the variance (information).
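
As a quick cross-check (a minimal sketch, not part of the original run), PCA also accepts a float n_components between 0 and 1 and keeps just enough components to reach that fraction of explained variance, which confirms the read-off from the curve:

pca_check = PCA(n_components=0.99).fit(X)
pca_check.n_components_   # number of components needed to retain 99% of the variance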

In [12]:
pca = PCA(n_components=45).fit(X)
X_Trnsf = pca.transform(X)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[12]:
0.9957719632305035
In [13]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[13]:
  Model  Accuracy
0    LR  0.966061
1    NB  0.946608
2   DTC  0.847514
3   RFC  0.970525
4   GBC  0.945456
5   SVC  0.992219
6   KNN  0.986660
In [14]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

Standardized PCA

In [15]:
from sklearn.preprocessing import StandardScaler
In [16]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
In [17]:
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X_scaled)

#Plotting the Cumulative Summation of the Explained Variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()
In [18]:
pca = PCA(n_components=60).fit(X_scaled)
X_scaled_Trnsf = pca.transform(X_scaled)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[18]:
0.999174649055182
In [19]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_scaled_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[19]:
  Model  Accuracy
0    LR  0.971633
1    NB  0.907095
2   DTC  0.829165
3   RFC  0.961052
4   GBC  0.938234
5   SVC  0.982765
6   KNN  0.976629
In [20]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

The non-standardized version performs better, likely because the 8x8 pixel intensities already share the same 0-16 scale, so standardization mainly inflates the weight of near-constant border pixels.

SVC Fine-Tuning

In [21]:
from sklearn.model_selection import GridSearchCV
In [22]:
# parameters
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
In [23]:
gsearch = GridSearchCV(SVC(), param_grid=param_grid, cv=kfold, scoring='accuracy')
gsearch.fit(X_Trnsf, y)
print(gsearch.best_params_)
gsearch.best_estimator_
{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Out[23]:
SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
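
The full grid results can also be inspected to see how close the runner-up settings are (a small sketch, assuming the gsearch object fitted above):

cv_results = pd.DataFrame(gsearch.cv_results_)
cols = ['param_kernel', 'param_C', 'param_gamma', 'mean_test_score', 'std_test_score']
cv_results[cols].sort_values('mean_test_score', ascending=False).head()
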
In [24]:
svc_final = SVC(C=1, gamma=0.001, kernel='rbf')
final_score = cross_val_score(svc_final, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
print("Final accuracy : {} ".format(final_score))
Final accuracy : 0.9927746741154563 
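
As a final sanity check (a minimal sketch, not from the original notebook), the tuned SVC can also be evaluated on a held-out split of the PCA-transformed features to see where the remaining per-digit errors fall; train_test_split, confusion_matrix, and classification_report are standard scikit-learn utilities:

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X_Trnsf, y, test_size=0.2, random_state=17, stratify=y)
svc_final.fit(X_train, y_train)
y_pred = svc_final.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))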

Contact Me

www.linkedin.com/in/billygustave

billygustave.com