#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

Digits Classifier

Goal:

  • Digit classification.
    Data: sklearn.datasets.load_digits

  • Using PCA and GridSearchCV for hyperparameter tuning.

Data Exploration

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.datasets import load_digits
In [2]:
digits = load_digits()
digits.images.shape
Out[2]:
(1797, 8, 8)

First 30 digits (3 rows of 10)

In [3]:
def plot_image(data):
    # 3 x 10 grid of the first 30 digit images, labeled with their target values
    fig, axes = plt.subplots(3, 10, figsize=(8, 8),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))

    for i, ax in enumerate(axes.flat):
        ax.imshow(data.images[i], cmap='binary', interpolation='nearest')
        ax.text(0.05, 0.05, str(data.target[i]),
                transform=ax.transAxes, color='green')
    plt.show()
plot_image(digits)
In [4]:
# Features and Target
X = digits.data
y = digits.target
X.shape
Out[4]:
(1797, 64)

Model Selection

In [5]:
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
In [6]:
models = []
models.append(('LR',LogisticRegression(solver = 'newton-cg')))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))

Using KFold and cross-validation:

In [7]:
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=10, random_state=17, shuffle=True)
In [8]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[8]:
  Model  Accuracy
0    LR  0.963833
1    NB  0.838097
2   DTC  0.852585
3   RFC  0.973873
4   GBC  0.972191
5   SVC  0.987213
6   KNN  0.986660
In [9]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

Best model: SVC -> 0.987213

Feature Engineering

PCA

In [10]:
from sklearn.decomposition import PCA
In [11]:
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X)

#Plotting the Cumulative Summation of the Explained Variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()

45 components explain more than 99% of the variance (information).
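
As a quick cross-check (a minimal sketch, not part of the original run), PCA also accepts a float n_components between 0 and 1 and keeps just enough components to reach that fraction of explained variance, which confirms the read-off from the curve:

pca_check = PCA(n_components=0.99).fit(X)
pca_check.n_components_   # number of components needed to retain 99% of the variance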

In [12]:
pca = PCA(n_components=45).fit(X)
X_Trnsf = pca.transform(X)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[12]:
0.9957719632305035
In [13]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[13]:
  Model  Accuracy
0    LR  0.966061
1    NB  0.946608
2   DTC  0.847514
3   RFC  0.970525
4   GBC  0.945456
5   SVC  0.992219
6   KNN  0.986660
In [14]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

Standardized PCA

In [15]:
from sklearn.preprocessing import StandardScaler
In [16]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
In [17]:
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X_scaled)

#Plotting the Cumulative Summation of the Explained Variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()
In [18]:
pca = PCA(n_components=60).fit(X_scaled)
X_scaled_Trnsf = pca.transform(X_scaled)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[18]:
0.999174649055182
In [19]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_scaled_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[19]:
  Model  Accuracy
0    LR  0.971633
1    NB  0.907095
2   DTC  0.829165
3   RFC  0.961052
4   GBC  0.938234
5   SVC  0.982765
6   KNN  0.976629
In [20]:
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

The non-standardized version performs better, likely because the 8x8 pixel intensities already share the same 0-16 scale, so standardization mainly inflates the weight of near-constant border pixels.

SVC Fine-Tuning

In [21]:
from sklearn.model_selection import GridSearchCV
In [22]:
# parameters
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
In [23]:
gsearch = GridSearchCV(SVC(), param_grid=param_grid, cv=kfold, scoring='accuracy')
gsearch.fit(X_Trnsf, y)
print(gsearch.best_params_)
gsearch.best_estimator_
{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Out[23]:
SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
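
The full grid results can also be inspected to see how close the runner-up settings are (a small sketch, assuming the gsearch object fitted above):

cv_results = pd.DataFrame(gsearch.cv_results_)
cols = ['param_kernel', 'param_C', 'param_gamma', 'mean_test_score', 'std_test_score']
cv_results[cols].sort_values('mean_test_score', ascending=False).head()
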
In [24]:
svc_final = SVC(C=1, gamma=0.001, kernel='rbf')
final_score = cross_val_score(svc_final, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
print("Final accuracy : {} ".format(final_score))
Final accuracy : 0.9927746741154563 
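
As a final sanity check (a minimal sketch, not from the original notebook), the tuned SVC can also be evaluated on a held-out split of the PCA-transformed features to see where the remaining per-digit errors fall; train_test_split, confusion_matrix, and classification_report are standard scikit-learn utilities:

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X_Trnsf, y, test_size=0.2, random_state=17, stratify=y)
svc_final.fit(X_train, y_train)
y_pred = svc_final.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))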

Contact Me

www.linkedin.com/in/billygustave

billygustave.com