#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal:
Classify handwritten digit images (0–9).
Data: sklearn.datasets.load_digits
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape
Display the first 30 digit images (3 rows of 10) with their labels:
def plot_image(data):
    fig, axes = plt.subplots(3, 10, figsize=(8, 8),
                             subplot_kw={'xticks': [], 'yticks': []},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
    for i, ax in enumerate(axes.flat):
        ax.imshow(data.images[i], cmap='binary', interpolation='nearest')
        ax.text(0.05, 0.05, str(data.target[i]),  # label from the passed-in dataset, not the global digits
                transform=ax.transAxes, color='green')
    plt.show()
plot_image(digits)
# Features and Target
X = digits.data
y = digits.target
X.shape
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
models = []
models.append(('LR', LogisticRegression(solver='newton-cg', max_iter=1000)))  # raised max_iter so newton-cg converges without warnings
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))
Using KFold cross-validation:
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=10, random_state=17, shuffle=True)
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names, 'Accuracy': scores})
results
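The table can also be sorted to rank the models at a glance (a small optional sketch):
results.sort_values('Accuracy', ascending=False)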
# graph of performance
axis = sns.barplot(x='Model', y='Accuracy', data=results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
Best model: SVC, with accuracy 0.987213.
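To see which digits the best model confuses, out-of-fold predictions can be checked with a confusion matrix (a sketch using cross_val_predict; not part of the original notebook):
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# out-of-fold predictions for the winning SVC
y_pred = cross_val_predict(SVC(), X, y, cv=kfold)
sns.heatmap(confusion_matrix(y, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()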
PCA
from sklearn.decomposition import PCA
# Fit PCA on the raw features
pca = PCA().fit(X)
# Plot the cumulative sum of the explained variance ratio
fig, ax = plt.subplots(figsize=(16, 14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()
45 components explain more than 99% of the variance (information).
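That count can be verified programmatically from the full PCA fitted above (a small sketch):
# smallest number of components reaching 99% cumulative explained variance
n_components_99 = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.99) + 1
n_components_99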
pca = PCA(n_components=45).fit(X)
X_Trnsf = pca.transform(X)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
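As an optional sanity check (a sketch, assuming the variables above), inverse_transform shows how much image detail the 45 components retain:
X_reconstructed = pca.inverse_transform(X_Trnsf)
fig, axes = plt.subplots(1, 2, subplot_kw={'xticks': [], 'yticks': []})
axes[0].imshow(X[0].reshape(8, 8), cmap='binary')               # original digit
axes[1].imshow(X_reconstructed[0].reshape(8, 8), cmap='binary')  # PCA reconstruction
plt.show()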
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names, 'Accuracy': scores})
results
# graph of performance
axis = sns.barplot(x='Model', y='Accuracy', data=results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
Standardized PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# Fit PCA on the standardized features
pca = PCA().fit(X_scaled)
# Plot the cumulative sum of the explained variance ratio
fig, ax = plt.subplots(figsize=(16, 14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Digits Dataset Explained Variance (Standardized)')
plt.xlim([1, X.shape[1]])
plt.grid()
plt.show()
After standardization the variance is spread more evenly across pixels, so more components (here 60) are needed to retain a comparable share of it.
pca = PCA(n_components=60).fit(X_scaled)
X_scaled_Trnsf = pca.transform(X_scaled)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_scaled_Trnsf, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names, 'Accuracy': scores})
results
# graph of performance
axis = sns.barplot(x='Model', y='Accuracy', data=results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
The non-standardized version performs better: the pixels already share a common 0–16 scale, so standardizing mostly inflates noise in low-variance border pixels.
Hyperparameter tuning with GridSearchCV:
from sklearn.model_selection import GridSearchCV
# parameter grid for the SVC search
param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
gsearch = GridSearchCV(SVC(), param_grid=param_grid, cv=kfold, scoring='accuracy')
gsearch.fit(X_Trnsf, y)
print(gsearch.best_params_)
gsearch.best_estimator_
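Beyond the single best estimator, the whole grid can be ranked (a sketch reading gsearch.cv_results_):
cv_results = pd.DataFrame(gsearch.cv_results_)
# top parameter combinations by mean cross-validated accuracy
cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False).head()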
svc_final = SVC(C=1, gamma=0.001, kernel='rbf')
final_score = cross_val_score(svc_final, X_Trnsf, y, cv=kfold, scoring='accuracy').mean()
print("Final accuracy : {} ".format(final_score))