#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

Letter C G Classifier

Goal :

  • Classify each observation as the letter C or G
  • Data: letterCG.bin
  • AdaBoostClassifier

Data Cleaning and Exploration

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
In [2]:
# load data (manually removed extra space from file's header)
df = pd.read_csv('letterCG.bin', sep=' ')
df.shape
Out[2]:
(1509, 17)
In [3]:
df.head()
Out[3]:
Class x-box y-box width high onpix x-bar y-bar x2bar y2bar xybar x2ybr xy2br x-ege xegvy y-ege yegvx
0 C 4 10 5 8 2 5 7 7 10 7 6 13 1 8 4 9
1 C 3 5 4 4 2 4 8 5 7 12 9 11 1 10 2 7
2 G 5 10 6 7 3 8 7 8 8 6 7 9 2 7 5 10
3 C 5 9 6 7 4 3 8 6 7 12 10 13 1 9 3 7
4 G 4 8 5 6 3 6 6 7 6 10 7 12 2 9 4 9
In [4]:
df.describe()
Out[4]:
x-box y-box width high onpix x-bar y-bar x2bar y2bar xybar x2ybr xy2br x-ege xegvy y-ege yegvx
count 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000 1509.000000
mean 4.059642 7.025182 4.848244 5.288933 3.180915 6.169649 7.094102 5.947647 6.235255 8.084824 6.809808 10.737575 2.422797 8.616965 4.626243 8.893970
std 1.744379 3.327707 1.612338 2.156974 1.934883 1.296437 1.138498 1.268069 2.071898 2.059967 1.443088 2.022762 1.768471 1.081741 1.578873 1.397002
min 0.000000 0.000000 1.000000 0.000000 0.000000 2.000000 3.000000 2.000000 1.000000 4.000000 2.000000 4.000000 0.000000 5.000000 2.000000 4.000000
25% 3.000000 5.000000 4.000000 4.000000 2.000000 5.000000 6.000000 5.000000 5.000000 6.000000 6.000000 9.000000 1.000000 8.000000 4.000000 8.000000
50% 4.000000 7.000000 5.000000 6.000000 3.000000 6.000000 7.000000 6.000000 6.000000 8.000000 7.000000 11.000000 2.000000 9.000000 4.000000 9.000000
75% 5.000000 9.000000 6.000000 7.000000 4.000000 7.000000 8.000000 7.000000 8.000000 9.000000 8.000000 12.000000 3.000000 9.000000 5.000000 10.000000
max 10.000000 15.000000 10.000000 9.000000 12.000000 11.000000 11.000000 10.000000 12.000000 13.000000 11.000000 15.000000 11.000000 13.000000 10.000000 14.000000
In [5]:
fig, ax = plt.subplots(figsize=(8,7))
df.groupby('Class').Class.count().plot(kind='bar', title='Class distribution')
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x264d7ce7688>
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Class   1509 non-null   object
 1   x-box   1509 non-null   int64 
 2   y-box   1509 non-null   int64 
 3   width   1509 non-null   int64 
 4   high    1509 non-null   int64 
 5   onpix   1509 non-null   int64 
 6   x-bar   1509 non-null   int64 
 7   y-bar   1509 non-null   int64 
 8   x2bar   1509 non-null   int64 
 9   y2bar   1509 non-null   int64 
 10  xybar   1509 non-null   int64 
 11  x2ybr   1509 non-null   int64 
 12  xy2br   1509 non-null   int64 
 13  x-ege   1509 non-null   int64 
 14  xegvy   1509 non-null   int64 
 15  y-ege   1509 non-null   int64 
 16  yegvx   1509 non-null   int64 
dtypes: int64(16), object(1)
memory usage: 200.5+ KB

Missing values

In [7]:
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
Out[7]:
Class    0.0
x-box    0.0
y-box    0.0
width    0.0
high     0.0
onpix    0.0
x-bar    0.0
y-bar    0.0
x2bar    0.0
y2bar    0.0
xybar    0.0
x2ybr    0.0
xy2br    0.0
x-ege    0.0
xegvy    0.0
y-ege    0.0
yegvx    0.0
dtype: float64

No missing values

Mapping Target and Split

In [8]:
# mapping Class values to numerical: C = 0, G = 1
df['Class'] = df.Class.map({'C':0,'G':1})
In [9]:
# train test split
from sklearn.model_selection import train_test_split
X = df.drop('Class', axis=1)
y = df.Class
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

Visualization of Outliers and Distribution

In [10]:
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(16,14))
for i, column in enumerate(X.columns):
    sns.set()
    sns.distplot(X[column],ax=axes[i//4,i%4])
In [11]:
fig, ax = plt.subplots(figsize=(16,7))
X.boxplot()
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x264d8f737c8>

Note :
Because AdaBoost tries to fit every point perfectly, it is sensitive to noise and strongly affected by outliers.
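
As a quick check on that concern, the share of points outside the usual 1.5×IQR whiskers can be counted per feature (a minimal sketch; the 1.5 cutoff is the standard boxplot convention, not part of the original analysis):

# count values outside the 1.5*IQR whiskers for each feature
q1, q3 = X.quantile(0.25), X.quantile(0.75)
iqr = q3 - q1
outlier_mask = (X < q1 - 1.5 * iqr) | (X > q3 + 1.5 * iqr)
print(outlier_mask.sum().sort_values(ascending=False))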

Constant features (zero variance)

In [12]:
from sklearn.feature_selection import VarianceThreshold
# zero-variance features (only a single unique value across all rows)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
kept = x_train.columns[constant_filter.get_support()]
columns_to_remove = [name for name in x_train.columns if name not in kept]
print('Constant features: ', columns_to_remove)
Constant features:  []

No constant features

Highly correlated features
Threshold: 75%

In [13]:
# Correlation matrix for all independent vars
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()

absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train[var])))
# threshold setting
corrTol = 0.75

# for each column in the corr matrix
for col in corrMatrix:
    
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        
        # Store the corr with the dep var for fields that are highly correlated with each other
        for i in range(len(corrMatrix)):
            
            if abs(corrMatrix[col][i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                thisCorr = (1 if abs(corrMatrix[col][i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        
        mask = np.ones(len(thisCol), dtype = bool) # Initialize the mask
        
        ctDelCol = 0 # To keep track of the number of columns deleted
        
        for n, j in enumerate(thisCol):
            # Delete if (a) a var is correlated with others and does not have the best corr with dep,
            # or (b) is completely correlated with 'col'
            mask[n] = not (j != max(thisCol) and j >= 0)
            
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' %thisVars[n])
                ctDelCol += 1
                
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
In [14]:
columns_to_keep = corrMatrix.columns
print(columns_to_keep)
len(columns_to_keep)
Index(['x-box', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr',
       'xy2br', 'xegvy', 'y-ege', 'yegvx'],
      dtype='object')
Out[14]:
12
In [15]:
x_train_clean = x_train[columns_to_keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train_clean.corr(), cmap='Reds',annot=True, linewidths=.5, ax=ax)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x264d8d4a5c8>

Keeping the 12 features below the 75% correlation threshold
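
For reference, the same threshold can be applied with the common upper-triangle recipe (a simpler sketch: it drops whichever feature of a correlated pair comes later in the column order, rather than keeping the one most correlated with the target as the loop above does):

# each pair appears once in the upper triangle of the absolute correlation matrix
corr = x_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > corrTol).any()]
print('Dropping:', to_drop)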

Feature importance

In [16]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(x_train_clean, y_train)
features = x_train_clean.columns
importances = abc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

Recursive Feature Elimination using RFECV with an AdaBoost model

In [17]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
kfold = StratifiedKFold(n_splits=10, random_state=17, shuffle=True)
In [18]:
from sklearn.feature_selection import RFECV
model = AdaBoostClassifier()
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train_clean, y_train)
Out[18]:
RFECV(cv=StratifiedKFold(n_splits=10, random_state=17, shuffle=True),
      estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                   learning_rate=1.0, n_estimators=50,
                                   random_state=None),
      min_features_to_select=1, n_jobs=None, scoring='accuracy', step=1,
      verbose=0)
In [19]:
fig, ax = plt.subplots(figsize=(16,14))
plt.title('AdaBoost CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.grid()
plt.show()
In [20]:
# keep only the features RFECV selected
new_features = [name for name, selected in zip(x_train_clean.columns, rfecv.support_) if selected]
        
print(new_features)
x_train_best = x_train_clean[new_features]
x_test_best = x_test[new_features]
['x-box', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'xegvy', 'y-ege', 'yegvx']

Model Selection

Choosing base_estimator model

In [21]:
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
In [22]:
models = []
models.append(('LR',LogisticRegression()))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('ETC',ExtraTreesClassifier()))
In [23]:
names = []
scores = []
for name, model in models:
    score = cross_val_score(AdaBoostClassifier(base_estimator=model), x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
Out[23]:
Model Accuracy
0 LR 0.883318
1 NB 0.748005
2 DTC 0.960216
3 RFC 0.978777
4 ETC 0.983194
In [24]:
fig, ax = plt.subplots(figsize=(16,14))
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()

Though ExtraTreesClassifier is the winner, we will use DecisionTreeClassifier instead for speed: boosting a whole forest at every round is far more expensive than boosting a single tree (see the rough timing sketch below).
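
A rough single-fit timing makes the gap concrete (a sketch; exact numbers depend on the machine):

import time
# time one AdaBoost fit per base estimator; ETC fits a whole forest per boosting round
for name, base in [('DTC', DecisionTreeClassifier()), ('ETC', ExtraTreesClassifier())]:
    start = time.perf_counter()
    AdaBoostClassifier(base_estimator=base).fit(x_train_best, y_train)
    print('{}: {:.2f}s'.format(name, time.perf_counter() - start))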

Model tuning

Tuning steps are:
High learning rate + initial parameter settings
-> number of trees -> tree parameter tuning -> lower the learning rate and raise n_estimators for better results

In [25]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
def modelfit(alg, xtrain, ytrain, performCV=True, printFeatureImportance=True, cv_folds=5):
    #Fit the algorithm on the data
    alg.fit(xtrain, ytrain)

    #Predict training set:
    ypred = alg.predict(xtrain)
    ypredprob = alg.predict_proba(xtrain)[:,1]

    #Perform cross-validation:
    if performCV:
        cv_score = cross_val_score(alg, xtrain, ytrain, cv=cv_folds, scoring='accuracy')

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(ytrain, ypred))
    print("AUC Score (Train): %f" % roc_auc_score(ytrain, ypredprob))

    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % 
              (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, xtrain.columns).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In [26]:
abc = AdaBoostClassifier(random_state=7)
modelfit(abc, x_train_best, y_train)
Model Report
Accuracy : 0.9558
AUC Score (Train): 0.993451
CV Score : Mean - 0.9416709 | Std - 0.02468439 | Min - 0.9118943 | Max - 0.9778761

High learning rate + initial parameter settings
Getting the best n_estimators for learning_rate = 0.1

In [27]:
# initializing parameters
learning_rate = 0.1
n_estimators = 150
random_state = 7
min_samples_split = 300
min_samples_leaf = 50
max_depth = 8
max_features = 'sqrt'
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features,
                                        min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split,
                                        random_state=random_state)
In [28]:
from sklearn.model_selection import GridSearchCV
param = {'n_estimators':range(1,501)}
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                                                      random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
Out[28]:
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=8,
                                                                                max_features='sqrt',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=50,
                                                                                min_samples_split=300,
                                                                                min_weight_fraction_leaf=0.0,
                                                                                presort='deprecated',
                                                                                random_state=7,
                                                                                splitter='best'),
                                          learning_rate=0.1, n_estimators=50,
                                          random_state=7),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_estimators': range(1, 501)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [29]:
n_estimators = grdsrch.best_params_['n_estimators']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
Out[29]:
({'n_estimators': 434}, 0.9823281743401816)

Tree parameter tuning
max_depth & min_samples_split

Note :
Tuning parameters separately is cheaper, but not the best approach for grid search: it misses interactions between parameters. A joint grid searches them together (see the sketch below).
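
A joint search would look like the following sketch; the coarser value grids are assumptions to keep the run time manageable:

# joint grid over the tree parameters (coarse grids keep the search tractable)
param_joint = {'base_estimator__max_depth': [3, 5, 8, 12],
               'base_estimator__min_samples_split': [2, 20, 100, 300],
               'base_estimator__min_samples_leaf': [1, 10, 50]}
base = DecisionTreeClassifier(max_features=max_features, random_state=random_state)
joint = GridSearchCV(AdaBoostClassifier(base_estimator=base, learning_rate=learning_rate,
                                        n_estimators=n_estimators, random_state=random_state),
                     param_grid=param_joint, scoring='accuracy', n_jobs=-1, cv=5)
joint.fit(x_train_best, y_train)
joint.best_params_, joint.best_score_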

In [30]:
param = {'base_estimator__max_depth':range(1,16),  # max_depth must be > 0
         'base_estimator__min_samples_split':range(2,101)}
base_estimator = DecisionTreeClassifier(max_features=max_features, min_samples_leaf=min_samples_leaf, random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, 
                                                      n_estimators=n_estimators,random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
Out[30]:
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=None,
                                                                                max_features='sqrt',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=50,
                                                                                min_samples_split=2,
                                                                                min_weight_fraction_leaf=0.0,
                                                                                presort='deprecated',
                                                                                random_state=7,
                                                                                splitter='best'),
                                          learning_rate=0.1, n_estimators=434,
                                          random_state=7),
             iid='deprecated', n_jobs=-1,
             param_grid={'base_estimator__max_depth': range(1, 16),
                         'base_estimator__min_samples_split': range(2, 101)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [31]:
max_depth = grdsrch.best_params_['base_estimator__max_depth']
min_samples_split = grdsrch.best_params_['base_estimator__min_samples_split']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
Out[31]:
({'base_estimator__max_depth': 5, 'base_estimator__min_samples_split': 2},
 0.9858602003820514)

min_samples_leaf (with min_samples_split fixed at its tuned value)

In [32]:
param = {'base_estimator__min_samples_leaf':range(1,101)}
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split,
                                        random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, 
                                                      n_estimators=n_estimators,random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
Out[32]:
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=5,
                                                                                max_features='sqrt',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=1,
                                                                                min_samples_split=2,
                                                                                min_weight_fraction_leaf=0.0,
                                                                                presort='deprecated',
                                                                                random_state=7,
                                                                                splitter='best'),
                                          learning_rate=0.1, n_estimators=434,
                                          random_state=7),
             iid='deprecated', n_jobs=-1,
             param_grid={'base_estimator__min_samples_leaf': range(1, 101)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [33]:
min_samples_leaf = grdsrch.best_params_['base_estimator__min_samples_leaf']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
Out[33]:
({'base_estimator__min_samples_leaf': 38}, 0.9858602003820514)

Lowering the learning rate and finding the best n_estimators
Lower the rate further if you have a powerful system.

learning_rate: 0.05

In [35]:
learning_rate = 0.05
param = {'n_estimators':range(1,1001)}
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf, 
                                        min_samples_split=min_samples_split, random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                                                      random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
Out[35]:
GridSearchCV(cv=5, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                class_weight=None,
                                                                                criterion='gini',
                                                                                max_depth=5,
                                                                                max_features='sqrt',
                                                                                max_leaf_nodes=None,
                                                                                min_impurity_decrease=0.0,
                                                                                min_impurity_split=None,
                                                                                min_samples_leaf=38,
                                                                                min_samples_split=2,
                                                                                min_weight_fraction_leaf=0.0,
                                                                                presort='deprecated',
                                                                                random_state=7,
                                                                                splitter='best'),
                                          learning_rate=0.05, n_estimators=50,
                                          random_state=7),
             iid='deprecated', n_jobs=-1,
             param_grid={'n_estimators': range(1, 1001)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)
In [36]:
n_estimators = grdsrch.best_params_['n_estimators']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
Out[36]:
({'n_estimators': 937}, 0.9832053331254142)

Note :
Lowering the learning rate further generally gives more robust results;
learning_rate = 0.001 might do better still (see the sketch below).
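
A sketch of that follow-up (untried here; as a rule of thumb n_estimators is scaled up roughly in proportion as the learning rate drops, so 5000 is an assumption):

# probe an even lower learning rate; more estimators compensate for the smaller steps
abc_slow = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=0.001,
                              n_estimators=5000, random_state=random_state)
cross_val_score(abc_slow, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()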

Predicting

In [37]:
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf, 
                                        min_samples_split=min_samples_split, random_state=random_state)
abc = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, n_estimators=n_estimators, 
                         random_state=random_state)

Train data accuracy

In [38]:
final_score = cross_val_score(abc, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
print("Final train accuracy : {} ".format(final_score))
Final train accuracy : 0.9770066759819903 

Test data accuracy

In [40]:
abc.fit(x_train_best, y_train)
y_pred = abc.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
Final test accuracy : 0.9814814814814815 

Improvements :

  • Lower learning_rate
  • A wider n_estimators search
  • Better grid search parameters, e.g. a joint randomized search (see the sketch below)
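
For the last point, a randomized search over all the knobs at once is often a better use of the same compute budget than one-parameter-at-a-time grids (a sketch; the distributions and n_iter are assumptions):

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': randint(100, 1000),
              'learning_rate': uniform(0.01, 0.2),
              'base_estimator__max_depth': randint(1, 11),
              'base_estimator__min_samples_leaf': randint(1, 60)}
rnd = RandomizedSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=7),
                                            random_state=7),
                         param_distributions=param_dist, n_iter=50,
                         scoring='accuracy', n_jobs=-1, cv=5, random_state=7)
rnd.fit(x_train_best, y_train)
rnd.best_params_, rnd.best_score_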

Contact Me

www.linkedin.com/in/billygustave

billygustave.com