#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal
:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Read the space-delimited letter data set (the file's header row was
# hand-edited beforehand to remove an extra space).
df = pd.read_csv('letterCG.bin', delimiter=' ')

# Quick look at the data: dimensions, first rows, summary statistics.
df.shape
df.head()
df.describe()

# Bar chart of the number of rows in each target class.
fig, ax = plt.subplots(figsize=(8, 7))
class_counts = df.groupby('Class')['Class'].count()
class_counts.plot.bar(title='Class distribution')

# Column dtypes and non-null counts.
df.info()
Missing values
# Share of missing entries per column, expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)
No missing values
Mapping Target and Split
from sklearn.model_selection import train_test_split

# Encode the target letters as integers: C -> 0, G -> 1.
class_codes = {'C': 0, 'G': 1}
df['Class'] = df['Class'].map(class_codes)

# Separate the target from the predictors, then hold out a test set using
# train_test_split's default proportions and a fixed seed.
y = df['Class']
X = df.drop(columns='Class')
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
Visualization of Outliers and Distribution
# One distribution plot per feature, laid out on a 4x4 grid.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(16, 14))

# The seaborn theme is loop-invariant, so apply it once instead of once per
# feature. It is applied after the grid is created, as before.
sns.set()
for i, column in enumerate(X.columns):
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept so the notebook still runs on the seaborn version it was written for.
    sns.distplot(X[column], ax=axes[i // 4, i % 4])

# Box plots of every feature side by side to expose outliers; the target
# axes are passed explicitly rather than relying on the current axes.
fig, ax = plt.subplots(figsize=(16, 7))
X.boxplot(ax=ax)
Note: Because AdaBoost tries to fit every point perfectly, it is sensitive to noise and highly affected by outliers.
Unique values
from sklearn.feature_selection import VarianceThreshold

# Fit a zero-variance filter on the training predictors; get_support() then
# flags the columns the filter keeps (those that are not constant).
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
is_varying = constant_filter.get_support()

# The constant columns are the ones the filter did not keep.
columns_to_remove = x_train.columns[~is_varying].tolist()
print('Unique features: ', columns_to_remove)
No unique values
Highly correlated values
Threshold: 75%
# Pairwise correlation matrix of the training predictors.
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()

# Absolute correlation of every predictor with the target, using the
# training labels explicitly.
# NOTE(review): this list is not consulted by the filter below.
absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y_train.corr(x_train[var])))

# Threshold setting: predictors whose absolute correlation exceeds this are
# treated as redundant.
corrTol = 0.75

# Visit each column of the starting matrix. Iterate over a snapshot of the
# labels because columns are popped from the matrix inside the loop.
for col in list(corrMatrix.columns):
    if col not in corrMatrix.keys():
        # Already removed as redundant with an earlier column.
        continue

    thisCol = []
    thisVars = []
    temp = corrMatrix[col]
    for i, other in enumerate(corrMatrix.keys()):
        # Positional access via .iloc: integer keys on a label-indexed
        # Series are a deprecated fallback in recent pandas.
        corrAbs = abs(temp.iloc[i])
        if corrAbs == 1.0 and col != other:
            # Perfectly correlated with a *different* column: score 0 so it is dropped.
            thisCorr = 0
        else:
            # Positive score when above the threshold, negative otherwise.
            # `col` itself scores +1.0 and is therefore the maximum.
            thisCorr = (1 if corrAbs > corrTol else -1) * abs(temp[other])
        thisCol.append(thisCorr)
        thisVars.append(other)

    best = max(thisCol)  # hoisted: constant for the whole pass below
    mask = np.ones(len(thisCol), dtype=bool)  # rows to keep
    for n, j in enumerate(thisCol):
        # Drop a variable when its score is non-negative (highly or
        # perfectly correlated with `col`) and it is not the best-scoring entry.
        drop = j != best and j >= 0
        mask[n] = not drop
        if drop:
            # Delete the column from the corr matrix
            corrMatrix.pop(thisVars[n])
    # Delete the corresponding row(s) so the matrix stays square.
    corrMatrix = corrMatrix[mask]

columns_to_keep = corrMatrix.columns
print(columns_to_keep)
len(columns_to_keep)
x_train_clean = x_train[columns_to_keep]

# Heatmap of the correlations among the surviving features.
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(x_train_clean.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
Keeping the 12 features whose pairwise correlations are below the 75% threshold
Feature importance
from sklearn.ensemble import AdaBoostClassifier

# Fit a default AdaBoost model on the de-correlated training features and
# read back the importance it assigns to each of them.
abc = AdaBoostClassifier()
abc.fit(x_train_clean, y_train)
features = x_train_clean.columns
importances = abc.feature_importances_
indices = np.argsort(importances)  # ascending order of importance

# Horizontal bar chart with one bar per feature, in ascending order.
fig, ax = plt.subplots(figsize=(16, 14))
bar_positions = range(len(indices))
ax.set_title('Feature Importances')
ax.barh(bar_positions, importances[indices], color='b', align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()
Recursive Feature Selection using RFECV with AdaBoost model
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFECV

# Shuffled, stratified 10-fold splitter with a fixed seed.
kfold = StratifiedKFold(n_splits=10, random_state=17, shuffle=True)

# Recursive feature elimination scored by cross-validated accuracy, with a
# default AdaBoost model as the estimator.
model = AdaBoostClassifier()
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train_clean, y_train)

# Mean CV score for each candidate number of features. Current scikit-learn
# exposes it through cv_results_; grid_scores_ is the older attribute and is
# used only when cv_results_ is unavailable.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

# Single figure (no stray empty one) showing score against feature count.
fig, ax = plt.subplots(figsize=(16, 14))
plt.title('AdaBoost CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.grid()
plt.show()

# Pair every candidate feature with its RFECV selection flag, then keep the
# names of the selected ones.
feature_importance = list(zip(x_train_clean.columns, rfecv.support_))
new_features = [name for name, selected in feature_importance if selected]
print(new_features)

# Restrict both the training and the test predictors to the selected features.
x_train_best = x_train_clean[new_features]
x_test_best = x_test[new_features]
Choosing base_estimator model
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
# Candidate weak learners to plug into AdaBoost, each with a short label.
models = [
    ('LR', LogisticRegression()),
    ('NB', GaussianNB()),
    ('DTC', DecisionTreeClassifier()),
    ('RFC', RandomForestClassifier()),
    ('ETC', ExtraTreesClassifier()),
]

# Mean cross-validated accuracy of AdaBoost wrapped around each candidate,
# evaluated in the order the candidates are listed.
names = [label for label, _ in models]
scores = [
    cross_val_score(AdaBoostClassifier(base_estimator=candidate), x_train_best, y_train,
                    cv=kfold, scoring='accuracy').mean()
    for _, candidate in models
]
results = pd.DataFrame({'Model': names, 'Accuracy': scores})
results

# Bar chart of the accuracies, each bar annotated with its value.
fig, ax = plt.subplots(figsize=(16, 14))
axis = sns.barplot(x='Model', y='Accuracy', data=results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for bar in axis.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    axis.text(label_x, bar_height + 0.005, '{:1.4f}'.format(bar_height), ha="center")
plt.show()
Though ExtraTreesClassifier is the winner, we will use DecisionTreeClassifier instead for performance (training-time) reasons.
Tuning steps are:
High learning rate + initial parameter settings
-> # of trees -> tree parameters tuning -> lower learning rate and up estimators for better results
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
def modelfit(alg, xtrain, ytrain, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit ``alg`` on the training data and print a short performance report.

    Parameters
    ----------
    alg : estimator providing ``fit``, ``predict`` and ``predict_proba``
        (and ``feature_importances_`` when ``printFeatureImportance`` is set).
    xtrain : training features; must expose ``.columns`` when
        ``printFeatureImportance`` is set.
    ytrain : training labels.
    performCV : when true, also run ``cross_val_score`` with accuracy scoring
        and report mean / std / min / max of the fold scores.
    printFeatureImportance : when true, draw a bar chart of the fitted
        model's feature importances, largest first.
    cv_folds : forwarded to ``cross_val_score`` as its ``cv`` argument.
    """
    alg.fit(xtrain, ytrain)

    # In-sample predictions: hard labels plus the second probability column.
    train_labels = alg.predict(xtrain)
    train_probs = alg.predict_proba(xtrain)[:, 1]

    # Optional cross-validation, run before anything is printed.
    fold_scores = None
    if performCV:
        fold_scores = cross_val_score(alg, xtrain, ytrain, cv=cv_folds, scoring='accuracy')

    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(ytrain, train_labels))
    print("AUC Score (Train): %f" % roc_auc_score(ytrain, train_probs))

    if fold_scores is not None:
        fold_summary = (np.mean(fold_scores), np.std(fold_scores),
                        np.min(fold_scores), np.max(fold_scores))
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" %
              fold_summary)

    if not printFeatureImportance:
        return

    # Importances indexed by column name, largest first.
    ranked = pd.Series(alg.feature_importances_, xtrain.columns).sort_values(ascending=False)
    ranked.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
# Baseline: seeded AdaBoost with otherwise default settings, reported on the
# selected training features.
abc = AdaBoostClassifier(random_state=7)
modelfit(alg=abc, xtrain=x_train_best, ytrain=y_train)
High learning rate + initial parameter settings
Getting best estimator for learning_rate 0.1
from sklearn.model_selection import GridSearchCV

# Starting hyper-parameters for the tuning run.
learning_rate = 0.1
n_estimators = 150
random_state = 7
max_depth = 8
max_features = 'sqrt'
min_samples_leaf = 50
min_samples_split = 300

# Weak learner: a decision tree built from the starting values above.
base_estimator = DecisionTreeClassifier(
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    random_state=random_state,
)

# Grid-search the number of boosting rounds (1..500) at the fixed learning
# rate, scored by 5-fold accuracy.
param = {'n_estimators': range(1, 501)}
booster = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                             random_state=random_state)
grdsrch = GridSearchCV(estimator=booster, param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best, y_train)

# Store the winning round count, then display the best setting and score.
n_estimators = grdsrch.best_params_['n_estimators']
grdsrch.best_params_, grdsrch.best_score_
Tree parameters tuning
max_depth & min_samples_split
Note: Tuning parameters separately is not the best approach for grid search.
# Jointly grid-search the tree depth and the minimum number of samples
# required to split a node. The depth range starts at 1 because a depth of 0
# is not a valid DecisionTreeClassifier setting, so those candidates could
# never be fitted.
param = {'base_estimator__max_depth': range(1, 16),
         'base_estimator__min_samples_split': range(2, 101)}

# The two searched parameters are left unset here; the grid supplies them.
base_estimator = DecisionTreeClassifier(max_features=max_features, min_samples_leaf=min_samples_leaf,
                                        random_state=random_state)
grdsrch = GridSearchCV(estimator=AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                                                    n_estimators=n_estimators, random_state=random_state),
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best, y_train)

# Store the winning values, then display the best setting and score.
max_depth = grdsrch.best_params_['base_estimator__max_depth']
min_samples_split = grdsrch.best_params_['base_estimator__min_samples_split']
grdsrch.best_params_, grdsrch.best_score_
min_samples_split & min_samples_leaf
# Grid-search the minimum number of samples allowed in a leaf (1..100),
# using the current max_depth and min_samples_split values for the tree.
param = {'base_estimator__min_samples_leaf': range(1, 101)}
base_estimator = DecisionTreeClassifier(
    max_depth=max_depth,
    max_features=max_features,
    min_samples_split=min_samples_split,
    random_state=random_state,
)
booster = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                             n_estimators=n_estimators, random_state=random_state)
grdsrch = GridSearchCV(estimator=booster, param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best, y_train)

# Store the winning leaf size, then display the best setting and score.
min_samples_leaf = grdsrch.best_params_['base_estimator__min_samples_leaf']
grdsrch.best_params_, grdsrch.best_score_
Lowering the learning rate and finding the best n_estimators
Lower the rate further if you have a powerful system
learning_rate: 0.05
# Lower the learning rate and re-search the number of boosting rounds
# (1..1000) with the current tree settings.
learning_rate = 0.05
param = {'n_estimators': range(1, 1001)}
base_estimator = DecisionTreeClassifier(
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    random_state=random_state,
)
booster = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                             random_state=random_state)
grdsrch = GridSearchCV(estimator=booster, param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best, y_train)

# Store the winning round count, then display the best setting and score.
n_estimators = grdsrch.best_params_['n_estimators']
grdsrch.best_params_, grdsrch.best_score_
Note: Dropping the learning rate further generally gives more robust results.
A learning_rate of 0.001 might give better results.
# Assemble the final model from the current hyper-parameter values: a
# decision-tree weak learner wrapped in AdaBoost.
base_estimator = DecisionTreeClassifier(
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    random_state=random_state,
)
abc = AdaBoostClassifier(
    base_estimator=base_estimator,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    random_state=random_state,
)
Train data accuracy
# Mean cross-validated accuracy of the final model on the training data,
# using the kfold splitter.
cv_accuracies = cross_val_score(abc, x_train_best, y_train, cv=kfold, scoring='accuracy')
final_score = cv_accuracies.mean()
print("Final train accuracy : {} ".format(final_score))
Test data accuracy
# Refit the final model on the training set and measure its accuracy on the
# held-out test rows.
y_pred = abc.fit(x_train_best, y_train).predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
Improvements
: