#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
      
Goal: build an AdaBoost classifier that distinguishes the letters C and G, with feature selection and hyperparameter tuning along the way.
      
     
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
      # load data (manually removed extra space from file's header)
df = pd.read_csv('letterCG.bin', sep=' ')
df.shape
      df.head()
      df.describe()
      fig, ax = plt.subplots(figsize=(8,7))
df.groupby('Class').Class.count().plot(kind='bar', title='Class distribution')
      df.info()
      Missing values
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
      No missing values
Mapping Target and Split
# mapping Class values to numerical: C = 0, G = 1
df['Class'] = df.Class.map({'C':0,'G':1})
      # train test split
from sklearn.model_selection import train_test_split
X = df.drop('Class', axis=1)
y = df.Class
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
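Note: since the class distribution was just inspected, an optional variant (not what the original uses) is to stratify the split so train and test keep the same C/G proportions:
# optional: stratified split preserves the class ratio in both sets
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)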
      Visualization of Outliers and Distribution
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(16,14))
for i, column in enumerate(X.columns):
    sns.set()
    # distplot is removed in recent seaborn; histplot with a KDE overlay is the modern equivalent
    sns.histplot(X[column], kde=True, ax=axes[i//4, i%4])
      fig, ax = plt.subplots(figsize=(16,7))
X.boxplot()
      
      
       
Note: Because AdaBoost tries to fit every point perfectly, it is sensitive to noise and highly affected by outliers.
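Given that sensitivity, it helps to quantify how many outliers each feature has before boosting. A minimal sketch using the common 1.5 * IQR rule (the rule choice is an assumption, not from the original):
# count IQR-rule outliers per feature (1.5 * IQR fences)
q1, q3 = X.quantile(0.25), X.quantile(0.75)
iqr = q3 - q1
outliers = ((X < q1 - 1.5 * iqr) | (X > q3 + 1.5 * iqr)).sum()
print(outliers.sort_values(ascending=False))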
      
     
Unique values
from sklearn.feature_selection import VarianceThreshold
# zero variance (unique values)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
columns_to_remove = [name for name in x_train.columns if name not in x_train.columns[constant_filter.get_support()]]
print('Unique features: ', columns_to_remove)
      No unique values
      
       Highly correlated values
      
      
      Threshold: 75%
     
# Correlation matrix for all independent vars
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()
absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train[var])))
# threshold setting
corrTol = 0.75
# for each column in the corr matrix
for col in corrMatrix:
    
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        
        # Store the corr with the dep var for fields that are highly correlated with each other
        for i in range(len(corrMatrix)):
            
            if abs(corrMatrix[col].iloc[i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                # positional access needs .iloc on modern pandas; integer label fallback was removed
                thisCorr = (1 if abs(corrMatrix[col].iloc[i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        
        mask = np.ones(len(thisCol), dtype = bool) # Initialize the mask
        
        ctDelCol = 0 # To keep track of the number of columns deleted
        
        for n, j in enumerate(thisCol):
            # Delete if (a) a var is correlated with others and does not have the best corr with dep,
            # or (b) completely corr with the 'col'
            mask[n] = not (j != max(thisCol) and j >= 0)
            
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' %thisVars[n])
                ctDelCol += 1
                
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
      columns_to_keep = corrMatrix.columns
print(columns_to_keep)
len(columns_to_keep)
      x_train_clean = x_train[columns_to_keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train_clean.corr(), cmap='Reds',annot=True, linewidths=.5, ax=ax)
Keeping the 12 features below the 75% correlation threshold.
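For reference, a much shorter way to drop one feature from each highly correlated pair is the upper-triangle mask trick (a sketch, not the method above; it keeps the first feature of each pair rather than the one most correlated with the target):
# sketch: flag the later feature of every pair with |corr| > 0.75
corr = x_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [c for c in upper.columns if (upper[c] > 0.75).any()]
print('Candidates to drop:', to_drop)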
Feature importance
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc.fit(x_train_clean, y_train)
features = x_train_clean.columns
importances = abc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Recursive Feature Elimination using RFECV with an AdaBoost model
from sklearn.model_selection import cross_val_score, StratifiedKFold
kfold = StratifiedKFold(n_splits=10, random_state=17, shuffle=True)
      from sklearn.feature_selection import RFECV
model = AdaBoostClassifier()
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train_clean, y_train)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('AdaBoost CV score vs number of features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
# grid_scores_ was removed from RFECV in scikit-learn 1.2; cv_results_ holds the mean test scores
cv_scores = rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.grid()
plt.show()
# keep only the features flagged by RFECV
new_features = [feature for feature, selected in zip(x_train_clean.columns, rfecv.support_) if selected]
        
print(new_features)
x_train_best = x_train_clean[new_features]
x_test_best = x_test[new_features]
      Choosing base_estimator model
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
      models = []
models.append(('LR',LogisticRegression()))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('ETC',ExtraTreesClassifier()))
      names = []
scores = []
for name, model in models:
    score = cross_val_score(AdaBoostClassifier(base_estimator=model), x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results  = pd.DataFrame({'Model': names,'Accuracy': scores})
results
      fig, ax = plt.subplots(figsize=(16,14))
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
plt.show()
Though ExtraTreesClassifier is the winner, we will use DecisionTreeClassifier instead for performance (a single tree is much cheaper to train inside AdaBoost).
Tuning steps are:

high learning rate + initial parameter settings -> number of trees -> tree parameter tuning -> lower the learning rate and raise n_estimators for better results
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
def modelfit(alg, xtrain, ytrain, performCV=True, printFeatureImportance=True, cv_folds=5):
    #Fit the algorithm on the data
    alg.fit(xtrain, ytrain)
    #Predict training set:
    ypred = alg.predict(xtrain)
    ypredprob = alg.predict_proba(xtrain)[:,1]
    #Perform cross-validation:
    if performCV:
        cv_score = cross_val_score(alg, xtrain, ytrain, cv=cv_folds, scoring='accuracy')
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(ytrain, ypred))
    print("AUC Score (Train): %f" % roc_auc_score(ytrain, ypredprob))
    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % 
              (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, xtrain.columns).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
      abc = AdaBoostClassifier(random_state=7)
modelfit(abc, x_train_best, y_train)
      
      
High learning rate + initial parameter settings


Getting the best n_estimators for learning_rate = 0.1
     
# initializing parameters
learning_rate = 0.1
n_estimators = 150
random_state = 7
min_samples_split = 300
min_samples_leaf = 50
max_depth = 8
max_features = 'sqrt'
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features,
                                        min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split,
                                        random_state=random_state)
      from sklearn.model_selection import GridSearchCV
param = {'n_estimators':range(1,501)}
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                                                      random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
      n_estimators = grdsrch.best_params_['n_estimators']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
      
      
       Tree parameters tuning
      
      
max_depth & min_samples_split

Note: Tuning parameters separately is not the best approach for grid search, but it keeps the search space tractable; a joint-search sketch follows the min_samples_leaf step below.
# max_depth must be at least 1, so the grid starts at 1
param = {'base_estimator__max_depth':range(1,16),
         'base_estimator__min_samples_split':range(2,101)}
base_estimator = DecisionTreeClassifier(max_features=max_features, min_samples_leaf=min_samples_leaf, random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, 
                                                      n_estimators=n_estimators,random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
      max_depth = grdsrch.best_params_['base_estimator__max_depth']
min_samples_split = grdsrch.best_params_['base_estimator__min_samples_split']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
min_samples_leaf
param = {'base_estimator__min_samples_leaf':range(1,101)}
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split,
                                        random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, 
                                                      n_estimators=n_estimators,random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
      min_samples_leaf = grdsrch.best_params_['base_estimator__min_samples_leaf']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
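As noted above, tuning one parameter at a time is greedy. A joint randomized search over all three tree parameters (a sketch; the ranges and n_iter are assumptions, not from the original) avoids that bias at a manageable cost:
from sklearn.model_selection import RandomizedSearchCV
# sketch: sample 50 joint settings instead of tuning each parameter separately
param_dist = {'base_estimator__max_depth': range(1, 16),
              'base_estimator__min_samples_split': range(2, 101),
              'base_estimator__min_samples_leaf': range(1, 101)}
rnd_search = RandomizedSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_features=max_features,
                                                                                         random_state=random_state),
                                                   learning_rate=learning_rate, n_estimators=n_estimators,
                                                   random_state=random_state),
                                param_distributions=param_dist, n_iter=50, scoring='accuracy',
                                n_jobs=-1, cv=5, random_state=random_state)
rnd_search.fit(x_train_best, y_train)
rnd_search.best_params_, rnd_search.best_score_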
      
      
Lowering the learning rate and finding the best n_estimators


Lower the rate further if you have a powerful system.

learning_rate: 0.05
learning_rate = 0.05
param = {'n_estimators':range(1,1001)}
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf, 
                                        min_samples_split=min_samples_split, random_state=random_state)
grdsrch = GridSearchCV(estimator = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate,
                                                      random_state=random_state), 
                       param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
grdsrch.fit(x_train_best,y_train)
      n_estimators = grdsrch.best_params_['n_estimators']
#results = grdsrch.cv_results_
#for i in range(0,n_estimators):
#    print('mean: ',results['mean_test_score'][i],' std: ',results['std_test_score'][i],' params: ',results['params'][i])
grdsrch.best_params_, grdsrch.best_score_
      
      
       
Note: Lowering the learning rate further generally gives more robust results; learning_rate = 0.001 might do better. A quick sweep sketch follows.
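A minimal sketch of such a sweep (the rate/estimator pairs are assumptions; n_estimators is scaled up as the rate drops to keep the total amount of boosting comparable):
# sketch: compare a few learning_rate / n_estimators pairs by cross-validation
for lr, n_est in [(0.05, n_estimators), (0.01, 5 * n_estimators), (0.001, 10 * n_estimators)]:
    candidate = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=lr,
                                   n_estimators=n_est, random_state=random_state)
    cv_mean = cross_val_score(candidate, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
    print('lr={}, n_estimators={} -> CV accuracy {:.4f}'.format(lr, n_est, cv_mean))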
      
     
base_estimator = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_leaf=min_samples_leaf, 
                                        min_samples_split=min_samples_split, random_state=random_state)
abc = AdaBoostClassifier(base_estimator=base_estimator, learning_rate=learning_rate, n_estimators=n_estimators, 
                         random_state=random_state)
      Train data accuracy
final_score = cross_val_score(abc, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
print("Final train accuracy : {} ".format(final_score))
      Test data accuracy
abc.fit(x_train_best, y_train)
y_pred = abc.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
      
      
       
Improvements: