#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal
:
Classify glass types from the glass.csv dataset: explore the data, select features (variance, correlation, importance, RFECV), compare several classifiers with cross-validation, and tune an XGBoost model.
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 16, 14
df = pd.read_csv('glass.csv')
df.shape
df.head()
df.describe()
df.info()
fig, ax = plt.subplots(figsize=(8,7))
df.groupby('Type').Type.count().plot(kind='bar', title='Glass types distribution')
#number of classes
num_class = len(df.Type.unique())
Missing values
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
# Features and Target
X = df.drop(['Type'], axis=1)
# Some models expect class labels from 0 to n_classes-1, so we remap even though the target is already numeric (glass types skip 4).
y = df.Type.map({1: 0, 2: 1, 3: 2, 5: 3, 6: 4, 7: 5})
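As a hedged alternative to the hard-coded map (not used in this notebook), sklearn's LabelEncoder produces the same 0 to n_classes-1 encoding automatically; y_alt is an illustrative name.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_alt = pd.Series(le.fit_transform(df.Type), index=df.index)  # same 0..5 codes as the map above
dict(zip(le.classes_, range(len(le.classes_))))  # shows which original type maps to which code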
from sklearn.model_selection import train_test_split
# random_state guarantees the same split every time the program is run
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state=17)
No missing values
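Had any columns contained missing values, a minimal sketch of how they could be filled (hypothetical here, since none were found) is median imputation fit on the training split only:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
x_train_imp = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_test_imp = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns, index=x_test.index)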
Unique values
from sklearn.feature_selection import VarianceThreshold
# zero variance features (only a single unique value)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
columns_to_remove = [name for name in x_train.columns if name not in x_train.columns[constant_filter.get_support()]]
print('Unique features: ', columns_to_remove)
No unique values
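If any zero-variance columns had turned up, a minimal sketch for dropping them from both splits (a no-op here, since the list is empty):
# columns_to_remove is empty in this run, so nothing is actually dropped
x_train = x_train.drop(columns=columns_to_remove)
x_test = x_test.drop(columns=columns_to_remove)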
Highly correlated values
Threshold: 75%
# Correlation matrix for all independent vars
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()
absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train[var])))
# threshold setting
corrTol = 0.75
# for each column in the corr matrix
for col in corrMatrix:
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        # Store the corr with the dep var for fields that are highly correlated with each other
        for i in range(len(corrMatrix)):
            if abs(corrMatrix[col].iloc[i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                thisCorr = (1 if abs(corrMatrix[col].iloc[i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        mask = np.ones(len(thisCol), dtype=bool)  # Initialize the mask
        ctDelCol = 0  # To keep track of the number of columns deleted
        for n, j in enumerate(thisCol):
            # Delete if (a) the var is correlated with others and does not have the best corr with the dep var,
            # or (b) it is completely correlated with 'col'
            mask[n] = not (j != max(thisCol) and j >= 0)
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' % thisVars[n])
                ctDelCol += 1
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
columns_to_keep = corrMatrix.columns
print(columns_to_keep)
len(columns_to_keep)
x_train_clean = x_train[columns_to_keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train_clean.corr(), cmap='Reds',annot=True, linewidths=.5, ax=ax)
Keeping the 8 features that fall below the 75% correlation threshold
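The loop above walks the correlation matrix column by column; a more compact sketch of the same idea (an alternative, not the notebook's method, with illustrative names) drops any feature whose absolute correlation with an earlier feature exceeds the threshold:
corr = x_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep only the upper triangle
to_drop = [col for col in upper.columns if (upper[col] > corrTol).any()]
x_train_alt = x_train.drop(columns=to_drop)  # may keep a slightly different subset than columns_to_keep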
Feature importance
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train_clean, y_train)
features = x_train_clean.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
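As a hedged cross-check (not part of the original notebook), permutation importance is less biased toward correlated features than the impurity-based importances plotted above:
from sklearn.inspection import permutation_importance
perm = permutation_importance(rfc, x_train_clean, y_train, n_repeats=10, random_state=17)
pd.Series(perm.importances_mean, index=x_train_clean.columns).sort_values(ascending=False)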
Keeping all features
Recursive Feature Selection using RFECV with an XGBoost model
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=num_class, random_state=17, shuffle=True)
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV
objective='multi:softmax'
model = XGBClassifier(objective=objective, num_class=num_class)
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train_clean, y_train)
plt.figure()
plt.title('XGB CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.grid()
plt.show()
feature_importance = list(zip(x_train_clean.columns, rfecv.support_))
new_features = []
for key, value in enumerate(feature_importance):
    if value[1]:
        new_features.append(value[0])
print(new_features)
x_train_best = x_train_clean[new_features]
x_test_best = x_test[new_features]
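RFECV also exposes n_features_ and ranking_, which make the selection easier to read than the boolean support mask alone:
print('Optimal number of features:', rfecv.n_features_)
print('Ranking (1 = selected):', dict(zip(x_train_clean.columns, rfecv.ranking_)))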
Keeping all features
Note
:
**Recursive Feature Selection** is best suited to data with a low feature count, as it is an exhaustive process that uses a lot of computing power.
Comparing results from 10 different models
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
models = []
models.append(('LR',LogisticRegression(solver = 'newton-cg')))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('ABC',AdaBoostClassifier()))
models.append(('XGB',XGBClassifier(objective='multi:softmax', num_class=num_class)))
models.append(('XGBRF',XGBRFClassifier()))
Using KFold cross-validation
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names,'Accuracy': scores})
results
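The mean alone hides fold-to-fold variance; a small extension (a sketch, not in the original notebook) reports the standard deviation as well:
for name, model in models:
    cv = cross_val_score(model, x_train_best, y_train, cv=kfold, scoring='accuracy')
    print('{}: {:.4f} +/- {:.4f}'.format(name, cv.mean(), cv.std()))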
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
Using XGBoost
Note
:
XGBoost
is a very powerful model and, as the comparison above suggests, tends to outperform the other classifiers most of the time.
Learning rate and estimators
This function finds the best number of estimators for our starting learning rate using xgboost's built-in cross-validation with early stopping. We will run it again after tuning the other parameters, to find the best number of estimators for the final, lower learning rate.
import xgboost as XGB
from sklearn.metrics import accuracy_score, roc_auc_score
def modelfit(alg, useTrainCV=True, cv_folds=num_class, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        xgtrain = XGB.DMatrix(x_train_best.values, label=y_train.values)
        cvresult = XGB.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='mlogloss', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # number of boosting rounds found by early stopping
        n_estimators = cvresult.shape[0]
        alg.set_params(n_estimators=n_estimators)
    # Fit the algorithm on the data
    alg.fit(x_train_best, y_train)
    # Predict the training set
    dtrain_predictions = alg.predict(x_train_best)
    # Print the model report
    print("\nModel Report")
    print("Accuracy (Train): %.4g" % accuracy_score(y_train.values, dtrain_predictions))
    print('n_estimators :', n_estimators)
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    return n_estimators
# initial parameter values (a starting point; they will be tuned below)
xgb = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8,
colsample_bytree=0.8, objective=objective, num_class=num_class, seed=7)
n_estimators = modelfit(xgb)
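A hedged alternative to the xgb.cv routine above is early stopping against a held-out validation split; the split and names below are illustrative, and in newer xgboost versions early_stopping_rounds moves from fit() to the constructor.
# hold out part of the training data purely to monitor mlogloss for early stopping
x_tr, x_val, y_tr, y_val = train_test_split(x_train_best, y_train, test_size=0.2, random_state=17)
xgb_es = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8,
                       colsample_bytree=0.8, objective=objective, num_class=num_class, seed=7)
xgb_es.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], eval_metric='mlogloss',
           early_stopping_rounds=50, verbose=False)
print('Best iteration:', xgb_es.best_iteration)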
from sklearn.model_selection import GridSearchCV
max_depth and min_child_weight
param = {'max_depth':range(0,15,1),
'min_child_weight':range(0,15,2)}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, subsample=0.8, colsample_bytree=0.8,
objective=objective, num_class=num_class, seed=7),
param_grid=param, scoring='accuracy', n_jobs=-1,cv=kfold)
gsrch.fit(x_train_best, y_train)
max_depth = gsrch.best_params_['max_depth']
min_child_weight = gsrch.best_params_['min_child_weight']
gsrch.best_params_, gsrch.best_score_
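min_child_weight was searched in steps of 2; an optional finer pass around the best value (a sketch, not part of the notebook) checks the neighbouring values as well:
param_fine = {'min_child_weight': [max(0, min_child_weight - 1), min_child_weight, min_child_weight + 1]}
gsrch_fine = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, subsample=0.8,
                                                  colsample_bytree=0.8, objective=objective, num_class=num_class,
                                                  seed=7, max_depth=max_depth),
                          param_grid=param_fine, scoring='accuracy', n_jobs=-1, cv=kfold)
gsrch_fine.fit(x_train_best, y_train)
gsrch_fine.best_params_, gsrch_fine.best_score_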
gamma
param = {'gamma':[i/10.0 for i in range(0,101)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, subsample=0.8, colsample_bytree=0.8,
objective=objective, num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight), param_grid=param, scoring='accuracy',
n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
gamma = gsrch.best_params_['gamma']
gsrch.best_params_, gsrch.best_score_
subsample and colsample_bytree
param = {'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma),
param_grid=param, scoring='accuracy', n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
colsample_bytree = gsrch.best_params_['colsample_bytree']
subsample = gsrch.best_params_['subsample']
gsrch.best_params_, gsrch.best_score_
reg_alpha
param = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100, 0, 0.001, 0.005, 0.01, 0.05]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma,
colsample_bytree=colsample_bytree, subsample=subsample), param_grid=param,
scoring='accuracy', n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
reg_alpha = gsrch.best_params_['reg_alpha']
gsrch.best_params_, gsrch.best_score_
estimators for learning rate 0.01
xgb = XGBClassifier(learning_rate=0.01, n_estimators=100000, objective=objective, num_class=num_class,seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample)
n_estimators = modelfit(xgb)
xgb = XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective, num_class=num_class, seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample, reg_alpha=reg_alpha)
final_score = cross_val_score(xgb, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
print("Final train accuracy : {} ".format(final_score))
xgb.fit(x_train_best, y_train)
y_pred = xgb.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
xgb2 = XGBClassifier()
xgb2.fit(x_train_best, y_train)
y_pred = xgb2.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Test accuracy without tuning : {} ".format(score))
Tuning again...
param = {'max_depth':range(0,15,1),
'min_child_weight':range(0,15,2)}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample), param_grid=param, scoring='accuracy',n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
max_depth = gsrch.best_params_['max_depth']
min_child_weight = gsrch.best_params_['min_child_weight']
gsrch.best_params_, gsrch.best_score_
param = {'gamma':[i/10.0 for i in range(0,101)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, colsample_bytree=colsample_bytree,
subsample=subsample), param_grid=param, scoring='accuracy',
n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
gamma = gsrch.best_params_['gamma']
gsrch.best_params_, gsrch.best_score_
param = {'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma), param_grid=param,
scoring='accuracy', n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
colsample_bytree = gsrch.best_params_['colsample_bytree']
subsample = gsrch.best_params_['subsample']
gsrch.best_params_, gsrch.best_score_
param = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100, 0, 0.001, 0.005, 0.01, 0.05]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective,
num_class=num_class, seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample), param_grid=param,
scoring='accuracy', n_jobs=-1, cv=kfold)
gsrch.fit(x_train_best, y_train)
reg_alpha = gsrch.best_params_['reg_alpha']
gsrch.best_params_, gsrch.best_score_
xgb = XGBClassifier(learning_rate=0.01, n_estimators=100000, objective=objective, num_class=num_class,seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample, reg_alpha=reg_alpha)
n_estimators = modelfit(xgb)
Predicting...
xgb = XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective, num_class=num_class, seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree,
subsample=subsample)
final_score = cross_val_score(xgb, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
print("Final train accuracy : {} ".format(final_score))
xgb.fit(x_train_best, y_train)
y_pred = xgb.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
Note:
Tuning improvements are minimal. Repeating the tuning process for the XGBoost model does, however, seem to improve accuracy.
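A hedged sketch of that "repeat the tuning" idea: alternate the same four grid searches for a couple of passes, feeding each round's best parameters into the next (the loop structure is illustrative, not the notebook's code):
best = {'max_depth': max_depth, 'min_child_weight': min_child_weight, 'gamma': gamma,
        'subsample': subsample, 'colsample_bytree': colsample_bytree, 'reg_alpha': reg_alpha}
grids = [{'max_depth': range(0, 15, 1), 'min_child_weight': range(0, 15, 2)},
         {'gamma': [i/10.0 for i in range(0, 101)]},
         {'subsample': [i/10.0 for i in range(6, 11)], 'colsample_bytree': [i/10.0 for i in range(6, 11)]},
         {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100, 0, 0.001, 0.005, 0.01, 0.05]}]
for _ in range(2):  # two full passes over the four grids
    for param in grids:
        # hold every parameter fixed except the ones in the current grid
        fixed = {k: v for k, v in best.items() if k not in param}
        gs = GridSearchCV(XGBClassifier(learning_rate=0.01, n_estimators=n_estimators, objective=objective,
                                        num_class=num_class, seed=7, **fixed),
                          param_grid=param, scoring='accuracy', n_jobs=-1, cv=kfold)
        gs.fit(x_train_best, y_train)
        best.update(gs.best_params_)
print(best)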
Improvement