#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Business challenge/requirement
FuPont is a leading global chemical company. The company is on a CSR (Corporate Social Responsibility) mission: it wants to identify biodegradable products based on a study of the relationships between chemical structure and the biodegradation of molecules.
Goal:
Build a classification model that predicts whether a molecule is readily biodegradable (RB) or not readily biodegradable (NRB) from its molecular descriptors.
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# features columns names
names = ['SpMax_L', 'J_Dz', 'nHM', 'F01[N-N]', 'F04[C-N]', 'NssssC', 'nCb-', 'C%','nCp','nO','F03[C-N]','SdssC','HyWi_B','LOC','SM6_L','F03[C-O]','Me','Mi','nN-N','nArNO2','nCRX3','SpPosA','nCIR','B01[C-Br]','B03[C-Cl]','N-073','SpMax_A','Psi_i_1d','B04[C-Br]','SdO','TI2_L','nCrt','C-026','F02[C-N]','nHDon','SpMax_B','Psi_i_A','nN','SM6_B','nArCOOR','nX','experimentalclass']
df = pd.read_csv('bio-degradabale-data.csv', header=None, sep=';',names=names)
df.shape
df.head()
df.describe()
df.info()
Only the target variable, experimentalclass, is not a numeric column.
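A quick way to confirm this (a small sketch, not in the original notebook) is to list the non-numeric columns directly:
# confirm that only the target column is non-numeric
df.select_dtypes(exclude='number').columns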
Missing values
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
# mapping our target variable
# RB:1, NRB:0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df.experimentalclass)
df['experimentalclass'] = le.transform(df.experimentalclass)
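LabelEncoder assigns codes alphabetically, so NRB maps to 0 and RB to 1, matching the comment above. If you prefer the mapping to be explicit, an alternative sketch (shown commented out so it is not applied on top of the encoding above, and assuming the raw labels are exactly 'RB' and 'NRB'):
# Alternative to LabelEncoder (do not run both): an explicit mapping
# df['experimentalclass'] = df['experimentalclass'].map({'NRB': 0, 'RB': 1})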
# Features and Target
X = df.drop(['experimentalclass'], axis=1)
y = df.experimentalclass
from sklearn.model_selection import train_test_split
# random_state guarantees the same split every time the program is run
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state=7)
No missing values
Constant features (zero variance)
from sklearn.feature_selection import VarianceThreshold
# zero variance => a single unique value across all rows
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
columns_to_remove = [name for name in x_train.columns if name not in x_train.columns[constant_filter.get_support()]]
print('Constant features: ', columns_to_remove)
No constant features found.
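As a quick cross-check (a sketch added here, not part of the original notebook), the same conclusion can be reached with pandas alone:
# smallest per-column count of distinct values;
# 2 or more means there are no constant features
x_train.nunique().min()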
Highly correlated features
Threshold: 75%
# Correlation matrix for all independent vars
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()
absCorrWithDep = []
# absolute correlation of each independent variable with the target
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train[var])))
# threshold setting
corrTol = 0.75
# for each column in the corr matrix
for col in corrMatrix:
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        # Flag the fields that are highly correlated with this column
        for i in range(len(corrMatrix)):
            if abs(corrMatrix[col].iloc[i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                thisCorr = (1 if abs(corrMatrix[col].iloc[i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        mask = np.ones(len(thisCol), dtype=bool)  # Initialize the mask
        ctDelCol = 0  # To keep track of the number of columns deleted
        for n, j in enumerate(thisCol):
            # Delete if (a) a var is highly correlated with others and does not have the best corr with 'col',
            # or (b) it is completely correlated with 'col'
            mask[n] = not (j != max(thisCol) and j >= 0)
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' % thisVars[n])
                ctDelCol += 1
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
columns_to_keep = corrMatrix.columns
len(columns_to_keep)
x_train_clean = x_train[columns_to_keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train_clean.corr(), cmap='Reds',annot=True, linewidths=.5, ax=ax)
Keeping the 30 features below the 75% correlation threshold.
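For reference, a more compact and more common way to drop one feature from each highly correlated pair is sketched below. This is only an alternative sketch, not the approach used above: it ignores the correlation with the target, so it may keep a slightly different subset.
# alternative sketch: drop one column from every pair with |corr| > 0.75
corr_abs = x_train.corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if (upper[c] > 0.75).any()]
print(len(to_drop), 'features would be dropped:', to_drop)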
Feature importance
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train_clean, y_train)
features = x_train_clean.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
Removing the bottom 7 features.
low_importance_features = ['B03[C-Cl]','F01[N-N]','nArNO2','B01[C-Br]','N-073','nN-N','nCRX3']
x_train_clean = x_train_clean.drop(low_importance_features,axis=1)
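The hard-coded list above corresponds to the seven lowest bars in the importance plot. It could also be read off programmatically, as sketched below; note the exact bottom seven may vary between runs because RandomForestClassifier was not given a random_state.
# bottom 7 features by this run's importance ranking
# (indices is sorted ascending, so the first entries are the least important)
list(features[indices][:7])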
Renaming features for the XGBoost model, since it does not accept column names containing [, ] or <.
x_train_clean = x_train_clean.rename(columns = {'F04[C-N]':'F04C-N'})
x_test = x_test.rename(columns = {'F04[C-N]':'F04C-N'})
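Only F04[C-N] remains at this point, so a single rename is enough. If more bracketed names had survived the earlier filtering, a general clean-up like the following hypothetical sketch (not part of the original notebook) would handle them all at once:
# hypothetical general clean-up: strip the characters XGBoost rejects from every column name
# x_train_clean.columns = x_train_clean.columns.str.replace(r'[\[\]<>]', '', regex=True)
# x_test.columns = x_test.columns.str.replace(r'[\[\]<>]', '', regex=True)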
Recursive feature elimination with cross-validation (RFECV) using an XGBoost model
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=10, random_state=17, shuffle=True)
from xgboost import XGBClassifier
from sklearn.feature_selection import RFECV
model = XGBClassifier()
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train_clean, y_train)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('XGB CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.grid()
plt.show()
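Note that rfecv.grid_scores_ was removed in newer scikit-learn releases; on a recent version (assuming scikit-learn >= 1.0), an equivalent sketch using cv_results_ would be:
# mean CV accuracy per number of selected features (newer scikit-learn API)
mean_scores = rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.xlabel('Number of features selected')
plt.ylabel('Mean cross-validation accuracy')
plt.show()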
feature_importance = list(zip(x_train_clean.columns, rfecv.support_))
new_features = []
for col, selected in feature_importance:
    if selected:
        new_features.append(col)
print(new_features)
x_train_best = x_train_clean[new_features]
x_test_best = x_test[new_features]
Comparing results from 10 different models
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRFClassifier
models = []
models.append(('LR',LogisticRegression(solver = 'newton-cg')))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('ABC',AdaBoostClassifier()))
models.append(('XGB',XGBClassifier()))
models.append(('XGBRF',XGBRFClassifier()))
Using KFold cross-validation
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, x_train_best, y_train, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names,'Accuracy': scores})
results
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
Though Logistic Regression has slightly better accuracy than XGBoost, XGBoost performs a lot faster here, so giving up about 1% of accuracy for a big boost in performance is worth it; we continue with XGBoost.
Learning rate and estimators
This function finds the best number of estimators for our starting learning rate. We will run it again after tuning the other parameters and lowering the learning rate, to find the best number of estimators for that learning rate.
import xgboost as XGB
from sklearn.metrics import accuracy_score, roc_auc_score
def modelfit(alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = XGB.DMatrix(x_train_best.values, label=y_train.values)
        cvresult = XGB.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        # early stopping picks the optimal number of boosting rounds
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(x_train_best, y_train, eval_metric='auc')
    # Predict on the training set
    dtrain_predictions = alg.predict(x_train_best)
    dtrain_predprob = alg.predict_proba(x_train_best)[:,1]
    # Print model report
    print("\nModel Report")
    print("Accuracy (Train): %.4g" % accuracy_score(y_train.values, dtrain_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
# initial parameter values (a starting point that will change later on)
xgb = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=500, subsample=0.8,
                    colsample_bytree=0.8, objective='binary:logistic', seed=7)
modelfit(xgb)
max_depth and min_child_weight
from sklearn.model_selection import GridSearchCV
param = {'max_depth':range(0,15,1),
'min_child_weight':range(0,15,2)}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=141, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic',seed=7), param_grid=param, scoring='accuracy',
n_jobs=-1,cv=5)
gsrch.fit(x_train_best, y_train)
max_depth = gsrch.best_params_['max_depth']
min_child_weight = gsrch.best_params_['min_child_weight']
gsrch.best_params_, gsrch.best_score_
gamma
param = {'gamma':[i/10.0 for i in range(0,101)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=141, subsample=0.8, colsample_bytree=0.8,
objective='binary:logistic',seed=7, max_depth=max_depth,
min_child_weight=min_child_weight), param_grid=param, scoring='accuracy',
n_jobs=-1, cv=5)
gsrch.fit(x_train_best, y_train)
gamma = gsrch.best_params_['gamma']
gsrch.best_params_, gsrch.best_score_
subsample and colsample_bytree
param = {'subsample':[i/10.0 for i in range(6,11)],
'colsample_bytree':[i/10.0 for i in range(6,11)]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=141, objective='binary:logistic',seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma),
param_grid=param, scoring='accuracy', n_jobs=-1, cv=5)
gsrch.fit(x_train_best, y_train)
colsample_bytree = gsrch.best_params_['colsample_bytree']
subsample = gsrch.best_params_['subsample']
gsrch.best_params_, gsrch.best_score_
reg_alpha
param = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100, 0, 0.001, 0.005, 0.01, 0.05]}
gsrch = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=141, objective='binary:logistic',seed=7,
max_depth=max_depth, min_child_weight=min_child_weight, gamma=gamma,
colsample_bytree=colsample_bytree, subsample=subsample), param_grid=param,
scoring='accuracy', n_jobs=-1, cv=5)
gsrch.fit(x_train_best, y_train)
reg_alpha = gsrch.best_params_['reg_alpha']
gsrch.best_params_, gsrch.best_score_
estimators for learning rate 0.01
xgb = XGBClassifier(learning_rate=0.01, n_estimators=100000, objective='binary:logistic',seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree, subsample=subsample)
modelfit(xgb)
The cross-validated early stopping settles on 423 estimators at learning rate 0.01.
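This value comes from the early-stopped cross-validation inside modelfit, which also writes it back into the estimator, so it can be read off the fitted object rather than copied by hand (a small sketch, assuming the modelfit call above has run):
# n_estimators was updated by alg.set_params(...) inside modelfit
xgb.get_params()['n_estimators']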
xgb = XGBClassifier(learning_rate=0.01, n_estimators=423, objective='binary:logistic',seed=7, max_depth=max_depth,
min_child_weight=min_child_weight, gamma=gamma, colsample_bytree=colsample_bytree, subsample=subsample)
final_score = cross_val_score(xgb, x_train_best, y_train, cv=10, scoring='accuracy').mean()
print("Final train accuracy : {} ".format(final_score))
xgb.fit(x_train_best, y_train)
y_pred = xgb.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Final test accuracy : {} ".format(score))
xgb = XGBClassifier()
xgb.fit(x_train_best, y_train)
y_pred = xgb.predict(x_test_best)
score = accuracy_score(y_test, y_pred)
print("Test accuracy without tuning : {} ".format(score))
Note:
Tuning improvements are minimal.