#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Business Challenge/Requirement
John Cancer Hospital (JCH) is a leading cancer hospital in the USA. It specializes in preventing breast cancer.
Over the last few years, JCH has collected breast cancer data from patients who came in for screening or treatment.
However, this data has almost 30 attributes, which makes it difficult to run models and interpret the results.
Goal:
Classify each observation's diagnosis (malignant vs. benign).
Data: breast-cancer-data.csv
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
df = pd.read_csv('breast-cancer-data.csv')
df.shape
df.head()
df.groupby('diagnosis').size()
df.diagnosis = df.diagnosis.map({'M':1,'B':0})
# Features and Target
X = df.drop(['id','diagnosis'], axis=1)
y = df.diagnosis
X.shape
#df.drop('id',axis=1).groupby('diagnosis').hist(figsize=(15,15))
X.describe()
df.info()
No missing values.
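To make that explicit, a quick check (a minimal sketch; df.info() above already reports the non-null counts):
# total missing values across all columns; should print 0
df.isnull().sum().sum()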
from sklearn.feature_selection import VarianceThreshold
# zero variance (features with a single unique value)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)
columns_to_remove = X.columns[~constant_filter.get_support()].tolist()
print('Constant features: ', columns_to_remove)
No constant (single-value) features found.
Handling high correlation
Threshold: 0.75
# FeatureSelector comes from the third-party feature_selector package (not scikit-learn)
from feature_selector import FeatureSelector
fs = FeatureSelector(data = X, labels = y)
fs.identify_collinear(correlation_threshold=0.75)
keep = [name for name in X.columns if name not in fs.ops['collinear']]
keep
X_clean = X[keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(X_clean.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
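If the third-party feature_selector package is unavailable, a similar collinear filter can be built with pandas alone (a sketch; the 0.75 threshold matches the one above, though which member of a correlated pair gets dropped may differ from FeatureSelector's choice; to_drop and X_clean_alt are illustrative names):
# keep the upper triangle of the absolute correlation matrix,
# then drop any column correlated above the threshold with an earlier one
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.75).any()]
X_clean_alt = X.drop(columns=to_drop)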
# model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
models = []
models.append(('LR',LogisticRegression(solver = 'newton-cg')))
models.append(('NB',GaussianNB()))
models.append(('DTC',DecisionTreeClassifier()))
models.append(('RFC',RandomForestClassifier()))
models.append(('GBC',GradientBoostingClassifier()))
models.append(('SVC',SVC()))
models.append(('KNN',KNeighborsClassifier()))
Using KFold cross-validation:
from sklearn.model_selection import cross_val_score, KFold
kfold = KFold(n_splits=10, random_state=17, shuffle=True)
names = []
scores = []
for name, model in models:
    score = cross_val_score(model, X_clean, y, cv=kfold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)
results = pd.DataFrame({'Model': names,'Accuracy': scores})
results
# graph of performance
axis = sns.barplot(x = 'Model', y = 'Accuracy', data = results)
axis.set(xlabel='Classifier', ylabel='Accuracy')
for p in axis.patches:
    height = p.get_height()
    axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center")
plt.show()
Both Random Forest and Gradient Boosting have about the same accuracy.
We will use Gradient Boosting.
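Since that choice rests on mean accuracy alone, comparing the per-fold score distributions is a useful sanity check (a sketch reusing the models list and kfold from above):
# boxplot of the ten per-fold accuracies for each classifier
fold_scores = [cross_val_score(m, X_clean, y, cv=kfold, scoring='accuracy') for _, m in models]
plt.figure(figsize=(10, 5))
plt.boxplot(fold_scores, labels=[name for name, _ in models])
plt.ylabel('Accuracy per fold')
plt.show()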
Feature Selection
from sklearn.feature_selection import RFECV
gbc = GradientBoostingClassifier()
rfecv = RFECV(estimator=gbc, step=1, cv=kfold, scoring='accuracy')
rfecv.fit(X_clean,y)
plt.figure()
plt.title('Gradient Boosting CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
# rfecv was fit on X_clean, so its support mask aligns with X_clean.columns (not X.columns)
new_features = [name for name, selected in zip(X_clean.columns, rfecv.support_) if selected]
print(new_features)
# Calculate accuracy scores
X_clean_new = df[new_features]
initial_score = cross_val_score(gbc, X_clean, y, cv=kfold, scoring='accuracy').mean()
print("Initial accuracy : {} ".format(initial_score))
fe_score = cross_val_score(gbc, X_clean_new, y, cv=kfold, scoring='accuracy').mean()
print("Accuracy after Feature Selection : {} ".format(fe_score))
Though the accuracy on the filtered feature set is slightly lower, we will use it because it runs faster and the difference is small.
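The speed claim is easy to verify (a rough sketch; exact timings are machine-dependent, and the labels are illustrative):
import time
# time 10-fold cross-validation on the full vs. the filtered feature set
for label, data in [('all kept features', X_clean), ('RFECV features', X_clean_new)]:
    start = time.perf_counter()
    cross_val_score(gbc, data, y, cv=kfold, scoring='accuracy')
    print(label, ':', round(time.perf_counter() - start, 2), 's')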
PCA
Testing accuracy with PCA
from sklearn.decomposition import PCA
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X_clean_new)
#Plotting the Cumulative Summation of the Explained Variance
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative explained variance')
plt.title('Breast Cancer Data: Explained Variance')
plt.ylim([0.995, 1.01])
plt.xlim([0,7])
plt.grid()
plt.show()
# the first two components already capture nearly all of the unscaled variance
pca = PCA(n_components=2).fit(X_clean_new)
X_clean_new_transf = pca.transform(X_clean_new)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
# Calculate accuracy scores
initial_score = cross_val_score(gbc, X_clean_new, y, cv=kfold, scoring='accuracy').mean()
print("Initial accuracy : {} ".format(initial_score))
fe_score = cross_val_score(gbc, X_clean_new_transf, y, cv=kfold, scoring='accuracy').mean()
print("Accuracy after PCA : {} ".format(fe_score))
Not better: PCA on the unscaled data does not improve accuracy, since the components are dominated by the few features with the largest raw variance.
PCA Standardized
Testing accuracy with PCA on standardized data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_clean_new)
X_clean_new_scaled = scaler.transform(X_clean_new)
#Fitting the PCA algorithm with our Data
pca = PCA().fit(X_clean_new_scaled)
#Plotting the Cumulative Summation of the Explained Variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative explained variance')
plt.title('Breast Cancer Data: Explained Variance')
plt.xlim([1, X_clean_new_scaled.shape[1]])
plt.grid()
plt.show()
pca = PCA(n_components=5).fit(X_clean_new_scaled)
X_clean_new_scaled_Trnsf = pca.transform(X_clean_new_scaled)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
# Calculate accuracy scores
initial_score = cross_val_score(gbc, X_clean_new, y, cv=kfold, scoring='accuracy').mean()
print("Initial accuracy : {} ".format(initial_score))
fe_score = cross_val_score(gbc, X_clean_new_scaled_Trnsf, y, cv=kfold, scoring='accuracy').mean()
print("Accuracy after Feature Selection : {} ".format(fe_score))
The reduced, standardized data performs better.
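One caveat: the scaler and PCA above were fit on the full dataset before cross-validation, so a small amount of information leaks across folds. A Pipeline keeps the preprocessing inside each training fold (a sketch with the same settings; pipe and pipe_score are illustrative names):
from sklearn.pipeline import Pipeline
# scaler and PCA are re-fit on each training fold only
pipe = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA(n_components=5)),
                 ('gbc', GradientBoostingClassifier())])
pipe_score = cross_val_score(pipe, X_clean_new, y, cv=kfold, scoring='accuracy').mean()
print("Leakage-free accuracy : {} ".format(pipe_score))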
from sklearn.model_selection import GridSearchCV
# parameters
# note: newer scikit-learn renames loss 'deviance' to 'log_loss' and criterion 'mse' to 'squared_error'
param_grid = {'loss':['deviance','exponential'],
              'learning_rate': [0.01, 0.05, 0.1, 0.2],
              'max_depth':[2, 3, 5],
              'max_features':['log2','sqrt', None],
              'criterion': ['friedman_mse', 'mse'],
              'subsample':[0.25, 0.5, 1.0],
              'n_estimators':[10, 100, 200]}
# 1,296 parameter combinations x 10 folds; n_jobs=-1 would parallelize the search
gsearch = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, cv=kfold, scoring='accuracy')
gsearch.fit(X_clean_new_scaled_Trnsf, y)
print(gsearch.best_params_)
gsearch.best_estimator_
# rebuild the classifier with the best parameters found by the grid search
gbc_final = GradientBoostingClassifier(criterion='mse',
                                       learning_rate=0.1,
                                       loss='exponential',
                                       max_depth=3,
                                       max_features='sqrt',
                                       n_estimators=100,
                                       subsample=0.5)
final_score = cross_val_score(gbc_final, X_clean_new_scaled_Trnsf, y, cv=kfold, scoring='accuracy').mean()
print("Final accuracy : {} ".format(final_score))