#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Load the college dataset; the bare expressions below are notebook previews.
df = pd.read_csv('College.csv')
df.shape
df.head()
# Row count per class of the target column.
df.groupby('Private').size()
# Encode the target as integers -- Yes:1, No:0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Private'] = le.fit_transform(df['Private'])
df.head()
Feature distribution:
# One histogram per column to eyeball the feature distributions.
fig_size = (18, 18)
df.hist(figsize=fig_size)
# Column dtypes and non-null counts, used here to check for missing values.
df.info()
No missing values
# Separate the predictors from the encoded target column.
target_col = 'Private'
X = df.drop(columns=[target_col])
y = df[target_col]
X.shape
# Check the predictors for unique-value problems via feature_selector.
from feature_selector import FeatureSelector
fs = FeatureSelector(data=X, labels=y)
fs.identify_single_unique()
# Hold out 20% of the rows as the test split (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=31
)
Using KFold and cross-validation:
# Recursive feature elimination with 5-fold cross-validation on a linear SVC,
# followed by a plot of mean CV accuracy against the number of features kept.
from sklearn.model_selection import cross_val_score, KFold  # StratifiedKFold is a better choice
kfold = KFold(n_splits=5, random_state=17, shuffle=True)
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
model = SVC(kernel='linear')
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train, y_train)
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `cv_results_` and only fall back to the old attribute on old releases.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_
plt.figure()
# The estimator is SVC(kernel='linear'), so label the plot accordingly.
plt.title('Linear SVC CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
# RFECV is used with its default step/minimum of 1, so entry i is i+1 features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
# Pair every column name with RFECV's boolean keep/drop flag.
feature_importance = list(zip(X.columns, rfecv.support_))
# Keep only the names of the columns that RFECV selected.
new_features = [name for name, selected in feature_importance if selected]
print(new_features)
# Restrict both splits to the columns kept by feature selection.
x_train_new = x_train[new_features]
x_test_new = x_test[new_features]
# Mean cross-validated accuracy with every column vs. the selected subset.
accuracy_cv = dict(cv=kfold, scoring='accuracy')
initial_score = cross_val_score(model, x_train, y_train, **accuracy_cv).mean()
print(f"Initial accuracy : {initial_score} ")
fe_score = cross_val_score(model, x_train_new, y_train, **accuracy_cv).mean()
print(f"Accuracy after Feature Selection : {fe_score} ")
Normalized:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
x_train_nw_scld = scaler.fit_transform(x_train_new)
x_test_nw_scld = scaler.transform(x_test_new)
PCA:
from sklearn.decomposition import PCA
# Fit PCA with all components on the unscaled selected features to see how
# the explained variance accumulates.
pca = PCA().fit(x_train_new)
# Plotting the cumulative sum of the explained variance ratio
cum_var = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots(figsize=(16,14))
# Start x at 1 so each point lines up with the number of components kept.
plt.plot(range(1, len(cum_var) + 1), cum_var)
plt.xlabel('Number of Components')
# The plotted values are ratios in [0, 1], not percentages.
plt.ylabel('Cumulative explained variance ratio')
plt.title('College Dataset Explained Variance')
plt.xlim([0, x_train_new.shape[1]])
plt.grid()
plt.show()
# Project both splits onto the first two components fitted on the training split.
pca = PCA(n_components=2).fit(x_train_new)
x_train_nw_trfm = pca.transform(x_train_new)
x_test_nw_trfm = pca.transform(x_test_new)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Normalized PCA:
# Fit PCA with all components on the standardized features to see how the
# explained variance accumulates.
pca = PCA().fit(x_train_nw_scld)
# Plotting the cumulative sum of the explained variance ratio
cum_var_scld = np.cumsum(pca.explained_variance_ratio_)
fig, ax = plt.subplots(figsize=(16,14))
# Start x at 1 so each point lines up with the number of components kept.
plt.plot(range(1, len(cum_var_scld) + 1), cum_var_scld)
plt.xlabel('Number of Components')
# The plotted values are ratios in [0, 1], not percentages.
plt.ylabel('Cumulative explained variance ratio')
plt.title('College Dataset Explained Variance (standardized features)')
plt.xlim([0, x_train_nw_scld.shape[1]])
plt.grid()
plt.show()
# Project both standardized splits onto the first ten components fitted on
# the training split.
pca = PCA(n_components=10).fit(x_train_nw_scld)
x_train_nw_scld_trfm = pca.transform(x_train_nw_scld)
x_test_nw_scld_trfm = pca.transform(x_test_nw_scld)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
model = SVC(kernel='linear')
# Compare four preprocessing variants by mean cross-validated accuracy.
# Scaling and PCA are wrapped in pipelines so they are refit on the training
# part of every fold. Scoring arrays that were transformed with statistics
# from the whole training split would leak each validation fold into the
# preprocessing and bias the estimate.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Calculate accuracy scores
score = cross_val_score(model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("Accuracy : {} ".format(score))
nrml_model = make_pipeline(StandardScaler(), SVC(kernel='linear'))
nrml_score = cross_val_score(nrml_model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("Normalized accuracy : {} ".format(nrml_score))
pca_model = make_pipeline(PCA(n_components=2), SVC(kernel='linear'))
pca_score = cross_val_score(pca_model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("PCA accuracy : {} ".format(pca_score))
npca_model = make_pipeline(StandardScaler(), PCA(n_components=10), SVC(kernel='linear'))
npca_score = cross_val_score(npca_model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("Norm+PCA accuracy : {} ".format(npca_score))
# Refit the same estimator on each representation of the full training split
# and predict the matching held-out representation. Each predict runs right
# after its fit, so reusing one `model` object is safe; afterwards `model`
# is left fitted on the last (scaled + PCA) representation.
pred_y = model.fit(x_train_new, y_train).predict(x_test_new)
nrml_pred_y = model.fit(x_train_nw_scld, y_train).predict(x_test_nw_scld)
pca_pred_y = model.fit(x_train_nw_trfm, y_train).predict(x_test_nw_trfm)
npca_pred_y = model.fit(x_train_nw_scld_trfm, y_train).predict(x_test_nw_scld_trfm)
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred). Print each result so it is visible
# outside a notebook and when several calls share one cell.
print("Test accuracy : {} ".format(accuracy_score(y_test, pred_y)))
print("Normalized test accuracy : {} ".format(accuracy_score(y_test, nrml_pred_y)))
print("PCA test accuracy : {} ".format(accuracy_score(y_test, pca_pred_y)))
print("Norm+PCA test accuracy : {} ".format(accuracy_score(y_test, npca_pred_y)))
Normalized observations have the best accuracy for this project.
Improvements