#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

Public/Private College

Goal:

  • Classify colleges as public or private using various features
  • Data: College.csv
  • SVC classifier
  • Feature selection
  • Compare accuracy across raw, normalized, and PCA-transformed features

Data Exploration

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
In [2]:
df = pd.read_csv('College.csv')
df.shape
Out[2]:
(777, 18)
In [3]:
df.head()
Out[3]:
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
2 Yes 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
3 Yes 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
4 Yes 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15
In [4]:
df.groupby('Private').size()
Out[4]:
Private
No     212
Yes    565
dtype: int64
In [5]:
# Yes:1, No:0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df.Private)
df['Private'] = le.transform(df.Private)
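
For a binary column like this, an equivalent one-liner is a plain mapping (a sketch, not part of the original notebook, assuming the column contains only 'Yes'/'No'; LabelEncoder sorts labels alphabetically, so it produces the same No:0, Yes:1 coding):

# equivalent mapping without LabelEncoder (assumes only 'Yes'/'No' values)
df['Private'] = df['Private'].map({'No': 0, 'Yes': 1})
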
In [6]:
df.head()
Out[6]:
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 1 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 1 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
2 1 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
3 1 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
4 1 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15

Feature distribution:

In [7]:
df.hist(figsize=(18, 18))
[Output: grid of histograms, one per feature]

Data Cleaning

In [8]:
# checking for missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
Private        777 non-null int32
Apps           777 non-null int64
Accept         777 non-null int64
Enroll         777 non-null int64
Top10perc      777 non-null int64
Top25perc      777 non-null int64
F.Undergrad    777 non-null int64
P.Undergrad    777 non-null int64
Outstate       777 non-null int64
Room.Board     777 non-null int64
Books          777 non-null int64
Personal       777 non-null int64
PhD            777 non-null int64
Terminal       777 non-null int64
S.F.Ratio      777 non-null float64
perc.alumni    777 non-null int64
Expend         777 non-null int64
Grad.Rate      777 non-null int64
dtypes: float64(1), int32(1), int64(16)
memory usage: 106.4 KB

No missing values
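
df.info() already shows 777 non-null entries in every column; an explicit check would be (a minimal sketch):

# count missing values per column -- all zeros confirms a complete dataset
df.isnull().sum()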

In [9]:
# Features and Target
X = df.drop(['Private'], axis=1)
y = df.Private
X.shape
Out[9]:
(777, 17)
In [10]:
# checking for features with a single unique value
from feature_selector import FeatureSelector
fs = FeatureSelector(data = X, labels = y)
fs.identify_single_unique()
0 features with a single unique value.
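
The same check can be done in plain pandas if feature_selector is not installed (a sketch):

# pandas equivalent of the single-unique-value check
constant_cols = X.columns[X.nunique() == 1]
print(len(constant_cols), "features with a single unique value.")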

In [11]:
# train/test split with 20% testing data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=31)
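
Since the classes are imbalanced (212 No vs. 565 Yes), a stratified split would keep the class ratio the same in both sets; a sketch of that variant:

# stratify=y preserves the ~27/73 class ratio in both train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=31, stratify=y)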

Feature Selection

Using KFold and cross-validation:

In [12]:
from sklearn.model_selection import cross_val_score, KFold # StratifiedKFold is a better choice for imbalanced classes
kfold = KFold(n_splits=5, random_state=17, shuffle=True)
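
As the comment notes, StratifiedKFold is usually the better choice with imbalanced classes; the drop-in replacement would be (a sketch):

from sklearn.model_selection import StratifiedKFold
# keeps the Yes/No ratio roughly constant across folds
kfold = StratifiedKFold(n_splits=5, random_state=17, shuffle=True)
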
In [13]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
model = SVC(kernel='linear')
rfecv = RFECV(estimator=model, cv=kfold, scoring='accuracy')
rfecv.fit(x_train, y_train)
Out[13]:
RFECV(cv=KFold(n_splits=5, random_state=17, shuffle=True),
      estimator=SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                    coef0=0.0, decision_function_shape='ovr', degree=3,
                    gamma='scale', kernel='linear', max_iter=-1,
                    probability=False, random_state=None, shrinking=True,
                    tol=0.001, verbose=False),
      min_features_to_select=1, n_jobs=None, scoring='accuracy', step=1,
      verbose=0)
In [14]:
plt.figure()
plt.title('SVC CV Score vs. Number of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation accuracy")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
In [15]:
# keep only the features RFECV marked as selected
new_features = list(X.columns[rfecv.support_])
print(new_features)
['Apps', 'Enroll', 'Top10perc', 'Top25perc', 'Outstate', 'Books', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Grad.Rate']
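
RFECV also exposes the selected feature count directly (a quick check, not in the original run):

# number of features RFECV decided to keep (11 here)
rfecv.n_features_
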
In [17]:
# Calculate accuracy scores 
x_train_new = x_train[new_features]
x_test_new = x_test[new_features]
initial_score = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy').mean()
print("Initial accuracy : {} ".format(initial_score))
fe_score = cross_val_score(model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("Accuracy after Feature Selection : {} ".format(fe_score))
Initial accuracy : 0.9258967741935484 
Accuracy after Feature Selection : 0.9226967741935483 

Feature Transformation

Normalized (zero mean, unit variance via StandardScaler):

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train_new)
x_train_nw_scld = scaler.transform(x_train_new)
x_test_nw_scld = scaler.transform(x_test_new)
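
The scaler is fit on the training split only, which avoids leaking test statistics. Inside cross-validation the same discipline can be automated with a Pipeline (a sketch, not part of the original notebook):

from sklearn.pipeline import make_pipeline
# the pipeline re-fits the scaler on each CV training fold, preventing leakage
pipe = make_pipeline(StandardScaler(), SVC(kernel='linear'))
cross_val_score(pipe, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()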

PCA:

In [23]:
from sklearn.decomposition import PCA
pca = PCA().fit(x_train_new)
# plot the cumulative sum of the explained variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('College Dataset Explained Variance')
plt.xlim([0, x_train_new.shape[1]])
plt.grid()
plt.show()
In [24]:
pca = PCA(n_components=2).fit(x_train_new)
x_train_nw_trfm = pca.transform(x_train_new)
x_test_nw_trfm = pca.transform(x_test_new)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[24]:
0.9926068404021873
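
Without scaling, PCA is dominated by the features with the largest raw variance (dollar- and count-scale columns such as Apps and Outstate), which is why two components already explain ~99%. Printing the per-component ratios makes this visible (a sketch):

# the first component alone carries most of the raw (unscaled) variance
print(pca.explained_variance_ratio_)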

Normalized PCA:

In [25]:
pca = PCA().fit(x_train_nw_scld)
# plot the cumulative sum of the explained variance
fig, ax = plt.subplots(figsize=(16,14))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('College Dataset Explained Variance')
plt.xlim([0, x_train_nw_scld.shape[1]])
plt.grid()
plt.show()
In [27]:
pca = PCA(n_components=10).fit(x_train_nw_scld)
x_train_nw_scld_trfm = pca.transform(x_train_nw_scld)
x_test_nw_scld_trfm = pca.transform(x_test_nw_scld)
# total explained variance ratio
pca.explained_variance_ratio_.sum()
Out[27]:
0.9913149427586461
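
Instead of hand-picking n_components from the plot, PCA also accepts a float and keeps the smallest number of components reaching that variance share (a sketch):

# keep enough components to explain at least 99% of the variance
pca99 = PCA(n_components=0.99).fit(x_train_nw_scld)
pca99.n_components_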

Predict

In [30]:
model = SVC(kernel='linear')
# Calculate accuracy scores 
score = cross_val_score(model, x_train_new, y_train, cv=kfold, scoring='accuracy').mean()
print("Accuracy : {} ".format(score))
nrml_score = cross_val_score(model, x_train_nw_scld, y_train, cv=kfold, scoring='accuracy').mean()
print("Normalized accuracy : {} ".format(nrml_score))
pca_score = cross_val_score(model, x_train_nw_trfm, y_train, cv=kfold, scoring='accuracy').mean()
print("PCA accuracy : {} ".format(pca_score))
npca_score = cross_val_score(model, x_train_nw_scld_trfm, y_train, cv=kfold, scoring='accuracy').mean()
print("Norm+PCA accuracy : {} ".format(npca_score))
Accuracy : 0.9226967741935483 
Normalized accuracy : 0.9420129032258064 
PCA accuracy : 0.9114322580645162 
Norm+PCA accuracy : 0.9404129032258066 
In [31]:
pred_y = model.fit(x_train_new,y_train).predict(x_test_new)
nrml_pred_y = model.fit(x_train_nw_scld,y_train).predict(x_test_nw_scld)
pca_pred_y = model.fit(x_train_nw_trfm,y_train).predict(x_test_nw_trfm)
npca_pred_y = model.fit(x_train_nw_scld_trfm,y_train).predict(x_test_nw_scld_trfm)
In [32]:
from sklearn.metrics import accuracy_score
In [33]:
accuracy_score(pred_y, y_test)
Out[33]:
0.9358974358974359
In [34]:
accuracy_score(nrml_pred_y, y_test)
Out[34]:
0.9423076923076923
In [35]:
accuracy_score(pca_pred_y, y_test)
Out[35]:
0.9166666666666666
In [36]:
accuracy_score(npca_pred_y, y_test)
Out[36]:
0.9358974358974359
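
Accuracy alone hides which class gets misclassified; a confusion matrix for the best (normalized) model would show the per-class breakdown (a sketch):

from sklearn.metrics import confusion_matrix
# rows: true class (0=No, 1=Yes); columns: predicted class
confusion_matrix(y_test, nrml_pred_y)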

Normalized features give the best accuracy for this project (94.2% on the held-out test set).

Improvements

  • High correlation filtering
  • GridSearchCV parameter fine-tuning (see the sketch below)
  • Better model
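
As a starting point for the tuning bullet above, a minimal GridSearchCV sketch over C and the kernel (the grid values are illustrative assumptions, not tuned choices):

from sklearn.model_selection import GridSearchCV
# small illustrative grid; widen the ranges for a real search
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, cv=kfold, scoring='accuracy')
grid.fit(x_train_nw_scld, y_train)
print(grid.best_params_, grid.best_score_)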

Contact Me

www.linkedin.com/in/billygustave

billygustave.com