#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Using voice.csv data to classify voices as Male or Female
#libraries
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from feature_selector import FeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
df = pd.read_csv('voice.csv')
df.head()
Applying LabelEncoder to the label column:
df.label = LabelEncoder().fit_transform(df.label)
df.head()
Goal: classify a voice as male or female from its acoustic features.
df.shape
df.info()
Data: separating the features from the target label and splitting into train/test sets:
X = df.drop('label', axis=1)
y = df.label
# testing data size at 20%
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = .2, random_state=4)
x_train.shape
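Before filtering features, a quick check for missing values (a minimal sketch using plain pandas):
# count missing values per column; all zeros confirms the statement below
df.isnull().sum()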
No missing values; next, checking for zero-variance features:
# zero variance (features with only a single unique value)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)
print(x_train.columns[constant_filter.get_support()])
x_train = x_train[x_train.columns[constant_filter.get_support()]]
x_train.var()
Note: this data is not well suited to a low-variance cutoff.
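For illustration only, this is how a nonzero variance cutoff would be applied; because the features here sit on very different scales (see x_train.var() above), no single raw threshold is easy to justify, which is why this step is skipped. The 0.01 value below is an illustrative assumption, not one used in this notebook.
# example only: list the features a raw variance cutoff of 0.01 would drop
low_var_filter = VarianceThreshold(threshold=0.01)
low_var_filter.fit(x_train)
print(x_train.columns[~low_var_filter.get_support()])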
# Correlation matrix for all independent vars
corrMatrix = x_train.corr()
allVars = corrMatrix.keys()
# absolute correlation of each feature with the label
absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train[var])))
# threshold setting
corrTol = 0.75
# for each column in the corr matrix
for col in corrMatrix:
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        # For every field, store its |correlation| with this column:
        # positive if the pair is highly correlated (candidate for removal), negative otherwise
        for i in range(len(corrMatrix)):
            if abs(corrMatrix[col].iloc[i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                thisCorr = (1 if abs(corrMatrix[col].iloc[i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        mask = np.ones(len(thisCol), dtype=bool)  # initialize the mask
        ctDelCol = 0  # to keep track of the number of columns deleted
        for n, j in enumerate(thisCol):
            # Drop a variable if it is highly correlated with 'col' (or a perfect duplicate of it)
            # and is not the best-scoring entry, which is 'col' itself
            mask[n] = not (j != max(thisCol) and j >= 0)
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' % thisVars[n])
                ctDelCol += 1
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
print(corrMatrix.columns)
x_train = x_train[corrMatrix.columns]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
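As a side note, a more compact way to get a similar result is to scan the upper triangle of the correlation matrix and drop one column of each highly correlated pair. This sketch uses the same 0.75 cutoff but, unlike the loop above, simply keeps the first-seen column of each pair; it is shown for reference only and is not part of the pipeline.
# alternative, more compact pruning of highly correlated features (illustrative)
corr_abs = X.corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
to_drop = [c for c in upper.columns if (upper[c] > 0.75).any()]
print(to_drop)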
Filtering for feature importance:
x_2 = x_train
x_2.shape
fs = FeatureSelector(data = x_train, labels = y_train)
fs.identify_zero_importance(task='classification', eval_metric='auc')
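The call above only identifies candidates; assuming the feature-selector package's documented ops dictionary, the flagged features can be inspected like this (a sketch, not part of the original run):
# features the GBM-based selector flagged as zero importance (assumed API)
print(fs.ops['zero_importance'])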
Feature Importance:
etc = RandomForestClassifier()
etc.fit(x_train, y_train)
features = x_train.columns
importances = etc.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
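If one wanted to cut further based on this plot, a sketch for keeping the k most important features could look like the following; k = 12 is an assumption chosen only to match the feature count mentioned below, and the models that follow simply use all remaining columns.
# illustrative: names of the k most important features, largest first
k = 12  # assumption: matches the 12 features referenced below
top_features = [features[i] for i in np.argsort(importances)[::-1][:k]]
print(top_features)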
Testing different models using those 12 features:
# LogisticRegression
logr = LogisticRegression(max_iter=1000)  # raise max_iter to avoid possible convergence warnings on unscaled features
logr.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = logr.predict(x_test)
metrics.accuracy_score(pred_y, y_test)
# RandomForest
rfr = RandomForestClassifier()
rfr.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = rfr.predict(x_test)
metrics.accuracy_score(pred_y, y_test)
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = nb.predict(x_test)
metrics.accuracy_score(pred_y, y_test)
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = svc.predict(x_test)
metrics.accuracy_score(pred_y, y_test)
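LogisticRegression and SVC often benefit from feature scaling, which the runs above do not use; a minimal sketch (not part of the original run) of a scaled SVC:
# scale-sensitive models usually do better inside a scaling pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svc = make_pipeline(StandardScaler(), SVC())
scaled_svc.fit(x_train, y_train)
metrics.accuracy_score(y_test, scaled_svc.predict(x_test))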
The Random Forest classifier has the best accuracy: ~97%.
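Beyond a single accuracy number, a per-class view of the best model (assuming LabelEncoder's alphabetical encoding, female = 0 and male = 1):
# confusion matrix and per-class precision/recall for the random forest
pred_rf = rfr.predict(x_test)
print(metrics.confusion_matrix(y_test, pred_rf))
print(metrics.classification_report(y_test, pred_rf, target_names=['female', 'male']))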
***Improvements:***