#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal:
Predict the survival of a horse based on various observed medical conditions.
Data: horse.csv
Also comparing 2 classifiers: DecisionTreeClassifier and RandomForestClassifier.
# libraries
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from feature_selector import FeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# changing dataframe setting to display all columns
pd.options.display.max_columns = 30
df = pd.read_csv('horse.csv')
df.head()
df.shape
df.info()
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
Missing values
Imputation techniques we will use:
Only a few features have missing values, so imputation can be applied (a small SimpleImputer sketch follows the list below).
3 types of missing values:
- MCAR: Missing Completely At Random - cannot perform imputation
- MAR: Missing At Random - can perform imputation
- NMAR: Not Missing At Random - structured missing values
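For reference, the same imputation choices used later in this notebook (mode for categoricals, median for numericals) can also be sketched with scikit-learn's SimpleImputer. The column names below are just a couple of examples from this dataset, and this cell is a side sketch rather than part of the main flow.
# sketch: SimpleImputer as an alternative to the manual fillna steps below
from sklearn.impute import SimpleImputer
cat_cols = ['surgery', 'pain']         # example categorical columns
num_cols = ['rectal_temp', 'pulse']    # example numerical columns
demo = df[cat_cols + num_cols].copy()  # work on a copy so the main flow is unchanged
demo[cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(demo[cat_cols])
demo[num_cols] = SimpleImputer(strategy='median').fit_transform(demo[num_cols])
demo.isnull().sum()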
# features and target
X = df.drop(['hospital_number','outcome'],axis=1)
y = df.outcome
# removing features with 95% missing data:
abdomo_appearance = df.abdomo_appearance
def removing_high_missing(data, threshold=95):
    for column in data.columns:
        missing_ratio = data[column].isnull().sum()/len(data[column])*100
        if missing_ratio >= threshold:
            data = data.drop([column], axis=1)
    return data
X = removing_high_missing(X)
# filling in categorical missing values:
categorical_features = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse','mucous_membrane',
'capillary_refill_time', 'pain', 'peristalsis','abdominal_distention', 'nasogastric_tube',
'nasogastric_reflux', 'rectal_exam_feces', 'abdomen','abdomo_appearance', 'surgical_lesion', 'cp_data']
def fill_categorical_missing(data, column_name_list):
    for column in column_name_list:
        missing_ratio = data[column].isnull().sum()/len(data[column])*100
        if missing_ratio <= 50:
            data[column].fillna(data[column].value_counts().index[0], inplace=True)  # mode
        elif 50 < missing_ratio < 95:
            data[column].fillna('Missing', inplace=True)  # flag
    return data
X = fill_categorical_missing(X, categorical_features)
# filling in continuous missing values:
X.info()
numerical_features_missings = ['rectal_temp','pulse','respiratory_rate','nasogastric_reflux_ph','packed_cell_volume',
'total_protein','abdomo_protein']
numerical_features = ['rectal_temp','pulse','respiratory_rate','nasogastric_reflux_ph','packed_cell_volume',
'total_protein','abdomo_protein','lesion_1','lesion_2','lesion_3']
count = 0
fig, axes = plt.subplots(1, 7, figsize=(21,4))
for column in numerical_features_missings:
    X[column].hist(ax=axes[count])
    axes[count].set_title(column)
    count += 1
plt.show()
We will use mean for 'rectal_temp' and median for the rest:
for column in numerical_features_missings:
    if column == 'rectal_temp':
        mean_value = X[column].mean()
        X[column].fillna(mean_value, inplace=True)
    else:
        median_value = X[column].median()
        X[column].fillna(median_value, inplace=True)
X.isnull().sum()/len(X)*100
X.info()
Encoding Categoricals
# dummies for features:
X = pd.get_dummies(X, columns=categorical_features,drop_first=True)
y.unique()
# mapping for target {'died':0,'euthanized':1,'lived':2}
y = y.map({'died':0,'euthanized':1,'lived':2})
Train-Test-Split
# random_state guarantees the same split every time the program is run
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state=1)
Removing single-value (zero-variance) features
from sklearn.feature_selection import VarianceThreshold
# zero variance (features with a single unique value)
x_train_num = x_train[numerical_features]
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train_num)
print(x_train_num.columns[constant_filter.get_support()])
x_train_num = x_train_num[x_train_num.columns[constant_filter.get_support()]]
No zero-variance features found
Removing Highly correlated features
Threshold: 75%
# Correlation matrix for all independent vars
corrMatrix = x_train_num.corr()
allVars = corrMatrix.keys()
absCorrWithDep = []
for var in allVars:
    absCorrWithDep.append(abs(y.corr(x_train_num[var])))
# threshold setting
corrTol = 0.75
# for each column in the corr matrix
for col in corrMatrix:
    if col in corrMatrix.keys():
        thisCol = []
        thisVars = []
        temp = corrMatrix[col]
        # Store the corr with the dep var for fields that are highly correlated with each other
        for i in range(len(corrMatrix)):
            if abs(corrMatrix[col].iloc[i]) == 1.0 and col != corrMatrix.keys()[i]:
                thisCorr = 0
            else:
                thisCorr = (1 if abs(corrMatrix[col].iloc[i]) > corrTol else -1) * abs(temp[corrMatrix.keys()[i]])
            thisCol.append(thisCorr)
            thisVars.append(corrMatrix.keys()[i])
        mask = np.ones(len(thisCol), dtype=bool)  # Initialize the mask
        ctDelCol = 0  # To keep track of the number of columns deleted
        for n, j in enumerate(thisCol):
            # Delete if (a) a var is correlated with others and does not have the best corr with the dep var,
            # or (b) it is completely correlated with 'col'
            mask[n] = not (j != max(thisCol) and j >= 0)
            if j != max(thisCol) and j >= 0:
                # Delete the column from the corr matrix
                corrMatrix.pop('%s' % thisVars[n])
                ctDelCol += 1
        # Delete the corresponding row(s) from the corr matrix
        corrMatrix = corrMatrix[mask]
len(corrMatrix.columns)
len(x_train_num.columns)
No highly correlated features found
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(x_train_num.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
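As a cross-check on the loop above, here is a hedged, more compact sketch: scan the upper triangle of the absolute correlation matrix and list any columns whose pairwise correlation exceeds corrTol. The list should come back empty here, consistent with the result above.
# sketch: upper-triangle scan for feature pairs above the correlation threshold
corr_abs = x_train[numerical_features].corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))
high_corr_cols = [col for col in upper.columns if (upper[col] > corrTol).any()]
print(high_corr_cols)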
x_train.info()
Removing zero-importance features
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
features = x_train.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
# removing lesion_3
x_train = x_train.drop('lesion_3',axis=1)
x_test = x_test.drop('lesion_3',axis=1)
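The drop above hard-codes 'lesion_3'; a small sketch (reusing the features and importances computed above) for listing every zero-importance feature programmatically instead:
# sketch: collect all features the fitted forest assigned exactly zero importance
# (a small tolerance could be used instead of an exact zero)
zero_importance = [f for f, imp in zip(features, importances) if imp == 0]
print(zero_importance)
# these could then be dropped with x_train.drop(columns=zero_importance, errors='ignore')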
DecisionTreeClassifier
Without cross-validation
# DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = dtc.predict(x_test)
metrics.accuracy_score(y_test, pred_y)
# RandomForest
rfr = RandomForestClassifier()
rfr.fit(x_train,y_train)
x_test = x_test[x_train.columns]
pred_y = rfr.predict(x_test)
metrics.accuracy_score(y_test, pred_y)
With cross-validation (using train data only)
from sklearn.model_selection import KFold, cross_val_score
# DecisionTreeClassifier
model = DecisionTreeClassifier()
result = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
result.mean()
# RandomForest
model = RandomForestClassifier()
result = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
result.mean()
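Since KFold is already imported, the folds can also be made explicit and reproducible; a small sketch (the shuffle and random_state values are illustrative):
# sketch: reproducible cross-validation folds via an explicit KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=1)
result = cross_val_score(RandomForestClassifier(random_state=1), x_train, y_train, cv=kf, scoring='accuracy')
result.mean()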
RandomForest generally gives better results than a single DecisionTree because it is an ensemble of many decision trees trained on bootstrapped samples with random feature subsets; averaging their votes reduces the variance (overfitting) of any single tree.
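One rough way to see this effect (a side sketch, not part of the original comparison) is to vary n_estimators: a forest of a single tree behaves much like a lone decision tree, and scores usually stabilize as more trees are added.
# sketch: cross-validated accuracy as the number of trees grows
for n in [1, 10, 100]:
    score = cross_val_score(RandomForestClassifier(n_estimators=n, random_state=1),
                            x_train, y_train, cv=5, scoring='accuracy').mean()
    print(n, round(score, 3))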
Improvements