#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal: predict the diagnosis as either Malignant or Benign.
Data: data.csv
# libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from feature_selector import FeatureSelector
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# changing dataframe setting to display all columns
pd.options.display.max_columns = 40
df = pd.read_csv("data.csv")
df.head(3)
df.shape
df.describe()
Handling missing data
df.info()
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
# dropping 'Unnamed: 32', which is 100% missing
df.drop('Unnamed: 32', axis=1, inplace=True)
df.isnull().sum()/len(df)*100
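As a side note, the same cleanup can be done without naming the column: pandas can drop every fully-empty column in one call. A minimal sketch (equivalent here, since 'Unnamed: 32' is the only 100%-missing column):
# drop any column whose values are all missing
df = df.dropna(axis=1, how='all')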
# features and target
X = df.drop(['id','diagnosis'], axis=1)
# encode the target: Malignant -> 1, Benign -> 0
df.diagnosis = df.diagnosis.map({'M':1,'B':0})
y = df.diagnosis
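Since accuracy is used as the cross-validation metric later, a quick look at the class balance helps confirm it is a reasonable choice; the sketch below just reports the share of malignant vs. benign cases.
# proportion of each class (1 = malignant, 0 = benign)
y.value_counts(normalize=True)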
Handling constant features
# zero-variance features (only one unique value)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)
columns_to_remove = [name for name in X.columns if name not in X.columns[constant_filter.get_support()]]
print('Constant features: ', columns_to_remove)
No constant features found.
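For a quick cross-check without scikit-learn, the same result can be obtained by counting unique values per column with plain pandas (a minimal sketch):
# columns holding a single value across all rows
[col for col in X.columns if X[col].nunique() <= 1]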
Handling highly correlated features
Correlation threshold: 65%
fs = FeatureSelector(data = X, labels = y)
fs.identify_collinear(correlation_threshold=0.65)
keep = [name for name in X.columns if name not in fs.ops['collinear']]
keep
X = X[keep]
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(X.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
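For reference, FeatureSelector's collinearity check essentially scans the upper triangle of the absolute correlation matrix; a plain-pandas sketch of the same idea is shown below (the 0.65 threshold matches the one used above).
# flag one column from each pair whose absolute correlation exceeds 0.65
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.65).any()]
print('columns above the 0.65 threshold: ', to_drop)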
Handling Feature Importance
fs = FeatureSelector(data = X, labels = y)
fs.identify_zero_importance(task='classification', eval_metric='auc')
No zero-importance features found.
rfc = RandomForestClassifier()
rfc.fit(X, y)
features = X.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
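Impurity-based importances from a random forest can favour features with many distinct values, so a permutation-importance cross-check is often worthwhile. The sketch below uses scikit-learn's permutation_importance on a held-out split; the split size and random_state are illustrative choices, not part of the original analysis.
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rfc_check = RandomForestClassifier(random_state=42).fit(X_train, y_train)
perm = permutation_importance(rfc_check, X_test, y_test, n_repeats=10, random_state=42)
# rank features from most to least important under permutation
for i in np.argsort(perm.importances_mean)[::-1]:
    print(f'{X.columns[i]}: {perm.importances_mean[i]:.4f}')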
# LogisticRegression
model = LogisticRegression()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
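One caveat: LogisticRegression on unscaled features can hit scikit-learn's default iteration limit and emit convergence warnings. A common remedy is to standardise the features inside a pipeline; the snippet below is a sketch of that variant (max_iter=1000 is an assumed setting, not part of the original run).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cross_val_score(scaled_lr, X, y, cv=5, scoring='accuracy').mean()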
# DecisionTreeClassifier
model = DecisionTreeClassifier()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
# RandomForest
model = RandomForestClassifier()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
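The three scores above can also be collected in a single loop for a side-by-side view, including the spread across folds; a small sketch (the max_iter and random_state values are illustrative):
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
}
for name, m in models.items():
    scores = cross_val_score(m, X, y, cv=5, scoring='accuracy')
    print(f'{name}: {scores.mean():.4f} (+/- {scores.std():.4f})')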
Side investigation: sequential forward feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# forward selection of the best 8-feature subset, using the RandomForest model above
sfs1 = SFS(estimator=model, k_features=8, forward=True, floating=False, scoring='accuracy', cv=5)
sfs1 = sfs1.fit(X, y)
for k, v in sfs1.subsets_.items():
    print('avg_score: ', v.get('avg_score'))
    print('feature_names: ', v.get('feature_names'))
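After fitting, mlxtend also exposes the best subset directly, which saves scanning the full subsets_ dictionary:
# score and feature names of the best subset found
print('best avg_score: ', sfs1.k_score_)
print('best feature_names: ', sfs1.k_feature_names_)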