#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

Cancer Diagnosis

Goal:

  • Predict the diagnosis as either Malignant or Benign.
    Data: data.csv

  • Compare cross-validated results from different classifiers.
In [1]:
# libraries
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
from feature_selector import FeatureSelector  # open-source feature-selector package
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

Data Cleaning and Exploration

In [2]:
# changing dataframe setting to display all columns
pd.options.display.max_columns = 40
df = pd.read_csv("data.csv")
df.head(3)
Out[2]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.8 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193 25.38 17.33 184.6 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.9 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 0.5435 0.7339 3.398 74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532 24.99 23.41 158.8 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.0 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 0.7456 0.7869 4.585 94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571 23.57 25.53 152.5 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
In [3]:
df.shape
Out[3]:
(569, 33)
In [4]:
df.describe()
Out[4]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se area_se smoothness_se compactness_se concavity_se concave points_se symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 0.0
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 0.405172 1.216853 2.866059 40.337079 0.007041 0.025478 0.031894 0.011796 0.020542 0.003795 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 NaN
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 0.277313 0.551648 2.021855 45.491006 0.003003 0.017908 0.030186 0.006170 0.008266 0.002646 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 NaN
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 0.111500 0.360200 0.757000 6.802000 0.001713 0.002252 0.000000 0.000000 0.007882 0.000895 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 NaN
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 0.232400 0.833900 1.606000 17.850000 0.005169 0.013080 0.015090 0.007638 0.015160 0.002248 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 NaN
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 0.324200 1.108000 2.287000 24.530000 0.006380 0.020450 0.025890 0.010930 0.018730 0.003187 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 NaN
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 0.478900 1.474000 3.357000 45.190000 0.008146 0.032450 0.042050 0.014710 0.023480 0.004558 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 NaN
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 2.873000 4.885000 21.980000 542.200000 0.031130 0.135400 0.396000 0.052790 0.078950 0.029840 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 NaN

Handling missing data

In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
Unnamed: 32                0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [6]:
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
Out[6]:
id                           0.0
diagnosis                    0.0
radius_mean                  0.0
texture_mean                 0.0
perimeter_mean               0.0
area_mean                    0.0
smoothness_mean              0.0
compactness_mean             0.0
concavity_mean               0.0
concave points_mean          0.0
symmetry_mean                0.0
fractal_dimension_mean       0.0
radius_se                    0.0
texture_se                   0.0
perimeter_se                 0.0
area_se                      0.0
smoothness_se                0.0
compactness_se               0.0
concavity_se                 0.0
concave points_se            0.0
symmetry_se                  0.0
fractal_dimension_se         0.0
radius_worst                 0.0
texture_worst                0.0
perimeter_worst              0.0
area_worst                   0.0
smoothness_worst             0.0
compactness_worst            0.0
concavity_worst              0.0
concave points_worst         0.0
symmetry_worst               0.0
fractal_dimension_worst      0.0
Unnamed: 32                100.0
dtype: float64
In [7]:
# dropping Unnamed: 32
df.drop('Unnamed: 32', axis=1, inplace=True)
df.isnull().sum()/len(df)*100
Out[7]:
id                         0.0
diagnosis                  0.0
radius_mean                0.0
texture_mean               0.0
perimeter_mean             0.0
area_mean                  0.0
smoothness_mean            0.0
compactness_mean           0.0
concavity_mean             0.0
concave points_mean        0.0
symmetry_mean              0.0
fractal_dimension_mean     0.0
radius_se                  0.0
texture_se                 0.0
perimeter_se               0.0
area_se                    0.0
smoothness_se              0.0
compactness_se             0.0
concavity_se               0.0
concave points_se          0.0
symmetry_se                0.0
fractal_dimension_se       0.0
radius_worst               0.0
texture_worst              0.0
perimeter_worst            0.0
area_worst                 0.0
smoothness_worst           0.0
compactness_worst          0.0
concavity_worst            0.0
concave points_worst       0.0
symmetry_worst             0.0
fractal_dimension_worst    0.0
dtype: float64
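
The same cleanup generalizes to any mostly-empty column. A minimal sketch that drops every column whose missing fraction exceeds a cutoff; the 0.5 threshold is an illustrative assumption, not part of this analysis:

# drop any column that is more than 50% missing (illustrative cutoff)
missing_frac = df.isnull().mean()
df = df.drop(columns=missing_frac[missing_frac > 0.5].index)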
In [8]:
# features and target (encode diagnosis as 1 = malignant, 0 = benign)
X = df.drop(['id','diagnosis'], axis=1)
df.diagnosis = df.diagnosis.map({'M':1,'B':0})
y = df.diagnosis

Handling unique (zero-variance) features

In [9]:
# zero variance (unique values)
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)
columns_to_remove = [name for name in X.columns if name not in X.columns[constant_filter.get_support()]]
print('Unique features: ', columns_to_remove)
Unique features:  []

No zero-variance features found
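
Since nothing was removed, the same filter could be extended to quasi-constant features. A sketch, where the 0.01 variance cutoff is an illustrative assumption; because these features sit on very different scales, a variance cutoff is only meaningful after scaling:

# flag near-constant features (illustrative, scale-dependent threshold)
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(X)
quasi_constant = [name for name in X.columns if name not in X.columns[quasi_constant_filter.get_support()]]
print('Quasi-constant features:', quasi_constant)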

Handling highly correlated features
Threshold: 0.65 (absolute correlation)

In [10]:
fs = FeatureSelector(data = X, labels = y)
fs.identify_collinear(correlation_threshold=0.65)
keep = [name for name in X.columns if name not in fs.ops['collinear']]
keep
22 features with a correlation magnitude greater than 0.65.

Out[10]:
['radius_mean',
 'texture_mean',
 'smoothness_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'texture_se',
 'smoothness_se',
 'symmetry_se']
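
For readers without the feature_selector package, a pandas-only sketch of the same collinearity screen (it greedily drops the later column of each highly correlated pair, so the exact columns kept may differ from feature_selector's choices):

# drop any column whose absolute correlation with an earlier column exceeds 0.65
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.65).any()]
keep_alt = [col for col in X.columns if col not in to_drop]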
In [11]:
X = X[keep]
In [12]:
fig, ax = plt.subplots(figsize=(16,14))
sns.heatmap(X.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c0c6bc1c08>

Handling Feature Importance

In [13]:
fs = FeatureSelector(data = X, labels = y)
fs.identify_zero_importance(task='classification', eval_metric='auc')
Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[82]	valid_0's auc: 0.987269	valid_0's binary_logloss: 0.166614
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.96875	valid_0's binary_logloss: 0.196911
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[58]	valid_0's auc: 0.987847	valid_0's binary_logloss: 0.164243
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's auc: 0.973958	valid_0's binary_logloss: 0.234046
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's auc: 0.996528	valid_0's binary_logloss: 0.139927
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[64]	valid_0's auc: 0.987847	valid_0's binary_logloss: 0.17107
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's auc: 0.983218	valid_0's binary_logloss: 0.163392
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[130]	valid_0's auc: 0.990162	valid_0's binary_logloss: 0.143988
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[71]	valid_0's auc: 0.961806	valid_0's binary_logloss: 0.232261
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[80]	valid_0's auc: 0.96875	valid_0's binary_logloss: 0.198908

0 features with zero importance after one-hot encoding.

No zero-importance features found

In [14]:
# impurity-based importances from a random forest fit on the retained features
rfc = RandomForestClassifier()
rfc.fit(X, y)
features = X.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
fig, ax = plt.subplots(figsize=(16,14))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
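
Impurity-based importances can be biased toward features with many split points, so permutation importance is a common cross-check. A sketch; the train/test split and fixed seeds are illustrative choices, not part of the original analysis:

# permutation importance on a held-out split (illustrative setup)
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
rfc2 = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
perm = permutation_importance(rfc2, X_te, y_te, n_repeats=10, random_state=0)
for i in perm.importances_mean.argsort()[::-1]:
    print(f'{X.columns[i]:<25} {perm.importances_mean[i]:.4f}')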

Prediction

In [15]:
# LogisticRegression (defaults; may emit a convergence warning on unscaled data)
model = LogisticRegression()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
Out[15]:
0.887532991771464
In [16]:
# DecisionTreeClassifier
model = DecisionTreeClassifier()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
Out[16]:
0.8786989597888526
In [17]:
# RandomForest
model = RandomForestClassifier()
result = cross_val_score(model, X, y, cv=5, scoring='accuracy')
result.mean()
Out[17]:
0.935010091600683
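
The random forest comes out ahead. For a side-by-side view, a compact sketch looping over all three models (random_state and max_iter are fixed here for repeatability, so scores may differ slightly from the individual runs above):

# compare all three classifiers under identical 5-fold CV
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
}
for name, m in models.items():
    scores = cross_val_score(m, X, y, cv=5, scoring='accuracy')
    print(f'{name:<20} mean={scores.mean():.4f} std={scores.std():.4f}')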

Side investigation

In [18]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
In [19]:
sfs1 = SFS(estimator=model, k_features=8, forward=True, floating=False, scoring='accuracy', cv=5)
sfs1 = sfs1.fit(X, y)
for k,v in sfs1.subsets_.items():
    print('avg_score: ',v.get('avg_score'))
    print('feature_names: ',v.get('feature_names'))
avg_score:  0.8260052786834342
feature_names:  ('radius_mean',)
avg_score:  0.8875174662319516
feature_names:  ('radius_mean', 'symmetry_mean')
avg_score:  0.9104176370128861
feature_names:  ('radius_mean', 'texture_mean', 'symmetry_mean')
avg_score:  0.9226983387672721
feature_names:  ('radius_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean')
avg_score:  0.9314547430523211
feature_names:  ('radius_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean', 'symmetry_se')
avg_score:  0.9296848315478963
feature_names:  ('radius_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean', 'texture_se', 'symmetry_se')
avg_score:  0.935010091600683
feature_names:  ('radius_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'texture_se', 'symmetry_se')
avg_score:  0.9314702685918336
feature_names:  ('radius_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean', 'fractal_dimension_mean', 'texture_se', 'smoothness_se', 'symmetry_se')
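
The best forward-selection score (0.93501 with 7 features) matches the 5-fold CV accuracy of the random forest on all 8 features, suggesting smoothness_se adds little once the other features are present. The selection path can also be plotted with mlxtend's helper (a sketch, assuming a recent mlxtend):

# visualize CV accuracy as features are added
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.title('Forward selection (5-fold CV accuracy)')
plt.show()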

Contact Me

www.linkedin.com/in/billygustave

billygustave.com