#MachineLearning #SupervisedLearning #Classification

By Billy Gustave

StormClassifier

Goal:

  • Predict storm category based on the given data.
    Data: pacific.csv

  • Compare results on the filtered data to results on the unfiltered data
In [1]:
#libraries
import pandas as pd, seaborn as sns, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_selector import FeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

Data Cleaning and Exploration

In [2]:
# Show every column when displaying wide frames (this dataset has 22 columns).
pd.set_option("display.max_columns", 25)
df = pd.read_csv("pacific.csv")
df.head()
Out[2]:
ID Name Date Time Event Status Latitude Longitude Maximum Wind Minimum Pressure Low Wind NE Low Wind SE Low Wind SW Low Wind NW Moderate Wind NE Moderate Wind SE Moderate Wind SW Moderate Wind NW High Wind NE High Wind SE High Wind SW High Wind NW
0 EP011949 UNNAMED 19490611 0 TS 20.2N 106.3W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
1 EP011949 UNNAMED 19490611 600 TS 20.2N 106.4W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
2 EP011949 UNNAMED 19490611 1200 TS 20.2N 106.7W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
3 EP011949 UNNAMED 19490611 1800 TS 20.3N 107.7W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
4 EP011949 UNNAMED 19490612 0 TS 20.4N 108.6W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
In [3]:
# Label-encode the target: pandas' built-in categorical codes map each
# Status string to an integer (an alternative to sklearn's LabelEncoder).
df["Status"] = df["Status"].astype("category").cat.codes
df.head()
Out[3]:
ID Name Date Time Event Status Latitude Longitude Maximum Wind Minimum Pressure Low Wind NE Low Wind SE Low Wind SW Low Wind NW Moderate Wind NE Moderate Wind SE Moderate Wind SW Moderate Wind NW High Wind NE High Wind SE High Wind SW High Wind NW
0 EP011949 UNNAMED 19490611 0 11 20.2N 106.3W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
1 EP011949 UNNAMED 19490611 600 11 20.2N 106.4W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
2 EP011949 UNNAMED 19490611 1200 11 20.2N 106.7W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
3 EP011949 UNNAMED 19490611 1800 11 20.3N 107.7W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999
4 EP011949 UNNAMED 19490612 0 11 20.4N 108.6W 45 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999 -999

Handling missing data

In [4]:
# List all column names so we can choose which ones to drop below.
df.columns
Out[4]:
Index(['ID', 'Name', 'Date', 'Time', 'Event', 'Status', 'Latitude',
       'Longitude', 'Maximum Wind', 'Minimum Pressure', 'Low Wind NE',
       'Low Wind SE', 'Low Wind SW', 'Low Wind NW', 'Moderate Wind NE',
       'Moderate Wind SE', 'Moderate Wind SW', 'Moderate Wind NW',
       'High Wind NE', 'High Wind SE', 'High Wind SW', 'High Wind NW'],
      dtype='object')
In [5]:
# Features: keep only the numeric wind/pressure measurements.
# Identifier, time, and position columns are dropped; Status is the target.
non_feature_cols = ['ID', 'Name', 'Date', 'Time', 'Event', 'Status',
                    'Latitude', 'Longitude']
X = df.drop(non_feature_cols, axis=1)
y = df.Status
In [6]:
# Dtypes and non-null counts; note Latitude/Longitude are strings ("20.2N").
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26137 entries, 0 to 26136
Data columns (total 22 columns):
ID                  26137 non-null object
Name                26137 non-null object
Date                26137 non-null int64
Time                26137 non-null int64
Event               26137 non-null object
Status              26137 non-null int8
Latitude            26137 non-null object
Longitude           26137 non-null object
Maximum Wind        26137 non-null int64
Minimum Pressure    26137 non-null int64
Low Wind NE         26137 non-null int64
Low Wind SE         26137 non-null int64
Low Wind SW         26137 non-null int64
Low Wind NW         26137 non-null int64
Moderate Wind NE    26137 non-null int64
Moderate Wind SE    26137 non-null int64
Moderate Wind SW    26137 non-null int64
Moderate Wind NW    26137 non-null int64
High Wind NE        26137 non-null int64
High Wind SE        26137 non-null int64
High Wind SW        26137 non-null int64
High Wind NW        26137 non-null int64
dtypes: int64(16), int8(1), object(5)
memory usage: 4.2+ MB
In [7]:
# Percentage of missing values per column: the mean of the boolean
# null mask times 100 (equivalent to isnull().sum() / len(df) * 100).
df.isna().mean() * 100
Out[7]:
ID                  0.0
Name                0.0
Date                0.0
Time                0.0
Event               0.0
Status              0.0
Latitude            0.0
Longitude           0.0
Maximum Wind        0.0
Minimum Pressure    0.0
Low Wind NE         0.0
Low Wind SE         0.0
Low Wind SW         0.0
Low Wind NW         0.0
Moderate Wind NE    0.0
Moderate Wind SE    0.0
Moderate Wind SW    0.0
Moderate Wind NW    0.0
High Wind NE        0.0
High Wind SE        0.0
High Wind SW        0.0
High Wind NW        0.0
dtype: float64

No missing values

Handling Unique features

In [8]:
# Wrap features/target in FeatureSelector for automated feature screening.
fs = FeatureSelector(X,y)
In [9]:
# Flag constant features (a single unique value carries no information).
fs.identify_single_unique()
0 features with a single unique value.

No unique features

Handling highly correlated features
Threshold: 75%

In [10]:
# Find feature pairs with correlation magnitude above 0.75, then keep only
# the columns that FeatureSelector did not mark for removal.
fs.identify_collinear(correlation_threshold=0.75)
keep = [col for col in X.columns if col not in fs.ops['collinear']]
keep
11 features with a correlation magnitude greater than 0.75.

Out[10]:
['Maximum Wind', 'Minimum Pressure', 'Low Wind NE']
In [11]:
# Reduce X to the uncorrelated feature subset chosen above.
X = X[keep]
In [12]:
# Correlation heatmap of the remaining features — visually confirms that no
# pair exceeds the 0.75 threshold after filtering.
# Fix: added a title and plt.show() so the figure stands alone and the
# stray "<AxesSubplot ...>" repr no longer clutters the output.
fig, ax = plt.subplots()
sns.heatmap(X.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
ax.set_title('Feature correlations (after collinearity filtering)')
plt.show()
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x199fb7bc708>

Train-Test-Split

In [13]:
# Hold out 30% of the rows for testing; random_state=37 fixes the split so
# the filtered and unfiltered runs below are compared on identical rows.
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = .3, random_state=37)

Feature Importance

In [14]:
etc = RandomForestClassifier()
etc.fit(x_train, y_train)
features = x_train.columns
importances = etc.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

Prediction

In [15]:
# DecisionTree baseline on the filtered features.
# Fixes: accuracy_score conventionally takes y_true first; the
# `x_test = x_test[x_train.columns]` line was a no-op (train/test come from
# the same frame, so the columns already match); random_state pinned.
dtc = DecisionTreeClassifier(random_state=37)
dtc.fit(x_train, y_train)
pred_y = dtc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.9626370823769447
In [16]:
# RandomForest on the filtered features.
# Fixes: `rfr` suggested a regressor — renamed; random_state pinned for
# reproducibility; y_true passed first to accuracy_score; dropped the
# no-op column-reindex line.
rfc = RandomForestClassifier(random_state=37)
rfc.fit(x_train, y_train)
pred_y = rfc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.96327467482785
In [17]:
# Support Vector Machine on the filtered features.
# Fixes: y_true passed first to accuracy_score; dropped the no-op
# column-reindex line.
# NOTE(review): SVC is scale-sensitive and these features (wind speeds,
# pressure) are on very different scales — a StandardScaler pipeline would
# likely improve this; left unscaled to keep results comparable.
svm = SVC()
svm.fit(x_train, y_train)
pred_y = svm.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.8486355521550625

Unfiltered data

In [18]:
# Re-run on the UNFILTERED feature set (all wind/pressure columns) so the
# accuracies can be compared against the collinearity-filtered run above.
X = df.drop(['ID', 'Name', 'Date', 'Time', 'Event', 'Status', 'Latitude',
             'Longitude'], axis=1)
y = df.Status
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=37)

# RandomForest — random_state pinned; y_true passed first to accuracy_score;
# dropped the no-op column-reindex line.
rfc_full = RandomForestClassifier(random_state=37)
rfc_full.fit(x_train, y_train)
pred_y = rfc_full.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.96327467482785
In [19]:
# DecisionTree on the unfiltered features.
# Fixes: random_state pinned; y_true passed first to accuracy_score;
# dropped the no-op column-reindex line.
dtc = DecisionTreeClassifier(random_state=37)
dtc.fit(x_train, y_train)
pred_y = dtc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.9617444529456771
In [20]:
# Support Vector Machine on the unfiltered features.
# Fixes: y_true passed first to accuracy_score; dropped the no-op
# column-reindex line.
# NOTE(review): accuracy drops sharply versus the filtered run — consistent
# with SVC struggling on many correlated, unscaled features; confirm with a
# scaled pipeline before drawing conclusions.
svm = SVC()
svm.fit(x_train, y_train)
pred_y = svm.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
Accuracy:  0.6513644478449375

Contact Me

www.linkedin.com/in/billygustave

billygustave.com