#MachineLearning #SupervisedLearning #Classification
By Billy Gustave
Goal:
Predict storm category based on the given data.
Data: pacific.csv
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from feature_selector import FeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# raising the display limit so all columns of the dataset are shown
pd.options.display.max_columns = 25
df = pd.read_csv("pacific.csv")
df.head()
# changing Status column to categorical (pandas' built-in alternative to label encoding)
df.Status = pd.Categorical(df.Status)
df.Status = df.Status.cat.codes
df.head()
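Since df.Status now holds integer codes, it helps to keep the code-to-label mapping so predictions can be read back as storm categories later. A minimal sketch, re-reading the raw column since the original strings were just overwritten (the status_labels name is only for illustration):
# maps each integer code back to its original status string
status_labels = dict(enumerate(pd.read_csv("pacific.csv").Status.astype('category').cat.categories))
status_labels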
Handling missing data
df.columns
# features and target
X = df.drop(['ID', 'Name', 'Date', 'Time', 'Event', 'Status', 'Latitude',
'Longitude'], axis=1)
y = df.Status
df.info()
# checking the percentage of missing values in each variable
df.isnull().sum()/len(df)*100
No missing values
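Nothing to impute here, though some public versions of this dataset mark unknowns with sentinel values such as -999 rather than NaN, which this check would not catch. If genuine gaps ever appeared, median imputation would be one simple option; a sketch using scikit-learn's SimpleImputer:
from sklearn.impute import SimpleImputer
# replace any NaNs in the numeric features with the column median
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)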
Handling single-unique features
fs = FeatureSelector(X,y)
fs.identify_single_unique()
No single-unique features
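The same check works in plain pandas for anyone without the feature_selector package; a quick equivalent with nunique:
# columns with a single distinct value carry no information
[col for col in X.columns if X[col].nunique() == 1]  # expect an empty list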
Handling highly correlated features
Threshold: 75%
fs.identify_collinear(correlation_threshold=0.75)
keep = [name for name in X.columns if name not in fs.ops['collinear']]
keep
X = X[keep]
fig, ax = plt.subplots()
sns.heatmap(X.corr(), cmap='Reds', annot=True, linewidths=.5, ax=ax)
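As a sanity check, identify_collinear can be approximated in plain pandas by scanning the upper triangle of the absolute correlation matrix; run on the filtered X, it should now flag nothing:
# plain-pandas equivalent of identify_collinear at the same 0.75 threshold
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
[col for col in upper.columns if (upper[col] > 0.75).any()]  # expect []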
Train-Test-Split
# testing data size at 30%
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37)
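Storm status classes are typically imbalanced (tropical storms and depressions dominate), so a stratified split that preserves class proportions in both sets is worth considering; a drop-in variant of the same call:
# stratified split; requires at least two samples of every class
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37, stratify=y)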
Feature Importance
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
features = x_train.columns
importances = rfc.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()
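Impurity-based importances from tree ensembles can be biased toward high-cardinality features, so permutation importance on the held-out set is a useful cross-check. A minimal sketch with scikit-learn's permutation_importance:
from sklearn.inspection import permutation_importance
# shuffle each feature in the test set and measure the drop in accuracy
result = permutation_importance(rfc, x_test, y_test, n_repeats=5, random_state=37)
for name, imp in sorted(zip(features, result.importances_mean), key=lambda t: -t[1]):
    print(f'{name}: {imp:.4f}')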
# DecisionTree
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
pred_y = dtc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
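Accuracy alone can hide weak performance on the rarer status classes; a per-class breakdown is more informative. A sketch with scikit-learn's classification_report:
from sklearn.metrics import classification_report
# precision, recall and f1 for each encoded status class
print(classification_report(y_test, pred_y, zero_division=0))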
# RandomForest
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
pred_y = rfc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
# Support Vector Machine
svm = SVC()
svm.fit(x_train, y_train)
pred_y = svm.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
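SVC is sensitive to feature scale, and the wind and pressure columns sit on very different ranges, which likely drags the score above down. Standardizing inside a pipeline is the usual remedy; a minimal sketch:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# scale features to zero mean / unit variance before the SVM
svm_scaled = make_pipeline(StandardScaler(), SVC())
svm_scaled.fit(x_train, y_train)
print('Accuracy: ', accuracy_score(y_test, svm_scaled.predict(x_test)))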
Unfiltered data (all original features, before the collinearity filter)
X = df.drop(['ID', 'Name', 'Date', 'Time', 'Event', 'Status', 'Latitude',
'Longitude'], axis=1)
y = df.Status
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37)
# RandomForest
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
pred_y = rfc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
# DecisionTree
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
pred_y = dtc.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
# Support Vector Machine
svm = SVC()
svm.fit(x_train, y_train)
pred_y = svm.predict(x_test)
print('Accuracy: ', accuracy_score(y_test, pred_y))
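A single 70/30 split gives one noisy estimate per model; cross-validation averages over several splits and makes the comparison more reliable. A closing sketch with cross_val_score:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for each model on the unfiltered features
for name, model in [('DecisionTree', DecisionTreeClassifier()),
                    ('RandomForest', RandomForestClassifier()),
                    ('SVM', SVC())]:
    scores = cross_val_score(model, X, y, cv=5)
    print(f'{name}: {scores.mean():.3f} +/- {scores.std():.3f}')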