#MachineLearning #SupervisedLearning #LinearRegression

By Billy Gustave

Cereal ratings

Using linear regression to predict cereal ratings

In [1]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns

Import and Explore the data

In [2]:
# load data
df = pd.read_csv('cereal.csv')
df.head()
Out[2]:
| | name | mfr | type | calories | protein | fat | sodium | fiber | carbo | sugars | potass | vitamins | shelf | weight | cups | rating |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 100% Bran | N | C | 70 | 4 | 1 | 130 | 10.0 | 5.0 | 6 | 280 | 25 | 3 | 1.0 | 0.33 | 68.402973 |
| 1 | 100% Natural Bran | Q | C | 120 | 3 | 5 | 15 | 2.0 | 8.0 | 8 | 135 | 0 | 3 | 1.0 | 1.00 | 33.983679 |
| 2 | All-Bran | K | C | 70 | 4 | 1 | 260 | 9.0 | 7.0 | 5 | 320 | 25 | 3 | 1.0 | 0.33 | 59.425505 |
| 3 | All-Bran with Extra Fiber | K | C | 50 | 4 | 0 | 140 | 14.0 | 8.0 | 0 | 330 | 25 | 3 | 1.0 | 0.50 | 93.704912 |
| 4 | Almond Delight | R | C | 110 | 2 | 2 | 200 | 1.0 | 14.0 | 8 | -1 | 25 | 3 | 1.0 | 0.75 | 34.384843 |

Data cleaning / Preprocessing

In [3]:
# features and target: drop the non-numeric columns (name, mfr, type) and the target (rating)
X = df.drop(['name', 'mfr', 'type', 'rating'], axis=1)
y = df.rating
In [4]:
# Checking for missing values
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 12 columns):
calories    77 non-null int64
protein     77 non-null int64
fat         77 non-null int64
sodium      77 non-null int64
fiber       77 non-null float64
carbo       77 non-null float64
sugars      77 non-null int64
potass      77 non-null int64
vitamins    77 non-null int64
shelf       77 non-null int64
weight      77 non-null float64
cups        77 non-null float64
dtypes: float64(4), int64(8)
memory usage: 7.3 KB
In [5]:
X.describe()
Out[5]:
calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups
count 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000
mean 106.883117 2.545455 1.012987 159.675325 2.151948 14.597403 6.922078 96.077922 28.246753 2.207792 1.029610 0.821039
std 19.484119 1.094790 1.006473 83.832295 2.383364 4.278956 4.444885 71.286813 22.342523 0.832524 0.150477 0.232716
min 50.000000 1.000000 0.000000 0.000000 0.000000 -1.000000 -1.000000 -1.000000 0.000000 1.000000 0.500000 0.250000
25% 100.000000 2.000000 0.000000 130.000000 1.000000 12.000000 3.000000 40.000000 25.000000 1.000000 1.000000 0.670000
50% 110.000000 3.000000 1.000000 180.000000 2.000000 14.000000 7.000000 90.000000 25.000000 2.000000 1.000000 0.750000
75% 110.000000 3.000000 2.000000 210.000000 3.000000 17.000000 11.000000 120.000000 25.000000 3.000000 1.000000 1.000000
max 160.000000 6.000000 5.000000 320.000000 14.000000 23.000000 15.000000 330.000000 100.000000 3.000000 1.500000 1.500000
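
info() shows no nulls, but the describe() min row above shows -1 for carbo, sugars and potass; in this dataset -1 is conventionally a sentinel for a missing measurement. A minimal cleanup sketch (assumption: -1 always means missing here; it is not applied in this run, so the numbers below are unchanged):

```python
# -1 is assumed to be a missing-value sentinel (see the min row above):
# convert to NaN, then impute each column with its median
X_clean = X.replace(-1, np.nan)
X_clean = X_clean.fillna(X_clean.median())
```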
In [6]:
# removing highly correlated features (threshold = 0.9)
In [7]:
from feature_selector import FeatureSelector
fs = FeatureSelector(data=X, labels=y)
fs.identify_collinear(correlation_threshold=0.9)
correlated_features = fs.ops['collinear']
correlated_features
1 features with a correlation magnitude greater than 0.90.

Out[7]:
['potass']
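
Note that feature_selector is the third-party feature-selector package, not part of scikit-learn. If it is unavailable, the same collinear check takes a few lines of pandas; a sketch assuming the same 0.9 threshold on the upper triangle of the correlation matrix:

```python
# flag one feature from every pair with |correlation| > 0.9
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
to_drop  # ['potass'], matching FeatureSelector above
```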
In [8]:
X = X.drop(correlated_features, axis=1)
In [9]:
X.corr()
Out[9]:
calories protein fat sodium fiber carbo sugars vitamins shelf weight cups
calories 1.000000 0.019066 0.498610 0.300649 -0.293413 0.250681 0.562340 0.265356 0.097234 0.696091 0.087200
protein 0.019066 1.000000 0.208431 -0.054674 0.500330 -0.130864 -0.329142 0.007335 0.133865 0.216158 -0.244469
fat 0.498610 0.208431 1.000000 -0.005407 0.016719 -0.318043 0.270819 -0.031156 0.263691 0.214625 -0.175892
sodium 0.300649 -0.054674 -0.005407 1.000000 -0.070675 0.355983 0.101451 0.361477 -0.069719 0.308576 0.119665
fiber -0.293413 0.500330 0.016719 -0.070675 1.000000 -0.356083 -0.141205 -0.032243 0.297539 0.247226 -0.513061
carbo 0.250681 -0.130864 -0.318043 0.355983 -0.356083 1.000000 -0.331665 0.258148 -0.101790 0.135136 0.363932
sugars 0.562340 -0.329142 0.270819 0.101451 -0.141205 -0.331665 1.000000 0.125137 0.100438 0.450648 -0.032358
vitamins 0.265356 0.007335 -0.031156 0.361477 -0.032243 0.258148 0.125137 1.000000 0.299262 0.320324 0.128405
shelf 0.097234 0.133865 0.263691 -0.069719 0.297539 -0.101790 0.100438 0.299262 1.000000 0.190762 -0.335269
weight 0.696091 0.216158 0.214625 0.308576 0.247226 0.135136 0.450648 0.320324 0.190762 1.000000 -0.199583
cups 0.087200 -0.244469 -0.175892 0.119665 -0.513061 0.363932 -0.032358 0.128405 -0.335269 -0.199583 1.000000
In [10]:
sns.heatmap(X.corr(),cmap='Greens')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x28c2a3a5a08>
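
One common variant of the same heatmap masks the redundant upper triangle and annotates each cell, which makes it easier to read:

```python
# mask the upper triangle (it mirrors the lower) and print the values
mask = np.triu(np.ones_like(X.corr(), dtype=bool))
sns.heatmap(X.corr(), mask=mask, cmap='Greens', annot=True, fmt='.2f')
plt.show()
```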
In [11]:
# Comparing feature importances from two regressors
# 1. ExtraTreesRegressor
In [12]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(random_state=7)
model.fit(X, y)
model.feature_importances_
Out[12]:
array([0.22735543, 0.06655542, 0.03839334, 0.04386816, 0.11083203,
       0.02835178, 0.4183826 , 0.02898388, 0.01286661, 0.00721282,
       0.01719792])
In [13]:
feat_imp = pd.DataFrame({'importance':model.feature_importances_})
feat_imp['feature'] = X.columns
# sort ascending so the largest bar lands at the top of the barh plot
feat_imp.sort_values(by='importance', inplace=True)
feat_imp = feat_imp.set_index('feature', drop=True)
feat_imp.plot.barh(title='Feature Importances', figsize=(8,8), color='r')
plt.xlabel('Feature Importance Score')
plt.show()
In [14]:
# 2. RandomForestRegressor
In [15]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=7)
model.fit(X, y)
model.feature_importances_
Out[15]:
array([0.22442809, 0.04339421, 0.01331483, 0.0573405 , 0.08147957,
       0.03085787, 0.51684929, 0.00787198, 0.00941274, 0.00219633,
       0.01285459])
In [16]:
feat_imp = pd.DataFrame({'importance':model.feature_importances_})
feat_imp['feature'] = X.columns
# sort ascending so the largest bar lands at the top of the barh plot
feat_imp.sort_values(by='importance', inplace=True)
feat_imp = feat_imp.set_index('feature', drop=True)
feat_imp.plot.barh(title='Feature Importances', figsize=(8,8), color='r')
plt.xlabel('Feature Importance Score')
plt.show()

The two importance plots broadly agree: sugars and calories dominate. As an experiment, a reduced feature set (dropping sodium, cups, vitamins and shelf) is kept in X_final; the model below is trained on the full cleaned feature set X.

In [17]:
# reduced feature set, kept for comparison (the fit below uses all of X)
X_final = X.drop(['sodium','cups','vitamins','shelf'], axis=1)
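
The drop list above is hand-picked from the two importance plots. scikit-learn's SelectFromModel can automate the same idea; a sketch using the random forest as the ranking model (the automated top-7 pick may differ from the hand-picked X_final):

```python
from sklearn.feature_selection import SelectFromModel

# threshold=-np.inf disables the importance cutoff, so max_features
# alone decides how many features survive
selector = SelectFromModel(RandomForestRegressor(random_state=7),
                           max_features=7, threshold=-np.inf)
selector.fit(X, y)
list(X.columns[selector.get_support()])
```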
In [18]:
X.head()
Out[18]:
calories protein fat sodium fiber carbo sugars vitamins shelf weight cups
0 70 4 1 130 10.0 5.0 6 25 3 1.0 0.33
1 120 3 5 15 2.0 8.0 8 0 3 1.0 1.00
2 70 4 1 260 9.0 7.0 5 25 3 1.0 0.33
3 50 4 0 140 14.0 8.0 0 25 3 1.0 0.50
4 110 2 2 200 1.0 14.0 8 25 3 1.0 0.75
In [19]:
# hold out 25% of the data for testing
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
In [20]:
# fit
lr = LinearRegression()
lr.fit(x_train, y_train)
# predict
pred_y = lr.predict(x_test)
# evaluate: MSE and R^2 (lr.score) on the held-out test set
print("mse = ", metrics.mean_squared_error(y_test, pred_y))
print("score = ", lr.score(x_test, y_test))
# side-by-side view of actual vs. predicted ratings
results = pd.DataFrame({'test': y_test, 'predict': pred_y})
results.head()
mse =  1.2380873770099874
score =  0.9893855090307969
Out[20]:
test predict
31 23.804043 24.106909
32 52.076897 50.953908
23 44.330856 45.216714
60 55.333142 56.444973
8 49.120253 48.609967
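
A quick visual check, not part of the original run: plotting predictions against actuals, where points close to the dashed identity line are well predicted.

```python
plt.scatter(y_test, pred_y)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--')
plt.xlabel('Actual rating')
plt.ylabel('Predicted rating')
plt.title('Linear regression: predicted vs. actual')
plt.show()
```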

***Improvements:***

  • Cross-validation would give a more reliable score estimate than a single train/test split
  • GridSearchCV for hyperparameter tuning and model selection
  • Sequential (greedy) feature selection is practical here, since the number of features is small
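
All three are sketched below (assumptions: SequentialFeatureSelector requires scikit-learn >= 0.24, the Ridge alpha grid is a hypothetical example, and selecting 5 features is an arbitrary choice):

```python
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector

# 1. cross-validated R^2 instead of a single 75/25 split
cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print(cv_scores.mean(), '+/-', cv_scores.std())

# 2. GridSearchCV over a hypothetical Ridge alpha grid
grid = GridSearchCV(Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0]},
                    cv=5, scoring='neg_mean_squared_error')
grid.fit(X, y)
print(grid.best_params_, -grid.best_score_)

# 3. greedy forward selection, cheap with only 11 candidate features
sfs = SequentialFeatureSelector(LinearRegression(),
                                n_features_to_select=5,
                                direction='forward', cv=5)
sfs.fit(X, y)
print(list(X.columns[sfs.get_support()]))
```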

Contact Me

www.linkedin.com/in/billygustave

billygustave.com