#MachineLearning #SupervisedLearning #LinearRegression
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
# load data
df = pd.read_csv('cereal.csv')
df.head()
# features and target (drop the non-numeric columns and the target column itself)
X = df.drop(['name', 'mfr', 'type', 'rating'], axis=1)
y = df.rating
# Checking for missing values
X.info()
X.describe()
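# note: this particular CSV (assumed to be the Kaggle "80 Cereals" file) marks
# missing nutrient values with a -1 sentinel rather than NaN; verify with
# (X == -1).sum(). A minimal imputation sketch under that assumption:
X = X.mask(X == -1)                         # turn the sentinel into NaN
X = X.fillna(X.median(numeric_only=True))   # impute per-column medians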
# removing highly correlated features (correlation threshold = 0.9)
from feature_selector import FeatureSelector  # third-party package: github.com/WillKoehrsen/feature-selector
fs = FeatureSelector(data=X, labels=y)
fs.identify_collinear(correlation_threshold=0.9)
correlated_features = fs.ops['collinear']
correlated_features
X = X.drop(correlated_features, axis=1)
X.corr()
sns.heatmap(X.corr(),cmap='Greens')
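# for reference, the same 0.9-threshold filter can be done with plain
# pandas/numpy if the feature-selector package is unavailable (a sketch of an
# equivalent filter, not the package's exact algorithm):
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]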
# Comparing feature importances from 2 regressors
# 1. ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(random_state=7)
model.fit(X, y)
model.feature_importances_
feat_imp = pd.DataFrame({'importance':model.feature_importances_})
feat_imp['feature'] = X.columns
feat_imp.sort_values(by='importance', inplace=True)  # ascending, so the largest bar ends up on top of the barh plot
feat_imp = feat_imp.set_index('feature', drop=True)
feat_imp.plot.barh(title='Feature Importances', figsize=(8,8), color='r')
plt.xlabel('Feature Importance Score')
plt.show()
# 2. RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=7)
model.fit(X, y)
model.feature_importances_
feat_imp = pd.DataFrame({'importance':model.feature_importances_})
feat_imp['feature'] = X.columns
feat_imp.sort_values(by='importance', inplace=True)  # ascending, so the largest bar ends up on top of the barh plot
feat_imp = feat_imp.set_index('feature', drop=True)
feat_imp.plot.barh(title='Feature Importances', figsize=(8,8), color='r')
plt.xlabel('Feature Importance Score')
plt.show()
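# impurity-based importances can favour high-variance features, so permutation
# importance is a useful cross-check; a minimal sketch on the fitted random
# forest (computed on the full data here, though a held-out set is preferable):
from sklearn.inspection import permutation_importance
perm = permutation_importance(model, X, y, n_repeats=10, random_state=7)
pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)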
# keep only the stronger predictors suggested by the importance plots
X_final = X.drop(['sodium','cups','vitamins','shelf'], axis=1)
X_final.head()
# train/test split, holding out 25% for testing
x_train, x_test, y_train, y_test = train_test_split(X_final, y, test_size=0.25, random_state=3)
# fitting
lr = LinearRegression()
lr.fit(x_train, y_train)  # fit() returns the estimator itself, so no separate variable is needed
# predict
pred_y = lr.predict(x_test)
# mse
print("mse = ", metrics.mean_squared_error(y_test, pred_y))
print("score = ", lr.score(x_test,y_test))
results = pd.DataFrame({'actual': y_test, 'predicted': pred_y})
results.head()
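# with a dataset this small, a single 75/25 split can be noisy; 5-fold
# cross-validation gives a steadier estimate (a sketch using the same
# features and target as above):
from sklearn.model_selection import cross_val_score
cv_r2 = cross_val_score(LinearRegression(), X_final, y, cv=5, scoring='r2')
print("cv r2 = %.3f +/- %.3f" % (cv_r2.mean(), cv_r2.std()))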
***Improvements:***