#MachineLearning #UnsupervisedLearning #Clustering
By Billy Gustave
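Goal: segment an image by clustering its pixel colours (KMeans and Agglomerative Clustering) and extract its dominant colours.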
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from PIL import Image
# Open the picture; the bare name below only renders it in a notebook.
img = Image.open('dogs.jpeg')
img
# Pixel values as a NumPy array.
data = np.array(img)
# Unpack the three array dimensions (the bare expression just displays them).
x, y, z = data.shape
data.shape
# Collapse the first two axes: one row per pixel, z values per row.
data_2d = data.reshape(-1, z)
data_2d.shape
KMeans
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Standardise every colour channel. The KMeans sweep below is fitted on the
# raw pixel values (data_2d); the scaled copy is kept for later use.
scaler = StandardScaler()
data_2d_scaled = scaler.fit_transform(data_2d)

# The exact silhouette coefficient needs all pairwise distances, which is
# quadratic in the number of pixels and impractical for a whole image.
# Estimate it on a fixed-size random subsample instead (seeded so the
# printed values are reproducible).
silhouette_sample = min(10000, len(data_2d))

# Sweep k = 2..9, recording the inertia for the elbow plot and printing the
# (sampled) silhouette coefficient for each k.
sse = {}
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=13).fit(data_2d)
    label = kmeans.labels_
    # Inertia: sum of squared distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_
    sil_coeff = silhouette_score(
        data_2d,
        label,
        metric='euclidean',
        sample_size=silhouette_sample,
        random_state=13,
    )
    print(f"For n_clusters={k}, The Silhouette Coefficient is {sil_coeff}")

# Elbow plot: inertia against the number of clusters.
plt.figure(figsize=(16, 14))
plt.plot(list(sse.keys()), list(sse.values()))
plt.grid()
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
Reconstructing the image using our clustered labels
# Fit a 3-cluster KMeans model on the pixel table.
k = 3
kmeans = KMeans(n_clusters=k, random_state=17)
kmeans.fit(data_2d)
y_pred = kmeans.predict(data_2d)
centroids, labels = kmeans.cluster_centers_, kmeans.labels_
# plotting image back: swap each pixel for the centre of its cluster, restore
# the original array shape and cast to uint8 for display
quantized = centroids[labels].reshape(x, y, z).astype(np.uint8)
fig, ax = plt.subplots(figsize=(16, 14))
plt.imshow(quantized)
Dominant colors
import cv2

# One histogram bin per distinct cluster label: count the pixels that were
# assigned to each cluster.
numLabels = np.arange(0, len(np.unique(labels)) + 1)
hist, _ = np.histogram(labels, bins=numLabels)
# Convert the counts to fractions that sum to one.
hist = hist.astype("float")
hist = hist / hist.sum()

# Blank 50x300, 3-channel canvas; each cluster paints a slice of it whose
# width is proportional to the cluster's share of the pixels.
bar = np.zeros((50, 300, 3), dtype="uint8")
startX = 0
for share, rgb in zip(hist, centroids):
    endX = startX + (share * 300)
    top_left = (int(startX), 0)
    bottom_right = (int(endX), 50)
    # thickness -1 fills the rectangle with the cluster's centre colour
    cv2.rectangle(bar, top_left, bottom_right, rgb.astype("uint8").tolist(), -1)
    startX = endX

# graphical representation of the 3 dominant colors in the image
fig, ax = plt.subplots(figsize=(16, 14))
plt.axis("off")
plt.imshow(bar)
Agglomerative Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering is fitted on the standardised channel values.
scaler = StandardScaler()
data_2d_scaled = scaler.fit_transform(data_2d)
agc = AgglomerativeClustering(n_clusters=3)
agc.fit(data_2d_scaled)
# Only the labels are read: AgglomerativeClustering exposes no
# cluster_centers_ attribute.
labels = agc.labels_
# plotting image back: the per-pixel cluster labels laid out on the image
# grid (a 2-D label map, not the original colours)
label_map = labels.reshape(x, y).astype(np.uint8)
fig, ax = plt.subplots(figsize=(16, 14))
plt.imshow(label_map)
# Dominant-colour bar for the current `labels`.
#
# The KMeans `centroids` do not describe these clusters, so the colour of
# each cluster is computed here as the mean of the original (unscaled)
# pixel values assigned to it.
cluster_ids = np.unique(labels)

# One histogram bin per cluster label, normalised to fractions that sum to one.
numLabels = np.arange(0, len(cluster_ids) + 1)
(hist, _) = np.histogram(labels, bins=numLabels)
hist = hist.astype("float")
hist /= hist.sum()

# Mean colour of every cluster, in the same (sorted) order as the histogram bins.
agc_colors = np.array(
    [data_2d[labels == cluster_id].mean(axis=0) for cluster_id in cluster_ids]
)

# Blank 50x300, 3-channel canvas; each cluster paints a slice whose width is
# proportional to its share of the pixels.
bar = np.zeros((50, 300, 3), dtype="uint8")
startX = 0
for (percent, color) in zip(hist, agc_colors):
    # plot the relative percentage of each cluster
    endX = startX + (percent * 300)
    cv2.rectangle(bar, (int(startX), 0), (int(endX), 50),
                  color.astype("uint8").tolist(), -1)
    startX = endX

fig, ax = plt.subplots(figsize=(16, 14))
plt.axis("off")
plt.imshow(bar)
Note: for images, the more clusters you use, the more distinct colours the reconstruction keeps, so the reconstructed image looks less posterized and closer to the original.