Image Color-clustering ¶

Goal :

Use clustering techniques to identify the dominant colors in an image.

Data Exploration ¶

import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from PIL import Image

# Open the source picture; the bare name below makes the notebook render it.
img = Image.open('dogs.jpeg')
img

# Pixel values of the picture as a NumPy array.
data = np.array(img)

# Unpack the three array dimensions (the reshape steps below reuse x, y, z)
# and echo the shape as the cell output.
x, y, z = np.shape(data)
data.shape

(185, 272, 3)

# Flatten the first two axes so that each row is one pixel with z channel
# values; -1 lets NumPy infer the row count (x*y).
data_2d = data.reshape(-1, z)
data_2d.shape

(50320, 3)

Clustering ¶

KMeans

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from sklearn.preprocessing import StandardScaler
# Standardise every colour channel of the flattened pixel table.
# NOTE(review): the KMeans fits in this section are run on the unscaled
# `data_2d`; `data_2d_scaled` is only consumed by the agglomerative step.
scaler = StandardScaler()
scaler.fit(data_2d)
data_2d_scaled = scaler.transform(data_2d)

# Sweep k = 2..9 on the raw pixel values, recording the inertia for an elbow
# plot and printing the silhouette coefficient for each k.
sse = {}
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=13)
    kmeans.fit(data_2d)
    label = kmeans.labels_
    # inertia_: sum of squared distances of samples to their closest centre
    sse[k] = kmeans.inertia_
    sil_coeff = silhouette_score(data_2d, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(k, sil_coeff))

# Elbow curve: inertia as a function of the number of clusters.
plt.figure(figsize=(16,14))
plt.plot(list(sse), list(sse.values()))
plt.grid()
plt.xlabel("Number of cluster")
plt.ylabel("SSE")

For n_clusters=2, The Silhouette Coefficient is 0.7788062647874106
For n_clusters=3, The Silhouette Coefficient is 0.720107059787817
For n_clusters=4, The Silhouette Coefficient is 0.6693274335311045
For n_clusters=5, The Silhouette Coefficient is 0.6441888385482647
For n_clusters=6, The Silhouette Coefficient is 0.6467463269358362
For n_clusters=7, The Silhouette Coefficient is 0.6358293966276302
For n_clusters=8, The Silhouette Coefficient is 0.6265592517977603
For n_clusters=9, The Silhouette Coefficient is 0.6189569242384274

Text(0, 0.5, 'SSE')

Reconstructing the image using our clustered labels

# Fit the final 3-cluster model on the raw pixel values.
k = 3
kmeans = KMeans(n_clusters=k, random_state=17)
kmeans.fit(data_2d)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
y_pred = kmeans.predict(data_2d)

# Plot the image back: every pixel is replaced by the centre of its cluster,
# then the flat pixel table is folded back to the original (x, y, z) shape.
fig, ax = plt.subplots(figsize=(16,14))
plt.imshow(np.reshape(centroids[labels], (x, y, z)).astype(np.uint8))

<matplotlib.image.AxesImage at 0x20af444b6c8>

Dominant colors

# One histogram bin per cluster label: the bin edges run 0..n_clusters, so
# each bin counts the pixels assigned to one cluster.
numLabels = np.arange(len(np.unique(labels)) + 1)
hist, _ = np.histogram(labels, bins=numLabels)

# Convert the counts to fractions that sum to one.
hist = hist.astype("float")
hist = hist / hist.sum()

import cv2

# Blank 50x300 RGB strip that will hold one coloured segment per cluster,
# with each segment's width proportional to the cluster's pixel share.
bar = np.zeros((50, 300, 3), dtype="uint8")
startX = 0

# Walk the cluster shares and cluster colours together, painting the
# segments left to right.
for percent, color in zip(hist, centroids):
    endX = startX + (percent * 300)
    top_left = (int(startX), 0)
    bottom_right = (int(endX), 50)
    fill = color.astype("uint8").tolist()
    # thickness -1 asks OpenCV for a filled rectangle
    cv2.rectangle(bar, top_left, bottom_right, fill, -1)
    startX = endX

# Show the strip of the 3 dominant colours without axes.
fig, ax = plt.subplots(figsize=(16,14))
plt.axis("off")
plt.imshow(bar)

<matplotlib.image.AxesImage at 0x20af47576c8>

Agglomerative Clustering

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Standardise the colour channels before hierarchical clustering.
scaler = StandardScaler()
data_2d_scaled = scaler.fit_transform(data_2d)

# Agglomerative clustering into 3 groups. The estimator provides no
# cluster_centers_ attribute, so only the per-pixel labels are taken.
agc = AgglomerativeClustering(n_clusters=3)
agc.fit(data_2d_scaled)
labels = agc.labels_

# Plot the segmentation back as an (x, y) map of cluster ids (not RGB colours).
fig, ax = plt.subplots(figsize=(16,14))
plt.imshow(labels.reshape(x, y).astype(np.uint8))

<matplotlib.image.AxesImage at 0x20af6aba848>

# Dominant-colour bar for the agglomerative clustering result.
#
# AgglomerativeClustering provides no cluster centres, and the `centroids`
# array still in scope belongs to the earlier KMeans fit, whose cluster ids
# are unrelated to the agglomerative labels. The representative colour of each
# agglomerative cluster is therefore computed here as the mean of the original
# (unscaled) pixel values assigned to it.
cluster_ids = np.unique(labels)

# Share of pixels per cluster (one histogram bin per label), normalised to 1.
numLabels = np.arange(0, len(cluster_ids) + 1)
(hist, _) = np.histogram(labels, bins = numLabels)
hist = hist.astype("float")
hist /= hist.sum()

# Mean colour of every cluster, in the same order as the histogram bins.
agc_colors = np.array([data_2d[labels == cid].mean(axis=0) for cid in cluster_ids])

# Paint one filled segment per cluster, with width proportional to its share.
bar = np.zeros((50, 300, 3), dtype = "uint8")
startX = 0
for (percent, color) in zip(hist, agc_colors):
    # plot the relative percentage of each cluster
    endX = startX + (percent * 300)
    cv2.rectangle(bar, (int(startX), 0), (int(endX), 50),
                  color.astype("uint8").tolist(), -1)
    startX = endX

fig, ax = plt.subplots(figsize=(16,14))
plt.axis("off")
plt.imshow(bar)

<matplotlib.image.AxesImage at 0x20af42cff48>

Note: the more clusters you use, the more distinct colors are available, so the reconstructed image looks less posterized (less "pixelated") and closer to the original.

Billy Gustave

Image Color-Clustering

Image Color-clustering ¶

Data Exploration ¶

Clustering ¶

Contact Me

www.linkedin.com/in/billygustave

billygustave.com

Billy Gustave