import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from mpl_toolkits.mplot3d import Axes3D
K-Means
KMeans is a clustering algorithm that partitions a dataset into K distinct, non-overlapping subsets (clusters) based on similarity patterns within the data. It minimizes the sum of squared distances between data points and their assigned cluster centroids.
Import Libraries
KMeans on Dataset
Data
# Load the scikit-learn handwritten-digits dataset.
# `y` holds the whole dataset object (features in `.data`, labels in `.target`);
# `x` is the feature matrix that is clustered below.
y = load_digits()
x = y.data
print(y.data.shape)
print(y.target.shape)
(1797, 64)
(1797,)
Closest Centroids
def find_closest_centroids(X, centroids):
    """Assign every sample to its nearest centroid (Euclidean distance).

    Args:
        X: 2-D array of samples, one row per sample.
        centroids: 2-D array of centroids, one row per centroid, with the
            same number of columns as ``X``.

    Returns:
        1-D integer array with one entry per row of ``X`` holding the row
        index of the closest centroid. Ties resolve to the lowest index
        (``np.argmin`` semantics).
    """
    # Compute in floating point: with unsigned integer inputs (e.g. uint8
    # image pixels) the subtraction below would wrap around modulo 256 and
    # produce wrong distances.
    points = np.asarray(X, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    # Broadcasting (n, 1, d) against (k, d) gives every sample/centroid pair.
    distances = np.linalg.norm(points[:, np.newaxis, :] - centers, axis=-1)
    return np.argmin(distances, axis=1)
New Centroids
def compute_centroids(X, idx, centroids, k):
    """Move each centroid to the mean of the samples assigned to it.

    Args:
        X: 2-D array of samples, one row per sample.
        idx: 1-D array giving the cluster index of every row of ``X``.
        centroids: 2-D array of current centroids. It is updated in place;
            assigned means are cast to this array's dtype.
        k: Number of clusters to update (rows ``0 .. k-1`` of ``centroids``).

    Returns:
        The same ``centroids`` array, after the update.
    """
    for i in range(k):
        members = X[idx == i]
        if members.shape[0] == 0:
            # The mean of an empty selection is NaN, and a NaN centroid then
            # corrupts every later distance/argmin computation. Keep the
            # previous position for clusters that currently have no members.
            continue
        centroids[i] = np.mean(members, axis=0)
    return centroids
Cost Function
def calculate_cost(X, idx, centroids):
    """Return the K-Means cost for a given assignment.

    The cost is the sum, over all clusters, of the squared Euclidean
    distances between each sample and the centroid it is assigned to.

    Args:
        X: 2-D array of samples, one row per sample.
        idx: 1-D array giving the cluster index of every row of ``X``.
        centroids: 2-D array of centroids, one row per cluster.

    Returns:
        The total cost as a floating-point number (``0.0`` when there are
        no centroids).
    """
    # Compute in floating point: with small integer dtypes (e.g. uint8 image
    # pixels) both the difference and its square overflow silently.
    points = np.asarray(X, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    cost = 0.0
    for i, center in enumerate(centers):
        cost += np.sum((points[idx == i] - center) ** 2)
    return cost
KMeans
def run_kMeans(X, max_iters, k, tol=1e-5, seed=None):
    """Cluster the rows of ``X`` into ``k`` groups with K-Means.

    Initial centroids are ``k`` distinct rows of ``X`` chosen at random. Each
    iteration assigns samples to their closest centroid, prints the cost of
    that assignment, and then moves the centroids. Iteration stops after
    ``max_iters`` rounds or once the cost changes by less than ``tol``
    between two consecutive rounds.

    Args:
        X: 2-D array of samples, one row per sample.
        max_iters: Maximum number of iterations.
        k: Number of clusters; must not exceed the number of rows of ``X``.
        tol: Absolute cost change below which the run is considered
            converged.
        seed: Optional seed for a dedicated random generator used to pick
            the initial centroids. With ``None`` the global ``np.random``
            state is used.

    Returns:
        Tuple ``(centroids, idx)``: the final centroids and the cluster
        index of every sample as computed in the last assignment step.
    """
    n, _ = X.shape
    if seed is None:
        chosen = np.random.choice(n, k, replace=False)
    else:
        chosen = np.random.default_rng(seed).choice(n, k, replace=False)
    # Integer-array indexing returns a copy, so updating the centroids later
    # never writes into X.
    centroids = X[chosen]

    print(f"{'Iteration':<10}{'Cost':<15}")
    prev_cost = np.inf
    cost = 0
    idx = None

    for i in range(max_iters):
        if np.abs(prev_cost - cost) < tol:
            break
        idx = find_closest_centroids(X, centroids)
        prev_cost = cost
        cost = calculate_cost(X, idx, centroids)

        print(f"{i + 1:<10}{cost:<15}")
        centroids = compute_centroids(X, idx, centroids, k)

    if idx is None:
        # No iteration ran (max_iters <= 0): still return a valid assignment
        # for the initial centroids instead of failing on an unbound name.
        idx = find_closest_centroids(X, centroids)

    return centroids, idx
Accuracy
# Cluster the digits into 10 groups: `a` holds the centroids, `b` the cluster
# index of every sample.
a, b = run_kMeans(x, 500, 10)

# Cluster indices are arbitrary (cluster 3 need not contain the digit 3), so
# comparing them to the labels directly is meaningless. Label each cluster
# with the most frequent true digit among its members before scoring.
predicted = np.empty_like(y.target)
for cluster in np.unique(b):
    members = b == cluster
    predicted[members] = np.bincount(y.target[members]).argmax()

print("Accuracy =", 100 * np.mean(y.target == predicted), "%")
Iteration Cost
1 2312326.0
2 1367849.205211381
3 1272514.7623466342
4 1230713.3819844497
5 1217124.8773493604
6 1214638.5685729042
7 1214183.403014829
8 1214039.6629300476
9 1213981.741130655
10 1213950.3799817525
11 1213908.3568474432
12 1213896.1627396639
13 1213875.909452301
14 1213848.8851707997
15 1213820.8739280729
16 1213820.8739280729
Accuracy = 19.47690595436839 %
Image compression with K-means
Original Image
# Read the image file into an array and display it.
original_img = plt.imread('img.jpg')

plt.imshow(original_img)
plt.show()
# Gives the blue intensity of the pixel at row 50 and column 33.
original_img[50, 33, 2]
3
# Show the array dimensions of the loaded image.
print("Shape of original_img is:", original_img.shape)
Shape of original_img is: (477, 350, 3)
KMeans
# Flatten the image into a list of pixels: one row per pixel, one column per
# colour channel, so that every pixel is a sample for K-Means.
height, width = original_img.shape[:2]
X_img = np.reshape(original_img, (height * width, 3))

print("Shape of X_img is:", X_img.shape)
Shape of X_img is: (166950, 3)
# Quantise the image colours: `k` is the number of colours kept, `max_iter`
# the iteration cap. `centroids` are the resulting colours and `idx` maps
# every pixel to one of them.
max_iter = 30
k = 15

centroids, idx = run_kMeans(X_img, max_iter, k)
Iteration Cost
1 40518784
2 49937840
3 38376899
4 48464334
5 38469257
6 44709319
7 45348539
8 48159095
9 46984087
10 41761944
11 45318104
12 45340223
13 46252334
14 47408590
15 40701877
16 48466062
17 52976465
18 50425878
19 48800590
20 49841789
21 50815087
22 46825849
23 47234052
24 50064839
25 45833848
26 44100214
27 44035480
28 42271855
29 46069851
30 46009134
RGB Values plot with Centroids
# 3-D scatter of all pixels in colour space, with the centroids overlaid as
# large red stars.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_img[:, 0], X_img[:, 1], X_img[:, 2], s=0.3)
ax.set_xlabel('R value')
ax.set_ylabel('G value')
ax.set_zlabel('B value')
ax.set_title('RGB Values Plot')

ax.scatter(
    centroids[:, 0], centroids[:, 1], centroids[:, 2],
    s=200, alpha=1, marker='*', c='red', zorder=2,
)

fig.tight_layout()
plt.show()
Colors of Centroids
# Draw one single-pixel swatch per centroid colour, side by side.
fig, axs = plt.subplots(1, len(centroids))
# With a single centroid plt.subplots returns a bare Axes instead of an
# array; normalise so the loop below works for any number of centroids.
axs = np.atleast_1d(axs)

for swatch_ax, colour in zip(axs, centroids):
    swatch_ax.imshow(colour.reshape(1, 1, 3))
    swatch_ax.axis('off')

plt.show()
Compressed Image
# Replace every pixel with the colour of its assigned centroid, then restore
# the original image layout and display the result.
X_recovered = centroids[idx, :]
X_recovered = np.reshape(X_recovered, original_img.shape)

plt.imshow(X_recovered)
plt.show()