import numpy as np
import matplotlib.pyplot as plt

# Lloyd's algorithm for k-means
def kmeans(X, k, max_iter=100, tolerance=1e-3):
    n_samples = X.shape[0]
    n_features = X.shape[1]
    classifications = np.zeros(n_samples, dtype=np.int64)

    # Choose initial centroids randomly from the data, without replacement
    # so that no two initial centroids coincide
    I = np.random.choice(n_samples, k, replace=False)
    centroids = X[I, :]

    loss = 0
    for m in range(max_iter):
        # Assign each point to its nearest centroid
        for i in range(n_samples):
            distances = np.zeros(k)
            for j in range(k):
                distances[j] = np.sqrt(np.sum(np.power(X[i, :] - centroids[j], 2)))
            classifications[i] = np.argmin(distances)

        # Compute the new centroids and the new loss
        new_centroids = np.zeros((k, n_features))
        new_loss = 0
        for j in range(k):
            # Centroid: mean of the points assigned to cluster j
            J = np.where(classifications == j)
            X_C = X[J]
            new_centroids[j] = X_C.mean(axis=0)

            # Loss: within-cluster sum of squares, measured against the
            # updated centroids
            for i in range(X_C.shape[0]):
                new_loss += np.sum(np.power(X_C[i, :] - new_centroids[j], 2))

        # Stop when the loss no longer changes appreciably
        if np.abs(loss - new_loss) < tolerance:
            return new_centroids, classifications, new_loss

        centroids = new_centroids
        loss = new_loss

    print("Failed to converge!")
    return centroids, classifications, loss
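Plain random initialization can place two starting centroids in the same blob and leave Lloyd's algorithm stuck in a poor local optimum. A minimal sketch of k-means++ seeding follows; it is an addition to the original code, and the function name kmeans_pp_init is ours.

def kmeans_pp_init(X, k):
    # k-means++ seeding (hypothetical helper, not part of the original post):
    # pick the first centroid uniformly at random, then each subsequent
    # centroid with probability proportional to the squared distance from
    # a point to its nearest already-chosen centroid.
    n_samples = X.shape[0]
    centroids = np.zeros((k, X.shape[1]))
    centroids[0] = X[np.random.choice(n_samples)]
    for j in range(1, k):
        d2 = np.min(np.sum((X[:, None, :] - centroids[None, :j, :]) ** 2, axis=2), axis=1)
        centroids[j] = X[np.random.choice(n_samples, p=d2 / d2.sum())]
    return centroids

To use it, the two initialization lines in kmeans above would be replaced by centroids = kmeans_pp_init(X, k).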
We create some clusters using scikit-learn's make_blobs.

from sklearn.datasets import make_blobs

# Make 4 clusters
X, y = make_blobs(n_samples=1000, centers=4)

fig = plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=y, s=3)
plt.savefig('tmp.pdf', bbox_inches='tight')
plt.show()
We fit k-means with Lloyd's algorithm and plot the estimated clusters and the true clusters.
centers, classifications, loss = kmeans(X, 4)

# Plot the estimated clusters next to the true clusters
fig = plt.figure(figsize=(10, 6))

s1 = plt.subplot(1, 2, 1)
s1.set_title("Estimated clusters")
s1.scatter(X[:, 0], X[:, 1], c=classifications, s=2)
s1.scatter(centers[:, 0], centers[:, 1], c="r", s=20)

s2 = plt.subplot(1, 2, 2)
s2.set_title("True clusters")
s2.scatter(X[:, 0], X[:, 1], c=y, s=2)

fig.tight_layout()
plt.show()
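k-means numbers its clusters in an arbitrary order, so the colors in the two panels need not line up even when the clustering is essentially correct. As a rough check, one can align the estimated labels to the true labels with the Hungarian algorithm and report the fraction of matched points. The sketch below is our addition, assuming SciPy is available; the helper matched_accuracy is hypothetical.

from scipy.optimize import linear_sum_assignment

def matched_accuracy(y_true, y_pred, k):
    # Hypothetical helper, not part of the original post.
    # Confusion matrix between true and estimated labels
    C = np.zeros((k, k), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        C[t, p] += 1
    # Hungarian algorithm: label permutation maximizing total agreement
    rows, cols = linear_sum_assignment(-C)
    return C[rows, cols].sum() / y_true.shape[0]

print(matched_accuracy(y, classifications, 4))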
In practice, the number of clusters is rarely known in advance. One way to estimate it is the elbow method: fit k-means for several values of $k$, plot the loss against $k$, and look for a kink (the "elbow") where the loss stops dropping sharply. Here we fit k-means for $k$ from $2$ to $8$ and plot the loss.
# Choosing the cluster size with the elbow method
# Not very good here, actually.
K = np.arange(2, 9)
losses = np.zeros(len(K))
for i in range(len(K)):
    centers, classifications, loss = kmeans(X, K[i])
    losses[i] = loss

fig = plt.figure(figsize=(10, 6))
plt.plot(K, losses, marker="o")
plt.xlabel("k")
plt.ylabel("loss")
fig.tight_layout()
plt.savefig('tmp.pdf', bbox_inches='tight')
plt.show()
The loss necessarily decreases as $k$ grows, and the kink is not always easy to pick out by eye, which shows that the elbow method can be problematic.
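One alternative, not used in the original analysis, is the average silhouette score, which peaks at a good $k$ instead of merely flattening out. A minimal sketch using scikit-learn's silhouette_score, reusing the K grid from above:

from sklearn.metrics import silhouette_score

# Sketch, not part of the original post. Higher is better;
# the silhouette score is defined for k >= 2.
scores = np.zeros(len(K))
for i in range(len(K)):
    _, labels, _ = kmeans(X, K[i])
    scores[i] = silhouette_score(X, labels)
print("best k:", K[np.argmax(scores)])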