import numpy as np
import matplotlib.pyplot as plt
# Lloyd's algorithm for k-means
def kmeans(X, k, max_iter = 100, tolerance = 10**(-3)):
    n_samples = X.shape[0]
    n_features = X.shape[1]
    classifications = np.zeros(n_samples, dtype = np.int64)

    # Choose initial cluster centroids randomly (without replacement so they are distinct)
    I = np.random.choice(n_samples, k, replace = False)
    centroids = X[I, :]

    loss = 0
    for m in range(0, max_iter):
        # Assignment step: classify each point to its nearest centroid
        for i in range(0, n_samples):
            distances = np.zeros(k)
            for j in range(0, k):
                distances[j] = np.sqrt(np.sum(np.power(X[i, :] - centroids[j], 2)))
            classifications[i] = np.argmin(distances)

        # Update step: compute the new centroids and the new loss
        new_centroids = np.zeros((k, n_features))
        new_loss = 0
        for j in range(0, k):
            # Compute centroids
            J = np.where(classifications == j)
            X_C = X[J]
            new_centroids[j] = X_C.mean(axis = 0)

            # Compute loss
            for i in range(0, X_C.shape[0]):
                new_loss += np.sum(np.power(X_C[i, :] - centroids[j], 2))

        # Stopping criterion
        if np.abs(loss - new_loss) < tolerance:
            return new_centroids, classifications, new_loss

        centroids = new_centroids
        loss = new_loss

    print("Failed to converge!")
    return centroids, classifications, loss
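For reference, the quantity accumulated in new_loss above is the within-cluster sum of squares, which an iteration of Lloyd's algorithm never increases (the notation below is introduced here just for illustration):

$$L = \sum_{j=1}^{k} \sum_{i \,:\, c_i = j} \lVert x_i - \mu_j \rVert^2,$$

where $c_i$ is the classification of point $x_i$ and $\mu_j$ is the centroid of cluster $j$. The stopping criterion simply checks whether this loss changes by less than tolerance between iterations.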
We create some clusters using scikit-learn.
from sklearn.datasets import make_blobs
# Make clusters
X, y = make_blobs(n_samples=1000, centers = 4)
fig = plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c = y, s = 3)
plt.savefig('tmp.pdf', bbox_inches='tight')
plt.show()
We fit k-means with Lloyd's algorithm and plot the estimated clusters next to the true clusters.
centers, classifications, loss = kmeans(X, 4)
# Plot
fig = plt.figure(figsize=(10, 6))
s1 = plt.subplot(1, 2, 1)
s1.set_title("Estimated clusters")
s1.scatter(X[:, 0], X[:, 1], c = classifications, s = 2)
s1.scatter(centers[:, 0], centers[:, 1], c = "r", s = 20)
s2 = plt.subplot(1, 2, 2)
s2.set_title("True clusters")
s2.scatter(X[:, 0], X[:, 1], c = y, s = 2)
fig.tight_layout()
plt.show()
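Note that the labels returned by kmeans are arbitrary, so the colors in the two panels need not line up even when the clustering is essentially correct. For a permutation-invariant comparison of the estimated and true labelings, one option is the adjusted Rand index from scikit-learn; a minimal check, reusing the variables from the fit above, might look like this:

from sklearn.metrics import adjusted_rand_score

# 1.0 means the estimated labels match the true labels up to a relabeling;
# values near 0 indicate agreement no better than chance.
print(adjusted_rand_score(y, classifications))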
In practice, the number of clusters is usually not known in advance. One way to estimate it is the elbow method: we fit k-means for several values of $k$, here from $2$ to $7$, and plot the loss.
# Choosing cluster size
# Not very good actually.
K = np.arange(2, 8)
losses = np.zeros(K.shape[0])
for i in range(0, K.shape[0]):
    centers, classifications, loss = kmeans(X, K[i])
    losses[i] = loss
fig = plt.figure(figsize=(10, 6))
plt.plot(K, losses, marker = "o")
plt.xlabel("k")
plt.ylabel("loss")
fig.tight_layout()
plt.savefig('tmp.pdf', bbox_inches='tight')
plt.show()
This shows that the elbow method can be problematic.
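Part of the difficulty is that the loss for each $k$ depends on the random initialization, so a single run per $k$ can make the curve noisy. One common mitigation is to rerun k-means several times for each $k$ and keep the smallest loss. A minimal sketch, reusing the kmeans function and K from above (the number of restarts is an arbitrary choice):

n_restarts = 10  # arbitrary choice
best_losses = np.zeros(K.shape[0])
for i in range(0, K.shape[0]):
    # Keep the best (smallest) loss over several random initializations
    best_loss = np.inf
    for r in range(0, n_restarts):
        _, _, loss = kmeans(X, K[i])
        best_loss = min(best_loss, loss)
    best_losses[i] = best_loss

Plotting best_losses against K in the same way as above usually gives a smoother curve, though the elbow can still be ambiguous.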